├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── build-ecr-images.sh ├── charts ├── cluster-issuer │ ├── Chart.yaml │ ├── templates │ │ └── cluster_issuer.yaml │ └── values.yaml ├── dex │ ├── Chart.yaml │ ├── crds │ │ └── dex.yaml │ ├── templates │ │ ├── cluster_role.yaml │ │ ├── cluster_role_binding.yaml │ │ ├── config_map.yaml │ │ ├── deployment.yaml │ │ ├── secrets │ │ │ ├── static-oidc-client.yaml │ │ │ └── static-passwords.yaml │ │ ├── service.yaml │ │ ├── service_account.yaml │ │ └── virtual_service.yaml │ └── values.yaml ├── ebs-sc │ ├── Chart.yaml │ └── templates │ │ └── storage-class.yaml ├── istio-ingress │ ├── Chart.yaml │ ├── templates │ │ ├── certificate.yaml │ │ ├── cluster_roles.yaml │ │ ├── gateway.yaml │ │ └── virtual_service.yaml │ └── values.yaml ├── karpenter-components │ ├── Chart.yaml │ ├── templates │ │ ├── node-class.yaml │ │ └── node-pool.yaml │ └── values.yaml ├── machine-learning │ ├── data-prep │ │ ├── coco-data │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── coco-data.yaml │ │ │ └── values.yaml │ │ ├── data-process │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── data-process.yaml │ │ │ └── values.yaml │ │ ├── databricks-dolly-15k-data │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── data.yaml │ │ │ └── values.yaml │ │ ├── mpijob-data-process │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── dist-data-process.yaml │ │ │ └── values.yaml │ │ ├── ray-data-process │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── ray-data-process.yaml │ │ │ └── values.yaml │ │ └── redpajama-data │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ └── redpajama-data.yaml │ │ │ └── values.yaml │ ├── model-prep │ │ ├── hf-snapshot │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── hf-snapshot.yaml │ │ │ └── values.yaml │ │ ├── rayserve-tnx-autocausalengine │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── engine.yaml │ │ │ └── values.yaml │ │ └── rayserve-vllm-asyncllmengine │ │ │ ├── Chart.yaml │ │ │ 
├── templates │ │ │ └── engine.yaml │ │ │ └── values.yaml │ ├── serving │ │ ├── djl-lmi-server │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── server.yaml │ │ │ └── values.yaml │ │ ├── generic-server │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── server.yaml │ │ │ └── values.yaml │ │ ├── rayserve │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── rayservice.yaml │ │ │ └── values.yaml │ │ ├── triton-inference-server-lws │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── server.yaml │ │ │ └── values.yaml │ │ └── triton-inference-server │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ └── server.yaml │ │ │ └── values.yaml │ ├── testing │ │ ├── maskrcnn-jupyter │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── jupyter.yaml │ │ │ └── values.yaml │ │ └── maskrcnn-optimized-jupyter │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ └── jupyter.yaml │ │ │ └── values.yaml │ └── training │ │ ├── maskrcnn-optimized │ │ ├── Chart.yaml │ │ ├── templates │ │ │ └── maskrcnn.yaml │ │ └── values.yaml │ │ ├── maskrcnn │ │ ├── Chart.yaml │ │ ├── templates │ │ │ └── maskrcnn.yaml │ │ └── values.yaml │ │ ├── mpijob-horovod-tensorflow-gpu │ │ ├── Chart.yaml │ │ ├── templates │ │ │ └── train.yaml │ │ └── values.yaml │ │ ├── pytorchjob-distributed │ │ ├── Chart.yaml │ │ ├── templates │ │ │ └── train.yaml │ │ └── values.yaml │ │ ├── pytorchjob-elastic │ │ ├── Chart.yaml │ │ ├── templates │ │ │ └── train.yaml │ │ └── values.yaml │ │ └── raytrain │ │ ├── Chart.yaml │ │ ├── templates │ │ └── train.yaml │ │ └── values.yaml ├── ml-platform │ ├── kubeflow-admission-webhook │ │ ├── Chart.yaml │ │ ├── crds │ │ │ └── poddefaults.yaml │ │ ├── templates │ │ │ ├── authorization_policy.yaml │ │ │ ├── certificate.yaml │ │ │ ├── cluster_role_binding.yaml │ │ │ ├── cluster_roles.yaml │ │ │ ├── deplyment.yaml │ │ │ ├── issuer.yaml │ │ │ ├── mutating_webhook_configuration.yaml │ │ │ ├── service.yaml │ │ │ └── service_account.yaml │ │ └── values.yaml │ ├── kubeflow-central-dashboard │ │ ├── 
Chart.yaml │ │ ├── templates │ │ │ ├── authorization_policy.yaml │ │ │ ├── cluster_role.yaml │ │ │ ├── cluster_role_binding.yaml │ │ │ ├── config_map.yaml │ │ │ ├── deployment.yaml │ │ │ ├── role.yaml │ │ │ ├── role_binding.yaml │ │ │ ├── service.yaml │ │ │ ├── service_account.yaml │ │ │ └── virtual_service.yaml │ │ └── values.yaml │ ├── kubeflow-katib │ │ ├── Chart.yaml │ │ ├── crds │ │ │ ├── experiments.yaml │ │ │ ├── suggestions.yaml │ │ │ └── trials.yaml │ │ ├── templates │ │ │ ├── authorization_policy.yaml │ │ │ ├── certificate.yaml │ │ │ ├── cluster_role_bindings.yaml │ │ │ ├── cluster_roles.yaml │ │ │ ├── config_maps.yaml │ │ │ ├── deployments.yaml │ │ │ ├── issuer.yaml │ │ │ ├── mutating_web_hook.yaml │ │ │ ├── service_accounts.yaml │ │ │ ├── services.yaml │ │ │ ├── validating_web_hook.yaml │ │ │ └── virtual_service.yaml │ │ └── values.yaml │ ├── kubeflow-notebooks │ │ ├── Chart.yaml │ │ ├── crds │ │ │ └── notebooks.yaml │ │ ├── templates │ │ │ ├── controller │ │ │ │ ├── cluster_role_binding.yaml │ │ │ │ ├── cluster_roles.yaml │ │ │ │ ├── config_map.yaml │ │ │ │ ├── deployment.yaml │ │ │ │ ├── role.yaml │ │ │ │ ├── role_binding.yaml │ │ │ │ ├── service.yaml │ │ │ │ └── service_account.yaml │ │ │ └── webapp │ │ │ │ ├── authorization_policy.yaml │ │ │ │ ├── cluster_role_binding.yaml │ │ │ │ ├── cluster_roles.yaml │ │ │ │ ├── config-maps │ │ │ │ ├── config.yaml │ │ │ │ └── logos.yaml │ │ │ │ ├── deployment.yaml │ │ │ │ ├── destination_rule.yaml │ │ │ │ ├── role.yaml │ │ │ │ ├── role_binding.yaml │ │ │ │ ├── service.yaml │ │ │ │ ├── service_account.yaml │ │ │ │ └── virtual_service.yaml │ │ └── values.yaml │ ├── kubeflow-pipelines │ │ ├── Chart.yaml │ │ ├── crds │ │ │ ├── clusterworkflowtemplates.yaml │ │ │ ├── compositecontrollers.yaml │ │ │ ├── controllerrevision.yaml │ │ │ ├── cronworkflows.yaml │ │ │ ├── decoratorcontroller.yaml │ │ │ ├── scheduledworkflows.yaml │ │ │ ├── viewers.yaml │ │ │ ├── workfloweventbindings.yaml │ │ │ ├── workflows.yaml │ │ │ ├── 
workflowtaskresults.yaml │ │ │ ├── workflowtasksets.yaml │ │ │ └── workflowtemplates.yaml │ │ ├── templates │ │ │ ├── authorization_policies.yaml │ │ │ ├── certficate.yaml │ │ │ ├── cluster_role_bindings.yaml │ │ │ ├── cluster_roles.yaml │ │ │ ├── composite_controller.yaml │ │ │ ├── config_maps.yaml │ │ │ ├── deployments.yaml │ │ │ ├── destination_rules.yaml │ │ │ ├── issuer.yaml │ │ │ ├── mutating_webhook_configuration.yaml │ │ │ ├── priority_class.yaml │ │ │ ├── role_bindings.yaml │ │ │ ├── roles.yaml │ │ │ ├── secrets.yaml │ │ │ ├── service_accounts.yaml │ │ │ ├── services.yaml │ │ │ ├── stateful_set.yaml │ │ │ └── virtual_services.yaml │ │ └── values.yaml │ ├── kubeflow-profiles-and-kfam │ │ ├── Chart.yaml │ │ ├── crds │ │ │ └── profiles.yaml │ │ ├── templates │ │ │ ├── cluster_role_binding.yaml │ │ │ ├── config-maps │ │ │ │ ├── namespace_labels_data.yaml │ │ │ │ └── profiles_config.yaml │ │ │ ├── deployment.yaml │ │ │ ├── role.yaml │ │ │ ├── role_binding.yaml │ │ │ ├── service.yaml │ │ │ ├── service_account.yaml │ │ │ └── virtual_service.yaml │ │ └── values.yaml │ ├── kubeflow-roles │ │ ├── Chart.yaml │ │ └── templates │ │ │ └── cluster-roles.yaml │ ├── kubeflow-tensorboards │ │ ├── Chart.yaml │ │ ├── crds │ │ │ └── tensorboard.yaml │ │ ├── templates │ │ │ ├── controller │ │ │ │ ├── cluster_role_binding.yaml │ │ │ │ ├── cluster_roles.yaml │ │ │ │ ├── config_map.yaml │ │ │ │ ├── deployment.yaml │ │ │ │ ├── role.yaml │ │ │ │ ├── role_binding.yaml │ │ │ │ ├── service.yaml │ │ │ │ └── service_account.yaml │ │ │ └── webapp │ │ │ │ ├── authorization_policy.yaml │ │ │ │ ├── cluster_role_binding.yaml │ │ │ │ ├── cluster_roles.yaml │ │ │ │ ├── deployment.yaml │ │ │ │ ├── destination_rule.yaml │ │ │ │ ├── service.yaml │ │ │ │ ├── service_account.yaml │ │ │ │ └── virtual_service.yaml │ │ └── values.yaml │ ├── kubeflow-training-operator │ │ ├── Chart.yaml │ │ ├── crds │ │ │ ├── mxjobs.yaml │ │ │ ├── paddlejobs.yaml │ │ │ ├── pytorchjobs.yaml │ │ │ ├── tfjobs.yaml │ │ │ 
└── xgboostjobs.yaml │ │ ├── templates │ │ │ ├── cluster_role_binding.yaml │ │ │ ├── cluster_roles.yaml │ │ │ ├── deployment.yaml │ │ │ ├── role.yaml │ │ │ ├── service.yaml │ │ │ └── service_account.yaml │ │ └── values.yaml │ ├── kubeflow-user-profile-defaults │ │ ├── Chart.yaml │ │ ├── templates │ │ │ ├── pod_default.yaml │ │ │ ├── role_bindings.yaml │ │ │ └── roles.yaml │ │ └── values.yaml │ ├── kubeflow-user-profile │ │ ├── Chart.yaml │ │ ├── templates │ │ │ ├── config_map.yaml │ │ │ └── profile.yaml │ │ └── values.yaml │ └── kubeflow-volumes │ │ ├── Chart.yaml │ │ ├── templates │ │ ├── controller │ │ │ ├── admission_webhooks.yaml │ │ │ ├── certificate.yaml │ │ │ ├── cluster_role_bindings.yaml │ │ │ ├── cluster_roles.yaml │ │ │ ├── config_map.yaml │ │ │ ├── deployment.yaml │ │ │ ├── issuer.yaml │ │ │ ├── pvcviewers_crd.yaml │ │ │ ├── role.yaml │ │ │ ├── role_binding.yaml │ │ │ ├── service.yaml │ │ │ └── service_account.yaml │ │ └── webapp │ │ │ ├── authorization_policy.yaml │ │ │ ├── cluster_role_binding.yaml │ │ │ ├── cluster_roles.yaml │ │ │ ├── config_map.yaml │ │ │ ├── deployment.yaml │ │ │ ├── destination_rule.yaml │ │ │ ├── service.yaml │ │ │ ├── service_account.yaml │ │ │ └── virtual_service.yaml │ │ └── values.yaml ├── mpi-operator │ ├── Chart.yaml │ ├── crds │ │ └── mpijob.yaml │ ├── templates │ │ └── mpi-operator.yaml │ └── values.yaml ├── nvidia-device-plugin │ ├── Chart.yaml │ └── templates │ │ └── daemonset.yaml ├── oauth2-proxy-route │ ├── Chart.yaml │ ├── templates │ │ ├── authorization_policy.yaml │ │ └── virtual_service.yaml │ └── values.yaml ├── pv-efs │ ├── Chart.yaml │ └── templates │ │ ├── pv.yaml │ │ ├── pvc.yaml │ │ └── storage-class.yaml └── pv-fsx │ ├── Chart.yaml │ └── templates │ ├── pv.yaml │ ├── pvc.yaml │ └── storage-class.yaml ├── containers ├── aws-samples-maskrcnn │ ├── Dockerfile │ ├── build_tools │ │ ├── build_and_push.sh │ │ └── set_env.sh │ └── notebooks │ │ └── mask-rcnn-tensorflow-viz.ipynb ├── megatron-deepspeed │ ├── 
Dockerfile │ └── build_tools │ │ ├── build_and_push.sh │ │ └── set_env.sh ├── nemo-megatron │ ├── Dockerfile │ └── build_tools │ │ ├── build_and_push.sh │ │ └── set_env.sh ├── ray-pytorch-neuronx-vllm │ ├── Dockerfile │ ├── build_tools │ │ ├── build_and_push.sh │ │ └── set_env.sh │ └── patches │ │ ├── vllm-neuron-0.6.6.post1.patch │ │ ├── vllm-neuron-0.8.1.patch │ │ └── vllm_v0.5.0_neuron.patch ├── ray-pytorch-neuronx │ ├── Dockerfile │ └── build_tools │ │ ├── build_and_push.sh │ │ └── set_env.sh ├── ray-pytorch │ ├── Dockerfile │ └── build_tools │ │ ├── build_and_push.sh │ │ └── set_env.sh ├── tensorpack-maskrcnn │ ├── Dockerfile │ ├── build_tools │ │ ├── build_and_push.sh │ │ └── set_env.sh │ └── notebooks │ │ └── mask-rcnn-tensorpack-viz.ipynb ├── tritonserver-neuronx-djl-lmi │ ├── Dockerfile │ └── build_tools │ │ ├── build_and_push.sh │ │ └── set_env.sh ├── tritonserver-neuronx-vllm │ ├── Dockerfile │ ├── build_tools │ │ ├── build_and_push.sh │ │ └── set_env.sh │ └── patch │ │ ├── vllm-neuron-0.6.6.post1.patch │ │ └── vllm-neuron-0.8.1.patch ├── tritonserver-neuronx │ ├── Dockerfile │ └── build_tools │ │ ├── build_and_push.sh │ │ └── set_env.sh ├── tritonserver-ray-vllm │ ├── Dockerfile │ ├── build_tools │ │ ├── build_and_push.sh │ │ └── set_env.sh │ └── resources │ │ ├── kubessh │ │ └── server.py └── tritonserver-trtllm │ ├── Dockerfile │ ├── build_tools │ ├── build_and_push.sh │ └── set_env.sh │ └── resources │ ├── kubessh │ └── server.py ├── eks-cluster ├── legacy │ ├── README.md │ ├── apply-aws-auth-cm.sh │ ├── apply-nvidia-plugin.sh │ ├── aws-auth-cm.yaml │ ├── configure-eks-auth.sh │ ├── efs-sc.yaml │ ├── fsx-sc.yaml │ ├── install-eksctl.sh │ ├── prepare-data.sh │ ├── pv-kubeflow-efs-gp-bursting.yaml │ ├── pv-kubeflow-fsx.yaml │ ├── pvc-kubeflow-efs-gp-bursting.yaml │ ├── pvc-kubeflow-fsx.yaml │ ├── replicate-data.yaml │ ├── set-cluster.sh │ ├── tiller-rbac-config.yaml │ └── update-kubeconfig.sh ├── terraform │ └── aws-eks-cluster-and-nodegroup │ │ ├── 
istio │ │ ├── main.tf │ │ ├── variables.tf │ │ └── versions.tf │ │ ├── kubeflow │ │ ├── main.tf │ │ ├── variables.tf │ │ └── versions.tf │ │ ├── main.tf │ │ ├── mlflow │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf │ │ ├── outputs.tf │ │ ├── slurm │ │ ├── main.tf │ │ ├── variables.tf │ │ └── versions.tf │ │ ├── variables.tf │ │ └── versions.tf ├── tests │ ├── test-gpu-efa.yaml │ ├── test-gpu.yaml │ └── test-neuron.yaml ├── user-data.txt └── utils │ ├── attach-pvc-fsx.yaml │ ├── attach-pvc.yaml │ ├── install-kubectl-linux.sh │ ├── prepare-s3-bucket.sh │ ├── s3-backend.sh │ ├── stage-data-fsx.yaml │ └── stage-data.yaml ├── examples ├── agentic │ └── mcp-gateway-registry │ │ ├── README.md │ │ └── server.yaml ├── inference │ ├── README.md │ ├── djl-serving │ │ ├── tensorrt-llm │ │ │ ├── llama3-8b-instruct │ │ │ │ ├── README.md │ │ │ │ └── server.yaml │ │ │ └── mistral-7b-instruct-v0.2 │ │ │ │ ├── README.md │ │ │ │ └── server.yaml │ │ └── transformers-neuronx │ │ │ ├── llama3-8b-instruct │ │ │ ├── README.md │ │ │ └── server.yaml │ │ │ └── mistral-7b-instruct-v0.2 │ │ │ ├── README.md │ │ │ └── server.yaml │ ├── rayserve │ │ ├── facebook-bart-large-cnn │ │ │ ├── README.md │ │ │ └── rayservice.yaml │ │ ├── meta-llama3-8b-neuron │ │ │ ├── README.md │ │ │ ├── engine_config.yaml │ │ │ └── rayservice.yaml │ │ ├── meta-llama3-8b-vllm-neuron │ │ │ ├── README.md │ │ │ ├── engine_config.yaml │ │ │ └── rayservice.yaml │ │ ├── meta-llama3-8b-vllm │ │ │ ├── README.md │ │ │ ├── engine_config.yaml │ │ │ └── rayservice.yaml │ │ ├── meta-llama32-11b-vis-inst-vllm │ │ │ ├── README.md │ │ │ ├── engine_config.yaml │ │ │ └── rayservice.yaml │ │ ├── meta-llama33-70b-instruct-neuron │ │ │ ├── README.md │ │ │ ├── engine_config.yaml │ │ │ └── rayservice.yaml │ │ ├── meta-llama33-70b-instruct-vllm │ │ │ ├── README.md │ │ │ ├── engine_config.yaml │ │ │ └── rayservice.yaml │ │ └── mistral-8x22b-instruct-v01-vllm │ │ │ ├── README.md │ │ │ ├── engine_config.yaml │ │ │ └── 
rayservice.yaml │ └── triton-inference-server │ │ ├── python_backend │ │ ├── baai-bge-reranker-large-neuron │ │ │ ├── README.md │ │ │ └── triton_server.yaml │ │ ├── llama3-8b-instruct-lmi-neuron │ │ │ ├── README.md │ │ │ └── triton_server.yaml │ │ ├── llama3-8b-instruct-neuron │ │ │ ├── README.md │ │ │ └── triton_server.yaml │ │ ├── mistral-7b-instruct-v01-neuron │ │ │ ├── README.md │ │ │ └── triton_server.yaml │ │ └── xlm-roberta-base-neuron │ │ │ ├── README.md │ │ │ └── triton_server.yaml │ │ ├── ray_vllm_backend │ │ └── mistral-8x22b-instruct-v01 │ │ │ ├── README.md │ │ │ └── triton_server.yaml │ │ ├── tensorrtllm_backend │ │ ├── llama2-7b │ │ │ ├── README.md │ │ │ ├── hf_to_trtllm.yaml │ │ │ ├── triton_model.yaml │ │ │ ├── triton_server.yaml │ │ │ └── trtllm_engine.yaml │ │ ├── llama3-8b-instruct │ │ │ ├── README.md │ │ │ ├── hf_to_trtllm.yaml │ │ │ ├── triton_model.yaml │ │ │ ├── triton_server.yaml │ │ │ └── trtllm_engine.yaml │ │ ├── mistral-7b-instruct-v01 │ │ │ ├── README.md │ │ │ ├── hf_to_trtllm.yaml │ │ │ ├── triton_model.yaml │ │ │ ├── triton_server.yaml │ │ │ └── trtllm_engine.yaml │ │ ├── mistral-7b-instruct-v01_llama3-8b │ │ │ ├── README.md │ │ │ ├── llama3_8b_hf_to_trtllm.yaml │ │ │ ├── mistral_7b_hf_to_trtllm.yaml │ │ │ ├── triton_llama3_8b_model.yaml │ │ │ ├── triton_mistral_7b_model.yaml │ │ │ ├── triton_server.yaml │ │ │ ├── trtllm_llama3_8b_engine.yaml │ │ │ └── trtllm_mistral_7b_engine.yaml │ │ └── mistral-8x22b-instruct-v01 │ │ │ ├── README.md │ │ │ ├── hf_to_trtllm.yaml │ │ │ ├── triton_model.yaml │ │ │ ├── triton_server.yaml │ │ │ └── trtllm_engine.yaml │ │ └── vllm_backend │ │ ├── deepseek-r1-distill-llama-8b-neuron │ │ ├── README.md │ │ └── triton_server.yaml │ │ ├── deepseek-r1-distill-llama-8b │ │ ├── README.md │ │ └── triton_server.yaml │ │ ├── llama3-8b-instruct-neuron │ │ ├── README.md │ │ └── triton_server.yaml │ │ ├── llama3-8b-instruct │ │ ├── README.md │ │ └── triton_server.yaml │ │ ├── mistral-7b-instruct-v02-neuron │ │ ├── 
README.md │ │ └── triton_server.yaml │ │ └── mistral-7b-instruct-v02 │ │ ├── README.md │ │ └── triton_server.yaml ├── legacy │ ├── README.md │ ├── maskrcnn │ │ ├── README.md │ │ ├── train-maskrcnn-aws.yaml │ │ └── train-maskrcnn-tensorpack.yaml │ └── neuronx-nemo-megatron │ │ ├── llama2_13b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── preprocess.yaml │ │ └── pretrain.yaml │ │ ├── llama2_70b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── preprocess.yaml │ │ └── pretrain.yaml │ │ └── llama2_7b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── preprocess.yaml │ │ └── pretrain.yaml └── training │ ├── README.md │ ├── accelerate │ ├── bert-glue-mrpc │ │ ├── README.md │ │ ├── pipeline.ipynb │ │ └── pretrain.yaml │ └── llama2-ft-fsdp │ │ ├── 13b.yaml │ │ ├── 70b.yaml │ │ ├── 7b.yaml │ │ └── README.md │ ├── megatron-deepspeed │ └── gpt2_345m │ │ ├── README.md │ │ ├── pretrain-ddp-tp-pp-zero1.yaml │ │ ├── pretrain-ddp-zero1.yaml │ │ └── wikicorpus.yaml │ ├── nemo-megatron │ ├── llama2-7b-peft │ │ ├── README.md │ │ ├── hf_to_nemo.yaml │ │ ├── merge_peft.yaml │ │ ├── nemo_to_hf.yaml │ │ ├── peft.yaml │ │ ├── peft_accuracy.yaml │ │ ├── peft_eval.yaml │ │ └── preprocess.yaml │ ├── llama31-8b-peft-dolphin │ │ ├── README.md │ │ ├── hf_to_nemo.yaml │ │ ├── merge_peft.yaml │ │ ├── nemo_to_hf.yaml │ │ ├── peft.yaml │ │ ├── peft_eval.yaml │ │ └── preprocess.yaml │ ├── mistral-7b-v01-peft-dolphin │ │ ├── README.md │ │ ├── hf_to_nemo.yaml │ │ ├── merge_peft.yaml │ │ ├── nemo_to_hf.yaml │ │ ├── peft.yaml │ │ ├── peft_eval.yaml │ │ └── preprocess.yaml │ └── mistral-7b-v01-peft │ │ ├── README.md │ │ ├── hf_to_nemo.yaml │ │ ├── merge_peft.yaml │ │ ├── nemo_to_hf.yaml │ │ ├── peft.yaml │ │ ├── peft_accuracy.yaml │ │ ├── peft_eval.yaml │ │ └── preprocess.yaml │ ├── neuronx-distributed-training │ └── llama3_70b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ ├── neuronx-distributed │ ├── gpt_neox_20b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── 
pretrain.yaml │ │ └── wikicorpus.yaml │ ├── gpt_neox_6.9b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ ├── llama2_13b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ ├── llama2_13b_ptl │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ ├── llama2_70b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ ├── llama2_70b_ptl │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ ├── llama2_7b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ ├── llama2_7b_ptl │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ ├── llama31_70b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ ├── llama31_8b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ ├── llama3_70b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ ├── llama3_70b_ptl │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ └── llama3_8b │ │ ├── README.md │ │ ├── compile.yaml │ │ ├── pretrain.yaml │ │ └── wikicorpus.yaml │ └── raytrain │ └── lightning-bert │ ├── README.md │ └── fine-tune.yaml ├── kfp ├── components │ ├── packages │ │ └── helm_charts_component.yaml │ └── src │ │ └── helm-charts-component │ │ ├── container │ │ ├── Dockerfile │ │ └── build_tools │ │ │ ├── build_and_push.sh │ │ │ └── set_env.sh │ │ └── helm_charts_component.py └── pipelines │ ├── packages │ └── helm_charts_pipeline.yaml │ └── src │ └── helm-charts-pipeline │ └── helm_charts_pipeline.py └── tutorials └── maskrcnn-blog └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | **/.terraform* 2 | **/terraform.tfstate* 3 | **/Dockerfile.* 4 | **/.ipynb_checkpoints/* 5 | **/.DS_Store 6 | 
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /build-ecr-images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set region 4 | region= 5 | if [ "$#" -eq 1 ]; then 6 | region=$1 7 | else 8 | echo "usage: $0 " 9 | exit 1 10 | fi 11 | 12 | cd containers 13 | for dir in `ls -d *` 14 | do 15 | $dir/build_tools/build_and_push.sh $region 16 | done 17 | -------------------------------------------------------------------------------- /charts/cluster-issuer/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: cluster-issuer 3 | description: A Helm chart for Kubeflow self-signing certificate issuer 4 | type: application 5 | version: 1.0.0 6 | appVersion: "v1.8.0" 7 | -------------------------------------------------------------------------------- /charts/cluster-issuer/templates/cluster_issuer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: ClusterIssuer 3 | metadata: 4 | labels: 5 | app.kubernetes.io/component: cert-manager 6 | app.kubernetes.io/name: cert-manager 7 | name: self-signed-issuer 8 | namespace: {{ .Values.cluster_issuer.namespace }} 9 | spec: 10 | selfSigned: {} 11 | --- 12 | apiVersion: cert-manager.io/v1 13 | kind: Certificate 14 | metadata: 15 | name: self-signed-ca 16 | namespace: {{ 
.Values.cluster_issuer.namespace }} 17 | spec: 18 | isCA: true 19 | commonName: self-signed-ca 20 | secretName: ca-secret 21 | privateKey: 22 | algorithm: RSA 23 | encoding: PKCS1 24 | size: 2048 25 | issuerRef: 26 | name: self-signed-issuer 27 | kind: ClusterIssuer 28 | group: cert-manager.io 29 | --- 30 | apiVersion: cert-manager.io/v1 31 | kind: ClusterIssuer 32 | metadata: 33 | labels: 34 | app.kubernetes.io/component: cert-manager 35 | app.kubernetes.io/name: cert-manager 36 | name: {{ .Values.cluster_issuer.name }} 37 | namespace: {{ .Values.cluster_issuer.namespace }} 38 | spec: 39 | ca: 40 | secretName: ca-secret -------------------------------------------------------------------------------- /charts/cluster-issuer/values.yaml: -------------------------------------------------------------------------------- 1 | cluster_issuer: 2 | name: cluster-self-signing-issuer 3 | namespace: cert-manager 4 | -------------------------------------------------------------------------------- /charts/dex/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "v2.36.0" 3 | description: A Helm chart for Dex 4 | name: dex 5 | type: application 6 | version: 1.0.0 7 | -------------------------------------------------------------------------------- /charts/dex/crds/dex.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | name: authcodes.dex.coreos.com 5 | spec: 6 | group: dex.coreos.com 7 | names: 8 | kind: AuthCode 9 | listKind: AuthCodeList 10 | plural: authcodes 11 | singular: authcode 12 | scope: Namespaced 13 | versions: 14 | - name: v1 15 | schema: 16 | openAPIV3Schema: 17 | type: object 18 | x-kubernetes-preserve-unknown-fields: true 19 | served: true 20 | storage: true 21 | -------------------------------------------------------------------------------- 
/charts/dex/templates/cluster_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: dex 5 | rules: 6 | - apiGroups: 7 | - dex.coreos.com 8 | resources: 9 | - '*' 10 | verbs: 11 | - '*' 12 | - apiGroups: 13 | - apiextensions.k8s.io 14 | resources: 15 | - customresourcedefinitions 16 | verbs: 17 | - create 18 | -------------------------------------------------------------------------------- /charts/dex/templates/cluster_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: dex 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: dex 9 | subjects: 10 | - kind: ServiceAccount 11 | name: dex 12 | namespace: {{ .Values.dex.namespace }} 13 | -------------------------------------------------------------------------------- /charts/dex/templates/config_map.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: dex 5 | namespace: {{ .Values.dex.namespace }} 6 | data: 7 | config.yaml: | 8 | issuer: https://istio-ingressgateway.{{ .Values.ingress.namespace }}.svc.cluster.local/dex 9 | storage: 10 | type: kubernetes 11 | config: 12 | inCluster: true 13 | web: 14 | http: 0.0.0.0:5556 15 | logger: 16 | level: "debug" 17 | format: text 18 | oauth2: 19 | skipApprovalScreen: true 20 | enablePasswordDB: true 21 | staticPasswords: 22 | - email: {{ .Values.dex.user.email }} 23 | hashFromEnv: DEX_USER_HASH 24 | username: {{ .Values.dex.user.username }} 25 | userID: {{ .Values.dex.user.userid }} 26 | staticClients: 27 | - idEnv: OIDC_CLIENT_ID 28 | redirectURIs: ["https://istio-ingressgateway.{{ .Values.ingress.namespace }}.svc.cluster.local/oauth2/callback"] 29 | name: 'Dex Login Application' 30 
| secretEnv: OIDC_CLIENT_SECRET -------------------------------------------------------------------------------- /charts/dex/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | labels: 5 | app: dex 6 | name: dex 7 | namespace: {{ .Values.dex.namespace }} 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: dex 13 | template: 14 | metadata: 15 | labels: 16 | app: dex 17 | spec: 18 | serviceAccountName: dex 19 | containers: 20 | - image: ghcr.io/dexidp/dex:v2.36.0 21 | name: dex 22 | command: ["dex", "serve", "/etc/dex/cfg/config.yaml"] 23 | ports: 24 | - name: http 25 | containerPort: 5556 26 | volumeMounts: 27 | - name: config 28 | mountPath: /etc/dex/cfg 29 | envFrom: 30 | - secretRef: 31 | name: dex-oidc-client 32 | - secretRef: 33 | name: dex-passwords 34 | env: 35 | - name: KUBERNETES_POD_NAMESPACE 36 | valueFrom: 37 | fieldRef: 38 | fieldPath: metadata.namespace 39 | volumes: 40 | - name: config 41 | configMap: 42 | name: dex 43 | items: 44 | - key: config.yaml 45 | path: config.yaml 46 | -------------------------------------------------------------------------------- /charts/dex/templates/secrets/static-oidc-client.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: dex-oidc-client 5 | namespace: {{ .Values.dex.namespace }} 6 | type: Opaque 7 | stringData: 8 | OIDC_CLIENT_ID: {{ .Values.dex.oidc.client_id }} 9 | OIDC_CLIENT_SECRET: {{ .Values.dex.oidc.client_secret }} -------------------------------------------------------------------------------- /charts/dex/templates/secrets/static-passwords.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: dex-passwords 5 | namespace: {{ .Values.dex.namespace }} 6 | stringData: 7 | DEX_USER_HASH: {{ 
.Values.dex.user.bcrypt_hash }} 8 | -------------------------------------------------------------------------------- /charts/dex/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: dex 5 | namespace: {{ .Values.dex.namespace }} 6 | spec: 7 | ports: 8 | - name: dex 9 | port: 5556 10 | protocol: TCP 11 | targetPort: 5556 12 | selector: 13 | app: dex 14 | type: ClusterIP 15 | -------------------------------------------------------------------------------- /charts/dex/templates/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: dex 5 | namespace: {{ .Values.dex.namespace }} 6 | -------------------------------------------------------------------------------- /charts/dex/templates/virtual_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1beta1 2 | kind: VirtualService 3 | metadata: 4 | name: dex 5 | namespace: {{ .Values.dex.namespace }} 6 | spec: 7 | gateways: 8 | - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 9 | hosts: 10 | - '*' 11 | http: 12 | - match: 13 | - uri: 14 | prefix: /dex/ 15 | route: 16 | - destination: 17 | host: dex.{{ .Values.dex.namespace }}.svc.cluster.local 18 | port: 19 | number: 5556 20 | -------------------------------------------------------------------------------- /charts/dex/values.yaml: -------------------------------------------------------------------------------- 1 | dex: 2 | namespace: 3 | user: 4 | email: 5 | username: 6 | userid: 7 | bcrypt_hash: 8 | oidc: 9 | client_id: 10 | client_secret: 11 | ingress: 12 | namespace: 13 | gateway: 14 | 15 | -------------------------------------------------------------------------------- /charts/ebs-sc/Chart.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "gp3" 3 | description: A Helm chart for EBS storage class 4 | name: ebs-sc 5 | version: 1.0.1 6 | -------------------------------------------------------------------------------- /charts/ebs-sc/templates/storage-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: storage.k8s.io/v1 2 | kind: StorageClass 3 | metadata: 4 | annotations: 5 | storageclass.kubernetes.io/is-default-class: "true" 6 | name: ebs-sc 7 | parameters: 8 | fsType: ext4 9 | type: gp3 10 | provisioner: kubernetes.io/aws-ebs 11 | reclaimPolicy: Delete 12 | volumeBindingMode: Immediate 13 | --- 14 | apiVersion: storage.k8s.io/v1 15 | kind: StorageClass 16 | metadata: 17 | annotations: 18 | storageclass.kubernetes.io/is-default-class: "false" 19 | name: ebs-sc-wait 20 | parameters: 21 | fsType: ext4 22 | type: gp3 23 | provisioner: kubernetes.io/aws-ebs 24 | reclaimPolicy: Delete 25 | volumeBindingMode: WaitForFirstConsumer -------------------------------------------------------------------------------- /charts/istio-ingress/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: istio-ingress 3 | description: A Helm chart for istio ingress components 4 | type: application 5 | version: 1.0.0 6 | appVersion: "1.20.2" 7 | -------------------------------------------------------------------------------- /charts/istio-ingress/templates/certificate.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Certificate 3 | metadata: 4 | name: gateway-cert 5 | namespace: {{ .Values.ingress.namespace }} 6 | spec: 7 | secretName: gateway-cert 8 | 9 | duration: 2160h # 90d 10 | renewBefore: 360h # 15d 11 | subject: 12 | organizations: 13 | - aws 14 | 15 | isCA: false 16 | privateKey: 17 | algorithm: RSA 18 
| encoding: PKCS1 19 | size: 2048 20 | usages: 21 | - server auth 22 | dnsNames: 23 | - "istio-ingressgateway.{{ .Values.ingress.namespace }}.svc.cluster.local" 24 | issuerRef: 25 | name: {{ .Values.cluster_issuer.name }} 26 | kind: ClusterIssuer 27 | group: cert-manager.io -------------------------------------------------------------------------------- /charts/istio-ingress/templates/cluster_roles.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | apiVersion: rbac.authorization.k8s.io/v1 4 | kind: ClusterRole 5 | metadata: 6 | name: istio-ingress-admin 7 | labels: 8 | rbac.authorization.kubeflow.org/aggregate-to-kubeflow-admin: "true" 9 | aggregationRule: 10 | clusterRoleSelectors: 11 | - matchLabels: 12 | rbac.authorization.kubeflow.org/aggregate-to-istio-ingress-admin: "true" 13 | rules: [] 14 | 15 | --- 16 | 17 | apiVersion: rbac.authorization.k8s.io/v1 18 | kind: ClusterRole 19 | metadata: 20 | name: istio-ingress-edit 21 | labels: 22 | rbac.authorization.kubeflow.org/aggregate-to-kubeflow-edit: "true" 23 | rbac.authorization.kubeflow.org/aggregate-to-istio-ingress-admin: "true" 24 | rules: 25 | - apiGroups: 26 | - istio.io 27 | - networking.istio.io 28 | resources: ["*"] 29 | verbs: 30 | - get 31 | - list 32 | - watch 33 | - create 34 | - delete 35 | - deletecollection 36 | - patch 37 | - update 38 | 39 | --- 40 | 41 | apiVersion: rbac.authorization.k8s.io/v1 42 | kind: ClusterRole 43 | metadata: 44 | name: istio-ingress-view 45 | labels: 46 | rbac.authorization.kubeflow.org/aggregate-to-kubeflow-view: "true" 47 | rules: 48 | - apiGroups: 49 | - istio.io 50 | - networking.istio.io 51 | resources: ["*"] 52 | verbs: 53 | - get 54 | - list 55 | - watch 56 | -------------------------------------------------------------------------------- /charts/istio-ingress/templates/gateway.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1beta1 2 | 
kind: Gateway 3 | metadata: 4 | name: {{ .Values.ingress.gateway }} 5 | namespace: {{ .Values.ingress.namespace }} 6 | spec: 7 | selector: 8 | app: istio-ingressgateway 9 | servers: 10 | - hosts: 11 | - '*' 12 | port: 13 | name: https-8443 14 | number: 8443 15 | protocol: HTTPS 16 | tls: 17 | mode: SIMPLE 18 | credentialName: gateway-cert 19 | - hosts: 20 | - '*' 21 | port: 22 | name: http-8080 23 | number: 8080 24 | protocol: HTTP -------------------------------------------------------------------------------- /charts/istio-ingress/templates/virtual_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1beta1 2 | kind: VirtualService 3 | metadata: 4 | name: {{ .Values.ingress.gateway }}-health-check 5 | namespace: {{ .Values.ingress.namespace }} 6 | spec: 7 | hosts: 8 | - '*' 9 | gateways: 10 | - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 11 | http: 12 | - match: 13 | - uri: 14 | exact: {{ .Values.healthcheck.path }} 15 | method: 16 | exact: GET 17 | port: {{ .Values.healthcheck.port }} 18 | directResponse: 19 | status: 200 -------------------------------------------------------------------------------- /charts/istio-ingress/values.yaml: -------------------------------------------------------------------------------- 1 | ingress: 2 | namespace: ingress 3 | gateway: ingress-gateway 4 | healthcheck: 5 | port: 8080 6 | path: /healthcheck 7 | cluster_issuer: 8 | name: cluster-self-signing-issuer -------------------------------------------------------------------------------- /charts/karpenter-components/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "v1beta1" 3 | description: A Helm chart for Karpenter components 4 | name: karpenter-components 5 | version: 1.0.4 6 | -------------------------------------------------------------------------------- /charts/karpenter-components/values.yaml: 
-------------------------------------------------------------------------------- 1 | namespace: "karpenter" 2 | role_name: 3 | cluster_id: 4 | consolidate_after: "600s" 5 | capacity_type: "on-demand" 6 | max_pods: 20 7 | -------------------------------------------------------------------------------- /charts/machine-learning/data-prep/coco-data/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: coco-data 3 | description: A Helm chart for downloading and extracting COCO data to a persistent volume 4 | type: application 5 | version: 1.0.0 6 | appVersion: "v2017" 7 | -------------------------------------------------------------------------------- /charts/machine-learning/data-prep/coco-data/values.yaml: -------------------------------------------------------------------------------- 1 | pvc: 2 | name: pv-fsx 3 | mount_path: /fsx 4 | data_path: data/coco2017 # relative to mount_path -------------------------------------------------------------------------------- /charts/machine-learning/data-prep/data-process/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: data-process 3 | description: A Helm chart for data processing 4 | type: application 5 | version: 1.0.0 6 | appVersion: "1.0.0" 7 | -------------------------------------------------------------------------------- /charts/machine-learning/data-prep/data-process/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | pvc: 3 | - name: pv-fsx 4 | mount_path: /fsx 5 | - name: pv-efs 6 | mount_path: /efs 7 | ebs: {} 8 | git: 9 | repo_url: 10 | branch: 11 | commit: 12 | pre_script: [] 13 | post_script: [] 14 | process: 15 | env: [] 16 | command: [] 17 | args: [] 18 | resources: 19 | requests: {} 20 | limits: {} -------------------------------------------------------------------------------- 
/charts/machine-learning/data-prep/databricks-dolly-15k-data/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: databricks-dolly-15k 3 | description: A Helm chart for downloading databricks/databricks-dolly-15k data 4 | type: application 5 | version: 1.0.0 6 | appVersion: "v1" 7 | -------------------------------------------------------------------------------- /charts/machine-learning/data-prep/databricks-dolly-15k-data/values.yaml: -------------------------------------------------------------------------------- 1 | pvc: 2 | name: pv-fsx 3 | mount_path: /fsx 4 | data_path: data/databricks-dolly-15k # relative to mount_path -------------------------------------------------------------------------------- /charts/machine-learning/data-prep/mpijob-data-process/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for distributed data processing using MPI Job 4 | name: mpijob-data-process 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/data-prep/ray-data-process/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "2.9.0" 3 | description: A Helm chart for running RayJob for data processing 4 | name: raytrain 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/data-prep/ray-data-process/values.yaml: -------------------------------------------------------------------------------- 1 | ray: 2 | version: 2.9.0 3 | dashboard: 4 | host: '0.0.0.0' 5 | ports: [] 6 | resources: 7 | requests: {} 8 | limits: {} 9 | runtime_env_yaml: 10 | image: 11 | image_pull_policy: IfNotPresent 12 | resources: 13 | nnodes: 1 14 | node_type: 15 | requests: {} 16 | limits: {} 17 | 
tolerations: [] 18 | pvc: 19 | - name: pv-fsx 20 | mount_path: /fsx 21 | - name: pv-efs 22 | mount_path: /efs 23 | git: 24 | repo_url: 25 | branch: 26 | commit: 27 | pre_script: [] 28 | post_script: [] 29 | process: 30 | env: [] 31 | command: [] 32 | args: [] 33 | -------------------------------------------------------------------------------- /charts/machine-learning/data-prep/redpajama-data/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: redpajama-data 3 | description: A Helm chart for downloading redpajama data 4 | type: application 5 | version: 1.0.0 6 | appVersion: "v2017" 7 | -------------------------------------------------------------------------------- /charts/machine-learning/data-prep/redpajama-data/values.yaml: -------------------------------------------------------------------------------- 1 | pvc: 2 | name: pv-fsx 3 | mount_path: /fsx 4 | data_path: data/redpajama # relative to mount_path -------------------------------------------------------------------------------- /charts/machine-learning/model-prep/hf-snapshot/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: hugging-face-snapshot 3 | description: A Helm chart for downloading Hugging Face snapshot 4 | type: application 5 | version: 1.0.0 6 | appVersion: "v1" 7 | -------------------------------------------------------------------------------- /charts/machine-learning/model-prep/hf-snapshot/values.yaml: -------------------------------------------------------------------------------- 1 | image: public.ecr.aws/docker/library/python:slim-bullseye 2 | ebs: 3 | storage: 400Gi 4 | mount_path: /tmp 5 | resources: 6 | requests: 7 | cpu: "1000m" 8 | memory: "2048Mi" 9 | limits: 10 | cpu: "1000m" 11 | memory: "2048Mi" 12 | pvc: 13 | - name: pv-fsx 14 | mount_path: /fsx 15 | - name: pv-efs 16 | mount_path: /efs 17 | env: [] 18 | snapshot: {} 19 | 
-------------------------------------------------------------------------------- /charts/machine-learning/model-prep/rayserve-tnx-autocausalengine/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: rayserve-tnx-engine 3 | description: A Helm chart for Ray Serve Transformers Neuronx Auto Causal LLM Engine 4 | type: application 5 | version: 1.0.0 6 | appVersion: "v1" 7 | -------------------------------------------------------------------------------- /charts/machine-learning/model-prep/rayserve-tnx-autocausalengine/values.yaml: -------------------------------------------------------------------------------- 1 | image: public.ecr.aws/docker/library/python:slim-bullseye 2 | resources: 3 | requests: 4 | cpu: "1000m" 5 | memory: "512Mi" 6 | limits: 7 | cpu: "1000m" 8 | memory: "1024Mi" 9 | pvc: 10 | - name: pv-fsx 11 | mount_path: /fsx 12 | - name: pv-efs 13 | mount_path: /efs 14 | env: [] 15 | engine_path: /fsx/rayserve/engines/tnx_autocausalengine.zip 16 | -------------------------------------------------------------------------------- /charts/machine-learning/model-prep/rayserve-vllm-asyncllmengine/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: rayserve-vllm-engine 3 | description: A Helm chart for Ray Serve vLLM Engine 4 | type: application 5 | version: 1.0.0 6 | appVersion: "v1" 7 | -------------------------------------------------------------------------------- /charts/machine-learning/model-prep/rayserve-vllm-asyncllmengine/values.yaml: -------------------------------------------------------------------------------- 1 | image: public.ecr.aws/docker/library/python:slim-bullseye 2 | resources: 3 | requests: 4 | cpu: "1000m" 5 | memory: "512Mi" 6 | limits: 7 | cpu: "1000m" 8 | memory: "1024Mi" 9 | pvc: 10 | - name: pv-fsx 11 | mount_path: /fsx 12 | - name: pv-efs 13 | mount_path: /efs 14 | env: [] 15 | engine_path: 
/fsx/rayserve/engines/vllm_asyncllmengine.zip 16 | -------------------------------------------------------------------------------- /charts/machine-learning/serving/djl-lmi-server/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for Deep Java Library Large Model Inference (LMI) 4 | name: djl-serving 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/serving/djl-lmi-server/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | name: 3 | pull_policy: IfNotPresent 4 | resources: 5 | node_type: g5.48xlarge 6 | requests: {} 7 | limits: {} 8 | tolerations: [] 9 | pvc: 10 | - name: pv-fsx 11 | mount_path: /fsx 12 | - name: pv-efs 13 | mount_path: /efs 14 | ebs: {} 15 | git: 16 | repo_url: 17 | branch: 18 | commit: 19 | pre_script: [] 20 | post_script: [] 21 | server: 22 | name: djl-lmi-server 23 | args: [] 24 | command: [] 25 | ports: 26 | http: 8000 27 | grpc: 8001 28 | metrics: 8002 29 | readiness_probe: 30 | period_secs: 5 31 | failure_threshold: 3 32 | startup_probe: 33 | period_secs: 10 34 | failure_threshold: 30 35 | liveness_probe: 36 | period_secs: 10 37 | failure_threshold: 3 38 | autoscaling: 39 | minReplicas: 1 40 | maxReplicas: 1 41 | metrics: [] 42 | 43 | -------------------------------------------------------------------------------- /charts/machine-learning/serving/generic-server/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for generic server 4 | name: generic-server 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/serving/generic-server/values.yaml: 
-------------------------------------------------------------------------------- 1 | image: 2 | name: 3 | pull_policy: IfNotPresent 4 | resources: 5 | node_type: g5.48xlarge 6 | requests: {} 7 | limits: {} 8 | tolerations: [] 9 | pvc: 10 | - name: pv-fsx 11 | mount_path: /fsx 12 | - name: pv-efs 13 | mount_path: /efs 14 | ebs: {} 15 | git: 16 | repo_url: 17 | branch: 18 | commit: 19 | pre_script: [] 20 | post_script: [] 21 | server: 22 | name: generic-server 23 | args: [] 24 | command: [] 25 | ports: [] 26 | readiness_probe: 27 | period_secs: 5 28 | failure_threshold: 3 29 | path: / 30 | port: 31 | startup_probe: 32 | period_secs: 10 33 | failure_threshold: 30 34 | path: / 35 | port: 36 | liveness_probe: 37 | period_secs: 10 38 | failure_threshold: 3 39 | path: / 40 | port: 41 | resources: 42 | requests: 43 | cpu: 1 44 | memory: 2Gi 45 | limits: 46 | cpu: 4 47 | memory: 8Gi 48 | autoscaling: 49 | minReplicas: 1 50 | maxReplicas: 2 51 | metrics: 52 | - type: Pods 53 | pods: 54 | metric: 55 | name: cpu 56 | target: 57 | type: Utilization 58 | averageValue: 80 59 | -------------------------------------------------------------------------------- /charts/machine-learning/serving/rayserve/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "2.32.0" 3 | description: A Helm chart for running RayService for serving 4 | name: rayserve 5 | version: 2.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/serving/rayserve/values.yaml: -------------------------------------------------------------------------------- 1 | ray: 2 | version: '2.44.0' 3 | dashboard: 4 | host: '0.0.0.0' 5 | ports: [] 6 | resources: 7 | requests: {} 8 | limits: {} 9 | tolerations: [] 10 | serve_config_v2: 11 | service_unhealthy_threshold_secs: 900 12 | deployment_unhealthy_threshold_secs: 300 13 | env: 14 | restart_policy: 15 | head: OnFailure 16 | worker: OnFailure 17 
| image: 18 | image_pull_policy: IfNotPresent 19 | resources: 20 | min_replicas: 1 21 | max_replicas: 1 22 | node_type: 23 | requests: {} 24 | limits: {} 25 | tolerations: [] 26 | pvc: [] 27 | -------------------------------------------------------------------------------- /charts/machine-learning/serving/triton-inference-server-lws/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: 1.0.0 3 | description: Triton Inference Server with LeaderWorkerSet 4 | name: triton-inference-server-lws 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/serving/triton-inference-server-lws/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | name: 3 | pull_policy: IfNotPresent 4 | lws: 5 | size: 2 6 | resources: 7 | node_type: g5.48xlarge 8 | requests: {} 9 | limits: {} 10 | tolerations: [] 11 | pvc: 12 | - name: pv-fsx 13 | mount_path: /fsx 14 | - name: pv-efs 15 | mount_path: /efs 16 | ebs: {} 17 | git: 18 | repo_url: 19 | branch: 20 | commit: 21 | pre_script: [] 22 | post_script: [] 23 | server: 24 | name: triton-inference-server 25 | args: [] 26 | command: [] 27 | ports: [] 28 | readiness_probe: 29 | period_secs: 5 30 | failure_threshold: 3 31 | startup_probe: 32 | period_secs: 10 33 | failure_threshold: 30 34 | liveness_probe: 35 | period_secs: 10 36 | failure_threshold: 3 37 | autoscaling: 38 | minReplicas: 1 39 | maxReplicas: 2 40 | metrics: 41 | - type: Pods 42 | pods: 43 | metric: 44 | name: avg_time_queue_us 45 | target: 46 | type: AverageValue 47 | averageValue: 50 48 | -------------------------------------------------------------------------------- /charts/machine-learning/serving/triton-inference-server/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm
chart for Triton Inference Server 4 | name: triton-inference-server 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/serving/triton-inference-server/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | name: 3 | pull_policy: IfNotPresent 4 | resources: 5 | node_type: g5.48xlarge 6 | requests: {} 7 | limits: {} 8 | tolerations: [] 9 | pvc: 10 | - name: pv-fsx 11 | mount_path: /fsx 12 | - name: pv-efs 13 | mount_path: /efs 14 | ebs: {} 15 | git: 16 | repo_url: 17 | branch: 18 | commit: 19 | pre_script: [] 20 | post_script: [] 21 | server: 22 | name: triton-inference-server 23 | args: [] 24 | command: [] 25 | ports: [] 26 | readiness_probe: 27 | period_secs: 5 28 | failure_threshold: 3 29 | startup_probe: 30 | period_secs: 10 31 | failure_threshold: 30 32 | liveness_probe: 33 | period_secs: 10 34 | failure_threshold: 3 35 | autoscaling: 36 | minReplicas: 1 37 | maxReplicas: 2 38 | metrics: 39 | - type: Pods 40 | pods: 41 | metric: 42 | name: avg_time_queue_us 43 | target: 44 | type: AverageValue 45 | averageValue: 50 46 | -------------------------------------------------------------------------------- /charts/machine-learning/testing/maskrcnn-jupyter/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for Jupyter 4 | name: jupyter 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/testing/maskrcnn-jupyter/values.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | name: maskrcnn-jupyter 3 | namespace: kubeflow 4 | shared_fs: fsx 5 | shared_pvc: pv-fsx # pv-efs 6 | source_cidr: # Public IP source CIDR 7 | log_dir: # relative path on shared file-system to directory containing 'train_log' folder 8 | 
image: 9 | image_pull_policy: Always 10 | jupyter: 11 | name: jupyter 12 | port: 443 13 | target_port: 8888 14 | gpu_instance_type: g5.xlarge 15 | tensorboard: 16 | name: tensorboard 17 | port: 6443 18 | target_port: 6443 19 | upstream_port: 6006 20 | nginx: 21 | name: nginx 22 | ssl_certificate: /etc/ssl/domain.crt 23 | ssl_certificate_key: /etc/ssl/domain.key 24 | user: tensorboard 25 | htpasswd: # MD5 password hash -------------------------------------------------------------------------------- /charts/machine-learning/testing/maskrcnn-optimized-jupyter/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for Jupyter 4 | name: jupyter 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/testing/maskrcnn-optimized-jupyter/values.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | name: maskrcnn-optimized-jupyter 3 | namespace: kubeflow 4 | shared_fs: fsx 5 | shared_pvc: pv-fsx # pv-efs 6 | source_cidr: # Public IP source CIDR 7 | log_dir: # relative path on shared file-system to directory containing 'train_log' folder 8 | image: 9 | image_pull_policy: Always 10 | jupyter: 11 | name: jupyter 12 | port: 443 13 | target_port: 8888 14 | gpu_instance_type: g5.xlarge 15 | tensorboard: 16 | name: tensorboard 17 | port: 6443 18 | target_port: 6443 19 | upstream_port: 6006 20 | nginx: 21 | name: nginx 22 | ssl_certificate: /etc/ssl/domain.crt 23 | ssl_certificate_key: /etc/ssl/domain.key 24 | user: tensorboard 25 | htpasswd: # MD5 password hash -------------------------------------------------------------------------------- /charts/machine-learning/training/maskrcnn-optimized/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart 
for running Mask RCNN (optimized) using kubeflow mpi-operator and mpi-job 4 | name: maskrcnn-optimized 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/training/maskrcnn-optimized/values.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | namespace: kubeflow 3 | shared_fs: fsx #efs 4 | shared_pvc: pv-fsx # pv-efs 5 | maskrcnn: 6 | name: mask-rcnn-tensorflow 7 | gpus: 16 8 | gpu_nodes: 2 9 | gpus_per_node: 8 10 | gpu_instance_type: p4d.24xlarge 11 | image: 12 | train_script: /mask-rcnn-tensorflow/MaskRCNN/train.py 13 | batch_size_per_gpu: 4 14 | data_fs: fsx #efs 15 | data_dir: data/coco2017 #data 16 | working_dir: /mask-rcnn-tensorflow 17 | images_per_epoch: 120000 18 | lr_epoch_schedule: "[(16, 0.1), (20, 0.01), (24, None)]" 19 | base_lr: 0.0015625 # for a total batch size of 1, adjusted automatically to actual total batch size 20 | eval_period_in_epochs: 1 21 | data_train: "[\"train2017\"]" 22 | data_val: "(\"val2017\")" 23 | mode_fpn: 'True' 24 | mode_mask: 'True' 25 | backbone_norm: FreezeBN 26 | backbone_weights: data/coco2017/pretrained-models/ImageNet-R50-AlignPadding.npz 27 | image_pull_policy: Always 28 | extra_config: 'TRAIN.GRADIENT_CLIP=0.36' 29 | nccl_socket_ifname: ^lo,docker0 30 | if_exclude: lo,docker0 31 | tf_device_min_sys_mem_mb: 4096 32 | tf_enable_auto_mixed_precision: 0 33 | nccl_debug: INFO 34 | horovod_autotune: 1 35 | horovod_log_level: INFO 36 | backoff_limit: 2000 37 | -------------------------------------------------------------------------------- /charts/machine-learning/training/maskrcnn/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for running Mask RCNN using kubeflow mpi-operator and mpi-job 4 | name: maskrcnn 5 | version: 1.0.0 6 | 
-------------------------------------------------------------------------------- /charts/machine-learning/training/maskrcnn/values.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | namespace: kubeflow 3 | shared_fs: fsx 4 | shared_pvc: pv-fsx # pv-efs 5 | maskrcnn: 6 | name: maskrcnn 7 | gpus: 16 8 | gpu_nodes: 2 9 | gpus_per_node: 8 10 | gpu_instance_type: p4d.24xlarge 11 | image: 12 | train_script: /tensorpack/examples/FasterRCNN/train.py 13 | data_fs: fsx # efs 14 | data_dir: data/coco2017 15 | steps_per_epoch: 7500 # Must be equal to 120000/gpus 16 | lr_schedule: "[240000,320000,360000]" 17 | base_lr: 0.01 # For a total batch size=8, adjusted automatically to actual total batch size 18 | eval_period_in_epochs: 1 19 | data_train: "[\"coco_train2017\"]" 20 | data_val: "(\"coco_val2017\")" 21 | mode_fpn: 'True' 22 | mode_mask: 'True' 23 | backbone_norm: FreezeBN 24 | backbone_weights: data/coco2017/pretrained-models/ImageNet-R50-AlignPadding.npz 25 | image_pull_policy: Always 26 | nccl_socket_ifname: ^lo,docker0 27 | if_exclude: lo,docker0 28 | tf_device_min_sys_mem_mb: 4096 29 | tf_enable_auto_mixed_precision: 0 30 | nccl_debug: INFO 31 | horovod_autotune: 1 32 | horovod_log_level: INFO 33 | extra_config: 'TRAIN.CHECKPOINT_PERIOD=2' 34 | backoff_limit: 2000 35 | -------------------------------------------------------------------------------- /charts/machine-learning/training/mpijob-horovod-tensorflow-gpu/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for running MPIJob with Horovod and Tensorflow using GPUs 4 | name: mpijob-horovod-tensorflow-gpu 5 | version: 1.0.2 6 | -------------------------------------------------------------------------------- /charts/machine-learning/training/pytorchjob-distributed/Chart.yaml: -------------------------------------------------------------------------------- 
1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for running distributed PytorchJob using Master and Workers 4 | name: pytorchjob-distributed 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/training/pytorchjob-distributed/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | backoff_limit: 2000 3 | resources: 4 | requests: {} 5 | limits: {} 6 | nnodes: 2 7 | nproc_per_node: 8 | node_type: 9 | tolerations: [] 10 | pvc: 11 | - name: pv-fsx 12 | mount_path: /fsx 13 | - name: pv-efs 14 | mount_path: /efs 15 | git: 16 | repo_url: 17 | branch: 18 | commit: 19 | pre_script: [] 20 | post_script: [] 21 | train: 22 | env: [] 23 | command: [] 24 | args: [] 25 | -------------------------------------------------------------------------------- /charts/machine-learning/training/pytorchjob-elastic/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for running elastic PytorchJob 4 | name: pytorchjob-elastic 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/training/pytorchjob-elastic/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | backoff_limit: 2000 3 | resources: 4 | nnodes: 1 5 | nproc_per_node: 6 | node_type: 7 | requests: {} 8 | limits: {} 9 | tolerations: [] 10 | elastic_policy: 11 | rdzv_backend: c10d 12 | rdzv_port: 44000 13 | min_replicas: 1 14 | max_replicas: 1 15 | pvc: 16 | - name: pv-fsx 17 | mount_path: /fsx 18 | - name: pv-efs 19 | mount_path: /efs 20 | git: 21 | repo_url: 22 | branch: 23 | commit: 24 | pre_script: [] 25 | post_script: [] 26 | train: 27 | env: [] 28 | command: [] 29 | args: [] 30 | 
-------------------------------------------------------------------------------- /charts/machine-learning/training/raytrain/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "2.22.0" 3 | description: A Helm chart for running RayJob for training 4 | name: raytrain 5 | version: 2.0.0 6 | -------------------------------------------------------------------------------- /charts/machine-learning/training/raytrain/values.yaml: -------------------------------------------------------------------------------- 1 | ray: 2 | version: 2.22.0 3 | dashboard: 4 | host: '0.0.0.0' 5 | ports: [] 6 | resources: 7 | requests: {} 8 | limits: {} 9 | runtime_env_yaml: 10 | image: 11 | image_pull_policy: IfNotPresent 12 | resources: 13 | nnodes: 1 14 | node_type: 15 | requests: {} 16 | limits: {} 17 | tolerations: [] 18 | pvc: 19 | - name: pv-fsx 20 | mount_path: /fsx 21 | - name: pv-efs 22 | mount_path: /efs 23 | git: 24 | repo_url: 25 | branch: 26 | commit: 27 | pre_script: [] 28 | post_script: [] 29 | train: 30 | env: [] 31 | command: [] 32 | args: [] 33 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-admission-webhook/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: v1.9.2 3 | description: A Helm chart for kubeflow admission webhook 4 | name: admission-webhook 5 | type: application 6 | version: 1.0.0 7 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-admission-webhook/templates/authorization_policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: security.istio.io/v1 2 | kind: AuthorizationPolicy 3 | metadata: 4 | labels: 5 | control-plane: profiles 6 | name: profiles-kfam 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | spec: 9 | action: ALLOW 10 | 
rules: 11 | - from: 12 | - source: 13 | principals: 14 | - cluster.local/ns/{{ .Values.kubeflow.namespace }}/sa/centraldashboard 15 | selector: 16 | matchLabels: 17 | control-plane: profiles 18 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-admission-webhook/templates/certificate.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Certificate 3 | metadata: 4 | labels: 5 | app: poddefaults 6 | app.kubernetes.io/component: poddefaults 7 | app.kubernetes.io/name: poddefaults 8 | name: admission-webhook-cert 9 | namespace: {{ .Values.kubeflow.namespace }} 10 | spec: 11 | commonName: admission-webhook-service.{{ .Values.kubeflow.namespace }}.svc 12 | dnsNames: 13 | - admission-webhook-service.{{ .Values.kubeflow.namespace }}.svc 14 | - admission-webhook-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local 15 | isCA: true 16 | issuerRef: 17 | kind: Issuer 18 | name: admission-webhook-selfsigned-issuer 19 | secretName: webhook-certs 20 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-admission-webhook/templates/cluster_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app: poddefaults 6 | app.kubernetes.io/component: poddefaults 7 | app.kubernetes.io/name: poddefaults 8 | name: admission-webhook-cluster-role-binding 9 | roleRef: 10 | apiGroup: rbac.authorization.k8s.io 11 | kind: ClusterRole 12 | name: admission-webhook-cluster-role 13 | subjects: 14 | - kind: ServiceAccount 15 | name: admission-webhook-service-account 16 | namespace: {{ .Values.kubeflow.namespace }} 17 | -------------------------------------------------------------------------------- 
/charts/ml-platform/kubeflow-admission-webhook/templates/deplyment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | labels: 5 | app: poddefaults 6 | app.kubernetes.io/component: poddefaults 7 | app.kubernetes.io/name: poddefaults 8 | name: admission-webhook-deployment 9 | namespace: {{ .Values.kubeflow.namespace }} 10 | spec: 11 | selector: 12 | matchLabels: 13 | app: poddefaults 14 | app.kubernetes.io/component: poddefaults 15 | app.kubernetes.io/name: poddefaults 16 | template: 17 | metadata: 18 | annotations: 19 | sidecar.istio.io/inject: 'false' 20 | labels: 21 | app: poddefaults 22 | app.kubernetes.io/component: poddefaults 23 | app.kubernetes.io/name: poddefaults 24 | spec: 25 | containers: 26 | - args: 27 | - --tlsCertFile=/etc/webhook/certs/tls.crt 28 | - --tlsKeyFile=/etc/webhook/certs/tls.key 29 | image: docker.io/kubeflownotebookswg/poddefaults-webhook:v1.9.2 30 | name: admission-webhook 31 | ports: 32 | - containerPort: 4443 33 | name: https-webhook 34 | volumeMounts: 35 | - mountPath: /etc/webhook/certs 36 | name: webhook-cert 37 | readOnly: true 38 | serviceAccountName: admission-webhook-service-account 39 | volumes: 40 | - name: webhook-cert 41 | secret: 42 | secretName: webhook-certs 43 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-admission-webhook/templates/issuer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Issuer 3 | metadata: 4 | labels: 5 | app: poddefaults 6 | app.kubernetes.io/component: poddefaults 7 | app.kubernetes.io/name: poddefaults 8 | kustomize.component: poddefaults 9 | name: admission-webhook-selfsigned-issuer 10 | namespace: {{ .Values.kubeflow.namespace }} 11 | spec: 12 | selfSigned: {} 13 | -------------------------------------------------------------------------------- 
/charts/ml-platform/kubeflow-admission-webhook/templates/mutating_webhook_configuration.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: MutatingWebhookConfiguration 3 | metadata: 4 | annotations: 5 | cert-manager.io/inject-ca-from: {{ .Values.kubeflow.namespace }}/admission-webhook-cert 6 | labels: 7 | app: poddefaults 8 | app.kubernetes.io/component: poddefaults 9 | app.kubernetes.io/name: poddefaults 10 | name: admission-webhook-mutating-webhook-configuration 11 | webhooks: 12 | - admissionReviewVersions: 13 | - v1beta1 14 | - v1 15 | clientConfig: 16 | caBundle: '' 17 | service: 18 | name: admission-webhook-service 19 | namespace: {{ .Values.kubeflow.namespace }} 20 | path: /apply-poddefault 21 | failurePolicy: Fail 22 | name: admission-webhook-deployment.kubeflow.org 23 | namespaceSelector: 24 | matchLabels: 25 | app.kubernetes.io/part-of: kubeflow-profile 26 | rules: 27 | - apiGroups: 28 | - '' 29 | apiVersions: 30 | - v1 31 | operations: 32 | - CREATE 33 | resources: 34 | - pods 35 | sideEffects: None 36 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-admission-webhook/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app: poddefaults 6 | app.kubernetes.io/component: poddefaults 7 | app.kubernetes.io/name: poddefaults 8 | name: admission-webhook-service 9 | namespace: {{ .Values.kubeflow.namespace }} 10 | spec: 11 | ports: 12 | - name: https-webhook 13 | port: 443 14 | targetPort: https-webhook 15 | selector: 16 | app: poddefaults 17 | app.kubernetes.io/component: poddefaults 18 | app.kubernetes.io/name: poddefaults 19 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-admission-webhook/templates/service_account.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app: poddefaults 6 | app.kubernetes.io/component: poddefaults 7 | app.kubernetes.io/name: poddefaults 8 | name: admission-webhook-service-account 9 | namespace: {{ .Values.kubeflow.namespace }} 10 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-admission-webhook/values.yaml: -------------------------------------------------------------------------------- 1 | kubeflow: 2 | namespace: kubeflow 3 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-central-dashboard/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: v1.9.2 3 | description: A Helm chart for Kubeflow central dashboard 4 | name: kubeflow-central-dashboard 5 | type: application 6 | version: 1.0.0 7 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-central-dashboard/templates/authorization_policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: security.istio.io/v1 2 | kind: AuthorizationPolicy 3 | metadata: 4 | labels: 5 | app: centraldashboard 6 | app.kubernetes.io/component: centraldashboard 7 | app.kubernetes.io/name: centraldashboard 8 | name: central-dashboard 9 | namespace: {{ .Values.kubeflow.namespace }} 10 | spec: 11 | action: ALLOW 12 | rules: 13 | - from: 14 | - source: 15 | principals: 16 | - cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }} 17 | selector: 18 | matchLabels: 19 | app: centraldashboard 20 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-central-dashboard/templates/cluster_role.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | labels: 5 | app: centraldashboard 6 | app.kubernetes.io/component: centraldashboard 7 | app.kubernetes.io/name: centraldashboard 8 | name: centraldashboard 9 | rules: 10 | - apiGroups: 11 | - '' 12 | resources: 13 | - events 14 | - namespaces 15 | - nodes 16 | verbs: 17 | - get 18 | - list 19 | - watch 20 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-central-dashboard/templates/cluster_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app: centraldashboard 6 | app.kubernetes.io/component: centraldashboard 7 | app.kubernetes.io/name: centraldashboard 8 | name: centraldashboard 9 | roleRef: 10 | apiGroup: rbac.authorization.k8s.io 11 | kind: ClusterRole 12 | name: centraldashboard 13 | subjects: 14 | - kind: ServiceAccount 15 | name: centraldashboard 16 | namespace: {{ .Values.kubeflow.namespace }} 17 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-central-dashboard/templates/role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | labels: 5 | app: centraldashboard 6 | app.kubernetes.io/component: centraldashboard 7 | app.kubernetes.io/name: centraldashboard 8 | name: centraldashboard 9 | namespace: {{ .Values.kubeflow.namespace }} 10 | rules: 11 | - apiGroups: 12 | - '' 13 | - app.k8s.io 14 | resources: 15 | - applications 16 | - pods 17 | - pods/exec 18 | - pods/log 19 | verbs: 20 | - get 21 | - list 22 | - watch 23 | - apiGroups: 24 | - '' 25 | resources: 26 | - secrets 27 | - configmaps 28 | verbs: 29 | - get 
30 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-central-dashboard/templates/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | labels: 5 | app: centraldashboard 6 | app.kubernetes.io/component: centraldashboard 7 | app.kubernetes.io/name: centraldashboard 8 | name: centraldashboard 9 | namespace: {{ .Values.kubeflow.namespace }} 10 | roleRef: 11 | apiGroup: rbac.authorization.k8s.io 12 | kind: Role 13 | name: centraldashboard 14 | subjects: 15 | - kind: ServiceAccount 16 | name: centraldashboard 17 | namespace: {{ .Values.kubeflow.namespace }} 18 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-central-dashboard/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app: centraldashboard 6 | app.kubernetes.io/component: centraldashboard 7 | app.kubernetes.io/name: centraldashboard 8 | name: centraldashboard 9 | namespace: {{ .Values.kubeflow.namespace }} 10 | spec: 11 | ports: 12 | - port: 80 13 | protocol: TCP 14 | targetPort: 8082 15 | selector: 16 | app: centraldashboard 17 | app.kubernetes.io/component: centraldashboard 18 | app.kubernetes.io/name: centraldashboard 19 | sessionAffinity: None 20 | type: ClusterIP 21 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-central-dashboard/templates/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app: centraldashboard 6 | app.kubernetes.io/component: centraldashboard 7 | app.kubernetes.io/name: centraldashboard 8 | name: centraldashboard 9 | namespace: {{ 
.Values.kubeflow.namespace }} 10 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-central-dashboard/templates/virtual_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1beta1 2 | kind: VirtualService 3 | metadata: 4 | labels: 5 | app: centraldashboard 6 | app.kubernetes.io/component: centraldashboard 7 | app.kubernetes.io/name: centraldashboard 8 | name: centraldashboard 9 | namespace: {{ .Values.kubeflow.namespace }} 10 | spec: 11 | gateways: 12 | - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 13 | hosts: 14 | - '*' 15 | http: 16 | - match: 17 | - uri: 18 | prefix: / 19 | rewrite: 20 | uri: / 21 | route: 22 | - destination: 23 | host: centraldashboard.{{ .Values.kubeflow.namespace }}.svc.cluster.local 24 | port: 25 | number: 80 26 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-central-dashboard/values.yaml: -------------------------------------------------------------------------------- 1 | kubeflow: 2 | namespace: kubeflow 3 | ingress: 4 | namespace: ingress 5 | gateway: ingress-gateway 6 | sa: istio-ingressgateway 7 | user: 8 | profile: 9 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: v0.17.0 3 | description: A Helm chart for Kubeflow Katib 4 | name: kubeflow-katib 5 | type: application 6 | version: 1.0.1 7 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/crds/experiments.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | name: 
experiments.kubeflow.org 5 | spec: 6 | group: kubeflow.org 7 | names: 8 | categories: 9 | - all 10 | - kubeflow 11 | - katib 12 | kind: Experiment 13 | plural: experiments 14 | singular: experiment 15 | scope: Namespaced 16 | versions: 17 | - additionalPrinterColumns: 18 | - jsonPath: .status.conditions[-1:].type 19 | name: Type 20 | type: string 21 | - jsonPath: .status.conditions[-1:].status 22 | name: Status 23 | type: string 24 | - jsonPath: .metadata.creationTimestamp 25 | name: Age 26 | type: date 27 | name: v1beta1 28 | schema: 29 | openAPIV3Schema: 30 | type: object 31 | x-kubernetes-preserve-unknown-fields: true 32 | served: true 33 | storage: true 34 | subresources: 35 | status: {} 36 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/crds/suggestions.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | name: suggestions.kubeflow.org 5 | spec: 6 | group: kubeflow.org 7 | names: 8 | categories: 9 | - all 10 | - kubeflow 11 | - katib 12 | kind: Suggestion 13 | plural: suggestions 14 | singular: suggestion 15 | scope: Namespaced 16 | versions: 17 | - additionalPrinterColumns: 18 | - jsonPath: .status.conditions[-1:].type 19 | name: Type 20 | type: string 21 | - jsonPath: .status.conditions[-1:].status 22 | name: Status 23 | type: string 24 | - jsonPath: .spec.requests 25 | name: Requested 26 | type: string 27 | - jsonPath: .status.suggestionCount 28 | name: Assigned 29 | type: string 30 | - jsonPath: .metadata.creationTimestamp 31 | name: Age 32 | type: date 33 | name: v1beta1 34 | schema: 35 | openAPIV3Schema: 36 | type: object 37 | x-kubernetes-preserve-unknown-fields: true 38 | served: true 39 | storage: true 40 | subresources: 41 | status: {} 42 | -------------------------------------------------------------------------------- 
/charts/ml-platform/kubeflow-katib/crds/trials.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | name: trials.kubeflow.org 5 | spec: 6 | group: kubeflow.org 7 | names: 8 | categories: 9 | - all 10 | - kubeflow 11 | - katib 12 | kind: Trial 13 | plural: trials 14 | singular: trial 15 | scope: Namespaced 16 | versions: 17 | - additionalPrinterColumns: 18 | - jsonPath: .status.conditions[-1:].type 19 | name: Type 20 | type: string 21 | - jsonPath: .status.conditions[-1:].status 22 | name: Status 23 | type: string 24 | - jsonPath: .metadata.creationTimestamp 25 | name: Age 26 | type: date 27 | name: v1beta1 28 | schema: 29 | openAPIV3Schema: 30 | type: object 31 | x-kubernetes-preserve-unknown-fields: true 32 | served: true 33 | storage: true 34 | subresources: 35 | status: {} 36 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/templates/authorization_policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: security.istio.io/v1beta1 2 | kind: AuthorizationPolicy 3 | metadata: 4 | name: katib-ui 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | action: ALLOW 8 | rules: 9 | - from: 10 | - source: 11 | principals: 12 | - cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }} 13 | selector: 14 | matchLabels: 15 | katib.kubeflow.org/component: ui 16 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/templates/certificate.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Certificate 3 | metadata: 4 | name: katib-webhook-cert 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | commonName: katib-controller.{{ .Values.kubeflow.namespace 
}}.svc 8 | dnsNames: 9 | - katib-controller.{{ .Values.kubeflow.namespace }}.svc 10 | - katib-controller.{{ .Values.kubeflow.namespace }}.svc.cluster.local 11 | isCA: true 12 | issuerRef: 13 | kind: Issuer 14 | name: katib-selfsigned-issuer 15 | secretName: katib-webhook-cert 16 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/templates/cluster_role_bindings.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: katib-controller 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: katib-controller 9 | subjects: 10 | - kind: ServiceAccount 11 | name: katib-controller 12 | namespace: {{ .Values.kubeflow.namespace }} 13 | --- 14 | apiVersion: rbac.authorization.k8s.io/v1 15 | kind: ClusterRoleBinding 16 | metadata: 17 | name: katib-ui 18 | roleRef: 19 | apiGroup: rbac.authorization.k8s.io 20 | kind: ClusterRole 21 | name: katib-ui 22 | subjects: 23 | - kind: ServiceAccount 24 | name: katib-ui 25 | namespace: {{ .Values.kubeflow.namespace }} 26 | --- 27 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/templates/issuer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Issuer 3 | metadata: 4 | name: katib-selfsigned-issuer 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | selfSigned: {} 8 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/templates/mutating_web_hook.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: MutatingWebhookConfiguration 3 | metadata: 4 | annotations: 5 | cert-manager.io/inject-ca-from: {{ 
.Values.kubeflow.namespace }}/katib-webhook-cert 6 | name: katib.kubeflow.org 7 | webhooks: 8 | - admissionReviewVersions: 9 | - v1 10 | clientConfig: 11 | caBundle: Cg== 12 | service: 13 | name: katib-controller 14 | namespace: {{ .Values.kubeflow.namespace }} 15 | path: /mutate-experiment 16 | failurePolicy: Ignore 17 | name: defaulter.experiment.katib.kubeflow.org 18 | rules: 19 | - apiGroups: 20 | - kubeflow.org 21 | apiVersions: 22 | - v1beta1 23 | operations: 24 | - CREATE 25 | - UPDATE 26 | resources: 27 | - experiments 28 | sideEffects: None 29 | - admissionReviewVersions: 30 | - v1 31 | clientConfig: 32 | caBundle: Cg== 33 | service: 34 | name: katib-controller 35 | namespace: {{ .Values.kubeflow.namespace }} 36 | path: /mutate-pod 37 | failurePolicy: Ignore 38 | name: mutator.pod.katib.kubeflow.org 39 | namespaceSelector: 40 | matchLabels: 41 | katib.kubeflow.org/metrics-collector-injection: enabled 42 | rules: 43 | - apiGroups: 44 | - '' 45 | apiVersions: 46 | - v1 47 | operations: 48 | - CREATE 49 | resources: 50 | - pods 51 | sideEffects: None 52 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/templates/service_accounts.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: katib-controller 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | --- 7 | apiVersion: v1 8 | kind: ServiceAccount 9 | metadata: 10 | name: katib-ui 11 | namespace: {{ .Values.kubeflow.namespace }} 12 | --- 13 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/templates/validating_web_hook.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingWebhookConfiguration 3 | metadata: 4 | annotations: 5 | cert-manager.io/inject-ca-from: {{ 
.Values.kubeflow.namespace }}/katib-webhook-cert 6 | name: katib.kubeflow.org 7 | webhooks: 8 | - admissionReviewVersions: 9 | - v1 10 | clientConfig: 11 | caBundle: Cg== 12 | service: 13 | name: katib-controller 14 | namespace: {{ .Values.kubeflow.namespace }} 15 | path: /validate-experiment 16 | failurePolicy: Ignore 17 | name: validator.experiment.katib.kubeflow.org 18 | rules: 19 | - apiGroups: 20 | - kubeflow.org 21 | apiVersions: 22 | - v1beta1 23 | operations: 24 | - CREATE 25 | - UPDATE 26 | resources: 27 | - experiments 28 | sideEffects: None 29 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/templates/virtual_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1alpha3 2 | kind: VirtualService 3 | metadata: 4 | name: katib-ui 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | gateways: 8 | - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 9 | hosts: 10 | - '*' 11 | http: 12 | - match: 13 | - uri: 14 | prefix: /katib/ 15 | rewrite: 16 | uri: /katib/ 17 | route: 18 | - destination: 19 | host: katib-ui.{{ .Values.kubeflow.namespace }}.svc.cluster.local 20 | port: 21 | number: 80 22 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-katib/values.yaml: -------------------------------------------------------------------------------- 1 | kubeflow: 2 | namespace: kubeflow 3 | ingress: 4 | namespace: ingress 5 | gateway: ingress-gateway 6 | sa: istio-ingressgateway -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: v1.9.2 3 | description: A Helm chart for Kubeflow notebooks 4 | name: kubeflow-notebooks 5 | type: application 6 | version: 
1.0.0 7 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/controller/cluster_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | control-plane: notebook-controller 6 | name: notebook-controller-role-binding 7 | roleRef: 8 | apiGroup: rbac.authorization.k8s.io 9 | kind: ClusterRole 10 | name: notebook-controller-role 11 | subjects: 12 | - kind: ServiceAccount 13 | name: notebook-controller-service-account 14 | namespace: {{ .Values.kubeflow.namespace }} 15 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/controller/config_map.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | annotations: {} 5 | name: notebook-controller 6 | namespace: {{ .Values.kubeflow.namespace }} 7 | data: 8 | CLUSTER_DOMAIN: cluster.local 9 | CULL_IDLE_TIME: '{{ .Values.cullingPolicy.cullIdleTime }}' 10 | ENABLE_CULLING: '{{ .Values.cullingPolicy.enableCulling }}' 11 | IDLENESS_CHECK_PERIOD: '{{ .Values.cullingPolicy.idlenessCheckPeriod }}' 12 | ISTIO_GATEWAY: {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 13 | USE_ISTIO: 'true' -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/controller/role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: notebook-controller-leader-election-role 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | rules: 7 | - apiGroups: 8 | - '' 9 | resources: 10 | - configmaps 11 | verbs: 12 | - get 13 | - list 14 | - watch 15 | - create 16 | - update 17 | - 
patch 18 | - delete 19 | - apiGroups: 20 | - '' 21 | resources: 22 | - configmaps/status 23 | verbs: 24 | - get 25 | - update 26 | - patch 27 | - apiGroups: 28 | - '' 29 | resources: 30 | - events 31 | verbs: 32 | - create 33 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/controller/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: notebook-controller-leader-election-rolebinding 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: Role 9 | name: notebook-controller-leader-election-role 10 | subjects: 11 | - kind: ServiceAccount 12 | name: notebook-controller-service-account 13 | namespace: {{ .Values.kubeflow.namespace }} 14 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/controller/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: notebook-controller-service 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | ports: 8 | - port: 443 9 | name: https 10 | selector: 11 | control-plane: notebook-controller 12 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/controller/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: notebook-controller-service-account 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/webapp/authorization_policy.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: security.istio.io/v1 2 | kind: AuthorizationPolicy 3 | metadata: 4 | name: kubeflow-notebooks-webapp 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | action: ALLOW 8 | rules: 9 | - from: 10 | - source: 11 | principals: 12 | - cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }} 13 | selector: 14 | matchLabels: 15 | app: kubeflow-notebooks-webapp 16 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/webapp/cluster_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: kubeflow-notebooks-webapp-cluster-role-binding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: kubeflow-notebooks-webapp-cluster-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: kubeflow-notebooks-webapp-service-account 12 | namespace: {{ .Values.kubeflow.namespace }} 13 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/webapp/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | labels: 5 | app: kubeflow-notebooks-webapp 6 | name: kubeflow-notebooks-webapp-deployment 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: kubeflow-notebooks-webapp 13 | template: 14 | metadata: 15 | labels: 16 | app: kubeflow-notebooks-webapp 17 | spec: 18 | containers: 19 | - env: 20 | - name: APP_PREFIX 21 | value: /jupyter 22 | - name: UI 23 | value: default 24 | - name: USERID_HEADER 25 | value: x-auth-request-email 26 | - name: USERID_PREFIX 27 | value: '' 28 | - name: 
APP_SECURE_COOKIES 29 | value: 'true' 30 | image: docker.io/kubeflownotebookswg/jupyter-web-app:v1.9.2 31 | name: kubeflow-notebooks-webapp 32 | ports: 33 | - containerPort: 5000 34 | volumeMounts: 35 | - mountPath: /etc/config 36 | name: config-volume 37 | - mountPath: /src/apps/default/static/assets/logos 38 | name: logos-volume 39 | serviceAccountName: kubeflow-notebooks-webapp-service-account 40 | volumes: 41 | - configMap: 42 | name: kubeflow-notebooks-webapp-config 43 | name: config-volume 44 | - configMap: 45 | name: kubeflow-notebooks-webapp-logos 46 | name: logos-volume 47 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/webapp/destination_rule.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1alpha3 2 | kind: DestinationRule 3 | metadata: 4 | name: kubeflow-notebooks-webapp 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | host: kubeflow-notebooks-webapp-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local 8 | trafficPolicy: 9 | tls: 10 | mode: ISTIO_MUTUAL 11 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/webapp/role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: kubeflow-notebooks-webapp-jupyter-notebook-role 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | rules: 7 | - apiGroups: 8 | - authorization.k8s.io 9 | resources: 10 | - subjectaccessreviews 11 | verbs: 12 | - create 13 | - apiGroups: 14 | - kubeflow.org 15 | resources: 16 | - notebooks 17 | - notebooks/finalizers 18 | - poddefaults 19 | verbs: 20 | - get 21 | - list 22 | - create 23 | - delete 24 | - patch 25 | - update 26 | - apiGroups: 27 | - '' 28 | resources: 29 | - persistentvolumeclaims 30 | verbs: 31 | - 
create 32 | - delete 33 | - get 34 | - list 35 | - apiGroups: 36 | - '' 37 | resources: 38 | - events 39 | - nodes 40 | verbs: 41 | - list 42 | - apiGroups: 43 | - storage.k8s.io 44 | resources: 45 | - storageclasses 46 | verbs: 47 | - get 48 | - list 49 | - watch 50 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/webapp/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | labels: 5 | name: kubeflow-notebooks-webapp-jupyter-notebook-role-binding 6 | namespace: {{ .Values.kubeflow.namespace }} 7 | roleRef: 8 | apiGroup: rbac.authorization.k8s.io 9 | kind: Role 10 | name: kubeflow-notebooks-webapp-jupyter-notebook-role 11 | subjects: 12 | - kind: ServiceAccount 13 | name: jupyter-notebook 14 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/webapp/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app: kubeflow-notebooks-webapp 6 | run: kubeflow-notebooks-webapp 7 | name: kubeflow-notebooks-webapp-service 8 | namespace: {{ .Values.kubeflow.namespace }} 9 | spec: 10 | ports: 11 | - name: http 12 | port: 80 13 | protocol: TCP 14 | targetPort: 5000 15 | selector: 16 | app: kubeflow-notebooks-webapp 17 | type: ClusterIP 18 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/webapp/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: kubeflow-notebooks-webapp-service-account 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | 
-------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/templates/webapp/virtual_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1beta1 2 | kind: VirtualService 3 | metadata: 4 | name: kubeflow-notebooks-webapp 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | gateways: 8 | - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 9 | hosts: 10 | - '*' 11 | http: 12 | - headers: 13 | request: 14 | add: 15 | x-forwarded-prefix: /jupyter 16 | match: 17 | - uri: 18 | prefix: /jupyter/ 19 | rewrite: 20 | uri: / 21 | route: 22 | - destination: 23 | host: kubeflow-notebooks-webapp-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local 24 | port: 25 | number: 80 26 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-notebooks/values.yaml: -------------------------------------------------------------------------------- 1 | kubeflow: 2 | namespace: kubeflow 3 | ingress: 4 | namespace: ingress 5 | gateway: ingress-gateway 6 | sa: istio-ingressgateway 7 | cullingPolicy: 8 | enableCulling: false 9 | cullIdleTime: 30 10 | idlenessCheckPeriod: 5 -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: 2.0.1 3 | description: A Helm chart for Kubeflow pipelines 4 | name: kubeflow-pipelines 5 | type: application 6 | version: 1.0.0 7 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/crds/clusterworkflowtemplates.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | labels: 5 | application-crd-id:
kubeflow-pipelines 6 | name: clusterworkflowtemplates.argoproj.io 7 | spec: 8 | group: argoproj.io 9 | names: 10 | kind: ClusterWorkflowTemplate 11 | listKind: ClusterWorkflowTemplateList 12 | plural: clusterworkflowtemplates 13 | shortNames: 14 | - clusterwftmpl 15 | - cwft 16 | singular: clusterworkflowtemplate 17 | scope: Cluster 18 | versions: 19 | - name: v1alpha1 20 | schema: 21 | openAPIV3Schema: 22 | properties: 23 | apiVersion: 24 | type: string 25 | kind: 26 | type: string 27 | metadata: 28 | type: object 29 | spec: 30 | type: object 31 | x-kubernetes-map-type: atomic 32 | x-kubernetes-preserve-unknown-fields: true 33 | required: 34 | - metadata 35 | - spec 36 | type: object 37 | served: true 38 | storage: true 39 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/crds/cronworkflows.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | labels: 5 | application-crd-id: kubeflow-pipelines 6 | name: cronworkflows.argoproj.io 7 | spec: 8 | group: argoproj.io 9 | names: 10 | kind: CronWorkflow 11 | listKind: CronWorkflowList 12 | plural: cronworkflows 13 | shortNames: 14 | - cwf 15 | - cronwf 16 | singular: cronworkflow 17 | scope: Namespaced 18 | versions: 19 | - name: v1alpha1 20 | schema: 21 | openAPIV3Schema: 22 | properties: 23 | apiVersion: 24 | type: string 25 | kind: 26 | type: string 27 | metadata: 28 | type: object 29 | spec: 30 | type: object 31 | x-kubernetes-map-type: atomic 32 | x-kubernetes-preserve-unknown-fields: true 33 | status: 34 | type: object 35 | x-kubernetes-map-type: atomic 36 | x-kubernetes-preserve-unknown-fields: true 37 | required: 38 | - metadata 39 | - spec 40 | type: object 41 | served: true 42 | storage: true 43 | -------------------------------------------------------------------------------- 
/charts/ml-platform/kubeflow-pipelines/crds/scheduledworkflows.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | labels: 5 | app.kubernetes.io/component: ml-pipeline 6 | app.kubernetes.io/name: kubeflow-pipelines 7 | application-crd-id: kubeflow-pipelines 8 | name: scheduledworkflows.kubeflow.org 9 | spec: 10 | group: kubeflow.org 11 | names: 12 | kind: ScheduledWorkflow 13 | listKind: ScheduledWorkflowList 14 | plural: scheduledworkflows 15 | shortNames: 16 | - swf 17 | singular: scheduledworkflow 18 | scope: Namespaced 19 | versions: 20 | - name: v1beta1 21 | schema: 22 | openAPIV3Schema: 23 | properties: 24 | apiVersion: 25 | type: string 26 | kind: 27 | type: string 28 | metadata: 29 | type: object 30 | spec: 31 | type: object 32 | x-kubernetes-map-type: atomic 33 | x-kubernetes-preserve-unknown-fields: true 34 | status: 35 | type: object 36 | x-kubernetes-map-type: atomic 37 | x-kubernetes-preserve-unknown-fields: true 38 | required: 39 | - spec 40 | - status 41 | type: object 42 | served: true 43 | storage: true 44 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/crds/viewers.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | labels: 5 | app.kubernetes.io/component: ml-pipeline 6 | app.kubernetes.io/name: kubeflow-pipelines 7 | application-crd-id: kubeflow-pipelines 8 | name: viewers.kubeflow.org 9 | spec: 10 | group: kubeflow.org 11 | names: 12 | kind: Viewer 13 | listKind: ViewerList 14 | plural: viewers 15 | shortNames: 16 | - vi 17 | singular: viewer 18 | scope: Namespaced 19 | versions: 20 | - name: v1beta1 21 | schema: 22 | openAPIV3Schema: 23 | properties: 24 | apiVersion: 25 | type: string 26 | kind: 27 | type: string 28 | 
metadata: 29 | type: object 30 | spec: 31 | type: object 32 | x-kubernetes-map-type: atomic 33 | x-kubernetes-preserve-unknown-fields: true 34 | required: 35 | - spec 36 | type: object 37 | served: true 38 | storage: true 39 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/crds/workfloweventbindings.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | labels: 5 | application-crd-id: kubeflow-pipelines 6 | name: workfloweventbindings.argoproj.io 7 | spec: 8 | group: argoproj.io 9 | names: 10 | kind: WorkflowEventBinding 11 | listKind: WorkflowEventBindingList 12 | plural: workfloweventbindings 13 | shortNames: 14 | - wfeb 15 | singular: workfloweventbinding 16 | scope: Namespaced 17 | versions: 18 | - name: v1alpha1 19 | schema: 20 | openAPIV3Schema: 21 | properties: 22 | apiVersion: 23 | type: string 24 | kind: 25 | type: string 26 | metadata: 27 | type: object 28 | spec: 29 | type: object 30 | x-kubernetes-map-type: atomic 31 | x-kubernetes-preserve-unknown-fields: true 32 | required: 33 | - metadata 34 | - spec 35 | type: object 36 | served: true 37 | storage: true 38 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/crds/workflows.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | labels: 5 | application-crd-id: kubeflow-pipelines 6 | name: workflows.argoproj.io 7 | spec: 8 | group: argoproj.io 9 | names: 10 | kind: Workflow 11 | listKind: WorkflowList 12 | plural: workflows 13 | shortNames: 14 | - wf 15 | singular: workflow 16 | scope: Namespaced 17 | versions: 18 | - additionalPrinterColumns: 19 | - description: Status of the workflow 20 | jsonPath: .status.phase 21 
| name: Status 22 | type: string 23 | - description: When the workflow was started 24 | format: date-time 25 | jsonPath: .status.startedAt 26 | name: Age 27 | type: date 28 | name: v1alpha1 29 | schema: 30 | openAPIV3Schema: 31 | properties: 32 | apiVersion: 33 | type: string 34 | kind: 35 | type: string 36 | metadata: 37 | type: object 38 | spec: 39 | type: object 40 | x-kubernetes-map-type: atomic 41 | x-kubernetes-preserve-unknown-fields: true 42 | status: 43 | type: object 44 | x-kubernetes-map-type: atomic 45 | x-kubernetes-preserve-unknown-fields: true 46 | required: 47 | - metadata 48 | - spec 49 | type: object 50 | served: true 51 | storage: true 52 | subresources: {} 53 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/crds/workflowtasksets.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | labels: 5 | application-crd-id: kubeflow-pipelines 6 | name: workflowtasksets.argoproj.io 7 | spec: 8 | group: argoproj.io 9 | names: 10 | kind: WorkflowTaskSet 11 | listKind: WorkflowTaskSetList 12 | plural: workflowtasksets 13 | shortNames: 14 | - wfts 15 | singular: workflowtaskset 16 | scope: Namespaced 17 | versions: 18 | - name: v1alpha1 19 | schema: 20 | openAPIV3Schema: 21 | properties: 22 | apiVersion: 23 | type: string 24 | kind: 25 | type: string 26 | metadata: 27 | type: object 28 | spec: 29 | type: object 30 | x-kubernetes-map-type: atomic 31 | x-kubernetes-preserve-unknown-fields: true 32 | status: 33 | type: object 34 | x-kubernetes-map-type: atomic 35 | x-kubernetes-preserve-unknown-fields: true 36 | required: 37 | - metadata 38 | - spec 39 | type: object 40 | served: true 41 | storage: true 42 | subresources: 43 | status: {} 44 | -------------------------------------------------------------------------------- 
/charts/ml-platform/kubeflow-pipelines/crds/workflowtemplates.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | labels: 5 | application-crd-id: kubeflow-pipelines 6 | name: workflowtemplates.argoproj.io 7 | spec: 8 | group: argoproj.io 9 | names: 10 | kind: WorkflowTemplate 11 | listKind: WorkflowTemplateList 12 | plural: workflowtemplates 13 | shortNames: 14 | - wftmpl 15 | singular: workflowtemplate 16 | scope: Namespaced 17 | versions: 18 | - name: v1alpha1 19 | schema: 20 | openAPIV3Schema: 21 | properties: 22 | apiVersion: 23 | type: string 24 | kind: 25 | type: string 26 | metadata: 27 | type: object 28 | spec: 29 | type: object 30 | x-kubernetes-map-type: atomic 31 | x-kubernetes-preserve-unknown-fields: true 32 | required: 33 | - metadata 34 | - spec 35 | type: object 36 | served: true 37 | storage: true 38 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/templates/certficate.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Certificate 3 | metadata: 4 | labels: 5 | app: cache-server-cert-manager 6 | name: kfp-cache-cert 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | spec: 9 | commonName: kfp-cache-cert 10 | dnsNames: 11 | - cache-server 12 | - cache-server.{{ .Values.kubeflow.namespace }} 13 | - cache-server.{{ .Values.kubeflow.namespace }}.svc 14 | isCA: true 15 | issuerRef: 16 | kind: Issuer 17 | name: kfp-cache-selfsigned-issuer 18 | secretName: webhook-server-cert 19 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/templates/composite_controller.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: metacontroller.k8s.io/v1alpha1 2 | kind: 
CompositeController 3 | metadata: 4 | labels: 5 | app: kubeflow-pipelines-profile-controller 6 | app.kubernetes.io/component: ml-pipeline 7 | app.kubernetes.io/name: kubeflow-pipelines 8 | application-crd-id: kubeflow-pipelines 9 | name: kubeflow-pipelines-profile-controller 10 | namespace: {{ .Values.kubeflow.namespace }} 11 | spec: 12 | childResources: 13 | - apiVersion: v1 14 | resource: secrets 15 | updateStrategy: 16 | method: OnDelete 17 | - apiVersion: v1 18 | resource: configmaps 19 | updateStrategy: 20 | method: OnDelete 21 | - apiVersion: apps/v1 22 | resource: deployments 23 | updateStrategy: 24 | method: InPlace 25 | - apiVersion: v1 26 | resource: services 27 | updateStrategy: 28 | method: InPlace 29 | - apiVersion: networking.istio.io/v1alpha3 30 | resource: destinationrules 31 | updateStrategy: 32 | method: InPlace 33 | - apiVersion: security.istio.io/v1beta1 34 | resource: authorizationpolicies 35 | updateStrategy: 36 | method: InPlace 37 | generateSelector: true 38 | hooks: 39 | sync: 40 | webhook: 41 | url: http://kubeflow-pipelines-profile-controller/sync 42 | parentResource: 43 | apiVersion: v1 44 | resource: namespaces 45 | resyncPeriodSeconds: 3600 46 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/templates/issuer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Issuer 3 | metadata: 4 | labels: 5 | app: cache-server-cert-manager 6 | name: kfp-cache-selfsigned-issuer 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | spec: 9 | selfSigned: {} 10 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/templates/mutating_webhook_configuration.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: MutatingWebhookConfiguration 3 | metadata: 
4 | annotations: 5 | cert-manager.io/inject-ca-from: {{ .Values.kubeflow.namespace }}/kfp-cache-cert 6 | labels: 7 | app: cache-server-cert-manager 8 | name: cache-webhook-kubeflow 9 | webhooks: 10 | - admissionReviewVersions: 11 | - v1beta1 12 | clientConfig: 13 | service: 14 | name: cache-server 15 | namespace: {{ .Values.kubeflow.namespace }} 16 | path: /mutate 17 | failurePolicy: Ignore 18 | name: cache-server.{{ .Values.kubeflow.namespace }}.svc 19 | objectSelector: 20 | matchLabels: 21 | pipelines.kubeflow.org/cache_enabled: 'true' 22 | rules: 23 | - apiGroups: 24 | - '' 25 | apiVersions: 26 | - v1 27 | operations: 28 | - CREATE 29 | resources: 30 | - pods 31 | sideEffects: None 32 | timeoutSeconds: 5 33 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/templates/priority_class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: scheduling.k8s.io/v1 2 | kind: PriorityClass 3 | metadata: 4 | labels: 5 | application-crd-id: kubeflow-pipelines 6 | name: workflow-controller 7 | value: 1000000 8 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/templates/secrets.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | labels: 5 | application-crd-id: kubeflow-pipelines 6 | name: mlpipeline-minio-artifact 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | stringData: 9 | accesskey: {{ .Values.minio.access_key }} 10 | secretkey: {{ .Values.minio.secret_key }} 11 | --- 12 | apiVersion: v1 13 | kind: Secret 14 | metadata: 15 | labels: 16 | app.kubernetes.io/component: ml-pipeline 17 | app.kubernetes.io/name: kubeflow-pipelines 18 | application-crd-id: kubeflow-pipelines 19 | name: mysql-secret 20 | namespace: {{ .Values.kubeflow.namespace }} 21 | stringData: 22 | username: root 23 | password: 
"" 24 | --- 25 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/templates/virtual_services.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: networking.istio.io/v1alpha3 3 | kind: VirtualService 4 | metadata: 5 | labels: 6 | application-crd-id: kubeflow-pipelines 7 | name: metadata-grpc 8 | namespace: {{ .Values.kubeflow.namespace }} 9 | spec: 10 | gateways: 11 | - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 12 | hosts: 13 | - '*' 14 | http: 15 | - match: 16 | - uri: 17 | prefix: /ml_metadata 18 | rewrite: 19 | uri: /ml_metadata 20 | route: 21 | - destination: 22 | host: metadata-envoy-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local 23 | port: 24 | number: 9090 25 | --- 26 | apiVersion: networking.istio.io/v1alpha3 27 | kind: VirtualService 28 | metadata: 29 | labels: 30 | app.kubernetes.io/component: ml-pipeline 31 | app.kubernetes.io/name: kubeflow-pipelines 32 | application-crd-id: kubeflow-pipelines 33 | name: ml-pipeline-ui 34 | namespace: {{ .Values.kubeflow.namespace }} 35 | spec: 36 | gateways: 37 | - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 38 | hosts: 39 | - '*' 40 | http: 41 | - match: 42 | - uri: 43 | prefix: /pipeline 44 | rewrite: 45 | uri: /pipeline 46 | route: 47 | - destination: 48 | host: ml-pipeline-ui.{{ .Values.kubeflow.namespace }}.svc.cluster.local 49 | port: 50 | number: 80 51 | timeout: 300s 52 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-pipelines/values.yaml: -------------------------------------------------------------------------------- 1 | kubeflow: 2 | namespace: kubeflow 3 | ingress: 4 | namespace: ingress 5 | gateway: ingress-gateway 6 | sa: 7 | minio: 8 | access_key: 9 | secret_key: -------------------------------------------------------------------------------- 
/charts/ml-platform/kubeflow-profiles-and-kfam/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: v1.9.2 3 | description: A Helm chart for Kubeflow Profiles and Access Management 4 | name: kubeflow-profiles-and-kfam 5 | type: application 6 | version: 1.0.0 7 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-profiles-and-kfam/templates/cluster_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | control-plane: profiles 6 | name: profiles-cluster-rolebinding 7 | roleRef: 8 | apiGroup: rbac.authorization.k8s.io 9 | kind: ClusterRole 10 | name: cluster-admin 11 | subjects: 12 | - kind: ServiceAccount 13 | name: profiles-controller-service-account 14 | namespace: {{ .Values.kubeflow.namespace }} 15 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-profiles-and-kfam/templates/config-maps/namespace_labels_data.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | namespace-labels.yaml: '# Below is a list of labels to be set by default. 4 | 5 | # 6 | 7 | # To add a namespace label, use `key: ''value''`, for example: 8 | 9 | # istio.io/rev: ''asm-191-1'' 10 | 11 | # 12 | 13 | # To remove a namespace label, use `key: ''''`. For example: 14 | 15 | # istio-injection: '''' 16 | 17 | # 18 | 19 | # Profile controller will not replace a namespace label if its key already 20 | 21 | # exists. If you want to override the value of a previously applied label, you 22 | 23 | # need to: 24 | 25 | # 1. Remove the label by using `key: ''''` and deploy. 26 | 27 | # 2. Add the label by using `key: ''value''` and deploy. 
28 | 29 | # 30 | 31 | katib.kubeflow.org/metrics-collector-injection: "enabled" 32 | 33 | serving.kubeflow.org/inferenceservice: "enabled" 34 | 35 | pipelines.kubeflow.org/enabled: "true" 36 | 37 | app.kubernetes.io/part-of: "kubeflow-profile" 38 | 39 | ' 40 | kind: ConfigMap 41 | metadata: 42 | labels: 43 | control-plane: profiles 44 | name: namespace-labels-data 45 | namespace: {{ .Values.kubeflow.namespace }} 46 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-profiles-and-kfam/templates/config-maps/profiles_config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | ADMIN: '' 4 | USERID_HEADER: x-auth-request-email 5 | USERID_PREFIX: '' 6 | WORKLOAD_IDENTITY: '' 7 | ISTIO_INGRESS_GATEWAY_PRINCIPAL: "cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }}" 8 | NOTEBOOK_CONTROLLER_PRINCIPAL: "cluster.local/ns/{{ .Values.kubeflow.namespace }}/sa/{{ .Values.notebook_controller.sa }}" 9 | KFP_UI_PRINCIPAL: "cluster.local/ns/{{ .Values.kubeflow.namespace }}/sa/{{ .Values.pipeline_ui.sa }}" 10 | kind: ConfigMap 11 | metadata: 12 | labels: 13 | control-plane: profiles 14 | name: profiles-config 15 | namespace: {{ .Values.kubeflow.namespace }} 16 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-profiles-and-kfam/templates/role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | labels: 5 | control-plane: profiles 6 | name: profiles-leader-election-role 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | rules: 9 | - apiGroups: 10 | - '' 11 | resources: 12 | - configmaps 13 | verbs: 14 | - get 15 | - list 16 | - watch 17 | - create 18 | - update 19 | - patch 20 | - delete 21 | - apiGroups: 22 | - '' 23 | resources: 24 | - configmaps/status 25 | 
verbs: 26 | - get 27 | - update 28 | - patch 29 | - apiGroups: 30 | - '' 31 | resources: 32 | - events 33 | verbs: 34 | - create 35 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-profiles-and-kfam/templates/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | labels: 5 | control-plane: profiles 6 | name: profiles-leader-election-rolebinding 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: Role 11 | name: profiles-leader-election-role 12 | subjects: 13 | - kind: ServiceAccount 14 | name: profiles-controller-service-account 15 | namespace: {{ .Values.kubeflow.namespace }} 16 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-profiles-and-kfam/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: profiles 6 | name: profiles-kfam 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | spec: 9 | ports: 10 | - port: 8081 11 | selector: 12 | control-plane: profiles 13 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-profiles-and-kfam/templates/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | control-plane: profiles 6 | annotations: 7 | eks.amazonaws.com/role-arn: {{ .Values.profile_controller.role_arn }} 8 | name: profiles-controller-service-account 9 | namespace: {{ .Values.kubeflow.namespace }} 10 | -------------------------------------------------------------------------------- 
/charts/ml-platform/kubeflow-profiles-and-kfam/templates/virtual_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1alpha3 2 | kind: VirtualService 3 | metadata: 4 | labels: 5 | control-plane: profiles 6 | name: profiles-kfam 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | spec: 9 | gateways: 10 | - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 11 | hosts: 12 | - '*' 13 | http: 14 | - headers: 15 | request: 16 | add: 17 | x-forwarded-prefix: /kfam 18 | match: 19 | - uri: 20 | prefix: /kfam/ 21 | rewrite: 22 | uri: /kfam/ 23 | route: 24 | - destination: 25 | host: profiles-kfam.{{ .Values.kubeflow.namespace }}.svc.cluster.local 26 | port: 27 | number: 8081 28 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-profiles-and-kfam/values.yaml: -------------------------------------------------------------------------------- 1 | profile_controller: 2 | role_arn: 3 | kubeflow: 4 | namespace: kubeflow 5 | ingress: 6 | namespace: ingress 7 | gateway: ingress-gateway 8 | sa: istio-ingressgateway 9 | notebook_controller: 10 | sa: notebook-controller-service-account 11 | pipeline_ui: 12 | sa: ml-pipeline-ui 13 | 14 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-roles/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: kubeflow-roles 3 | description: A Helm chart for Kubeflow roles 4 | type: application 5 | version: 1.0.0 6 | appVersion: "v1.9.2" 7 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: v1.9.2 3 | description: A Helm chart for Kubeflow tensorboards 4 | name: 
kubeflow-tensorboards 5 | type: application 6 | version: 1.0.0 7 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/controller/cluster_role_binding.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: tensorboard-controller-manager-rolebinding 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: ClusterRole 9 | name: tensorboard-controller-manager-role 10 | subjects: 11 | - kind: ServiceAccount 12 | name: tensorboard-controller-manager 13 | namespace: {{ .Values.kubeflow.namespace }} 14 | 15 | --- 16 | 17 | apiVersion: rbac.authorization.k8s.io/v1 18 | kind: ClusterRoleBinding 19 | metadata: 20 | name: tensorboard-controller-proxy-rolebinding 21 | roleRef: 22 | apiGroup: rbac.authorization.k8s.io 23 | kind: ClusterRole 24 | name: tensorboard-controller-proxy-role 25 | subjects: 26 | - kind: ServiceAccount 27 | name: tensorboard-controller-manager 28 | namespace: {{ .Values.kubeflow.namespace }} 29 | --- -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/controller/config_map.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | ISTIO_GATEWAY: {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 4 | RWO_PVC_SCHEDULING: 'True' 5 | TENSORBOARD_IMAGE: tensorflow/tensorflow:2.12.0 6 | kind: ConfigMap 7 | metadata: 8 | name: tensorboard-controller-config 9 | namespace: {{ .Values.kubeflow.namespace }} 10 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/controller/role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | 
kind: Role 3 | metadata: 4 | name: tensorboard-controller-leader-election-role 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | rules: 7 | - apiGroups: 8 | - '' 9 | resources: 10 | - configmaps 11 | verbs: 12 | - get 13 | - list 14 | - watch 15 | - create 16 | - update 17 | - patch 18 | - delete 19 | - apiGroups: 20 | - coordination.k8s.io 21 | resources: 22 | - leases 23 | verbs: 24 | - get 25 | - list 26 | - watch 27 | - create 28 | - update 29 | - patch 30 | - delete 31 | - apiGroups: 32 | - '' 33 | resources: 34 | - events 35 | verbs: 36 | - create 37 | - patch 38 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/controller/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: tensorboard-controller-leader-election-rolebinding 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: Role 9 | name: tensorboard-controller-leader-election-role 10 | subjects: 11 | - kind: ServiceAccount 12 | name: tensorboard-controller-manager 13 | namespace: {{ .Values.kubeflow.namespace }} 14 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/controller/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app: tensorboard-controller 6 | control-plane: tensorboard-controller-manager 7 | name: tensorboard-controller-manager-metrics-service 8 | namespace: {{ .Values.kubeflow.namespace }} 9 | spec: 10 | ports: 11 | - name: https 12 | port: 8443 13 | protocol: TCP 14 | targetPort: https 15 | selector: 16 | app: tensorboard-controller 17 | control-plane: tensorboard-controller-manager 18 | 
-------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/controller/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: tensorboard-controller-manager 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/webapp/authorization_policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: security.istio.io/v1 2 | kind: AuthorizationPolicy 3 | metadata: 4 | name: tensorboards-web-app 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | action: ALLOW 8 | rules: 9 | - from: 10 | - source: 11 | principals: 12 | - cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }} 13 | selector: 14 | matchLabels: 15 | app: tensorboards-web-app 16 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/webapp/cluster_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: tensorboards-web-app-cluster-role-binding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: tensorboards-web-app-cluster-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: tensorboards-web-app-service-account 12 | namespace: {{ .Values.kubeflow.namespace }} 13 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/webapp/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | 
labels: 5 | app: tensorboards-web-app 6 | name: tensorboards-web-app-deployment 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: tensorboards-web-app 13 | template: 14 | metadata: 15 | labels: 16 | app: tensorboards-web-app 17 | spec: 18 | containers: 19 | - env: 20 | - name: APP_PREFIX 21 | value: /tensorboards 22 | - name: USERID_HEADER 23 | value: x-auth-request-email 24 | - name: USERID_PREFIX 25 | value: '' 26 | - name: APP_SECURE_COOKIES 27 | value: 'true' 28 | image: docker.io/kubeflownotebookswg/tensorboards-web-app:v1.9.2 29 | name: tensorboards-web-app 30 | ports: 31 | - containerPort: 5000 32 | serviceAccountName: tensorboards-web-app-service-account 33 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/webapp/destination_rule.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1alpha3 2 | kind: DestinationRule 3 | metadata: 4 | name: tensorboards-web-app 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | host: tensorboards-web-app-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local 8 | trafficPolicy: 9 | tls: 10 | mode: ISTIO_MUTUAL 11 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/webapp/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app: tensorboards-web-app 6 | name: tensorboards-web-app-service 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | spec: 9 | ports: 10 | - name: http 11 | port: 80 12 | protocol: TCP 13 | targetPort: 5000 14 | selector: 15 | app: tensorboards-web-app 16 | type: ClusterIP 17 | --------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/webapp/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: tensorboards-web-app-service-account 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/templates/webapp/virtual_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1beta1 2 | kind: VirtualService 3 | metadata: 4 | name: tensorboards-web-app-tensorboards-web-app 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | gateways: 8 | - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 9 | hosts: 10 | - '*' 11 | http: 12 | - headers: 13 | request: 14 | add: 15 | x-forwarded-prefix: /tensorboards 16 | match: 17 | - uri: 18 | prefix: /tensorboards/ 19 | rewrite: 20 | uri: / 21 | route: 22 | - destination: 23 | host: tensorboards-web-app-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local 24 | port: 25 | number: 80 26 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-tensorboards/values.yaml: -------------------------------------------------------------------------------- 1 | kubeflow: 2 | namespace: kubeflow 3 | ingress: 4 | namespace: ingress 5 | gateway: ingress-gateway 6 | sa: istio-ingressgateway 7 | 8 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-training-operator/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "v1.9.2" 3 | description: A Helm chart for Kubeflow training-operator 4 | name: kubeflow-training-operator 5 | version: 1.0.0 6 | 
-------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-training-operator/templates/cluster_role_binding.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | labels: 6 | app: training-operator 7 | name: training-operator 8 | namespace: {{ .Values.namespace }} 9 | roleRef: 10 | apiGroup: rbac.authorization.k8s.io 11 | kind: ClusterRole 12 | name: training-operator 13 | subjects: 14 | - kind: ServiceAccount 15 | name: training-operator 16 | namespace: {{ .Values.namespace }} -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-training-operator/templates/service.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | annotations: 6 | prometheus.io/path: /metrics 7 | prometheus.io/scrape: "true" 8 | prometheus.io/port: "8080" 9 | labels: 10 | app: training-operator 11 | name: training-operator 12 | namespace: {{ .Values.namespace }} 13 | spec: 14 | ports: 15 | - name: monitoring-port 16 | port: 8080 17 | targetPort: 8080 18 | selector: 19 | control-plane: kubeflow-training-operator 20 | type: ClusterIP 21 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-training-operator/templates/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app: training-operator 6 | name: training-operator 7 | namespace: {{ .Values.namespace }} 8 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-training-operator/values.yaml: -------------------------------------------------------------------------------- 1 | 
namespace: kubeflow -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-user-profile-defaults/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: v1.9.2 3 | description: A Helm chart for Kubeflow user Pod Defaults 4 | name: kubeflow-user-defaults 5 | type: application 6 | version: 1.0.7 7 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-user-profile-defaults/templates/pod_default.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1alpha1 2 | kind: PodDefault 3 | metadata: 4 | name: access-ml-pipeline 5 | namespace: {{ .Values.user.profile }} 6 | spec: 7 | desc: Allow access to Kubeflow Pipelines 8 | selector: 9 | matchLabels: 10 | access-ml-pipeline: "true" 11 | env: 12 | - name: KF_PIPELINES_SA_TOKEN_PATH 13 | value: /var/run/secrets/kubeflow/pipelines/token 14 | volumes: 15 | - name: volume-kf-pipeline-token 16 | projected: 17 | sources: 18 | - serviceAccountToken: 19 | path: token 20 | expirationSeconds: 7200 21 | ## defined by the `TOKEN_REVIEW_AUDIENCE` environment variable on the `ml-pipeline` deployment 22 | audience: pipelines.kubeflow.org 23 | volumeMounts: 24 | - mountPath: /var/run/secrets/kubeflow/pipelines 25 | name: volume-kf-pipeline-token 26 | readOnly: true -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-user-profile-defaults/templates/role_bindings.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: RoleBinding 4 | metadata: 5 | name: pipeline-runner-binding 6 | namespace: {{ .Values.user.profile }} 7 | roleRef: 8 | apiGroup: rbac.authorization.k8s.io 9 | kind: Role 10 | name: pipeline-runner 11 | subjects: 12 | - kind: 
ServiceAccount 13 | name: default 14 | namespace: {{ .Values.user.profile }} 15 | --- 16 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-user-profile-defaults/values.yaml: -------------------------------------------------------------------------------- 1 | user: 2 | profile: kubeflow-user-example-com -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-user-profile/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: v1.9.2 3 | description: A Helm chart for Kubeflow user namespace 4 | name: kubeflow-user-profile 5 | type: application 6 | version: 1.0.0 7 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-user-profile/templates/config_map.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: default-install-config 5 | data: 6 | profile-name: {{ .Values.user.profile }} 7 | user: {{ .Values.user.email }} 8 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-user-profile/templates/profile.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1beta1 2 | kind: Profile 3 | metadata: 4 | name: {{ .Values.user.profile }} 5 | spec: 6 | owner: 7 | kind: User 8 | name: {{ .Values.user.email }} 9 | {{- if .Values.awsIamForServiceAccount.awsIamRole }} 10 | plugins: 11 | - kind: AwsIamForServiceAccount 12 | spec: 13 | awsIamRole: '{{ .Values.awsIamForServiceAccount.awsIamRole }}' 14 | annotateOnly: true 15 | {{- end }} -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-user-profile/values.yaml: 
-------------------------------------------------------------------------------- 1 | user: 2 | profile: kubeflow-user-example-com 3 | email: user@example.com 4 | awsIamForServiceAccount: 5 | awsIamRole: -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: v1.9.2 3 | description: A Helm chart for Kubeflow Volumes 4 | name: kubeflow-volumes 5 | type: application 6 | version: 1.0.0 7 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/controller/certificate.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Certificate 3 | metadata: 4 | name: pvcviewer-server-cert 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | dnsNames: 8 | - pvcviewer-webhook-service.{{ .Values.kubeflow.namespace }}.svc 9 | - pvcviewer-webhook-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local 10 | issuerRef: 11 | kind: Issuer 12 | name: pvcviewer-selfsigned-issuer 13 | secretName: pvcviewer-server-cert -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/controller/cluster_role_bindings.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: pvcviewer-manager-rolebinding 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: ClusterRole 9 | name: pvcviewer-manager-role 10 | subjects: 11 | - kind: ServiceAccount 12 | name: pvcviewer-controller-sa 13 | namespace: {{ .Values.kubeflow.namespace }} 14 | --- 15 | apiVersion: rbac.authorization.k8s.io/v1 16 | kind: ClusterRoleBinding 17 | metadata: 18 | name: 
pvcviewer-proxy-rolebinding 19 | roleRef: 20 | apiGroup: rbac.authorization.k8s.io 21 | kind: ClusterRole 22 | name: pvcviewer-proxy-role 23 | subjects: 24 | - kind: ServiceAccount 25 | name: pvcviewer-controller-sa 26 | namespace: {{ .Values.kubeflow.namespace }} 27 | --- 28 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/controller/config_map.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | ISTIO_GATEWAY: {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 4 | kind: ConfigMap 5 | metadata: 6 | name: pvcviewer-controller-config 7 | namespace: {{ .Values.kubeflow.namespace }} -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/controller/issuer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Issuer 3 | metadata: 4 | name: pvcviewer-selfsigned-issuer 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | spec: 7 | selfSigned: {} -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/controller/role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | name: pvcviewer-leader-election-role 6 | namespace: {{ .Values.kubeflow.namespace }} 7 | rules: 8 | - apiGroups: 9 | - "" 10 | resources: 11 | - configmaps 12 | verbs: 13 | - get 14 | - list 15 | - watch 16 | - create 17 | - update 18 | - patch 19 | - delete 20 | - apiGroups: 21 | - coordination.k8s.io 22 | resources: 23 | - leases 24 | verbs: 25 | - get 26 | - list 27 | - watch 28 | - create 29 | - update 30 | - patch 31 | - delete 32 | - apiGroups: 33 | - "" 34 | resources: 35 | - events 36 | verbs: 37 | - create 
38 | - patch -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/controller/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: pvcviewer-leader-election-rolebinding 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: Role 9 | name: pvcviewer-leader-election-role 10 | subjects: 11 | - kind: ServiceAccount 12 | name: pvcviewer-controller-sa 13 | namespace: {{ .Values.kubeflow.namespace }} 14 | --- -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/controller/service.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | labels: 6 | control-plane: pvcviewer-controller-manager 7 | name: pvcviewer-webhook-service 8 | namespace: {{ .Values.kubeflow.namespace }} 9 | spec: 10 | ports: 11 | - port: 443 12 | protocol: TCP 13 | targetPort: 9443 14 | selector: 15 | control-plane: pvcviewer-controller-manager 16 | --- -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/controller/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: pvcviewer-controller-sa 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/webapp/authorization_policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: security.istio.io/v1beta1 2 | kind: AuthorizationPolicy 3 | metadata: 4 | labels: 5 | app: 
volumes-web-app 6 | 7 | name: volumes-web-app 8 | namespace: {{ .Values.kubeflow.namespace }} 9 | spec: 10 | action: ALLOW 11 | rules: 12 | - from: 13 | - source: 14 | principals: 15 | - cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }} 16 | selector: 17 | matchLabels: 18 | app: volumes-web-app 19 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/webapp/cluster_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app: volumes-web-app 6 | name: kubeflow-volume-cluster-role-binding 7 | roleRef: 8 | apiGroup: rbac.authorization.k8s.io 9 | kind: ClusterRole 10 | name: kubeflow-volume-cluster-role 11 | subjects: 12 | - kind: ServiceAccount 13 | name: volumes-web-app-service-account 14 | namespace: {{ .Values.kubeflow.namespace }} 15 | --- 16 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/webapp/config_map.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: volumes-web-app-viewer-spec 5 | namespace: {{ .Values.kubeflow.namespace }} 6 | data: 7 | viewer-spec.yaml : | 8 | podTemplate: 9 | containers: 10 | - name: main 11 | image: $VOLUME_VIEWER_IMAGE 12 | env: 13 | - name: FB_ADDRESS 14 | value: "0.0.0.0" 15 | - name: FB_PORT 16 | value: "8080" 17 | - name: FB_DATABASE 18 | value: /tmp/filebrowser.db 19 | - name: FB_NOAUTH 20 | value: "true" 21 | - name: FB_BASEURL 22 | value: /pvcviewers/$NAMESPACE/$NAME/ 23 | readinessProbe: 24 | tcpSocket: 25 | port: 8080 26 | initialDelaySeconds: 2 27 | periodSeconds: 10 28 | # viewer-volume is provided automatically by the volumes web app 29 | volumeMounts: 30 | - name: viewer-volume 31 | mountPath: 
/data 32 | workingDir: /data 33 | serviceAccountName: default-editor 34 | networking: 35 | targetPort: 8080 36 | basePrefix: "/pvcviewers" 37 | rewrite: "/" 38 | timeout: 30s 39 | rwoScheduling: true -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/webapp/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | labels: 5 | app: volumes-web-app 6 | name: volumes-web-app-deployment 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: volumes-web-app 13 | template: 14 | metadata: 15 | labels: 16 | app: volumes-web-app 17 | spec: 18 | containers: 19 | - name: volumes-web-app 20 | image: docker.io/kubeflownotebookswg/volumes-web-app:v1.9.2 21 | ports: 22 | - containerPort: 5000 23 | env: 24 | - name: APP_PREFIX 25 | value: /volumes 26 | - name: USERID_HEADER 27 | value: x-auth-request-email 28 | - name: USERID_PREFIX 29 | value: '' 30 | - name: APP_SECURE_COOKIES 31 | value: 'true' 32 | - name: VOLUME_VIEWER_IMAGE 33 | value: filebrowser/filebrowser:latest 34 | volumeMounts: 35 | - name: viewer-spec 36 | mountPath: /etc/config/viewer-spec.yaml 37 | subPath: viewer-spec.yaml 38 | serviceAccountName: volumes-web-app-service-account 39 | volumes: 40 | - name: viewer-spec 41 | configMap: 42 | name: volumes-web-app-viewer-spec 43 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/webapp/destination_rule.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1alpha3 2 | kind: DestinationRule 3 | metadata: 4 | labels: 5 | app: volumes-web-app 6 | 7 | name: volumes-web-app 8 | namespace: {{ .Values.kubeflow.namespace }} 9 | spec: 10 | host: volumes-web-app-service.{{ 
.Values.kubeflow.namespace }}.svc.cluster.local 11 | trafficPolicy: 12 | tls: 13 | mode: ISTIO_MUTUAL 14 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/webapp/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app: volumes-web-app 6 | run: volumes-web-app 7 | name: volumes-web-app-service 8 | namespace: {{ .Values.kubeflow.namespace }} 9 | spec: 10 | ports: 11 | - name: http 12 | port: 80 13 | protocol: TCP 14 | targetPort: 5000 15 | selector: 16 | app: volumes-web-app 17 | type: ClusterIP 18 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/webapp/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app: volumes-web-app 6 | 7 | name: volumes-web-app-service-account 8 | namespace: {{ .Values.kubeflow.namespace }} 9 | -------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/templates/webapp/virtual_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1alpha3 2 | kind: VirtualService 3 | metadata: 4 | labels: 5 | app: volumes-web-app 6 | name: volumes-web-app-virtual-service 7 | namespace: {{ .Values.kubeflow.namespace }} 8 | spec: 9 | gateways: 10 | - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 11 | hosts: 12 | - '*' 13 | http: 14 | - headers: 15 | request: 16 | add: 17 | x-forwarded-prefix: /volumes 18 | match: 19 | - uri: 20 | prefix: /volumes/ 21 | rewrite: 22 | uri: / 23 | route: 24 | - destination: 25 | host: volumes-web-app-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local 26 | port: 27 | number: 80 28 | 
-------------------------------------------------------------------------------- /charts/ml-platform/kubeflow-volumes/values.yaml: -------------------------------------------------------------------------------- 1 | kubeflow: 2 | namespace: kubeflow 3 | ingress: 4 | namespace: ingress 5 | gateway: ingress-gateway 6 | sa: istio-ingressgateway 7 | -------------------------------------------------------------------------------- /charts/mpi-operator/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "0.4.0" 3 | description: A Helm chart for kubeflow mpi-operator 4 | name: mpi-operator 5 | version: 2.1.0 6 | -------------------------------------------------------------------------------- /charts/mpi-operator/values.yaml: -------------------------------------------------------------------------------- 1 | namespace: kubeflow 2 | image: mpioperator/mpi-operator:0.4.0 3 | pullpolicy: Always 4 | -------------------------------------------------------------------------------- /charts/nvidia-device-plugin/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "v0.14.3" 3 | description: A Helm chart for Nvidia device plugin 4 | name: nvidia-device-plugin 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/oauth2-proxy-route/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: oauth2-proxy-route 3 | description: A Helm chart for oauth2-proxy route 4 | type: application 5 | version: 1.0.0 6 | appVersion: "7.5.1" 7 | -------------------------------------------------------------------------------- /charts/oauth2-proxy-route/templates/authorization_policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: security.istio.io/v1 2 | kind: AuthorizationPolicy 3 
| metadata: 4 | name: external-auth 5 | namespace: {{ .Values.ingress.namespace }} 6 | spec: 7 | selector: 8 | matchLabels: 9 | app: istio-ingressgateway 10 | istio: ingressgateway 11 | action: CUSTOM 12 | provider: 13 | name: oauth2-proxy 14 | rules: 15 | - to: 16 | - operation: 17 | notPaths: ["/dex/*", "/authservice/logout"] -------------------------------------------------------------------------------- /charts/oauth2-proxy-route/templates/virtual_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.istio.io/v1beta1 2 | kind: VirtualService 3 | metadata: 4 | name: oauth2-proxy 5 | namespace: {{ .Values.oauth2_proxy.namespace }} 6 | spec: 7 | hosts: 8 | - '*' 9 | gateways: 10 | - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }} 11 | http: 12 | - name: logout 13 | match: 14 | - uri: 15 | exact: /authservice/logout 16 | redirect: 17 | uri: /oauth2/sign_out 18 | - name: "oauth2" 19 | match: 20 | - uri: 21 | prefix: "/oauth2" 22 | route: 23 | - destination: 24 | host: oauth2-proxy.{{ .Values.oauth2_proxy.namespace }}.svc.cluster.local 25 | port: 26 | number: 80 -------------------------------------------------------------------------------- /charts/oauth2-proxy-route/values.yaml: -------------------------------------------------------------------------------- 1 | oauth2_proxy: 2 | namespace: auth 3 | ingress: 4 | namespace: ingress 5 | gateway: ingress-gateway -------------------------------------------------------------------------------- /charts/pv-efs/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "2.5.2" 3 | description: A Helm chart for k8s persistent volume for EFS 4 | name: pv-efs 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /charts/pv-efs/templates/pv.yaml: -------------------------------------------------------------------------------- 1 | 
apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: {{ .Values.efs.volume_name | default "pv-efs" }} 5 | spec: 6 | capacity: 7 | storage: {{ .Values.efs.storage | default "1000Gi" }} 8 | volumeMode: Filesystem 9 | accessModes: 10 | - ReadWriteMany 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: {{ .Values.efs.class_name | default "efs-sc" }} 13 | csi: 14 | driver: efs.csi.aws.com 15 | volumeHandle: {{ .Values.efs.fs_id }} -------------------------------------------------------------------------------- /charts/pv-efs/templates/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: {{ .Values.efs.claim_name | default "pv-efs" }} 5 | namespace: {{ .Values.namespace }} 6 | spec: 7 | accessModes: 8 | - ReadWriteMany 9 | storageClassName: {{ .Values.efs.class_name | default "efs-sc" }} 10 | volumeName: {{ .Values.efs.volume_name | default "pv-efs" }} 11 | resources: 12 | requests: 13 | storage: {{ .Values.efs.storage | default "1000Gi" }} -------------------------------------------------------------------------------- /charts/pv-efs/templates/storage-class.yaml: -------------------------------------------------------------------------------- 1 | kind: StorageClass 2 | apiVersion: storage.k8s.io/v1 3 | metadata: 4 | name: {{ .Values.efs.class_name | default "efs-sc" }} 5 | provisioner: efs.csi.aws.com -------------------------------------------------------------------------------- /charts/pv-fsx/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.8.0" 3 | description: A Helm chart for k8s persistent volume for FSx for Lustre 4 | name: pv-fsx 5 | version: 1.1.0 6 | -------------------------------------------------------------------------------- /charts/pv-fsx/templates/pv.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: {{ .Values.fsx.volume_name | default "pv-fsx" }} 5 | spec: 6 | capacity: 7 | storage: {{ .Values.fsx.storage | default "1200Gi" }} 8 | volumeMode: Filesystem 9 | accessModes: 10 | - ReadWriteMany 11 | mountOptions: 12 | - noatime 13 | - flock 14 | persistentVolumeReclaimPolicy: Retain 15 | storageClassName: {{ .Values.fsx.class_name | default "fsx-sc" }} 16 | csi: 17 | driver: fsx.csi.aws.com 18 | volumeHandle: {{ .Values.fsx.fs_id }} 19 | volumeAttributes: 20 | dnsname: {{ .Values.fsx.dns_name }} 21 | mountname: {{ .Values.fsx.mount_name }} -------------------------------------------------------------------------------- /charts/pv-fsx/templates/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: {{ .Values.fsx.claim_name | default "pv-fsx" }} 5 | namespace: {{ .Values.namespace }} 6 | spec: 7 | accessModes: 8 | - ReadWriteMany 9 | storageClassName: {{ .Values.fsx.class_name | default "fsx-sc" }} 10 | volumeName: {{ .Values.fsx.volume_name | default "pv-fsx" }} 11 | resources: 12 | requests: 13 | storage: {{ .Values.fsx.storage | default "1200Gi" }} -------------------------------------------------------------------------------- /charts/pv-fsx/templates/storage-class.yaml: -------------------------------------------------------------------------------- 1 | kind: StorageClass 2 | apiVersion: storage.k8s.io/v1 3 | metadata: 4 | name: {{ .Values.fsx.class_name | default "fsx-sc" }} 5 | provisioner: fsx.csi.aws.com -------------------------------------------------------------------------------- /containers/aws-samples-maskrcnn/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=mask-rcnn-tensorflow 4 | 
export IMAGE_TAG=tf2.12.0-cu11.8-ubuntu22.04-59168dc 5 | -------------------------------------------------------------------------------- /containers/megatron-deepspeed/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:24.01-py3 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | ENV DEBCONF_NONINTERACTIVE_SEEN=true 5 | 6 | RUN git clone https://github.com/NVIDIA/apex /apex 7 | RUN cd /apex && git fetch origin b496d85fb88a801d8e680872a12822de310951fd 8 | RUN cd /apex && git reset --hard b496d85fb88a801d8e680872a12822de310951fd 9 | 10 | RUN pip3 install --upgrade pip 11 | RUN cd /apex && pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ 12 | RUN pip3 install deepspeed==0.13.4 13 | RUN pip3 install git+https://github.com/microsoft/Megatron-DeepSpeed.git@a9856ce0e75dbe69c96d4e241e8a191b344118d7 14 | 15 | # Refresh the package index and install in the same layer so a cached `apt-get update` layer can never feed a stale index to the install; drop the index afterwards to keep the layer small. 16 | RUN apt-get update && apt-get install -y libaio-dev && rm -rf /var/lib/apt/lists/* 17 | 18 | CMD ["/bin/bash"] 19 | -------------------------------------------------------------------------------- /containers/megatron-deepspeed/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=megatron-deepspeed-ngc 4 | export IMAGE_TAG=mt-a9856ce-ds-0.13.4-ngc-24.01-py3 5 | -------------------------------------------------------------------------------- /containers/nemo-megatron/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:24.10-py3 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | ENV DEBCONF_NONINTERACTIVE_SEEN=true 5 | 6 | RUN apt-get update && apt-get install -y libsndfile1 ffmpeg 7 | 8 | RUN pip3 install --upgrade pip 9 | RUN git clone https://github.com/NVIDIA/NeMo.git /NeMo 10 | RUN cd /NeMo && git fetch origin
6b0f0886f933c6e21c92b2f1981f66993134be7e 11 | RUN cd /NeMo && git reset --hard 6b0f0886f933c6e21c92b2f1981f66993134be7e 12 | RUN cd /NeMo && pip install -e . 13 | 14 | RUN cd /NeMo && pip install -r /NeMo/requirements/requirements_common.txt 15 | RUN cd /NeMo && pip install -r /NeMo/requirements/requirements_lightning.txt 16 | RUN cd /NeMo && pip install -r /NeMo/requirements/requirements_nlp.txt 17 | 18 | RUN pip3 install git+https://github.com/NVIDIA/NeMo-Run.git@5ed6128f9285e61cfee73d780b663c9d780f20c7 19 | RUN pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@9c11ab4ca24ead28c3c1e29f8904f8258d7543cb 20 | 21 | RUN pip3 install transformers==4.48.1 22 | RUN pip3 install datasets==3.2.0 23 | RUN pip3 install huggingface_hub==0.27.1 24 | 25 | CMD ["/bin/bash"] 26 | -------------------------------------------------------------------------------- /containers/nemo-megatron/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=nemo-megatron 4 | export IMAGE_TAG=nemo-6b0f088-megatron-9c11ab4-ngc-24.10-py3 5 | -------------------------------------------------------------------------------- /containers/ray-pytorch-neuronx-vllm/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=ray-neuronx-vllm 4 | export IMAGE_TAG=ray2.44.0-py311-2.22.0-0.8.5.post1 5 | -------------------------------------------------------------------------------- /containers/ray-pytorch-neuronx-vllm/patches/vllm-neuron-0.6.6.post1.patch: -------------------------------------------------------------------------------- 1 | diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py 2 | index 3f626968..af3eb0c5 100644 3 | --- a/vllm/worker/neuron_worker.py 4 | +++ b/vllm/worker/neuron_worker.py 5 | @@ -58,7 +58,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, 
LocalOrDistributedWorkerBase): 6 | # Set the number of GPU blocks to be the same as the maximum number of 7 | # sequences that can be processed in a single batch. This is equivalent 8 | # to schedule without PagedAttention. 9 | - num_gpu_blocks = self.scheduler_config.max_num_seqs 10 | + num_gpu_blocks = self.scheduler_config.max_num_seqs + 1 11 | 12 | # Swap not yet supported with Neuron backend. 13 | num_cpu_blocks = 0 14 | @@ -72,7 +72,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): 15 | 16 | # Different values are not tested. 17 | assert num_cpu_blocks == 0 18 | - assert num_gpu_blocks == self.scheduler_config.max_num_seqs 19 | + assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1 20 | 21 | self.cache_config.num_gpu_blocks = num_gpu_blocks 22 | self.cache_config.num_cpu_blocks = num_cpu_blocks 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /containers/ray-pytorch-neuronx-vllm/patches/vllm_v0.5.0_neuron.patch: -------------------------------------------------------------------------------- 1 | diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py 2 | index e7f0e887..87564b76 100644 3 | --- a/vllm/executor/neuron_executor.py 4 | +++ b/vllm/executor/neuron_executor.py 5 | @@ -48,9 +48,9 @@ class NeuronExecutor(ExecutorBase): 6 | def execute_model( 7 | self, 8 | execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: 9 | - assert (execute_model_req.blocks_to_swap_in == {} 10 | - and execute_model_req.blocks_to_swap_out == {} 11 | - and execute_model_req.blocks_to_copy == {}), ( 12 | + assert (not execute_model_req.blocks_to_swap_in 13 | + and not execute_model_req.blocks_to_swap_out 14 | + and not execute_model_req.blocks_to_copy), ( 15 | "Cache operations are not supported for Neuron backend.") 16 | assert execute_model_req.num_lookahead_slots == 0, ( 17 | "lookahead not supported for Neuron backend.") 18 | 
-------------------------------------------------------------------------------- /containers/ray-pytorch-neuronx/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=ray-neuronx 4 | export IMAGE_TAG=ray2.44.0-py311-2.22.0 5 | -------------------------------------------------------------------------------- /containers/ray-pytorch/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rayproject/ray:2.44.0-py311-cu125 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | ENV DEBCONF_NONINTERACTIVE_SEEN=true 5 | 6 | RUN pip install --upgrade pip 7 | RUN pip install torch==2.5.1 torchvision torchaudio 8 | 9 | CMD ["/bin/bash"] 10 | -------------------------------------------------------------------------------- /containers/ray-pytorch/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=ray-pytorch-cuda 4 | export IMAGE_TAG=2.44.0-py311-cu125-2.5.1 5 | -------------------------------------------------------------------------------- /containers/tensorpack-maskrcnn/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=mask-rcnn-tensorpack 4 | export IMAGE_TAG=tf2.12.0-cu118-ubuntu22.04-fac024f 5 | -------------------------------------------------------------------------------- /containers/tritonserver-neuronx-djl-lmi/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=tritonserver-neuronx-djl-lmi 4 | export IMAGE_TAG=24.06-2.21.0-c343d60 5 | -------------------------------------------------------------------------------- /containers/tritonserver-neuronx-vllm/build_tools/set_env.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=tritonserver-neuronx-vllm 4 | export IMAGE_TAG=24.06-2.22.0-0.8.5.post1 5 | -------------------------------------------------------------------------------- /containers/tritonserver-neuronx-vllm/patch/vllm-neuron-0.6.6.post1.patch: -------------------------------------------------------------------------------- 1 | diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py 2 | index 3f626968..af3eb0c5 100644 3 | --- a/vllm/worker/neuron_worker.py 4 | +++ b/vllm/worker/neuron_worker.py 5 | @@ -58,7 +58,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): 6 | # Set the number of GPU blocks to be the same as the maximum number of 7 | # sequences that can be processed in a single batch. This is equivalent 8 | # to schedule without PagedAttention. 9 | - num_gpu_blocks = self.scheduler_config.max_num_seqs 10 | + num_gpu_blocks = self.scheduler_config.max_num_seqs + 1 11 | 12 | # Swap not yet supported with Neuron backend. 13 | num_cpu_blocks = 0 14 | @@ -72,7 +72,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): 15 | 16 | # Different values are not tested. 
17 | assert num_cpu_blocks == 0 18 | - assert num_gpu_blocks == self.scheduler_config.max_num_seqs 19 | + assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1 20 | 21 | self.cache_config.num_gpu_blocks = num_gpu_blocks 22 | self.cache_config.num_cpu_blocks = num_cpu_blocks 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /containers/tritonserver-neuronx/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=tritonserver-neuronx 4 | export IMAGE_TAG=24.06-2.22.0 5 | -------------------------------------------------------------------------------- /containers/tritonserver-ray-vllm/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.01-vllm-python-py3 2 | FROM ${BASE_IMAGE} 3 | 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | ENV DEBCONF_NONINTERACTIVE_SEEN=true 6 | 7 | # Use apt-get rather than apt: apt is an interactive front end whose command line is not guaranteed stable for scripted use. 8 | RUN apt-get update \ 9 | && apt-get install --yes \ 10 | apt-transport-https \ 11 | ca-certificates \ 12 | curl \ 13 | gnupg \ 14 | cgroup-tools \ 15 | && rm -rf /var/lib/apt/lists/* 16 | 17 | # Add the signing key and the apt source for the Kubernetes v1.29 package repository. 18 | RUN curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key \ 19 | | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg \ 20 | && chmod 644 /etc/apt/keyrings/kubernetes-apt-keyring.gpg 21 | 22 | RUN echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.29/deb/ /' \ 23 | | tee /etc/apt/sources.list.d/kubernetes.list \ 24 | && chmod 644 /etc/apt/sources.list.d/kubernetes.list 25 | 26 | # Install kubectl from the repository registered above, then remove the package index again. 27 | RUN apt-get update \ 28 | && apt-get install --yes \ 29 | kubectl \ 30 | && apt-get autoremove --yes \ 31 | && rm -rf /var/lib/apt/lists/* 32 | 33 | WORKDIR /workspace 34 | 35 | COPY --chmod=555 resources/kubessh /usr/local/bin/kubessh 36 | COPY resources/server.py .
-------------------------------------------------------------------------------- /containers/tritonserver-ray-vllm/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=tritonserver-ray-vllm 4 | export IMAGE_TAG=25.01-vllm-python-py3 5 | -------------------------------------------------------------------------------- /containers/tritonserver-ray-vllm/resources/kubessh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | pod=$1 18 | shift 19 | kubectl exec $pod -- /bin/sh -c "$*" 20 | -------------------------------------------------------------------------------- /containers/tritonserver-trtllm/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export IMAGE_NAME=tritonserver-trtllm 4 | export IMAGE_TAG=24.12-trtllm-python-py3 5 | -------------------------------------------------------------------------------- /containers/tritonserver-trtllm/resources/kubessh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | pod=$1 18 | shift 19 | kubectl exec $pod -- /bin/sh -c "$*" 20 | -------------------------------------------------------------------------------- /eks-cluster/legacy/README.md: -------------------------------------------------------------------------------- 1 | ## Deprecated scripts 2 | 3 | The scripts in this folder are no longer used or maintained. These scripts have been deprecated. -------------------------------------------------------------------------------- /eks-cluster/legacy/apply-aws-auth-cm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl apply -f aws-auth-cm.yaml 4 | -------------------------------------------------------------------------------- /eks-cluster/legacy/apply-nvidia-plugin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml 4 | kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" 5 | -------------------------------------------------------------------------------- /eks-cluster/legacy/aws-auth-cm.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: aws-auth 5 | namespace: 
kube-system 6 | data: 7 | mapRoles: | 8 | - rolearn: 9 | username: system:node:{{EC2PrivateDNSName}} 10 | groups: 11 | - system:bootstrappers 12 | - system:nodes 13 | #- rolearn: 14 | #username: system:node:{{EC2PrivateDNSName}} 15 | #groups: 16 | #- system:bootstrappers 17 | #- system:nodes 18 | -------------------------------------------------------------------------------- /eks-cluster/legacy/efs-sc.yaml: -------------------------------------------------------------------------------- 1 | kind: StorageClass 2 | apiVersion: storage.k8s.io/v1 3 | metadata: 4 | name: efs-sc 5 | provisioner: efs.csi.aws.com 6 | -------------------------------------------------------------------------------- /eks-cluster/legacy/fsx-sc.yaml: -------------------------------------------------------------------------------- 1 | kind: StorageClass 2 | apiVersion: storage.k8s.io/v1 3 | metadata: 4 | name: fsx-sc 5 | provisioner: fsx.csi.aws.com 6 | -------------------------------------------------------------------------------- /eks-cluster/legacy/install-eksctl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # 4 | #Permission is hereby granted, free of charge, to any person obtaining a copy of this 5 | #software and associated documentation files (the "Software"), to deal in the Software 6 | #without restriction, including without limitation the rights to use, copy, modify, 7 | #merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 8 | #permit persons to whom the Software is furnished to do so. 9 | # 10 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 11 | #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 12 | #PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 13 | #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 14 | #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 15 | #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 16 | 17 | # WARNING: THIS FILE IS DEPRECATED AND IS NOT USED 18 | 19 | curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp 20 | sudo mv /tmp/eksctl /usr/local/bin 21 | eksctl version 22 | -------------------------------------------------------------------------------- /eks-cluster/legacy/prepare-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Customize S3_BUCKET 4 | S3_BUCKET= 5 | 6 | # Customize S3_PREFIX 7 | S3_PREFIX=mask-rcnn/eks/input 8 | 9 | # Uncomment one of the options below 10 | # For EFS uncomment below 11 | DATA_DIR=/efs 12 | # For FSX uncomment below 13 | #DATA_DIR=/fsx 14 | # For EBS uncomment below 15 | #DATA_DIR=$HOME 16 | 17 | 18 | if [ -e $DATA_DIR/data ] 19 | then 20 | echo "$DATA_DIR/data already exists" 21 | exit 1 22 | fi 23 | 24 | mkdir -p $DATA_DIR/data 25 | 26 | aws s3 cp --recursive s3://$S3_BUCKET/$S3_PREFIX/data $DATA_DIR/data 27 | 28 | if [ -f ./run.sh ] 29 | then 30 | cp run.sh $DATA_DIR/ 31 | fi 32 | -------------------------------------------------------------------------------- /eks-cluster/legacy/pv-kubeflow-efs-gp-bursting.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: pv-efs 5 | spec: 6 | capacity: 7 | storage: 1000Gi 8 | volumeMode: Filesystem 9 | accessModes: 10 | - ReadWriteMany 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: efs-sc 13 | csi: 14 | driver: efs.csi.aws.com 15 | volumeHandle: 16 | -------------------------------------------------------------------------------- 
/eks-cluster/legacy/pv-kubeflow-fsx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: pv-fsx 5 | spec: 6 | capacity: 7 | storage: 1200Gi 8 | volumeMode: Filesystem 9 | accessModes: 10 | - ReadWriteMany 11 | mountOptions: 12 | - noatime 13 | - flock 14 | persistentVolumeReclaimPolicy: Retain 15 | csi: 16 | driver: fsx.csi.aws.com 17 | volumeHandle: 18 | volumeAttributes: 19 | dnsname: .fsx..amazonaws.com 20 | mountname: 21 | -------------------------------------------------------------------------------- /eks-cluster/legacy/pvc-kubeflow-efs-gp-bursting.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: pv-efs 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: efs-sc 9 | resources: 10 | requests: 11 | storage: 100Gi 12 | -------------------------------------------------------------------------------- /eks-cluster/legacy/pvc-kubeflow-fsx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: pv-fsx 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: "" 9 | resources: 10 | requests: 11 | storage: 1200Gi 12 | volumeName: pv-fsx 13 | -------------------------------------------------------------------------------- /eks-cluster/legacy/replicate-data.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: replicate-data 6 | data: 7 | replicate-data.sh: | 8 | aws s3 cp --recursive s3://$S3_BUCKET/$S3_PREFIX /ebs 9 | while true; do echo $(date -u) >> /root/date.txt; sleep 3600; done 10 | --- 11 | apiVersion: apps/v1 12 | kind: DaemonSet 13 | metadata: 14 | name: replicate-data 15 | spec: 16 | selector: 17 | matchLabels: 18 
| name: replicate-data 19 | template: 20 | metadata: 21 | labels: 22 | name: replicate-data 23 | spec: 24 | restartPolicy: Always 25 | volumes: 26 | - name: ebs 27 | hostPath: 28 | path: /ebs 29 | type: DirectoryOrCreate 30 | - name: config 31 | configMap: 32 | defaultMode: 420 33 | items: 34 | - key: replicate-data.sh 35 | mode: 365 36 | path: replicate-data.sh 37 | name: replicate-data 38 | containers: 39 | - name: replicate-data 40 | env: 41 | - name: S3_BUCKET 42 | value: my-bucket 43 | - name: S3_PREFIX 44 | value: my-bucket-prefix 45 | command: 46 | - sh 47 | - /etc/config/replicate-data.sh 48 | image: # use image with aws cli support 49 | imagePullPolicy: IfNotPresent 50 | volumeMounts: 51 | - mountPath: /etc/config 52 | name: config 53 | - mountPath: /ebs 54 | name: ebs 55 | -------------------------------------------------------------------------------- /eks-cluster/legacy/set-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export EKS_CLUSTER=my-eks-cluster 4 | export AWS_REGION=us-west-2 5 | -------------------------------------------------------------------------------- /eks-cluster/legacy/tiller-rbac-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: tiller 5 | namespace: kube-system 6 | --- 7 | apiVersion: rbac.authorization.k8s.io/v1 8 | kind: ClusterRoleBinding 9 | metadata: 10 | name: tiller 11 | roleRef: 12 | apiGroup: rbac.authorization.k8s.io 13 | kind: ClusterRole 14 | name: cluster-admin 15 | subjects: 16 | - kind: ServiceAccount 17 | name: tiller 18 | namespace: kube-system 19 | -------------------------------------------------------------------------------- /eks-cluster/legacy/update-kubeconfig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install --upgrade pip 4 | pip install awscli --upgrade --user 
5 | 6 | source ./set-cluster.sh 7 | aws eks --region $AWS_REGION update-kubeconfig --name $EKS_CLUSTER 8 | -------------------------------------------------------------------------------- /eks-cluster/terraform/aws-eks-cluster-and-nodegroup/istio/variables.tf: -------------------------------------------------------------------------------- 1 | variable "istio_system_namespace" { 2 | description = "Istio system namespace" 3 | type = string 4 | } 5 | 6 | variable "auth_namespace" { 7 | description = "Auth namespace" 8 | type = string 9 | } -------------------------------------------------------------------------------- /eks-cluster/terraform/aws-eks-cluster-and-nodegroup/istio/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.5.1" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = ">= 2.7.0" 8 | } 9 | 10 | kubectl = { 11 | source = "gavinbunney/kubectl" 12 | version = ">= 1.14.0" 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /eks-cluster/terraform/aws-eks-cluster-and-nodegroup/kubeflow/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.5.1" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = ">= 2.7.0" 8 | } 9 | 10 | kubectl = { 11 | source = "gavinbunney/kubectl" 12 | version = ">= 1.14.0" 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /eks-cluster/terraform/aws-eks-cluster-and-nodegroup/mlflow/outputs.tf: -------------------------------------------------------------------------------- 1 | output "db_secret_arn" { 2 | description = "DB secret ARN" 3 | value = aws_rds_cluster.db.master_user_secret[0].secret_arn 4 | } 5 | -------------------------------------------------------------------------------- 
/eks-cluster/terraform/aws-eks-cluster-and-nodegroup/mlflow/variables.tf: -------------------------------------------------------------------------------- 1 | variable "mlflow_namespace" { 2 | description = "MLFlow namespace" 3 | type = string 4 | } 5 | 6 | variable "mlflow_version" { 7 | description = "MLFlow chart version" 8 | type = string 9 | } 10 | 11 | 12 | variable "force_destroy_bucket" { 13 | description = "MLFlow force destroy bucket" 14 | type = bool 15 | } 16 | 17 | variable "eks_cluster_id" { 18 | description = "EKS cluster id" 19 | type = string 20 | } 21 | 22 | variable "eks_oidc_provider_arn" { 23 | description = "EKS OIDC provider ARN" 24 | type = string 25 | } 26 | 27 | variable "eks_oidc_issuer" { 28 | description = "EKS OIDC issuer" 29 | type = string 30 | } 31 | 32 | variable "admin_username" { 33 | description = "MLFlow admin username" 34 | type = string 35 | } 36 | 37 | variable "admin_password" { 38 | description = "MLFlow admin password" 39 | type = string 40 | } 41 | 42 | variable "db_max_capacity" { 43 | description = "MLFlow DB max capacity" 44 | type = number 45 | } 46 | 47 | variable "db_subnet_ids" { 48 | description = "MLFlow DB subnet ids" 49 | type = list 50 | } 51 | 52 | variable "db_vpc_id" { 53 | description = "MLFlow DB VPC id" 54 | type = string 55 | } 56 | 57 | variable "db_port" { 58 | description = "MLFlow DB port" 59 | type = number 60 | } -------------------------------------------------------------------------------- /eks-cluster/terraform/aws-eks-cluster-and-nodegroup/mlflow/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.5.1" 3 | 4 | } -------------------------------------------------------------------------------- /eks-cluster/terraform/aws-eks-cluster-and-nodegroup/slurm/variables.tf: -------------------------------------------------------------------------------- 1 | variable "local_helm_repo" { 2 | description = "Local Helm 
charts path" 3 | type = string 4 | } 5 | 6 | variable "slurm_namespace" { 7 | description = "Slurm namespace" 8 | type = string 9 | } 10 | 11 | variable "efs_fs_id" { 12 | description = "EFS file-system id" 13 | type = string 14 | } 15 | 16 | variable "ssh_public_key" { 17 | description = "Slurm SSH public key for node login" 18 | type = string 19 | } 20 | 21 | variable "storage_capacity" { 22 | description = "Shared storage capacity" 23 | type = string 24 | } 25 | 26 | variable "password" { 27 | description = "Slurm password for user rocky" 28 | type = string 29 | } -------------------------------------------------------------------------------- /eks-cluster/terraform/aws-eks-cluster-and-nodegroup/slurm/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.5.1" 3 | 4 | } -------------------------------------------------------------------------------- /eks-cluster/terraform/aws-eks-cluster-and-nodegroup/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.5.1" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = ">= 2.7.0" 8 | } 9 | 10 | kubectl = { 11 | source = "gavinbunney/kubectl" 12 | version = ">= 1.14.0" 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /eks-cluster/tests/test-gpu-efa.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: ReplicaSet 3 | metadata: 4 | name: test-gpu-efa 5 | labels: 6 | app: test-gpu-efa 7 | spec: 8 | replicas: 2 9 | selector: 10 | matchLabels: 11 | app: test-gpu-efa 12 | template: 13 | metadata: 14 | labels: 15 | app: test-gpu-efa 16 | spec: 17 | containers: 18 | - name: test-gpu-efa 19 | image: '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-ec2' 20 | command: 
["/bin/bash"] 21 | securityContext: 22 | privileged: true 23 | args: ["-c", "trap : TERM INT; sleep infinity & wait"] 24 | volumeMounts: 25 | - name: fsx 26 | mountPath: /fsx 27 | resources: 28 | requests: 29 | "nvidia.com/gpu": 8 30 | "vpc.amazonaws.com/efa": 1 31 | limits: 32 | "nvidia.com/gpu": 8 33 | "vpc.amazonaws.com/efa": 1 34 | volumes: 35 | - name: fsx 36 | persistentVolumeClaim: 37 | claimName: pv-fsx # k8s persistent-volume-claim name 38 | nodeSelector: 39 | node.kubernetes.io/instance-type: "p4d.24xlarge" 40 | tolerations: 41 | - key: "nvidia.com/gpu" 42 | operator: "Exists" 43 | effect: "NoSchedule" 44 | -------------------------------------------------------------------------------- /eks-cluster/tests/test-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: test-gpu 5 | annotations: 6 | karpenter.sh/do-not-disrupt: "true" 7 | sidecar.istio.io/inject: 'false' 8 | spec: 9 | containers: 10 | - name: test-gpu 11 | image: ubuntu:latest 12 | command: ["/bin/bash"] 13 | securityContext: 14 | privileged: true 15 | args: ["-c", "trap : TERM INT; sleep infinity & wait"] 16 | volumeMounts: 17 | - name: fsx 18 | mountPath: /fsx 19 | resources: 20 | requests: 21 | "nvidia.com/gpu": 8 22 | limits: 23 | "nvidia.com/gpu": 8 24 | volumes: 25 | - name: fsx 26 | persistentVolumeClaim: 27 | claimName: pv-fsx # k8s persistent-volume-claim name 28 | nodeSelector: 29 | node.kubernetes.io/instance-type: "g5.48xlarge" 30 | tolerations: 31 | - key: "nvidia.com/gpu" 32 | operator: "Exists" 33 | effect: "NoSchedule" 34 | -------------------------------------------------------------------------------- /eks-cluster/tests/test-neuron.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: test-neuron 5 | annotations: 6 | karpenter.sh/do-not-disrupt: "true" 7 | sidecar.istio.io/inject: 'false' 8 | 
spec: 9 | containers: 10 | - name: test-neuron 11 | image: ubuntu:latest 12 | command: ["/bin/bash"] 13 | securityContext: 14 | privileged: true 15 | args: ["-c", "trap : TERM INT; sleep infinity & wait"] 16 | volumeMounts: 17 | - name: fsx 18 | mountPath: /fsx 19 | resources: 20 | requests: 21 | "aws.amazon.com/neuron": 16 22 | "aws.amazon.com/neuroncore": 32 23 | "aws.amazon.com/neurondevice": 16 24 | "vpc.amazonaws.com/efa": 8 25 | limits: 26 | "aws.amazon.com/neuron": 16 27 | "aws.amazon.com/neuroncore": 32 28 | "aws.amazon.com/neurondevice": 16 29 | "vpc.amazonaws.com/efa": 8 30 | volumes: 31 | - name: fsx 32 | persistentVolumeClaim: 33 | claimName: pv-fsx # k8s persistent-volume-claim name 34 | nodeSelector: 35 | node.kubernetes.io/instance-type: "trn1.32xlarge" 36 | tolerations: 37 | - key: "aws.amazon.com/neuron" 38 | operator: "Exists" 39 | effect: "NoSchedule" 40 | - key: "aws.amazon.com/efa" 41 | operator: "Exists" 42 | effect: "NoSchedule" 43 | -------------------------------------------------------------------------------- /eks-cluster/utils/attach-pvc-fsx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: attach-pvc-fsx 5 | spec: 6 | containers: 7 | - name: attach-pvc-fsx 8 | image: ubuntu:latest 9 | command: ["/bin/bash"] 10 | securityContext: 11 | privileged: true 12 | args: ["-c", "trap : TERM INT; sleep infinity & wait"] 13 | volumeMounts: 14 | - name: fsx 15 | mountPath: /fsx 16 | volumes: 17 | - name: fsx 18 | persistentVolumeClaim: 19 | claimName: pv-fsx # k8s persistent-volume-claim name 20 | -------------------------------------------------------------------------------- /eks-cluster/utils/attach-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: attach-pvc 5 | spec: 6 | containers: 7 | - name: attach-pvc 8 | image: ubuntu:latest 9 | command: ["/bin/bash"] 
10 | securityContext: 11 | privileged: true 12 | args: ["-c", "trap : TERM INT; sleep infinity & wait"] 13 | volumeMounts: 14 | - name: efs 15 | mountPath: /efs 16 | - name: fsx 17 | mountPath: /fsx 18 | volumes: 19 | - name: efs 20 | persistentVolumeClaim: 21 | claimName: pv-efs # k8s persistent-volume-claim name 22 | - name: fsx 23 | persistentVolumeClaim: 24 | claimName: pv-fsx # k8s persistent-volume-claim name 25 | -------------------------------------------------------------------------------- /eks-cluster/utils/install-kubectl-linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -O https://s3.us-west-2.amazonaws.com/amazon-eks/1.31.3/2024-12-12/bin/linux/amd64/kubectl 4 | 5 | chmod +x ./kubectl 6 | 7 | sudo mv ./kubectl /usr/local/bin/ 8 | 9 | kubectl version 10 | 11 | curl -Lo aws-iam-authenticator https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.6.30/aws-iam-authenticator_0.6.30_linux_amd64 12 | 13 | chmod +x ./aws-iam-authenticator 14 | sudo mv ./aws-iam-authenticator /usr/local/bin/ 15 | aws-iam-authenticator help 16 | -------------------------------------------------------------------------------- /eks-cluster/utils/prepare-s3-bucket.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | [[ $# -ne 1 ]] && echo "usage: $0 s3-bucket" && exit 1 4 | 5 | S3_BUCKET=$1 6 | 7 | S3_PREFIX=ml-platform/data/coco2017 8 | 9 | # Stage directory must be on a volume with at least 100 GB available space 10 | STAGE_DIR=$HOME/stage/data/coco2017 11 | 12 | if [ -e $STAGE_DIR ] 13 | then 14 | echo "$STAGE_DIR already exists" 15 | exit 1 16 | fi 17 | 18 | mkdir -p $STAGE_DIR 19 | 20 | wget -O $STAGE_DIR/train2017.zip http://images.cocodataset.org/zips/train2017.zip 21 | unzip $STAGE_DIR/train2017.zip -d $STAGE_DIR 22 | rm $STAGE_DIR/train2017.zip 23 | 24 | wget -O $STAGE_DIR/val2017.zip 
http://images.cocodataset.org/zips/val2017.zip 25 | unzip $STAGE_DIR/val2017.zip -d $STAGE_DIR 26 | rm $STAGE_DIR/val2017.zip 27 | 28 | wget -O $STAGE_DIR/annotations_trainval2017.zip http://images.cocodataset.org/annotations/annotations_trainval2017.zip 29 | unzip $STAGE_DIR/annotations_trainval2017.zip -d $STAGE_DIR 30 | rm $STAGE_DIR/annotations_trainval2017.zip 31 | 32 | mkdir $STAGE_DIR/pretrained-models 33 | wget -O $STAGE_DIR/pretrained-models/ImageNet-R50-AlignPadding.npz http://models.tensorpack.com/FasterRCNN/ImageNet-R50-AlignPadding.npz 34 | 35 | aws s3 cp --recursive $STAGE_DIR s3://$S3_BUCKET/$S3_PREFIX 36 | rm -rf $STAGE_DIR 37 | -------------------------------------------------------------------------------- /eks-cluster/utils/s3-backend.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | [[ $# -ne 2 ]] && echo "usage: $0 s3-bucket s3-prefix" && exit 1 4 | 5 | export S3_BUCKET_NAME=$1 6 | export S3_BUCKET_PREFIX=$2 7 | export PATH_TO_BACKUP=terraform/state 8 | export BUCKET_REGION=$(aws configure get region) 9 | 10 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 11 | 12 | cat <<EOF > $DIR/../terraform/aws-eks-cluster-and-nodegroup/backend.tf 13 | terraform { 14 | backend "s3" { 15 | bucket = "${S3_BUCKET_NAME}" 16 | key = "${S3_BUCKET_PREFIX}/${PATH_TO_BACKUP}" 17 | region = "${BUCKET_REGION}" 18 | } 19 | } 20 | EOF 21 | -------------------------------------------------------------------------------- /eks-cluster/utils/stage-data-fsx.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: stage-data-fsx 6 | data: 7 | stage-data.sh: | 8 | aws s3 cp --recursive s3://$S3_BUCKET/$S3_PREFIX $STAGE_DIR 9 | --- 10 | apiVersion: v1 11 | kind: Pod 12 | metadata: 13 | name: stage-data-fsx 14 | spec: 15 | restartPolicy: Never 16 | volumes: 17 | - name: fsx 18 | persistentVolumeClaim: 19 | claimName: 
pv-fsx # persistent volume claim name 20 | - name: config 21 | configMap: 22 | defaultMode: 420 23 | items: 24 | - key: stage-data.sh 25 | mode: 365 26 | path: stage-data.sh 27 | name: stage-data-fsx 28 | containers: 29 | - name: stage-data-fsx 30 | env: 31 | - name: S3_BUCKET 32 | value: my-bucket 33 | - name: S3_PREFIX 34 | value: ml-platform/data/coco2017 35 | - name: STAGE_DIR 36 | value: /fsx/data/coco2017 37 | command: 38 | - sh 39 | - /etc/config/stage-data.sh 40 | image: amazon/aws-cli # use image with aws cli support 41 | imagePullPolicy: IfNotPresent 42 | volumeMounts: 43 | - mountPath: /etc/config 44 | name: config 45 | - mountPath: /fsx 46 | name: fsx 47 | -------------------------------------------------------------------------------- /eks-cluster/utils/stage-data.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: stage-data 6 | data: 7 | stage-data.sh: | 8 | aws s3 cp --recursive s3://$S3_BUCKET/$S3_PREFIX $STAGE_DIR 9 | --- 10 | apiVersion: v1 11 | kind: Pod 12 | metadata: 13 | name: stage-data 14 | spec: 15 | restartPolicy: Never 16 | volumes: 17 | - name: efs 18 | persistentVolumeClaim: 19 | claimName: pv-efs # persistent volume claim name 20 | - name: config 21 | configMap: 22 | defaultMode: 420 23 | items: 24 | - key: stage-data.sh 25 | mode: 365 26 | path: stage-data.sh 27 | name: stage-data 28 | containers: 29 | - name: stage-data 30 | env: 31 | - name: S3_BUCKET 32 | value: my-bucket 33 | - name: S3_PREFIX 34 | value: ml-platform/data/coco2017 35 | - name: STAGE_DIR 36 | value: /efs/data/coco2017 37 | command: 38 | - sh 39 | - /etc/config/stage-data.sh 40 | image: amazon/aws-cli # use image with aws cli support 41 | imagePullPolicy: IfNotPresent 42 | volumeMounts: 43 | - mountPath: /etc/config 44 | name: config 45 | - mountPath: /efs 46 | name: efs 47 | -------------------------------------------------------------------------------- 
/examples/inference/rayserve/facebook-bart-large-cnn/rayservice.yaml: -------------------------------------------------------------------------------- 1 | ray: 2 | version: '2.44.0' 3 | dashboard: 4 | host: '0.0.0.0' 5 | ports: 6 | - name: gcs 7 | port: 6379 8 | - name: client 9 | port: 10001 10 | - name: dashboard 11 | port: 8265 12 | - name: serve 13 | port: 8000 14 | resources: 15 | requests: 16 | cpu: 300m 17 | limits: 18 | cpu: 2 19 | serve_config_v2: 20 | serveConfigV2: | 21 | applications: 22 | - name: text_summarizer 23 | import_path: text_summarizer.text_summarizer:deployment 24 | runtime_env: 25 | working_dir: "https://github.com/ray-project/serve_config_examples/archive/refs/heads/master.zip" 26 | pip: 27 | - "transformers==4.42.4" 28 | service_unhealthy_threshold_secs: 900 29 | deployment_unhealthy_threshold_secs: 300 30 | image: 31 | image_pull_policy: IfNotPresent 32 | resources: 33 | requests: 34 | "nvidia.com/gpu": 1 35 | limits: 36 | "nvidia.com/gpu": 1 37 | node_type: 'g5.xlarge' 38 | tolerations: 39 | - key: "nvidia.com/gpu" 40 | operator: "Exists" 41 | effect: "NoSchedule" -------------------------------------------------------------------------------- /examples/inference/rayserve/meta-llama3-8b-neuron/engine_config.yaml: -------------------------------------------------------------------------------- 1 | image: public.ecr.aws/docker/library/python:slim-bullseye 2 | inline_script: 3 | - |+ 4 | cat > /tmp/engine.json < /tmp/engine.json < /tmp/engine.json < /tmp/engine.json < /tmp/engine.json < /tmp/engine.json < /tmp/engine.json <&1 | tee $OUTPUT_LOG' 44 | -------------------------------------------------------------------------------- /examples/inference/triton-inference-server/tensorrtllm_backend/llama2-7b/trtllm_engine.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | node_type: g5.48xlarge 4 | requests: 5 | "nvidia.com/gpu": 8 6 | limits: 7 | "nvidia.com/gpu": 8 8 | ebs: 9 | 
storage: 400Gi 10 | mount_path: /tmp 11 | tolerations: 12 | - key: "nvidia.com/gpu" 13 | operator: "Exists" 14 | effect: "NoSchedule" 15 | pre_script: 16 | - mkdir -p $LOG_ROOT 17 | - TP_SIZE=8 18 | - PP_SIZE=1 19 | - OUTPUT_LOG=$LOG_ROOT/build_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log 20 | - CKPT_PATH=$OUTPUT_ROOT/ckpt_tp_${TP_SIZE}_pp_${PP_SIZE} 21 | - TMP_ENGINE_DIR=/tmp/engine_tp_${TP_SIZE}_pp_${PP_SIZE} 22 | post_script: 23 | - rm -rf $OUTPUT_ROOT/engine_tp_${TP_SIZE}_pp_${PP_SIZE} 24 | - cp -r $TMP_ENGINE_DIR $OUTPUT_ROOT/ 25 | process: 26 | env: 27 | - name: LOG_ROOT 28 | value: /efs/home/{{ .Release.Name }}/logs 29 | - name: OUTPUT_ROOT 30 | value: /efs/home/{{ .Release.Name }}/trtllm 31 | - name: MODEL_PATH 32 | value: /fsx/pretrained-models/meta-llama/Llama-2-7b-hf 33 | command: 34 | - trtllm-build 35 | args: 36 | - --checkpoint_dir ${CKPT_PATH} 37 | - --max_num_tokens 2048 38 | - --gpus_per_node 8 39 | - --remove_input_padding enable 40 | - --gemm_plugin float16 41 | - --gpt_attention_plugin float16 42 | - --paged_kv_cache enable 43 | - --context_fmha enable 44 | - --output_dir ${TMP_ENGINE_DIR} 45 | - --max_batch_size 8 46 | - '2>&1 | tee $OUTPUT_LOG' 47 | -------------------------------------------------------------------------------- /examples/inference/triton-inference-server/tensorrtllm_backend/llama3-8b-instruct/hf_to_trtllm.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | node_type: g5.48xlarge 4 | requests: 5 | "nvidia.com/gpu": 8 6 | limits: 7 | "nvidia.com/gpu": 8 8 | ebs: 9 | storage: 400Gi 10 | mount_path: /tmp 11 | tolerations: 12 | - key: "nvidia.com/gpu" 13 | operator: "Exists" 14 | effect: "NoSchedule" 15 | pre_script: 16 | - mkdir -p $LOG_ROOT 17 | - mkdir -p $OUTPUT_ROOT 18 | - TP_SIZE=8 19 | - PP_SIZE=1 20 | - OUTPUT_LOG=$LOG_ROOT/hf_to_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log 21 | - TMP_OUTPUT_PATH=/tmp/ckpt_tp_${TP_SIZE}_pp_${PP_SIZE} 22 | - 
SCRIPT_DIR=TensorRT-LLM/examples/llama 23 | - cd $SCRIPT_DIR 24 | - pip3 install datasets==3.1.0 evaluate~=0.4.3 rouge_score~=0.1.2 sentencepiece~=0.2.0 25 | post_script: 26 | - cp -r $TMP_OUTPUT_PATH $OUTPUT_ROOT/ 27 | process: 28 | env: 29 | - name: LOG_ROOT 30 | value: /efs/home/{{ .Release.Name }}/logs 31 | - name: OUTPUT_ROOT 32 | value: /efs/home/{{ .Release.Name }}/trtllm 33 | - name: MODEL_PATH 34 | value: "/fsx/pretrained-models/meta-llama/Meta-Llama-3-8B-Instruct" 35 | command: 36 | - python3 37 | args: 38 | - convert_checkpoint.py 39 | - --model_dir=$MODEL_PATH 40 | - --output_dir=$TMP_OUTPUT_PATH 41 | - --dtype=float16 42 | - --tp_size=$TP_SIZE 43 | - '2>&1 | tee $OUTPUT_LOG' 44 | -------------------------------------------------------------------------------- /examples/inference/triton-inference-server/tensorrtllm_backend/llama3-8b-instruct/trtllm_engine.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | node_type: g5.48xlarge 4 | requests: 5 | "nvidia.com/gpu": 8 6 | limits: 7 | "nvidia.com/gpu": 8 8 | ebs: 9 | storage: 400Gi 10 | mount_path: /tmp 11 | tolerations: 12 | - key: "nvidia.com/gpu" 13 | operator: "Exists" 14 | effect: "NoSchedule" 15 | pre_script: 16 | - mkdir -p $LOG_ROOT 17 | - TP_SIZE=8 18 | - PP_SIZE=1 19 | - OUTPUT_LOG=$LOG_ROOT/build_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log 20 | - CKPT_PATH=$OUTPUT_ROOT/ckpt_tp_${TP_SIZE}_pp_${PP_SIZE} 21 | - TMP_ENGINE_DIR=/tmp/engine_tp_${TP_SIZE}_pp_${PP_SIZE} 22 | post_script: 23 | - rm -rf $OUTPUT_ROOT/engine_tp_${TP_SIZE}_pp_${PP_SIZE} 24 | - cp -r $TMP_ENGINE_DIR $OUTPUT_ROOT/ 25 | process: 26 | env: 27 | - name: LOG_ROOT 28 | value: /efs/home/{{ .Release.Name }}/logs 29 | - name: OUTPUT_ROOT 30 | value: /efs/home/{{ .Release.Name }}/trtllm 31 | - name: MODEL_PATH 32 | value: "/fsx/pretrained-models/meta-llama/Meta-Llama-3-8B-Instruct" 33 | command: 34 | - trtllm-build 35 | args: 36 | - --checkpoint_dir ${CKPT_PATH} 37 | - 
--max_num_tokens 8192 38 | - --gpus_per_node 8 39 | - --remove_input_padding enable 40 | - --gemm_plugin float16 41 | - --gpt_attention_plugin float16 42 | - --paged_kv_cache enable 43 | - --context_fmha enable 44 | - --output_dir ${TMP_ENGINE_DIR} 45 | - --max_batch_size 4 46 | - '2>&1 | tee $OUTPUT_LOG' 47 | -------------------------------------------------------------------------------- /examples/inference/triton-inference-server/tensorrtllm_backend/mistral-7b-instruct-v01/hf_to_trtllm.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | node_type: g5.48xlarge 4 | requests: 5 | "nvidia.com/gpu": 8 6 | limits: 7 | "nvidia.com/gpu": 8 8 | ebs: 9 | storage: 400Gi 10 | mount_path: /tmp 11 | tolerations: 12 | - key: "nvidia.com/gpu" 13 | operator: "Exists" 14 | effect: "NoSchedule" 15 | pre_script: 16 | - mkdir -p $LOG_ROOT 17 | - mkdir -p $OUTPUT_ROOT 18 | - TP_SIZE=8 19 | - PP_SIZE=1 20 | - OUTPUT_LOG=$LOG_ROOT/hf_to_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log 21 | - TMP_OUTPUT_PATH=/tmp/ckpt_tp_${TP_SIZE}_pp_${PP_SIZE} 22 | - SCRIPT_DIR=TensorRT-LLM/examples/llama 23 | - cd $SCRIPT_DIR 24 | - pip3 install datasets==3.1.0 evaluate~=0.4.3 rouge_score~=0.1.2 sentencepiece~=0.2.0 25 | post_script: 26 | - cp -r $TMP_OUTPUT_PATH $OUTPUT_ROOT/ 27 | process: 28 | env: 29 | - name: LOG_ROOT 30 | value: /efs/home/{{ .Release.Name }}/logs 31 | - name: OUTPUT_ROOT 32 | value: /efs/home/{{ .Release.Name }}/trtllm 33 | - name: MODEL_PATH 34 | value: /fsx/pretrained-models/mistralai/Mistral-7B-Instruct-v0.1 35 | command: 36 | - python3 37 | args: 38 | - convert_checkpoint.py 39 | - --model_dir=$MODEL_PATH 40 | - --output_dir=$TMP_OUTPUT_PATH 41 | - --dtype=float16 42 | - --tp_size=$TP_SIZE 43 | - '2>&1 | tee $OUTPUT_LOG' 44 | -------------------------------------------------------------------------------- /examples/inference/triton-inference-server/tensorrtllm_backend/mistral-7b-instruct-v01/trtllm_engine.yaml: 
-------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | node_type: g5.48xlarge 4 | requests: 5 | "nvidia.com/gpu": 8 6 | limits: 7 | "nvidia.com/gpu": 8 8 | ebs: 9 | storage: 400Gi 10 | mount_path: /tmp 11 | tolerations: 12 | - key: "nvidia.com/gpu" 13 | operator: "Exists" 14 | effect: "NoSchedule" 15 | pre_script: 16 | - mkdir -p $LOG_ROOT 17 | - TP_SIZE=8 18 | - PP_SIZE=1 19 | - OUTPUT_LOG=$LOG_ROOT/build_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log 20 | - CKPT_PATH=$OUTPUT_ROOT/ckpt_tp_${TP_SIZE}_pp_${PP_SIZE} 21 | - TMP_ENGINE_DIR=/tmp/engine_tp_${TP_SIZE}_pp_${PP_SIZE} 22 | post_script: 23 | - rm -rf $OUTPUT_ROOT/engine_tp_${TP_SIZE}_pp_${PP_SIZE} 24 | - cp -r $TMP_ENGINE_DIR $OUTPUT_ROOT/ 25 | process: 26 | env: 27 | - name: LOG_ROOT 28 | value: /efs/home/{{ .Release.Name }}/logs 29 | - name: OUTPUT_ROOT 30 | value: /efs/home/{{ .Release.Name }}/trtllm 31 | - name: MODEL_PATH 32 | value: /fsx/pretrained-models/mistralai/Mistral-7B-Instruct-v0.1 33 | command: 34 | - trtllm-build 35 | args: 36 | - --checkpoint_dir ${CKPT_PATH} 37 | - --max_num_tokens 32768 38 | - --gpus_per_node 8 39 | - --remove_input_padding enable 40 | - --gemm_plugin float16 41 | - --gpt_attention_plugin float16 42 | - --paged_kv_cache enable 43 | - --context_fmha enable 44 | - --output_dir ${TMP_ENGINE_DIR} 45 | - --max_batch_size 4 46 | - '2>&1 | tee $OUTPUT_LOG' 47 | -------------------------------------------------------------------------------- /examples/inference/triton-inference-server/tensorrtllm_backend/mistral-7b-instruct-v01_llama3-8b/llama3_8b_hf_to_trtllm.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | node_type: g5.48xlarge 4 | requests: 5 | "nvidia.com/gpu": 8 6 | limits: 7 | "nvidia.com/gpu": 8 8 | ebs: 9 | storage: 400Gi 10 | mount_path: /tmp 11 | tolerations: 12 | - key: "nvidia.com/gpu" 13 | operator: "Exists" 14 | effect: "NoSchedule" 15 | pre_script: 16 
| - mkdir -p $LOG_ROOT 17 | - mkdir -p $OUTPUT_ROOT 18 | - TP_SIZE=8 19 | - PP_SIZE=1 20 | - MODEL_NAME=llama3_8b_instruct 21 | - OUTPUT_LOG=$LOG_ROOT/${MODEL_NAME}_hf_to_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log 22 | - TMP_OUTPUT_PATH=/tmp/${MODEL_NAME}_ckpt_tp_${TP_SIZE}_pp_${PP_SIZE} 23 | - SCRIPT_DIR=TensorRT-LLM/examples/llama 24 | - cd $SCRIPT_DIR 25 | - pip3 install datasets==3.1.0 evaluate~=0.4.3 rouge_score~=0.1.2 sentencepiece~=0.2.0 26 | post_script: 27 | - cp -r $TMP_OUTPUT_PATH $OUTPUT_ROOT/ 28 | process: 29 | env: 30 | - name: LOG_ROOT 31 | value: /efs/home/{{ .Release.Name }}/logs 32 | - name: OUTPUT_ROOT 33 | value: /efs/home/{{ .Release.Name }}/trtllm 34 | - name: MODEL_PATH 35 | value: /fsx/pretrained-models/meta-llama/Meta-Llama-3-8B-Instruct 36 | command: 37 | - python3 38 | args: 39 | - convert_checkpoint.py 40 | - --model_dir=$MODEL_PATH 41 | - --output_dir=$TMP_OUTPUT_PATH 42 | - --dtype=float16 43 | - --tp_size=$TP_SIZE 44 | - '2>&1 | tee $OUTPUT_LOG' 45 | -------------------------------------------------------------------------------- /examples/inference/triton-inference-server/tensorrtllm_backend/mistral-7b-instruct-v01_llama3-8b/mistral_7b_hf_to_trtllm.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | node_type: g5.48xlarge 4 | requests: 5 | "nvidia.com/gpu": 8 6 | limits: 7 | "nvidia.com/gpu": 8 8 | ebs: 9 | storage: 400Gi 10 | mount_path: /tmp 11 | tolerations: 12 | - key: "nvidia.com/gpu" 13 | operator: "Exists" 14 | effect: "NoSchedule" 15 | pre_script: 16 | - mkdir -p $LOG_ROOT 17 | - mkdir -p $OUTPUT_ROOT 18 | - TP_SIZE=8 19 | - PP_SIZE=1 20 | - MODEL_NAME=mistral_7b_instruct 21 | - OUTPUT_LOG=$LOG_ROOT/${MODEL_NAME}_hf_to_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log 22 | - TMP_OUTPUT_PATH=/tmp/${MODEL_NAME}_ckpt_tp_${TP_SIZE}_pp_${PP_SIZE} 23 | - SCRIPT_DIR=TensorRT-LLM/examples/llama 24 | - cd $SCRIPT_DIR 25 | - pip3 install datasets==3.1.0 evaluate~=0.4.3 
rouge_score~=0.1.2 sentencepiece~=0.2.0 26 | post_script: 27 | - cp -r $TMP_OUTPUT_PATH $OUTPUT_ROOT/ 28 | process: 29 | env: 30 | - name: LOG_ROOT 31 | value: /efs/home/{{ .Release.Name }}/logs 32 | - name: OUTPUT_ROOT 33 | value: /efs/home/{{ .Release.Name }}/trtllm 34 | - name: MODEL_PATH 35 | value: /fsx/pretrained-models/mistralai/Mistral-7B-Instruct-v0.1 36 | command: 37 | - python3 38 | args: 39 | - convert_checkpoint.py 40 | - --model_dir=$MODEL_PATH 41 | - --output_dir=$TMP_OUTPUT_PATH 42 | - --dtype=float16 43 | - --tp_size=$TP_SIZE 44 | - '2>&1 | tee $OUTPUT_LOG' 45 | -------------------------------------------------------------------------------- /examples/inference/triton-inference-server/tensorrtllm_backend/mistral-8x22b-instruct-v01/hf_to_trtllm.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | node_type: p4d.24xlarge 4 | requests: 5 | "nvidia.com/gpu": 8 6 | limits: 7 | "nvidia.com/gpu": 8 8 | ebs: 9 | storage: 400Gi 10 | mount_path: /tmp 11 | tolerations: 12 | - key: "nvidia.com/gpu" 13 | operator: "Exists" 14 | effect: "NoSchedule" 15 | pre_script: 16 | - mkdir -p $LOG_ROOT 17 | - mkdir -p $OUTPUT_ROOT 18 | - TP_SIZE=8 19 | - PP_SIZE=2 20 | - OUTPUT_LOG=$LOG_ROOT/hf_to_trtllm.log 21 | - TMP_OUTPUT_PATH=/tmp/ckpt 22 | - SCRIPT_DIR=TensorRT-LLM/examples/llama 23 | - cd $SCRIPT_DIR 24 | - pip3 install datasets==3.1.0 evaluate~=0.4.3 rouge_score~=0.1.2 sentencepiece~=0.2.0 25 | post_script: 26 | - cp -r $TMP_OUTPUT_PATH $OUTPUT_ROOT/ 27 | process: 28 | env: 29 | - name: LOG_ROOT 30 | value: /efs/home/{{ .Release.Name }}/logs 31 | - name: OUTPUT_ROOT 32 | value: /efs/home/{{ .Release.Name }}/trtllm 33 | - name: MODEL_PATH 34 | value: /fsx/pretrained-models/mistralai/Mixtral-8x22B-Instruct-v0.1 35 | command: 36 | - python3 37 | args: 38 | - convert_checkpoint.py 39 | - --model_dir=$MODEL_PATH 40 | - --output_dir=$TMP_OUTPUT_PATH 41 | - --dtype=float16 42 | - --tp_size=$TP_SIZE 43 | - 
--pp_size=$PP_SIZE 44 | - '2>&1 | tee $OUTPUT_LOG' 45 | -------------------------------------------------------------------------------- /examples/inference/triton-inference-server/tensorrtllm_backend/mistral-8x22b-instruct-v01/trtllm_engine.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | node_type: p4d.24xlarge 4 | requests: 5 | "nvidia.com/gpu": 8 6 | limits: 7 | "nvidia.com/gpu": 8 8 | ebs: 9 | storage: 400Gi 10 | mount_path: /tmp 11 | tolerations: 12 | - key: "nvidia.com/gpu" 13 | operator: "Exists" 14 | effect: "NoSchedule" 15 | pre_script: 16 | - mkdir -p $LOG_ROOT 17 | - OUTPUT_LOG=$LOG_ROOT/build_trtllm.log 18 | - CKPT_PATH=$OUTPUT_ROOT/ckpt 19 | - TMP_ENGINE_DIR=/tmp/engine 20 | post_script: 21 | - rm -rf $OUTPUT_ROOT/engine 22 | - cp -r $TMP_ENGINE_DIR $OUTPUT_ROOT/ 23 | process: 24 | env: 25 | - name: LOG_ROOT 26 | value: /efs/home/{{ .Release.Name }}/logs 27 | - name: OUTPUT_ROOT 28 | value: /efs/home/{{ .Release.Name }}/trtllm 29 | command: 30 | - trtllm-build 31 | args: 32 | - --checkpoint_dir ${CKPT_PATH} 33 | - --max_num_tokens 16384 34 | - --gpus_per_node 8 35 | - --remove_input_padding enable 36 | - --gemm_plugin float16 37 | - --gpt_attention_plugin float16 38 | - --paged_kv_cache enable 39 | - --context_fmha enable 40 | - --output_dir ${TMP_ENGINE_DIR} 41 | - --max_batch_size 8 42 | - '2>&1 | tee $OUTPUT_LOG' 43 | -------------------------------------------------------------------------------- /examples/legacy/README.md: -------------------------------------------------------------------------------- 1 | ## Legacy Tutorials 2 | 3 | 4 | ### [TensorFlow](https://www.tensorflow.org/) 5 | 6 | 7 | | Model | Accelerator | Notes | 8 | | ----------- | ----------- | -------- | 9 | | [Mask R-CNN](./maskrcnn/README.md) | Nvidia GPU | Mask R-CNN training for [AWS Samples Mask R-CNN](https://github.com/aws-samples/mask-rcnn-tensorflow) on COCO 2017 dataset | 10 | 11 | 12 | ### [Neuronx Nemo 
Megatron](https://github.com/aws-neuron/neuronx-nemo-megatron) 13 | 14 | 15 | | Model | Accelerator | Notes | 16 | | ----------- | ----------- | -------- | 17 | | [Llama 2 7B Pre-training](./neuronx-nemo-megatron/llama2_7b/README.md) | AWS Trainium1 | Llama 2 7B pre-training on Wikicorpus dataset | 18 | | [Llama 2 13B Pre-training](./neuronx-nemo-megatron/llama2_13b/README.md) | AWS Trainium1 | Llama 2 13B pre-training on Wikicorpus dataset | 19 | | [Llama 2 70B Pre-training](./neuronx-nemo-megatron/llama2_70b/README.md) | AWS Trainium1 | Llama 2 70B pre-training on Wikicorpus dataset | 20 | 21 | -------------------------------------------------------------------------------- /examples/legacy/maskrcnn/train-maskrcnn-aws.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | pre_script: 3 | - DATE=$(date '+%Y-%m-%d-%H-%M-%S') 4 | - LOG_DIR=$HOME/logs/maskrcnn-tensorflow-$DATE 5 | - cd /mask-rcnn-tensorflow 6 | resources: 7 | gpu_nodes: 2 8 | gpus_per_node: 8 9 | gpu_instance_type: "p4d.24xlarge" 10 | requests: 11 | "nvidia.com/gpu": 8 12 | limits: 13 | "nvidia.com/gpu": 8 14 | tensorflow: 15 | sys_memory_mb: "4096" 16 | train: 17 | command: 18 | - python3 19 | args: 20 | - /mask-rcnn-tensorflow/MaskRCNN/train.py 21 | - "--logdir $LOG_DIR" 22 | - "--images_per_epoch 120000" 23 | - --config 24 | - MODE_MASK='True' 25 | - MODE_FPN='True' 26 | - DATA.BASEDIR=/fsx/data/coco2017 27 | - DATA.TRAIN='["train2017"]' 28 | - DATA.VAL='("val2017")' 29 | - TRAIN.BATCH_SIZE_PER_GPU=4 30 | - TRAIN.EVAL_PERIOD=1 31 | - TRAIN.LR_EPOCH_SCHEDULE="[(16, 0.1), (20, 0.01), (24, None)]" 32 | - TRAIN.BASE_LR=0.0015625 33 | - BACKBONE.WEIGHTS=/fsx/data/coco2017/pretrained-models/ImageNet-R50-AlignPadding.npz 34 | - BACKBONE.NORM=FreezeBN 35 | - PREPROC.PREDEFINED_PADDING=True 36 | - TRAINER=horovod 37 | - TRAIN.GRADIENT_CLIP=0.36 38 | --------------------------------------------------------------------------------
/examples/legacy/maskrcnn/train-maskrcnn-tensorpack.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | pre_script: 3 | - DATE=$(date '+%Y-%m-%d-%H-%M-%S') 4 | - LOG_DIR=$HOME/logs/maskrcnn-tensorpack-$DATE 5 | - cd /mask-rcnn-tensorflow 6 | resources: 7 | gpu_nodes: 2 8 | gpus_per_node: 8 9 | gpu_instance_type: "p4d.24xlarge" 10 | requests: 11 | "nvidia.com/gpu": 8 12 | limits: 13 | "nvidia.com/gpu": 8 14 | tensorflow: 15 | sys_memory_mb: "2560" 16 | train: 17 | command: 18 | - python3 19 | args: 20 | - /tensorpack/examples/FasterRCNN/train.py 21 | - "--logdir $LOG_DIR" 22 | - --config 23 | - MODE_MASK='True' 24 | - MODE_FPN='True' 25 | - DATA.BASEDIR=/fsx/data/coco2017 26 | - DATA.TRAIN='["coco_train2017"]' 27 | - DATA.VAL='("coco_val2017")' 28 | - TRAIN.EVAL_PERIOD=1 29 | - TRAIN.STEPS_PER_EPOCH=7500 30 | - TRAIN.LR_SCHEDULE='[240000,320000,360000]' 31 | - TRAIN.BASE_LR=0.01 32 | - BACKBONE.WEIGHTS=/fsx/data/coco2017/pretrained-models/ImageNet-R50-AlignPadding.npz 33 | - BACKBONE.NORM=FreezeBN 34 | - TRAINER=horovod 35 | - TRAIN.CHECKPOINT_PERIOD=2 36 | -------------------------------------------------------------------------------- /examples/training/megatron-deepspeed/gpt2_345m/wikicorpus.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 1 5 | limits: 6 | "nvidia.com/gpu": 1 7 | tolerations: 8 | - key: "nvidia.com/gpu" 9 | operator: "Exists" 10 | effect: "NoSchedule" 11 | ebs: 12 | storage: 100Gi 13 | mount_path: /tmp 14 | git: 15 | repo_url: 'https://github.com/microsoft/Megatron-DeepSpeed.git' 16 | branch: main 17 | commit: a9856ce0e75dbe69c96d4e241e8a191b344118d7 18 | pre_script: 19 | - pip install --upgrade pip 20 | - pip install transformers==4.38.1 datasets==2.17.1 21 | - pip install nltk==3.8.1 22 | - python <&1 | tee $OUTPUT_LOG' 34 | 
-------------------------------------------------------------------------------- /examples/training/nemo-megatron/llama2-7b-peft/merge_peft.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 8 5 | limits: 6 | "nvidia.com/gpu": 8 7 | ebs: 8 | storage: 500Gi 9 | mount_path: /tmp 10 | tolerations: 11 | - key: "nvidia.com/gpu" 12 | operator: "Exists" 13 | effect: "NoSchedule" 14 | pre_script: 15 | - SCRIPT_DIR=/NeMo/scripts/nlp_language_modeling/merge_lora_weights 16 | - cd $SCRIPT_DIR 17 | - mkdir -p $LOG_ROOT 18 | - OUTPUT_LOG=$LOG_ROOT/merge_peft.log 19 | - PATH_TO_BASE_MODEL=$MODEL_PATH/ckpt.nemo 20 | - echo "PATH_TO_BASE_MODEL=$PATH_TO_BASE_MODEL" 21 | - PATH_TO_PEFT_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/$EXP_NAME.nemo 22 | - echo "PATH_TO_PEFT_MODEL=$PATH_TO_PEFT_MODEL" 23 | - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo 24 | - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL" 25 | process: 26 | env: 27 | - name: LOG_ROOT 28 | value: /efs/home/{{ .Release.Name }}/logs 29 | - name: MODEL_PATH 30 | value: /fsx/pretrained-models/meta-llama/Llama-2-7b-hf 31 | - name: EXP_NAME 32 | value: peft_pubmedqa 33 | command: 34 | - python 35 | args: 36 | - merge.py 37 | - trainer.accelerator=cpu 38 | - gpt_model_file=$PATH_TO_BASE_MODEL 39 | - lora_model_path=$PATH_TO_PEFT_MODEL 40 | - merged_model_path=$PATH_TO_MERGED_MODEL 41 | - '2>&1 | tee $OUTPUT_LOG' 42 | -------------------------------------------------------------------------------- /examples/training/nemo-megatron/llama2-7b-peft/nemo_to_hf.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 8 5 | limits: 6 | "nvidia.com/gpu": 8 7 | ebs: 8 | storage: 500Gi 9 | mount_path: /tmp 10 | tolerations: 11 | - key: "nvidia.com/gpu" 12 | operator: "Exists" 13 | effect: 
"NoSchedule" 14 | pre_script: 15 | - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters 16 | - cd $SCRIPT_DIR 17 | - mkdir -p $LOG_ROOT 18 | - OUTPUT_LOG=$LOG_ROOT/nemo_to_hf.log 19 | - TMP_OUTPUT_PATH=/tmp/hf_peft_model 20 | - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo 21 | - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL" 22 | post_script: 23 | - cp -r $TMP_OUTPUT_PATH $MODEL_PATH/ 24 | process: 25 | env: 26 | - name: LOG_ROOT 27 | value: /efs/home/{{ .Release.Name }}/logs 28 | - name: MODEL_PATH 29 | value: /fsx/pretrained-models/meta-llama/Llama-2-7b-hf 30 | - name: EXP_NAME 31 | value: peft_pubmedqa 32 | command: 33 | - python3 34 | args: 35 | - convert_llama_nemo_to_hf.py 36 | - --input_name_or_path=$PATH_TO_MERGED_MODEL 37 | - --output_path=$TMP_OUTPUT_PATH/model.bin 38 | - --hf_input_path=$MODEL_PATH 39 | - --hf_output_path=$TMP_OUTPUT_PATH 40 | - --cpu-only 41 | - '2>&1 | tee $OUTPUT_LOG' 42 | -------------------------------------------------------------------------------- /examples/training/nemo-megatron/llama2-7b-peft/peft_accuracy.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | cpu: "300m" 5 | memory: "256Mi" 6 | limits: 7 | cpu: "1000m" 8 | memory: "2048Mi" 9 | ebs: 10 | storage: 200Gi 11 | mount_path: /tmp 12 | inline_script: 13 | - |+ 14 | cat > /tmp/run_accuracy_metric_calculation.py <&1 | tee $OUTPUT_LOG' 55 | -------------------------------------------------------------------------------- /examples/training/nemo-megatron/llama31-8b-peft-dolphin/hf_to_nemo.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 4 5 | limits: 6 | "nvidia.com/gpu": 4 7 | ebs: 8 | storage: 400Gi 9 | mount_path: /tmp 10 | tolerations: 11 | - key: "nvidia.com/gpu" 12 | operator: "Exists" 13 | effect: "NoSchedule" 14 | pre_script: 15 | - 
SCRIPT_DIR=/NeMo/scripts/checkpoint_converters 16 | - cd $SCRIPT_DIR 17 | - mkdir -p $LOG_ROOT 18 | - OUTPUT_LOG=$LOG_ROOT/hf_to_nemo.log 19 | - TMP_MODEL_PATH=/tmp/model 20 | - cp -r $MODEL_PATH $TMP_MODEL_PATH 21 | - 'if [ -f $TMP_MODEL_PATH/tokenizer.model ]; then rm -f $TMP_MODEL_PATH/tokenizer.model; fi' 22 | post_script: 23 | - cp -r $TMP_MODEL_PATH/ckpt.nemo $MODEL_PATH/ 24 | process: 25 | env: 26 | - name: LOG_ROOT 27 | value: /efs/home/{{ .Release.Name }}/logs 28 | - name: MODEL_PATH 29 | value: /fsx/pretrained-models/meta-llama/Llama-3.1-8B 30 | command: 31 | - python3 32 | args: 33 | - convert_llama_hf_to_nemo.py 34 | - --input_name_or_path=$TMP_MODEL_PATH 35 | - --output_path=$TMP_MODEL_PATH/ckpt.nemo 36 | - --llama31=True 37 | - '2>&1' 38 | -------------------------------------------------------------------------------- /examples/training/nemo-megatron/llama31-8b-peft-dolphin/merge_peft.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 8 5 | limits: 6 | "nvidia.com/gpu": 8 7 | ebs: 8 | storage: 500Gi 9 | mount_path: /tmp 10 | tolerations: 11 | - key: "nvidia.com/gpu" 12 | operator: "Exists" 13 | effect: "NoSchedule" 14 | pre_script: 15 | - SCRIPT_DIR=/NeMo/scripts/nlp_language_modeling/merge_lora_weights 16 | - cd $SCRIPT_DIR 17 | - mkdir -p $LOG_ROOT 18 | - OUTPUT_LOG=$LOG_ROOT/merge_peft.log 19 | - PATH_TO_BASE_MODEL=$MODEL_PATH/ckpt.nemo 20 | - echo "PATH_TO_BASE_MODEL=$PATH_TO_BASE_MODEL" 21 | - PATH_TO_PEFT_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/$EXP_NAME.nemo 22 | - echo "PATH_TO_PEFT_MODEL=$PATH_TO_PEFT_MODEL" 23 | - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo 24 | - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL" 25 | process: 26 | env: 27 | - name: LOG_ROOT 28 | value: /efs/home/{{ .Release.Name }}/logs 29 | - name: MODEL_PATH 30 | value: /fsx/pretrained-models/meta-llama/Llama-3.1-8B 
31 | - name: EXP_NAME 32 | value: peft_dolphin 33 | - name: HF_TOKEN 34 | value: "{{ .Values.hf_token }}" 35 | command: 36 | - python 37 | args: 38 | - merge.py 39 | - trainer.accelerator=cpu 40 | - gpt_model_file=$PATH_TO_BASE_MODEL 41 | - lora_model_path=$PATH_TO_PEFT_MODEL 42 | - merged_model_path=$PATH_TO_MERGED_MODEL 43 | - '2>&1 | tee $OUTPUT_LOG' 44 | -------------------------------------------------------------------------------- /examples/training/nemo-megatron/llama31-8b-peft-dolphin/nemo_to_hf.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 8 5 | limits: 6 | "nvidia.com/gpu": 8 7 | ebs: 8 | storage: 500Gi 9 | mount_path: /tmp 10 | tolerations: 11 | - key: "nvidia.com/gpu" 12 | operator: "Exists" 13 | effect: "NoSchedule" 14 | pre_script: 15 | - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters 16 | - cd $SCRIPT_DIR 17 | - mkdir -p $LOG_ROOT 18 | - OUTPUT_LOG=$LOG_ROOT/nemo_to_hf.log 19 | - TMP_OUTPUT_PATH=/tmp/$EXP_NAME 20 | - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo 21 | - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL" 22 | post_script: 23 | - cp -r $TMP_OUTPUT_PATH $MODEL_PATH/ 24 | process: 25 | env: 26 | - name: LOG_ROOT 27 | value: /efs/home/{{ .Release.Name }}/logs 28 | - name: MODEL_PATH 29 | value: /fsx/pretrained-models/meta-llama/Llama-3.1-8B 30 | - name: EXP_NAME 31 | value: peft_dolphin 32 | - name: HF_TOKEN 33 | value: "{{ .Values.hf_token }}" 34 | command: 35 | - python3 36 | args: 37 | - convert_llama_nemo_to_hf.py 38 | - --input_name_or_path=$PATH_TO_MERGED_MODEL 39 | - --output_path=$TMP_OUTPUT_PATH/model.bin 40 | - --hf_input_path=$MODEL_PATH 41 | - --hf_output_path=$TMP_OUTPUT_PATH 42 | - --cpu-only 43 | - '2>&1 | tee $OUTPUT_LOG' 44 | -------------------------------------------------------------------------------- 
/examples/training/nemo-megatron/mistral-7b-v01-peft-dolphin/hf_to_nemo.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 4 5 | limits: 6 | "nvidia.com/gpu": 4 7 | ebs: 8 | storage: 400Gi 9 | mount_path: /tmp 10 | tolerations: 11 | - key: "nvidia.com/gpu" 12 | operator: "Exists" 13 | effect: "NoSchedule" 14 | pre_script: 15 | - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters 16 | - cd $SCRIPT_DIR 17 | - mkdir -p $LOG_ROOT 18 | - OUTPUT_LOG=$LOG_ROOT/hf_to_nemo.log 19 | post_script: 20 | - cp -r /tmp/ckpt.nemo $MODEL_PATH/ 21 | process: 22 | env: 23 | - name: LOG_ROOT 24 | value: /efs/home/{{ .Release.Name }}/logs 25 | - name: MODEL_PATH 26 | value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1 27 | command: 28 | - python3 29 | args: 30 | - convert_mistral_7b_hf_to_nemo.py 31 | - --input_name_or_path=$MODEL_PATH/ 32 | - --output_path=/tmp/ckpt.nemo 33 | - '2>&1 | tee $OUTPUT_LOG' 34 | -------------------------------------------------------------------------------- /examples/training/nemo-megatron/mistral-7b-v01-peft-dolphin/merge_peft.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 8 5 | limits: 6 | "nvidia.com/gpu": 8 7 | ebs: 8 | storage: 500Gi 9 | mount_path: /tmp 10 | tolerations: 11 | - key: "nvidia.com/gpu" 12 | operator: "Exists" 13 | effect: "NoSchedule" 14 | pre_script: 15 | - SCRIPT_DIR=/NeMo/scripts/nlp_language_modeling/merge_lora_weights 16 | - cd $SCRIPT_DIR 17 | - mkdir -p $LOG_ROOT 18 | - OUTPUT_LOG=$LOG_ROOT/merge_peft.log 19 | - PATH_TO_BASE_MODEL=$MODEL_PATH/ckpt.nemo 20 | - echo "PATH_TO_BASE_MODEL=$PATH_TO_BASE_MODEL" 21 | - PATH_TO_PEFT_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/$EXP_NAME.nemo 22 | - echo "PATH_TO_PEFT_MODEL=$PATH_TO_PEFT_MODEL" 23 | - 
PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo 24 | - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL" 25 | process: 26 | env: 27 | - name: LOG_ROOT 28 | value: /efs/home/{{ .Release.Name }}/logs 29 | - name: MODEL_PATH 30 | value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1 31 | - name: EXP_NAME 32 | value: peft_dolphin 33 | command: 34 | - python 35 | args: 36 | - merge.py 37 | - trainer.accelerator=cpu 38 | - gpt_model_file=$PATH_TO_BASE_MODEL 39 | - lora_model_path=$PATH_TO_PEFT_MODEL 40 | - merged_model_path=$PATH_TO_MERGED_MODEL 41 | - '2>&1 | tee $OUTPUT_LOG' 42 | -------------------------------------------------------------------------------- /examples/training/nemo-megatron/mistral-7b-v01-peft-dolphin/nemo_to_hf.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 8 5 | limits: 6 | "nvidia.com/gpu": 8 7 | ebs: 8 | storage: 500Gi 9 | mount_path: /tmp 10 | tolerations: 11 | - key: "nvidia.com/gpu" 12 | operator: "Exists" 13 | effect: "NoSchedule" 14 | pre_script: 15 | - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters 16 | - cd $SCRIPT_DIR 17 | - mkdir -p $LOG_ROOT 18 | - OUTPUT_LOG=$LOG_ROOT/nemo_to_hf.log 19 | - TMP_OUTPUT_PATH=/tmp/$EXP_NAME 20 | - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo 21 | - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL" 22 | post_script: 23 | - cp -r $TMP_OUTPUT_PATH $MODEL_PATH/ 24 | process: 25 | env: 26 | - name: LOG_ROOT 27 | value: /efs/home/{{ .Release.Name }}/logs 28 | - name: MODEL_PATH 29 | value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1 30 | - name: EXP_NAME 31 | value: peft_dolphin 32 | command: 33 | - python3 34 | args: 35 | - convert_mistral_7b_nemo_to_hf.py 36 | - --input_name_or_path=$PATH_TO_MERGED_MODEL 37 | - --output_path=$TMP_OUTPUT_PATH 38 | - --hf_model_name=$MODEL_PATH 39 | - '2>&1 | tee $OUTPUT_LOG' 40 | 
-------------------------------------------------------------------------------- /examples/training/nemo-megatron/mistral-7b-v01-peft/hf_to_nemo.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 4 5 | limits: 6 | "nvidia.com/gpu": 4 7 | ebs: 8 | storage: 400Gi 9 | mount_path: /tmp 10 | tolerations: 11 | - key: "nvidia.com/gpu" 12 | operator: "Exists" 13 | effect: "NoSchedule" 14 | pre_script: 15 | - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters 16 | - cd $SCRIPT_DIR 17 | - mkdir -p $LOG_ROOT 18 | - OUTPUT_LOG=$LOG_ROOT/hf_to_nemo.log 19 | post_script: 20 | - cp -r /tmp/ckpt.nemo $MODEL_PATH/ 21 | process: 22 | env: 23 | - name: LOG_ROOT 24 | value: /efs/home/{{ .Release.Name }}/logs 25 | - name: MODEL_PATH 26 | value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1 27 | command: 28 | - python3 29 | args: 30 | - convert_mistral_7b_hf_to_nemo.py 31 | - --input_name_or_path=$MODEL_PATH/ 32 | - --output_path=/tmp/ckpt.nemo 33 | - '2>&1 | tee $OUTPUT_LOG' 34 | -------------------------------------------------------------------------------- /examples/training/nemo-megatron/mistral-7b-v01-peft/merge_peft.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 8 5 | limits: 6 | "nvidia.com/gpu": 8 7 | ebs: 8 | storage: 500Gi 9 | mount_path: /tmp 10 | tolerations: 11 | - key: "nvidia.com/gpu" 12 | operator: "Exists" 13 | effect: "NoSchedule" 14 | pre_script: 15 | - SCRIPT_DIR=/NeMo/scripts/nlp_language_modeling/merge_lora_weights 16 | - cd $SCRIPT_DIR 17 | - mkdir -p $LOG_ROOT 18 | - OUTPUT_LOG=$LOG_ROOT/merge_peft.log 19 | - PATH_TO_BASE_MODEL=$MODEL_PATH/ckpt.nemo 20 | - echo "PATH_TO_BASE_MODEL=$PATH_TO_BASE_MODEL" 21 | - PATH_TO_PEFT_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/$EXP_NAME.nemo 22 | - echo "PATH_TO_PEFT_MODEL=$PATH_TO_PEFT_MODEL" 23 | - 
PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo 24 | - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL" 25 | process: 26 | env: 27 | - name: LOG_ROOT 28 | value: /efs/home/{{ .Release.Name }}/logs 29 | - name: MODEL_PATH 30 | value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1 31 | - name: EXP_NAME 32 | value: peft_pubmedqa 33 | command: 34 | - python 35 | args: 36 | - merge.py 37 | - trainer.accelerator=cpu 38 | - gpt_model_file=$PATH_TO_BASE_MODEL 39 | - lora_model_path=$PATH_TO_PEFT_MODEL 40 | - merged_model_path=$PATH_TO_MERGED_MODEL 41 | - '2>&1 | tee $OUTPUT_LOG' 42 | -------------------------------------------------------------------------------- /examples/training/nemo-megatron/mistral-7b-v01-peft/nemo_to_hf.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | "nvidia.com/gpu": 8 5 | limits: 6 | "nvidia.com/gpu": 8 7 | ebs: 8 | storage: 500Gi 9 | mount_path: /tmp 10 | tolerations: 11 | - key: "nvidia.com/gpu" 12 | operator: "Exists" 13 | effect: "NoSchedule" 14 | pre_script: 15 | - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters 16 | - cd $SCRIPT_DIR 17 | - mkdir -p $LOG_ROOT 18 | - OUTPUT_LOG=$LOG_ROOT/nemo_to_hf.log 19 | - TMP_OUTPUT_PATH=/tmp/hf_peft_model 20 | - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo 21 | - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL" 22 | post_script: 23 | - cp -r $TMP_OUTPUT_PATH $MODEL_PATH/ 24 | process: 25 | env: 26 | - name: LOG_ROOT 27 | value: /efs/home/{{ .Release.Name }}/logs 28 | - name: MODEL_PATH 29 | value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1 30 | - name: EXP_NAME 31 | value: peft_pubmedqa 32 | command: 33 | - python3 34 | args: 35 | - convert_mistral_7b_nemo_to_hf.py 36 | - --input_name_or_path=$PATH_TO_MERGED_MODEL 37 | - --output_path=$TMP_OUTPUT_PATH 38 | - --hf_model_name=$MODEL_PATH 39 | - '2>&1 | tee $OUTPUT_LOG' 40 | 
-------------------------------------------------------------------------------- /examples/training/nemo-megatron/mistral-7b-v01-peft/peft_accuracy.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | resources: 3 | requests: 4 | cpu: "300m" 5 | memory: "256Mi" 6 | limits: 7 | cpu: "1000m" 8 | memory: "2048Mi" 9 | ebs: 10 | storage: 200Gi 11 | mount_path: /tmp 12 | inline_script: 13 | - |+ 14 | cat > /tmp/run_accuracy_metric_calculation.py <&1 | tee $OUTPUT_LOG' 55 | -------------------------------------------------------------------------------- /examples/training/neuronx-distributed/gpt_neox_20b/wikicorpus.yaml: -------------------------------------------------------------------------------- 1 | image: 'public.ecr.aws/neuron/pytorch-training-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04' 2 | backoff_limit: 2000 3 | ebs: 4 | storage: 200Gi 5 | mount_path: /tmp 6 | resources: 7 | requests: 8 | "aws.amazon.com/neuron": 1 9 | limits: 10 | "aws.amazon.com/neuron": 1 11 | tolerations: 12 | - key: "aws.amazon.com/neuron" 13 | operator: "Exists" 14 | effect: "NoSchedule" 15 | git: 16 | repo_url: "https://github.com/aws-neuron/neuronx-distributed.git" 17 | commit: a070deb86991affd589c48441bf819e6d4bb159b 18 | branch: main 19 | pre_script: 20 | - pip3 install --upgrade pip 21 | - pip3 install -r $GIT_CLONE_DIR/examples/training/tp_dp_gpt_neox_hf_pretrain/common/requirements.txt huggingface-hub==0.27.1 22 | - mkdir -p $DATA_ROOT 23 | - mkdir -p $LOG_ROOT 24 | process: 25 | env: 26 | - name: HOME 27 | value: /tmp 28 | - name: DATA_ROOT 29 | value: /fsx/home/{{ .Release.Name }} 30 | - name: LOG_ROOT 31 | value: /efs/home/{{ .Release.Name }}/logs 32 | command: 33 | - HOME=$DATA_ROOT python3 34 | args: 35 | - $GIT_CLONE_DIR/examples/training/tp_dp_gpt_neox_hf_pretrain/common/get_dataset.py 36 | - '2>&1 | tee $LOG_ROOT/dataset.log' 37 | -------------------------------------------------------------------------------- 
/examples/training/neuronx-distributed/gpt_neox_6.9b/wikicorpus.yaml: -------------------------------------------------------------------------------- 1 | image: 'public.ecr.aws/neuron/pytorch-training-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04' 2 | backoff_limit: 2000 3 | ebs: 4 | storage: 200Gi 5 | mount_path: /tmp 6 | resources: 7 | requests: 8 | "aws.amazon.com/neuron": 1 9 | limits: 10 | "aws.amazon.com/neuron": 1 11 | tolerations: 12 | - key: "aws.amazon.com/neuron" 13 | operator: "Exists" 14 | effect: "NoSchedule" 15 | git: 16 | repo_url: "https://github.com/aws-neuron/neuronx-distributed.git" 17 | commit: a070deb86991affd589c48441bf819e6d4bb159b 18 | branch: main 19 | pre_script: 20 | - pip3 install --upgrade pip 21 | - pip3 install -r $GIT_CLONE_DIR/examples/training/tp_dp_gpt_neox_hf_pretrain/common/requirements.txt huggingface-hub==0.27.1 22 | - mkdir -p $DATA_ROOT 23 | - mkdir -p $LOG_ROOT 24 | process: 25 | env: 26 | - name: HOME 27 | value: /tmp 28 | - name: DATA_ROOT 29 | value: /fsx/home/{{ .Release.Name }} 30 | - name: LOG_ROOT 31 | value: /efs/home/{{ .Release.Name }}/logs 32 | command: 33 | - HOME=$DATA_ROOT python3 34 | args: 35 | - $GIT_CLONE_DIR/examples/training/tp_dp_gpt_neox_hf_pretrain/common/get_dataset.py 36 | - '2>&1 | tee $LOG_ROOT/dataset.log' 37 | -------------------------------------------------------------------------------- /examples/training/raytrain/lightning-bert/fine-tune.yaml: -------------------------------------------------------------------------------- 1 | ray: 2 | version: '2.44.0' 3 | dashboard: 4 | host: '0.0.0.0' 5 | ports: 6 | - name: gcs-server 7 | port: 6379 8 | - name: client 9 | port: 10001 10 | - name: dashboard 11 | port: 8265 12 | resources: 13 | requests: 14 | cpu: 300m 15 | limits: 16 | cpu: 2 17 | runtime_env_yaml: 18 | runtimeEnvYAML: | 19 | pip: 20 | - pytorch-lightning==2.2.1 21 | - transformers==4.38.2 22 | - datasets==2.18.0 23 | - scikit-learn==1.4.0 24 | image: 25 | image_pull_policy: Always 
26 | resources:  # requests and limits both ask for exactly one nvidia.com/gpu
27 |   requests:
28 |     "nvidia.com/gpu": 1
29 |   limits:
30 |     "nvidia.com/gpu": 1
31 |   nnodes: 1  # NOTE(review): nesting of nnodes/node_type under resources is reconstructed from a collapsed listing; verify
32 |   node_type: 'g5.xlarge'
33 | tolerations:  # tolerate any NoSchedule taint keyed nvidia.com/gpu
34 |   - key: "nvidia.com/gpu"
35 |     operator: "Exists"
36 |     effect: "NoSchedule"
37 | pvc:
38 |   - name: pv-efs
39 |     mount_path: /efs
40 | git:  # NOTE(review): both a branch and a commit are given; which one the chart checks out is not visible here
41 |   repo_url: https://github.com/ray-project/kuberay.git
42 |   branch: master
43 |   commit: 0ea404b84e45b7b8822b071c7c02b2afb3bb3eae
44 | pre_script:  # creates $HOME/logs/$HOSTNAME and exports OUTPUT_LOG, which the tee in args writes to
45 |   - export LOGS_DIR=$HOME/logs/$HOSTNAME
46 |   - mkdir -p $LOGS_DIR
47 |   - export OUTPUT_LOG=$LOGS_DIR/fine-tune.log
48 | train:
49 |   env:
50 |     - name: HOME
51 |       value: "/efs/home/{{ .Release.Name }}"
52 |   command:  # presumably command and args are joined into one shell line (the last arg is a pipe to tee); confirm in the chart template
53 |     - python
54 |   args:
55 |     - ray-operator/config/samples/pytorch-text-classifier/fine-tune-pytorch-text-classifier.py
56 |     - '2>&1 | tee $OUTPUT_LOG'
57 |
--------------------------------------------------------------------------------
/kfp/components/src/helm-charts-component/container/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM public.ecr.aws/lts/ubuntu:22.04_stable
 2 | # Keep apt/debconf from prompting during the image build.
 3 | ENV DEBIAN_FRONTEND=noninteractive
 4 | ENV DEBCONF_NONINTERACTIVE_SEEN=true
 5 | # Refresh the package index in the same layer as the install so a cached index can never go stale.
 6 | RUN apt-get update && \
 7 |     apt-get -y install \
 8 |         software-properties-common \
 9 |         wget \
10 |         apt-transport-https gnupg2 \
11 |         git tar zip unzip
12 | # Import the Helm signing key and export it into apt's trusted keyring directory.
13 | RUN wget -qO - https://baltocdn.com/helm/signing.asc > ./helm-ubuntu-public-key.asc
14 | RUN gpg --no-default-keyring --keyring ./helm_keyring.gpg --import ./helm-ubuntu-public-key.asc
15 | RUN gpg --no-default-keyring --keyring ./helm_keyring.gpg --export > ./helm.gpg
16 | RUN mv ./helm.gpg /etc/apt/trusted.gpg.d/
17 | RUN rm ./helm-ubuntu-public-key.asc
18 | # Register the Helm apt repository, then update and install in a single layer.
19 | RUN add-apt-repository -y "deb [arch=amd64] https://baltocdn.com/helm/stable/debian/ all main"
20 | RUN apt-get update && \
21 |     apt-get -y install \
22 |         helm \
23 |         python3-minimal
24 | RUN apt-get -y install python3-pip 25 | RUN apt-get -y install awscli 26 | 27 | RUN pip3 install kubernetes 28 | RUN pip3 install boto3 29 | RUN pip3 install pyhelm 30 | RUN pip3 install PyYAML -------------------------------------------------------------------------------- /kfp/components/src/helm-charts-component/container/build_tools/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export IMAGE_NAME=eks/universal-client 4 | export IMAGE_TAG=1.0.0 5 | -------------------------------------------------------------------------------- /kfp/pipelines/src/helm-charts-pipeline/helm_charts_pipeline.py: -------------------------------------------------------------------------------- 1 | from kfp import dsl 2 | from kfp import compiler 3 | from kfp import components 4 | 5 | from typing import List, Dict 6 | 7 | helm_charts_component = components.load_component_from_file('kfp/components/packages/helm_charts_component.yaml') 8 | 9 | @dsl.pipeline 10 | def helm_charts_pipeline(chart_configs: List[Dict]) -> str: 11 | helm_charts_task = helm_charts_component(chart_configs=chart_configs) 12 | return helm_charts_task.output 13 | 14 | compiler.Compiler().compile(helm_charts_pipeline, package_path='kfp/pipelines/packages/helm_charts_pipeline.yaml') --------------------------------------------------------------------------------