├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── build-ecr-images.sh
├── charts
    ├── cluster-issuer
    │   ├── Chart.yaml
    │   ├── templates
    │   │   └── cluster_issuer.yaml
    │   └── values.yaml
    ├── dex
    │   ├── Chart.yaml
    │   ├── crds
    │   │   └── dex.yaml
    │   ├── templates
    │   │   ├── cluster_role.yaml
    │   │   ├── cluster_role_binding.yaml
    │   │   ├── config_map.yaml
    │   │   ├── deployment.yaml
    │   │   ├── secrets
    │   │   │   ├── static-oidc-client.yaml
    │   │   │   └── static-passwords.yaml
    │   │   ├── service.yaml
    │   │   ├── service_account.yaml
    │   │   └── virtual_service.yaml
    │   └── values.yaml
    ├── ebs-sc
    │   ├── Chart.yaml
    │   └── templates
    │   │   └── storage-class.yaml
    ├── istio-ingress
    │   ├── Chart.yaml
    │   ├── templates
    │   │   ├── certificate.yaml
    │   │   ├── cluster_roles.yaml
    │   │   ├── gateway.yaml
    │   │   └── virtual_service.yaml
    │   └── values.yaml
    ├── karpenter-components
    │   ├── Chart.yaml
    │   ├── templates
    │   │   ├── node-class.yaml
    │   │   └── node-pool.yaml
    │   └── values.yaml
    ├── machine-learning
    │   ├── data-prep
    │   │   ├── coco-data
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── coco-data.yaml
    │   │   │   └── values.yaml
    │   │   ├── data-process
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── data-process.yaml
    │   │   │   └── values.yaml
    │   │   ├── databricks-dolly-15k-data
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── data.yaml
    │   │   │   └── values.yaml
    │   │   ├── mpijob-data-process
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── dist-data-process.yaml
    │   │   │   └── values.yaml
    │   │   ├── ray-data-process
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── ray-data-process.yaml
    │   │   │   └── values.yaml
    │   │   └── redpajama-data
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │       └── redpajama-data.yaml
    │   │   │   └── values.yaml
    │   ├── model-prep
    │   │   ├── hf-snapshot
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── hf-snapshot.yaml
    │   │   │   └── values.yaml
    │   │   ├── rayserve-tnx-autocausalengine
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── engine.yaml
    │   │   │   └── values.yaml
    │   │   └── rayserve-vllm-asyncllmengine
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │       └── engine.yaml
    │   │   │   └── values.yaml
    │   ├── serving
    │   │   ├── djl-lmi-server
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── server.yaml
    │   │   │   └── values.yaml
    │   │   ├── generic-server
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── server.yaml
    │   │   │   └── values.yaml
    │   │   ├── rayserve
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── rayservice.yaml
    │   │   │   └── values.yaml
    │   │   ├── triton-inference-server-lws
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── server.yaml
    │   │   │   └── values.yaml
    │   │   └── triton-inference-server
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │       └── server.yaml
    │   │   │   └── values.yaml
    │   ├── testing
    │   │   ├── maskrcnn-jupyter
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │   │   └── jupyter.yaml
    │   │   │   └── values.yaml
    │   │   └── maskrcnn-optimized-jupyter
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │       └── jupyter.yaml
    │   │   │   └── values.yaml
    │   └── training
    │   │   ├── maskrcnn-optimized
    │   │       ├── Chart.yaml
    │   │       ├── templates
    │   │       │   └── maskrcnn.yaml
    │   │       └── values.yaml
    │   │   ├── maskrcnn
    │   │       ├── Chart.yaml
    │   │       ├── templates
    │   │       │   └── maskrcnn.yaml
    │   │       └── values.yaml
    │   │   ├── mpijob-horovod-tensorflow-gpu
    │   │       ├── Chart.yaml
    │   │       ├── templates
    │   │       │   └── train.yaml
    │   │       └── values.yaml
    │   │   ├── pytorchjob-distributed
    │   │       ├── Chart.yaml
    │   │       ├── templates
    │   │       │   └── train.yaml
    │   │       └── values.yaml
    │   │   ├── pytorchjob-elastic
    │   │       ├── Chart.yaml
    │   │       ├── templates
    │   │       │   └── train.yaml
    │   │       └── values.yaml
    │   │   └── raytrain
    │   │       ├── Chart.yaml
    │   │       ├── templates
    │   │           └── train.yaml
    │   │       └── values.yaml
    ├── ml-platform
    │   ├── kubeflow-admission-webhook
    │   │   ├── Chart.yaml
    │   │   ├── crds
    │   │   │   └── poddefaults.yaml
    │   │   ├── templates
    │   │   │   ├── authorization_policy.yaml
    │   │   │   ├── certificate.yaml
    │   │   │   ├── cluster_role_binding.yaml
    │   │   │   ├── cluster_roles.yaml
    │   │   │   ├── deplyment.yaml
    │   │   │   ├── issuer.yaml
    │   │   │   ├── mutating_webhook_configuration.yaml
    │   │   │   ├── service.yaml
    │   │   │   └── service_account.yaml
    │   │   └── values.yaml
    │   ├── kubeflow-central-dashboard
    │   │   ├── Chart.yaml
    │   │   ├── templates
    │   │   │   ├── authorization_policy.yaml
    │   │   │   ├── cluster_role.yaml
    │   │   │   ├── cluster_role_binding.yaml
    │   │   │   ├── config_map.yaml
    │   │   │   ├── deployment.yaml
    │   │   │   ├── role.yaml
    │   │   │   ├── role_binding.yaml
    │   │   │   ├── service.yaml
    │   │   │   ├── service_account.yaml
    │   │   │   └── virtual_service.yaml
    │   │   └── values.yaml
    │   ├── kubeflow-katib
    │   │   ├── Chart.yaml
    │   │   ├── crds
    │   │   │   ├── experiments.yaml
    │   │   │   ├── suggestions.yaml
    │   │   │   └── trials.yaml
    │   │   ├── templates
    │   │   │   ├── authorization_policy.yaml
    │   │   │   ├── certificate.yaml
    │   │   │   ├── cluster_role_bindings.yaml
    │   │   │   ├── cluster_roles.yaml
    │   │   │   ├── config_maps.yaml
    │   │   │   ├── deployments.yaml
    │   │   │   ├── issuer.yaml
    │   │   │   ├── mutating_web_hook.yaml
    │   │   │   ├── service_accounts.yaml
    │   │   │   ├── services.yaml
    │   │   │   ├── validating_web_hook.yaml
    │   │   │   └── virtual_service.yaml
    │   │   └── values.yaml
    │   ├── kubeflow-notebooks
    │   │   ├── Chart.yaml
    │   │   ├── crds
    │   │   │   └── notebooks.yaml
    │   │   ├── templates
    │   │   │   ├── controller
    │   │   │   │   ├── cluster_role_binding.yaml
    │   │   │   │   ├── cluster_roles.yaml
    │   │   │   │   ├── config_map.yaml
    │   │   │   │   ├── deployment.yaml
    │   │   │   │   ├── role.yaml
    │   │   │   │   ├── role_binding.yaml
    │   │   │   │   ├── service.yaml
    │   │   │   │   └── service_account.yaml
    │   │   │   └── webapp
    │   │   │   │   ├── authorization_policy.yaml
    │   │   │   │   ├── cluster_role_binding.yaml
    │   │   │   │   ├── cluster_roles.yaml
    │   │   │   │   ├── config-maps
    │   │   │   │       ├── config.yaml
    │   │   │   │       └── logos.yaml
    │   │   │   │   ├── deployment.yaml
    │   │   │   │   ├── destination_rule.yaml
    │   │   │   │   ├── role.yaml
    │   │   │   │   ├── role_binding.yaml
    │   │   │   │   ├── service.yaml
    │   │   │   │   ├── service_account.yaml
    │   │   │   │   └── virtual_service.yaml
    │   │   └── values.yaml
    │   ├── kubeflow-pipelines
    │   │   ├── Chart.yaml
    │   │   ├── crds
    │   │   │   ├── clusterworkflowtemplates.yaml
    │   │   │   ├── compositecontrollers.yaml
    │   │   │   ├── controllerrevision.yaml
    │   │   │   ├── cronworkflows.yaml
    │   │   │   ├── decoratorcontroller.yaml
    │   │   │   ├── scheduledworkflows.yaml
    │   │   │   ├── viewers.yaml
    │   │   │   ├── workfloweventbindings.yaml
    │   │   │   ├── workflows.yaml
    │   │   │   ├── workflowtaskresults.yaml
    │   │   │   ├── workflowtasksets.yaml
    │   │   │   └── workflowtemplates.yaml
    │   │   ├── templates
    │   │   │   ├── authorization_policies.yaml
    │   │   │   ├── certficate.yaml
    │   │   │   ├── cluster_role_bindings.yaml
    │   │   │   ├── cluster_roles.yaml
    │   │   │   ├── composite_controller.yaml
    │   │   │   ├── config_maps.yaml
    │   │   │   ├── deployments.yaml
    │   │   │   ├── destination_rules.yaml
    │   │   │   ├── issuer.yaml
    │   │   │   ├── mutating_webhook_configuration.yaml
    │   │   │   ├── priority_class.yaml
    │   │   │   ├── role_bindings.yaml
    │   │   │   ├── roles.yaml
    │   │   │   ├── secrets.yaml
    │   │   │   ├── service_accounts.yaml
    │   │   │   ├── services.yaml
    │   │   │   ├── stateful_set.yaml
    │   │   │   └── virtual_services.yaml
    │   │   └── values.yaml
    │   ├── kubeflow-profiles-and-kfam
    │   │   ├── Chart.yaml
    │   │   ├── crds
    │   │   │   └── profiles.yaml
    │   │   ├── templates
    │   │   │   ├── cluster_role_binding.yaml
    │   │   │   ├── config-maps
    │   │   │   │   ├── namespace_labels_data.yaml
    │   │   │   │   └── profiles_config.yaml
    │   │   │   ├── deployment.yaml
    │   │   │   ├── role.yaml
    │   │   │   ├── role_binding.yaml
    │   │   │   ├── service.yaml
    │   │   │   ├── service_account.yaml
    │   │   │   └── virtual_service.yaml
    │   │   └── values.yaml
    │   ├── kubeflow-roles
    │   │   ├── Chart.yaml
    │   │   └── templates
    │   │   │   └── cluster-roles.yaml
    │   ├── kubeflow-tensorboards
    │   │   ├── Chart.yaml
    │   │   ├── crds
    │   │   │   └── tensorboard.yaml
    │   │   ├── templates
    │   │   │   ├── controller
    │   │   │   │   ├── cluster_role_binding.yaml
    │   │   │   │   ├── cluster_roles.yaml
    │   │   │   │   ├── config_map.yaml
    │   │   │   │   ├── deployment.yaml
    │   │   │   │   ├── role.yaml
    │   │   │   │   ├── role_binding.yaml
    │   │   │   │   ├── service.yaml
    │   │   │   │   └── service_account.yaml
    │   │   │   └── webapp
    │   │   │   │   ├── authorization_policy.yaml
    │   │   │   │   ├── cluster_role_binding.yaml
    │   │   │   │   ├── cluster_roles.yaml
    │   │   │   │   ├── deployment.yaml
    │   │   │   │   ├── destination_rule.yaml
    │   │   │   │   ├── service.yaml
    │   │   │   │   ├── service_account.yaml
    │   │   │   │   └── virtual_service.yaml
    │   │   └── values.yaml
    │   ├── kubeflow-training-operator
    │   │   ├── Chart.yaml
    │   │   ├── crds
    │   │   │   ├── mxjobs.yaml
    │   │   │   ├── paddlejobs.yaml
    │   │   │   ├── pytorchjobs.yaml
    │   │   │   ├── tfjobs.yaml
    │   │   │   └── xgboostjobs.yaml
    │   │   ├── templates
    │   │   │   ├── cluster_role_binding.yaml
    │   │   │   ├── cluster_roles.yaml
    │   │   │   ├── deployment.yaml
    │   │   │   ├── role.yaml
    │   │   │   ├── service.yaml
    │   │   │   └── service_account.yaml
    │   │   └── values.yaml
    │   ├── kubeflow-user-profile-defaults
    │   │   ├── Chart.yaml
    │   │   ├── templates
    │   │   │   ├── pod_default.yaml
    │   │   │   ├── role_bindings.yaml
    │   │   │   └── roles.yaml
    │   │   └── values.yaml
    │   ├── kubeflow-user-profile
    │   │   ├── Chart.yaml
    │   │   ├── templates
    │   │   │   ├── config_map.yaml
    │   │   │   └── profile.yaml
    │   │   └── values.yaml
    │   └── kubeflow-volumes
    │   │   ├── Chart.yaml
    │   │   ├── templates
    │   │       ├── controller
    │   │       │   ├── admission_webhooks.yaml
    │   │       │   ├── certificate.yaml
    │   │       │   ├── cluster_role_bindings.yaml
    │   │       │   ├── cluster_roles.yaml
    │   │       │   ├── config_map.yaml
    │   │       │   ├── deployment.yaml
    │   │       │   ├── issuer.yaml
    │   │       │   ├── pvcviewers_crd.yaml
    │   │       │   ├── role.yaml
    │   │       │   ├── role_binding.yaml
    │   │       │   ├── service.yaml
    │   │       │   └── service_account.yaml
    │   │       └── webapp
    │   │       │   ├── authorization_policy.yaml
    │   │       │   ├── cluster_role_binding.yaml
    │   │       │   ├── cluster_roles.yaml
    │   │       │   ├── config_map.yaml
    │   │       │   ├── deployment.yaml
    │   │       │   ├── destination_rule.yaml
    │   │       │   ├── service.yaml
    │   │       │   ├── service_account.yaml
    │   │       │   └── virtual_service.yaml
    │   │   └── values.yaml
    ├── mpi-operator
    │   ├── Chart.yaml
    │   ├── crds
    │   │   └── mpijob.yaml
    │   ├── templates
    │   │   └── mpi-operator.yaml
    │   └── values.yaml
    ├── nvidia-device-plugin
    │   ├── Chart.yaml
    │   └── templates
    │   │   └── daemonset.yaml
    ├── oauth2-proxy-route
    │   ├── Chart.yaml
    │   ├── templates
    │   │   ├── authorization_policy.yaml
    │   │   └── virtual_service.yaml
    │   └── values.yaml
    ├── pv-efs
    │   ├── Chart.yaml
    │   └── templates
    │   │   ├── pv.yaml
    │   │   ├── pvc.yaml
    │   │   └── storage-class.yaml
    └── pv-fsx
    │   ├── Chart.yaml
    │   └── templates
    │       ├── pv.yaml
    │       ├── pvc.yaml
    │       └── storage-class.yaml
├── containers
    ├── aws-samples-maskrcnn
    │   ├── Dockerfile
    │   ├── build_tools
    │   │   ├── build_and_push.sh
    │   │   └── set_env.sh
    │   └── notebooks
    │   │   └── mask-rcnn-tensorflow-viz.ipynb
    ├── megatron-deepspeed
    │   ├── Dockerfile
    │   └── build_tools
    │   │   ├── build_and_push.sh
    │   │   └── set_env.sh
    ├── nemo-megatron
    │   ├── Dockerfile
    │   └── build_tools
    │   │   ├── build_and_push.sh
    │   │   └── set_env.sh
    ├── ray-pytorch-neuronx-vllm
    │   ├── Dockerfile
    │   ├── build_tools
    │   │   ├── build_and_push.sh
    │   │   └── set_env.sh
    │   └── patches
    │   │   ├── vllm-neuron-0.6.6.post1.patch
    │   │   ├── vllm-neuron-0.8.1.patch
    │   │   └── vllm_v0.5.0_neuron.patch
    ├── ray-pytorch-neuronx
    │   ├── Dockerfile
    │   └── build_tools
    │   │   ├── build_and_push.sh
    │   │   └── set_env.sh
    ├── ray-pytorch
    │   ├── Dockerfile
    │   └── build_tools
    │   │   ├── build_and_push.sh
    │   │   └── set_env.sh
    ├── tensorpack-maskrcnn
    │   ├── Dockerfile
    │   ├── build_tools
    │   │   ├── build_and_push.sh
    │   │   └── set_env.sh
    │   └── notebooks
    │   │   └── mask-rcnn-tensorpack-viz.ipynb
    ├── tritonserver-neuronx-djl-lmi
    │   ├── Dockerfile
    │   └── build_tools
    │   │   ├── build_and_push.sh
    │   │   └── set_env.sh
    ├── tritonserver-neuronx-vllm
    │   ├── Dockerfile
    │   ├── build_tools
    │   │   ├── build_and_push.sh
    │   │   └── set_env.sh
    │   └── patch
    │   │   ├── vllm-neuron-0.6.6.post1.patch
    │   │   └── vllm-neuron-0.8.1.patch
    ├── tritonserver-neuronx
    │   ├── Dockerfile
    │   └── build_tools
    │   │   ├── build_and_push.sh
    │   │   └── set_env.sh
    ├── tritonserver-ray-vllm
    │   ├── Dockerfile
    │   ├── build_tools
    │   │   ├── build_and_push.sh
    │   │   └── set_env.sh
    │   └── resources
    │   │   ├── kubessh
    │   │   └── server.py
    └── tritonserver-trtllm
    │   ├── Dockerfile
    │   ├── build_tools
    │       ├── build_and_push.sh
    │       └── set_env.sh
    │   └── resources
    │       ├── kubessh
    │       └── server.py
├── eks-cluster
    ├── legacy
    │   ├── README.md
    │   ├── apply-aws-auth-cm.sh
    │   ├── apply-nvidia-plugin.sh
    │   ├── aws-auth-cm.yaml
    │   ├── configure-eks-auth.sh
    │   ├── efs-sc.yaml
    │   ├── fsx-sc.yaml
    │   ├── install-eksctl.sh
    │   ├── prepare-data.sh
    │   ├── pv-kubeflow-efs-gp-bursting.yaml
    │   ├── pv-kubeflow-fsx.yaml
    │   ├── pvc-kubeflow-efs-gp-bursting.yaml
    │   ├── pvc-kubeflow-fsx.yaml
    │   ├── replicate-data.yaml
    │   ├── set-cluster.sh
    │   ├── tiller-rbac-config.yaml
    │   └── update-kubeconfig.sh
    ├── terraform
    │   └── aws-eks-cluster-and-nodegroup
    │   │   ├── istio
    │   │       ├── main.tf
    │   │       ├── variables.tf
    │   │       └── versions.tf
    │   │   ├── kubeflow
    │   │       ├── main.tf
    │   │       ├── variables.tf
    │   │       └── versions.tf
    │   │   ├── main.tf
    │   │   ├── mlflow
    │   │       ├── main.tf
    │   │       ├── outputs.tf
    │   │       ├── variables.tf
    │   │       └── versions.tf
    │   │   ├── outputs.tf
    │   │   ├── slurm
    │   │       ├── main.tf
    │   │       ├── variables.tf
    │   │       └── versions.tf
    │   │   ├── variables.tf
    │   │   └── versions.tf
    ├── tests
    │   ├── test-gpu-efa.yaml
    │   ├── test-gpu.yaml
    │   └── test-neuron.yaml
    ├── user-data.txt
    └── utils
    │   ├── attach-pvc-fsx.yaml
    │   ├── attach-pvc.yaml
    │   ├── install-kubectl-linux.sh
    │   ├── prepare-s3-bucket.sh
    │   ├── s3-backend.sh
    │   ├── stage-data-fsx.yaml
    │   └── stage-data.yaml
├── examples
    ├── agentic
    │   └── mcp-gateway-registry
    │   │   ├── README.md
    │   │   └── server.yaml
    ├── inference
    │   ├── README.md
    │   ├── djl-serving
    │   │   ├── tensorrt-llm
    │   │   │   ├── llama3-8b-instruct
    │   │   │   │   ├── README.md
    │   │   │   │   └── server.yaml
    │   │   │   └── mistral-7b-instruct-v0.2
    │   │   │   │   ├── README.md
    │   │   │   │   └── server.yaml
    │   │   └── transformers-neuronx
    │   │   │   ├── llama3-8b-instruct
    │   │   │       ├── README.md
    │   │   │       └── server.yaml
    │   │   │   └── mistral-7b-instruct-v0.2
    │   │   │       ├── README.md
    │   │   │       └── server.yaml
    │   ├── rayserve
    │   │   ├── facebook-bart-large-cnn
    │   │   │   ├── README.md
    │   │   │   └── rayservice.yaml
    │   │   ├── meta-llama3-8b-neuron
    │   │   │   ├── README.md
    │   │   │   ├── engine_config.yaml
    │   │   │   └── rayservice.yaml
    │   │   ├── meta-llama3-8b-vllm-neuron
    │   │   │   ├── README.md
    │   │   │   ├── engine_config.yaml
    │   │   │   └── rayservice.yaml
    │   │   ├── meta-llama3-8b-vllm
    │   │   │   ├── README.md
    │   │   │   ├── engine_config.yaml
    │   │   │   └── rayservice.yaml
    │   │   ├── meta-llama32-11b-vis-inst-vllm
    │   │   │   ├── README.md
    │   │   │   ├── engine_config.yaml
    │   │   │   └── rayservice.yaml
    │   │   ├── meta-llama33-70b-instruct-neuron
    │   │   │   ├── README.md
    │   │   │   ├── engine_config.yaml
    │   │   │   └── rayservice.yaml
    │   │   ├── meta-llama33-70b-instruct-vllm
    │   │   │   ├── README.md
    │   │   │   ├── engine_config.yaml
    │   │   │   └── rayservice.yaml
    │   │   └── mistral-8x22b-instruct-v01-vllm
    │   │   │   ├── README.md
    │   │   │   ├── engine_config.yaml
    │   │   │   └── rayservice.yaml
    │   └── triton-inference-server
    │   │   ├── python_backend
    │   │       ├── baai-bge-reranker-large-neuron
    │   │       │   ├── README.md
    │   │       │   └── triton_server.yaml
    │   │       ├── llama3-8b-instruct-lmi-neuron
    │   │       │   ├── README.md
    │   │       │   └── triton_server.yaml
    │   │       ├── llama3-8b-instruct-neuron
    │   │       │   ├── README.md
    │   │       │   └── triton_server.yaml
    │   │       ├── mistral-7b-instruct-v01-neuron
    │   │       │   ├── README.md
    │   │       │   └── triton_server.yaml
    │   │       └── xlm-roberta-base-neuron
    │   │       │   ├── README.md
    │   │       │   └── triton_server.yaml
    │   │   ├── ray_vllm_backend
    │   │       └── mistral-8x22b-instruct-v01
    │   │       │   ├── README.md
    │   │       │   └── triton_server.yaml
    │   │   ├── tensorrtllm_backend
    │   │       ├── llama2-7b
    │   │       │   ├── README.md
    │   │       │   ├── hf_to_trtllm.yaml
    │   │       │   ├── triton_model.yaml
    │   │       │   ├── triton_server.yaml
    │   │       │   └── trtllm_engine.yaml
    │   │       ├── llama3-8b-instruct
    │   │       │   ├── README.md
    │   │       │   ├── hf_to_trtllm.yaml
    │   │       │   ├── triton_model.yaml
    │   │       │   ├── triton_server.yaml
    │   │       │   └── trtllm_engine.yaml
    │   │       ├── mistral-7b-instruct-v01
    │   │       │   ├── README.md
    │   │       │   ├── hf_to_trtllm.yaml
    │   │       │   ├── triton_model.yaml
    │   │       │   ├── triton_server.yaml
    │   │       │   └── trtllm_engine.yaml
    │   │       ├── mistral-7b-instruct-v01_llama3-8b
    │   │       │   ├── README.md
    │   │       │   ├── llama3_8b_hf_to_trtllm.yaml
    │   │       │   ├── mistral_7b_hf_to_trtllm.yaml
    │   │       │   ├── triton_llama3_8b_model.yaml
    │   │       │   ├── triton_mistral_7b_model.yaml
    │   │       │   ├── triton_server.yaml
    │   │       │   ├── trtllm_llama3_8b_engine.yaml
    │   │       │   └── trtllm_mistral_7b_engine.yaml
    │   │       └── mistral-8x22b-instruct-v01
    │   │       │   ├── README.md
    │   │       │   ├── hf_to_trtllm.yaml
    │   │       │   ├── triton_model.yaml
    │   │       │   ├── triton_server.yaml
    │   │       │   └── trtllm_engine.yaml
    │   │   └── vllm_backend
    │   │       ├── deepseek-r1-distill-llama-8b-neuron
    │   │           ├── README.md
    │   │           └── triton_server.yaml
    │   │       ├── deepseek-r1-distill-llama-8b
    │   │           ├── README.md
    │   │           └── triton_server.yaml
    │   │       ├── llama3-8b-instruct-neuron
    │   │           ├── README.md
    │   │           └── triton_server.yaml
    │   │       ├── llama3-8b-instruct
    │   │           ├── README.md
    │   │           └── triton_server.yaml
    │   │       ├── mistral-7b-instruct-v02-neuron
    │   │           ├── README.md
    │   │           └── triton_server.yaml
    │   │       └── mistral-7b-instruct-v02
    │   │           ├── README.md
    │   │           └── triton_server.yaml
    ├── legacy
    │   ├── README.md
    │   ├── maskrcnn
    │   │   ├── README.md
    │   │   ├── train-maskrcnn-aws.yaml
    │   │   └── train-maskrcnn-tensorpack.yaml
    │   └── neuronx-nemo-megatron
    │   │   ├── llama2_13b
    │   │       ├── README.md
    │   │       ├── compile.yaml
    │   │       ├── preprocess.yaml
    │   │       └── pretrain.yaml
    │   │   ├── llama2_70b
    │   │       ├── README.md
    │   │       ├── compile.yaml
    │   │       ├── preprocess.yaml
    │   │       └── pretrain.yaml
    │   │   └── llama2_7b
    │   │       ├── README.md
    │   │       ├── compile.yaml
    │   │       ├── preprocess.yaml
    │   │       └── pretrain.yaml
    └── training
    │   ├── README.md
    │   ├── accelerate
    │       ├── bert-glue-mrpc
    │       │   ├── README.md
    │       │   ├── pipeline.ipynb
    │       │   └── pretrain.yaml
    │       └── llama2-ft-fsdp
    │       │   ├── 13b.yaml
    │       │   ├── 70b.yaml
    │       │   ├── 7b.yaml
    │       │   └── README.md
    │   ├── megatron-deepspeed
    │       └── gpt2_345m
    │       │   ├── README.md
    │       │   ├── pretrain-ddp-tp-pp-zero1.yaml
    │       │   ├── pretrain-ddp-zero1.yaml
    │       │   └── wikicorpus.yaml
    │   ├── nemo-megatron
    │       ├── llama2-7b-peft
    │       │   ├── README.md
    │       │   ├── hf_to_nemo.yaml
    │       │   ├── merge_peft.yaml
    │       │   ├── nemo_to_hf.yaml
    │       │   ├── peft.yaml
    │       │   ├── peft_accuracy.yaml
    │       │   ├── peft_eval.yaml
    │       │   └── preprocess.yaml
    │       ├── llama31-8b-peft-dolphin
    │       │   ├── README.md
    │       │   ├── hf_to_nemo.yaml
    │       │   ├── merge_peft.yaml
    │       │   ├── nemo_to_hf.yaml
    │       │   ├── peft.yaml
    │       │   ├── peft_eval.yaml
    │       │   └── preprocess.yaml
    │       ├── mistral-7b-v01-peft-dolphin
    │       │   ├── README.md
    │       │   ├── hf_to_nemo.yaml
    │       │   ├── merge_peft.yaml
    │       │   ├── nemo_to_hf.yaml
    │       │   ├── peft.yaml
    │       │   ├── peft_eval.yaml
    │       │   └── preprocess.yaml
    │       └── mistral-7b-v01-peft
    │       │   ├── README.md
    │       │   ├── hf_to_nemo.yaml
    │       │   ├── merge_peft.yaml
    │       │   ├── nemo_to_hf.yaml
    │       │   ├── peft.yaml
    │       │   ├── peft_accuracy.yaml
    │       │   ├── peft_eval.yaml
    │       │   └── preprocess.yaml
    │   ├── neuronx-distributed-training
    │       └── llama3_70b
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │   ├── neuronx-distributed
    │       ├── gpt_neox_20b
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       ├── gpt_neox_6.9b
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       ├── llama2_13b
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       ├── llama2_13b_ptl
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       ├── llama2_70b
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       ├── llama2_70b_ptl
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       ├── llama2_7b
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       ├── llama2_7b_ptl
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       ├── llama31_70b
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       ├── llama31_8b
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       ├── llama3_70b
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       ├── llama3_70b_ptl
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │       └── llama3_8b
    │       │   ├── README.md
    │       │   ├── compile.yaml
    │       │   ├── pretrain.yaml
    │       │   └── wikicorpus.yaml
    │   └── raytrain
    │       └── lightning-bert
    │           ├── README.md
    │           └── fine-tune.yaml
├── kfp
    ├── components
    │   ├── packages
    │   │   └── helm_charts_component.yaml
    │   └── src
    │   │   └── helm-charts-component
    │   │       ├── container
    │   │           ├── Dockerfile
    │   │           └── build_tools
    │   │           │   ├── build_and_push.sh
    │   │           │   └── set_env.sh
    │   │       └── helm_charts_component.py
    └── pipelines
    │   ├── packages
    │       └── helm_charts_pipeline.yaml
    │   └── src
    │       └── helm-charts-pipeline
    │           └── helm_charts_pipeline.py
└── tutorials
    └── maskrcnn-blog
        └── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | **/.terraform*
2 | **/terraform.tfstate*
3 | **/Dockerfile.*
4 | **/.ipynb_checkpoints/*
5 | **/.DS_Store
6 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | 


--------------------------------------------------------------------------------
/build-ecr-images.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # set region
 4 | region=
 5 | if [ "$#" -eq 1 ]; then
 6 |     region=$1
 7 | else
 8 |     echo "usage: $0 <aws-region>"
 9 |     exit 1
10 | fi
11 | 
12 | cd containers
13 | for dir in `ls -d *`
14 | do
15 | $dir/build_tools/build_and_push.sh $region
16 | done
17 | 


--------------------------------------------------------------------------------
/charts/cluster-issuer/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: cluster-issuer
3 | description: A Helm chart for Kubeflow self-signing certificate issuer
4 | type: application
5 | version: 1.0.0
6 | appVersion: "v1.8.0"
7 | 


--------------------------------------------------------------------------------
/charts/cluster-issuer/templates/cluster_issuer.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: cert-manager.io/v1  # Document 1 of 3: bootstrap issuer that signs the CA certificate defined in document 2.
 2 | kind: ClusterIssuer
 3 | metadata:
 4 |   labels:
 5 |     app.kubernetes.io/component: cert-manager
 6 |     app.kubernetes.io/name: cert-manager
 7 |   name: self-signed-issuer
 8 |   namespace: {{ .Values.cluster_issuer.namespace }}  # NOTE(review): ClusterIssuer is cluster-scoped in cert-manager, so this field is presumably ignored - confirm.
 9 | spec:
10 |   selfSigned: {}
11 | ---
12 | apiVersion: cert-manager.io/v1  # Document 2 of 3: CA certificate issued by self-signed-issuer.
13 | kind: Certificate
14 | metadata:
15 |   name: self-signed-ca
16 |   namespace: {{ .Values.cluster_issuer.namespace }}  # NOTE(review): a CA ClusterIssuer looks up its secret in cert-manager's cluster resource namespace; presumably this value must match it - confirm.
17 | spec:
18 |   isCA: true
19 |   commonName: self-signed-ca
20 |   secretName: ca-secret  # Secret that stores the issued CA certificate; referenced again by the issuer in document 3.
21 |   privateKey:
22 |     algorithm: RSA
23 |     encoding: PKCS1
24 |     size: 2048
25 |   issuerRef:
26 |     name: self-signed-issuer  # Refers to the ClusterIssuer declared in document 1.
27 |     kind: ClusterIssuer
28 |     group: cert-manager.io
29 | ---
30 | apiVersion: cert-manager.io/v1  # Document 3 of 3: CA-backed issuer whose name comes from chart values.
31 | kind: ClusterIssuer
32 | metadata:
33 |   labels:
34 |     app.kubernetes.io/component: cert-manager
35 |     app.kubernetes.io/name: cert-manager
36 |   name: {{ .Values.cluster_issuer.name }}
37 |   namespace: {{ .Values.cluster_issuer.namespace }}
38 | spec:
39 |   ca:
40 |     secretName: ca-secret  # Same secret name as the Certificate in document 2 writes to.


--------------------------------------------------------------------------------
/charts/cluster-issuer/values.yaml:
--------------------------------------------------------------------------------
1 | cluster_issuer:  # Values read by templates/cluster_issuer.yaml.
2 |   name: cluster-self-signing-issuer  # Name given to the CA-backed ClusterIssuer.
3 |   namespace: cert-manager  # Namespace templated into both ClusterIssuers and the self-signed-ca Certificate.
4 | 


--------------------------------------------------------------------------------
/charts/dex/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: "v2.36.0"
3 | description: A Helm chart for Dex
4 | name: dex
5 | type: application
6 | version: 1.0.0
7 | 


--------------------------------------------------------------------------------
/charts/dex/crds/dex.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1  # CustomResourceDefinition for the AuthCode kind in the dex.coreos.com API group.
 2 | kind: CustomResourceDefinition
 3 | metadata:
 4 |   name: authcodes.dex.coreos.com  # Follows the required <plural>.<group> form of the names/group below.
 5 | spec:
 6 |   group: dex.coreos.com
 7 |   names:
 8 |     kind: AuthCode
 9 |     listKind: AuthCodeList
10 |     plural: authcodes
11 |     singular: authcode
12 |   scope: Namespaced
13 |   versions:
14 |   - name: v1  # Only version declared; it is both served and used for storage.
15 |     schema:
16 |       openAPIV3Schema:
17 |         type: object
18 |         x-kubernetes-preserve-unknown-fields: true  # No per-field schema: arbitrary fields are accepted and kept.
19 |     served: true
20 |     storage: true
21 | 


--------------------------------------------------------------------------------
/charts/dex/templates/cluster_role.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRole
 3 | metadata:
 4 |   name: dex
 5 | rules:
 6 | - apiGroups:
 7 |   - dex.coreos.com
 8 |   resources:
 9 |   - '*'
10 |   verbs:
11 |   - '*'
12 | - apiGroups:
13 |   - apiextensions.k8s.io
14 |   resources:
15 |   - customresourcedefinitions
16 |   verbs:
17 |   - create
18 | 


--------------------------------------------------------------------------------
/charts/dex/templates/cluster_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   name: dex
 5 | roleRef:
 6 |   apiGroup: rbac.authorization.k8s.io
 7 |   kind: ClusterRole
 8 |   name: dex
 9 | subjects:
10 | - kind: ServiceAccount
11 |   name: dex
12 |   namespace:  {{ .Values.dex.namespace }}
13 | 


--------------------------------------------------------------------------------
/charts/dex/templates/config_map.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ConfigMap
 3 | metadata:
 4 |   name: dex
 5 |   namespace:  {{ .Values.dex.namespace }}
 6 | data:
 7 |   config.yaml: |
 8 |     issuer: https://istio-ingressgateway.{{ .Values.ingress.namespace }}.svc.cluster.local/dex
 9 |     storage:
10 |       type: kubernetes
11 |       config:
12 |         inCluster: true
13 |     web:
14 |       http: 0.0.0.0:5556
15 |     logger:
16 |       level: "debug"
17 |       format: text
18 |     oauth2:
19 |       skipApprovalScreen: true
20 |     enablePasswordDB: true
21 |     staticPasswords:
22 |     - email: {{ .Values.dex.user.email }}
23 |       hashFromEnv: DEX_USER_HASH
24 |       username: {{ .Values.dex.user.username }}
25 |       userID: {{ .Values.dex.user.userid }}
26 |     staticClients:
27 |     - idEnv: OIDC_CLIENT_ID
28 |       redirectURIs: ["https://istio-ingressgateway.{{ .Values.ingress.namespace }}.svc.cluster.local/oauth2/callback"]
29 |       name: 'Dex Login Application'
30 |       secretEnv: OIDC_CLIENT_SECRET


--------------------------------------------------------------------------------
/charts/dex/templates/deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   labels:
 5 |     app: dex
 6 |   name: dex
 7 |   namespace: {{ .Values.dex.namespace }}
 8 | spec:
 9 |   replicas: 1
10 |   selector:
11 |     matchLabels:
12 |         app: dex
13 |   template:
14 |     metadata:
15 |       labels:
16 |         app: dex
17 |     spec:
18 |       serviceAccountName: dex
19 |       containers:
20 |       - image: ghcr.io/dexidp/dex:v2.36.0
21 |         name: dex
22 |         command: ["dex", "serve", "/etc/dex/cfg/config.yaml"]
23 |         ports:
24 |         - name: http
25 |           containerPort: 5556
26 |         volumeMounts:
27 |         - name: config
28 |           mountPath: /etc/dex/cfg
29 |         envFrom:
30 |           - secretRef:
31 |               name: dex-oidc-client
32 |           - secretRef:
33 |               name: dex-passwords
34 |         env:
35 |           - name: KUBERNETES_POD_NAMESPACE
36 |             valueFrom:
37 |               fieldRef:
38 |                 fieldPath: metadata.namespace
39 |       volumes:
40 |       - name: config
41 |         configMap:
42 |           name: dex
43 |           items:
44 |           - key: config.yaml
45 |             path: config.yaml
46 | 


--------------------------------------------------------------------------------
/charts/dex/templates/secrets/static-oidc-client.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Secret
3 | metadata:
4 |   name: dex-oidc-client
5 |   namespace:  {{ .Values.dex.namespace }}
6 | type: Opaque
7 | stringData:  # values are quoted so numeric/boolean-looking or special-character credentials stay YAML strings
8 |   OIDC_CLIENT_ID: {{ .Values.dex.oidc.client_id | quote }}
9 |   OIDC_CLIENT_SECRET: {{ .Values.dex.oidc.client_secret | quote }}


--------------------------------------------------------------------------------
/charts/dex/templates/secrets/static-passwords.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Secret
3 | metadata:
4 |   name: dex-passwords
5 |   namespace: {{ .Values.dex.namespace }}
6 | stringData:  # value is quoted so the hash is always rendered as a YAML string
7 |   DEX_USER_HASH: {{ .Values.dex.user.bcrypt_hash | quote }}
8 | 


--------------------------------------------------------------------------------
/charts/dex/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: dex
 5 |   namespace: {{ .Values.dex.namespace }}
 6 | spec:
 7 |   ports:
 8 |   - name: dex
 9 |     port: 5556
10 |     protocol: TCP
11 |     targetPort: 5556
12 |   selector:
13 |     app: dex
14 |   type: ClusterIP
15 | 


--------------------------------------------------------------------------------
/charts/dex/templates/service_account.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 |   name: dex
5 |   namespace: {{ .Values.dex.namespace }}
6 | 


--------------------------------------------------------------------------------
/charts/dex/templates/virtual_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1beta1
 2 | kind: VirtualService
 3 | metadata:
 4 |   name: dex
 5 |   namespace: {{ .Values.dex.namespace }}
 6 | spec:
 7 |   gateways:
 8 |   - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}
 9 |   hosts:
10 |   - '*'
11 |   http:
12 |   - match:
13 |     - uri:
14 |         prefix: /dex/
15 |     route:
16 |     - destination:
17 |         host: dex.{{ .Values.dex.namespace }}.svc.cluster.local
18 |         port:
19 |           number: 5556
20 | 


--------------------------------------------------------------------------------
/charts/dex/values.yaml:
--------------------------------------------------------------------------------
 1 | dex:
 2 |   namespace:  # namespace applied to the chart's namespaced Dex resources and to the ClusterRoleBinding subject
 3 |   user:
 4 |     email:  # email of the single staticPasswords entry in the Dex config
 5 |     username:  # username of the staticPasswords entry
 6 |     userid:  # userID of the staticPasswords entry
 7 |     bcrypt_hash:  # stored in Secret dex-passwords as DEX_USER_HASH, which the Dex config reads via hashFromEnv
 8 |   oidc:
 9 |     client_id:  # stored in Secret dex-oidc-client as OIDC_CLIENT_ID (static client idEnv)
10 |     client_secret:  # stored in Secret dex-oidc-client as OIDC_CLIENT_SECRET (static client secretEnv)
11 | ingress:
12 |   namespace:  # used to build the istio-ingressgateway.<namespace>.svc.cluster.local issuer/redirect URLs and the gateway reference
13 |   gateway:  # name of the Gateway (in ingress.namespace) that the Dex VirtualService binds to
14 | 
15 | 


--------------------------------------------------------------------------------
/charts/ebs-sc/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "gp3"
3 | description: A Helm chart for EBS storage class
4 | name: ebs-sc
5 | version: 1.0.1
6 | 


--------------------------------------------------------------------------------
/charts/ebs-sc/templates/storage-class.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: storage.k8s.io/v1
 2 | kind: StorageClass
 3 | metadata:
 4 |   annotations:
 5 |     storageclass.kubernetes.io/is-default-class: "true"
 6 |   name: ebs-sc
 7 | parameters:
 8 |   fsType: ext4
 9 |   type: gp3
10 | provisioner: kubernetes.io/aws-ebs
11 | reclaimPolicy: Delete
12 | volumeBindingMode: Immediate
13 | ---
14 | apiVersion: storage.k8s.io/v1
15 | kind: StorageClass
16 | metadata:
17 |   annotations:
18 |     storageclass.kubernetes.io/is-default-class: "false"
19 |   name: ebs-sc-wait
20 | parameters:
21 |   fsType: ext4
22 |   type: gp3
23 | provisioner: kubernetes.io/aws-ebs
24 | reclaimPolicy: Delete
25 | volumeBindingMode: WaitForFirstConsumer


--------------------------------------------------------------------------------
/charts/istio-ingress/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: istio-ingress
3 | description: A Helm chart for istio ingress components
4 | type: application
5 | version: 1.0.0
6 | appVersion: "1.20.2"
7 | 


--------------------------------------------------------------------------------
/charts/istio-ingress/templates/certificate.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: cert-manager.io/v1
 2 | kind: Certificate
 3 | metadata:
 4 |   name: gateway-cert
 5 |   namespace: {{ .Values.ingress.namespace }}
 6 | spec:
 7 |   secretName: gateway-cert
 8 | 
 9 |   duration: 2160h # 90d
10 |   renewBefore: 360h # 15d
11 |   subject:
12 |     organizations:
13 |       - aws
14 | 
15 |   isCA: false
16 |   privateKey:
17 |     algorithm: RSA
18 |     encoding: PKCS1
19 |     size: 2048
20 |   usages:
21 |     - server auth
22 |   dnsNames:
23 |     - "istio-ingressgateway.{{ .Values.ingress.namespace }}.svc.cluster.local"
24 |   issuerRef:
25 |     name: {{ .Values.cluster_issuer.name }}
26 |     kind: ClusterIssuer
27 |     group: cert-manager.io


--------------------------------------------------------------------------------
/charts/istio-ingress/templates/cluster_roles.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | apiVersion: rbac.authorization.k8s.io/v1
 4 | kind: ClusterRole
 5 | metadata:
 6 |   name: istio-ingress-admin
 7 |   labels:
 8 |     rbac.authorization.kubeflow.org/aggregate-to-kubeflow-admin: "true"
 9 | aggregationRule:
10 |   clusterRoleSelectors:
11 |   - matchLabels:
12 |       rbac.authorization.kubeflow.org/aggregate-to-istio-ingress-admin: "true"
13 | rules: []
14 | 
15 | ---
16 | 
17 | apiVersion: rbac.authorization.k8s.io/v1
18 | kind: ClusterRole
19 | metadata:
20 |   name: istio-ingress-edit
21 |   labels:
22 |     rbac.authorization.kubeflow.org/aggregate-to-kubeflow-edit: "true"
23 |     rbac.authorization.kubeflow.org/aggregate-to-istio-ingress-admin: "true"
24 | rules:
25 | - apiGroups: 
26 |   - istio.io
27 |   - networking.istio.io
28 |   resources: ["*"]
29 |   verbs:
30 |   - get
31 |   - list
32 |   - watch
33 |   - create
34 |   - delete
35 |   - deletecollection
36 |   - patch
37 |   - update
38 | 
39 | ---
40 | 
41 | apiVersion: rbac.authorization.k8s.io/v1
42 | kind: ClusterRole
43 | metadata:
44 |   name: istio-ingress-view
45 |   labels:
46 |     rbac.authorization.kubeflow.org/aggregate-to-kubeflow-view: "true"
47 | rules:
48 | - apiGroups:
49 |   - istio.io
50 |   - networking.istio.io
51 |   resources: ["*"]
52 |   verbs:
53 |   - get
54 |   - list
55 |   - watch
56 | 


--------------------------------------------------------------------------------
/charts/istio-ingress/templates/gateway.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1beta1
 2 | kind: Gateway
 3 | metadata:
 4 |   name: {{ .Values.ingress.gateway }} 
 5 |   namespace: {{ .Values.ingress.namespace }} 
 6 | spec:
 7 |   selector:
 8 |     app: istio-ingressgateway
 9 |   servers:
10 |   - hosts:
11 |       - '*'
12 |     port:
13 |       name: https-8443
14 |       number: 8443
15 |       protocol: HTTPS
16 |     tls:
17 |       mode: SIMPLE
18 |       credentialName: gateway-cert
19 |   - hosts:
20 |       - '*'
21 |     port:
22 |       name: http-8080
23 |       number: 8080
24 |       protocol: HTTP


--------------------------------------------------------------------------------
/charts/istio-ingress/templates/virtual_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1beta1
 2 | kind: VirtualService
 3 | metadata:
 4 |   name: {{ .Values.ingress.gateway }}-health-check
 5 |   namespace: {{ .Values.ingress.namespace }} 
 6 | spec:
 7 |   hosts:
 8 |   - '*'
 9 |   gateways:
10 |   - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}
11 |   http:
12 |   - match:
13 |     - uri:
14 |         exact: {{ .Values.healthcheck.path }}
15 |       method:
16 |         exact: GET
17 |       port: {{ .Values.healthcheck.port }}
18 |     directResponse:
19 |       status: 200


--------------------------------------------------------------------------------
/charts/istio-ingress/values.yaml:
--------------------------------------------------------------------------------
1 | ingress:
2 |   namespace: ingress  # namespace of the Gateway, the gateway-cert Certificate and the health-check VirtualService
3 |   gateway: ingress-gateway  # Gateway name; the health-check VirtualService is named "<gateway>-health-check"
4 | healthcheck:
5 |   port: 8080  # port matched by the health-check route
6 |   path: /healthcheck  # exact URI matched for GET requests; answered with a direct 200 response
7 | cluster_issuer: 
8 |   name: cluster-self-signing-issuer  # ClusterIssuer referenced by the gateway-cert Certificate's issuerRef


--------------------------------------------------------------------------------
/charts/karpenter-components/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "v1beta1"
3 | description: A Helm chart for Karpenter components
4 | name: karpenter-components
5 | version: 1.0.4
6 | 


--------------------------------------------------------------------------------
/charts/karpenter-components/values.yaml:
--------------------------------------------------------------------------------
1 | namespace: "karpenter"
2 | role_name:
3 | cluster_id:
4 | consolidate_after: "600s"
5 | capacity_type: "on-demand"
6 | max_pods: 20
7 | 


--------------------------------------------------------------------------------
/charts/machine-learning/data-prep/coco-data/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: coco-data
3 | description: A Helm chart for downloading and extracting COCO data to a persistent volume
4 | type: application
5 | version: 1.0.0
6 | appVersion: "v2017"
7 | 


--------------------------------------------------------------------------------
/charts/machine-learning/data-prep/coco-data/values.yaml:
--------------------------------------------------------------------------------
1 | pvc:
2 |   name: pv-fsx
3 |   mount_path: /fsx
4 |   data_path: data/coco2017 # relative to mount_path


--------------------------------------------------------------------------------
/charts/machine-learning/data-prep/data-process/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: data-process
3 | description: A Helm chart for data processing
4 | type: application
5 | version: 1.0.0
6 | appVersion: "1.0.0"
7 | 


--------------------------------------------------------------------------------
/charts/machine-learning/data-prep/data-process/values.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 | pvc:
 3 |   - name: pv-fsx
 4 |     mount_path: /fsx
 5 |   - name: pv-efs
 6 |     mount_path: /efs
 7 | ebs: {}
 8 | git:
 9 |   repo_url:
10 |   branch:
11 |   commit:
12 | pre_script: []
13 | post_script: []
14 | process:
15 |   env: []
16 |   command: []
17 |   args: []
18 | resources:
19 |   requests: {}
20 |   limits: {}


--------------------------------------------------------------------------------
/charts/machine-learning/data-prep/databricks-dolly-15k-data/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: databricks-dolly-15k
3 | description: A Helm chart for downloading databricks/databricks-dolly-15k data
4 | type: application
5 | version: 1.0.0
6 | appVersion: "v1"
7 | 


--------------------------------------------------------------------------------
/charts/machine-learning/data-prep/databricks-dolly-15k-data/values.yaml:
--------------------------------------------------------------------------------
1 | pvc:
2 |   name: pv-fsx
3 |   mount_path: /fsx
4 |   data_path: data/databricks-dolly-15k # relative to mount_path


--------------------------------------------------------------------------------
/charts/machine-learning/data-prep/mpijob-data-process/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for distributed data processing using MPI Job
4 | name: mpijob-data-process
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/data-prep/ray-data-process/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "2.9.0"
3 | description: A Helm chart for running RayJob for data processing
4 | name: raytrain
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/data-prep/ray-data-process/values.yaml:
--------------------------------------------------------------------------------
 1 | ray:
 2 |   version: 2.9.0
 3 |   dashboard:
 4 |     host: '0.0.0.0'
 5 |   ports: []
 6 |   resources:
 7 |     requests: {}
 8 |     limits: {}
 9 |   runtime_env_yaml:
10 | image:
11 | image_pull_policy: IfNotPresent
12 | resources:
13 |   nnodes: 1
14 |   node_type:
15 |   requests: {}
16 |   limits: {}
17 | tolerations: []
18 | pvc:
19 |   - name: pv-fsx
20 |     mount_path: /fsx
21 |   - name: pv-efs
22 |     mount_path: /efs
23 | git:
24 |   repo_url:
25 |   branch:
26 |   commit:
27 | pre_script: []
28 | post_script: []
29 | process:
30 |   env: []
31 |   command: []
32 |   args: []
33 | 


--------------------------------------------------------------------------------
/charts/machine-learning/data-prep/redpajama-data/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: redpajama-data
3 | description: A Helm chart for downloading redpajama data
4 | type: application
5 | version: 1.0.0
6 | appVersion: "v2017"
7 | 


--------------------------------------------------------------------------------
/charts/machine-learning/data-prep/redpajama-data/values.yaml:
--------------------------------------------------------------------------------
1 | pvc:
2 |   name: pv-fsx
3 |   mount_path: /fsx
4 |   data_path: data/redpajama # relative to mount_path


--------------------------------------------------------------------------------
/charts/machine-learning/model-prep/hf-snapshot/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: hugging-face-snapshot
3 | description: A Helm chart for downloading Hugging Face snapshot
4 | type: application
5 | version: 1.0.0
6 | appVersion: "v1"
7 | 


--------------------------------------------------------------------------------
/charts/machine-learning/model-prep/hf-snapshot/values.yaml:
--------------------------------------------------------------------------------
 1 | image: public.ecr.aws/docker/library/python:slim-bullseye
 2 | ebs:
 3 |   storage: 400Gi
 4 |   mount_path: /tmp
 5 | resources:
 6 |   requests:
 7 |     cpu: "1000m"
 8 |     memory: "2048Mi"
 9 |   limits:
10 |     cpu: "1000m"
11 |     memory: "2048Mi"
12 | pvc:
13 |   - name: pv-fsx
14 |     mount_path: /fsx
15 |   - name: pv-efs
16 |     mount_path: /efs
17 | env: []
18 | snapshot: {}
19 | 


--------------------------------------------------------------------------------
/charts/machine-learning/model-prep/rayserve-tnx-autocausalengine/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: rayserve-tnx-engine
3 | description: A Helm chart for Ray Serve Transformers Neuronx Auto Causal LLM Engine
4 | type: application
5 | version: 1.0.0
6 | appVersion: "v1"
7 | 


--------------------------------------------------------------------------------
/charts/machine-learning/model-prep/rayserve-tnx-autocausalengine/values.yaml:
--------------------------------------------------------------------------------
 1 | image: public.ecr.aws/docker/library/python:slim-bullseye
 2 | resources:
 3 |   requests:
 4 |     cpu: "1000m"
 5 |     memory: "512Mi"
 6 |   limits:
 7 |     cpu: "1000m"
 8 |     memory: "1024Mi"
 9 | pvc:
10 |   - name: pv-fsx
11 |     mount_path: /fsx
12 |   - name: pv-efs
13 |     mount_path: /efs
14 | env: []
15 | engine_path: /fsx/rayserve/engines/tnx_autocausalengine.zip
16 | 


--------------------------------------------------------------------------------
/charts/machine-learning/model-prep/rayserve-vllm-asyncllmengine/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: rayserve-vllm-engine
3 | description: A Helm chart for Ray Serve vLLM Engine
4 | type: application
5 | version: 1.0.0
6 | appVersion: "v1"
7 | 


--------------------------------------------------------------------------------
/charts/machine-learning/model-prep/rayserve-vllm-asyncllmengine/values.yaml:
--------------------------------------------------------------------------------
 1 | image: public.ecr.aws/docker/library/python:slim-bullseye
 2 | resources:
 3 |   requests:
 4 |     cpu: "1000m"
 5 |     memory: "512Mi"
 6 |   limits:
 7 |     cpu: "1000m"
 8 |     memory: "1024Mi"
 9 | pvc:
10 |   - name: pv-fsx
11 |     mount_path: /fsx
12 |   - name: pv-efs
13 |     mount_path: /efs
14 | env: []
15 | engine_path: /fsx/rayserve/engines/vllm_asyncllmengine.zip
16 | 


--------------------------------------------------------------------------------
/charts/machine-learning/serving/djl-lmi-server/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for Deep Java Library Large Model Inference (LMI)
4 | name: djl-serving
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/serving/djl-lmi-server/values.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 |   name:
 3 |   pull_policy: IfNotPresent
 4 | resources:
 5 |   node_type: g5.48xlarge
 6 |   requests: {}
 7 |   limits: {}
 8 | tolerations: []
 9 | pvc:
10 |   - name: pv-fsx
11 |     mount_path: /fsx
12 |   - name: pv-efs
13 |     mount_path: /efs
14 | ebs: {}
15 | git:
16 |   repo_url:
17 |   branch:
18 |   commit:
19 | pre_script: []
20 | post_script: []
21 | server:
22 |   name: djl-lmi-server
23 |   args: []
24 |   command: []
25 |   ports:
26 |     http: 8000
27 |     grpc: 8001
28 |     metrics: 8002
29 |   readiness_probe:
30 |     period_secs: 5
31 |     failure_threshold: 3
32 |   startup_probe:
33 |     period_secs: 10
34 |     failure_threshold: 30
35 |   liveness_probe:
36 |     period_secs: 10
37 |     failure_threshold: 3
38 |   autoscaling: 
39 |     minReplicas: 1
40 |     maxReplicas: 1
41 |     metrics: []
42 |   
43 | 


--------------------------------------------------------------------------------
/charts/machine-learning/serving/generic-server/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for generic server
4 | name: generic-server 
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/serving/generic-server/values.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 |   name:
 3 |   pull_policy: IfNotPresent
 4 | resources:
 5 |   node_type: g5.48xlarge
 6 |   requests: {}
 7 |   limits: {}
 8 | tolerations: []
 9 | pvc:
10 |   - name: pv-fsx
11 |     mount_path: /fsx
12 |   - name: pv-efs
13 |     mount_path: /efs
14 | ebs: {}
15 | git:
16 |   repo_url:
17 |   branch:
18 |   commit:
19 | pre_script: []
20 | post_script: []
21 | server:
22 |   name: generic-server 
23 |   args: []
24 |   command: []
25 |   ports: []
26 |   readiness_probe:
27 |     period_secs: 5
28 |     failure_threshold: 3
29 |     path: /
30 |     port: 
31 |   startup_probe:
32 |     period_secs: 10
33 |     failure_threshold: 30
34 |     path: /
35 |     port: 
36 |   liveness_probe:
37 |     period_secs: 10
38 |     failure_threshold: 3
39 |     path: /
40 |     port: 
41 |   resources:
42 |     requests:
43 |       cpu: 1
44 |       memory: 2Gi
45 |     limits:
46 |       cpu: 4
47 |       memory: 8Gi
48 |   autoscaling:
49 |     minReplicas: 1
50 |     maxReplicas: 2
51 |     metrics:
52 |       - type: Pods
53 |         pods:
54 |           metric:
55 |             name: cpu
56 |           target:
57 |             type: AverageValue  # Pods metrics only support an AverageValue target; must agree with averageValue below
58 |             averageValue: 80
59 | 


--------------------------------------------------------------------------------
/charts/machine-learning/serving/rayserve/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "2.44.0"  # kept in step with the default ray.version in this chart's values.yaml
3 | description: A Helm chart for running RayService for serving
4 | name: rayserve
5 | version: 2.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/serving/rayserve/values.yaml:
--------------------------------------------------------------------------------
 1 | ray:
 2 |   version: '2.44.0'
 3 |   dashboard:
 4 |     host: '0.0.0.0'
 5 |   ports: []
 6 |   resources:
 7 |     requests: {}
 8 |     limits: {}
 9 |   tolerations: []
10 |   serve_config_v2:
11 |   service_unhealthy_threshold_secs: 900
12 |   deployment_unhealthy_threshold_secs: 300
13 |   env: 
14 |   restart_policy:
15 |     head: OnFailure
16 |     worker: OnFailure
17 | image:
18 | image_pull_policy: IfNotPresent
19 | resources:
20 |   min_replicas: 1
21 |   max_replicas: 1
22 |   node_type:
23 |   requests: {}
24 |   limits: {}
25 | tolerations: []
26 | pvc: []
27 | 


--------------------------------------------------------------------------------
/charts/machine-learning/serving/triton-inference-server-lws/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: 1.0.0
3 | description: Triton Inference Server with LeaderWorkerSet
4 | name: triton-inference-server-lws
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/serving/triton-inference-server-lws/values.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 |   name:
 3 |   pull_policy: IfNotPresent
 4 | lws:
 5 |   size: 2
 6 | resources:
 7 |   node_type: g5.48xlarge
 8 |   requests: {}
 9 |   limits: {}
10 | tolerations: []
11 | pvc:
12 |   - name: pv-fsx
13 |     mount_path: /fsx
14 |   - name: pv-efs
15 |     mount_path: /efs
16 | ebs: {}
17 | git:
18 |   repo_url:
19 |   branch:
20 |   commit:
21 | pre_script: []
22 | post_script: []
23 | server:
24 |   name: triton-inference-server 
25 |   args: []
26 |   command: []
27 |   ports: []
28 |   readiness_probe:
29 |     period_secs: 5
30 |     failure_threshold: 3
31 |   startup_probe:
32 |     period_secs: 10
33 |     failure_threshold: 30
34 |   liveness_probe:
35 |     period_secs: 10
36 |     failure_threshold: 3
37 |   autoscaling:
38 |     minReplicas: 1
39 |     maxReplicas: 2
40 |     metrics:
41 |       - type: Pods
42 |         pods:
43 |           metric:
44 |             name: avg_time_queue_us
45 |           target:
46 |             type: AverageValue
47 |             averageValue: 50
48 | 


--------------------------------------------------------------------------------
/charts/machine-learning/serving/triton-inference-server/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for Triton Inference Server
4 | name: triton-inference-server 
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/serving/triton-inference-server/values.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 |   name:
 3 |   pull_policy: IfNotPresent
 4 | resources:
 5 |   node_type: g5.48xlarge
 6 |   requests: {}
 7 |   limits: {}
 8 | tolerations: []
 9 | pvc:
10 |   - name: pv-fsx
11 |     mount_path: /fsx
12 |   - name: pv-efs
13 |     mount_path: /efs
14 | ebs: {}
15 | git:
16 |   repo_url:
17 |   branch:
18 |   commit:
19 | pre_script: []
20 | post_script: []
21 | server:
22 |   name: triton-inference-server 
23 |   args: []
24 |   command: []
25 |   ports: []
26 |   readiness_probe:
27 |     period_secs: 5
28 |     failure_threshold: 3
29 |   startup_probe:
30 |     period_secs: 10
31 |     failure_threshold: 30
32 |   liveness_probe:
33 |     period_secs: 10
34 |     failure_threshold: 3
35 |   autoscaling:
36 |     minReplicas: 1
37 |     maxReplicas: 2
38 |     metrics:
39 |       - type: Pods
40 |         pods:
41 |           metric:
42 |             name: avg_time_queue_us
43 |           target:
44 |             type: AverageValue
45 |             averageValue: 50
46 | 


--------------------------------------------------------------------------------
/charts/machine-learning/testing/maskrcnn-jupyter/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for Jupyter
4 | name: jupyter 
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/testing/maskrcnn-jupyter/values.yaml:
--------------------------------------------------------------------------------
 1 | global:
 2 |   name: maskrcnn-jupyter
 3 |   namespace: kubeflow
 4 |   shared_fs: fsx
 5 |   shared_pvc: pv-fsx # pv-efs
 6 |   source_cidr:  # Public IP source CIDR
 7 |   log_dir:  # relative path on shared file-system to directory containing 'train_log' folder
 8 | image:
 9 | image_pull_policy: Always
10 | jupyter:
11 |   name: jupyter 
12 |   port: 443 
13 |   target_port: 8888
14 |   gpu_instance_type: g5.xlarge
15 | tensorboard:
16 |   name: tensorboard
17 |   port: 6443
18 |   target_port: 6443
19 |   upstream_port: 6006
20 | nginx:
21 |   name: nginx
22 |   ssl_certificate: /etc/ssl/domain.crt
23 |   ssl_certificate_key: /etc/ssl/domain.key
24 |   user: tensorboard
25 |   htpasswd:      # MD5 password hash


--------------------------------------------------------------------------------
/charts/machine-learning/testing/maskrcnn-optimized-jupyter/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for Jupyter
4 | name: jupyter 
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/testing/maskrcnn-optimized-jupyter/values.yaml:
--------------------------------------------------------------------------------
 1 | global:
 2 |   name: maskrcnn-optimized-jupyter
 3 |   namespace: kubeflow
 4 |   shared_fs: fsx
 5 |   shared_pvc: pv-fsx # pv-efs
 6 |   source_cidr:  # Public IP source CIDR
 7 |   log_dir:  # relative path on shared file-system to directory containing 'train_log' folder
 8 | image:
 9 | image_pull_policy: Always
10 | jupyter:
11 |   name: jupyter 
12 |   port: 443 
13 |   target_port: 8888
14 |   gpu_instance_type: g5.xlarge
15 | tensorboard:
16 |   name: tensorboard
17 |   port: 6443
18 |   target_port: 6443
19 |   upstream_port: 6006
20 | nginx:
21 |   name: nginx
22 |   ssl_certificate: /etc/ssl/domain.crt
23 |   ssl_certificate_key: /etc/ssl/domain.key
24 |   user: tensorboard
25 |   htpasswd:      # MD5 password hash


--------------------------------------------------------------------------------
/charts/machine-learning/training/maskrcnn-optimized/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for running Mask RCNN (optimized) using kubeflow mpi-operator and mpi-job
4 | name: maskrcnn-optimized
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/training/maskrcnn-optimized/values.yaml:
--------------------------------------------------------------------------------
 1 | global:
 2 |   namespace: kubeflow
 3 |   shared_fs: fsx # or efs
 4 |   shared_pvc: pv-fsx # or pv-efs
 5 | maskrcnn:
 6 |   name: mask-rcnn-tensorflow
 7 |   gpus: 16
 8 |   gpu_nodes: 2
 9 |   gpus_per_node: 8
10 |   gpu_instance_type: p4d.24xlarge
11 |   image:
12 |   train_script: /mask-rcnn-tensorflow/MaskRCNN/train.py
13 |   batch_size_per_gpu: 4
14 |   data_fs: fsx # or efs
15 |   data_dir: data/coco2017 # or data
16 |   working_dir:  /mask-rcnn-tensorflow
17 |   images_per_epoch: 120000 
18 |   lr_epoch_schedule: "[(16, 0.1), (20, 0.01), (24, None)]"
19 |   base_lr: 0.0015625 # Learning rate for a total batch size of 1; adjusted automatically to the actual total batch size
20 |   eval_period_in_epochs: 1 
21 |   data_train: "[\"train2017\"]"
22 |   data_val: "(\"val2017\")" 
23 |   mode_fpn: 'True'
24 |   mode_mask: 'True'
25 |   backbone_norm: FreezeBN
26 |   backbone_weights: data/coco2017/pretrained-models/ImageNet-R50-AlignPadding.npz
27 |   image_pull_policy: Always 
28 |   extra_config: 'TRAIN.GRADIENT_CLIP=0.36'
29 |   nccl_socket_ifname: ^lo,docker0
30 |   if_exclude:  lo,docker0
31 |   tf_device_min_sys_mem_mb: 4096
32 |   tf_enable_auto_mixed_precision: 0
33 |   nccl_debug: INFO
34 |   horovod_autotune: 1
35 |   horovod_log_level: INFO
36 |   backoff_limit: 2000
37 | 


--------------------------------------------------------------------------------
/charts/machine-learning/training/maskrcnn/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for running Mask RCNN using kubeflow mpi-operator and mpi-job
4 | name: maskrcnn
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/training/maskrcnn/values.yaml:
--------------------------------------------------------------------------------
 1 | global:
 2 |   namespace: kubeflow
 3 |   shared_fs: fsx
 4 |   shared_pvc: pv-fsx # or pv-efs
 5 | maskrcnn:
 6 |   name: maskrcnn
 7 |   gpus: 16
 8 |   gpu_nodes: 2
 9 |   gpus_per_node: 8
10 |   gpu_instance_type: p4d.24xlarge
11 |   image:
12 |   train_script: /tensorpack/examples/FasterRCNN/train.py
13 |   data_fs: fsx # or efs
14 |   data_dir: data/coco2017
15 |   steps_per_epoch: 7500  # Must equal 120000 / gpus (120000 / 16 = 7500 with the default gpus: 16)
16 |   lr_schedule: "[240000,320000,360000]"
17 |   base_lr: 0.01 # Learning rate for a total batch size of 8; adjusted automatically to the actual total batch size
18 |   eval_period_in_epochs: 1
19 |   data_train: "[\"coco_train2017\"]"
20 |   data_val: "(\"coco_val2017\")"
21 |   mode_fpn: 'True'
22 |   mode_mask: 'True'
23 |   backbone_norm: FreezeBN
24 |   backbone_weights: data/coco2017/pretrained-models/ImageNet-R50-AlignPadding.npz
25 |   image_pull_policy: Always
26 |   nccl_socket_ifname: ^lo,docker0
27 |   if_exclude:  lo,docker0
28 |   tf_device_min_sys_mem_mb: 4096
29 |   tf_enable_auto_mixed_precision: 0
30 |   nccl_debug: INFO
31 |   horovod_autotune: 1
32 |   horovod_log_level: INFO
33 |   extra_config: 'TRAIN.CHECKPOINT_PERIOD=2' 
34 |   backoff_limit: 2000
35 | 


--------------------------------------------------------------------------------
/charts/machine-learning/training/mpijob-horovod-tensorflow-gpu/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for running MPIJob with Horovod and Tensorflow using GPUs
4 | name: mpijob-horovod-tensorflow-gpu
5 | version: 1.0.2
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/training/pytorchjob-distributed/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for running distributed PytorchJob using Master and Workers
4 | name: pytorchjob-distributed
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/training/pytorchjob-distributed/values.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 | backoff_limit: 2000
 3 | resources:
 4 |   requests: {}
 5 |   limits: {}
 6 |   nnodes: 2
 7 |   nproc_per_node:
 8 |   node_type: 
 9 | tolerations: []
10 | pvc:
11 |   - name: pv-fsx
12 |     mount_path: /fsx
13 |   - name: pv-efs
14 |     mount_path: /efs
15 | git:
16 |   repo_url:
17 |   branch:
18 |   commit:
19 | pre_script: []
20 | post_script: []
21 | train:
22 |   env: []
23 |   command: []
24 |   args: []
25 | 


--------------------------------------------------------------------------------
/charts/machine-learning/training/pytorchjob-elastic/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.0"
3 | description: A Helm chart for running elastic PytorchJob
4 | name: pytorchjob-elastic
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/training/pytorchjob-elastic/values.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 | backoff_limit: 2000
 3 | resources:
 4 |   nnodes: 1
 5 |   nproc_per_node: 
 6 |   node_type:
 7 |   requests: {}
 8 |   limits: {}
 9 | tolerations: []
10 | elastic_policy:
11 |   rdzv_backend: c10d
12 |   rdzv_port: 44000
13 |   min_replicas: 1
14 |   max_replicas: 1
15 | pvc:
16 |   - name: pv-fsx
17 |     mount_path: /fsx
18 |   - name: pv-efs
19 |     mount_path: /efs
20 | git:
21 |   repo_url:
22 |   branch:
23 |   commit:
24 | pre_script: []
25 | post_script: []
26 | train:
27 |   env: []
28 |   command: []
29 |   args: []
30 | 


--------------------------------------------------------------------------------
/charts/machine-learning/training/raytrain/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "2.22.0"
3 | description: A Helm chart for running RayJob for training
4 | name: raytrain
5 | version: 2.0.0
6 | 


--------------------------------------------------------------------------------
/charts/machine-learning/training/raytrain/values.yaml:
--------------------------------------------------------------------------------
 1 | ray:
 2 |   version: 2.22.0
 3 |   dashboard:
 4 |     host: '0.0.0.0'
 5 |   ports: []
 6 |   resources:
 7 |     requests: {}
 8 |     limits: {}
 9 |   runtime_env_yaml:
10 | image:
11 | image_pull_policy: IfNotPresent
12 | resources:
13 |   nnodes: 1
14 |   node_type:
15 |   requests: {}
16 |   limits: {}
17 | tolerations: []
18 | pvc:
19 |   - name: pv-fsx
20 |     mount_path: /fsx
21 |   - name: pv-efs
22 |     mount_path: /efs
23 | git:
24 |   repo_url:
25 |   branch:
26 |   commit:
27 | pre_script: []
28 | post_script: []
29 | train:
30 |   env: []
31 |   command: []
32 |   args: []
33 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-admission-webhook/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: v1.9.2
3 | description: A Helm chart for kubeflow admission webhook
4 | name: admission-webhook
5 | type: application
6 | version: 1.0.0
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-admission-webhook/templates/authorization_policy.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: security.istio.io/v1
 2 | kind: AuthorizationPolicy
 3 | metadata:
 4 |   labels:
 5 |     control-plane: profiles
 6 |   name: profiles-kfam
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | spec:
 9 |   action: ALLOW
10 |   rules:
11 |   - from:
12 |     - source:
13 |         principals:
14 |         - cluster.local/ns/{{ .Values.kubeflow.namespace }}/sa/centraldashboard
15 |   selector:
16 |     matchLabels:
17 |       control-plane: profiles
18 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-admission-webhook/templates/certificate.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: cert-manager.io/v1
 2 | kind: Certificate
 3 | metadata:
 4 |   labels:
 5 |     app: poddefaults
 6 |     app.kubernetes.io/component: poddefaults
 7 |     app.kubernetes.io/name: poddefaults
 8 |   name: admission-webhook-cert
 9 |   namespace: {{ .Values.kubeflow.namespace }}
10 | spec:
11 |   commonName: admission-webhook-service.{{ .Values.kubeflow.namespace }}.svc
12 |   dnsNames:
13 |   - admission-webhook-service.{{ .Values.kubeflow.namespace }}.svc
14 |   - admission-webhook-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local
15 |   isCA: true
16 |   issuerRef:
17 |     kind: Issuer
18 |     name: admission-webhook-selfsigned-issuer
19 |   secretName: webhook-certs
20 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-admission-webhook/templates/cluster_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   labels:
 5 |     app: poddefaults
 6 |     app.kubernetes.io/component: poddefaults
 7 |     app.kubernetes.io/name: poddefaults
 8 |   name: admission-webhook-cluster-role-binding
 9 | roleRef:
10 |   apiGroup: rbac.authorization.k8s.io
11 |   kind: ClusterRole
12 |   name: admission-webhook-cluster-role
13 | subjects:
14 | - kind: ServiceAccount
15 |   name: admission-webhook-service-account
16 |   namespace: {{ .Values.kubeflow.namespace }}
17 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-admission-webhook/templates/deplyment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   labels:
 5 |     app: poddefaults
 6 |     app.kubernetes.io/component: poddefaults
 7 |     app.kubernetes.io/name: poddefaults
 8 |   name: admission-webhook-deployment
 9 |   namespace: {{ .Values.kubeflow.namespace }}
10 | spec:
11 |   selector:
12 |     matchLabels:
13 |       app: poddefaults
14 |       app.kubernetes.io/component: poddefaults
15 |       app.kubernetes.io/name: poddefaults
16 |   template:
17 |     metadata:
18 |       annotations:
19 |         sidecar.istio.io/inject: 'false'
20 |       labels:
21 |         app: poddefaults
22 |         app.kubernetes.io/component: poddefaults
23 |         app.kubernetes.io/name: poddefaults
24 |     spec:
25 |       containers:
26 |       - args:
27 |         - --tlsCertFile=/etc/webhook/certs/tls.crt
28 |         - --tlsKeyFile=/etc/webhook/certs/tls.key
29 |         image: docker.io/kubeflownotebookswg/poddefaults-webhook:v1.9.2
30 |         name: admission-webhook
31 |         ports:
32 |         - containerPort: 4443
33 |           name: https-webhook
34 |         volumeMounts:
35 |         - mountPath: /etc/webhook/certs
36 |           name: webhook-cert
37 |           readOnly: true
38 |       serviceAccountName: admission-webhook-service-account
39 |       volumes:
40 |       - name: webhook-cert
41 |         secret:
42 |           secretName: webhook-certs
43 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-admission-webhook/templates/issuer.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: cert-manager.io/v1
 2 | kind: Issuer
 3 | metadata:
 4 |   labels:
 5 |     app: poddefaults
 6 |     app.kubernetes.io/component: poddefaults
 7 |     app.kubernetes.io/name: poddefaults
 8 |     kustomize.component: poddefaults
 9 |   name: admission-webhook-selfsigned-issuer
10 |   namespace: {{ .Values.kubeflow.namespace }}
11 | spec:
12 |   selfSigned: {}
13 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-admission-webhook/templates/mutating_webhook_configuration.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: admissionregistration.k8s.io/v1
 2 | kind: MutatingWebhookConfiguration
 3 | metadata:
 4 |   annotations:
 5 |     cert-manager.io/inject-ca-from: {{ .Values.kubeflow.namespace }}/admission-webhook-cert
 6 |   labels:
 7 |     app: poddefaults
 8 |     app.kubernetes.io/component: poddefaults
 9 |     app.kubernetes.io/name: poddefaults
10 |   name: admission-webhook-mutating-webhook-configuration
11 | webhooks:
12 | - admissionReviewVersions:
13 |   - v1beta1
14 |   - v1
15 |   clientConfig:
16 |     caBundle: ''
17 |     service:
18 |       name: admission-webhook-service
19 |       namespace: {{ .Values.kubeflow.namespace }}
20 |       path: /apply-poddefault
21 |   failurePolicy: Fail
22 |   name: admission-webhook-deployment.kubeflow.org
23 |   namespaceSelector:
24 |     matchLabels:
25 |       app.kubernetes.io/part-of: kubeflow-profile
26 |   rules:
27 |   - apiGroups:
28 |     - ''
29 |     apiVersions:
30 |     - v1
31 |     operations:
32 |     - CREATE
33 |     resources:
34 |     - pods
35 |   sideEffects: None
36 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-admission-webhook/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   labels:
 5 |     app: poddefaults
 6 |     app.kubernetes.io/component: poddefaults
 7 |     app.kubernetes.io/name: poddefaults
 8 |   name: admission-webhook-service
 9 |   namespace: {{ .Values.kubeflow.namespace }}
10 | spec:
11 |   ports:
12 |   - name: https-webhook
13 |     port: 443
14 |     targetPort: https-webhook
15 |   selector:
16 |     app: poddefaults
17 |     app.kubernetes.io/component: poddefaults
18 |     app.kubernetes.io/name: poddefaults
19 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-admission-webhook/templates/service_account.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ServiceAccount
 3 | metadata:
 4 |   labels:
 5 |     app: poddefaults
 6 |     app.kubernetes.io/component: poddefaults
 7 |     app.kubernetes.io/name: poddefaults
 8 |   name: admission-webhook-service-account
 9 |   namespace: {{ .Values.kubeflow.namespace }}
10 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-admission-webhook/values.yaml:
--------------------------------------------------------------------------------
1 | kubeflow:
2 |   namespace: kubeflow
3 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-central-dashboard/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: v1.9.2
3 | description: A Helm chart for Kubeflow central dashboard
4 | name: kubeflow-central-dashboard
5 | type: application
6 | version: 1.0.0
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-central-dashboard/templates/authorization_policy.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: security.istio.io/v1
 2 | kind: AuthorizationPolicy
 3 | metadata:
 4 |   labels:
 5 |     app: centraldashboard
 6 |     app.kubernetes.io/component: centraldashboard
 7 |     app.kubernetes.io/name: centraldashboard
 8 |   name: central-dashboard
 9 |   namespace: {{ .Values.kubeflow.namespace }}
10 | spec:
11 |   action: ALLOW
12 |   rules:
13 |   - from:
14 |     - source:
15 |         principals:
16 |         - cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }}
17 |   selector:
18 |     matchLabels:
19 |       app: centraldashboard
20 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-central-dashboard/templates/cluster_role.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRole
 3 | metadata:
 4 |   labels:
 5 |     app: centraldashboard
 6 |     app.kubernetes.io/component: centraldashboard
 7 |     app.kubernetes.io/name: centraldashboard
 8 |   name: centraldashboard
 9 | rules:
10 | - apiGroups:
11 |   - ''
12 |   resources:
13 |   - events
14 |   - namespaces
15 |   - nodes
16 |   verbs:
17 |   - get
18 |   - list
19 |   - watch
20 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-central-dashboard/templates/cluster_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   labels:
 5 |     app: centraldashboard
 6 |     app.kubernetes.io/component: centraldashboard
 7 |     app.kubernetes.io/name: centraldashboard
 8 |   name: centraldashboard
 9 | roleRef:
10 |   apiGroup: rbac.authorization.k8s.io
11 |   kind: ClusterRole
12 |   name: centraldashboard
13 | subjects:
14 | - kind: ServiceAccount
15 |   name: centraldashboard
16 |   namespace: {{ .Values.kubeflow.namespace }}
17 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-central-dashboard/templates/role.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: Role
 3 | metadata:
 4 |   labels:
 5 |     app: centraldashboard
 6 |     app.kubernetes.io/component: centraldashboard
 7 |     app.kubernetes.io/name: centraldashboard
 8 |   name: centraldashboard
 9 |   namespace: {{ .Values.kubeflow.namespace }}
10 | rules:
11 | - apiGroups:
12 |   - ''
13 |   - app.k8s.io
14 |   resources:
15 |   - applications
16 |   - pods
17 |   - pods/exec
18 |   - pods/log
19 |   verbs:
20 |   - get
21 |   - list
22 |   - watch
23 | - apiGroups:
24 |   - ''
25 |   resources:
26 |   - secrets
27 |   - configmaps
28 |   verbs:
29 |   - get
30 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-central-dashboard/templates/role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: RoleBinding
 3 | metadata:
 4 |   labels:
 5 |     app: centraldashboard
 6 |     app.kubernetes.io/component: centraldashboard
 7 |     app.kubernetes.io/name: centraldashboard
 8 |   name: centraldashboard
 9 |   namespace: {{ .Values.kubeflow.namespace }}
10 | roleRef:
11 |   apiGroup: rbac.authorization.k8s.io
12 |   kind: Role
13 |   name: centraldashboard
14 | subjects:
15 | - kind: ServiceAccount
16 |   name: centraldashboard
17 |   namespace: {{ .Values.kubeflow.namespace }}
18 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-central-dashboard/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   labels:
 5 |     app: centraldashboard
 6 |     app.kubernetes.io/component: centraldashboard
 7 |     app.kubernetes.io/name: centraldashboard
 8 |   name: centraldashboard
 9 |   namespace: {{ .Values.kubeflow.namespace }}
10 | spec:
11 |   ports:
12 |   - port: 80
13 |     protocol: TCP
14 |     targetPort: 8082
15 |   selector:
16 |     app: centraldashboard
17 |     app.kubernetes.io/component: centraldashboard
18 |     app.kubernetes.io/name: centraldashboard
19 |   sessionAffinity: None
20 |   type: ClusterIP
21 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-central-dashboard/templates/service_account.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ServiceAccount
 3 | metadata:
 4 |   labels:
 5 |     app: centraldashboard
 6 |     app.kubernetes.io/component: centraldashboard
 7 |     app.kubernetes.io/name: centraldashboard
 8 |   name: centraldashboard
 9 |   namespace: {{ .Values.kubeflow.namespace }}
10 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-central-dashboard/templates/virtual_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1beta1
 2 | kind: VirtualService
 3 | metadata:
 4 |   labels:
 5 |     app: centraldashboard
 6 |     app.kubernetes.io/component: centraldashboard
 7 |     app.kubernetes.io/name: centraldashboard
 8 |   name: centraldashboard
 9 |   namespace: {{ .Values.kubeflow.namespace }}
10 | spec:
11 |   gateways:
12 |   - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}
13 |   hosts:
14 |   - '*'
15 |   http:
16 |   - match:
17 |     - uri:
18 |         prefix: /
19 |     rewrite:
20 |       uri: /
21 |     route:
22 |     - destination:
23 |         host: centraldashboard.{{ .Values.kubeflow.namespace }}.svc.cluster.local
24 |         port:
25 |           number: 80
26 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-central-dashboard/values.yaml:
--------------------------------------------------------------------------------
1 | kubeflow:
2 |   namespace: kubeflow
3 | ingress:
4 |   namespace: ingress
5 |   gateway: ingress-gateway
6 |   sa: istio-ingressgateway
7 | user:
8 |   profile:
9 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: v0.17.0
3 | description: A Helm chart for Kubeflow Katib
4 | name: kubeflow-katib
5 | type: application
6 | version: 1.0.1
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/crds/experiments.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1
 2 | kind: CustomResourceDefinition
 3 | metadata:
 4 |   name: experiments.kubeflow.org
 5 | spec:
 6 |   group: kubeflow.org
 7 |   names:
 8 |     categories:
 9 |     - all
10 |     - kubeflow
11 |     - katib
12 |     kind: Experiment
13 |     plural: experiments
14 |     singular: experiment
15 |   scope: Namespaced
16 |   versions:
17 |   - additionalPrinterColumns:
18 |     - jsonPath: .status.conditions[-1:].type
19 |       name: Type
20 |       type: string
21 |     - jsonPath: .status.conditions[-1:].status
22 |       name: Status
23 |       type: string
24 |     - jsonPath: .metadata.creationTimestamp
25 |       name: Age
26 |       type: date
27 |     name: v1beta1
28 |     schema:
29 |       openAPIV3Schema:
30 |         type: object
31 |         x-kubernetes-preserve-unknown-fields: true
32 |     served: true
33 |     storage: true
34 |     subresources:
35 |       status: {}
36 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/crds/suggestions.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1
 2 | kind: CustomResourceDefinition
 3 | metadata:
 4 |   name: suggestions.kubeflow.org
 5 | spec:
 6 |   group: kubeflow.org
 7 |   names:
 8 |     categories:
 9 |     - all
10 |     - kubeflow
11 |     - katib
12 |     kind: Suggestion
13 |     plural: suggestions
14 |     singular: suggestion
15 |   scope: Namespaced
16 |   versions:
17 |   - additionalPrinterColumns:
18 |     - jsonPath: .status.conditions[-1:].type
19 |       name: Type
20 |       type: string
21 |     - jsonPath: .status.conditions[-1:].status
22 |       name: Status
23 |       type: string
24 |     - jsonPath: .spec.requests
25 |       name: Requested
26 |       type: string
27 |     - jsonPath: .status.suggestionCount
28 |       name: Assigned
29 |       type: string
30 |     - jsonPath: .metadata.creationTimestamp
31 |       name: Age
32 |       type: date
33 |     name: v1beta1
34 |     schema:
35 |       openAPIV3Schema:
36 |         type: object
37 |         x-kubernetes-preserve-unknown-fields: true
38 |     served: true
39 |     storage: true
40 |     subresources:
41 |       status: {}
42 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/crds/trials.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1
 2 | kind: CustomResourceDefinition
 3 | metadata:
 4 |   name: trials.kubeflow.org
 5 | spec:
 6 |   group: kubeflow.org
 7 |   names:
 8 |     categories:
 9 |     - all
10 |     - kubeflow
11 |     - katib
12 |     kind: Trial
13 |     plural: trials
14 |     singular: trial
15 |   scope: Namespaced
16 |   versions:
17 |   - additionalPrinterColumns:
18 |     - jsonPath: .status.conditions[-1:].type
19 |       name: Type
20 |       type: string
21 |     - jsonPath: .status.conditions[-1:].status
22 |       name: Status
23 |       type: string
24 |     - jsonPath: .metadata.creationTimestamp
25 |       name: Age
26 |       type: date
27 |     name: v1beta1
28 |     schema:
29 |       openAPIV3Schema:
30 |         type: object
31 |         x-kubernetes-preserve-unknown-fields: true
32 |     served: true
33 |     storage: true
34 |     subresources:
35 |       status: {}
36 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/templates/authorization_policy.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: security.istio.io/v1beta1
 2 | kind: AuthorizationPolicy
 3 | metadata:
 4 |   name: katib-ui
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | spec:
 7 |   action: ALLOW
 8 |   rules:
 9 |   - from:
10 |     - source:
11 |         principals:
12 |         - cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }}
13 |   selector:
14 |     matchLabels:
15 |       katib.kubeflow.org/component: ui
16 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/templates/certificate.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: cert-manager.io/v1
 2 | kind: Certificate
 3 | metadata:
 4 |   name: katib-webhook-cert
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | spec:
 7 |   commonName: katib-controller.{{ .Values.kubeflow.namespace }}.svc
 8 |   dnsNames:
 9 |   - katib-controller.{{ .Values.kubeflow.namespace }}.svc
10 |   - katib-controller.{{ .Values.kubeflow.namespace }}.svc.cluster.local
11 |   isCA: true
12 |   issuerRef:
13 |     kind: Issuer
14 |     name: katib-selfsigned-issuer
15 |   secretName: katib-webhook-cert
16 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/templates/cluster_role_bindings.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   name: katib-controller
 5 | roleRef:
 6 |   apiGroup: rbac.authorization.k8s.io
 7 |   kind: ClusterRole
 8 |   name: katib-controller
 9 | subjects:
10 | - kind: ServiceAccount
11 |   name: katib-controller
12 |   namespace: {{ .Values.kubeflow.namespace }}
13 | ---
14 | apiVersion: rbac.authorization.k8s.io/v1
15 | kind: ClusterRoleBinding
16 | metadata:
17 |   name: katib-ui
18 | roleRef:
19 |   apiGroup: rbac.authorization.k8s.io
20 |   kind: ClusterRole
21 |   name: katib-ui
22 | subjects:
23 | - kind: ServiceAccount
24 |   name: katib-ui
25 |   namespace: {{ .Values.kubeflow.namespace }}
26 | ---
27 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/templates/issuer.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: cert-manager.io/v1
2 | kind: Issuer
3 | metadata:
4 |   name: katib-selfsigned-issuer
5 |   namespace: {{ .Values.kubeflow.namespace }}
6 | spec:
7 |   selfSigned: {}
8 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/templates/mutating_web_hook.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: admissionregistration.k8s.io/v1
 2 | kind: MutatingWebhookConfiguration
 3 | metadata:
 4 |   annotations:
 5 |     cert-manager.io/inject-ca-from: {{ .Values.kubeflow.namespace }}/katib-webhook-cert
 6 |   name: katib.kubeflow.org
 7 | webhooks:
 8 | - admissionReviewVersions:
 9 |   - v1
10 |   clientConfig:
11 |     caBundle: Cg==
12 |     service:
13 |       name: katib-controller
14 |       namespace: {{ .Values.kubeflow.namespace }}
15 |       path: /mutate-experiment
16 |   failurePolicy: Ignore
17 |   name: defaulter.experiment.katib.kubeflow.org
18 |   rules:
19 |   - apiGroups:
20 |     - kubeflow.org
21 |     apiVersions:
22 |     - v1beta1
23 |     operations:
24 |     - CREATE
25 |     - UPDATE
26 |     resources:
27 |     - experiments
28 |   sideEffects: None
29 | - admissionReviewVersions:
30 |   - v1
31 |   clientConfig:
32 |     caBundle: Cg==
33 |     service:
34 |       name: katib-controller
35 |       namespace: {{ .Values.kubeflow.namespace }}
36 |       path: /mutate-pod
37 |   failurePolicy: Ignore
38 |   name: mutator.pod.katib.kubeflow.org
39 |   namespaceSelector:
40 |     matchLabels:
41 |       katib.kubeflow.org/metrics-collector-injection: enabled
42 |   rules:
43 |   - apiGroups:
44 |     - ''
45 |     apiVersions:
46 |     - v1
47 |     operations:
48 |     - CREATE
49 |     resources:
50 |     - pods
51 |   sideEffects: None
52 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/templates/service_accounts.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ServiceAccount
 3 | metadata:
 4 |   name: katib-controller
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | ---
 7 | apiVersion: v1
 8 | kind: ServiceAccount
 9 | metadata:
10 |   name: katib-ui
11 |   namespace: {{ .Values.kubeflow.namespace }}
12 | ---
13 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/templates/validating_web_hook.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: admissionregistration.k8s.io/v1
 2 | kind: ValidatingWebhookConfiguration  # Sends Experiment CREATE/UPDATE requests to katib-controller at /validate-experiment
 3 | metadata:
 4 |   annotations:
 5 |     cert-manager.io/inject-ca-from: {{ .Values.kubeflow.namespace }}/katib-webhook-cert  # cert-manager fills caBundle from this Certificate
 6 |   name: katib.kubeflow.org
 7 | webhooks:
 8 | - admissionReviewVersions:
 9 |   - v1
10 |   clientConfig:
11 |     caBundle: Cg==  # placeholder (base64 of a single newline); replaced by the CA injection requested above
12 |     service:
13 |       name: katib-controller
14 |       namespace: {{ .Values.kubeflow.namespace }}
15 |       path: /validate-experiment
16 |   failurePolicy: Ignore  # errors calling the webhook are ignored and the API request proceeds
17 |   name: validator.experiment.katib.kubeflow.org
18 |   rules:
19 |   - apiGroups:
20 |     - kubeflow.org
21 |     apiVersions:
22 |     - v1beta1
23 |     operations:
24 |     - CREATE
25 |     - UPDATE
26 |     resources:
27 |     - experiments
28 |   sideEffects: None  # declares that the webhook makes no out-of-band changes
29 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/templates/virtual_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1alpha3
 2 | kind: VirtualService  # Routes /katib/ requests arriving at the ingress gateway to the katib-ui Service
 3 | metadata:
 4 |   name: katib-ui
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | spec:
 7 |   gateways:
 8 |   - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}  # gateway reference in <namespace>/<name> form
 9 |   hosts:
10 |   - '*'  # applies to every host served by the gateway
11 |   http:
12 |   - match:
13 |     - uri:
14 |         prefix: /katib/
15 |     rewrite:
16 |       uri: /katib/  # same as the matched prefix, so the path is forwarded unchanged
17 |     route:
18 |     - destination:
19 |         host: katib-ui.{{ .Values.kubeflow.namespace }}.svc.cluster.local
20 |         port:
21 |           number: 80
22 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-katib/values.yaml:
--------------------------------------------------------------------------------
1 | kubeflow:
2 |   namespace: kubeflow  # namespace the chart's namespaced resources are rendered into
3 | ingress:
4 |   namespace: ingress  # namespace part of the gateway reference used by the VirtualService
5 |   gateway: ingress-gateway  # gateway name; templates render <ingress.namespace>/<ingress.gateway>
6 |   sa: istio-ingressgateway  # ingress ServiceAccount name; presumably used for Istio principals in templates not shown here


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2  # Helm 3 chart format
2 | appVersion: v1.9.2  # same tag as the jupyter-web-app image in the webapp Deployment
3 | description: A Helm chart for Kubeflow notebooks
4 | name: kubeflow-notebooks
5 | type: application
6 | version: 1.0.0  # version of the chart itself, tracked separately from appVersion
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/controller/cluster_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding  # Binds ClusterRole notebook-controller-role to the controller ServiceAccount, cluster-wide
 3 | metadata:
 4 |   labels:
 5 |     control-plane: notebook-controller
 6 |   name: notebook-controller-role-binding
 7 | roleRef:
 8 |   apiGroup: rbac.authorization.k8s.io
 9 |   kind: ClusterRole
10 |   name: notebook-controller-role
11 | subjects:
12 | - kind: ServiceAccount
13 |   name: notebook-controller-service-account
14 |   namespace: {{ .Values.kubeflow.namespace }}  # a ClusterRoleBinding must name the subject's namespace explicitly
15 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/controller/config_map.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ConfigMap  # Key/value settings named notebook-controller; presumably consumed by the controller Deployment (not shown here)
 3 | metadata:
 4 |   annotations: {}
 5 |   name: notebook-controller
 6 |   namespace: {{ .Values.kubeflow.namespace }}
 7 | data:
 8 |   CLUSTER_DOMAIN: cluster.local
 9 |   CULL_IDLE_TIME: '{{ .Values.cullingPolicy.cullIdleTime }}'  # quoted: ConfigMap data values must be strings
10 |   ENABLE_CULLING: '{{ .Values.cullingPolicy.enableCulling }}'  # quoted so the boolean renders as a string
11 |   IDLENESS_CHECK_PERIOD: '{{ .Values.cullingPolicy.idlenessCheckPeriod }}'  # quoted for the same reason
12 |   ISTIO_GATEWAY: {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}  # rendered as <namespace>/<gateway>
13 |   USE_ISTIO: 'true'


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/controller/role.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: Role  # Namespaced permissions on ConfigMaps and Events, named for leader election
 3 | metadata:
 4 |   name: notebook-controller-leader-election-role
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | rules:
 7 | - apiGroups:
 8 |   - ''  # empty string selects the core API group
 9 |   resources:
10 |   - configmaps
11 |   verbs:
12 |   - get
13 |   - list
14 |   - watch
15 |   - create
16 |   - update
17 |   - patch
18 |   - delete
19 | - apiGroups:
20 |   - ''
21 |   resources:
22 |   - configmaps/status
23 |   verbs:
24 |   - get
25 |   - update
26 |   - patch
27 | - apiGroups:
28 |   - ''
29 |   resources:
30 |   - events
31 |   verbs:
32 |   - create  # events can be created but not read or modified
33 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/controller/role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: RoleBinding  # Grants the leader-election Role to the notebook controller ServiceAccount
 3 | metadata:
 4 |   name: notebook-controller-leader-election-rolebinding
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | roleRef:
 7 |   apiGroup: rbac.authorization.k8s.io
 8 |   kind: Role
 9 |   name: notebook-controller-leader-election-role
10 | subjects:
11 | - kind: ServiceAccount
12 |   name: notebook-controller-service-account
13 |   namespace: {{ .Values.kubeflow.namespace }}
14 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/controller/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service  # Exposes pods labelled control-plane=notebook-controller on port 443
 3 | metadata:
 4 |   name: notebook-controller-service
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | spec:
 7 |   ports:
 8 |   - port: 443  # no targetPort given, so traffic is sent to the same port number on the pod
 9 |     name: https
10 |   selector:
11 |     control-plane: notebook-controller
12 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/controller/service_account.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount  # Identity referenced by the controller's RoleBinding and ClusterRoleBinding
3 | metadata:
4 |   name: notebook-controller-service-account
5 |   namespace: {{ .Values.kubeflow.namespace }}
6 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/webapp/authorization_policy.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: security.istio.io/v1
 2 | kind: AuthorizationPolicy  # ALLOW policy for pods labelled app=kubeflow-notebooks-webapp
 3 | metadata:
 4 |   name: kubeflow-notebooks-webapp
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | spec:
 7 |   action: ALLOW
 8 |   rules:
 9 |   - from:
10 |     - source:
11 |         principals:
12 |         - cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }}  # requires ingress.sa in values; an empty value leaves the principal ending in "/sa/"
13 |   selector:
14 |     matchLabels:
15 |       app: kubeflow-notebooks-webapp
16 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/webapp/cluster_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding  # Binds ClusterRole kubeflow-notebooks-webapp-cluster-role to the webapp ServiceAccount
 3 | metadata:
 4 |   name: kubeflow-notebooks-webapp-cluster-role-binding
 5 | roleRef:
 6 |   apiGroup: rbac.authorization.k8s.io
 7 |   kind: ClusterRole
 8 |   name: kubeflow-notebooks-webapp-cluster-role
 9 | subjects:
10 | - kind: ServiceAccount
11 |   name: kubeflow-notebooks-webapp-service-account  # the ServiceAccount the webapp Deployment runs as
12 |   namespace: {{ .Values.kubeflow.namespace }}
13 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/webapp/deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment  # Runs a single replica of the jupyter-web-app container
 3 | metadata:
 4 |   labels:
 5 |     app: kubeflow-notebooks-webapp
 6 |   name: kubeflow-notebooks-webapp-deployment
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | spec:
 9 |   replicas: 1
10 |   selector:
11 |     matchLabels:
12 |       app: kubeflow-notebooks-webapp
13 |   template:
14 |     metadata:
15 |       labels:
16 |         app: kubeflow-notebooks-webapp  # also matched by the webapp Service and AuthorizationPolicy selectors
17 |     spec:
18 |       containers:
19 |       - env:
20 |         - name: APP_PREFIX
21 |           value: /jupyter  # same prefix the webapp VirtualService matches on
22 |         - name: UI
23 |           value: default
24 |         - name: USERID_HEADER
25 |           value: x-auth-request-email  # presumably set by an auth proxy in front of the app; verify
26 |         - name: USERID_PREFIX
27 |           value: ''
28 |         - name: APP_SECURE_COOKIES
29 |           value: 'true'
30 |         image: docker.io/kubeflownotebookswg/jupyter-web-app:v1.9.2  # tag equals the chart's appVersion
31 |         name: kubeflow-notebooks-webapp
32 |         ports:
33 |         - containerPort: 5000  # the webapp Service forwards port 80 to this port
34 |         volumeMounts:
35 |         - mountPath: /etc/config
36 |           name: config-volume
37 |         - mountPath: /src/apps/default/static/assets/logos
38 |           name: logos-volume
39 |       serviceAccountName: kubeflow-notebooks-webapp-service-account
40 |       volumes:
41 |       - configMap:
42 |           name: kubeflow-notebooks-webapp-config
43 |         name: config-volume
44 |       - configMap:
45 |           name: kubeflow-notebooks-webapp-logos
46 |         name: logos-volume
47 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/webapp/destination_rule.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1alpha3
 2 | kind: DestinationRule  # Traffic policy for calls to the notebooks webapp Service
 3 | metadata:
 4 |   name: kubeflow-notebooks-webapp
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | spec:
 7 |   host: kubeflow-notebooks-webapp-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local
 8 |   trafficPolicy:
 9 |     tls:
10 |       mode: ISTIO_MUTUAL  # clients use mutual TLS with Istio-provisioned certificates
11 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/webapp/role.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: Role  # Namespaced permissions covering notebooks, PVCs, events and storage classes
 3 | metadata:
 4 |   name: kubeflow-notebooks-webapp-jupyter-notebook-role
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | rules:
 7 | - apiGroups:
 8 |   - authorization.k8s.io
 9 |   resources:
10 |   - subjectaccessreviews
11 |   verbs:
12 |   - create
13 | - apiGroups:
14 |   - kubeflow.org
15 |   resources:
16 |   - notebooks
17 |   - notebooks/finalizers
18 |   - poddefaults
19 |   verbs:
20 |   - get
21 |   - list
22 |   - create
23 |   - delete
24 |   - patch
25 |   - update
26 | - apiGroups:
27 |   - ''  # empty string selects the core API group
28 |   resources:
29 |   - persistentvolumeclaims
30 |   verbs:
31 |   - create
32 |   - delete
33 |   - get
34 |   - list
35 | - apiGroups:
36 |   - ''
37 |   resources:
38 |   - events
39 |   - nodes  # NOTE(review): nodes are cluster-scoped; a namespaced Role likely cannot grant access to them - confirm
40 |   verbs:
41 |   - list
42 | - apiGroups:
43 |   - storage.k8s.io
44 |   resources:
45 |   - storageclasses  # NOTE(review): also cluster-scoped; same caveat as nodes
46 |   verbs:
47 |   - get
48 |   - list
49 |   - watch
50 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/webapp/role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: RoleBinding  # Grants the jupyter-notebook Role to the jupyter-notebook ServiceAccount
 3 | metadata:
 4 |   labels: {}  # explicit empty map instead of a dangling key that renders as null
 5 |   name: kubeflow-notebooks-webapp-jupyter-notebook-role-binding
 6 |   namespace: {{ .Values.kubeflow.namespace }}
 7 | roleRef:
 8 |   apiGroup: rbac.authorization.k8s.io
 9 |   kind: Role
10 |   name: kubeflow-notebooks-webapp-jupyter-notebook-role
11 | subjects:
12 | - kind: ServiceAccount
13 |   name: jupyter-notebook
14 |   namespace: {{ .Values.kubeflow.namespace }}  # explicit; equals the binding's own namespace, which was the implicit default
15 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/webapp/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service  # ClusterIP Service in front of the notebooks webapp pods
 3 | metadata:
 4 |   labels:
 5 |     app: kubeflow-notebooks-webapp
 6 |     run: kubeflow-notebooks-webapp
 7 |   name: kubeflow-notebooks-webapp-service
 8 |   namespace: {{ .Values.kubeflow.namespace }}
 9 | spec:
10 |   ports:
11 |   - name: http
12 |     port: 80  # port the VirtualService and DestinationRule address
13 |     protocol: TCP
14 |     targetPort: 5000  # containerPort of the webapp Deployment
15 |   selector:
16 |     app: kubeflow-notebooks-webapp
17 |   type: ClusterIP
18 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/webapp/service_account.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount  # Identity the webapp Deployment runs as (serviceAccountName)
3 | metadata:
4 |   name: kubeflow-notebooks-webapp-service-account
5 |   namespace: {{ .Values.kubeflow.namespace }}
6 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/templates/webapp/virtual_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1beta1
 2 | kind: VirtualService  # Routes /jupyter/ requests from the ingress gateway to the notebooks webapp Service
 3 | metadata:
 4 |   name: kubeflow-notebooks-webapp
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | spec:
 7 |   gateways:
 8 |   - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}  # gateway reference in <namespace>/<name> form
 9 |   hosts:
10 |   - '*'
11 |   http:
12 |   - headers:
13 |       request:
14 |         add:
15 |           x-forwarded-prefix: /jupyter  # passes on the prefix that the rewrite below removes
16 |     match:
17 |     - uri:
18 |         prefix: /jupyter/
19 |     rewrite:
20 |       uri: /  # the matched /jupyter/ prefix is replaced with /
21 |     route:
22 |     - destination:
23 |         host: kubeflow-notebooks-webapp-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local
24 |         port:
25 |           number: 80
26 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-notebooks/values.yaml:
--------------------------------------------------------------------------------
1 | kubeflow:
2 |   namespace: kubeflow
3 | ingress:
4 |   namespace: ingress
5 |   gateway: ingress-gateway
6 | cullingPolicy:
7 |   enableCulling: false
8 |   cullIdleTime: 30
9 |   idlenessCheckPeriod: 5


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2  # Helm 3 chart format
2 | appVersion: 2.0.1  # version of the packaged application, as declared by the chart
3 | description: A Helm chart for Kubeflow pipelines
4 | name: kubeflow-pipelines
5 | type: application
6 | version: 1.0.0  # version of the chart itself, tracked separately from appVersion
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/crds/clusterworkflowtemplates.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1
 2 | kind: CustomResourceDefinition  # Defines the cluster-scoped ClusterWorkflowTemplate kind in group argoproj.io
 3 | metadata:
 4 |   labels:
 5 |     application-crd-id: kubeflow-pipelines
 6 |   name: clusterworkflowtemplates.argoproj.io
 7 | spec:
 8 |   group: argoproj.io
 9 |   names:
10 |     kind: ClusterWorkflowTemplate
11 |     listKind: ClusterWorkflowTemplateList
12 |     plural: clusterworkflowtemplates
13 |     shortNames:
14 |     - clusterwftmpl
15 |     - cwft
16 |     singular: clusterworkflowtemplate
17 |   scope: Cluster
18 |   versions:
19 |   - name: v1alpha1
20 |     schema:
21 |       openAPIV3Schema:
22 |         properties:
23 |           apiVersion:
24 |             type: string
25 |           kind:
26 |             type: string
27 |           metadata:
28 |             type: object
29 |           spec:
30 |             type: object
31 |             x-kubernetes-map-type: atomic
32 |             x-kubernetes-preserve-unknown-fields: true  # spec is schemaless: unknown fields are kept rather than pruned
33 |         required:
34 |         - metadata
35 |         - spec
36 |         type: object
37 |     served: true
38 |     storage: true
39 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/crds/cronworkflows.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1
 2 | kind: CustomResourceDefinition  # Defines the namespaced CronWorkflow kind in group argoproj.io
 3 | metadata:
 4 |   labels:
 5 |     application-crd-id: kubeflow-pipelines
 6 |   name: cronworkflows.argoproj.io
 7 | spec:
 8 |   group: argoproj.io
 9 |   names:
10 |     kind: CronWorkflow
11 |     listKind: CronWorkflowList
12 |     plural: cronworkflows
13 |     shortNames:
14 |     - cwf
15 |     - cronwf
16 |     singular: cronworkflow
17 |   scope: Namespaced
18 |   versions:
19 |   - name: v1alpha1
20 |     schema:
21 |       openAPIV3Schema:
22 |         properties:
23 |           apiVersion:
24 |             type: string
25 |           kind:
26 |             type: string
27 |           metadata:
28 |             type: object
29 |           spec:
30 |             type: object
31 |             x-kubernetes-map-type: atomic
32 |             x-kubernetes-preserve-unknown-fields: true  # spec is schemaless: unknown fields are kept rather than pruned
33 |           status:
34 |             type: object
35 |             x-kubernetes-map-type: atomic
36 |             x-kubernetes-preserve-unknown-fields: true  # status is schemaless as well
37 |         required:
38 |         - metadata
39 |         - spec
40 |         type: object
41 |     served: true
42 |     storage: true
43 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/crds/scheduledworkflows.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1
 2 | kind: CustomResourceDefinition  # Defines the namespaced ScheduledWorkflow kind in group kubeflow.org
 3 | metadata:
 4 |   labels:
 5 |     app.kubernetes.io/component: ml-pipeline
 6 |     app.kubernetes.io/name: kubeflow-pipelines
 7 |     application-crd-id: kubeflow-pipelines
 8 |   name: scheduledworkflows.kubeflow.org
 9 | spec:
10 |   group: kubeflow.org
11 |   names:
12 |     kind: ScheduledWorkflow
13 |     listKind: ScheduledWorkflowList
14 |     plural: scheduledworkflows
15 |     shortNames:
16 |     - swf
17 |     singular: scheduledworkflow
18 |   scope: Namespaced
19 |   versions:
20 |   - name: v1beta1
21 |     schema:
22 |       openAPIV3Schema:
23 |         properties:
24 |           apiVersion:
25 |             type: string
26 |           kind:
27 |             type: string
28 |           metadata:
29 |             type: object
30 |           spec:
31 |             type: object
32 |             x-kubernetes-map-type: atomic
33 |             x-kubernetes-preserve-unknown-fields: true  # spec is schemaless: unknown fields are kept rather than pruned
34 |           status:
35 |             type: object
36 |             x-kubernetes-map-type: atomic
37 |             x-kubernetes-preserve-unknown-fields: true  # status is schemaless as well
38 |         required:
39 |         - spec
40 |         - status  # unlike the argoproj.io CRDs in this chart, status is required and metadata is not listed
41 |         type: object
42 |     served: true
43 |     storage: true
44 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/crds/viewers.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1
 2 | kind: CustomResourceDefinition  # Defines the namespaced Viewer kind in group kubeflow.org
 3 | metadata:
 4 |   labels:
 5 |     app.kubernetes.io/component: ml-pipeline
 6 |     app.kubernetes.io/name: kubeflow-pipelines
 7 |     application-crd-id: kubeflow-pipelines
 8 |   name: viewers.kubeflow.org
 9 | spec:
10 |   group: kubeflow.org
11 |   names:
12 |     kind: Viewer
13 |     listKind: ViewerList
14 |     plural: viewers
15 |     shortNames:
16 |     - vi
17 |     singular: viewer
18 |   scope: Namespaced
19 |   versions:
20 |   - name: v1beta1
21 |     schema:
22 |       openAPIV3Schema:
23 |         properties:
24 |           apiVersion:
25 |             type: string
26 |           kind:
27 |             type: string
28 |           metadata:
29 |             type: object
30 |           spec:
31 |             type: object
32 |             x-kubernetes-map-type: atomic
33 |             x-kubernetes-preserve-unknown-fields: true  # spec is schemaless: unknown fields are kept rather than pruned
34 |         required:
35 |         - spec
36 |         type: object
37 |     served: true
38 |     storage: true
39 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/crds/workfloweventbindings.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1
 2 | kind: CustomResourceDefinition  # Defines the namespaced WorkflowEventBinding kind in group argoproj.io
 3 | metadata:
 4 |   labels:
 5 |     application-crd-id: kubeflow-pipelines
 6 |   name: workfloweventbindings.argoproj.io
 7 | spec:
 8 |   group: argoproj.io
 9 |   names:
10 |     kind: WorkflowEventBinding
11 |     listKind: WorkflowEventBindingList
12 |     plural: workfloweventbindings
13 |     shortNames:
14 |     - wfeb
15 |     singular: workfloweventbinding
16 |   scope: Namespaced
17 |   versions:
18 |   - name: v1alpha1
19 |     schema:
20 |       openAPIV3Schema:
21 |         properties:
22 |           apiVersion:
23 |             type: string
24 |           kind:
25 |             type: string
26 |           metadata:
27 |             type: object
28 |           spec:
29 |             type: object
30 |             x-kubernetes-map-type: atomic
31 |             x-kubernetes-preserve-unknown-fields: true  # spec is schemaless: unknown fields are kept rather than pruned
32 |         required:
33 |         - metadata
34 |         - spec
35 |         type: object
36 |     served: true
37 |     storage: true
38 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/crds/workflows.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1
 2 | kind: CustomResourceDefinition  # Defines the namespaced Workflow kind in group argoproj.io
 3 | metadata:
 4 |   labels:
 5 |     application-crd-id: kubeflow-pipelines
 6 |   name: workflows.argoproj.io
 7 | spec:
 8 |   group: argoproj.io
 9 |   names:
10 |     kind: Workflow
11 |     listKind: WorkflowList
12 |     plural: workflows
13 |     shortNames:
14 |     - wf
15 |     singular: workflow
16 |   scope: Namespaced
17 |   versions:
18 |   - additionalPrinterColumns:  # extra columns shown in tabular listings of workflows
19 |     - description: Status of the workflow
20 |       jsonPath: .status.phase
21 |       name: Status
22 |       type: string
23 |     - description: When the workflow was started
24 |       format: date-time
25 |       jsonPath: .status.startedAt
26 |       name: Age
27 |       type: date
28 |     name: v1alpha1
29 |     schema:
30 |       openAPIV3Schema:
31 |         properties:
32 |           apiVersion:
33 |             type: string
34 |           kind:
35 |             type: string
36 |           metadata:
37 |             type: object
38 |           spec:
39 |             type: object
40 |             x-kubernetes-map-type: atomic
41 |             x-kubernetes-preserve-unknown-fields: true  # spec is schemaless: unknown fields are kept rather than pruned
42 |           status:
43 |             type: object
44 |             x-kubernetes-map-type: atomic
45 |             x-kubernetes-preserve-unknown-fields: true  # status is schemaless as well
46 |         required:
47 |         - metadata
48 |         - spec
49 |         type: object
50 |     served: true
51 |     storage: true
52 |     subresources: {}  # empty: neither the status nor the scale subresource is enabled
53 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/crds/workflowtasksets.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1
 2 | kind: CustomResourceDefinition  # Defines the namespaced WorkflowTaskSet kind in group argoproj.io
 3 | metadata:
 4 |   labels:
 5 |     application-crd-id: kubeflow-pipelines
 6 |   name: workflowtasksets.argoproj.io
 7 | spec:
 8 |   group: argoproj.io
 9 |   names:
10 |     kind: WorkflowTaskSet
11 |     listKind: WorkflowTaskSetList
12 |     plural: workflowtasksets
13 |     shortNames:
14 |     - wfts
15 |     singular: workflowtaskset
16 |   scope: Namespaced
17 |   versions:
18 |   - name: v1alpha1
19 |     schema:
20 |       openAPIV3Schema:
21 |         properties:
22 |           apiVersion:
23 |             type: string
24 |           kind:
25 |             type: string
26 |           metadata:
27 |             type: object
28 |           spec:
29 |             type: object
30 |             x-kubernetes-map-type: atomic
31 |             x-kubernetes-preserve-unknown-fields: true  # spec is schemaless: unknown fields are kept rather than pruned
32 |           status:
33 |             type: object
34 |             x-kubernetes-map-type: atomic
35 |             x-kubernetes-preserve-unknown-fields: true  # status is schemaless as well
36 |         required:
37 |         - metadata
38 |         - spec
39 |         type: object
40 |     served: true
41 |     storage: true
42 |     subresources:
43 |       status: {}  # enables the /status subresource for this kind
44 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/crds/workflowtemplates.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apiextensions.k8s.io/v1
 2 | kind: CustomResourceDefinition  # Defines the namespaced WorkflowTemplate kind in group argoproj.io
 3 | metadata:
 4 |   labels:
 5 |     application-crd-id: kubeflow-pipelines
 6 |   name: workflowtemplates.argoproj.io
 7 | spec:
 8 |   group: argoproj.io
 9 |   names:
10 |     kind: WorkflowTemplate
11 |     listKind: WorkflowTemplateList
12 |     plural: workflowtemplates
13 |     shortNames:
14 |     - wftmpl
15 |     singular: workflowtemplate
16 |   scope: Namespaced
17 |   versions:
18 |   - name: v1alpha1
19 |     schema:
20 |       openAPIV3Schema:
21 |         properties:
22 |           apiVersion:
23 |             type: string
24 |           kind:
25 |             type: string
26 |           metadata:
27 |             type: object
28 |           spec:
29 |             type: object
30 |             x-kubernetes-map-type: atomic
31 |             x-kubernetes-preserve-unknown-fields: true  # spec is schemaless: unknown fields are kept rather than pruned
32 |         required:
33 |         - metadata
34 |         - spec
35 |         type: object
36 |     served: true
37 |     storage: true
38 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/templates/certficate.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: cert-manager.io/v1
 2 | kind: Certificate  # Certificate for the cache-server Service DNS names, issued by the self-signed Issuer
 3 | metadata:
 4 |   labels:
 5 |     app: cache-server-cert-manager
 6 |   name: kfp-cache-cert  # referenced by the cache webhook's cert-manager.io/inject-ca-from annotation
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | spec:
 9 |   commonName: kfp-cache-cert
10 |   dnsNames:
11 |   - cache-server
12 |   - cache-server.{{ .Values.kubeflow.namespace }}
13 |   - cache-server.{{ .Values.kubeflow.namespace }}.svc  # in-cluster DNS name of the cache-server Service
14 |   isCA: true
15 |   issuerRef:
16 |     kind: Issuer
17 |     name: kfp-cache-selfsigned-issuer
18 |   secretName: webhook-server-cert  # Secret that cert-manager writes the issued key pair into
19 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/templates/composite_controller.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: metacontroller.k8s.io/v1alpha1
 2 | kind: CompositeController  # Metacontroller definition: Namespaces are parents, children are managed through the sync webhook
 3 | metadata:
 4 |   labels:
 5 |     app: kubeflow-pipelines-profile-controller
 6 |     app.kubernetes.io/component: ml-pipeline
 7 |     app.kubernetes.io/name: kubeflow-pipelines
 8 |     application-crd-id: kubeflow-pipelines
 9 |   name: kubeflow-pipelines-profile-controller
10 |   namespace: {{ .Values.kubeflow.namespace }}
11 | spec:
12 |   childResources:
13 |   - apiVersion: v1
14 |     resource: secrets
15 |     updateStrategy:
16 |       method: OnDelete  # existing children are left alone; changes apply only after a child is deleted and recreated
17 |   - apiVersion: v1
18 |     resource: configmaps
19 |     updateStrategy:
20 |       method: OnDelete
21 |   - apiVersion: apps/v1
22 |     resource: deployments
23 |     updateStrategy:
24 |       method: InPlace  # children that differ from the desired state are updated immediately
25 |   - apiVersion: v1
26 |     resource: services
27 |     updateStrategy:
28 |       method: InPlace
29 |   - apiVersion: networking.istio.io/v1alpha3
30 |     resource: destinationrules
31 |     updateStrategy:
32 |       method: InPlace
33 |   - apiVersion: security.istio.io/v1beta1
34 |     resource: authorizationpolicies
35 |     updateStrategy:
36 |       method: InPlace
37 |   generateSelector: true
38 |   hooks:
39 |     sync:
40 |       webhook:
41 |         url: http://kubeflow-pipelines-profile-controller/sync  # short Service name with no namespace; presumably resolved from Metacontroller's own namespace - verify
42 |   parentResource:
43 |     apiVersion: v1
44 |     resource: namespaces
45 |   resyncPeriodSeconds: 3600  # parents are re-synced at least once an hour
46 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/templates/issuer.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: cert-manager.io/v1
 2 | kind: Issuer  # Namespaced self-signed issuer referenced by the kfp-cache-cert Certificate
 3 | metadata:
 4 |   labels:
 5 |     app: cache-server-cert-manager
 6 |   name: kfp-cache-selfsigned-issuer
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | spec:
 9 |   selfSigned: {}  # certificates are signed with their own private key
10 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/templates/mutating_webhook_configuration.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: admissionregistration.k8s.io/v1
 2 | kind: MutatingWebhookConfiguration  # Sends labelled Pod CREATE requests to the cache-server Service at /mutate
 3 | metadata:
 4 |   annotations:
 5 |     cert-manager.io/inject-ca-from: {{ .Values.kubeflow.namespace }}/kfp-cache-cert  # cert-manager supplies clientConfig.caBundle, which is not set below
 6 |   labels:
 7 |     app: cache-server-cert-manager
 8 |   name: cache-webhook-kubeflow
 9 | webhooks:
10 | - admissionReviewVersions:
11 |   - v1beta1  # the only AdmissionReview version this webhook is declared to accept
12 |   clientConfig:
13 |     service:
14 |       name: cache-server
15 |       namespace: {{ .Values.kubeflow.namespace }}
16 |       path: /mutate
17 |   failurePolicy: Ignore  # errors calling the webhook are ignored and pod creation proceeds
18 |   name: cache-server.{{ .Values.kubeflow.namespace }}.svc
19 |   objectSelector:
20 |     matchLabels:
21 |       pipelines.kubeflow.org/cache_enabled: 'true'  # only pods carrying this label are sent to the webhook
22 |   rules:
23 |   - apiGroups:
24 |     - ''
25 |     apiVersions:
26 |     - v1
27 |     operations:
28 |     - CREATE
29 |     resources:
30 |     - pods
31 |   sideEffects: None
32 |   timeoutSeconds: 5  # a call that takes longer counts as a failure and falls under failurePolicy
33 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/templates/priority_class.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: scheduling.k8s.io/v1
2 | kind: PriorityClass  # Cluster-scoped priority class named workflow-controller
3 | metadata:
4 |   labels:
5 |     application-crd-id: kubeflow-pipelines
6 |   name: workflow-controller
7 | value: 1000000  # pods using this class get this priority; higher values are scheduled first
8 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/templates/secrets.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Secret  # MinIO credentials taken from .Values.minio
 3 | metadata:
 4 |   labels:
 5 |     application-crd-id: kubeflow-pipelines
 6 |   name: mlpipeline-minio-artifact
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | stringData:
 9 |   accesskey: {{ .Values.minio.access_key | quote }}  # quoted so numeric-looking or YAML-special values stay strings
10 |   secretkey: {{ .Values.minio.secret_key | quote }}  # quoted for the same reason
11 | ---
12 | apiVersion: v1
13 | kind: Secret  # MySQL credentials
14 | metadata:
15 |   labels:
16 |     app.kubernetes.io/component: ml-pipeline
17 |     app.kubernetes.io/name: kubeflow-pipelines
18 |     application-crd-id: kubeflow-pipelines
19 |   name: mysql-secret
20 |   namespace: {{ .Values.kubeflow.namespace }}
21 | stringData:
22 |   username: root
23 |   password: ""  # NOTE(review): empty root password is hard-coded; confirm this is intended outside of demos
24 | ---
25 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/templates/virtual_services.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: networking.istio.io/v1alpha3
 3 | kind: VirtualService  # Routes /ml_metadata requests from the ingress gateway to metadata-envoy-service:9090
 4 | metadata:
 5 |   labels:
 6 |     application-crd-id: kubeflow-pipelines
 7 |   name: metadata-grpc
 8 |   namespace: {{ .Values.kubeflow.namespace }}
 9 | spec:
10 |   gateways:
11 |   - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}  # gateway reference in <namespace>/<name> form
12 |   hosts:
13 |   - '*'
14 |   http:
15 |   - match:
16 |     - uri:
17 |         prefix: /ml_metadata
18 |     rewrite:
19 |       uri: /ml_metadata  # same as the matched prefix, so the path is forwarded unchanged
20 |     route:
21 |     - destination:
22 |         host: metadata-envoy-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local
23 |         port:
24 |           number: 9090
25 | ---
26 | apiVersion: networking.istio.io/v1alpha3
27 | kind: VirtualService  # Routes /pipeline requests from the ingress gateway to ml-pipeline-ui:80
28 | metadata:
29 |   labels:
30 |     app.kubernetes.io/component: ml-pipeline
31 |     app.kubernetes.io/name: kubeflow-pipelines
32 |     application-crd-id: kubeflow-pipelines
33 |   name: ml-pipeline-ui
34 |   namespace: {{ .Values.kubeflow.namespace }}
35 | spec:
36 |   gateways:
37 |   - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}
38 |   hosts:
39 |   - '*'
40 |   http:
41 |   - match:
42 |     - uri:
43 |         prefix: /pipeline
44 |     rewrite:
45 |       uri: /pipeline  # same as the matched prefix, so the path is forwarded unchanged
46 |     route:
47 |     - destination:
48 |         host: ml-pipeline-ui.{{ .Values.kubeflow.namespace }}.svc.cluster.local
49 |         port:
50 |           number: 80
51 |     timeout: 300s  # per-request timeout for this route
52 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-pipelines/values.yaml:
--------------------------------------------------------------------------------
1 | kubeflow:
2 |   namespace: kubeflow
3 | ingress:
4 |   namespace: ingress
5 |   gateway: ingress-gateway
6 |   sa: istio-ingressgateway  # ingress gateway service account; same default as the sibling Kubeflow charts
7 | minio:
8 |   access_key:  # no default; supply at install time
9 |   secret_key:  # no default; supply at install time


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-profiles-and-kfam/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: v1.9.2
3 | description: A Helm chart for Kubeflow Profiles and Access Management
4 | name: kubeflow-profiles-and-kfam
5 | type: application
6 | version: 1.0.0
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-profiles-and-kfam/templates/cluster_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   labels:
 5 |     control-plane: profiles
 6 |   name: profiles-cluster-rolebinding
 7 | roleRef:
 8 |   apiGroup: rbac.authorization.k8s.io
 9 |   kind: ClusterRole
10 |   name: cluster-admin  # NOTE(review): grants full cluster-admin to the subject below; confirm this breadth is required
11 | subjects:
12 | - kind: ServiceAccount
13 |   name: profiles-controller-service-account
14 |   namespace: {{ .Values.kubeflow.namespace }}
15 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-profiles-and-kfam/templates/config-maps/namespace_labels_data.yaml:
--------------------------------------------------------------------------------
 1 | # ConfigMap carrying the default namespace labels described in the embedded file below.
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: namespace-labels-data
 6 |   namespace: {{ .Values.kubeflow.namespace }}
 7 |   labels:
 8 |     control-plane: profiles
 9 | data:
10 |   namespace-labels.yaml: |
11 |     # Below is a list of labels to be set by default.
12 |     #
13 |     # To add a namespace label, use `key: 'value'`, for example:
14 |     # istio.io/rev: 'asm-191-1'
15 |     #
16 |     # To remove a namespace label, use `key: ''`. For example:
17 |     # istio-injection: ''
18 |     #
19 |     # Profile controller will not replace a namespace label if its key already
20 |     # exists. If you want to override the value of a previously applied label, you
21 |     # need to:
22 |     # 1. Remove the label by using `key: ''` and deploy.
23 |     # 2. Add the label by using `key: 'value'` and deploy.
24 |     #
25 |     katib.kubeflow.org/metrics-collector-injection: "enabled"
26 |     serving.kubeflow.org/inferenceservice: "enabled"
27 |     pipelines.kubeflow.org/enabled: "true"
28 |     app.kubernetes.io/part-of: "kubeflow-profile"
29 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-profiles-and-kfam/templates/config-maps/profiles_config.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | data:
 3 |   ADMIN: ''
 4 |   USERID_HEADER: x-auth-request-email
 5 |   USERID_PREFIX: ''
 6 |   WORKLOAD_IDENTITY: ''
 7 |   ISTIO_INGRESS_GATEWAY_PRINCIPAL: "cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }}"
 8 |   NOTEBOOK_CONTROLLER_PRINCIPAL: "cluster.local/ns/{{ .Values.kubeflow.namespace }}/sa/{{ .Values.notebook_controller.sa }}"
 9 |   KFP_UI_PRINCIPAL: "cluster.local/ns/{{ .Values.kubeflow.namespace }}/sa/{{ .Values.pipeline_ui.sa }}"
10 | kind: ConfigMap
11 | metadata:
12 |   labels:
13 |     control-plane: profiles
14 |   name: profiles-config
15 |   namespace: {{ .Values.kubeflow.namespace }}
16 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-profiles-and-kfam/templates/role.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: Role
 3 | metadata:
 4 |   labels:
 5 |     control-plane: profiles
 6 |   name: profiles-leader-election-role
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | rules:
 9 | - apiGroups:
10 |   - ''
11 |   resources:
12 |   - configmaps
13 |   verbs:
14 |   - get
15 |   - list
16 |   - watch
17 |   - create
18 |   - update
19 |   - patch
20 |   - delete
21 | - apiGroups:
22 |   - ''
23 |   resources:
24 |   - configmaps/status
25 |   verbs:
26 |   - get
27 |   - update
28 |   - patch
29 | - apiGroups:
30 |   - ''
31 |   resources:
32 |   - events
33 |   verbs:
34 |   - create
35 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-profiles-and-kfam/templates/role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: RoleBinding
 3 | metadata:
 4 |   labels:
 5 |     control-plane: profiles
 6 |   name: profiles-leader-election-rolebinding
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | roleRef:
 9 |   apiGroup: rbac.authorization.k8s.io
10 |   kind: Role
11 |   name: profiles-leader-election-role
12 | subjects:
13 | - kind: ServiceAccount
14 |   name: profiles-controller-service-account
15 |   namespace: {{ .Values.kubeflow.namespace }}
16 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-profiles-and-kfam/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   labels:
 5 |     control-plane: profiles
 6 |   name: profiles-kfam
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | spec:
 9 |   ports:
10 |   - port: 8081
11 |   selector:
12 |     control-plane: profiles
13 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-profiles-and-kfam/templates/service_account.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ServiceAccount
 3 | metadata:
 4 |   labels:
 5 |     control-plane: profiles
 6 |   {{- with .Values.profile_controller.role_arn }}
 7 |   # IAM role annotation; emitted only when profile_controller.role_arn is set.
 8 |   annotations:
 9 |     eks.amazonaws.com/role-arn: {{ . | quote }}
10 |   {{- end }}
11 |   name: profiles-controller-service-account
12 |   namespace: {{ .Values.kubeflow.namespace }}
13 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-profiles-and-kfam/templates/virtual_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1alpha3
 2 | kind: VirtualService
 3 | metadata:
 4 |   labels:
 5 |     control-plane: profiles
 6 |   name: profiles-kfam
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | spec:
 9 |   gateways:
10 |   - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}
11 |   hosts:
12 |   - '*'
13 |   http:
14 |   - headers:
15 |       request:
16 |         add:
17 |           x-forwarded-prefix: /kfam
18 |     match:
19 |     - uri:
20 |         prefix: /kfam/
21 |     rewrite:
22 |       uri: /kfam/
23 |     route:
24 |     - destination:
25 |         host: profiles-kfam.{{ .Values.kubeflow.namespace }}.svc.cluster.local
26 |         port:
27 |           number: 8081
28 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-profiles-and-kfam/values.yaml:
--------------------------------------------------------------------------------
 1 | profile_controller:
 2 |   role_arn:
 3 | kubeflow:
 4 |   namespace: kubeflow
 5 | ingress:
 6 |   namespace: ingress
 7 |   gateway: ingress-gateway
 8 |   sa: istio-ingressgateway
 9 | notebook_controller:
10 |   sa: notebook-controller-service-account
11 | pipeline_ui:
12 |   sa: ml-pipeline-ui
13 | 
14 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-roles/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: kubeflow-roles
3 | description: A Helm chart for Kubeflow roles
4 | type: application
5 | version: 1.0.0
6 | appVersion: "v1.9.2"
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: v1.9.2
3 | description: A Helm chart for Kubeflow tensorboards
4 | name: kubeflow-tensorboards
5 | type: application
6 | version: 1.0.0
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/controller/cluster_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: rbac.authorization.k8s.io/v1
 3 | kind: ClusterRoleBinding
 4 | metadata:
 5 |   name: tensorboard-controller-manager-rolebinding
 6 | roleRef:
 7 |   apiGroup: rbac.authorization.k8s.io
 8 |   kind: ClusterRole
 9 |   name: tensorboard-controller-manager-role
10 | subjects:
11 | - kind: ServiceAccount
12 |   name: tensorboard-controller-manager
13 |   namespace: {{ .Values.kubeflow.namespace }}
14 | 
15 | ---
16 | 
17 | apiVersion: rbac.authorization.k8s.io/v1
18 | kind: ClusterRoleBinding
19 | metadata:
20 |   name: tensorboard-controller-proxy-rolebinding
21 | roleRef:
22 |   apiGroup: rbac.authorization.k8s.io
23 |   kind: ClusterRole
24 |   name: tensorboard-controller-proxy-role
25 | subjects:
26 | - kind: ServiceAccount
27 |   name: tensorboard-controller-manager
28 |   namespace: {{ .Values.kubeflow.namespace }}
29 | ---


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/controller/config_map.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | data:
 3 |   ISTIO_GATEWAY: {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}
 4 |   RWO_PVC_SCHEDULING: 'True'
 5 |   TENSORBOARD_IMAGE: tensorflow/tensorflow:2.12.0
 6 | kind: ConfigMap
 7 | metadata:
 8 |   name: tensorboard-controller-config
 9 |   namespace: {{ .Values.kubeflow.namespace }}
10 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/controller/role.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: Role
 3 | metadata:
 4 |   name: tensorboard-controller-leader-election-role
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | rules:
 7 | - apiGroups:
 8 |   - ''
 9 |   resources:
10 |   - configmaps
11 |   verbs:
12 |   - get
13 |   - list
14 |   - watch
15 |   - create
16 |   - update
17 |   - patch
18 |   - delete
19 | - apiGroups:
20 |   - coordination.k8s.io
21 |   resources:
22 |   - leases
23 |   verbs:
24 |   - get
25 |   - list
26 |   - watch
27 |   - create
28 |   - update
29 |   - patch
30 |   - delete
31 | - apiGroups:
32 |   - ''
33 |   resources:
34 |   - events
35 |   verbs:
36 |   - create
37 |   - patch
38 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/controller/role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: RoleBinding
 3 | metadata:
 4 |   name: tensorboard-controller-leader-election-rolebinding
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | roleRef:
 7 |   apiGroup: rbac.authorization.k8s.io
 8 |   kind: Role
 9 |   name: tensorboard-controller-leader-election-role
10 | subjects:
11 | - kind: ServiceAccount
12 |   name: tensorboard-controller-manager
13 |   namespace: {{ .Values.kubeflow.namespace }}
14 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/controller/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   labels:
 5 |     app: tensorboard-controller
 6 |     control-plane: tensorboard-controller-manager
 7 |   name: tensorboard-controller-manager-metrics-service
 8 |   namespace: {{ .Values.kubeflow.namespace }}
 9 | spec:
10 |   ports:
11 |   - name: https
12 |     port: 8443
13 |     protocol: TCP
14 |     targetPort: https
15 |   selector:
16 |     app: tensorboard-controller
17 |     control-plane: tensorboard-controller-manager
18 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/controller/service_account.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 |   name: tensorboard-controller-manager
5 |   namespace: {{ .Values.kubeflow.namespace }}
6 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/webapp/authorization_policy.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: security.istio.io/v1
 2 | kind: AuthorizationPolicy
 3 | metadata:
 4 |   name: tensorboards-web-app
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | spec:
 7 |   action: ALLOW
 8 |   rules:
 9 |   - from:
10 |     - source:
11 |         principals:
12 |         - cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }}
13 |   selector:
14 |     matchLabels:
15 |       app: tensorboards-web-app
16 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/webapp/cluster_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   name: tensorboards-web-app-cluster-role-binding
 5 | roleRef:
 6 |   apiGroup: rbac.authorization.k8s.io
 7 |   kind: ClusterRole
 8 |   name: tensorboards-web-app-cluster-role
 9 | subjects:
10 | - kind: ServiceAccount
11 |   name: tensorboards-web-app-service-account
12 |   namespace: {{ .Values.kubeflow.namespace }}
13 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/webapp/deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   labels:
 5 |     app: tensorboards-web-app
 6 |   name: tensorboards-web-app-deployment
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | spec:
 9 |   replicas: 1
10 |   selector:
11 |     matchLabels:
12 |       app: tensorboards-web-app
13 |   template:
14 |     metadata:
15 |       labels:
16 |         app: tensorboards-web-app
17 |     spec:
18 |       containers:
19 |       - env:
20 |         - name: APP_PREFIX
21 |           value: /tensorboards
22 |         - name: USERID_HEADER
23 |           value: x-auth-request-email
24 |         - name: USERID_PREFIX
25 |           value: ''
26 |         - name: APP_SECURE_COOKIES
27 |           value: 'true'  # no trailing space: a padded value would not match an exact string comparison
28 |         image: docker.io/kubeflownotebookswg/tensorboards-web-app:v1.9.2
29 |         name: tensorboards-web-app
30 |         ports:
31 |         - containerPort: 5000
32 |       serviceAccountName: tensorboards-web-app-service-account
33 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/webapp/destination_rule.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1alpha3
 2 | kind: DestinationRule
 3 | metadata:
 4 |   name: tensorboards-web-app
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | spec:
 7 |   host: tensorboards-web-app-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local
 8 |   trafficPolicy:
 9 |     tls:
10 |       mode: ISTIO_MUTUAL
11 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/webapp/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   labels:
 5 |     app: tensorboards-web-app
 6 |   name: tensorboards-web-app-service
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | spec:
 9 |   ports:
10 |   - name: http
11 |     port: 80
12 |     protocol: TCP
13 |     targetPort: 5000
14 |   selector:
15 |     app: tensorboards-web-app
16 |   type: ClusterIP
17 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/webapp/service_account.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 |   name: tensorboards-web-app-service-account
5 |   namespace: {{ .Values.kubeflow.namespace }}
6 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/templates/webapp/virtual_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1beta1
 2 | kind: VirtualService
 3 | metadata:
 4 |   name: tensorboards-web-app-tensorboards-web-app
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | spec:
 7 |   gateways:
 8 |   - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}
 9 |   hosts:
10 |   - '*'
11 |   http:
12 |   - headers:
13 |       request:
14 |         add:
15 |           x-forwarded-prefix: /tensorboards
16 |     match:
17 |     - uri:
18 |         prefix: /tensorboards/
19 |     rewrite:
20 |       uri: /
21 |     route:
22 |     - destination:
23 |         host: tensorboards-web-app-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local
24 |         port:
25 |           number: 80
26 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-tensorboards/values.yaml:
--------------------------------------------------------------------------------
1 | kubeflow:
2 |   namespace: kubeflow
3 | ingress:
4 |   namespace: ingress
5 |   gateway: ingress-gateway
6 |   sa: istio-ingressgateway
7 | 
8 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-training-operator/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: "v1.9.2"
3 | description: A Helm chart for Kubeflow training-operator
4 | name: kubeflow-training-operator
5 | type: application
6 | version: 1.0.0
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-training-operator/templates/cluster_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: rbac.authorization.k8s.io/v1
 3 | kind: ClusterRoleBinding
 4 | metadata:
 5 |   labels:
 6 |     app: training-operator
 7 |   name: training-operator  # cluster-scoped resource, so no metadata.namespace
 8 | roleRef:
 9 |   apiGroup: rbac.authorization.k8s.io
10 |   kind: ClusterRole
11 |   name: training-operator
12 | subjects:
13 | - kind: ServiceAccount
14 |   name: training-operator
15 |   namespace: {{ .Values.namespace }}


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-training-operator/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: v1
 3 | kind: Service
 4 | metadata:
 5 |   annotations:
 6 |     prometheus.io/path: /metrics
 7 |     prometheus.io/scrape: "true"
 8 |     prometheus.io/port: "8080"
 9 |   labels:
10 |     app: training-operator
11 |   name: training-operator
12 |   namespace: {{ .Values.namespace }}
13 | spec:
14 |   ports:
15 |   - name: monitoring-port
16 |     port: 8080
17 |     targetPort: 8080
18 |   selector:
19 |     control-plane: kubeflow-training-operator
20 |   type: ClusterIP
21 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-training-operator/templates/service_account.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 |   labels:
5 |     app: training-operator
6 |   name: training-operator
7 |   namespace: {{ .Values.namespace }}
8 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-training-operator/values.yaml:
--------------------------------------------------------------------------------
1 | namespace: kubeflow


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-user-profile-defaults/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: v1.9.2
3 | description: A Helm chart for Kubeflow user Pod Defaults
4 | name: kubeflow-user-defaults
5 | type: application
6 | version: 1.0.7
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-user-profile-defaults/templates/pod_default.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: kubeflow.org/v1alpha1
 2 | kind: PodDefault
 3 | metadata:
 4 |   name: access-ml-pipeline
 5 |   namespace: {{ .Values.user.profile }}
 6 | spec:
 7 |   desc: Allow access to Kubeflow Pipelines
 8 |   selector:
 9 |     matchLabels:
10 |       access-ml-pipeline: "true"
11 |   env:
12 |     - name: KF_PIPELINES_SA_TOKEN_PATH
13 |       value: /var/run/secrets/kubeflow/pipelines/token
14 |   volumes:
15 |     - name: volume-kf-pipeline-token
16 |       projected:
17 |         sources:
18 |           - serviceAccountToken:
19 |               path: token
20 |               expirationSeconds: 7200
21 |               ## defined by the `TOKEN_REVIEW_AUDIENCE` environment variable on the `ml-pipeline` deployment
22 |               audience: pipelines.kubeflow.org      
23 |   volumeMounts:
24 |     - mountPath: /var/run/secrets/kubeflow/pipelines
25 |       name: volume-kf-pipeline-token
26 |       readOnly: true


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-user-profile-defaults/templates/role_bindings.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: rbac.authorization.k8s.io/v1
 3 | kind: RoleBinding
 4 | metadata:
 5 |   name: pipeline-runner-binding
 6 |   namespace: {{ .Values.user.profile }}
 7 | roleRef:
 8 |   apiGroup: rbac.authorization.k8s.io
 9 |   kind: Role
10 |   name: pipeline-runner
11 | subjects:
12 | - kind: ServiceAccount
13 |   name: default
14 |   namespace: {{ .Values.user.profile }}
15 | ---
16 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-user-profile-defaults/values.yaml:
--------------------------------------------------------------------------------
1 | user: 
2 |   profile: kubeflow-user-example-com


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-user-profile/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: v1.9.2
3 | description: A Helm chart for Kubeflow user namespace
4 | name: kubeflow-user-profile
5 | type: application
6 | version: 1.0.0
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-user-profile/templates/config_map.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 |   name: default-install-config
5 | data:  # ConfigMap data values must be strings, so both entries are quoted
6 |   profile-name: {{ .Values.user.profile | quote }}
7 |   user: {{ .Values.user.email | quote }}
8 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-user-profile/templates/profile.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: kubeflow.org/v1beta1
 2 | kind: Profile
 3 | metadata:
 4 |   name: {{ .Values.user.profile }}
 5 | spec:
 6 |   owner:
 7 |     kind: User
 8 |     name: {{ .Values.user.email }}
 9 |   {{- if .Values.awsIamForServiceAccount.awsIamRole }}
10 |   plugins:
11 |   - kind: AwsIamForServiceAccount
12 |     spec:
13 |       awsIamRole: '{{ .Values.awsIamForServiceAccount.awsIamRole }}'
14 |       annotateOnly: true
15 |   {{- end }}


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-user-profile/values.yaml:
--------------------------------------------------------------------------------
1 | user: 
2 |   profile: kubeflow-user-example-com
3 |   email: user@example.com
4 | awsIamForServiceAccount:
5 |   awsIamRole:


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: v1.9.2
3 | description: A Helm chart for Kubeflow Volumes
4 | name: kubeflow-volumes
5 | type: application
6 | version: 1.0.0
7 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/controller/certificate.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: cert-manager.io/v1
 2 | kind: Certificate
 3 | metadata:
 4 |   name: pvcviewer-server-cert
 5 |   namespace:  {{ .Values.kubeflow.namespace }}
 6 | spec:
 7 |   dnsNames:
 8 |   - pvcviewer-webhook-service.{{ .Values.kubeflow.namespace }}.svc
 9 |   - pvcviewer-webhook-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local
10 |   issuerRef:
11 |     kind: Issuer
12 |     name: pvcviewer-selfsigned-issuer
13 |   secretName: pvcviewer-server-cert


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/controller/cluster_role_bindings.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: rbac.authorization.k8s.io/v1
 3 | kind: ClusterRoleBinding
 4 | metadata:
 5 |   name: pvcviewer-manager-rolebinding
 6 | roleRef:
 7 |   apiGroup: rbac.authorization.k8s.io
 8 |   kind: ClusterRole
 9 |   name: pvcviewer-manager-role
10 | subjects:
11 | - kind: ServiceAccount
12 |   name:  pvcviewer-controller-sa
13 |   namespace: {{ .Values.kubeflow.namespace }}
14 | ---
15 | apiVersion: rbac.authorization.k8s.io/v1
16 | kind: ClusterRoleBinding
17 | metadata:
18 |   name: pvcviewer-proxy-rolebinding
19 | roleRef:
20 |   apiGroup: rbac.authorization.k8s.io
21 |   kind: ClusterRole
22 |   name: pvcviewer-proxy-role
23 | subjects:
24 | - kind: ServiceAccount
25 |   name:  pvcviewer-controller-sa
26 |   namespace: {{ .Values.kubeflow.namespace }}
27 | ---
28 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/controller/config_map.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | data:
3 |   ISTIO_GATEWAY: {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}
4 | kind: ConfigMap
5 | metadata:
6 |   name: pvcviewer-controller-config
7 |   namespace: {{ .Values.kubeflow.namespace }}


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/controller/issuer.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: cert-manager.io/v1
2 | kind: Issuer
3 | metadata:
4 |   name: pvcviewer-selfsigned-issuer
5 |   namespace: {{ .Values.kubeflow.namespace }}
6 | spec:
7 |   selfSigned: {}


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/controller/role.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: rbac.authorization.k8s.io/v1
 3 | kind: Role
 4 | metadata:
 5 |   name: pvcviewer-leader-election-role
 6 |   namespace: {{ .Values.kubeflow.namespace }}
 7 | rules:
 8 | - apiGroups:
 9 |   - ""
10 |   resources:
11 |   - configmaps
12 |   verbs:
13 |   - get
14 |   - list
15 |   - watch
16 |   - create
17 |   - update
18 |   - patch
19 |   - delete
20 | - apiGroups:
21 |   - coordination.k8s.io
22 |   resources:
23 |   - leases
24 |   verbs:
25 |   - get
26 |   - list
27 |   - watch
28 |   - create
29 |   - update
30 |   - patch
31 |   - delete
32 | - apiGroups:
33 |   - ""
34 |   resources:
35 |   - events
36 |   verbs:
37 |   - create
38 |   - patch


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/controller/role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: RoleBinding
 3 | metadata:
 4 |   name: pvcviewer-leader-election-rolebinding
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | roleRef:
 7 |   apiGroup: rbac.authorization.k8s.io
 8 |   kind: Role
 9 |   name: pvcviewer-leader-election-role
10 | subjects:
11 | - kind: ServiceAccount
12 |   name:  pvcviewer-controller-sa
13 |   namespace: {{ .Values.kubeflow.namespace }}
14 | ---


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/controller/service.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: v1
 3 | kind: Service
 4 | metadata:
 5 |   labels:
 6 |     control-plane: pvcviewer-controller-manager
 7 |   name: pvcviewer-webhook-service
 8 |   namespace: {{ .Values.kubeflow.namespace }}
 9 | spec:
10 |   ports:
11 |     - port: 443
12 |       protocol: TCP
13 |       targetPort: 9443
14 |   selector:
15 |     control-plane: pvcviewer-controller-manager
16 | ---


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/controller/service_account.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 |   name:  pvcviewer-controller-sa
5 |   namespace: {{ .Values.kubeflow.namespace }}
6 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/webapp/authorization_policy.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: security.istio.io/v1beta1
 2 | kind: AuthorizationPolicy  # Istio policy restricting who may call the volumes web app workload.
 3 | metadata:
 4 |   labels:
 5 |     app: volumes-web-app
 6 |     
 7 |   name: volumes-web-app
 8 |   namespace: {{ .Values.kubeflow.namespace }}
 9 | spec:
10 |   action: ALLOW  # Requests matching a rule below are allowed.
11 |   rules:
12 |   - from:
13 |     - source:
14 |         principals:  # Single entry: the ingress gateway service account, built from values ingress.namespace and ingress.sa.
15 |         - cluster.local/ns/{{ .Values.ingress.namespace }}/sa/{{ .Values.ingress.sa }}
16 |   selector:  # Policy applies to workloads labeled app: volumes-web-app.
17 |     matchLabels:
18 |       app: volumes-web-app
19 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/webapp/cluster_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   labels:
 5 |     app: volumes-web-app
 6 |   name: kubeflow-volume-cluster-role-binding
 7 | roleRef:
 8 |   apiGroup: rbac.authorization.k8s.io
 9 |   kind: ClusterRole
10 |   name: kubeflow-volume-cluster-role
11 | subjects:
12 | - kind: ServiceAccount
13 |   name: volumes-web-app-service-account
14 |   namespace: {{ .Values.kubeflow.namespace }}
15 | ---
16 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/webapp/config_map.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ConfigMap  # Mounted by volumes-web-app-deployment as the single file /etc/config/viewer-spec.yaml.
 3 | metadata:
 4 |   name: volumes-web-app-viewer-spec
 5 |   namespace: {{ .Values.kubeflow.namespace }}
 6 | data:  # NOTE(review): the $VOLUME_VIEWER_IMAGE, $NAMESPACE and $NAME tokens below are literal text to Helm; presumably the web app substitutes them at runtime (confirm).
 7 |   viewer-spec.yaml : |  # NOTE(review): serviceAccountName is nested under the container entry rather than at pod level; verify the consumer reads it there.
 8 |     podTemplate:
 9 |       containers:
10 |         - name: main
11 |           image: $VOLUME_VIEWER_IMAGE
12 |           env:
13 |             - name: FB_ADDRESS
14 |               value: "0.0.0.0"
15 |             - name: FB_PORT
16 |               value: "8080"
17 |             - name: FB_DATABASE
18 |               value: /tmp/filebrowser.db
19 |             - name: FB_NOAUTH
20 |               value: "true"
21 |             - name: FB_BASEURL
22 |               value: /pvcviewers/$NAMESPACE/$NAME/
23 |           readinessProbe:
24 |             tcpSocket:
25 |               port: 8080
26 |             initialDelaySeconds: 2
27 |             periodSeconds: 10
28 |           # viewer-volume is provided automatically by the volumes web app
29 |           volumeMounts:
30 |             - name: viewer-volume
31 |               mountPath: /data
32 |           workingDir: /data
33 |           serviceAccountName: default-editor
34 |     networking:
35 |       targetPort: 8080
36 |       basePrefix: "/pvcviewers"
37 |       rewrite: "/"
38 |       timeout: 30s
39 |     rwoScheduling: true


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/webapp/deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment  # Runs a single replica of the Kubeflow volumes web app.
 3 | metadata:
 4 |   labels:
 5 |     app: volumes-web-app
 6 |   name: volumes-web-app-deployment
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | spec:
 9 |   replicas: 1
10 |   selector:
11 |     matchLabels:
12 |       app: volumes-web-app
13 |   template:
14 |     metadata:
15 |       labels:
16 |         app: volumes-web-app  # Also the label selected by volumes-web-app-service and the volumes-web-app AuthorizationPolicy.
17 |     spec:
18 |       containers:
19 |       - name: volumes-web-app
20 |         image: docker.io/kubeflownotebookswg/volumes-web-app:v1.9.2
21 |         ports:
22 |         - containerPort: 5000  # Same port as targetPort in volumes-web-app-service.
23 |         env:
24 |         - name: APP_PREFIX
25 |           value: /volumes  # Same prefix the virtual service matches and forwards as x-forwarded-prefix.
26 |         - name: USERID_HEADER
27 |           value: x-auth-request-email  # NOTE(review): presumably the identity header injected by oauth2-proxy; confirm.
28 |         - name: USERID_PREFIX
29 |           value: ''
30 |         - name: APP_SECURE_COOKIES
31 |           value: 'true'
32 |         - name: VOLUME_VIEWER_IMAGE  # Referenced as $VOLUME_VIEWER_IMAGE in the viewer-spec ConfigMap.
33 |           value: filebrowser/filebrowser:latest  # NOTE(review): floating tag; consider pinning a release so viewer pods are reproducible.
34 |         volumeMounts: 
35 |         - name: viewer-spec
36 |           mountPath: /etc/config/viewer-spec.yaml
37 |           subPath: viewer-spec.yaml  # subPath mounts only this ConfigMap key, as a single file.
38 |       serviceAccountName: volumes-web-app-service-account
39 |       volumes: 
40 |       - name: viewer-spec
41 |         configMap: 
42 |           name: volumes-web-app-viewer-spec
43 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/webapp/destination_rule.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1alpha3
 2 | kind: DestinationRule
 3 | metadata:
 4 |   labels:
 5 |     app: volumes-web-app
 6 |     
 7 |   name: volumes-web-app
 8 |   namespace: {{ .Values.kubeflow.namespace }}
 9 | spec:
10 |   host: volumes-web-app-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local
11 |   trafficPolicy:
12 |     tls:
13 |       mode: ISTIO_MUTUAL
14 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/webapp/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   labels:
 5 |     app: volumes-web-app
 6 |     run: volumes-web-app
 7 |   name: volumes-web-app-service
 8 |   namespace: {{ .Values.kubeflow.namespace }}
 9 | spec:
10 |   ports:
11 |   - name: http
12 |     port: 80
13 |     protocol: TCP
14 |     targetPort: 5000
15 |   selector:
16 |     app: volumes-web-app
17 |   type: ClusterIP
18 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/webapp/service_account.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 |   labels:
5 |     app: volumes-web-app
6 |     
7 |   name: volumes-web-app-service-account
8 |   namespace: {{ .Values.kubeflow.namespace }}
9 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/templates/webapp/virtual_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1alpha3
 2 | kind: VirtualService  # Routes /volumes/ traffic arriving at the ingress gateway to the volumes web app service.
 3 | metadata:
 4 |   labels:
 5 |     app: volumes-web-app
 6 |   name: volumes-web-app-virtual-service
 7 |   namespace: {{ .Values.kubeflow.namespace }}
 8 | spec:
 9 |   gateways:  # Gateway reference in namespace/name form, from values ingress.namespace and ingress.gateway.
10 |   - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}
11 |   hosts:
12 |   - '*'
13 |   http:
14 |   - headers:
15 |       request:
16 |         add:
17 |           x-forwarded-prefix: /volumes  # Passes the external prefix to the backend, since the rewrite below removes it from the URI.
18 |     match:
19 |     - uri:
20 |         prefix: /volumes/
21 |     rewrite:
22 |       uri: /  # The matched /volumes/ prefix is replaced with /.
23 |     route:
24 |     - destination:
25 |         host: volumes-web-app-service.{{ .Values.kubeflow.namespace }}.svc.cluster.local
26 |         port:
27 |           number: 80  # Service port; the service forwards to container port 5000.
28 | 


--------------------------------------------------------------------------------
/charts/ml-platform/kubeflow-volumes/values.yaml:
--------------------------------------------------------------------------------
1 | kubeflow:
2 |   namespace: kubeflow
3 | ingress:
4 |   namespace: ingress
5 |   gateway: ingress-gateway
6 |   sa: istio-ingressgateway
7 | 


--------------------------------------------------------------------------------
/charts/mpi-operator/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "0.4.0"
3 | description: A Helm chart for kubeflow mpi-operator 
4 | name: mpi-operator
5 | version: 2.1.0
6 | 


--------------------------------------------------------------------------------
/charts/mpi-operator/values.yaml:
--------------------------------------------------------------------------------
1 | namespace: kubeflow
2 | image: mpioperator/mpi-operator:0.4.0
3 | pullpolicy: Always
4 | 


--------------------------------------------------------------------------------
/charts/nvidia-device-plugin/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "v0.14.3"
3 | description: A Helm chart for Nvidia device plugin
4 | name: nvidia-device-plugin
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/oauth2-proxy-route/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: oauth2-proxy-route
3 | description: A Helm chart for oauth2-proxy route
4 | type: application
5 | version: 1.0.0
6 | appVersion: "7.5.1"
7 | 


--------------------------------------------------------------------------------
/charts/oauth2-proxy-route/templates/authorization_policy.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: security.istio.io/v1
 2 | kind: AuthorizationPolicy  # Sends ingress gateway requests through an external authorizer.
 3 | metadata:
 4 |   name: external-auth
 5 |   namespace: {{ .Values.ingress.namespace }} 
 6 | spec:
 7 |   selector:  # Applies to workloads carrying both ingress gateway labels below.
 8 |     matchLabels:
 9 |       app: istio-ingressgateway
10 |       istio: ingressgateway
11 |   action: CUSTOM  # Delegates the allow/deny decision to the provider named below.
12 |   provider:
13 |     name: oauth2-proxy  # NOTE(review): must match an extension provider registered in the Istio mesh config, which is not visible in this chart; confirm.
14 |   rules:
15 |   - to:
16 |     - operation:
17 |         notPaths: ["/dex/*", "/authservice/logout"]  # Every path except these two is subject to the external check.


--------------------------------------------------------------------------------
/charts/oauth2-proxy-route/templates/virtual_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.istio.io/v1beta1
 2 | kind: VirtualService  # Exposes oauth2-proxy endpoints through the ingress gateway.
 3 | metadata:
 4 |   name: oauth2-proxy
 5 |   namespace: {{ .Values.oauth2_proxy.namespace }} 
 6 | spec:
 7 |   hosts:
 8 |   - '*'
 9 |   gateways:  # Gateway reference in namespace/name form, from values ingress.namespace and ingress.gateway.
10 |   - {{ .Values.ingress.namespace }}/{{ .Values.ingress.gateway }}
11 |   http:
12 |   - name: logout  # Redirects the exact path /authservice/logout to /oauth2/sign_out.
13 |     match:
14 |     - uri:
15 |         exact: /authservice/logout
16 |     redirect:
17 |       uri: /oauth2/sign_out
18 |   - name: "oauth2"  # Forwards everything under the /oauth2 prefix to the oauth2-proxy service.
19 |     match:
20 |     - uri:
21 |         prefix: "/oauth2"
22 |     route:
23 |     - destination:
24 |         host: oauth2-proxy.{{ .Values.oauth2_proxy.namespace }}.svc.cluster.local
25 |         port:
26 |           number: 80


--------------------------------------------------------------------------------
/charts/oauth2-proxy-route/values.yaml:
--------------------------------------------------------------------------------
1 | oauth2_proxy:
2 |   namespace: auth
3 | ingress:
4 |   namespace: ingress
5 |   gateway: ingress-gateway


--------------------------------------------------------------------------------
/charts/pv-efs/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "2.5.2"
3 | description: A Helm chart for k8s persistent volume for EFS
4 | name: pv-efs
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/charts/pv-efs/templates/pv.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolume
 3 | metadata:
 4 |   name: {{ .Values.efs.volume_name | default "pv-efs" }}
 5 | spec:
 6 |   capacity:
 7 |     storage: {{ .Values.efs.storage | default "1000Gi" }}  
 8 |   volumeMode: Filesystem
 9 |   accessModes:
10 |     - ReadWriteMany
11 |   persistentVolumeReclaimPolicy: Retain
12 |   storageClassName: {{ .Values.efs.class_name | default "efs-sc" }}
13 |   csi:
14 |     driver: efs.csi.aws.com
15 |     volumeHandle: {{ .Values.efs.fs_id }}


--------------------------------------------------------------------------------
/charts/pv-efs/templates/pvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: {{ .Values.efs.claim_name | default "pv-efs" }}
 5 |   namespace: {{ .Values.namespace }}
 6 | spec:
 7 |   accessModes:
 8 |     - ReadWriteMany
 9 |   storageClassName: {{ .Values.efs.class_name | default "efs-sc" }}
10 |   volumeName: {{ .Values.efs.volume_name | default "pv-efs" }}
11 |   resources:
12 |     requests:
13 |       storage: {{ .Values.efs.storage | default "1000Gi" }} 


--------------------------------------------------------------------------------
/charts/pv-efs/templates/storage-class.yaml:
--------------------------------------------------------------------------------
1 | kind: StorageClass
2 | apiVersion: storage.k8s.io/v1
3 | metadata:
4 |   name: {{ .Values.efs.class_name | default "efs-sc" }}
5 | provisioner: efs.csi.aws.com


--------------------------------------------------------------------------------
/charts/pv-fsx/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: "1.8.0"
3 | description: A Helm chart for k8s persistent volume for FSx for Lustre
4 | name: pv-fsx
5 | version: 1.1.0
6 | 


--------------------------------------------------------------------------------
/charts/pv-fsx/templates/pv.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolume
 3 | metadata:
 4 |   name: {{ .Values.fsx.volume_name | default "pv-fsx" }}
 5 | spec:
 6 |   capacity:
 7 |     storage: {{ .Values.fsx.storage | default "1200Gi" }} 
 8 |   volumeMode: Filesystem
 9 |   accessModes:
10 |     - ReadWriteMany
11 |   mountOptions:
12 |     - noatime
13 |     - flock
14 |   persistentVolumeReclaimPolicy: Retain
15 |   storageClassName: {{ .Values.fsx.class_name | default "fsx-sc" }}
16 |   csi:
17 |     driver: fsx.csi.aws.com
18 |     volumeHandle:  {{ .Values.fsx.fs_id }}
19 |     volumeAttributes:
20 |       dnsname: {{ .Values.fsx.dns_name }}
21 |       mountname: {{ .Values.fsx.mount_name }}


--------------------------------------------------------------------------------
/charts/pv-fsx/templates/pvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: {{ .Values.fsx.claim_name | default "pv-fsx" }}
 5 |   namespace: {{ .Values.namespace }}
 6 | spec:
 7 |   accessModes:
 8 |     - ReadWriteMany
 9 |   storageClassName: {{ .Values.fsx.class_name | default "fsx-sc" }}
10 |   volumeName: {{ .Values.fsx.volume_name | default "pv-fsx" }}
11 |   resources:
12 |     requests:
13 |       storage: {{ .Values.fsx.storage | default "1200Gi" }} 


--------------------------------------------------------------------------------
/charts/pv-fsx/templates/storage-class.yaml:
--------------------------------------------------------------------------------
1 | kind: StorageClass
2 | apiVersion: storage.k8s.io/v1
3 | metadata:
4 |   name: {{ .Values.fsx.class_name | default "fsx-sc" }}
5 | provisioner: fsx.csi.aws.com


--------------------------------------------------------------------------------
/containers/aws-samples-maskrcnn/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=mask-rcnn-tensorflow
4 | export IMAGE_TAG=tf2.12.0-cu11.8-ubuntu22.04-59168dc
5 | 


--------------------------------------------------------------------------------
/containers/megatron-deepspeed/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Megatron-DeepSpeed image built on the NGC PyTorch 24.01 base.
 2 | FROM nvcr.io/nvidia/pytorch:24.01-py3
 3 | 
 4 | ENV DEBIAN_FRONTEND=noninteractive
 5 | ENV DEBCONF_NONINTERACTIVE_SEEN=true
 6 | 
 7 | # Check out NVIDIA Apex at a pinned commit.
 8 | RUN git clone https://github.com/NVIDIA/apex /apex
 9 | RUN cd /apex && git fetch origin b496d85fb88a801d8e680872a12822de310951fd
10 | RUN cd /apex && git reset --hard b496d85fb88a801d8e680872a12822de310951fd
11 | 
12 | RUN pip3 install --upgrade pip
13 | # Build Apex with its C++ and CUDA extensions enabled.
14 | RUN cd /apex && pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
15 | RUN pip3 install deepspeed==0.13.4
16 | RUN pip3 install git+https://github.com/microsoft/Megatron-DeepSpeed.git@a9856ce0e75dbe69c96d4e241e8a191b344118d7
17 | 
18 | # Refresh the package index and install in the same layer, so a cached
19 | # "apt-get update" layer can never be paired with a later install step.
20 | # The index is removed afterwards to keep it out of the image.
21 | RUN apt-get update \
22 |  && apt-get install -y libaio-dev \
23 |  && rm -rf /var/lib/apt/lists/*
24 | 
25 | CMD ["/bin/bash"]
26 | 


--------------------------------------------------------------------------------
/containers/megatron-deepspeed/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=megatron-deepspeed-ngc
4 | export IMAGE_TAG=mt-a9856ce-ds-0.13.4-ngc-24.01-py3
5 | 


--------------------------------------------------------------------------------
/containers/nemo-megatron/Dockerfile:
--------------------------------------------------------------------------------
 1 | # NeMo + Megatron-LM image built on the NGC PyTorch 24.10 base.
 2 | FROM nvcr.io/nvidia/pytorch:24.10-py3
 3 | 
 4 | ENV DEBIAN_FRONTEND=noninteractive
 5 | ENV DEBCONF_NONINTERACTIVE_SEEN=true
 6 | 
 7 | # Install system libraries and drop the apt package index in the same layer;
 8 | # no later step uses apt, so the index would only add to the image size.
 9 | RUN apt-get update \
10 |  && apt-get install -y libsndfile1 ffmpeg \
11 |  && rm -rf /var/lib/apt/lists/*
12 | 
13 | RUN pip3 install --upgrade pip
14 | # Check out NeMo at a pinned commit and install it in editable mode.
15 | RUN git clone https://github.com/NVIDIA/NeMo.git /NeMo
16 | RUN cd /NeMo && git fetch origin 6b0f0886f933c6e21c92b2f1981f66993134be7e
17 | RUN cd /NeMo && git reset --hard 6b0f0886f933c6e21c92b2f1981f66993134be7e
18 | RUN cd /NeMo && pip install -e .
19 | 
20 | RUN cd /NeMo && pip install -r /NeMo/requirements/requirements_common.txt
21 | RUN cd /NeMo && pip install -r /NeMo/requirements/requirements_lightning.txt
22 | RUN cd /NeMo && pip install -r /NeMo/requirements/requirements_nlp.txt
23 | 
24 | # NeMo-Run and Megatron-LM are installed from pinned commits.
25 | RUN pip3 install git+https://github.com/NVIDIA/NeMo-Run.git@5ed6128f9285e61cfee73d780b663c9d780f20c7
26 | RUN pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@9c11ab4ca24ead28c3c1e29f8904f8258d7543cb
27 | 
28 | # Pinned Hugging Face libraries.
29 | RUN pip3 install transformers==4.48.1
30 | RUN pip3 install datasets==3.2.0
31 | RUN pip3 install huggingface_hub==0.27.1
32 | 
33 | CMD ["/bin/bash"]
34 | 


--------------------------------------------------------------------------------
/containers/nemo-megatron/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=nemo-megatron
4 | export IMAGE_TAG=nemo-6b0f088-megatron-9c11ab4-ngc-24.10-py3
5 | 


--------------------------------------------------------------------------------
/containers/ray-pytorch-neuronx-vllm/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=ray-neuronx-vllm
4 | export IMAGE_TAG=ray2.44.0-py311-2.22.0-0.8.5.post1
5 | 


--------------------------------------------------------------------------------
/containers/ray-pytorch-neuronx-vllm/patches/vllm-neuron-0.6.6.post1.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
 2 | index 3f626968..af3eb0c5 100644
 3 | --- a/vllm/worker/neuron_worker.py
 4 | +++ b/vllm/worker/neuron_worker.py
 5 | @@ -58,7 +58,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
 6 |          # Set the number of GPU blocks to be the same as the maximum number of
 7 |          # sequences that can be processed in a single batch. This is equivalent
 8 |          # to schedule without PagedAttention.
 9 | -        num_gpu_blocks = self.scheduler_config.max_num_seqs
10 | +        num_gpu_blocks = self.scheduler_config.max_num_seqs + 1
11 |  
12 |          # Swap not yet supported with Neuron backend.
13 |          num_cpu_blocks = 0
14 | @@ -72,7 +72,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
15 |  
16 |          # Different values are not tested.
17 |          assert num_cpu_blocks == 0
18 | -        assert num_gpu_blocks == self.scheduler_config.max_num_seqs
19 | +        assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1
20 |  
21 |          self.cache_config.num_gpu_blocks = num_gpu_blocks
22 |          self.cache_config.num_cpu_blocks = num_cpu_blocks
23 |          
24 |          
25 |          
26 | 
27 | 


--------------------------------------------------------------------------------
/containers/ray-pytorch-neuronx-vllm/patches/vllm_v0.5.0_neuron.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
 2 | index e7f0e887..87564b76 100644
 3 | --- a/vllm/executor/neuron_executor.py
 4 | +++ b/vllm/executor/neuron_executor.py
 5 | @@ -48,9 +48,9 @@ class NeuronExecutor(ExecutorBase):
 6 |     def execute_model(
 7 |             self,
 8 |             execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
 9 | -        assert (execute_model_req.blocks_to_swap_in == {}
10 | -                and execute_model_req.blocks_to_swap_out == {}
11 | -                and execute_model_req.blocks_to_copy == {}), (
12 | +        assert (not execute_model_req.blocks_to_swap_in
13 | +                and not execute_model_req.blocks_to_swap_out
14 | +                and not execute_model_req.blocks_to_copy), (
15 |                     "Cache operations are not supported for Neuron backend.")
16 |         assert execute_model_req.num_lookahead_slots == 0, (
17 |             "lookahead not supported for Neuron backend.")
18 | 


--------------------------------------------------------------------------------
/containers/ray-pytorch-neuronx/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=ray-neuronx
4 | export IMAGE_TAG=ray2.44.0-py311-2.22.0
5 | 


--------------------------------------------------------------------------------
/containers/ray-pytorch/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM rayproject/ray:2.44.0-py311-cu125
 2 | 
 3 | ENV DEBIAN_FRONTEND=noninteractive
 4 | ENV DEBCONF_NONINTERACTIVE_SEEN=true
 5 | # Only torch is version-pinned; torchvision and torchaudio are left for pip to resolve against that pin.
 6 | RUN pip install --upgrade pip
 7 | RUN pip install torch==2.5.1 torchvision torchaudio
 8 | # Default command is an interactive shell.
 9 | CMD ["/bin/bash"]
10 | 


--------------------------------------------------------------------------------
/containers/ray-pytorch/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=ray-pytorch-cuda
4 | export IMAGE_TAG=2.44.0-py311-cu125-2.5.1
5 | 


--------------------------------------------------------------------------------
/containers/tensorpack-maskrcnn/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=mask-rcnn-tensorpack
4 | export IMAGE_TAG=tf2.12.0-cu118-ubuntu22.04-fac024f
5 | 


--------------------------------------------------------------------------------
/containers/tritonserver-neuronx-djl-lmi/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=tritonserver-neuronx-djl-lmi
4 | export IMAGE_TAG=24.06-2.21.0-c343d60
5 | 


--------------------------------------------------------------------------------
/containers/tritonserver-neuronx-vllm/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=tritonserver-neuronx-vllm
4 | export IMAGE_TAG=24.06-2.22.0-0.8.5.post1
5 | 


--------------------------------------------------------------------------------
/containers/tritonserver-neuronx-vllm/patch/vllm-neuron-0.6.6.post1.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
 2 | index 3f626968..af3eb0c5 100644
 3 | --- a/vllm/worker/neuron_worker.py
 4 | +++ b/vllm/worker/neuron_worker.py
 5 | @@ -58,7 +58,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
 6 |          # Set the number of GPU blocks to be the same as the maximum number of
 7 |          # sequences that can be processed in a single batch. This is equivalent
 8 |          # to schedule without PagedAttention.
 9 | -        num_gpu_blocks = self.scheduler_config.max_num_seqs
10 | +        num_gpu_blocks = self.scheduler_config.max_num_seqs + 1
11 |  
12 |          # Swap not yet supported with Neuron backend.
13 |          num_cpu_blocks = 0
14 | @@ -72,7 +72,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
15 |  
16 |          # Different values are not tested.
17 |          assert num_cpu_blocks == 0
18 | -        assert num_gpu_blocks == self.scheduler_config.max_num_seqs
19 | +        assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1
20 |  
21 |          self.cache_config.num_gpu_blocks = num_gpu_blocks
22 |          self.cache_config.num_cpu_blocks = num_cpu_blocks
23 |          
24 |          
25 |          
26 | 
27 | 


--------------------------------------------------------------------------------
/containers/tritonserver-neuronx/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=tritonserver-neuronx
4 | export IMAGE_TAG=24.06-2.22.0
5 | 


--------------------------------------------------------------------------------
/containers/tritonserver-ray-vllm/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Adds kubectl and the kubessh helper script on top of the Triton vLLM base image.
 2 | ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.01-vllm-python-py3
 3 | FROM ${BASE_IMAGE}
 4 | 
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | ENV DEBCONF_NONINTERACTIVE_SEEN=true
 7 | 
 8 | # apt-get is used rather than apt, whose command-line interface is not
 9 | # guaranteed stable for scripted use.
10 | RUN apt-get update \
11 |  && apt-get install --yes \
12 |     apt-transport-https \
13 |     ca-certificates \
14 |     curl \
15 |     gnupg \
16 |     cgroup-tools \
17 |  && apt-get clean \
18 |  && rm -rf /var/lib/apt/lists/*
19 | 
20 | # Fetch the signing key for the Kubernetes v1.29 apt repository. The keyrings
21 | # directory is created explicitly because BASE_IMAGE is overridable and the
22 | # chosen base may not ship it.
23 | RUN mkdir -p -m 755 /etc/apt/keyrings \
24 |  && curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key \
25 |   | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg \
26 |  && chmod 644 /etc/apt/keyrings/kubernetes-apt-keyring.gpg
27 | 
28 | RUN echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.29/deb/ /' \
29 |   | tee /etc/apt/sources.list.d/kubernetes.list \
30 |  && chmod 644 /etc/apt/sources.list.d/kubernetes.list
31 | 
32 | # Install kubectl from the repository registered above, then clean up.
33 | RUN apt-get update \
34 |  && apt-get install --yes \
35 |     kubectl \
36 |  && apt-get autoremove --yes \
37 |  && apt-get clean \
38 |  && rm -rf /var/lib/apt/lists/*
39 | 
40 | WORKDIR /workspace
41 | 
42 | # kubessh is installed read/execute-only (mode 555); COPY --chmod requires BuildKit.
43 | COPY --chmod=555 resources/kubessh /usr/local/bin/kubessh
44 | COPY resources/server.py .


--------------------------------------------------------------------------------
/containers/tritonserver-ray-vllm/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=tritonserver-ray-vllm
4 | export IMAGE_TAG=25.01-vllm-python-py3
5 | 


--------------------------------------------------------------------------------
/containers/tritonserver-ray-vllm/resources/kubessh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | pod=$1  # first argument: name of the target pod
18 | shift   # the remaining arguments make up the remote command
19 | kubectl exec "$pod" -- /bin/sh -c "$*"  # pod is quoted so it is passed as exactly one argument; "$*" joins the command words for sh -c
20 | 


--------------------------------------------------------------------------------
/containers/tritonserver-trtllm/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | export IMAGE_NAME=tritonserver-trtllm
4 | export IMAGE_TAG=24.12-trtllm-python-py3
5 | 


--------------------------------------------------------------------------------
/containers/tritonserver-trtllm/resources/kubessh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | pod=$1  # first argument: name of the target pod
18 | shift   # the remaining arguments make up the remote command
19 | kubectl exec "$pod" -- /bin/sh -c "$*"  # pod is quoted so it is passed as exactly one argument; "$*" joins the command words for sh -c
20 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/README.md:
--------------------------------------------------------------------------------
1 | ## Deprecated scripts
2 | 
3 | The scripts in this folder are deprecated: they are no longer used or maintained.


--------------------------------------------------------------------------------
/eks-cluster/legacy/apply-aws-auth-cm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | kubectl apply -f aws-auth-cm.yaml
4 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/apply-nvidia-plugin.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Apply the NVIDIA device plugin manifest, then list each node's name and allocatable nvidia.com/gpu count.
3 | kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml  # NOTE(review): master is a moving ref, so the applied manifest can change between runs.
4 | kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu"
5 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/aws-auth-cm.yaml:
--------------------------------------------------------------------------------
 1 | # aws-auth ConfigMap template: maps a worker-node IAM role to Kubernetes groups.
 2 | # Replace <workers NodeInstanceRole ARN1> with a real role ARN before applying.
 3 | apiVersion: v1
 4 | kind: ConfigMap
 5 | metadata:
 6 |   name: aws-auth
 7 |   namespace: kube-system
 8 | data:
 9 |   mapRoles: |
10 |     - rolearn: <workers NodeInstanceRole ARN1>
11 |       username: system:node:{{EC2PrivateDNSName}}
12 |       groups:
13 |         - system:bootstrappers
14 |         - system:nodes
15 |     #- rolearn: <workers NodeInstanceRole ARN2>
16 |       #username: system:node:{{EC2PrivateDNSName}}
17 |       #groups:
18 |         #- system:bootstrappers
19 |         #- system:nodes
20 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/efs-sc.yaml:
--------------------------------------------------------------------------------
1 | # StorageClass served by the efs.csi.aws.com provisioner.
2 | apiVersion: storage.k8s.io/v1
3 | kind: StorageClass
4 | metadata:
5 |   name: efs-sc
6 | provisioner: efs.csi.aws.com
7 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/fsx-sc.yaml:
--------------------------------------------------------------------------------
1 | # StorageClass served by the fsx.csi.aws.com provisioner.
2 | apiVersion: storage.k8s.io/v1
3 | kind: StorageClass
4 | metadata:
5 |   name: fsx-sc
6 | provisioner: fsx.csi.aws.com
7 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/install-eksctl.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 3 | # 
 4 | #Permission is hereby granted, free of charge, to any person obtaining a copy of this
 5 | #software and associated documentation files (the "Software"), to deal in the Software
 6 | #without restriction, including without limitation the rights to use, copy, modify,
 7 | #merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
 8 | #permit persons to whom the Software is furnished to do so.
 9 | #
10 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
11 | #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
12 | #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
13 | #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
14 | #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
15 | #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
16 | 
17 | # WARNING: THIS FILE IS DEPRECATED AND IS NOT USED 
18 | 
19 | # Abort on the first failure, including a failed download inside the pipeline,
20 | # so a missing or partial archive is never installed.
21 | set -euo pipefail
22 | 
23 | # The eksctl releases are published under eksctl-io/eksctl (formerly weaveworks/eksctl).
24 | # --fail makes curl exit non-zero on HTTP errors instead of piping an error page to tar.
25 | curl --fail --silent --show-error --location "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
26 | sudo mv /tmp/eksctl /usr/local/bin
27 | eksctl version
28 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/prepare-data.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copies s3://$S3_BUCKET/$S3_PREFIX/data into $DATA_DIR/data and, when a run.sh
 4 | # exists in the current directory, copies it into $DATA_DIR as well.
 5 | 
 6 | # Customize S3_BUCKET
 7 | S3_BUCKET=
 8 | 
 9 | # Customize S3_PREFIX
10 | S3_PREFIX=mask-rcnn/eks/input
11 | 
12 | # Uncomment exactly one of the DATA_DIR options below
13 | # For EFS
14 | DATA_DIR=/efs
15 | # For FSX
16 | #DATA_DIR=/fsx
17 | # For EBS
18 | #DATA_DIR=$HOME
19 | 
20 | # Refuse to run with an empty bucket name: the copy source would otherwise be
21 | # "s3:///<prefix>/data", which is not a valid S3 URI.
22 | if [ -z "$S3_BUCKET" ]
23 | then
24 |     echo "S3_BUCKET is not set; edit $0 and set it" >&2
25 |     exit 1
26 | fi
27 | 
28 | # Never overwrite a data directory that is already present.
29 | if [ -e "$DATA_DIR/data" ]
30 | then
31 |     echo "$DATA_DIR/data already exists"
32 |     exit 1
33 | fi
34 | 
35 | mkdir -p "$DATA_DIR/data"
36 | 
37 | # Paths are quoted so directories containing spaces are handled correctly.
38 | aws s3 cp --recursive "s3://$S3_BUCKET/$S3_PREFIX/data" "$DATA_DIR/data"
39 | 
40 | if [ -f ./run.sh ]
41 | then
42 |     cp run.sh "$DATA_DIR/"
43 | fi
44 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/pv-kubeflow-efs-gp-bursting.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolume # statically provisioned volume for an existing EFS file system
 3 | metadata:
 4 |   name: pv-efs
 5 | spec:
 6 |   capacity:
 7 |     storage: 1000Gi
 8 |   volumeMode: Filesystem
 9 |   accessModes:
10 |     - ReadWriteMany
11 |   persistentVolumeReclaimPolicy: Retain
12 |   storageClassName: efs-sc
13 |   csi:
14 |     driver: efs.csi.aws.com
15 |     volumeHandle: <efs-file-system-id> # placeholder: replace with the EFS file-system id before applying
16 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/pv-kubeflow-fsx.yaml:
--------------------------------------------------------------------------------
 1 | # Statically provisioned PersistentVolume that uses the fsx.csi.aws.com driver.
 2 | # The <...> values are placeholders to fill in before applying.
 3 | apiVersion: v1
 4 | kind: PersistentVolume
 5 | metadata:
 6 |   name: pv-fsx
 7 | spec:
 8 |   capacity:
 9 |     storage: 1200Gi
10 |   volumeMode: Filesystem
11 |   persistentVolumeReclaimPolicy: Retain
12 |   accessModes:
13 |     - ReadWriteMany
14 |   mountOptions:
15 |     - noatime
16 |     - flock
17 |   csi:
18 |     driver: fsx.csi.aws.com
19 |     volumeHandle: <fsx-file-system-id>
20 |     volumeAttributes:
21 |       dnsname: <fsx-file-system-id>.fsx.<aws-region>.amazonaws.com
22 |       mountname: <fsx-mount-name>
23 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/pvc-kubeflow-efs-gp-bursting.yaml:
--------------------------------------------------------------------------------
 1 | # Claim named pv-efs: requests 100Gi of ReadWriteMany storage from class efs-sc.
 2 | apiVersion: v1
 3 | kind: PersistentVolumeClaim
 4 | metadata:
 5 |   name: pv-efs
 6 | spec:
 7 |   storageClassName: efs-sc
 8 |   accessModes:
 9 |     - ReadWriteMany
10 |   resources:
11 |     requests:
12 |       storage: 100Gi
13 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/pvc-kubeflow-fsx.yaml:
--------------------------------------------------------------------------------
 1 | # Claim named pv-fsx: binds explicitly to the pv-fsx volume (empty storage class).
 2 | apiVersion: v1
 3 | kind: PersistentVolumeClaim
 4 | metadata:
 5 |   name: pv-fsx
 6 | spec:
 7 |   storageClassName: ""
 8 |   volumeName: pv-fsx
 9 |   accessModes:
10 |     - ReadWriteMany
11 |   resources:
12 |     requests:
13 |       storage: 1200Gi
14 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/replicate-data.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Script that copies s3://$S3_BUCKET/$S3_PREFIX into /ebs and then loops forever,
 3 | # appending a UTC timestamp to /root/date.txt once per hour.
 4 | apiVersion: v1
 5 | kind: ConfigMap
 6 | metadata:
 7 |   name: replicate-data
 8 | data:
 9 |   replicate-data.sh: |
10 |     aws s3 cp --recursive s3://$S3_BUCKET/$S3_PREFIX /ebs
11 |     while true; do echo $(date -u) >> /root/date.txt; sleep 3600; done
12 | ---
13 | # DaemonSet that runs replicate-data.sh with the host path /ebs mounted at /ebs.
14 | apiVersion: apps/v1
15 | kind: DaemonSet
16 | metadata:
17 |   name: replicate-data
18 | spec:
19 |   selector:
20 |     matchLabels:
21 |       name: replicate-data
22 |   template:
23 |     metadata:
24 |       labels:
25 |         name: replicate-data
26 |     spec:
27 |       restartPolicy: Always
28 |       volumes:
29 |       - name: ebs
30 |         hostPath:
31 |           path: /ebs
32 |           type: DirectoryOrCreate
33 |       - name: config
34 |         configMap:
35 |           name: replicate-data
36 |           defaultMode: 420 # decimal; 0644 in octal
37 |           items:
38 |           - key: replicate-data.sh
39 |             path: replicate-data.sh
40 |             mode: 365 # decimal; 0555 in octal
41 |       containers:
42 |       - name: replicate-data
43 |         image: # use image with aws cli support
44 |         imagePullPolicy: IfNotPresent
45 |         command:
46 |         - sh
47 |         - /etc/config/replicate-data.sh
48 |         env:
49 |         - name: S3_BUCKET
50 |           value: my-bucket
51 |         - name: S3_PREFIX
52 |           value: my-bucket-prefix
53 |         volumeMounts:
54 |         - name: config
55 |           mountPath: /etc/config
56 |         - name: ebs
57 |           mountPath: /ebs
58 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/set-cluster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Cluster name and region exported for scripts that source this file.
4 | export EKS_CLUSTER="my-eks-cluster"
5 | export AWS_REGION="us-west-2"
6 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/tiller-rbac-config.yaml:
--------------------------------------------------------------------------------
 1 | # Creates the tiller ServiceAccount in kube-system and binds it to the
 2 | # cluster-admin ClusterRole.
 3 | apiVersion: v1
 4 | kind: ServiceAccount
 5 | metadata:
 6 |   name: tiller
 7 |   namespace: kube-system
 8 | ---
 9 | apiVersion: rbac.authorization.k8s.io/v1
10 | kind: ClusterRoleBinding
11 | metadata:
12 |   name: tiller
13 | roleRef:
14 |   apiGroup: rbac.authorization.k8s.io
15 |   kind: ClusterRole
16 |   name: cluster-admin
17 | subjects:
18 |   - kind: ServiceAccount
19 |     name: tiller
20 |     namespace: kube-system
21 | 


--------------------------------------------------------------------------------
/eks-cluster/legacy/update-kubeconfig.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | pip install --upgrade pip
4 | pip install awscli --upgrade --user
5 | 
6 | source ./set-cluster.sh
7 | aws eks --region $AWS_REGION update-kubeconfig --name $EKS_CLUSTER
8 | 


--------------------------------------------------------------------------------
/eks-cluster/terraform/aws-eks-cluster-and-nodegroup/istio/variables.tf:
--------------------------------------------------------------------------------
1 | variable "istio_system_namespace" {
2 |   description = "Istio system namespace"
3 |   type = string
4 | }
5 | 
6 | variable "auth_namespace" {
7 |   description = "Auth namespace"
8 |   type = string
9 | }


--------------------------------------------------------------------------------
/eks-cluster/terraform/aws-eks-cluster-and-nodegroup/istio/versions.tf:
--------------------------------------------------------------------------------
 1 | terraform {
 2 |   required_version = ">= 1.5.1" # minimum Terraform CLI version
 3 | 
 4 |   required_providers {
 5 |     aws = {
 6 |       source  = "hashicorp/aws"
 7 |       version = ">= 2.7.0" # lower bound only; no upper bound is set
 8 |     }
 9 | 
10 |     kubectl = {
11 |       source  = "gavinbunney/kubectl"
12 |       version = ">= 1.14.0" # lower bound only; no upper bound is set
13 |     }
14 |   }
15 | }


--------------------------------------------------------------------------------
/eks-cluster/terraform/aws-eks-cluster-and-nodegroup/kubeflow/versions.tf:
--------------------------------------------------------------------------------
 1 | terraform {
 2 |   required_version = ">= 1.5.1" # minimum Terraform CLI version
 3 | 
 4 |   required_providers {
 5 |     aws = {
 6 |       source  = "hashicorp/aws"
 7 |       version = ">= 2.7.0" # lower bound only; no upper bound is set
 8 |     }
 9 | 
10 |     kubectl = {
11 |       source  = "gavinbunney/kubectl"
12 |       version = ">= 1.14.0" # lower bound only; no upper bound is set
13 |     }
14 |   }
15 | }


--------------------------------------------------------------------------------
/eks-cluster/terraform/aws-eks-cluster-and-nodegroup/mlflow/outputs.tf:
--------------------------------------------------------------------------------
1 | # ARN of the first master-user secret attached to the aws_rds_cluster.db resource.
2 | output "db_secret_arn" {
3 |   value       = aws_rds_cluster.db.master_user_secret[0].secret_arn
4 |   description = "DB secret ARN"
5 | }
6 | 


--------------------------------------------------------------------------------
/eks-cluster/terraform/aws-eks-cluster-and-nodegroup/mlflow/variables.tf:
--------------------------------------------------------------------------------
 1 | # Input variables of the MLFlow module.
 2 | 
 3 | variable "mlflow_namespace" {
 4 |   description = "MLFlow namespace"
 5 |   type        = string
 6 | }
 7 | 
 8 | variable "mlflow_version" {
 9 |   description = "MLFlow chart version"
10 |   type        = string
11 | }
12 | 
13 | variable "force_destroy_bucket" {
14 |   description = "MLFlow force destroy bucket"
15 |   type        = bool
16 | }
17 | 
18 | variable "eks_cluster_id" {
19 |   description = "EKS cluster id"
20 |   type        = string
21 | }
22 | 
23 | variable "eks_oidc_provider_arn" {
24 |   description = "EKS OIDC provider ARN"
25 |   type        = string
26 | }
27 | 
28 | variable "eks_oidc_issuer" {
29 |   description = "EKS OIDC issuer"
30 |   type        = string
31 | }
32 | 
33 | variable "admin_username" {
34 |   description = "MLFlow admin username"
35 |   type        = string
36 | }
37 | 
38 | variable "admin_password" {
39 |   description = "MLFlow admin password"
40 |   type        = string
41 |   # Keep the password out of plan/apply console output.
42 |   sensitive = true
43 | }
44 | 
45 | variable "db_max_capacity" {
46 |   description = "MLFlow DB max capacity"
47 |   type        = number
48 | }
49 | 
50 | variable "db_subnet_ids" {
51 |   description = "MLFlow DB subnet ids"
52 |   # Explicit element type instead of the legacy bare "list" shorthand.
53 |   type = list(string)
54 | }
55 | 
56 | variable "db_vpc_id" {
57 |   description = "MLFlow DB VPC id"
58 |   type        = string
59 | }
60 | 
61 | variable "db_port" {
62 |   description = "MLFlow DB port"
63 |   type        = number
64 | }


--------------------------------------------------------------------------------
/eks-cluster/terraform/aws-eks-cluster-and-nodegroup/mlflow/versions.tf:
--------------------------------------------------------------------------------
1 | # Minimum Terraform CLI version accepted by this module.
2 | terraform {
3 |   required_version = ">= 1.5.1"
4 | }


--------------------------------------------------------------------------------
/eks-cluster/terraform/aws-eks-cluster-and-nodegroup/slurm/variables.tf:
--------------------------------------------------------------------------------
 1 | # Input variables of the Slurm module.
 2 | 
 3 | variable "local_helm_repo" {
 4 |   type        = string
 5 |   description = "Local Helm charts path"
 6 | }
 7 | 
 8 | variable "slurm_namespace" {
 9 |   type        = string
10 |   description = "Slurm namespace"
11 | }
12 | 
13 | variable "efs_fs_id" {
14 |   type        = string
15 |   description = "EFS file-system id"
16 | }
17 | 
18 | variable "ssh_public_key" {
19 |   type        = string
20 |   description = "Slurm SSH public key for node login"
21 | }
22 | 
23 | variable "storage_capacity" {
24 |   type        = string
25 |   description = "Shared storage capacity"
26 | }
27 | 
28 | variable "password" {
29 |   type        = string
30 |   description = "Slurm password for user rocky"
31 | }


--------------------------------------------------------------------------------
/eks-cluster/terraform/aws-eks-cluster-and-nodegroup/slurm/versions.tf:
--------------------------------------------------------------------------------
1 | # Minimum Terraform CLI version accepted by this module.
2 | terraform {
3 |   required_version = ">= 1.5.1"
4 | }


--------------------------------------------------------------------------------
/eks-cluster/terraform/aws-eks-cluster-and-nodegroup/versions.tf:
--------------------------------------------------------------------------------
 1 | terraform {
 2 |   required_version = ">= 1.5.1" # minimum Terraform CLI version
 3 | 
 4 |   required_providers {
 5 |     aws = {
 6 |       source  = "hashicorp/aws"
 7 |       version = ">= 2.7.0" # lower bound only; no upper bound is set
 8 |     }
 9 | 
10 |     kubectl = {
11 |       source  = "gavinbunney/kubectl"
12 |       version = ">= 1.14.0" # lower bound only; no upper bound is set
13 |     }
14 |   }
15 | }


--------------------------------------------------------------------------------
/eks-cluster/tests/test-gpu-efa.yaml:
--------------------------------------------------------------------------------
 1 | # ReplicaSet of two idle pods pinned to p4d.24xlarge nodes. Each pod requests
 2 | # 8 GPUs and 1 EFA device and mounts the pv-fsx claim at /fsx.
 3 | apiVersion: apps/v1
 4 | kind: ReplicaSet
 5 | metadata:
 6 |   name: test-gpu-efa
 7 |   labels:
 8 |     app: test-gpu-efa
 9 | spec:
10 |   replicas: 2
11 |   selector:
12 |     matchLabels:
13 |       app: test-gpu-efa
14 |   template:
15 |     metadata:
16 |       labels:
17 |         app: test-gpu-efa
18 |     spec:
19 |       nodeSelector:
20 |         node.kubernetes.io/instance-type: "p4d.24xlarge"
21 |       tolerations:
22 |         - key: "nvidia.com/gpu"
23 |           operator: "Exists"
24 |           effect: "NoSchedule"
25 |       volumes:
26 |       - name: fsx
27 |         persistentVolumeClaim:
28 |           claimName: pv-fsx # k8s persistent-volume-claim name
29 |       containers:
30 |       - name: test-gpu-efa
31 |         image: '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-ec2'
32 |         securityContext:
33 |           privileged: true
34 |         # Idle in the background until a TERM or INT signal arrives.
35 |         command:
36 |         - "/bin/bash"
37 |         args:
38 |         - "-c"
39 |         - "trap : TERM INT; sleep infinity & wait"
40 |         volumeMounts:
41 |         - name: fsx
42 |           mountPath: /fsx
43 |         resources:
44 |           requests:
45 |             "nvidia.com/gpu": 8
46 |             "vpc.amazonaws.com/efa": 1
47 |           limits:
48 |             "nvidia.com/gpu": 8
49 |             "vpc.amazonaws.com/efa": 1
50 | 


--------------------------------------------------------------------------------
/eks-cluster/tests/test-gpu.yaml:
--------------------------------------------------------------------------------
 1 | # Idle pod pinned to a g5.48xlarge node: requests 8 GPUs and mounts the pv-fsx
 2 | # claim at /fsx.
 3 | apiVersion: v1
 4 | kind: Pod
 5 | metadata:
 6 |   name: test-gpu
 7 |   annotations:
 8 |     karpenter.sh/do-not-disrupt: "true"
 9 |     sidecar.istio.io/inject: 'false'
10 | spec:
11 |   nodeSelector:
12 |     node.kubernetes.io/instance-type: "g5.48xlarge"
13 |   tolerations:
14 |     - key: "nvidia.com/gpu"
15 |       operator: "Exists"
16 |       effect: "NoSchedule"
17 |   volumes:
18 |   - name: fsx
19 |     persistentVolumeClaim:
20 |       claimName: pv-fsx # k8s persistent-volume-claim name
21 |   containers:
22 |   - name: test-gpu
23 |     image: ubuntu:latest
24 |     securityContext:
25 |       privileged: true
26 |     # Idle in the background until a TERM or INT signal arrives.
27 |     command:
28 |     - "/bin/bash"
29 |     args:
30 |     - "-c"
31 |     - "trap : TERM INT; sleep infinity & wait"
32 |     volumeMounts:
33 |     - name: fsx
34 |       mountPath: /fsx
35 |     resources:
36 |       requests:
37 |         "nvidia.com/gpu": 8
38 |       limits:
39 |         "nvidia.com/gpu": 8
40 | 


--------------------------------------------------------------------------------
/eks-cluster/tests/test-neuron.yaml:
--------------------------------------------------------------------------------
 1 | # Idle pod pinned to a trn1.32xlarge node: requests Neuron and EFA resources
 2 | # and mounts the pv-fsx claim at /fsx.
 3 | apiVersion: v1
 4 | kind: Pod
 5 | metadata:
 6 |   name: test-neuron
 7 |   annotations:
 8 |     karpenter.sh/do-not-disrupt: "true"
 9 |     sidecar.istio.io/inject: 'false'
10 | spec:
11 |   nodeSelector:
12 |     node.kubernetes.io/instance-type: "trn1.32xlarge"
13 |   tolerations:
14 |     - key: "aws.amazon.com/neuron"
15 |       operator: "Exists"
16 |       effect: "NoSchedule"
17 |     - key: "aws.amazon.com/efa"
18 |       operator: "Exists"
19 |       effect: "NoSchedule"
20 |   volumes:
21 |   - name: fsx
22 |     persistentVolumeClaim:
23 |       claimName: pv-fsx # k8s persistent-volume-claim name
24 |   containers:
25 |   - name: test-neuron
26 |     image: ubuntu:latest
27 |     securityContext:
28 |       privileged: true
29 |     # Idle in the background until a TERM or INT signal arrives.
30 |     command:
31 |     - "/bin/bash"
32 |     args:
33 |     - "-c"
34 |     - "trap : TERM INT; sleep infinity & wait"
35 |     volumeMounts:
36 |     - name: fsx
37 |       mountPath: /fsx
38 |     resources:
39 |       requests:
40 |         "aws.amazon.com/neuron": 16
41 |         "aws.amazon.com/neuroncore": 32
42 |         "aws.amazon.com/neurondevice": 16
43 |         "vpc.amazonaws.com/efa": 8
44 |       limits:
45 |         "aws.amazon.com/neuron": 16
46 |         "aws.amazon.com/neuroncore": 32
47 |         "aws.amazon.com/neurondevice": 16
48 |         "vpc.amazonaws.com/efa": 8
49 | 


--------------------------------------------------------------------------------
/eks-cluster/utils/attach-pvc-fsx.yaml:
--------------------------------------------------------------------------------
 1 | # Idle utility pod that mounts the pv-fsx claim at /fsx.
 2 | apiVersion: v1
 3 | kind: Pod
 4 | metadata:
 5 |   name: attach-pvc-fsx
 6 | spec:
 7 |   volumes:
 8 |   - name: fsx
 9 |     persistentVolumeClaim:
10 |       claimName: pv-fsx # k8s persistent-volume-claim name
11 |   containers:
12 |   - name: attach-pvc-fsx
13 |     image: ubuntu:latest
14 |     securityContext:
15 |       privileged: true
16 |     # Idle in the background until a TERM or INT signal arrives.
17 |     command:
18 |     - "/bin/bash"
19 |     args:
20 |     - "-c"
21 |     - "trap : TERM INT; sleep infinity & wait"
22 |     volumeMounts:
23 |     - name: fsx
24 |       mountPath: /fsx
25 | 


--------------------------------------------------------------------------------
/eks-cluster/utils/attach-pvc.yaml:
--------------------------------------------------------------------------------
 1 | # Idle utility pod that mounts the pv-efs claim at /efs and pv-fsx at /fsx.
 2 | apiVersion: v1
 3 | kind: Pod
 4 | metadata:
 5 |   name: attach-pvc
 6 | spec:
 7 |   volumes:
 8 |   - name: efs
 9 |     persistentVolumeClaim:
10 |       claimName: pv-efs # k8s persistent-volume-claim name
11 |   - name: fsx
12 |     persistentVolumeClaim:
13 |       claimName: pv-fsx # k8s persistent-volume-claim name
14 |   containers:
15 |   - name: attach-pvc
16 |     image: ubuntu:latest
17 |     securityContext:
18 |       privileged: true
19 |     # Idle in the background until a TERM or INT signal arrives.
20 |     command:
21 |     - "/bin/bash"
22 |     args:
23 |     - "-c"
24 |     - "trap : TERM INT; sleep infinity & wait"
25 |     volumeMounts:
26 |     - name: efs
27 |       mountPath: /efs
28 |     - name: fsx
29 |       mountPath: /fsx
30 | 


--------------------------------------------------------------------------------
/eks-cluster/utils/install-kubectl-linux.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Downloads kubectl and aws-iam-authenticator and installs both into /usr/local/bin.
 4 | 
 5 | # Stop at the first failed step so a failed download is never installed.
 6 | set -euo pipefail
 7 | 
 8 | # --fail turns an HTTP error into a non-zero exit instead of saving the error page.
 9 | curl --fail -O https://s3.us-west-2.amazonaws.com/amazon-eks/1.31.3/2024-12-12/bin/linux/amd64/kubectl
10 | 
11 | chmod +x ./kubectl
12 | 
13 | sudo mv ./kubectl /usr/local/bin/
14 | 
15 | # --client reports only the local binary version; without it kubectl also tries
16 | # to reach a cluster, which fails when no kubeconfig exists yet.
17 | kubectl version --client
18 | 
19 | curl --fail -Lo aws-iam-authenticator https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.6.30/aws-iam-authenticator_0.6.30_linux_amd64
20 | 
21 | chmod +x ./aws-iam-authenticator
22 | sudo mv ./aws-iam-authenticator /usr/local/bin/
23 | aws-iam-authenticator help
24 | 


--------------------------------------------------------------------------------
/eks-cluster/utils/prepare-s3-bucket.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Stages the COCO 2017 archives and the ImageNet-R50-AlignPadding model under
 4 | # $HOME/stage, uploads them to s3://<s3-bucket>/ml-platform/data/coco2017 and
 5 | # then removes the stage directory.
 6 | 
 7 | # Stop at the first failed step: otherwise a failed download or unzip would
 8 | # still be followed by the upload and by the removal of the stage directory.
 9 | set -euo pipefail
10 | 
11 | if [[ $# -ne 1 ]]
12 | then
13 |     echo "usage:  $0 s3-bucket"
14 |     exit 1
15 | fi
16 | 
17 | S3_BUCKET=$1
18 | 
19 | S3_PREFIX=ml-platform/data/coco2017
20 | 
21 | # Stage directory must be on a volume with at least 100 GB available space
22 | STAGE_DIR=$HOME/stage/data/coco2017
23 | 
24 | if [ -e "$STAGE_DIR" ]
25 | then
26 |     echo "$STAGE_DIR already exists"
27 |     exit 1
28 | fi
29 | 
30 | mkdir -p "$STAGE_DIR"
31 | 
32 | # Download one zip archive into the stage directory, extract it there and
33 | # delete the archive.
34 | fetch_and_extract() {
35 |     local url=$1
36 |     local archive
37 |     archive="$STAGE_DIR/$(basename "$url")"
38 |     wget -O "$archive" "$url"
39 |     unzip "$archive" -d "$STAGE_DIR"
40 |     rm "$archive"
41 | }
42 | 
43 | fetch_and_extract http://images.cocodataset.org/zips/train2017.zip
44 | fetch_and_extract http://images.cocodataset.org/zips/val2017.zip
45 | fetch_and_extract http://images.cocodataset.org/annotations/annotations_trainval2017.zip
46 | 
47 | mkdir "$STAGE_DIR/pretrained-models"
48 | wget -O "$STAGE_DIR/pretrained-models/ImageNet-R50-AlignPadding.npz" http://models.tensorpack.com/FasterRCNN/ImageNet-R50-AlignPadding.npz
49 | 
50 | aws s3 cp --recursive "$STAGE_DIR" "s3://$S3_BUCKET/$S3_PREFIX"
51 | rm -rf "$STAGE_DIR"
52 | 


--------------------------------------------------------------------------------
/eks-cluster/utils/s3-backend.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | [[ $# -ne 2 ]] && echo "usage:  $0 s3-bucket s3-prefix" && exit 1
 4 | 
 5 | export S3_BUCKET_NAME=$1
 6 | export S3_BUCKET_PREFIX=$2
 7 | export PATH_TO_BACKUP=terraform/state
 8 | export BUCKET_REGION=$(aws configure get region)
 9 | 
10 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
11 | 
12 | cat <<EOF > $DIR/../terraform/aws-eks-cluster-and-nodegroup/backend.tf
13 | terraform {
14 |     backend "s3" {
15 |         bucket = "${S3_BUCKET_NAME}"
16 |         key    = "${S3_BUCKET_PREFIX}/${PATH_TO_BACKUP}"
17 |         region = "${BUCKET_REGION}"
18 |     }
19 | }
20 | EOF
21 | 


--------------------------------------------------------------------------------
/eks-cluster/utils/stage-data-fsx.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Script that copies s3://$S3_BUCKET/$S3_PREFIX into $STAGE_DIR.
 3 | apiVersion: v1
 4 | kind: ConfigMap
 5 | metadata:
 6 |   name: stage-data-fsx
 7 | data:
 8 |   stage-data.sh: |
 9 |     aws s3 cp --recursive s3://$S3_BUCKET/$S3_PREFIX $STAGE_DIR
10 | ---
11 | # One-shot pod that runs stage-data.sh with the pv-fsx claim mounted at /fsx.
12 | apiVersion: v1
13 | kind: Pod
14 | metadata:
15 |   name: stage-data-fsx
16 | spec:
17 |   restartPolicy: Never
18 |   volumes:
19 |   - name: fsx
20 |     persistentVolumeClaim:
21 |       claimName: pv-fsx # persistent volume claim name
22 |   - name: config
23 |     configMap:
24 |       name: stage-data-fsx
25 |       defaultMode: 420 # decimal; 0644 in octal
26 |       items:
27 |       - key: stage-data.sh
28 |         path: stage-data.sh
29 |         mode: 365 # decimal; 0555 in octal
30 |   containers:
31 |   - name: stage-data-fsx
32 |     image: amazon/aws-cli # use image with aws cli support
33 |     imagePullPolicy: IfNotPresent
34 |     command:
35 |     - sh
36 |     - /etc/config/stage-data.sh
37 |     env:
38 |     - name: S3_BUCKET
39 |       value: my-bucket
40 |     - name: S3_PREFIX
41 |       value: ml-platform/data/coco2017
42 |     - name: STAGE_DIR
43 |       value: /fsx/data/coco2017
44 |     volumeMounts:
45 |     - name: config
46 |       mountPath: /etc/config
47 |     - name: fsx
48 |       mountPath: /fsx
49 | 


--------------------------------------------------------------------------------
/eks-cluster/utils/stage-data.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Script that copies s3://$S3_BUCKET/$S3_PREFIX into $STAGE_DIR.
 3 | apiVersion: v1
 4 | kind: ConfigMap
 5 | metadata:
 6 |   name: stage-data
 7 | data:
 8 |   stage-data.sh: |
 9 |     aws s3 cp --recursive s3://$S3_BUCKET/$S3_PREFIX $STAGE_DIR
10 | ---
11 | # One-shot pod that runs stage-data.sh with the pv-efs claim mounted at /efs.
12 | apiVersion: v1
13 | kind: Pod
14 | metadata:
15 |   name: stage-data
16 | spec:
17 |   restartPolicy: Never
18 |   volumes:
19 |   - name: efs
20 |     persistentVolumeClaim:
21 |       claimName: pv-efs # persistent volume claim name
22 |   - name: config
23 |     configMap:
24 |       name: stage-data
25 |       defaultMode: 420 # decimal; 0644 in octal
26 |       items:
27 |       - key: stage-data.sh
28 |         path: stage-data.sh
29 |         mode: 365 # decimal; 0555 in octal
30 |   containers:
31 |   - name: stage-data
32 |     image: amazon/aws-cli # use image with aws cli support
33 |     imagePullPolicy: IfNotPresent
34 |     command:
35 |     - sh
36 |     - /etc/config/stage-data.sh
37 |     env:
38 |     - name: S3_BUCKET
39 |       value: my-bucket
40 |     - name: S3_PREFIX
41 |       value: ml-platform/data/coco2017
42 |     - name: STAGE_DIR
43 |       value: /efs/data/coco2017
44 |     volumeMounts:
45 |     - name: config
46 |       mountPath: /etc/config
47 |     - name: efs
48 |       mountPath: /efs
49 | 


--------------------------------------------------------------------------------
/examples/inference/rayserve/facebook-bart-large-cnn/rayservice.yaml:
--------------------------------------------------------------------------------
 1 | ray: # Ray settings for the text_summarizer serve application
 2 |   version: '2.44.0'
 3 |   dashboard:
 4 |     host: '0.0.0.0'
 5 |   ports:
 6 |     - name: gcs
 7 |       port: 6379
 8 |     - name: client
 9 |       port: 10001
10 |     - name: dashboard
11 |       port: 8265
12 |     - name: serve
13 |       port: 8000
14 |   resources:
15 |     requests:
16 |       cpu: 300m 
17 |     limits:
18 |       cpu: 2
19 |   serve_config_v2: 
20 |     serveConfigV2: |
21 |       applications:
22 |         - name: text_summarizer
23 |           import_path: text_summarizer.text_summarizer:deployment
24 |           runtime_env:
25 |             working_dir: "https://github.com/ray-project/serve_config_examples/archive/refs/heads/master.zip"
26 |             pip:
27 |               - "transformers==4.42.4"
28 |   service_unhealthy_threshold_secs: 900
29 |   deployment_unhealthy_threshold_secs: 300
30 | image: # empty here; presumably supplied at install time (TODO confirm)
31 | image_pull_policy: IfNotPresent
32 | resources:
33 |   requests:
34 |     "nvidia.com/gpu": 1 
35 |   limits:
36 |     "nvidia.com/gpu": 1 
37 |   node_type: 'g5.xlarge' 
38 | tolerations:
39 |   - key: "nvidia.com/gpu"
40 |     operator: "Exists"
41 |     effect: "NoSchedule"


--------------------------------------------------------------------------------
/examples/inference/rayserve/meta-llama3-8b-neuron/engine_config.yaml:
--------------------------------------------------------------------------------
 1 | image: public.ecr.aws/docker/library/python:slim-bullseye # presumably the image that runs the scripts below (TODO confirm against the chart)
 2 | inline_script: # writes the engine settings to /tmp/engine.json
 3 | - |+
 4 |   cat > /tmp/engine.json <<EOF
 5 |   {
 6 |     "pretrained_model_name_or_path": "/fsx/pretrained-models/meta-llama/Meta-Llama-3-8B-Instruct",
 7 |     "n_positions": 8192,
 8 |     "tp_degree": 8,
 9 |     "batch_size": 1,
10 |     "amp": "f16",
11 |     "neuron_config": {
12 |       "on_device_generation": {
13 |         "max_length": 8192, 
14 |         "top_k": 50, 
15 |         "do_sample": true
16 |       }
17 |     }
18 |   }
19 | 
20 |   EOF
21 | pre_script: # copies the generated file into $CONFIG_ROOT
22 |   - mkdir -p $CONFIG_ROOT
23 |   - cp /tmp/engine.json $CONFIG_ROOT/engine.json
24 | process:
25 |   env:
26 |     - name: CONFIG_ROOT
27 |       value: /efs/home/{{ .Release.Name }}/config
28 | 


--------------------------------------------------------------------------------
/examples/inference/rayserve/meta-llama3-8b-vllm-neuron/engine_config.yaml:
--------------------------------------------------------------------------------
 1 | image: public.ecr.aws/docker/library/python:slim-bullseye # presumably the image that runs the scripts below (TODO confirm against the chart)
 2 | inline_script: # writes the engine settings to /tmp/engine.json; the unquoted heredoc expands $MODEL_PATH
 3 | - |+
 4 |   cat > /tmp/engine.json <<EOF
 5 |   {
 6 |     "model": "$MODEL_PATH",
 7 |     "disable_log_requests": true,
 8 |     "tensor_parallel_size": 8,
 9 |     "max_num_seqs": 4,
10 |     "dtype": "float16",
11 |     "max_model_len": 8192,
12 |     "block_size": 8192
13 |   }
14 | 
15 |   EOF
16 | pre_script: # copies the generated file into $CONFIG_ROOT
17 |   - mkdir -p $CONFIG_ROOT
18 |   - cp /tmp/engine.json $CONFIG_ROOT/engine.json
19 | process:
20 |   env:
21 |     - name: CONFIG_ROOT
22 |       value: /efs/home/{{ .Release.Name }}/config
23 |     - name: MODEL_PATH
24 |       value: /fsx/pretrained-models/meta-llama/Meta-Llama-3-8B-Instruct
25 | 


--------------------------------------------------------------------------------
/examples/inference/rayserve/meta-llama3-8b-vllm/engine_config.yaml:
--------------------------------------------------------------------------------
 1 | image: public.ecr.aws/docker/library/python:slim-bullseye # presumably the image that runs the scripts below (TODO confirm against the chart)
 2 | inline_script: # writes the engine settings to /tmp/engine.json
 3 | - |+
 4 |   cat > /tmp/engine.json <<EOF
 5 |   {
 6 |     "model": "/fsx/pretrained-models/meta-llama/Meta-Llama-3-8B-Instruct",
 7 |     "tokenizer": "/fsx/pretrained-models/meta-llama/Meta-Llama-3-8B-Instruct",
 8 |     "disable_log_requests": true,
 9 |     "tensor_parallel_size": 8
10 |   }
11 | 
12 |   EOF
13 | pre_script: # copies the generated file into $CONFIG_ROOT
14 |   - mkdir -p $CONFIG_ROOT
15 |   - cp /tmp/engine.json $CONFIG_ROOT/engine.json
16 | process:
17 |   env:
18 |     - name: CONFIG_ROOT
19 |       value: /efs/home/{{ .Release.Name }}/config
20 | 


--------------------------------------------------------------------------------
/examples/inference/rayserve/meta-llama32-11b-vis-inst-vllm/engine_config.yaml:
--------------------------------------------------------------------------------
 1 | image: public.ecr.aws/docker/library/python:slim-bullseye # presumably the image that runs the scripts below (TODO confirm against the chart)
 2 | inline_script: # writes the engine settings to /tmp/engine.json
 3 | - |+
 4 |   cat > /tmp/engine.json <<EOF
 5 |   {
 6 |     "model": "/fsx/pretrained-models/meta-llama/Llama-3.2-11B-Vision-Instruct",
 7 |     "tokenizer": "/fsx/pretrained-models/meta-llama/Llama-3.2-11B-Vision-Instruct",
 8 |     "disable_log_requests": true,
 9 |     "tensor_parallel_size": 8,
10 |     "max_model_len": 4096,
11 |     "dtype": "auto",
12 |     "gpu_memory_utilization": 0.9,
13 |     "swap_space": 16,
14 |     "enforce_eager": true,
15 |     "max_num_seqs": 8
16 |   }
17 | 
18 |   EOF
19 | pre_script: # copies the generated file into $CONFIG_ROOT
20 |   - mkdir -p $CONFIG_ROOT
21 |   - cp /tmp/engine.json $CONFIG_ROOT/engine.json
22 | process:
23 |   env:
24 |     - name: CONFIG_ROOT
25 |       value: /efs/home/{{ .Release.Name }}/config
26 | 


--------------------------------------------------------------------------------
/examples/inference/rayserve/meta-llama33-70b-instruct-neuron/engine_config.yaml:
--------------------------------------------------------------------------------
 1 | image: public.ecr.aws/docker/library/python:slim-bullseye # presumably the image that runs the scripts below (TODO confirm against the chart)
 2 | inline_script: # writes the engine settings to /tmp/engine.json
 3 | - |+
 4 |   cat > /tmp/engine.json <<EOF
 5 |   {
 6 |     "pretrained_model_name_or_path": "/fsx/pretrained-models/meta-llama/Llama-3.3-70B-Instruct",
 7 |     "n_positions": 8192,
 8 |     "tp_degree": 32,
 9 |     "pp_stages": 2,
10 |     "batch_size": 1,
11 |     "amp": "f16",
12 |     "neuron_config": {
13 |       "on_device_generation": {
14 |         "max_length": 8192, 
15 |         "top_k": 50, 
16 |         "do_sample": true
17 |       }
18 |     }
19 |   }
20 | 
21 |   EOF
22 | pre_script: # copies the generated file into $CONFIG_ROOT
23 |   - mkdir -p $CONFIG_ROOT
24 |   - cp /tmp/engine.json $CONFIG_ROOT/engine.json
25 | process:
26 |   env:
27 |     - name: CONFIG_ROOT
28 |       value: /efs/home/{{ .Release.Name }}/config
29 | 


--------------------------------------------------------------------------------
/examples/inference/rayserve/meta-llama33-70b-instruct-vllm/engine_config.yaml:
--------------------------------------------------------------------------------
 1 | image: public.ecr.aws/docker/library/python:slim-bullseye
 2 | inline_script:
 3 | - |+
 4 |   cat > /tmp/engine.json <<EOF
 5 |   {
 6 |     "model": "/fsx/pretrained-models/meta-llama/Llama-3.3-70B-Instruct",
 7 |     "tokenizer": "/fsx/pretrained-models/meta-llama/Llama-3.3-70B-Instruct",
 8 |     "disable_log_requests": true,
 9 |     "tensor_parallel_size": 8,
10 |     "pipeline_parallel_size": 2
11 |   }
12 | 
13 |   EOF
14 | pre_script:
15 |   - mkdir -p $CONFIG_ROOT
16 |   - cp /tmp/engine.json $CONFIG_ROOT/engine.json
17 | process:
18 |   env:
19 |     - name: CONFIG_ROOT
20 |       value: /efs/home/{{ .Release.Name }}/config
21 | 


--------------------------------------------------------------------------------
/examples/inference/rayserve/mistral-8x22b-instruct-v01-vllm/engine_config.yaml:
--------------------------------------------------------------------------------
 1 | image: public.ecr.aws/docker/library/python:slim-bullseye
 2 | inline_script:
 3 | - |+
 4 |   cat > /tmp/engine.json <<EOF
 5 |   {
 6 |     "model": "/fsx/pretrained-models/mistralai/Mixtral-8x22B-Instruct-v0.1",
 7 |     "tokenizer": "/fsx/pretrained-models/mistralai/Mixtral-8x22B-Instruct-v0.1",
 8 |     "disable_log_requests": true,
 9 |     "tensor_parallel_size": 8,
10 |     "pipeline_parallel_size": 2
11 |   }
12 | 
13 |   EOF
14 | pre_script:
15 |   - mkdir -p $CONFIG_ROOT
16 |   - cp /tmp/engine.json $CONFIG_ROOT/engine.json
17 | process:
18 |   env:
19 |     - name: CONFIG_ROOT
20 |       value: /efs/home/{{ .Release.Name }}/config
21 | 


--------------------------------------------------------------------------------
/examples/inference/triton-inference-server/tensorrtllm_backend/llama2-7b/hf_to_trtllm.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 | resources:
 3 |   node_type: g5.48xlarge
 4 |   requests:
 5 |     "nvidia.com/gpu": 8
 6 |   limits:
 7 |     "nvidia.com/gpu": 8
 8 | ebs:
 9 |   storage: 400Gi
10 |   mount_path: /tmp
11 | tolerations:
12 |   - key: "nvidia.com/gpu"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | pre_script: 
16 |   - mkdir -p $LOG_ROOT
17 |   - mkdir -p $OUTPUT_ROOT
18 |   - TP_SIZE=8
19 |   - PP_SIZE=1
20 |   - OUTPUT_LOG=$LOG_ROOT/hf_to_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log
21 |   - TMP_OUTPUT_PATH=/tmp/ckpt_tp_${TP_SIZE}_pp_${PP_SIZE}
22 |   - SCRIPT_DIR=TensorRT-LLM/examples/llama
23 |   - cd $SCRIPT_DIR
24 |   - pip3 install datasets==3.1.0 evaluate~=0.4.3 rouge_score~=0.1.2 sentencepiece~=0.2.0
25 | post_script:
26 |   - cp -r $TMP_OUTPUT_PATH $OUTPUT_ROOT/
27 | process:
28 |   env:
29 |     - name: LOG_ROOT
30 |       value: /efs/home/{{ .Release.Name }}/logs
31 |     - name: OUTPUT_ROOT
32 |       value: /efs/home/{{ .Release.Name }}/trtllm
33 |     - name: MODEL_PATH
34 |       value: /fsx/pretrained-models/meta-llama/Llama-2-7b-hf
35 |   command:
36 |     - python3
37 |   args:
38 |     - convert_checkpoint.py 
39 |     - --model_dir=$MODEL_PATH
40 |     - --output_dir=$TMP_OUTPUT_PATH
41 |     - --dtype=float16
42 |     - --tp_size=$TP_SIZE
43 |     - '2>&1 | tee $OUTPUT_LOG'
44 | 


--------------------------------------------------------------------------------
/examples/inference/triton-inference-server/tensorrtllm_backend/llama2-7b/trtllm_engine.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | resources:
 3 |   node_type: g5.48xlarge
 4 |   requests:
 5 |     "nvidia.com/gpu": 8
 6 |   limits:
 7 |     "nvidia.com/gpu": 8
 8 | ebs:
 9 |   storage: 400Gi
10 |   mount_path: /tmp
11 | tolerations:
12 |   - key: "nvidia.com/gpu"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | pre_script: 
16 |   - mkdir -p $LOG_ROOT
17 |   - TP_SIZE=8
18 |   - PP_SIZE=1
19 |   - OUTPUT_LOG=$LOG_ROOT/build_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log
20 |   - CKPT_PATH=$OUTPUT_ROOT/ckpt_tp_${TP_SIZE}_pp_${PP_SIZE}
21 |   - TMP_ENGINE_DIR=/tmp/engine_tp_${TP_SIZE}_pp_${PP_SIZE}
22 | post_script:
23 |   - rm -rf $OUTPUT_ROOT/engine_tp_${TP_SIZE}_pp_${PP_SIZE}
24 |   - cp -r $TMP_ENGINE_DIR $OUTPUT_ROOT/
25 | process:
26 |   env:
27 |     - name: LOG_ROOT
28 |       value: /efs/home/{{ .Release.Name }}/logs
29 |     - name: OUTPUT_ROOT
30 |       value: /efs/home/{{ .Release.Name }}/trtllm
31 |     - name: MODEL_PATH
32 |       value: /fsx/pretrained-models/meta-llama/Llama-2-7b-hf
33 |   command:
34 |     - trtllm-build
35 |   args:
36 |     - --checkpoint_dir ${CKPT_PATH}
37 |     - --max_num_tokens 2048
38 |     - --gpus_per_node 8
39 |     - --remove_input_padding enable
40 |     - --gemm_plugin float16
41 |     - --gpt_attention_plugin float16 
42 |     - --paged_kv_cache enable
43 |     - --context_fmha enable
44 |     - --output_dir ${TMP_ENGINE_DIR}
45 |     - --max_batch_size 8
46 |     - '2>&1 | tee $OUTPUT_LOG'
47 | 


--------------------------------------------------------------------------------
/examples/inference/triton-inference-server/tensorrtllm_backend/llama3-8b-instruct/hf_to_trtllm.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | resources:
 3 |   node_type: g5.48xlarge
 4 |   requests:
 5 |     "nvidia.com/gpu": 8
 6 |   limits:
 7 |     "nvidia.com/gpu": 8
 8 | ebs:
 9 |   storage: 400Gi
10 |   mount_path: /tmp
11 | tolerations:
12 |   - key: "nvidia.com/gpu"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | pre_script: 
16 |   - mkdir -p $LOG_ROOT
17 |   - mkdir -p $OUTPUT_ROOT
18 |   - TP_SIZE=8
19 |   - PP_SIZE=1
20 |   - OUTPUT_LOG=$LOG_ROOT/hf_to_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log
21 |   - TMP_OUTPUT_PATH=/tmp/ckpt_tp_${TP_SIZE}_pp_${PP_SIZE}
22 |   - SCRIPT_DIR=TensorRT-LLM/examples/llama
23 |   - cd $SCRIPT_DIR
24 |   - pip3 install datasets==3.1.0 evaluate~=0.4.3 rouge_score~=0.1.2 sentencepiece~=0.2.0
25 | post_script:
26 |   - cp -r $TMP_OUTPUT_PATH $OUTPUT_ROOT/
27 | process:
28 |   env:
29 |     - name: LOG_ROOT
30 |       value: /efs/home/{{ .Release.Name }}/logs
31 |     - name: OUTPUT_ROOT
32 |       value: /efs/home/{{ .Release.Name }}/trtllm
33 |     - name: MODEL_PATH
34 |       value: "/fsx/pretrained-models/meta-llama/Meta-Llama-3-8B-Instruct"
35 |   command:
36 |     - python3
37 |   args:
38 |     - convert_checkpoint.py 
39 |     - --model_dir=$MODEL_PATH
40 |     - --output_dir=$TMP_OUTPUT_PATH
41 |     - --dtype=float16
42 |     - --tp_size=$TP_SIZE
43 |     - '2>&1 | tee $OUTPUT_LOG'
44 | 


--------------------------------------------------------------------------------
/examples/inference/triton-inference-server/tensorrtllm_backend/llama3-8b-instruct/trtllm_engine.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | resources:
 3 |   node_type: g5.48xlarge
 4 |   requests:
 5 |     "nvidia.com/gpu": 8
 6 |   limits:
 7 |     "nvidia.com/gpu": 8
 8 | ebs:
 9 |   storage: 400Gi
10 |   mount_path: /tmp
11 | tolerations:
12 |   - key: "nvidia.com/gpu"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | pre_script: 
16 |   - mkdir -p $LOG_ROOT
17 |   - TP_SIZE=8
18 |   - PP_SIZE=1
19 |   - OUTPUT_LOG=$LOG_ROOT/build_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log
20 |   - CKPT_PATH=$OUTPUT_ROOT/ckpt_tp_${TP_SIZE}_pp_${PP_SIZE}
21 |   - TMP_ENGINE_DIR=/tmp/engine_tp_${TP_SIZE}_pp_${PP_SIZE}
22 | post_script:
23 |   - rm -rf $OUTPUT_ROOT/engine_tp_${TP_SIZE}_pp_${PP_SIZE}
24 |   - cp -r $TMP_ENGINE_DIR $OUTPUT_ROOT/
25 | process:
26 |   env:
27 |     - name: LOG_ROOT
28 |       value: /efs/home/{{ .Release.Name }}/logs
29 |     - name: OUTPUT_ROOT
30 |       value: /efs/home/{{ .Release.Name }}/trtllm
31 |     - name: MODEL_PATH
32 |       value: "/fsx/pretrained-models/meta-llama/Meta-Llama-3-8B-Instruct"
33 |   command:
34 |     - trtllm-build
35 |   args:
36 |     - --checkpoint_dir ${CKPT_PATH}
37 |     - --max_num_tokens 8192
38 |     - --gpus_per_node 8
39 |     - --remove_input_padding enable
40 |     - --gemm_plugin float16
41 |     - --gpt_attention_plugin float16 
42 |     - --paged_kv_cache enable
43 |     - --context_fmha enable
44 |     - --output_dir ${TMP_ENGINE_DIR}
45 |     - --max_batch_size 4
46 |     - '2>&1 | tee $OUTPUT_LOG'
47 | 


--------------------------------------------------------------------------------
/examples/inference/triton-inference-server/tensorrtllm_backend/mistral-7b-instruct-v01/hf_to_trtllm.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | resources:
 3 |   node_type: g5.48xlarge
 4 |   requests:
 5 |     "nvidia.com/gpu": 8
 6 |   limits:
 7 |     "nvidia.com/gpu": 8
 8 | ebs:
 9 |   storage: 400Gi
10 |   mount_path: /tmp
11 | tolerations:
12 |   - key: "nvidia.com/gpu"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | pre_script: 
16 |   - mkdir -p $LOG_ROOT
17 |   - mkdir -p $OUTPUT_ROOT
18 |   - TP_SIZE=8
19 |   - PP_SIZE=1
20 |   - OUTPUT_LOG=$LOG_ROOT/hf_to_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log
21 |   - TMP_OUTPUT_PATH=/tmp/ckpt_tp_${TP_SIZE}_pp_${PP_SIZE}
22 |   - SCRIPT_DIR=TensorRT-LLM/examples/llama
23 |   - cd $SCRIPT_DIR
24 |   - pip3 install datasets==3.1.0 evaluate~=0.4.3 rouge_score~=0.1.2 sentencepiece~=0.2.0
25 | post_script:
26 |   - cp -r $TMP_OUTPUT_PATH $OUTPUT_ROOT/
27 | process:
28 |   env:
29 |     - name: LOG_ROOT
30 |       value: /efs/home/{{ .Release.Name }}/logs
31 |     - name: OUTPUT_ROOT
32 |       value: /efs/home/{{ .Release.Name }}/trtllm
33 |     - name: MODEL_PATH
34 |       value: /fsx/pretrained-models/mistralai/Mistral-7B-Instruct-v0.1
35 |   command:
36 |     - python3
37 |   args:
38 |     - convert_checkpoint.py 
39 |     - --model_dir=$MODEL_PATH
40 |     - --output_dir=$TMP_OUTPUT_PATH
41 |     - --dtype=float16
42 |     - --tp_size=$TP_SIZE
43 |     - '2>&1 | tee $OUTPUT_LOG'
44 | 


--------------------------------------------------------------------------------
/examples/inference/triton-inference-server/tensorrtllm_backend/mistral-7b-instruct-v01/trtllm_engine.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | resources:
 3 |   node_type: g5.48xlarge
 4 |   requests:
 5 |     "nvidia.com/gpu": 8
 6 |   limits:
 7 |     "nvidia.com/gpu": 8
 8 | ebs:
 9 |   storage: 400Gi
10 |   mount_path: /tmp
11 | tolerations:
12 |   - key: "nvidia.com/gpu"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | pre_script: 
16 |   - mkdir -p $LOG_ROOT
17 |   - TP_SIZE=8
18 |   - PP_SIZE=1
19 |   - OUTPUT_LOG=$LOG_ROOT/build_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log
20 |   - CKPT_PATH=$OUTPUT_ROOT/ckpt_tp_${TP_SIZE}_pp_${PP_SIZE}
21 |   - TMP_ENGINE_DIR=/tmp/engine_tp_${TP_SIZE}_pp_${PP_SIZE}
22 | post_script:
23 |   - rm -rf $OUTPUT_ROOT/engine_tp_${TP_SIZE}_pp_${PP_SIZE}
24 |   - cp -r $TMP_ENGINE_DIR $OUTPUT_ROOT/
25 | process:
26 |   env:
27 |     - name: LOG_ROOT
28 |       value: /efs/home/{{ .Release.Name }}/logs
29 |     - name: OUTPUT_ROOT
30 |       value: /efs/home/{{ .Release.Name }}/trtllm
31 |     - name: MODEL_PATH
32 |       value: /fsx/pretrained-models/mistralai/Mistral-7B-Instruct-v0.1
33 |   command:
34 |     - trtllm-build
35 |   args:
36 |     - --checkpoint_dir ${CKPT_PATH}
37 |     - --max_num_tokens 32768
38 |     - --gpus_per_node 8
39 |     - --remove_input_padding enable
40 |     - --gemm_plugin float16
41 |     - --gpt_attention_plugin float16 
42 |     - --paged_kv_cache enable
43 |     - --context_fmha enable
44 |     - --output_dir ${TMP_ENGINE_DIR}
45 |     - --max_batch_size 4
46 |     - '2>&1 | tee $OUTPUT_LOG'
47 | 


--------------------------------------------------------------------------------
/examples/inference/triton-inference-server/tensorrtllm_backend/mistral-7b-instruct-v01_llama3-8b/llama3_8b_hf_to_trtllm.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | resources:
 3 |   node_type: g5.48xlarge
 4 |   requests:
 5 |     "nvidia.com/gpu": 8
 6 |   limits:
 7 |     "nvidia.com/gpu": 8
 8 | ebs:
 9 |   storage: 400Gi
10 |   mount_path: /tmp
11 | tolerations:
12 |   - key: "nvidia.com/gpu"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | pre_script: 
16 |   - mkdir -p $LOG_ROOT
17 |   - mkdir -p $OUTPUT_ROOT
18 |   - TP_SIZE=8
19 |   - PP_SIZE=1
20 |   - MODEL_NAME=llama3_8b_instruct
21 |   - OUTPUT_LOG=$LOG_ROOT/${MODEL_NAME}_hf_to_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log
22 |   - TMP_OUTPUT_PATH=/tmp/${MODEL_NAME}_ckpt_tp_${TP_SIZE}_pp_${PP_SIZE}
23 |   - SCRIPT_DIR=TensorRT-LLM/examples/llama
24 |   - cd $SCRIPT_DIR
25 |   - pip3 install datasets==3.1.0 evaluate~=0.4.3 rouge_score~=0.1.2 sentencepiece~=0.2.0
26 | post_script:
27 |   - cp -r $TMP_OUTPUT_PATH $OUTPUT_ROOT/
28 | process:
29 |   env:
30 |     - name: LOG_ROOT
31 |       value: /efs/home/{{ .Release.Name }}/logs
32 |     - name: OUTPUT_ROOT
33 |       value: /efs/home/{{ .Release.Name }}/trtllm
34 |     - name: MODEL_PATH
35 |       value: /fsx/pretrained-models/meta-llama/Meta-Llama-3-8B-Instruct
36 |   command:
37 |     - python3
38 |   args:
39 |     - convert_checkpoint.py 
40 |     - --model_dir=$MODEL_PATH
41 |     - --output_dir=$TMP_OUTPUT_PATH
42 |     - --dtype=float16
43 |     - --tp_size=$TP_SIZE
44 |     - '2>&1 | tee $OUTPUT_LOG'
45 | 


--------------------------------------------------------------------------------
/examples/inference/triton-inference-server/tensorrtllm_backend/mistral-7b-instruct-v01_llama3-8b/mistral_7b_hf_to_trtllm.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | resources:
 3 |   node_type: g5.48xlarge
 4 |   requests:
 5 |     "nvidia.com/gpu": 8
 6 |   limits:
 7 |     "nvidia.com/gpu": 8
 8 | ebs:
 9 |   storage: 400Gi
10 |   mount_path: /tmp
11 | tolerations:
12 |   - key: "nvidia.com/gpu"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | pre_script: 
16 |   - mkdir -p $LOG_ROOT
17 |   - mkdir -p $OUTPUT_ROOT
18 |   - TP_SIZE=8
19 |   - PP_SIZE=1
20 |   - MODEL_NAME=mistral_7b_instruct
21 |   - OUTPUT_LOG=$LOG_ROOT/${MODEL_NAME}_hf_to_trtllm_tp_${TP_SIZE}_pp_${PP_SIZE}.log
22 |   - TMP_OUTPUT_PATH=/tmp/${MODEL_NAME}_ckpt_tp_${TP_SIZE}_pp_${PP_SIZE}
23 |   - SCRIPT_DIR=TensorRT-LLM/examples/llama
24 |   - cd $SCRIPT_DIR
25 |   - pip3 install datasets==3.1.0 evaluate~=0.4.3 rouge_score~=0.1.2 sentencepiece~=0.2.0
26 | post_script:
27 |   - cp -r $TMP_OUTPUT_PATH $OUTPUT_ROOT/
28 | process:
29 |   env:
30 |     - name: LOG_ROOT
31 |       value: /efs/home/{{ .Release.Name }}/logs
32 |     - name: OUTPUT_ROOT
33 |       value: /efs/home/{{ .Release.Name }}/trtllm
34 |     - name: MODEL_PATH
35 |       value: /fsx/pretrained-models/mistralai/Mistral-7B-Instruct-v0.1
36 |   command:
37 |     - python3
38 |   args:
39 |     - convert_checkpoint.py 
40 |     - --model_dir=$MODEL_PATH
41 |     - --output_dir=$TMP_OUTPUT_PATH
42 |     - --dtype=float16
43 |     - --tp_size=$TP_SIZE
44 |     - '2>&1 | tee $OUTPUT_LOG'
45 | 


--------------------------------------------------------------------------------
/examples/inference/triton-inference-server/tensorrtllm_backend/mistral-8x22b-instruct-v01/hf_to_trtllm.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | resources:
 3 |   node_type: p4d.24xlarge
 4 |   requests:
 5 |     "nvidia.com/gpu": 8
 6 |   limits:
 7 |     "nvidia.com/gpu": 8
 8 | ebs:
 9 |   storage: 400Gi
10 |   mount_path: /tmp
11 | tolerations:
12 |   - key: "nvidia.com/gpu"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | pre_script: 
16 |   - mkdir -p $LOG_ROOT
17 |   - mkdir -p $OUTPUT_ROOT
18 |   - TP_SIZE=8
19 |   - PP_SIZE=2
20 |   - OUTPUT_LOG=$LOG_ROOT/hf_to_trtllm.log
21 |   - TMP_OUTPUT_PATH=/tmp/ckpt
22 |   - SCRIPT_DIR=TensorRT-LLM/examples/llama
23 |   - cd $SCRIPT_DIR
24 |   - pip3 install datasets==3.1.0 evaluate~=0.4.3 rouge_score~=0.1.2 sentencepiece~=0.2.0
25 | post_script:
26 |   - cp -r $TMP_OUTPUT_PATH $OUTPUT_ROOT/
27 | process:
28 |   env:
29 |     - name: LOG_ROOT
30 |       value: /efs/home/{{ .Release.Name }}/logs
31 |     - name: OUTPUT_ROOT
32 |       value: /efs/home/{{ .Release.Name }}/trtllm
33 |     - name: MODEL_PATH
34 |       value: /fsx/pretrained-models/mistralai/Mixtral-8x22B-Instruct-v0.1
35 |   command:
36 |     - python3
37 |   args:
38 |     - convert_checkpoint.py 
39 |     - --model_dir=$MODEL_PATH
40 |     - --output_dir=$TMP_OUTPUT_PATH
41 |     - --dtype=float16
42 |     - --tp_size=$TP_SIZE
43 |     - --pp_size=$PP_SIZE
44 |     - '2>&1 | tee $OUTPUT_LOG'
45 | 


--------------------------------------------------------------------------------
/examples/inference/triton-inference-server/tensorrtllm_backend/mistral-8x22b-instruct-v01/trtllm_engine.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | resources:
 3 |   node_type: p4d.24xlarge
 4 |   requests:
 5 |     "nvidia.com/gpu": 8
 6 |   limits:
 7 |     "nvidia.com/gpu": 8
 8 | ebs:
 9 |   storage: 400Gi
10 |   mount_path: /tmp
11 | tolerations:
12 |   - key: "nvidia.com/gpu"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | pre_script: 
16 |   - mkdir -p $LOG_ROOT
17 |   - OUTPUT_LOG=$LOG_ROOT/build_trtllm.log
18 |   - CKPT_PATH=$OUTPUT_ROOT/ckpt
19 |   - TMP_ENGINE_DIR=/tmp/engine
20 | post_script:
21 |   - rm -rf $OUTPUT_ROOT/engine
22 |   - cp -r $TMP_ENGINE_DIR $OUTPUT_ROOT/
23 | process:
24 |   env:
25 |     - name: LOG_ROOT
26 |       value: /efs/home/{{ .Release.Name }}/logs
27 |     - name: OUTPUT_ROOT
28 |       value: /efs/home/{{ .Release.Name }}/trtllm
29 |   command:
30 |     - trtllm-build
31 |   args:
32 |     - --checkpoint_dir ${CKPT_PATH}
33 |     - --max_num_tokens 16384
34 |     - --gpus_per_node 8
35 |     - --remove_input_padding enable
36 |     - --gemm_plugin float16
37 |     - --gpt_attention_plugin float16 
38 |     - --paged_kv_cache enable
39 |     - --context_fmha enable
40 |     - --output_dir ${TMP_ENGINE_DIR}
41 |     - --max_batch_size 8
42 |     - '2>&1 | tee $OUTPUT_LOG'
43 | 


--------------------------------------------------------------------------------
/examples/legacy/README.md:
--------------------------------------------------------------------------------
 1 | ## Legacy Tutorials
 2 | 
 3 | 
 4 | ### [TensorFlow](https://www.tensorflow.org/)
 5 | 
 6 | 
 7 | | Model      | Accelerator | Notes |
 8 | | ----------- | ----------- | -------- |
 9 | | [Mask R-CNN](./maskrcnn/README.md)     | NVIDIA GPU | Mask R-CNN training for [AWS Samples Mask R-CNN](https://github.com/aws-samples/mask-rcnn-tensorflow) on COCO 2017 dataset   |
10 | 
11 | 
12 | ### [Neuronx Nemo Megatron](https://github.com/aws-neuron/neuronx-nemo-megatron)
13 | 
14 | 
15 | | Model      | Accelerator | Notes |
16 | | ----------- | ----------- | -------- |
17 | | [Llama 2 7B Pre-training](./neuronx-nemo-megatron/llama2_7b/README.md)    | AWS Trainium1 | Llama 2 7B pre-training on Wikicorpus dataset     |
18 | | [Llama 2 13B Pre-training](./neuronx-nemo-megatron/llama2_13b/README.md)   | AWS Trainium1  | Llama 2 13B pre-training on Wikicorpus dataset   |
19 | | [Llama 2 70B Pre-training](./neuronx-nemo-megatron/llama2_70b/README.md)    | AWS Trainium1  | Llama 2 70B pre-training on Wikicorpus dataset    |
20 | 
21 | 


--------------------------------------------------------------------------------
/examples/legacy/maskrcnn/train-maskrcnn-aws.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 | pre_script:
 3 |   - DATE=$(date '+%Y-%m-%d-%H-%M-%S')
 4 |   - LOG_DIR=$HOME/logs/maskrcnn-tensorflow-$DATE
 5 |   - cd /mask-rcnn-tensorflow
 6 | resources:
 7 |   gpu_nodes: 2
 8 |   gpus_per_node: 8
 9 |   gpu_instance_type: "p4d.24xlarge"
10 |   requests:
11 |     "nvidia.com/gpu": 8
12 |   limits:
13 |     "nvidia.com/gpu": 8 
14 | tensorflow:
15 |   sys_memory_mb: "4096"
16 | train:
17 |   command:
18 |     - python3
19 |   args:
20 |     - /mask-rcnn-tensorflow/MaskRCNN/train.py 
21 |     - "--logdir $LOG_DIR"
22 |     - "--images_per_epoch 120000"
23 |     - --config  
24 |     - MODE_MASK='True'
25 |     - MODE_FPN='True'
26 |     - DATA.BASEDIR=/fsx/data/coco2017
27 |     - DATA.TRAIN='["train2017"]' 
28 |     - DATA.VAL='("val2017")'
29 |     - TRAIN.BATCH_SIZE_PER_GPU=4
30 |     - TRAIN.EVAL_PERIOD=1
31 |     - TRAIN.LR_EPOCH_SCHEDULE="[(16, 0.1), (20, 0.01), (24, None)]"
32 |     - TRAIN.BASE_LR=0.0015625
33 |     - BACKBONE.WEIGHTS=/fsx/data/coco2017/pretrained-models/ImageNet-R50-AlignPadding.npz
34 |     - BACKBONE.NORM=FreezeBN
35 |     - PREPROC.PREDEFINED_PADDING=True
36 |     - TRAINER=horovod
37 |     - TRAIN.GRADIENT_CLIP=0.36 
38 | 


--------------------------------------------------------------------------------
/examples/legacy/maskrcnn/train-maskrcnn-tensorpack.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | pre_script:
 3 |   - DATE=$(date '+%Y-%m-%d-%H-%M-%S')
 4 |   - LOG_DIR=$HOME/logs/maskrcnn-tensorpack-$DATE
 5 |   - cd /mask-rcnn-tensorflow
 6 | resources:
 7 |   gpu_nodes: 2
 8 |   gpus_per_node: 8
 9 |   gpu_instance_type: "p4d.24xlarge"
10 |   requests:
11 |     "nvidia.com/gpu": 8
12 |   limits:
13 |     "nvidia.com/gpu": 8 
14 | tensorflow:
15 |   sys_memory_mb: "2560"
16 | train:
17 |   command:
18 |     - python3
19 |   args:
20 |     - /tensorpack/examples/FasterRCNN/train.py
21 |     - "--logdir $LOG_DIR"
22 |     - --config
23 |     - MODE_MASK='True'
24 |     - MODE_FPN='True'
25 |     - DATA.BASEDIR=/fsx/data/coco2017
26 |     - DATA.TRAIN='["coco_train2017"]'
27 |     - DATA.VAL='("coco_val2017")'
28 |     - TRAIN.EVAL_PERIOD=1
29 |     - TRAIN.STEPS_PER_EPOCH=7500
30 |     - TRAIN.LR_SCHEDULE='[240000,320000,360000]'
31 |     - TRAIN.BASE_LR=0.01
32 |     - BACKBONE.WEIGHTS=/fsx/data/coco2017/pretrained-models/ImageNet-R50-AlignPadding.npz
33 |     - BACKBONE.NORM=FreezeBN
34 |     - TRAINER=horovod
35 |     - TRAIN.CHECKPOINT_PERIOD=2 
36 | 


--------------------------------------------------------------------------------
/examples/training/megatron-deepspeed/gpt2_345m/wikicorpus.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | resources:
 3 |   requests:
 4 |     "nvidia.com/gpu": 1
 5 |   limits:
 6 |     "nvidia.com/gpu": 1
 7 | tolerations:
 8 |   - key: "nvidia.com/gpu"
 9 |     operator: "Exists"
10 |     effect: "NoSchedule"
11 | ebs:
12 |   storage: 100Gi
13 |   mount_path: /tmp
14 | git:
15 |   repo_url: 'https://github.com/microsoft/Megatron-DeepSpeed.git'
16 |   branch: main
17 |   commit: a9856ce0e75dbe69c96d4e241e8a191b344118d7
18 | pre_script:
19 |   - pip install --upgrade pip
20 |   - pip install transformers==4.38.1 datasets==2.17.1
21 |   - pip install nltk==3.8.1
22 |   - python <<EOF
23 |   - import os
24 |   - from datasets import load_dataset
25 |   - dataset = load_dataset("wikicorpus", "raw_en", split="train", trust_remote_code=True)
26 |   - dataset.to_json(os.path.join("/tmp", "train.json"))
27 |   - EOF
28 |   - bash dataset/download_vocab.sh
29 |   - mkdir -p $DATA_ROOT
30 | post_script: []
31 | process:
32 |   env:
33 |     - name: HOME
34 |       value: "/efs/home/{{ .Release.Name }}"
35 |     - name: DATA_ROOT
36 |       value: "/fsx/home/{{ .Release.Name }}/data/wikicorpus"
37 |     - name: XDG_CACHE_HOME
38 |       value: "/tmp/.cache"
39 |   command:
40 |     - python3
41 |   args:
42 |     - tools/preprocess_data.py
43 |     - --input /tmp/train.json
44 |     - --output-prefix $DATA_ROOT/gpt2
45 |     - --vocab-file gpt2-vocab.json
46 |     - --dataset-impl mmap
47 |     - --tokenizer-type GPT2BPETokenizer
48 |     - --merge-file gpt2-merges.txt
49 |     - --append-eod
50 |     - --workers 4
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/llama2-7b-peft/hf_to_nemo.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 | resources:
 3 |   requests:
 4 |     "nvidia.com/gpu": 4
 5 |   limits:
 6 |     "nvidia.com/gpu": 4
 7 | ebs:
 8 |   storage: 400Gi
 9 |   mount_path: /tmp
10 | tolerations:
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script: 
15 |   - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters
16 |   - cd $SCRIPT_DIR
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/hf_to_nemo.log
19 | post_script:
20 |   - cp -r /tmp/ckpt.nemo $MODEL_PATH/
21 | process:
22 |   env:
23 |     - name: LOG_ROOT
24 |       value: /efs/home/{{ .Release.Name }}/logs
25 |     - name: MODEL_PATH
26 |       value: /fsx/pretrained-models/meta-llama/Llama-2-7b-hf
27 |   command:
28 |     - python3
29 |   args:
30 |     - convert_llama_hf_to_nemo.py
31 |     - --input_name_or_path=$MODEL_PATH/ 
32 |     - --output_path=/tmp/ckpt.nemo 
33 |     - '2>&1 | tee $OUTPUT_LOG'
34 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/llama2-7b-peft/merge_peft.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 | resources:
 3 |   requests:
 4 |     "nvidia.com/gpu": 8
 5 |   limits:
 6 |     "nvidia.com/gpu": 8
 7 | ebs:
 8 |   storage: 500Gi
 9 |   mount_path: /tmp
10 | tolerations:
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script: 
15 |   - SCRIPT_DIR=/NeMo/scripts/nlp_language_modeling/merge_lora_weights
16 |   - cd $SCRIPT_DIR
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/merge_peft.log
19 |   - PATH_TO_BASE_MODEL=$MODEL_PATH/ckpt.nemo
20 |   - echo "PATH_TO_BASE_MODEL=$PATH_TO_BASE_MODEL"
21 |   - PATH_TO_PEFT_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/$EXP_NAME.nemo
22 |   - echo "PATH_TO_PEFT_MODEL=$PATH_TO_PEFT_MODEL"
23 |   - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo
24 |   - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL"
25 | process:
26 |   env:
27 |     - name: LOG_ROOT
28 |       value: /efs/home/{{ .Release.Name }}/logs
29 |     - name: MODEL_PATH
30 |       value: /fsx/pretrained-models/meta-llama/Llama-2-7b-hf
31 |     - name: EXP_NAME
32 |       value: peft_pubmedqa
33 |   command:
34 |     - python
35 |   args:
36 |     - merge.py
37 |     - trainer.accelerator=cpu
38 |     - gpt_model_file=$PATH_TO_BASE_MODEL
39 |     - lora_model_path=$PATH_TO_PEFT_MODEL
40 |     - merged_model_path=$PATH_TO_MERGED_MODEL
41 |     - '2>&1 | tee $OUTPUT_LOG'
42 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/llama2-7b-peft/nemo_to_hf.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 | resources:
 3 |   requests:
 4 |     "nvidia.com/gpu": 8
 5 |   limits:
 6 |     "nvidia.com/gpu": 8
 7 | ebs:
 8 |   storage: 500Gi
 9 |   mount_path: /tmp
10 | tolerations:
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script: 
15 |   - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters
16 |   - cd $SCRIPT_DIR
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/nemo_to_hf.log
19 |   - TMP_OUTPUT_PATH=/tmp/hf_peft_model
20 |   - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo
21 |   - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL"
22 | post_script:
23 |   - cp -r $TMP_OUTPUT_PATH $MODEL_PATH/
24 | process:
25 |   env:
26 |     - name: LOG_ROOT
27 |       value: /efs/home/{{ .Release.Name }}/logs
28 |     - name: MODEL_PATH
29 |       value: /fsx/pretrained-models/meta-llama/Llama-2-7b-hf
30 |     - name: EXP_NAME
31 |       value: peft_pubmedqa
32 |   command:
33 |     - python3
34 |   args:
35 |     - convert_llama_nemo_to_hf.py
36 |     - --input_name_or_path=$PATH_TO_MERGED_MODEL
37 |     - --output_path=$TMP_OUTPUT_PATH/model.bin
38 |     - --hf_input_path=$MODEL_PATH
39 |     - --hf_output_path=$TMP_OUTPUT_PATH
40 |     - --cpu-only
41 |     - '2>&1 | tee $OUTPUT_LOG'
42 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/llama2-7b-peft/peft_accuracy.yaml:
--------------------------------------------------------------------------------
 1 | image: 
 2 | resources:
 3 |   requests:
 4 |     cpu: "300m"
 5 |     memory: "256Mi"
 6 |   limits:
 7 |     cpu: "1000m"
 8 |     memory: "2048Mi"
 9 | ebs:
10 |   storage: 200Gi
11 |   mount_path: /tmp
12 | inline_script:  # writes the metric script to /tmp/run_accuracy_metric_calculation.py
13 | - |+
14 |   cat > /tmp/run_accuracy_metric_calculation.py <<EOF
15 |   
16 |   import json
17 |   import os
18 |   from sklearn.metrics import accuracy_score, f1_score
19 | 
20 |   results = []
21 |   output_prefix = os.environ['OUTPUT_PREFIX']
22 |   results_path = f"{output_prefix}_test_pubmedqa_inputs_preds_labels.jsonl"
23 |   with open(results_path,'rt') as f:
24 |     # Blank lines are skipped so they do not reach json.loads.
25 |     results.extend(json.loads(st) for st in f if st.strip())
26 | 
27 |   truth = []
28 |   preds = []
29 |   
30 |   for result in results:
31 |     truth.append(result['label'])
32 |     preds.append(result['pred'])
33 | 
34 |   acc = accuracy_score(truth, preds)
35 |   maf = f1_score(truth, preds, average='macro')
36 | 
37 |   print('Accuracy %f' % acc)
38 |   print('Macro-F1 %f' % maf)
39 | 
40 |   EOF
41 | pre_script: 
42 |   - export OUTPUT_PREFIX=$LOG_ROOT/nemo_experiments/$EXP_NAME/eval_results
43 |   - OUTPUT_LOG=$LOG_ROOT/peft_accuracy.log
44 | process:
45 |   env:
46 |     - name: LOG_ROOT
47 |       value: "/efs/home/{{ .Release.Name }}/logs"
48 |     - name: EXP_NAME
49 |       value: "peft_pubmedqa"
50 |   command:
51 |     -  "python"
52 |   args: 
53 |     - /tmp/run_accuracy_metric_calculation.py
54 |     - '2>&1 | tee $OUTPUT_LOG' 
55 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/llama31-8b-peft-dolphin/hf_to_nemo.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 | resources:
 3 |   requests:
 4 |     "nvidia.com/gpu": 4
 5 |   limits:
 6 |     "nvidia.com/gpu": 4
 7 | ebs:
 8 |   storage: 400Gi
 9 |   mount_path: /tmp
10 | tolerations:
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script:  # prepare the log path and a scratch copy of the model under /tmp (tokenizer.model removed from the copy)
15 |   - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters
16 |   - cd $SCRIPT_DIR
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/hf_to_nemo.log
19 |   - TMP_MODEL_PATH=/tmp/model
20 |   - cp -r $MODEL_PATH $TMP_MODEL_PATH
21 |   - 'if [ -f $TMP_MODEL_PATH/tokenizer.model ]; then rm -f $TMP_MODEL_PATH/tokenizer.model; fi'
22 | post_script:  # copy the converted checkpoint from the scratch copy back into $MODEL_PATH
23 |   - cp -r $TMP_MODEL_PATH/ckpt.nemo $MODEL_PATH/
24 | process:
25 |   env:
26 |     - name: LOG_ROOT
27 |       value: /efs/home/{{ .Release.Name }}/logs
28 |     - name: MODEL_PATH
29 |       value: /fsx/pretrained-models/meta-llama/Llama-3.1-8B
30 |   command:
31 |     - python3
32 |   args:  # the last item merges stderr into stdout and tees the output into $OUTPUT_LOG (set in pre_script)
33 |     - convert_llama_hf_to_nemo.py
34 |     - --input_name_or_path=$TMP_MODEL_PATH
35 |     - --output_path=$TMP_MODEL_PATH/ckpt.nemo
36 |     - --llama31=True
37 |     - '2>&1 | tee $OUTPUT_LOG'
38 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/llama31-8b-peft-dolphin/merge_peft.yaml:
--------------------------------------------------------------------------------
 1 | image:  # no image set in this file; presumably supplied at install time — TODO confirm
 2 | resources:  # NOTE(review): 8 GPUs are requested although args pass trainer.accelerator=cpu — confirm this is intended
 3 |   requests:
 4 |     "nvidia.com/gpu": 8
 5 |   limits:
 6 |     "nvidia.com/gpu": 8
 7 | ebs:  # 500Gi volume mounted at /tmp
 8 |   storage: 500Gi
 9 |   mount_path: /tmp
10 | tolerations:  # tolerate the NoSchedule taint keyed on nvidia.com/gpu
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script:  # shell steps defining the variables that process.args references
15 |   - SCRIPT_DIR=/NeMo/scripts/nlp_language_modeling/merge_lora_weights
16 |   - cd $SCRIPT_DIR  # args name merge.py without a directory; presumably resolved from here — TODO confirm
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/merge_peft.log
19 |   - PATH_TO_BASE_MODEL=$MODEL_PATH/ckpt.nemo  # base checkpoint, passed as gpt_model_file
20 |   - echo "PATH_TO_BASE_MODEL=$PATH_TO_BASE_MODEL"
21 |   - PATH_TO_PEFT_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/$EXP_NAME.nemo  # adapter checkpoint, passed as lora_model_path
22 |   - echo "PATH_TO_PEFT_MODEL=$PATH_TO_PEFT_MODEL"
23 |   - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo  # output file, passed as merged_model_path
24 |   - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL"
25 | process:
26 |   env:
27 |     - name: LOG_ROOT
28 |       value: /efs/home/{{ .Release.Name }}/logs
29 |     - name: MODEL_PATH
30 |       value: /fsx/pretrained-models/meta-llama/Llama-3.1-8B
31 |     - name: EXP_NAME
32 |       value: peft_dolphin
33 |     - name: HF_TOKEN
34 |       value: "{{ .Values.hf_token }}"  # filled from the hf_token chart value
35 |   command:
36 |     - python
37 |   args:
38 |     - merge.py
39 |     - trainer.accelerator=cpu
40 |     - gpt_model_file=$PATH_TO_BASE_MODEL
41 |     - lora_model_path=$PATH_TO_PEFT_MODEL
42 |     - merged_model_path=$PATH_TO_MERGED_MODEL
43 |     - '2>&1 | tee $OUTPUT_LOG'  # merge stderr into stdout and copy both to the log file
44 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/llama31-8b-peft-dolphin/nemo_to_hf.yaml:
--------------------------------------------------------------------------------
 1 | image:  # no image set in this file; presumably supplied at install time — TODO confirm
 2 | resources:  # NOTE(review): 8 GPUs are requested although args pass --cpu-only — confirm this is intended
 3 |   requests:
 4 |     "nvidia.com/gpu": 8
 5 |   limits:
 6 |     "nvidia.com/gpu": 8
 7 | ebs:  # 500Gi volume mounted at /tmp, where the converted output is staged
 8 |   storage: 500Gi
 9 |   mount_path: /tmp
10 | tolerations:  # tolerate the NoSchedule taint keyed on nvidia.com/gpu
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script:  # shell steps defining the variables that process.args references
15 |   - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters
16 |   - cd $SCRIPT_DIR
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/nemo_to_hf.log
19 |   - TMP_OUTPUT_PATH=/tmp/$EXP_NAME  # staging directory for the converted model
20 |   - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo  # input checkpoint for the conversion
21 |   - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL"
22 | post_script:  # copy the staged output directory into MODEL_PATH
23 |   - cp -r $TMP_OUTPUT_PATH $MODEL_PATH/
24 | process:
25 |   env:
26 |     - name: LOG_ROOT
27 |       value: /efs/home/{{ .Release.Name }}/logs
28 |     - name: MODEL_PATH
29 |       value: /fsx/pretrained-models/meta-llama/Llama-3.1-8B
30 |     - name: EXP_NAME
31 |       value: peft_dolphin
32 |     - name: HF_TOKEN
33 |       value: "{{ .Values.hf_token }}"  # filled from the hf_token chart value
34 |   command:
35 |     - python3
36 |   args:
37 |     - convert_llama_nemo_to_hf.py
38 |     - --input_name_or_path=$PATH_TO_MERGED_MODEL
39 |     - --output_path=$TMP_OUTPUT_PATH/model.bin
40 |     - --hf_input_path=$MODEL_PATH
41 |     - --hf_output_path=$TMP_OUTPUT_PATH
42 |     - --cpu-only
43 |     - '2>&1 | tee $OUTPUT_LOG'  # merge stderr into stdout and copy both to the log file
44 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/mistral-7b-v01-peft-dolphin/hf_to_nemo.yaml:
--------------------------------------------------------------------------------
 1 | image:  # no image set in this file; presumably supplied at install time — TODO confirm
 2 | resources:
 3 |   requests:
 4 |     "nvidia.com/gpu": 4
 5 |   limits:
 6 |     "nvidia.com/gpu": 4
 7 | ebs:  # 400Gi volume mounted at /tmp, where the converted checkpoint is first written
 8 |   storage: 400Gi
 9 |   mount_path: /tmp
10 | tolerations:  # tolerate the NoSchedule taint keyed on nvidia.com/gpu
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script:  # shell steps defining the variables that process.args references
15 |   - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters
16 |   - cd $SCRIPT_DIR
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/hf_to_nemo.log
19 | post_script:  # copy the checkpoint written to /tmp into MODEL_PATH
20 |   - cp -r /tmp/ckpt.nemo $MODEL_PATH/
21 | process:
22 |   env:
23 |     - name: LOG_ROOT
24 |       value: /efs/home/{{ .Release.Name }}/logs
25 |     - name: MODEL_PATH
26 |       value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1
27 |   command:
28 |     - python3
29 |   args:
30 |     - convert_mistral_7b_hf_to_nemo.py
31 |     - --input_name_or_path=$MODEL_PATH/ 
32 |     - --output_path=/tmp/ckpt.nemo 
33 |     - '2>&1 | tee $OUTPUT_LOG'  # merge stderr into stdout and copy both to the log file
34 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/mistral-7b-v01-peft-dolphin/merge_peft.yaml:
--------------------------------------------------------------------------------
 1 | image:  # no image set in this file; presumably supplied at install time — TODO confirm
 2 | resources:  # NOTE(review): 8 GPUs are requested although args pass trainer.accelerator=cpu — confirm this is intended
 3 |   requests:
 4 |     "nvidia.com/gpu": 8
 5 |   limits:
 6 |     "nvidia.com/gpu": 8
 7 | ebs:  # 500Gi volume mounted at /tmp
 8 |   storage: 500Gi
 9 |   mount_path: /tmp
10 | tolerations:  # tolerate the NoSchedule taint keyed on nvidia.com/gpu
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script:  # shell steps defining the variables that process.args references
15 |   - SCRIPT_DIR=/NeMo/scripts/nlp_language_modeling/merge_lora_weights
16 |   - cd $SCRIPT_DIR  # args name merge.py without a directory; presumably resolved from here — TODO confirm
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/merge_peft.log
19 |   - PATH_TO_BASE_MODEL=$MODEL_PATH/ckpt.nemo  # base checkpoint, passed as gpt_model_file
20 |   - echo "PATH_TO_BASE_MODEL=$PATH_TO_BASE_MODEL"
21 |   - PATH_TO_PEFT_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/$EXP_NAME.nemo  # adapter checkpoint, passed as lora_model_path
22 |   - echo "PATH_TO_PEFT_MODEL=$PATH_TO_PEFT_MODEL"
23 |   - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo  # output file, passed as merged_model_path
24 |   - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL"
25 | process:
26 |   env:
27 |     - name: LOG_ROOT
28 |       value: /efs/home/{{ .Release.Name }}/logs
29 |     - name: MODEL_PATH
30 |       value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1
31 |     - name: EXP_NAME
32 |       value: peft_dolphin
33 |   command:
34 |     - python
35 |   args:
36 |     - merge.py
37 |     - trainer.accelerator=cpu
38 |     - gpt_model_file=$PATH_TO_BASE_MODEL
39 |     - lora_model_path=$PATH_TO_PEFT_MODEL
40 |     - merged_model_path=$PATH_TO_MERGED_MODEL
41 |     - '2>&1 | tee $OUTPUT_LOG'  # merge stderr into stdout and copy both to the log file
42 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/mistral-7b-v01-peft-dolphin/nemo_to_hf.yaml:
--------------------------------------------------------------------------------
 1 | image:  # no image set in this file; presumably supplied at install time — TODO confirm
 2 | resources:
 3 |   requests:
 4 |     "nvidia.com/gpu": 8
 5 |   limits:
 6 |     "nvidia.com/gpu": 8
 7 | ebs:  # 500Gi volume mounted at /tmp, where the converted output is staged
 8 |   storage: 500Gi
 9 |   mount_path: /tmp
10 | tolerations:  # tolerate the NoSchedule taint keyed on nvidia.com/gpu
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script:  # shell steps defining the variables that process.args references
15 |   - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters
16 |   - cd $SCRIPT_DIR
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/nemo_to_hf.log
19 |   - TMP_OUTPUT_PATH=/tmp/$EXP_NAME  # staging directory for the converted model
20 |   - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo  # input checkpoint for the conversion
21 |   - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL"
22 | post_script:  # copy the staged output directory into MODEL_PATH
23 |   - cp -r $TMP_OUTPUT_PATH $MODEL_PATH/
24 | process:
25 |   env:
26 |     - name: LOG_ROOT
27 |       value: /efs/home/{{ .Release.Name }}/logs
28 |     - name: MODEL_PATH
29 |       value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1
30 |     - name: EXP_NAME
31 |       value: peft_dolphin
32 |   command:
33 |     - python3
34 |   args:
35 |     - convert_mistral_7b_nemo_to_hf.py
36 |     - --input_name_or_path=$PATH_TO_MERGED_MODEL
37 |     - --output_path=$TMP_OUTPUT_PATH
38 |     - --hf_model_name=$MODEL_PATH
39 |     - '2>&1 | tee $OUTPUT_LOG'  # merge stderr into stdout and copy both to the log file
40 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/mistral-7b-v01-peft/hf_to_nemo.yaml:
--------------------------------------------------------------------------------
 1 | image:  # no image set in this file; presumably supplied at install time — TODO confirm
 2 | resources:
 3 |   requests:
 4 |     "nvidia.com/gpu": 4
 5 |   limits:
 6 |     "nvidia.com/gpu": 4
 7 | ebs:  # 400Gi volume mounted at /tmp, where the converted checkpoint is first written
 8 |   storage: 400Gi
 9 |   mount_path: /tmp
10 | tolerations:  # tolerate the NoSchedule taint keyed on nvidia.com/gpu
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script:  # shell steps defining the variables that process.args references
15 |   - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters
16 |   - cd $SCRIPT_DIR
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/hf_to_nemo.log
19 | post_script:  # copy the checkpoint written to /tmp into MODEL_PATH
20 |   - cp -r /tmp/ckpt.nemo $MODEL_PATH/
21 | process:
22 |   env:
23 |     - name: LOG_ROOT
24 |       value: /efs/home/{{ .Release.Name }}/logs
25 |     - name: MODEL_PATH
26 |       value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1
27 |   command:
28 |     - python3
29 |   args:
30 |     - convert_mistral_7b_hf_to_nemo.py
31 |     - --input_name_or_path=$MODEL_PATH/ 
32 |     - --output_path=/tmp/ckpt.nemo 
33 |     - '2>&1 | tee $OUTPUT_LOG'  # merge stderr into stdout and copy both to the log file
34 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/mistral-7b-v01-peft/merge_peft.yaml:
--------------------------------------------------------------------------------
 1 | image:  # no image set in this file; presumably supplied at install time — TODO confirm
 2 | resources:  # NOTE(review): 8 GPUs are requested although args pass trainer.accelerator=cpu — confirm this is intended
 3 |   requests:
 4 |     "nvidia.com/gpu": 8
 5 |   limits:
 6 |     "nvidia.com/gpu": 8
 7 | ebs:  # 500Gi volume mounted at /tmp
 8 |   storage: 500Gi
 9 |   mount_path: /tmp
10 | tolerations:  # tolerate the NoSchedule taint keyed on nvidia.com/gpu
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script:  # shell steps defining the variables that process.args references
15 |   - SCRIPT_DIR=/NeMo/scripts/nlp_language_modeling/merge_lora_weights
16 |   - cd $SCRIPT_DIR  # args name merge.py without a directory; presumably resolved from here — TODO confirm
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/merge_peft.log
19 |   - PATH_TO_BASE_MODEL=$MODEL_PATH/ckpt.nemo  # base checkpoint, passed as gpt_model_file
20 |   - echo "PATH_TO_BASE_MODEL=$PATH_TO_BASE_MODEL"
21 |   - PATH_TO_PEFT_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/$EXP_NAME.nemo  # adapter checkpoint, passed as lora_model_path
22 |   - echo "PATH_TO_PEFT_MODEL=$PATH_TO_PEFT_MODEL"
23 |   - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo  # output file, passed as merged_model_path
24 |   - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL"
25 | process:
26 |   env:
27 |     - name: LOG_ROOT
28 |       value: /efs/home/{{ .Release.Name }}/logs
29 |     - name: MODEL_PATH
30 |       value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1
31 |     - name: EXP_NAME
32 |       value: peft_pubmedqa
33 |   command:
34 |     - python
35 |   args:
36 |     - merge.py
37 |     - trainer.accelerator=cpu
38 |     - gpt_model_file=$PATH_TO_BASE_MODEL
39 |     - lora_model_path=$PATH_TO_PEFT_MODEL
40 |     - merged_model_path=$PATH_TO_MERGED_MODEL
41 |     - '2>&1 | tee $OUTPUT_LOG'  # merge stderr into stdout and copy both to the log file
42 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/mistral-7b-v01-peft/nemo_to_hf.yaml:
--------------------------------------------------------------------------------
 1 | image:  # no image set in this file; presumably supplied at install time — TODO confirm
 2 | resources:
 3 |   requests:
 4 |     "nvidia.com/gpu": 8
 5 |   limits:
 6 |     "nvidia.com/gpu": 8
 7 | ebs:  # 500Gi volume mounted at /tmp, where the converted output is staged
 8 |   storage: 500Gi
 9 |   mount_path: /tmp
10 | tolerations:  # tolerate the NoSchedule taint keyed on nvidia.com/gpu
11 |   - key: "nvidia.com/gpu"
12 |     operator: "Exists"
13 |     effect: "NoSchedule"
14 | pre_script:  # shell steps defining the variables that process.args references
15 |   - SCRIPT_DIR=/NeMo/scripts/checkpoint_converters
16 |   - cd $SCRIPT_DIR
17 |   - mkdir -p $LOG_ROOT
18 |   - OUTPUT_LOG=$LOG_ROOT/nemo_to_hf.log
19 |   - TMP_OUTPUT_PATH=/tmp/hf_peft_model  # staging directory for the converted model
20 |   - PATH_TO_MERGED_MODEL=$LOG_ROOT/nemo_experiments/$EXP_NAME/checkpoints/merged_model.nemo  # input checkpoint for the conversion
21 |   - echo "PATH_TO_MERGED_MODEL=$PATH_TO_MERGED_MODEL"
22 | post_script:  # copy the staged output directory into MODEL_PATH
23 |   - cp -r $TMP_OUTPUT_PATH $MODEL_PATH/
24 | process:
25 |   env:
26 |     - name: LOG_ROOT
27 |       value: /efs/home/{{ .Release.Name }}/logs
28 |     - name: MODEL_PATH
29 |       value: /fsx/pretrained-models/mistralai/Mistral-7B-v0.1
30 |     - name: EXP_NAME
31 |       value: peft_pubmedqa
32 |   command:
33 |     - python3
34 |   args:
35 |     - convert_mistral_7b_nemo_to_hf.py
36 |     - --input_name_or_path=$PATH_TO_MERGED_MODEL
37 |     - --output_path=$TMP_OUTPUT_PATH
38 |     - --hf_model_name=$MODEL_PATH
39 |     - '2>&1 | tee $OUTPUT_LOG'  # merge stderr into stdout and copy both to the log file
40 | 


--------------------------------------------------------------------------------
/examples/training/nemo-megatron/mistral-7b-v01-peft/peft_accuracy.yaml:
--------------------------------------------------------------------------------
 1 | image:  # no image set in this file; presumably supplied at install time — TODO confirm
 2 | resources:  # only CPU and memory are requested; no accelerator resources
 3 |   requests:
 4 |     cpu: "300m"
 5 |     memory: "256Mi"
 6 |   limits:
 7 |     cpu: "1000m"
 8 |     memory: "2048Mi"
 9 | ebs:  # 200Gi volume mounted at /tmp, where the generated script is written
10 |   storage: 200Gi
11 |   mount_path: /tmp
12 | inline_script:  # shell here-document that writes the metric script run by process.args; the script reads OUTPUT_PREFIX from the environment
13 | - |+
14 |   cat > /tmp/run_accuracy_metric_calculation.py <<EOF
15 |   
16 |   import json
17 |   import os
18 |   from sklearn.metrics import accuracy_score, f1_score
19 | 
20 |   results = []
21 |   output_prefix = os.environ['OUTPUT_PREFIX']
22 |   results_path = f"{output_prefix}_test_pubmedqa_inputs_preds_labels.jsonl"
23 |   with open(results_path,'rt') as f:
24 |     while st := f.readline():
25 |       results.append(json.loads(st))
26 | 
27 |   truth = []
28 |   preds = []
29 |   
30 |   for result in results:
31 |     truth.append(result['label'])
32 |     preds.append(result['pred'])
33 | 
34 |   acc = accuracy_score(truth, preds)
35 |   maf = f1_score(truth, preds, average='macro')
36 | 
37 |   print('Accuracy %f' % acc)
38 |   print('Macro-F1 %f' % maf)
39 | 
40 |   EOF
41 | pre_script:  # OUTPUT_PREFIX is exported because the generated Python script reads it via os.environ
42 |   - export OUTPUT_PREFIX=$LOG_ROOT/nemo_experiments/$EXP_NAME/eval_results
43 |   - OUTPUT_LOG=$LOG_ROOT/peft_accuracy.log
44 | process:
45 |   env:
46 |     - name: LOG_ROOT
47 |       value: "/efs/home/{{ .Release.Name }}/logs"
48 |     - name: EXP_NAME
49 |       value: "peft_pubmedqa"
50 |   command:
51 |     -  "python"
52 |   args:  # run the script written by inline_script and copy its output to the log file
53 |     - /tmp/run_accuracy_metric_calculation.py
54 |     - '2>&1 | tee $OUTPUT_LOG' 
55 | 


--------------------------------------------------------------------------------
/examples/training/neuronx-distributed/gpt_neox_20b/wikicorpus.yaml:
--------------------------------------------------------------------------------
 1 | image: 'public.ecr.aws/neuron/pytorch-training-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04'  # pinned Neuron PyTorch training image
 2 | backoff_limit: 2000
 3 | ebs:  # 200Gi volume mounted at /tmp, which process.env also sets as HOME
 4 |   storage: 200Gi
 5 |   mount_path: /tmp
 6 | resources:  # a single aws.amazon.com/neuron device
 7 |   requests:
 8 |     "aws.amazon.com/neuron": 1 
 9 |   limits:
10 |     "aws.amazon.com/neuron": 1 
11 | tolerations:  # tolerate the NoSchedule taint keyed on aws.amazon.com/neuron
12 |   - key: "aws.amazon.com/neuron"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | git:  # NOTE(review): both commit and branch are set; which one wins is decided by the chart — confirm
16 |   repo_url: "https://github.com/aws-neuron/neuronx-distributed.git"
17 |   commit: a070deb86991affd589c48441bf819e6d4bb159b
18 |   branch: main
19 | pre_script:  # GIT_CLONE_DIR is not defined in this file; presumably set by the chart to the checkout of git.repo_url — TODO confirm
20 |   - pip3 install --upgrade pip
21 |   - pip3 install -r $GIT_CLONE_DIR/examples/training/tp_dp_gpt_neox_hf_pretrain/common/requirements.txt huggingface-hub==0.27.1
22 |   - mkdir -p $DATA_ROOT
23 |   - mkdir -p $LOG_ROOT 
24 | process:
25 |   env:
26 |     - name: HOME
27 |       value: /tmp
28 |     - name: DATA_ROOT
29 |       value: /fsx/home/{{ .Release.Name }}
30 |     - name: LOG_ROOT
31 |       value: /efs/home/{{ .Release.Name }}/logs
32 |   command:  # HOME is overridden to DATA_ROOT for this python3 invocation only
33 |     -  HOME=$DATA_ROOT python3 
34 |   args:
35 |     - $GIT_CLONE_DIR/examples/training/tp_dp_gpt_neox_hf_pretrain/common/get_dataset.py 
36 |     - '2>&1 | tee $LOG_ROOT/dataset.log'  # merge stderr into stdout and copy both to dataset.log
37 | 


--------------------------------------------------------------------------------
/examples/training/neuronx-distributed/gpt_neox_6.9b/wikicorpus.yaml:
--------------------------------------------------------------------------------
 1 | image: 'public.ecr.aws/neuron/pytorch-training-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04'  # pinned Neuron PyTorch training image
 2 | backoff_limit: 2000
 3 | ebs:  # 200Gi volume mounted at /tmp, which process.env also sets as HOME
 4 |   storage: 200Gi
 5 |   mount_path: /tmp
 6 | resources:  # a single aws.amazon.com/neuron device
 7 |   requests:
 8 |     "aws.amazon.com/neuron": 1 
 9 |   limits:
10 |     "aws.amazon.com/neuron": 1 
11 | tolerations:  # tolerate the NoSchedule taint keyed on aws.amazon.com/neuron
12 |   - key: "aws.amazon.com/neuron"
13 |     operator: "Exists"
14 |     effect: "NoSchedule"
15 | git:  # NOTE(review): both commit and branch are set; which one wins is decided by the chart — confirm
16 |   repo_url: "https://github.com/aws-neuron/neuronx-distributed.git"
17 |   commit: a070deb86991affd589c48441bf819e6d4bb159b
18 |   branch: main
19 | pre_script:  # GIT_CLONE_DIR is not defined in this file; presumably set by the chart to the checkout of git.repo_url — TODO confirm
20 |   - pip3 install --upgrade pip
21 |   - pip3 install -r $GIT_CLONE_DIR/examples/training/tp_dp_gpt_neox_hf_pretrain/common/requirements.txt huggingface-hub==0.27.1
22 |   - mkdir -p $DATA_ROOT
23 |   - mkdir -p $LOG_ROOT 
24 | process:
25 |   env:
26 |     - name: HOME
27 |       value: /tmp
28 |     - name: DATA_ROOT
29 |       value: /fsx/home/{{ .Release.Name }}
30 |     - name: LOG_ROOT
31 |       value: /efs/home/{{ .Release.Name }}/logs
32 |   command:  # HOME is overridden to DATA_ROOT for this python3 invocation only
33 |     -  HOME=$DATA_ROOT python3 
34 |   args:
35 |     - $GIT_CLONE_DIR/examples/training/tp_dp_gpt_neox_hf_pretrain/common/get_dataset.py 
36 |     - '2>&1 | tee $LOG_ROOT/dataset.log'  # merge stderr into stdout and copy both to dataset.log
37 | 


--------------------------------------------------------------------------------
/examples/training/raytrain/lightning-bert/fine-tune.yaml:
--------------------------------------------------------------------------------
 1 | ray:  # Ray settings; how each key is consumed depends on the chart — TODO confirm
 2 |   version: '2.44.0'
 3 |   dashboard:
 4 |     host: '0.0.0.0'
 5 |   ports:
 6 |     - name: gcs-server
 7 |       port: 6379
 8 |     - name: client
 9 |       port: 10001
10 |     - name: dashboard
11 |       port: 8265
12 |   resources:  # CPU request/limit nested under ray, separate from the top-level GPU resources below
13 |     requests:
14 |       cpu: 300m 
15 |     limits:
16 |       cpu: 2
17 |   runtime_env_yaml:  # embedded YAML string listing version-pinned pip packages
18 |     runtimeEnvYAML: |
19 |       pip:
20 |         - pytorch-lightning==2.2.1
21 |         - transformers==4.38.2
22 |         - datasets==2.18.0
23 |         - scikit-learn==1.4.0
24 | image:  # no image set in this file; presumably supplied at install time — TODO confirm
25 | image_pull_policy: Always
26 | resources:  # one nvidia.com/gpu; nnodes and node_type sit alongside requests/limits
27 |   requests:
28 |     "nvidia.com/gpu": 1 
29 |   limits:
30 |     "nvidia.com/gpu": 1 
31 |   nnodes: 1 
32 |   node_type: 'g5.xlarge' 
33 | tolerations:  # tolerate the NoSchedule taint keyed on nvidia.com/gpu
34 |   - key: "nvidia.com/gpu"
35 |     operator: "Exists"
36 |     effect: "NoSchedule"
37 | pvc:  # mounts the pv-efs claim at /efs; HOME under train.env lives on this path
38 |   - name: pv-efs
39 |     mount_path: /efs
40 | git:  # kuberay repository with both a branch and a specific commit given
41 |   repo_url: https://github.com/ray-project/kuberay.git 
42 |   branch: master 
43 |   commit: 0ea404b84e45b7b8822b071c7c02b2afb3bb3eae 
44 | pre_script:  # create a per-host log directory under HOME and export the log file path used by train.args
45 |   - export LOGS_DIR=$HOME/logs/$HOSTNAME
46 |   - mkdir -p $LOGS_DIR
47 |   - export OUTPUT_LOG=$LOGS_DIR/fine-tune.log
48 | train:
49 |   env:
50 |     - name: HOME
51 |       value: "/efs/home/{{ .Release.Name }}"
52 |   command:
53 |     - python
54 |   args:  # the script path is relative; presumably resolved inside the git checkout — TODO confirm
55 |     - ray-operator/config/samples/pytorch-text-classifier/fine-tune-pytorch-text-classifier.py
56 |     - '2>&1 | tee $OUTPUT_LOG' 
57 | 


--------------------------------------------------------------------------------
/kfp/components/src/helm-charts-component/container/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM public.ecr.aws/lts/ubuntu:22.04_stable
 2 | 
 3 | # Keep apt/debconf from prompting during the build.
 4 | ENV DEBIAN_FRONTEND=noninteractive
 5 | ENV DEBCONF_NONINTERACTIVE_SEEN=true
 6 | 
 7 | # Base tooling. "apt-get update" shares a layer with the install so a cached,
 8 | # stale package index is never reused when the package list changes; the index
 9 | # is removed again in the same layer to keep the image small.
10 | RUN apt-get update \
11 |     && apt-get -y install \
12 |         software-properties-common \
13 |         wget \
14 |         apt-transport-https \
15 |         gnupg2 \
16 |         git tar zip unzip \
17 |     && rm -rf /var/lib/apt/lists/*
18 | 
19 | # Download the Helm apt signing key, convert it to a binary keyring and place
20 | # it in apt's trusted keyring directory; the downloaded .asc file is removed.
21 | RUN wget -qO - https://baltocdn.com/helm/signing.asc  > ./helm-ubuntu-public-key.asc \
22 |     && gpg --no-default-keyring --keyring ./helm_keyring.gpg --import  ./helm-ubuntu-public-key.asc \
23 |     && gpg --no-default-keyring --keyring ./helm_keyring.gpg  --export > ./helm.gpg \
24 |     && mv ./helm.gpg /etc/apt/trusted.gpg.d/ \
25 |     && rm ./helm-ubuntu-public-key.asc
26 | 
27 | # Register the Helm apt repository, refresh the index in the same layer, then
28 | # install Helm, Python and the AWS CLI.
29 | RUN add-apt-repository -y  "deb [arch=amd64] https://baltocdn.com/helm/stable/debian/ all main" \
30 |     && apt-get update \
31 |     && apt-get -y install \
32 |         helm \
33 |         python3-minimal \
34 |         python3-pip \
35 |         awscli \
36 |     && rm -rf /var/lib/apt/lists/*
37 | 
38 | # Python libraries, installed one at a time in the original order so dependency
39 | # resolution is unchanged; --no-cache-dir keeps pip's download cache out of the image.
40 | RUN pip3 install --no-cache-dir kubernetes \
41 |     && pip3 install --no-cache-dir boto3 \
42 |     && pip3 install --no-cache-dir pyhelm \
43 |     && pip3 install --no-cache-dir PyYAML


--------------------------------------------------------------------------------
/kfp/components/src/helm-charts-component/container/build_tools/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export IMAGE_NAME=eks/universal-client  # image name; presumably read by the container build scripts — TODO confirm
4 | export IMAGE_TAG=1.0.0  # image tag exported alongside IMAGE_NAME
5 | 


--------------------------------------------------------------------------------
/kfp/pipelines/src/helm-charts-pipeline/helm_charts_pipeline.py:
--------------------------------------------------------------------------------
 1 | from kfp import dsl
 2 | from kfp import compiler
 3 | from kfp import components
 4 | 
 5 | from typing import List, Dict
 6 | 
 7 | helm_charts_component = components.load_component_from_file('kfp/components/packages/helm_charts_component.yaml')
 8 | 
 9 | @dsl.pipeline
10 | def helm_charts_pipeline(chart_configs: List[Dict]) -> str:
11 |     helm_charts_task = helm_charts_component(chart_configs=chart_configs)
12 |     return helm_charts_task.output
13 | 
14 | compiler.Compiler().compile(helm_charts_pipeline, package_path='kfp/pipelines/packages/helm_charts_pipeline.yaml')


--------------------------------------------------------------------------------