├── CONTRIBUTING.md ├── README.md ├── SECURITY.md ├── docker ├── nccl-tests │ ├── Dockerfile │ └── README.md ├── node-ordering │ ├── Dockerfile │ ├── README.md │ ├── entrypoint.sh │ └── node_ordering.py └── rccl-tests │ ├── Dockerfile │ └── README.md ├── docs ├── adding-ssh-keys-to-worker-nodes.md ├── deploying-monitoring-stack-manually.md ├── images │ ├── rms-application-information.png │ └── tiers.png ├── importing-images-from-fss-skopeo.md ├── running-active-health-checks.md ├── running-gpu-rdma-healthchecks-with-node-problem-detector.md ├── running-ib-write-bw-test.md ├── running-pytorch-jobs-on-oke-using-hostnetwork-with-rdma.md ├── using-cluster-autoscaler-with-cluster-networks.md └── using-rdma-network-locality-when-running-workloads-on-oke.md ├── files ├── oke-nvme-raid.sh └── oke-ubuntu-cloud-init.sh ├── manifests ├── active-health-checks │ ├── active-health-checks-dcgm-diag.yaml │ ├── active-health-checks-gpu-fryer.yaml │ ├── active-health-checks-nccl-tests.yaml │ ├── active-health-checks-rccl-tests.yaml │ └── active-health-checks-rvs.yaml ├── nccl-tests │ ├── kueue │ │ ├── BM.GPU.4.8.yaml │ │ ├── BM.GPU.A100-v2.8.yaml │ │ ├── BM.GPU.B200.8.yaml │ │ ├── BM.GPU.B4.8.yaml │ │ ├── BM.GPU.GB200-v2.4.yaml │ │ ├── BM.GPU.GB200.4.yaml │ │ ├── BM.GPU.H100.8.yaml │ │ └── BM.GPU.H200.8.yaml │ └── volcano │ │ ├── BM.GPU.A100-v2.8-nccl-test.yaml │ │ ├── BM.GPU.B4.8-nccl-test.yaml │ │ ├── BM.GPU.H100.8-nccl-test.yaml │ │ ├── BM.GPU.H200.8-nccl-test.yaml │ │ └── BM.GPU4.8-nccl-test.yaml ├── rccl-tests │ ├── kueue │ │ └── BM.GPU.MI300X.8.yaml │ └── volcano │ │ └── BM.GPU.MI300X.8.yaml └── service-account │ └── oke-kubeconfig-sa-token.yaml └── terraform ├── bv.tf ├── datasources.tf ├── files ├── amd-device-metrics-exporter │ └── values.yaml ├── cert-manager │ ├── cluster-issuer-prod.yaml │ └── cluster-issuer-staging.yaml ├── grafana │ ├── alerts │ │ ├── alert-rules.yaml │ │ ├── cpu-profile.yaml │ │ ├── gpu-bad-pages.yaml │ │ ├── gpu-bus.yaml │ │ ├── gpu-count.yaml │ │ ├── gpu-ecc.yaml │ │ ├── gpu-fabric-manager.yaml │ │ ├── gpu-pcie.yaml │ │ ├── gpu-row-remap.yaml │ │ ├── node-pcie.yaml │ │ ├── oca-version.yaml │ │ ├── rdma-link-flapping.yaml │ │ ├── rdma-link.yaml │ │ ├── rdma-rttcc.yaml │ │ └── rdma-wpa-auth.yaml │ └── dashboards │ │ ├── amd │ │ ├── gpu.json │ │ ├── job.json │ │ ├── node.json │ │ └── overview.json │ │ ├── common │ │ ├── api-server.json │ │ ├── coredns.json │ │ ├── kubelet.json │ │ ├── persistent-volumes.json │ │ ├── pods-resources.json │ │ ├── prometheus.json │ │ └── scheduling.json │ │ └── nvidia │ │ ├── cluster-level-metrics.json │ │ ├── command-center.json │ │ ├── gpu-health-status.json │ │ ├── gpu-metrics.json │ │ ├── host-metrics.json │ │ └── node-problem-detector.json ├── kube-prometheus │ ├── values.yaml │ └── values.yaml.tftpl ├── lustre │ └── lustre-pv.yaml.tpl ├── nginx-ingress │ └── values.yaml.tpl ├── node-problem-detector │ └── values.yaml ├── nvidia-dcgm-exporter │ ├── .helmignore │ ├── Chart.yaml │ ├── README.md │ ├── oke-values.yaml │ ├── templates │ │ ├── NOTES.txt │ │ ├── _helpers.tpl │ │ ├── clusterrole.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── daemonset.yaml │ │ ├── metrics-configmap.yaml │ │ ├── role.yaml │ │ ├── rolebinding.yaml │ │ ├── service-monitor.yaml │ │ ├── service.yaml │ │ ├── serviceaccount.yaml │ │ ├── tls-secret.yaml │ │ └── web-config-configmap.yaml │ └── values.yaml └── oke-ons-webhook │ ├── Chart.yaml │ ├── files │ └── notification_template.j2 │ ├── templates │ ├── _helpers.tpl │ ├── configmap.yaml │ ├── deployment.yaml │ ├── rbac.yaml │ └── service.yaml │ └── values.yaml ├── fss.tf ├── grafana.tf ├── helm-module ├── helm-deployment.tf └── variables.tf ├── iam.tf ├── image.tf ├── lustre.tf ├── oke-cluster.tf ├── oke-workers.tf ├── orm-private-endpoint.tf ├── output.tf ├── provider.tf ├── schema.yaml ├── tls.tf ├── topic.tf ├── validation.tf ├── variables.tf ├── versions.tf ├── via-operator-grafana.tf ├── via-operator-helm-deployments.tf ├── via-provider-amd-device-metrics-exporter.tf ├── via-provider-grafana.tf ├── via-provider-kube-prometheus-stack.tf ├── via-provider-lustre-client.tf ├── via-provider-nginx.tf ├── via-provider-node-problem-detector.tf ├── via-provider-nvidia-dcgm-exporter.tf └── via-provider-oke-ons-webhook.tf /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/README.md -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/SECURITY.md -------------------------------------------------------------------------------- /docker/nccl-tests/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docker/nccl-tests/Dockerfile -------------------------------------------------------------------------------- /docker/nccl-tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docker/nccl-tests/README.md -------------------------------------------------------------------------------- /docker/node-ordering/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docker/node-ordering/Dockerfile -------------------------------------------------------------------------------- /docker/node-ordering/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docker/node-ordering/README.md -------------------------------------------------------------------------------- /docker/node-ordering/entrypoint.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docker/node-ordering/entrypoint.sh -------------------------------------------------------------------------------- /docker/node-ordering/node_ordering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docker/node-ordering/node_ordering.py -------------------------------------------------------------------------------- /docker/rccl-tests/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docker/rccl-tests/Dockerfile -------------------------------------------------------------------------------- /docker/rccl-tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docker/rccl-tests/README.md -------------------------------------------------------------------------------- /docs/adding-ssh-keys-to-worker-nodes.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docs/adding-ssh-keys-to-worker-nodes.md -------------------------------------------------------------------------------- /docs/deploying-monitoring-stack-manually.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docs/deploying-monitoring-stack-manually.md -------------------------------------------------------------------------------- /docs/images/rms-application-information.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docs/images/rms-application-information.png -------------------------------------------------------------------------------- /docs/images/tiers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docs/images/tiers.png -------------------------------------------------------------------------------- /docs/importing-images-from-fss-skopeo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docs/importing-images-from-fss-skopeo.md -------------------------------------------------------------------------------- /docs/running-active-health-checks.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docs/running-active-health-checks.md -------------------------------------------------------------------------------- /docs/running-gpu-rdma-healthchecks-with-node-problem-detector.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docs/running-gpu-rdma-healthchecks-with-node-problem-detector.md -------------------------------------------------------------------------------- /docs/running-ib-write-bw-test.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docs/running-ib-write-bw-test.md -------------------------------------------------------------------------------- /docs/running-pytorch-jobs-on-oke-using-hostnetwork-with-rdma.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docs/running-pytorch-jobs-on-oke-using-hostnetwork-with-rdma.md -------------------------------------------------------------------------------- /docs/using-cluster-autoscaler-with-cluster-networks.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docs/using-cluster-autoscaler-with-cluster-networks.md -------------------------------------------------------------------------------- /docs/using-rdma-network-locality-when-running-workloads-on-oke.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/docs/using-rdma-network-locality-when-running-workloads-on-oke.md -------------------------------------------------------------------------------- /files/oke-nvme-raid.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/files/oke-nvme-raid.sh -------------------------------------------------------------------------------- /files/oke-ubuntu-cloud-init.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/files/oke-ubuntu-cloud-init.sh -------------------------------------------------------------------------------- /manifests/active-health-checks/active-health-checks-dcgm-diag.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/active-health-checks/active-health-checks-dcgm-diag.yaml -------------------------------------------------------------------------------- /manifests/active-health-checks/active-health-checks-gpu-fryer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/active-health-checks/active-health-checks-gpu-fryer.yaml -------------------------------------------------------------------------------- /manifests/active-health-checks/active-health-checks-nccl-tests.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/active-health-checks/active-health-checks-nccl-tests.yaml -------------------------------------------------------------------------------- /manifests/active-health-checks/active-health-checks-rccl-tests.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/active-health-checks/active-health-checks-rccl-tests.yaml -------------------------------------------------------------------------------- /manifests/active-health-checks/active-health-checks-rvs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/active-health-checks/active-health-checks-rvs.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/kueue/BM.GPU.4.8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/kueue/BM.GPU.4.8.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/kueue/BM.GPU.A100-v2.8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/kueue/BM.GPU.A100-v2.8.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/kueue/BM.GPU.B200.8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/kueue/BM.GPU.B200.8.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/kueue/BM.GPU.B4.8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/kueue/BM.GPU.B4.8.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/kueue/BM.GPU.GB200-v2.4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/kueue/BM.GPU.GB200-v2.4.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/kueue/BM.GPU.GB200.4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/kueue/BM.GPU.GB200.4.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/kueue/BM.GPU.H100.8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/kueue/BM.GPU.H100.8.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/kueue/BM.GPU.H200.8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/kueue/BM.GPU.H200.8.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/volcano/BM.GPU.A100-v2.8-nccl-test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/volcano/BM.GPU.A100-v2.8-nccl-test.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/volcano/BM.GPU.B4.8-nccl-test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/volcano/BM.GPU.B4.8-nccl-test.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/volcano/BM.GPU.H100.8-nccl-test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/volcano/BM.GPU.H100.8-nccl-test.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/volcano/BM.GPU.H200.8-nccl-test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/volcano/BM.GPU.H200.8-nccl-test.yaml -------------------------------------------------------------------------------- /manifests/nccl-tests/volcano/BM.GPU4.8-nccl-test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/nccl-tests/volcano/BM.GPU4.8-nccl-test.yaml -------------------------------------------------------------------------------- /manifests/rccl-tests/kueue/BM.GPU.MI300X.8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/rccl-tests/kueue/BM.GPU.MI300X.8.yaml -------------------------------------------------------------------------------- /manifests/rccl-tests/volcano/BM.GPU.MI300X.8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/rccl-tests/volcano/BM.GPU.MI300X.8.yaml -------------------------------------------------------------------------------- /manifests/service-account/oke-kubeconfig-sa-token.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/manifests/service-account/oke-kubeconfig-sa-token.yaml -------------------------------------------------------------------------------- /terraform/bv.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/bv.tf -------------------------------------------------------------------------------- /terraform/datasources.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/datasources.tf -------------------------------------------------------------------------------- /terraform/files/amd-device-metrics-exporter/values.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/amd-device-metrics-exporter/values.yaml -------------------------------------------------------------------------------- /terraform/files/cert-manager/cluster-issuer-prod.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/cert-manager/cluster-issuer-prod.yaml -------------------------------------------------------------------------------- /terraform/files/cert-manager/cluster-issuer-staging.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/cert-manager/cluster-issuer-staging.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/alert-rules.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/alert-rules.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/cpu-profile.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/cpu-profile.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/gpu-bad-pages.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/gpu-bad-pages.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/gpu-bus.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/gpu-bus.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/gpu-count.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/gpu-count.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/gpu-ecc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/gpu-ecc.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/gpu-fabric-manager.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/gpu-fabric-manager.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/gpu-pcie.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/gpu-pcie.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/gpu-row-remap.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/gpu-row-remap.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/node-pcie.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/node-pcie.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/oca-version.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/oca-version.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/rdma-link-flapping.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/rdma-link-flapping.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/rdma-link.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/rdma-link.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/rdma-rttcc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/rdma-rttcc.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/alerts/rdma-wpa-auth.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/alerts/rdma-wpa-auth.yaml -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/amd/gpu.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/amd/gpu.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/amd/job.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/amd/job.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/amd/node.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/amd/node.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/amd/overview.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/amd/overview.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/common/api-server.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/common/api-server.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/common/coredns.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/common/coredns.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/common/kubelet.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/common/kubelet.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/common/persistent-volumes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/common/persistent-volumes.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/common/pods-resources.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/common/pods-resources.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/common/prometheus.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/common/prometheus.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/common/scheduling.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/common/scheduling.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/nvidia/cluster-level-metrics.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/nvidia/cluster-level-metrics.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/nvidia/command-center.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/nvidia/command-center.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/nvidia/gpu-health-status.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/nvidia/gpu-health-status.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/nvidia/gpu-metrics.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/nvidia/gpu-metrics.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/nvidia/host-metrics.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/nvidia/host-metrics.json -------------------------------------------------------------------------------- /terraform/files/grafana/dashboards/nvidia/node-problem-detector.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/grafana/dashboards/nvidia/node-problem-detector.json -------------------------------------------------------------------------------- /terraform/files/kube-prometheus/values.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/kube-prometheus/values.yaml -------------------------------------------------------------------------------- /terraform/files/kube-prometheus/values.yaml.tftpl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/kube-prometheus/values.yaml.tftpl -------------------------------------------------------------------------------- /terraform/files/lustre/lustre-pv.yaml.tpl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/lustre/lustre-pv.yaml.tpl -------------------------------------------------------------------------------- /terraform/files/nginx-ingress/values.yaml.tpl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nginx-ingress/values.yaml.tpl -------------------------------------------------------------------------------- /terraform/files/node-problem-detector/values.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/node-problem-detector/values.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/.helmignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/.helmignore -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/Chart.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/Chart.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/README.md -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/oke-values.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/oke-values.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/NOTES.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/NOTES.txt -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/_helpers.tpl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/_helpers.tpl -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/clusterrole.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/clusterrole.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/clusterrolebinding.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/daemonset.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/daemonset.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/metrics-configmap.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/metrics-configmap.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/role.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/role.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/rolebinding.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/rolebinding.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/service-monitor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/service-monitor.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/service.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/service.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/serviceaccount.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/tls-secret.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/tls-secret.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/templates/web-config-configmap.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/templates/web-config-configmap.yaml -------------------------------------------------------------------------------- /terraform/files/nvidia-dcgm-exporter/values.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/nvidia-dcgm-exporter/values.yaml -------------------------------------------------------------------------------- /terraform/files/oke-ons-webhook/Chart.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/oke-ons-webhook/Chart.yaml -------------------------------------------------------------------------------- /terraform/files/oke-ons-webhook/files/notification_template.j2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/oke-ons-webhook/files/notification_template.j2 -------------------------------------------------------------------------------- /terraform/files/oke-ons-webhook/templates/_helpers.tpl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/oke-ons-webhook/templates/_helpers.tpl -------------------------------------------------------------------------------- /terraform/files/oke-ons-webhook/templates/configmap.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/oke-ons-webhook/templates/configmap.yaml -------------------------------------------------------------------------------- /terraform/files/oke-ons-webhook/templates/deployment.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/oke-ons-webhook/templates/deployment.yaml -------------------------------------------------------------------------------- /terraform/files/oke-ons-webhook/templates/rbac.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/oke-ons-webhook/templates/rbac.yaml -------------------------------------------------------------------------------- /terraform/files/oke-ons-webhook/templates/service.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/oke-ons-webhook/templates/service.yaml -------------------------------------------------------------------------------- /terraform/files/oke-ons-webhook/values.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/files/oke-ons-webhook/values.yaml -------------------------------------------------------------------------------- /terraform/fss.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/fss.tf -------------------------------------------------------------------------------- /terraform/grafana.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/grafana.tf -------------------------------------------------------------------------------- /terraform/helm-module/helm-deployment.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/helm-module/helm-deployment.tf -------------------------------------------------------------------------------- /terraform/helm-module/variables.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/helm-module/variables.tf -------------------------------------------------------------------------------- /terraform/iam.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/iam.tf -------------------------------------------------------------------------------- /terraform/image.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/image.tf -------------------------------------------------------------------------------- /terraform/lustre.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/lustre.tf -------------------------------------------------------------------------------- /terraform/oke-cluster.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/oke-cluster.tf -------------------------------------------------------------------------------- /terraform/oke-workers.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/oke-workers.tf -------------------------------------------------------------------------------- /terraform/orm-private-endpoint.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/orm-private-endpoint.tf -------------------------------------------------------------------------------- /terraform/output.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/output.tf -------------------------------------------------------------------------------- /terraform/provider.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/provider.tf -------------------------------------------------------------------------------- /terraform/schema.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/schema.yaml -------------------------------------------------------------------------------- /terraform/tls.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/tls.tf -------------------------------------------------------------------------------- /terraform/topic.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/topic.tf -------------------------------------------------------------------------------- /terraform/validation.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/validation.tf -------------------------------------------------------------------------------- /terraform/variables.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/variables.tf -------------------------------------------------------------------------------- /terraform/versions.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/versions.tf -------------------------------------------------------------------------------- /terraform/via-operator-grafana.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/via-operator-grafana.tf -------------------------------------------------------------------------------- /terraform/via-operator-helm-deployments.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/via-operator-helm-deployments.tf -------------------------------------------------------------------------------- /terraform/via-provider-amd-device-metrics-exporter.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/via-provider-amd-device-metrics-exporter.tf -------------------------------------------------------------------------------- /terraform/via-provider-grafana.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/via-provider-grafana.tf -------------------------------------------------------------------------------- /terraform/via-provider-kube-prometheus-stack.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/via-provider-kube-prometheus-stack.tf -------------------------------------------------------------------------------- /terraform/via-provider-lustre-client.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/via-provider-lustre-client.tf -------------------------------------------------------------------------------- /terraform/via-provider-nginx.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/via-provider-nginx.tf -------------------------------------------------------------------------------- /terraform/via-provider-node-problem-detector.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/via-provider-node-problem-detector.tf -------------------------------------------------------------------------------- /terraform/via-provider-nvidia-dcgm-exporter.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/via-provider-nvidia-dcgm-exporter.tf -------------------------------------------------------------------------------- /terraform/via-provider-oke-ons-webhook.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/HEAD/terraform/via-provider-oke-ons-webhook.tf --------------------------------------------------------------------------------