├── config.example ├── helm │ ├── dcgm-exporter.yml │ ├── metallb.yml │ └── metallb-resources.yml ├── containers │ └── dgx-firmware │ │ └── .gitkeep ├── files │ └── kubeflow │ │ └── user-namespace-params.env ├── pxe │ ├── ipmi_host_list │ ├── ipmi.conf │ └── dnsmasq.extra.conf ├── host_vars │ └── gpu01 ├── requirements.yml ├── env.sh ├── README.md └── playbooks │ └── example.yml ├── workloads ├── services │ └── k8s │ │ ├── dgxie │ │ ├── templates │ │ │ └── NOTES.txt │ │ ├── Chart.yaml │ │ └── .helmignore │ │ └── k8s-dashboard-admin.yml ├── examples │ ├── k8s │ │ ├── kubeflow-pipeline-deploy │ │ │ ├── __init__.py │ │ │ ├── triton.py.tar.gz │ │ │ ├── kubeflow-pipelines-0.PNG │ │ │ ├── kubeflow-pipelines-1.PNG │ │ │ ├── kubeflow-pipelines-2.PNG │ │ │ └── kubeflow-pipelines-3.PNG │ │ ├── services │ │ │ ├── logging │ │ │ │ ├── README.md │ │ │ │ ├── kibana-service.yaml │ │ │ │ └── es-service.yaml │ │ │ ├── hello-world.yml │ │ │ ├── dhcpd.yml │ │ │ ├── nfs-client.yml │ │ │ ├── nfs-dgx-iso.yml │ │ │ ├── ambassador-service.yml │ │ │ └── pxe.yml │ │ ├── dask-rapids │ │ │ ├── parallel-sum.png │ │ │ ├── jupyterlab-nvsmi.png │ │ │ └── k8s │ │ │ │ └── rapids-dask-sa.yml │ │ ├── gpu-test-job.yml │ │ ├── deep-learning-examples │ │ │ ├── templates │ │ │ │ ├── service.yaml │ │ │ │ └── tests │ │ │ │ │ └── test-connection.yaml │ │ │ └── .helmignore │ │ ├── gpu-usage │ │ │ ├── mig-mixed-without-selector.yml │ │ │ ├── mig-mixed-with-selector.yml │ │ │ ├── mig-single.yml │ │ │ ├── gpu-without-selector.yml │ │ │ └── gpu-with-selector.yml │ │ ├── pytorch-job.yml │ │ ├── tensorflow-job.yml │ │ ├── cluster-gpu-test-job.yml │ │ ├── nbody.yml │ │ ├── ingress-nodeport.yml │ │ └── ingress-loadbalancer.yml │ └── slurm │ │ ├── mpi-hello │ │ ├── bootstrap-mpi.yml │ │ ├── hello-job.sh │ │ └── mpi-hello.c │ │ └── dask-rapids │ │ └── files │ │ ├── launch-dask-scheduler.sh │ │ ├── launch-dask-cuda-worker.sh │ │ └── conda-requirements.yml ├── bit │ ├── .gitignore │ └── hpl │ │ ├── syscfg-dgx1v.sh │ │ └── syscfg-dgx2.sh └── jenkins │ └── scripts │ ├── test-cluster-up.sh │ ├── test-setup-slurm.sh │ ├── files │ └── nginx-from-local-registry.yml │ ├── remote-script-for-mpi.sh │ ├── remote-script-for-slurm-gpu.sh │ ├── test-slurm-enroot-job.sh │ ├── remote-script-for-registry-test.sh │ ├── vagrant-startup.sh │ ├── test-ceph.sh │ ├── test-spack-minimal.sh │ ├── test-spack-install.sh │ ├── get-slurm-debug.sh │ ├── test-slurm-gpu.sh │ ├── test-dashboard.sh │ ├── test-mpi-job.sh │ └── test-slurm-nfs-mount.sh ├── roles ├── nhc │ ├── vars │ │ ├── main.yml │ │ ├── redhat.yml │ │ ├── ubuntu.yml │ │ ├── ubuntu-20.04.yml │ │ └── ubuntu-22.04.yml │ ├── .ansible-lint │ ├── meta │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ └── verify.yml │ ├── templates │ │ └── sysconfig_nhc.j2 │ └── defaults │ │ └── main.yml ├── mofed │ ├── vars │ │ ├── ubuntu.yml │ │ ├── rhel8.yml │ │ └── rhel7.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ └── verify.yml │ └── defaults │ │ └── main.yml ├── nfs │ ├── vars │ │ ├── redhat.yml │ │ └── ubuntu.yml │ ├── defaults │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── handlers │ │ └── main.yml │ └── templates │ │ └── exports.j2 ├── slurm │ ├── .ansible-lint │ ├── templates │ │ └── etc │ │ │ ├── munge │ │ │ └── munge.key.j2 │ │ │ ├── ld.so.conf.d │ │ │ └── slurm.conf.j2 │ │ │ ├── slurm │ │ │ ├── cgroup.conf │ │ │ ├── epilog.d │ │ │ │ ├── 80-exclusive-cleanup │ │ │ │ ├── 95-lastuserjob-rootless │ │ │ │ ├── 
60-exclusive-cpu │ │ │ │ ├── 40-lastuserjob-processes │ │ │ │ ├── 41-lastuserjob-ssh │ │ │ │ ├── 42-lastuserjob-cleanup │ │ │ │ └── 50-lastuserjob-all-enroot-dirs │ │ │ ├── prolog.d │ │ │ │ ├── 95-all-rootless │ │ │ │ ├── 50-exclusive-gpu │ │ │ │ ├── 50-exclusive-cpu │ │ │ │ └── 50-exclusive-ssh │ │ │ ├── epilog.sh │ │ │ ├── prolog.sh │ │ │ ├── gres.conf │ │ │ └── shared │ │ │ │ └── bin │ │ │ │ ├── set_gpu_power_levels.sh │ │ │ │ └── set_gpu_clocks.sh │ │ │ ├── localusers │ │ │ └── rsyslog.d │ │ │ └── 99-slurm.conf │ ├── meta │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── tasks │ │ ├── shmfix.yml │ │ ├── logging.yml │ │ ├── setup-user.yml │ │ ├── build-cleanup.yml │ │ ├── munge.yml │ │ ├── undrain.yml │ │ ├── service-files.yml │ │ ├── setup-role.yml │ │ └── misc-node.yml │ ├── vars │ │ ├── ubuntu.yml │ │ └── redhat.yml │ └── handlers │ │ └── main.yml ├── dns-config │ ├── defaults │ │ └── main.yml │ └── templates │ │ └── resolv.conf.j2 ├── netapp-trident │ ├── .ansible-lint │ └── templates │ │ └── namespace.j2 ├── autofs │ ├── templates │ │ └── master.j2 │ ├── handlers │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── move-home-dirs │ ├── defaults │ │ └── main.yml │ ├── vars │ │ └── main.yml │ └── tasks │ │ ├── move_user.yml │ │ └── main.yml ├── nvidia-dgx │ ├── templates │ │ ├── sources.list.j2 │ │ ├── cachefilesd.conf.j2 │ │ ├── dgx.list.j2 │ │ └── dgxos5.list.j2 │ ├── vars │ │ ├── dgxa100.yml │ │ └── redhat.yml │ ├── files │ │ └── nvidia-persistenced-override.conf │ ├── tasks │ │ ├── ubuntu-upgrade.yml │ │ ├── configure-raid.yml │ │ └── main.yml │ └── handlers │ │ └── main.yml ├── easy-build │ ├── meta │ │ └── main.yml │ ├── .ansible-lint │ ├── templates │ │ ├── z01_eb.sh │ │ └── z01_eb.csh │ └── defaults │ │ └── main.yml ├── openmpi │ ├── vars │ │ ├── redhat.yml │ │ └── ubuntu.yml │ ├── templates │ │ └── ld-openmpi.conf.j2 │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── prepare.yml │ │ │ ├── verify.yml │ │ │ └── molecule.yml │ └── defaults │ │ └── main.yml ├── nvidia-gpu-operator-node-prep │ ├── files │ │ ├── nvidia-driver.conf │ │ └── blocklist-nouveau.conf │ └── handlers │ │ └── main.yml ├── nvidia-gpu-operator │ ├── .ansible-lint │ ├── meta │ │ └── main.yml │ ├── templates │ │ ├── client_configuration_token.tok │ │ └── gridd.conf │ └── tasks │ │ └── main.yml ├── cachefilesd │ ├── templates │ │ └── cachefilesd.j2 │ ├── defaults │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ └── tasks │ │ └── main.yml ├── easy-build-packages │ ├── meta │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ ├── redhat-pre-install.yml │ │ └── ubuntu-pre-install.yml │ └── .ansible-lint ├── nvidia-k8s-gpu-device-plugin │ ├── meta │ │ └── main.yml │ ├── .ansible-lint │ └── defaults │ │ └── main.yml ├── nvidia-k8s-gpu-feature-discovery │ ├── meta │ │ └── main.yml │ ├── .ansible-lint │ └── defaults │ │ └── main.yml ├── kerberos_client │ ├── meta │ │ ├── .galaxy_install_info │ │ └── main.yml │ ├── .ansible-lint │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── defaults │ │ └── main.yml │ └── vars │ │ └── main.yml ├── ood-wrapper │ ├── vars │ │ ├── main.yml │ │ ├── ubuntu.yml │ │ └── redhat.yml │ ├── templates │ │ ├── desktop-submit.yml.erb.j2 │ │ ├── desktop.yml.j2 │ │ ├── bc_osc_codeserver │ │ │ ├── submit.yml.erb.j2 │ │ │ ├── manifest.yml.j2 │ │ │ └── form.yml.j2 │ │ ├── desktop-form.yml.j2 
│ │ └── cluster.yml.j2 │ └── tasks │ │ └── main.yml ├── pyxis │ ├── templates │ │ └── etc │ │ │ └── slurm │ │ │ ├── plugstack.conf │ │ │ └── plugstack.conf.d │ │ │ └── pyxis.conf │ ├── defaults │ │ └── main.yml │ └── handlers │ │ └── main.yml ├── facts │ ├── files │ │ ├── memory.fact │ │ └── gpus.fact │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ └── verify.yml │ └── tasks │ │ └── main.yml ├── nis_client │ ├── files │ │ └── policy-rc.d │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ └── handlers │ │ └── main.yml ├── roce_backend │ ├── tasks │ │ └── main.yml │ └── templates │ │ └── config_dp.j2 ├── singularity_wrapper │ ├── .ansible-lint │ ├── requirements.yml │ ├── roles.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── molecule.yml │ ├── meta │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ └── .yamllint ├── rsyslog_client │ ├── handlers │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── templates │ │ └── 99-forward-syslog.conf │ ├── tasks │ │ └── main.yml │ └── defaults │ │ └── main.yml ├── rsyslog_server │ ├── handlers │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── defaults │ │ └── main.yml │ └── templates │ │ └── 01-deepops-listen.conf ├── grafana │ ├── handlers │ │ └── main.yml │ ├── templates │ │ ├── grafana.ini.j2 │ │ ├── prometheus-datasource.yml.j2 │ │ ├── prometheus-dashboard.yml.j2 │ │ └── docker.grafana.service.j2 │ └── defaults │ │ └── main.yml ├── nvidia_hpc_sdk │ ├── templates │ │ ├── z95_nvhpc_modules.csh │ │ ├── z95_nvhpc_modules.sh │ │ ├── z95_nvhpc.csh │ │ └── z95_nvhpc.sh │ └── molecule │ │ └── default │ │ ├── converge.yml │ │ ├── prepare.yml │ │ ├── verify.yml │ │ └── molecule.yml ├── prometheus │ ├── handlers │ │ └── main.yml │ ├── templates │ │ ├── alert_rules.yml.j2 │ │ ├── docker.prometheus.service.j2 │ │ └── prometheus.yml.j2 │ └── defaults │ │ └── main.yml ├── spack │ ├── templates │ │ ├── z00_spack.csh │ │ └── z00_spack.sh │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ └── verify.yml │ └── defaults │ │ └── main.yml ├── alertmanager │ ├── handlers │ │ └── main.yml │ ├── templates │ │ ├── alertmanager.yml.j2 │ │ └── docker.alertmanager.service.j2 │ └── defaults │ │ └── main.yml ├── nvidia-dcgm-exporter │ ├── handlers │ │ └── main.yml │ ├── templates │ │ ├── dcgm-exporter.yml.j2 │ │ └── docker.dcgm-exporter.service.j2 │ └── defaults │ │ └── main.yml ├── nvidia_dcgm │ ├── tasks │ │ ├── install-dgx.yml │ │ ├── install-ubuntu.yml │ │ └── install-redhat.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── vars │ │ └── main.yml │ └── .yamllint ├── nginx-docker-registry-cache │ ├── handlers │ │ └── main.yml │ ├── templates │ │ └── http-proxy.conf │ └── tasks │ │ └── client.yml ├── nvidia-dgx-firmware │ └── tasks │ │ ├── get-time.yml │ │ ├── run-diagnostics.yml │ │ ├── get-health.yml │ │ ├── get-ib.yml │ │ └── get-data.yml ├── prometheus-node-exporter │ ├── handlers │ │ └── main.yml │ ├── templates │ │ ├── node-exporter.yml.j2 │ │ └── docker.node-exporter.service.j2 │ └── defaults │ │ └── main.yml ├── docker-rootless │ ├── templates │ │ ├── z96_rootlessdocker_modules.sh │ │ └── rootless-docker │ │ │ ├── bin │ │ │ └── nvidia-container-runtime-hook │ │ │ └── config │ │ │ └── nvidia-container-runtime │ │ │ └── config.toml │ └── defaults │ │ └── main.yml ├── lmod │ 
├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ └── verify.yml │ └── defaults │ │ └── main.yml ├── openshift │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── prepare.yml │ │ │ ├── verify.yml │ │ │ └── molecule.yml │ └── defaults │ │ └── main.yml ├── nvidia_cuda │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── files │ │ └── cuda-vars.sh │ ├── vars │ │ └── main.yml │ └── tasks │ │ ├── install-dgx.yml │ │ ├── install-redhat.yml │ │ ├── install-ubuntu.yml │ │ └── main.yml ├── prometheus-slurm-exporter │ ├── templates │ │ └── slurm-exporter.yml.j2 │ ├── handlers │ │ └── main.yml │ └── defaults │ │ └── main.yml ├── nvidia-mig-manager │ └── defaults │ │ └── main.yml ├── docker-login │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── nvidia-network-operator │ └── templates │ │ └── values.yaml ├── nfs-client-provisioner │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── standalone-container-registry │ └── templates │ │ └── config.yml ├── nvidia-peer-memory │ └── tasks │ │ └── main.yml └── nvidia-gpu-tests │ ├── README.md │ └── defaults │ └── main.yml ├── playbooks ├── slurm-cluster │ ├── files │ │ └── cve_2021_44228.options │ ├── openmpi.yml │ ├── lmod.yml │ ├── open-ondemand.yml │ ├── nhc.yml │ ├── grafana.yml │ ├── spack-modules.yml │ ├── templates │ │ └── filebeat.conf │ ├── prometheus.yml │ ├── alertmanager.yml │ ├── easybuild-modules.yml │ ├── prometheus-node-exporter.yml │ ├── prometheus-slurm-exporter.yml │ └── nvidia-dcgm-exporter.yml ├── generic │ ├── cachefilesd.yml │ ├── dns-config.yml │ ├── anaconda.yml │ ├── authentication.yml │ ├── ntp-client.yml │ ├── nfs-client.yml │ ├── nfs-server.yml │ ├── rsyslog-client.yml │ ├── rsyslog-server.yml │ ├── chrony-client.yml │ ├── hosts.yml │ └── software.yml ├── provisioning │ └── maas.yml ├── container │ ├── singularity.yml │ ├── standalone-container-registry.yml │ ├── docker-login.yml │ ├── nginx-docker-registry-cache-server.yml │ ├── nginx-docker-registry-cache-client.yml │ ├── docker-rootless.yml │ └── pyxis.yml ├── nvidia-software │ ├── nvidia-dcgm.yml │ ├── nvidia-hpc-sdk.yml │ ├── nvidia-peer-memory.yml │ └── nvidia-driver.yml ├── utilities │ ├── nvidia-gpu-tests.yml │ ├── nvidia-set-gpu-clocks.yml │ ├── gpu-clocks.yml │ └── mofed.yml ├── bootstrap │ ├── bootstrap-openshift.yml │ └── bootstrap-rook.yml ├── nvidia-dgx │ ├── nvidia-dgx.yml │ ├── nvidia-dgx-diag.yml │ └── nvidia-dgx-fw-update.yml └── k8s-cluster │ ├── nvidia-network-operator.yaml │ ├── nvidia-k8s-gpu-device-plugin.yml │ ├── nvidia-k8s-gpu-feature-discovery.yml │ ├── nvidia-gpu-operator.yml │ ├── container-registry.yml │ └── netapp-trident.yml ├── virtual ├── .gitignore ├── vars_files │ ├── virt_k8s.yml │ └── virt_slurm.yml └── k8s_environment.sh ├── docs ├── img │ ├── roce_resnet50.PNG │ ├── nccl_latency_ring.PNG │ ├── nccl_bandwidth_ring.PNG │ ├── slurm_monitoring_grafana01.png │ ├── slurm_monitoring_grafana02.png │ ├── slurm_monitoring_grafana03.png │ ├── slurm_monitoring_grafana04.png │ ├── slurm_monitoring_grafana05.png │ ├── slurm_monitoring_grafana06.png │ ├── slurm_monitoring_alertmanager01.png │ ├── slurm_monitoring_alertmanager02.png │ └── slurm_monitoring_prometheus01.png ├── pxe │ └── maas-example-vms.png ├── slurm-cluster │ ├── ood-images │ │ ├── ood-01.png │ │ ├── ood-02.png │ │ ├── ood-03.png │ │ ├── ood-04.png │ │ ├── ood-05.png │ │ ├── ood-06.png │ │ ├── ood-07.png │ │ ├── ood-08.png │ │ └── ood-09.png │ └── slurm-prolog-epilog │ │ ├── prolog-checkmounts │ 
│ ├── hyperthreadingon │ │ ├── epilog-mps │ │ ├── epilog-dcgmstats │ │ ├── prolog-dcgmstats │ │ ├── prolog-lspci │ │ ├── hyperthreadingoff │ │ ├── epilog-ecc │ │ ├── prolog-dcgmhealth │ │ └── prolog-ecc └── cloud-native │ └── README.md ├── src ├── containers │ ├── dgxie │ │ ├── mboot.efi │ │ └── dnsmasq.conf │ ├── pxe │ │ └── dhcp │ │ │ ├── Dockerfile │ │ │ └── dnsmasq.conf │ └── ngc │ │ ├── pytorch │ │ └── Dockerfile-minimal │ │ ├── tensorflow │ │ └── Dockerfile-minimal │ │ ├── build.sh │ │ └── rapids │ │ └── Dockerfile-minimal └── repo │ ├── ansible-lint │ └── githooks │ ├── pre-commit │ ├── check-python.py │ └── check-shell.py ├── .gitignore ├── scripts ├── deepops │ ├── proxy.sh │ └── enable_linting.sh ├── pxe │ ├── build_and_restart_dgxie.sh │ └── setup_nat.sh └── generic │ ├── gpu_diag.sh │ └── install_docker.sh ├── .gitmodules ├── ansible.cfg └── .github └── workflows └── ansible-lint-roles.yml /config.example/helm/dcgm-exporter.yml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /config.example/containers/dgx-firmware/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /workloads/services/k8s/dgxie/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /roles/nhc/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_ssh_daemon: "sshd" 3 | -------------------------------------------------------------------------------- /roles/mofed/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mofed_distro: "ubuntu" 3 | -------------------------------------------------------------------------------- /roles/nfs/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | nfs_server_daemon: nfs-server 2 | -------------------------------------------------------------------------------- /roles/slurm/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info 3 | -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /roles/dns-config/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dns_config_search: [] 3 | -------------------------------------------------------------------------------- /roles/netapp-trident/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - var-naming 3 | -------------------------------------------------------------------------------- /roles/nfs/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | nfs_server_daemon: nfs-kernel-server 2 | -------------------------------------------------------------------------------- /roles/autofs/templates/master.j2: -------------------------------------------------------------------------------- 1 | {{ autofs_mount }} yp:{{ 
autofs_map }} 2 | -------------------------------------------------------------------------------- /roles/move-home-dirs/defaults/main.yml: -------------------------------------------------------------------------------- 1 | move_home_dirs_new_root: /local 2 | -------------------------------------------------------------------------------- /roles/nhc/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_build_deps: 3 | - gcc 4 | - make 5 | -------------------------------------------------------------------------------- /roles/nhc/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_build_deps: 3 | - build-essential 4 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/templates/sources.list.j2: -------------------------------------------------------------------------------- 1 | {{ dgx_default_ubuntu_repos }} 2 | -------------------------------------------------------------------------------- /roles/easy-build/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - { role: lmod } 4 | -------------------------------------------------------------------------------- /roles/move-home-dirs/vars/main.yml: -------------------------------------------------------------------------------- 1 | tmp_user: ansible-tmp-user-move_home_dirs 2 | -------------------------------------------------------------------------------- /roles/openmpi/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | openmpi_deps: 3 | - "@Development Tools" 4 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/files/cve_2021_44228.options: -------------------------------------------------------------------------------- 1 | -Dlog4j2.formatMsgNoLookups=true 2 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator-node-prep/files/nvidia-driver.conf: -------------------------------------------------------------------------------- 1 | i2c_core 2 | ipmi_msghandler 3 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info 3 | - role-name 4 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - role: openshift 4 | -------------------------------------------------------------------------------- /virtual/.gitignore: -------------------------------------------------------------------------------- 1 | /.vagrant/ 2 | /admin.conf 3 | /config 4 | /k8s-config 5 | Vagrantfile 6 | -------------------------------------------------------------------------------- /roles/cachefilesd/templates/cachefilesd.j2: -------------------------------------------------------------------------------- 1 | RUN={{ 'yes' if cachefilesd_enabled else 'no' }} 2 | -------------------------------------------------------------------------------- /roles/easy-build-packages/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - { role: easy-build 
} 4 | -------------------------------------------------------------------------------- /roles/nvidia-k8s-gpu-device-plugin/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - role: openshift 4 | -------------------------------------------------------------------------------- /docs/img/roce_resnet50.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/roce_resnet50.PNG -------------------------------------------------------------------------------- /roles/nvidia-k8s-gpu-feature-discovery/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - role: openshift 4 | -------------------------------------------------------------------------------- /roles/openmpi/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | openmpi_deps: 3 | - build-essential 4 | - libnuma-dev 5 | -------------------------------------------------------------------------------- /docs/pxe/maas-example-vms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/pxe/maas-example-vms.png -------------------------------------------------------------------------------- /roles/kerberos_client/meta/.galaxy_install_info: -------------------------------------------------------------------------------- 1 | {install_date: 'Tue Nov 8 19:19:09 2016', version: ''} 2 | -------------------------------------------------------------------------------- /roles/nhc/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info # meta/main.yml should contain relevant info 3 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator/templates/client_configuration_token.tok: -------------------------------------------------------------------------------- 1 | "{{ gpu_operator_nvaie_nls_token }}" 2 | -------------------------------------------------------------------------------- /roles/ood-wrapper/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | user: "{{ ansible_env.SUDO_USER | default(ansible_env.USER) }}" 3 | -------------------------------------------------------------------------------- /roles/pyxis/templates/etc/slurm/plugstack.conf: -------------------------------------------------------------------------------- 1 | include {{ slurm_config_dir }}/plugstack.conf.d/*.conf 2 | -------------------------------------------------------------------------------- /roles/pyxis/templates/etc/slurm/plugstack.conf.d/pyxis.conf: -------------------------------------------------------------------------------- 1 | required /usr/local/src/pyxis/spank_pyxis.so 2 | -------------------------------------------------------------------------------- /docs/img/nccl_latency_ring.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/nccl_latency_ring.PNG -------------------------------------------------------------------------------- /playbooks/generic/cachefilesd.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - cachefilesd 6 | 
-------------------------------------------------------------------------------- /playbooks/generic/dns-config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: true 4 | roles: 5 | - dns-config 6 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/openmpi.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - openmpi 6 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/munge/munge.key.j2: -------------------------------------------------------------------------------- 1 | {{ slurm_password|password_hash('sha512', slurm_cluster_name) }} 2 | -------------------------------------------------------------------------------- /src/containers/dgxie/mboot.efi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/src/containers/dgxie/mboot.efi -------------------------------------------------------------------------------- /docs/img/nccl_bandwidth_ring.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/nccl_bandwidth_ring.PNG -------------------------------------------------------------------------------- /playbooks/provisioning/maas.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - role: ansible-maas 6 | -------------------------------------------------------------------------------- /roles/facts/files/memory.fact: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "{ \"total_mb\": $(free -m | grep Mem: | awk '{print $2*0.95}') }" -------------------------------------------------------------------------------- /roles/nhc/vars/ubuntu-20.04.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_build_deps: 3 | - build-essential 4 | 5 | nhc_ssh_daemon: "sshd:" 6 | -------------------------------------------------------------------------------- /roles/nhc/vars/ubuntu-22.04.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_build_deps: 3 | - build-essential 4 | 5 | nhc_ssh_daemon: "sshd:" 6 | -------------------------------------------------------------------------------- /roles/nis_client/files/policy-rc.d: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo "All runlevel operations denied by policy" >&2 3 | exit 101 4 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator-node-prep/files/blocklist-nouveau.conf: -------------------------------------------------------------------------------- 1 | blacklist nouveau 2 | options nouveau modeset=0 3 | -------------------------------------------------------------------------------- /roles/openmpi/templates/ld-openmpi.conf.j2: -------------------------------------------------------------------------------- 1 | {{ openmpi_install_prefix }}/lib 2 | {{ openmpi_install_prefix }}/lib64 3 | -------------------------------------------------------------------------------- /roles/roce_backend/tasks/main.yml: 
-------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for roce_backend role in allhosts.yaml and k8smaster.yaml 3 | -------------------------------------------------------------------------------- /playbooks/container/singularity.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - singularity_wrapper 6 | -------------------------------------------------------------------------------- /playbooks/generic/anaconda.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - andrewrothstein.miniconda 6 | -------------------------------------------------------------------------------- /roles/autofs/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart autofs 3 | service: name=autofs state=restarted enabled=yes 4 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info # meta/main.yml should contain relevant info 3 | -------------------------------------------------------------------------------- /workloads/bit/.gitignore: -------------------------------------------------------------------------------- 1 | tmp 2 | results 3 | slurm*.out 4 | xhpl_cuda-10.1-dyn_mkl-static_ompi-3.1.3_gcc4.8.5_3-12-19b 5 | -------------------------------------------------------------------------------- /config.example/helm/metallb.yml: -------------------------------------------------------------------------------- 1 | --- 2 | controller: 3 | nodeSelector: 4 | node-role.kubernetes.io/control-plane: "" 5 | -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana01.png -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana02.png -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana03.png -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana04.png -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana05.png -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana06.png: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana06.png
--------------------------------------------------------------------------------
/roles/nvidia-dgx/vars/dgxa100.yml:
--------------------------------------------------------------------------------
DGX_SWBUILD_DATE: 2020-06-29
DGX_SWBUILD_VERSION: 4.99.9
DGX_COMMIT_ID: 9f56299
--------------------------------------------------------------------------------
/roles/singularity_wrapper/requirements.yml:
--------------------------------------------------------------------------------
---
roles:
  - name: Setup singularity roles
    include_tasks: roles.yml
--------------------------------------------------------------------------------
/roles/slurm/templates/etc/ld.so.conf.d/slurm.conf.j2:
--------------------------------------------------------------------------------
{{ slurm_install_prefix }}/lib
{{ slurm_install_prefix }}/lib64
--------------------------------------------------------------------------------
/config.example/files/kubeflow/user-namespace-params.env:
--------------------------------------------------------------------------------
user=deepops@example.com
profile-name=kubeflow-deepops-example-com
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-01.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-02.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-03.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-04.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-05.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-06.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-07.png
--------------------------------------------------------------------------------
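The etc/ld.so.conf.d/slurm.conf.j2 template above simply expands slurm_install_prefix into two dynamic-linker search paths. A task pair along the following lines is a typical way such a template gets applied and picked up; this is an illustrative sketch only, and the file name and handler wiring are assumptions rather than the slurm role's actual tasks:

- name: add the Slurm libraries to the dynamic linker search path
  ansible.builtin.template:
    src: etc/ld.so.conf.d/slurm.conf.j2
    dest: /etc/ld.so.conf.d/slurm.conf
    mode: "0644"
  notify: run ldconfig          # assumed handler name
# a matching handler would then refresh the linker cache, e.g.:
# - name: run ldconfig
#   ansible.builtin.command: ldconfig
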
/docs/slurm-cluster/ood-images/ood-08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-08.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-09.png
--------------------------------------------------------------------------------
/playbooks/slurm-cluster/lmod.yml:
--------------------------------------------------------------------------------
---
- hosts: "{{ hostlist | default('all') }}"
  become: yes
  roles:
    - lmod
--------------------------------------------------------------------------------
/roles/easy-build/.ansible-lint:
--------------------------------------------------------------------------------
skip_list: # or 'skip_list' to silence them completely
  - meta-no-info
  - role-name
--------------------------------------------------------------------------------
/roles/netapp-trident/templates/namespace.j2:
--------------------------------------------------------------------------------
apiVersion: v1
kind: Namespace
metadata:
  name: {{ trident_namespace }}
--------------------------------------------------------------------------------
/docs/img/slurm_monitoring_alertmanager01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_alertmanager01.png
--------------------------------------------------------------------------------
/docs/img/slurm_monitoring_alertmanager02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_alertmanager02.png
--------------------------------------------------------------------------------
/docs/img/slurm_monitoring_prometheus01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_prometheus01.png
--------------------------------------------------------------------------------
/roles/nhc/meta/main.yml:
--------------------------------------------------------------------------------
---
dependencies:
  - role: facts

galaxy_info:
  namespace: deepops
  role_name: nhc
--------------------------------------------------------------------------------
/roles/rsyslog_client/handlers/main.yml:
--------------------------------------------------------------------------------
---
- name: restart rsyslog
  service:
    name: rsyslog
    state: restarted
--------------------------------------------------------------------------------
/roles/rsyslog_server/handlers/main.yml:
--------------------------------------------------------------------------------
---
- name: reload rsyslog
  service:
    name: rsyslog
    state: restarted
--------------------------------------------------------------------------------
/src/repo/ansible-lint:
--------------------------------------------------------------------------------
---
exclude_paths:
  - ./roles/galaxy/
  - ./kubespray/
use_default_rules: true
verbosity: 2
--------------------------------------------------------------------------------
/roles/singularity_wrapper/roles.yml:
--------------------------------------------------------------------------------
- src: abims_sbr.singularity
  version: 3.7.1-1
- src: gantsign.golang
  version: 2.4.0
--------------------------------------------------------------------------------
/workloads/examples/k8s/services/logging/README.md:
--------------------------------------------------------------------------------
https://github.com/kubernetes/kubernetes/tree/master/cluster/addons/fluentd-elasticsearch
--------------------------------------------------------------------------------
/playbooks/nvidia-software/nvidia-dcgm.yml:
--------------------------------------------------------------------------------
---
- hosts: "{{ hostlist | default('all') }}"
  become: true
  roles:
    - nvidia_dcgm
--------------------------------------------------------------------------------
/roles/grafana/handlers/main.yml:
--------------------------------------------------------------------------------
---
- name: restart grafana
  service:
    name: "{{ grafana_svc_name }}"
    state: restarted
--------------------------------------------------------------------------------
/roles/nvidia-k8s-gpu-device-plugin/.ansible-lint:
--------------------------------------------------------------------------------
skip_list:
  - meta-no-info # meta/main.yml should contain relevant info
  - role-name
--------------------------------------------------------------------------------
/roles/nvidia-k8s-gpu-feature-discovery/.ansible-lint:
--------------------------------------------------------------------------------
skip_list:
  - meta-no-info # meta/main.yml should contain relevant info
  - role-name
--------------------------------------------------------------------------------
/roles/nvidia_hpc_sdk/templates/z95_nvhpc_modules.csh:
--------------------------------------------------------------------------------
#!/usr/bin/env csh
setenv MODULEPATH "${MODULEPATH}:{{ hpcsdk_install_dir }}/modulefiles"
--------------------------------------------------------------------------------
/virtual/vars_files/virt_k8s.yml:
--------------------------------------------------------------------------------
---
container_registry_persistence_enabled: false
rsyslog_client_tcp_host: "{{ groups['kube-master'][0] }}"
--------------------------------------------------------------------------------
/workloads/examples/k8s/dask-rapids/parallel-sum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/dask-rapids/parallel-sum.png
--------------------------------------------------------------------------------
/playbooks/nvidia-software/nvidia-hpc-sdk.yml:
--------------------------------------------------------------------------------
---
- hosts: "{{ hostlist | default('all') }}"
  become: true
  roles:
    - nvidia_hpc_sdk
--------------------------------------------------------------------------------
/roles/nfs/defaults/main.yml:
--------------------------------------------------------------------------------
nfs_idmapd_domain: localdomain

nfs_is_server: no
nfs_is_client: no

nfs_exports: []
nfs_mounts: []
--------------------------------------------------------------------------------
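The nfs role defaults above are inert on their own: a host only acts as an NFS server or client when nfs_is_server / nfs_is_client is set (the generic nfs-server.yml and nfs-client.yml playbooks do exactly that), and the empty nfs_exports / nfs_mounts lists are meant to be overridden in the deployment's configuration. A minimal override could look like the following sketch; the per-entry keys shown are assumptions about the role's interface, not copied from config.example:

# group_vars for the NFS server group (illustrative)
nfs_exports:
  - path: /export/shared
    options: "*(rw,sync,no_root_squash)"

# group_vars for the client group (illustrative)
nfs_mounts:
  - mountpoint: /shared
    server: nfs-server-01
    path: /export/shared
    options: async,vers=3
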
/roles/prometheus/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart prometheus 3 | service: 4 | name: "{{ prometheus_svc_name }}" 5 | state: restarted 6 | -------------------------------------------------------------------------------- /roles/spack/templates/z00_spack.csh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env csh 2 | 3 | setenv SPACK_ROOT {{ spack_install_dir }} 4 | source $SPACK_ROOT/share/spack/setup-env.csh 5 | -------------------------------------------------------------------------------- /roles/spack/templates/z00_spack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export SPACK_ROOT="{{ spack_install_dir }}" 4 | . "${SPACK_ROOT}/share/spack/setup-env.sh" 5 | -------------------------------------------------------------------------------- /playbooks/nvidia-software/nvidia-peer-memory.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: true 4 | roles: 5 | - nvidia-peer-memory 6 | -------------------------------------------------------------------------------- /roles/alertmanager/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart alertmanager 3 | service: 4 | name: "{{ alertmanager_svc_name }}" 5 | state: restarted 6 | -------------------------------------------------------------------------------- /roles/nvidia-dcgm-exporter/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart dcgm 3 | service: 4 | name: "{{ nvidia_dcgm_svc_name }}" 5 | state: restarted 6 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/tasks/install-dgx.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install DCGM from repos 3 | package: 4 | name: "datacenter-gpu-manager" 5 | state: present 6 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/templates/z95_nvhpc_modules.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export MODULEPATH="${MODULEPATH:+$MODULEPATH:}{{ hpcsdk_install_dir }}/modulefiles" 3 | -------------------------------------------------------------------------------- /workloads/examples/k8s/dask-rapids/jupyterlab-nvsmi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/dask-rapids/jupyterlab-nvsmi.png -------------------------------------------------------------------------------- /workloads/services/k8s/dgxie/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for Kubernetes 4 | name: dgxie 5 | version: 0.1.2 6 | -------------------------------------------------------------------------------- /roles/easy-build-packages/defaults/main.yml: -------------------------------------------------------------------------------- 1 | sm_files_path: "{{ sm_prefix }}/easybuild_files" 2 | sm_files_repo_version: "4ef7ae6cc2284f69412a8db5e10dddd92024eeab" 3 | 
-------------------------------------------------------------------------------- /roles/slurm/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - role: facts 4 | - role: rsyslog_client 5 | 6 | galaxy_info: 7 | namespace: deepops 8 | role_name: slurm 9 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/cgroup.conf: -------------------------------------------------------------------------------- 1 | CgroupAutomount=yes 2 | 3 | ConstrainCores=yes 4 | ConstrainDevices=yes 5 | ConstrainRAMSpace=yes 6 | #TaskAffinity=yes 7 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/files/nvidia-persistenced-override.conf: -------------------------------------------------------------------------------- 1 | [Service] 2 | ExecStart= 3 | ExecStart=/usr/bin/nvidia-persistenced --user root --persistence-mode --verbose 4 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/80-exclusive-cleanup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | # Cleanup caches 5 | sync 6 | echo 3 > /proc/sys/vm/drop_caches 7 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-cluster-up.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source workloads/jenkins/scripts/jenkins-common.sh 3 | 4 | cd virtual || exit 1 5 | bash ./cluster_up.sh 6 | -------------------------------------------------------------------------------- /roles/nfs/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include nfs" 6 | include_role: 7 | name: "nfs" 8 | -------------------------------------------------------------------------------- /roles/nginx-docker-registry-cache/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart docker 3 | systemd: 4 | name: "docker" 5 | state: "restarted" 6 | daemon_reload: yes 7 | -------------------------------------------------------------------------------- /roles/nhc/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include nhc" 6 | include_role: 7 | name: "nhc" 8 | -------------------------------------------------------------------------------- /roles/nvidia-dgx-firmware/tasks/get-time.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Get current system time 3 | shell: "now=$(date '+%Y%m%d-%H%M%S') && echo $now" 4 | register: current_time 5 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | dgx_repo_dir: "rhel{{ ansible_distribution_major_version }}" 2 | 3 | dgx_extra_packages: 4 | - dgx-conf-cachefilesd 5 | - kernel-headers 6 | -------------------------------------------------------------------------------- /roles/prometheus-node-exporter/handlers/main.yml: -------------------------------------------------------------------------------- 1 
| --- 2 | - name: restart node-exporter 3 | service: 4 | name: "{{ node_exporter_svc_name }}" 5 | state: restarted 6 | -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/triton.py.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/kubeflow-pipeline-deploy/triton.py.tar.gz -------------------------------------------------------------------------------- /playbooks/generic/authentication.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - move-home-dirs 6 | - kerberos_client 7 | - nis_client 8 | - autofs 9 | -------------------------------------------------------------------------------- /roles/docker-rootless/templates/z96_rootlessdocker_modules.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export MODULEPATH="${MODULEPATH:+$MODULEPATH:}{{ rootlessdocker_install_dir }}/modulefiles" 3 | -------------------------------------------------------------------------------- /roles/facts/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include facts" 6 | include_role: 7 | name: "facts" 8 | -------------------------------------------------------------------------------- /roles/grafana/templates/grafana.ini.j2: -------------------------------------------------------------------------------- 1 | [security] 2 | admin_user = {{ grafana_cfg_user }} 3 | admin_password = {{ grafana_cfg_pass }} 4 | 5 | [auth.anonymous] 6 | enabled = true 7 | -------------------------------------------------------------------------------- /roles/lmod/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include lmod" 6 | include_role: 7 | name: "lmod" 8 | -------------------------------------------------------------------------------- /roles/mofed/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include mofed" 6 | include_role: 7 | name: "mofed" 8 | -------------------------------------------------------------------------------- /roles/slurm/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include slurm" 6 | include_role: 7 | name: "slurm" 8 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/localusers: -------------------------------------------------------------------------------- 1 | root 2 | {{ ansible_env.SUDO_USER | default(ansible_env.USER) }} 3 | {% for user in slurm_allow_ssh_user %} 4 | {{ user }} 5 | {% endfor %} 6 | -------------------------------------------------------------------------------- /roles/spack/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include spack" 6 | include_role: 7 | name: "spack" 8 | 
-------------------------------------------------------------------------------- /playbooks/generic/ntp-client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: true 4 | tasks: 5 | - name: Configure NTP client 6 | include_role: 7 | name: geerlingguy.ntp 8 | -------------------------------------------------------------------------------- /roles/openmpi/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include openmpi" 6 | include_role: 7 | name: "openmpi" 8 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/prolog.d/95-all-rootless: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | command -v singularity || exit 0 3 | /usr/local/bin/singularity config fakeroot -a "${SLURM_JOB_USER}" 4 | -------------------------------------------------------------------------------- /roles/cachefilesd/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | cachefilesd_package_state: present 3 | cachefilesd_enabled: present 4 | cachefilesd_cache_dir: /var/cache/fscache 5 | cachefilesd_cache_tag: null 6 | -------------------------------------------------------------------------------- /roles/kerberos_client/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info # meta/main.yml should contain relevant info 3 | - meta-no-tags # Tags must contain lowercase letters and digits only 4 | -------------------------------------------------------------------------------- /config.example/pxe/ipmi_host_list: -------------------------------------------------------------------------------- 1 | # This configuration file is used while rebooting DGX servers into PXE boot 2 | # This information is used to connect to the DGX BMC 3 | 10.0.0.1 4 | 10.0.0.2 -------------------------------------------------------------------------------- /playbooks/generic/nfs-client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: yes 4 | roles: 5 | - { role: nfs, nfs_is_client: yes } 6 | tags: 7 | - nfs_client 8 | -------------------------------------------------------------------------------- /playbooks/generic/nfs-server.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: yes 4 | roles: 5 | - { role: nfs, nfs_is_server: yes } 6 | tags: 7 | - nfs_server 8 | -------------------------------------------------------------------------------- /playbooks/generic/rsyslog-client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: yes 4 | roles: 5 | - rsyslog_client 6 | tags: 7 | - rsyslog-client 8 | - rsyslog 9 | -------------------------------------------------------------------------------- /playbooks/generic/rsyslog-server.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: yes 4 | roles: 5 | - rsyslog_server 6 | tags: 7 | - rsyslog-server 8 | - rsyslog 9 | 
-------------------------------------------------------------------------------- /roles/nis_client/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include nis_client" 6 | include_role: 7 | name: "nis_client" 8 | -------------------------------------------------------------------------------- /roles/openshift/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include openshift" 6 | include_role: 7 | name: "openshift" 8 | -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-0.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-0.PNG -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-1.PNG -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-2.PNG -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-3.PNG -------------------------------------------------------------------------------- /roles/cachefilesd/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include cachefilesd" 6 | include_role: 7 | name: "cachefilesd" 8 | -------------------------------------------------------------------------------- /roles/easy-build-packages/tasks/redhat-pre-install.yml: -------------------------------------------------------------------------------- 1 | # 2 | # install software modules using EasyBuild: 3 | # 4 | --- 5 | - name: "RHEL install" 6 | debug: 7 | msg: "No installs for RHEL" 8 | 9 | -------------------------------------------------------------------------------- /roles/nginx-docker-registry-cache/templates/http-proxy.conf: -------------------------------------------------------------------------------- 1 | [Service] 2 | Environment="HTTP_PROXY={{ nginx_docker_cache_proxy_url }}" 3 | Environment="HTTPS_PROXY={{ nginx_docker_cache_proxy_url }}" 4 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/templates/cachefilesd.conf.j2: -------------------------------------------------------------------------------- 1 | dir {{ cachefilesd_cache_dir }} 2 | tag {{ cachefilesd_cache_tag }} 3 | brun {{ cachefilesd_cache_brun }} 4 | bcull 
{{ cachefilesd_cache_bcull }} 5 | 6 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/templates/dgx.list.j2: -------------------------------------------------------------------------------- 1 | deb {{ nvidia_dgx_ubuntu_baseurl }}/{{ ansible_distribution_release | lower }} {{ ansible_distribution_release | lower }} main multiverse restricted universe 2 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include nvidia_cuda" 6 | include_role: 7 | name: "nvidia_cuda" 8 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include nvidia_dcgm" 6 | include_role: 7 | name: "nvidia_dcgm" 8 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/95-lastuserjob-rootless: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | command -v singularity || exit 0 3 | /usr/local/bin/singularity config fakeroot -r "${SLURM_JOB_USER}" || true 4 | -------------------------------------------------------------------------------- /roles/mofed/vars/rhel8.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mofed_distro: "rhel" 3 | mofed_pkg_prereqs: 4 | - tcsh 5 | - gcc-gfortran 6 | - numactl-libs 7 | - kernel-modules-extra 8 | - tcl 9 | - tk 10 | - fuse-libs 11 | -------------------------------------------------------------------------------- /roles/nvidia-dcgm-exporter/templates/dcgm-exporter.yml.j2: -------------------------------------------------------------------------------- 1 | - targets: [{{ groups['slurm-node'] | zip_longest([], fillvalue=':9400') | map('join') | join(',') }}] 2 | labels: 3 | module: dcgm-exporter 4 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/files/cuda-vars.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PATH="/usr/local/cuda/bin${PATH:+:${PATH}}" 4 | export LD_LIBRARY_PATH="/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" 5 | -------------------------------------------------------------------------------- /roles/grafana/templates/prometheus-datasource.yml.j2: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: Prometheus 5 | type: prometheus 6 | access: proxy 7 | url: http://localhost:9090 8 | isDefault: true 9 | -------------------------------------------------------------------------------- /roles/kerberos_client/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include kerberos_client" 6 | include_role: 7 | name: "kerberos_client" 8 | -------------------------------------------------------------------------------- /roles/prometheus-node-exporter/templates/node-exporter.yml.j2: -------------------------------------------------------------------------------- 1 | - 
targets: [{{ groups['slurm-node'] | zip_longest([], fillvalue=':9100') | map('join') | join(',') }}] 2 | labels: 3 | module: node-exporter 4 | -------------------------------------------------------------------------------- /roles/rsyslog_client/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include rsyslog_client" 6 | include_role: 7 | name: "rsyslog_client" 8 | -------------------------------------------------------------------------------- /roles/rsyslog_server/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include rsyslog_server" 6 | include_role: 7 | name: "rsyslog_server" 8 | -------------------------------------------------------------------------------- /config.example/pxe/ipmi.conf: -------------------------------------------------------------------------------- 1 | # This configuration file is used while rebooting DGX servers into PXE boot 2 | # This information is used to connect to the DGX BMC 3 | IPMI_USERNAME=dgxuser 4 | IPMI_PASSWORD=dgxuser -------------------------------------------------------------------------------- /roles/easy-build/templates/z01_eb.sh: -------------------------------------------------------------------------------- 1 | export EASYBUILD_PREFIX={{ sm_prefix }} 2 | export EASYBUILD_MODULES_TOOL=Lmod 3 | module purge 4 | unset $(env | grep EBROOT | awk -F'=' '{print $1}') 5 | module load EasyBuild 6 | -------------------------------------------------------------------------------- /roles/slurm/tasks/shmfix.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Fix RemoveIPC 4 | lineinfile: 5 | dest: /etc/systemd/logind.conf 6 | regexp: '^#RemoveIPC=yes' 7 | line: 'RemoveIPC=no' 8 | notify: "restart logind" 9 | -------------------------------------------------------------------------------- /roles/easy-build-packages/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info # meta/main.yml should contain relevant info 3 | - no-changed-when # Commands should not change things if nothing needs doing 4 | - role-name 5 | -------------------------------------------------------------------------------- /roles/easy-build/templates/z01_eb.csh: -------------------------------------------------------------------------------- 1 | setenv EASYBUILD_PREFIX {{ sm_prefix }} 2 | setenv EASYBUILD_MODULES_TOOL Lmod 3 | module purge 4 | unset $(env | grep EBROOT | awk -F'=' '{print $1}') 5 | module load EasyBuild 6 | -------------------------------------------------------------------------------- /playbooks/utilities/nvidia-gpu-tests.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: [ kube-node, slurm-node ] 3 | gather_facts: no 4 | tasks: 5 | - name: Include NVIDIA GPU tests role 6 | include_role: 7 | name: nvidia-gpu-tests 8 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | become: yes 5 | tasks: 6 | - name: "Include nvidia_hpc_sdk" 7 | include_role: 8 | name: "nvidia_hpc_sdk" 9 | 
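(Illustration only; this is not a file in the repository.) The dcgm-exporter and node-exporter templates above build their Prometheus target lists with Jinja2's zip_longest filter: the inventory group is zipped against an empty list with the port suffix as fillvalue, each pair is joined into host:port, and the results are joined with commas. Assuming a hypothetical slurm-node group containing node01 and node02, the node-exporter template would render as:

  - targets: [node01:9100,node02:9100]
    labels:
      module: node-exporter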
-------------------------------------------------------------------------------- /roles/prometheus-slurm-exporter/templates/slurm-exporter.yml.j2: -------------------------------------------------------------------------------- 1 | - targets: [{{ groups[slurm_exporter_host_group] | zip_longest([], fillvalue=':8080') | map('join') | join(',') }}] 2 | labels: 3 | module: slurm-exporter 4 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include singularity_wrapper" 6 | include_role: 7 | name: "singularity_wrapper" 8 | -------------------------------------------------------------------------------- /playbooks/bootstrap/bootstrap-openshift.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: kube-master 3 | become: true 4 | tasks: 5 | - name: Install required Python OpenShift packages/libraries 6 | include_role: 7 | name: openshift 8 | -------------------------------------------------------------------------------- /roles/easy-build/defaults/main.yml: -------------------------------------------------------------------------------- 1 | eb_tmp_dir: /tmp/easybuild 2 | eb_bootstrap_url: 'https://raw.githubusercontent.com/easybuilders/easybuild-framework/49533e6ef8f3ca27f984eeb212e157874cae9183/easybuild/scripts/bootstrap_eb.py' 3 | -------------------------------------------------------------------------------- /roles/mofed/vars/rhel7.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mofed_distro: "rhel" 3 | mofed_pkg_prereqs: 4 | - pciutils-libs 5 | - numactl-libs 6 | - gcc-gfortran 7 | - tcsh 8 | - libusbx 9 | - libnl3 10 | - tcl 11 | - fuse-libs 12 | - tk 13 | -------------------------------------------------------------------------------- /roles/docker-rootless/templates/rootless-docker/bin/nvidia-container-runtime-hook: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | /usr/bin/nvidia-container-runtime-hook -config="{{ rootlessdocker_install_dir }}/config/nvidia-container-runtime/config.toml" "$@" 4 | -------------------------------------------------------------------------------- /roles/grafana/templates/prometheus-dashboard.yml.j2: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'default' 5 | folder: '' 6 | type: file 7 | editable: true 8 | options: 9 | path: {{ grafana_cfg_dashboard_path }} 10 | -------------------------------------------------------------------------------- /roles/rsyslog_client/templates/99-forward-syslog.conf: -------------------------------------------------------------------------------- 1 | {% if rsyslog_client_tcp_host is defined -%} 2 | action(type="omfwd" Target="{{ rsyslog_client_tcp_host }}" Port="{{ rsyslog_client_tcp_port }}" Protocol="tcp") 3 | {% endif -%} 4 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/hello-world.yml: -------------------------------------------------------------------------------- 1 | kind: Pod 2 | apiVersion: v1 3 | metadata: 4 | name: hello-world 5 | spec: 6 | containers: 7 | - name: hello-world 8 | image: hello-world 9 | restartPolicy: Never 10 | 
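(Usage sketch; the kubectl commands are standard, but this workflow is not spelled out in the repository.) The hello-world pod manifest above can be exercised directly against a working cluster:

  kubectl apply -f workloads/examples/k8s/services/hello-world.yml
  kubectl logs hello-world        # shows the hello-world image output once the container has run
  kubectl delete pod hello-world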
-------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/prolog.d/50-exclusive-gpu: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | command -v nvidia-smi || exit 0 5 | 6 | /etc/slurm/shared/bin/set_gpu_power_levels.sh max 7 | /etc/slurm/shared/bin/set_gpu_clocks.sh max 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ansible 2 | *.retry 3 | 4 | # misc. 5 | .*.swp 6 | 7 | # project-specific 8 | /admin.conf 9 | /config*/ 10 | !/config.example/ 11 | /roles/galaxy/ 12 | /collections/* 13 | /k8s-config/ 14 | /kubectl 15 | /tridentctl 16 | -------------------------------------------------------------------------------- /config.example/host_vars/gpu01: -------------------------------------------------------------------------------- 1 | --- 2 | # Example of a host-specific variable file 3 | # These variables would only be used for a host named "gpu01" 4 | 5 | # Don't install Singularity on this host 6 | #slurm_cluster_install_singularity: false 7 | -------------------------------------------------------------------------------- /roles/nfs/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart rpcbind 3 | service: 4 | name: rpcbind 5 | state: restarted 6 | 7 | - name: restart nfs 8 | service: 9 | name: "{{ nfs_server_daemon }}" 10 | state: restarted 11 | -------------------------------------------------------------------------------- /roles/openmpi/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | -------------------------------------------------------------------------------- /roles/openshift/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | -------------------------------------------------------------------------------- /scripts/deepops/proxy.sh: -------------------------------------------------------------------------------- 1 | # edit the proxy details 2 | # 3 | # export http_proxy="http://10.0.2.5:3128" 4 | # export https_proxy="http://10.0.2.5:3128" 5 | # export no_proxy="localhost,cluster.local,127.0.0.1,::1,10.0.2.10,10.0.2.20,10.0.2.30" 6 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-setup-slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source workloads/jenkins/scripts/jenkins-common.sh 3 | 4 | cd virtual || exit 1 5 | export DEEPOPS_DISABLE_K8S=true 6 | export DEEPOPS_ENABLE_SLURM=true 7 | bash ./cluster_up.sh 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "kubespray"] 2 | path = submodules/kubespray 3 | url = https://github.com/kubernetes-sigs/kubespray.git 4 | [submodule "packer-maas"] 5 | path = 
submodules/packer-maas 6 | url = https://github.com/DeepOps/packer-maas.git 7 | -------------------------------------------------------------------------------- /playbooks/nvidia-dgx/nvidia-dgx.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: true 4 | tasks: 5 | - name: Include NVIDIA DGX role 6 | include_role: 7 | name: nvidia-dgx 8 | environment: "{{proxy_env if proxy_env is defined else {}}}" 9 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/60-exclusive-cpu: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | command -v cpupower || exit 0 5 | 6 | cpupower frequency-info | grep -e "governors: Not Available" && exit 0 7 | cpupower frequency-set -g powersave 8 | -------------------------------------------------------------------------------- /roles/dns-config/templates/resolv.conf.j2: -------------------------------------------------------------------------------- 1 | # {{ ansible_managed }} 2 | 3 | {% for server in dns_config_servers %} 4 | nameserver {{ server }} 5 | {% endfor %} 6 | {% if dns_config_search %} 7 | search {{ dns_config_search | join(' ') }} 8 | {% endif %} 9 | -------------------------------------------------------------------------------- /roles/facts/files/gpus.fact: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if ! command -v lspci >/dev/null 2>&1; then 3 | echo lspci not installed 4 | exit 1 5 | fi 6 | count=$(lspci | grep -E "(3D|VGA compatible) controller: NVIDIA" --count) 7 | echo "{ \"count\": $count }" 8 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/prolog.d/50-exclusive-cpu: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | command -v cpupower || exit 0 5 | 6 | cpupower frequency-info | grep -e "governors: Not Available" && exit 0 7 | cpupower frequency-set -g performance 8 | -------------------------------------------------------------------------------- /roles/docker-rootless/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Install for rootless docker. 3 | 4 | # Directory to install in 5 | sm_prefix: "/sw" 6 | sm_software_path: "{{ sm_prefix }}/software" 7 | rootlessdocker_install_dir: "{{ sm_software_path }}/rootless-docker" 8 | -------------------------------------------------------------------------------- /roles/facts/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 
3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/mofed/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nfs/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nhc/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/vars/main.yml: -------------------------------------------------------------------------------- 1 | _ubuntu_repo_dir: "{{ ansible_distribution | lower }}{{ ansible_distribution_version | replace('.', '') }}/{{ ansible_architecture }}" 2 | _rhel_repo_dir: "rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}" 3 | -------------------------------------------------------------------------------- /roles/slurm/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/spack/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/cachefilesd/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nis_client/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 
3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/openmpi/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/openshift/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/kerberos_client/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/rsyslog_client/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/rsyslog_server/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 
3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /src/containers/pxe/dhcp/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | MAINTAINER Douglas Holt 4 | 5 | RUN apt-get update && \ 6 | apt-get -y install dnsmasq 7 | 8 | VOLUME /etc/dnsmasq.d 9 | 10 | #ENTRYPOINT ["dnsmasq"] 11 | CMD ["dnsmasq", "-d"] 12 | -------------------------------------------------------------------------------- /config.example/requirements.yml: -------------------------------------------------------------------------------- 1 | # Put custom Ansible Galaxy requirements here 2 | # Galaxy roles that are used by the DeepOps roles can be found in roles/requirements.yml 3 | # https://docs.ansible.com/ansible/latest/galaxy/user_guide.html 4 | --- 5 | collections: 6 | roles: 7 | -------------------------------------------------------------------------------- /playbooks/container/standalone-container-registry.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: true 7 | become_method: sudo 8 | roles: 9 | - standalone-container-registry 10 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/open-ondemand.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: slurm-master[0] 3 | become: yes 4 | roles: 5 | - {role: ood-wrapper, ood_is_server: yes } 6 | 7 | - hosts: slurm-node 8 | become: yes 9 | roles: 10 | - {role: ood-wrapper, ood_is_client: yes } 11 | -------------------------------------------------------------------------------- /roles/nginx-docker-registry-cache/tasks/client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Set up Ubuntu client 3 | include_tasks: client-ubuntu.yml 4 | when: ansible_distribution == 'Ubuntu' 5 | 6 | - name: Set up RHEL client 7 | include_tasks: client-el.yml 8 | when: ansible_os_family == 'RedHat' 9 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/40-lastuserjob-processes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | if [ "$SLURM_JOB_USER" != root ]; then 5 | if killall -9 -u "$SLURM_JOB_USER" ; then 6 | logger -s -t slurm-epilog 'Killed residual user processes' 7 | fi 8 | fi 9 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/41-lastuserjob-ssh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | if grep -q -w "$SLURM_JOB_USER" /etc/slurm/localusers.backup ; then 5 | exit 0 # don't revoke access for these users 6 | fi 7 | 8 | sed -i "/${SLURM_JOB_USER}/d" /etc/localusers 9 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/prolog.d/50-exclusive-ssh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | cp /etc/slurm/localusers.backup /etc/localusers 5 | 6 | if ! 
grep -q -w "$SLURM_JOB_USER" /etc/slurm/localusers.backup ; then 7 | echo "$SLURM_JOB_USER" >>/etc/localusers 8 | fi 9 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/nhc.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: true 4 | tasks: 5 | - name: Gather custom facts 6 | include_role: 7 | name: facts 8 | - name: Install Node Health Check 9 | include_role: 10 | name: nhc 11 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/grafana.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure grafana 9 | include_role: 10 | name: grafana 11 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dcgm_is_dgx: false 3 | _ubuntu_repo_dir: "{{ ansible_distribution | lower }}{{ ansible_distribution_version | replace('.', '') }}/{{ ansible_architecture }}" 4 | _rhel_repo_dir: "rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}" 5 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | galaxy_info: 3 | role_name: singularity_wrapper 4 | namespace: deepops 5 | author: DeepOps Team 6 | company: NVIDIA 7 | description: Wrap lecourguille.singularity role 8 | license: 3-Clause BSD 9 | min_ansible_version: 2.9 10 | -------------------------------------------------------------------------------- /roles/slurm/tasks/logging.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: import slurm logs into rsyslog 3 | template: 4 | src: "etc/rsyslog.d/99-slurm.conf" 5 | dest: "/etc/rsyslog.d/99-slurm.conf" 6 | owner: "root" 7 | group: "root" 8 | mode: "0644" 9 | notify: 10 | - restart rsyslog 11 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/files/nginx-from-local-registry.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: nginx-registry-local 5 | spec: 6 | containers: 7 | - name: nginx 8 | image: registry.local:31500/nginx:1.21 9 | hostNetwork: true 10 | dnsPolicy: Default 11 | -------------------------------------------------------------------------------- /playbooks/container/docker-login.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Ensure Docker is installed and configured 3 | - name: Install docker 4 | import_playbook: docker.yml 5 | 6 | # Log into Docker registries 7 | - hosts: "{{ hostlist | default('all') }}" 8 | become: yes 9 | roles: 10 | - docker-login 11 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/spack-modules.yml: -------------------------------------------------------------------------------- 1 | # Deploy the Spack package manager 2 | --- 3 | - hosts: "{{ hostlist | default('all') }}" 4 | become: yes 5 | roles: 6 | - lmod 7 | 8 | - 
hosts: "{{ hostlist | default('slurm-master[0]') }}" 9 | become: yes 10 | roles: 11 | - spack 12 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/templates/filebeat.conf: -------------------------------------------------------------------------------- 1 | input { 2 | beats { 3 | port => {{ filebeat_port }} 4 | } 5 | } 6 | 7 | output { 8 | elasticsearch { 9 | hosts => ["http://localhost:9200"] 10 | index => "%{[@metadata][beat]}-%{[@metadata][version]}" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /roles/nhc/templates/sysconfig_nhc.j2: -------------------------------------------------------------------------------- 1 | PATH=/sbin:/usr/sbin:/bin:/usr/bin:{{ slurm_install_prefix|default('/usr/local') }}/bin 2 | NHC_RM=slurm 3 | SLURM_SINFO={{ slurm_install_prefix|default('/usr/local') }}/bin/sinfo 4 | SLURM_SCONTROL={{ slurm_install_prefix|default('/usr/local') }}/bin/scontrol 5 | -------------------------------------------------------------------------------- /roles/nvidia-mig-manager/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mig_manager_url_deb: https://github.com/NVIDIA/mig-parted/releases/download/v0.4.2/nvidia-mig-manager_0.4.2-1_amd64.deb 3 | mig_manager_url_rpm: https://github.com/NVIDIA/mig-parted/releases/download/v0.4.2/nvidia-mig-manager-0.4.2-1.x86_64.rpm 4 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/prometheus.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure prometheus 9 | include_role: 10 | name: prometheus 11 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/desktop-submit.yml.erb.j2: -------------------------------------------------------------------------------- 1 | --- 2 | script: 3 | native: 4 | - "--gpus=<%= bc_num_gpus.blank? ? 
{{ ood_desktop_app_def_gpus }} : bc_num_gpus.to_i %>" 5 | - "--cpus-per-gpu={{ ood_desktop_app_cpus_per_gpu }}" 6 | - "--mem-per-gpu={{ ood_desktop_app_mem_per_gpu }}" 7 | -------------------------------------------------------------------------------- /roles/rsyslog_server/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rsyslog_server_tcp_port: 514 3 | rsyslog_server_udp_port: 514 4 | rsyslog_enable_journal: yes 5 | rsyslog_log_file_path: "/var/log/deepops-hosts" 6 | rsyslog_log_file_path_pattern: "{{ rsyslog_log_file_path }}/%HOSTNAME%/%$YEAR%-%$MONTH%-%$DAY%/syslog.log" 7 | -------------------------------------------------------------------------------- /playbooks/generic/chrony-client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Chrony will ensure that the clocks of all your servers are up to sync and can be used to sync with an internal server 3 | - hosts: all 4 | become: true 5 | tasks: 6 | - name: Configure Chrony client 7 | include_role: 8 | name: DeepOps.chrony 9 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/alertmanager.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure alertmanager 9 | include_role: 10 | name: alertmanager 11 | 12 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/tasks/ubuntu-upgrade.yml: -------------------------------------------------------------------------------- 1 | # OTA upgrade stuff 2 | - name: perform OTA upgrade to latest release (this takes a while) 3 | apt: 4 | upgrade: full 5 | update_cache: yes 6 | dpkg_options: "force-confdef,force-confold" 7 | 8 | - name: reboot after full OTA upgrade 9 | reboot: 10 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | logger -s -t slurm-epilog "START user=$SLURM_JOB_USER job=$SLURM_JOB_ID" 5 | {{ slurm_config_dir }}/shared/bin/run-parts.sh {{ slurm_config_dir }}/epilog.d 6 | logger -s -t slurm-epilog "END user=$SLURM_JOB_USER job=$SLURM_JOB_ID" 7 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/prolog.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | logger -s -t slurm-prolog "START user=$SLURM_JOB_USER job=$SLURM_JOB_ID" 5 | {{ slurm_config_dir }}/shared/bin/run-parts.sh {{ slurm_config_dir }}/prolog.d 6 | logger -s -t slurm-prolog "END user=$SLURM_JOB_USER job=$SLURM_JOB_ID" 7 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/nvidia-network-operator.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | ## Playbook for installing nvidia-network-operator 3 | # 4 | - hosts: kube-master[0] 5 | become: true 6 | become_method: sudo 7 | tasks: 8 | - include_role: 9 | name: nvidia-network-operator 10 | tasks_from: main 11 | -------------------------------------------------------------------------------- 
/workloads/jenkins/scripts/remote-script-for-mpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # 3 | # Test compiling and running an MPI program using NVIDIA HPC SDK 4 | 5 | set -x 6 | set -euo pipefail 7 | 8 | module load nvhpc 9 | 10 | mpicc -o "${HOME}/hello" "${HOME}/mpi-hello.c" 11 | 12 | srun --mpi=pmix -n2 "${HOME}/hello" 13 | -------------------------------------------------------------------------------- /roles/lmod/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | # 4 | # TODO: Add test to check for lmod bug in Ubuntu 5 | 6 | - name: Verify 7 | hosts: all 8 | gather_facts: false 9 | tasks: 10 | - name: Example assertion 11 | assert: 12 | that: true 13 | -------------------------------------------------------------------------------- /roles/prometheus-slurm-exporter/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart slurm-exporter 3 | service: 4 | name: "{{ slurm_exporter_svc_name }}" 5 | state: restarted 6 | 7 | - name: restart grafana 8 | service: 9 | name: "{{ grafana_svc_name }}" 10 | state: restarted 11 | failed_when: false 12 | -------------------------------------------------------------------------------- /roles/easy-build-packages/tasks/ubuntu-pre-install.yml: -------------------------------------------------------------------------------- 1 | # 2 | # install software modules using EasyBuild: 3 | # 4 | --- 5 | - name: "install prerequisite software" 6 | become: true 7 | apt: 8 | name: 9 | - libssl-dev 10 | - build-essential 11 | - libsysfs-dev 12 | - libibverbs-dev 13 | 14 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator/templates/gridd.conf: -------------------------------------------------------------------------------- 1 | # See the official documentation for more details: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html 2 | # Description: Set License Server Address 3 | # Data type: string 4 | # Format: "
" 5 | ServerAddress="{{ vgpu_grid_license_server }}" 6 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/templates/z95_nvhpc.csh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env csh 2 | 3 | setenv NVARCH `uname -s`_`uname -m` 4 | setenv NVCOMPILERS {{ hpcsdk_install_dir }} 5 | setenv MANPATH "$MANPATH":$NVCOMPILERS/$NVARCH/{{ hpcsdk_version_dir }}/compilers/man 6 | set path = ($NVCOMPILERS/$NVARCH/{{ hpcsdk_version_dir }}/compilers/bin $path) 7 | -------------------------------------------------------------------------------- /virtual/k8s_environment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Set up local environment to work with virtual k8s cluster 4 | 5 | K8S_CONFIG_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )/config" 6 | 7 | export KUBECONFIG="${K8S_CONFIG_DIR}/artifacts/admin.conf" 8 | export PATH="${K8S_CONFIG_DIR}/artifacts:${PATH}" 9 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/easybuild-modules.yml: -------------------------------------------------------------------------------- 1 | # 2 | # install software modules using EasyBuild. 3 | # 4 | --- 5 | - hosts: all 6 | roles: 7 | - name: lmod 8 | 9 | - hosts: slurm-master[0] 10 | roles: 11 | - name: easy-build 12 | 13 | - hosts: slurm-master 14 | roles: 15 | - name: easy-build-packages 16 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/prometheus-node-exporter.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure node exporter 9 | include_role: 10 | name: prometheus-node-exporter 11 | -------------------------------------------------------------------------------- /roles/nis_client/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart nis 3 | service: name=ypbind state=restarted enabled=yes 4 | when: (ansible_os_family == "Debian" and ansible_distribution_version in [ "14.04" ]) 5 | 6 | - name: restart nis 7 | service: name=nis state=restarted enabled=yes 8 | when: ansible_os_family == "RedHat" 9 | -------------------------------------------------------------------------------- /roles/kerberos_client/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | kerberos_client_kdc_hostname: kerberos 3 | kerberos_client_admin_hostname: kerberos 4 | kerberos_client_dns_lookup_realm: false 5 | kerberos_client_dns_lookup_kdc: false 6 | kerberos_client_ticket_lifetime: 24h 7 | kerberos_client_renew_lifetime: 7d 8 | kerberos_client_forwardable: true 9 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/prometheus-slurm-exporter.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure prometheus slurm exporter 9 | include_role: 10 | name: prometheus-slurm-exporter 11 | 
-------------------------------------------------------------------------------- /roles/ood-wrapper/templates/desktop.yml.j2: -------------------------------------------------------------------------------- 1 | --- 2 | title: "{{ ood_desktop_app_title }}" 3 | cluster: "{{ ood_cluster_name }}" 4 | submit: "submit/{{ ood_cluster_name }}_desktop.yml.erb" 5 | attributes: 6 | desktop: "xfce" 7 | bc_queue: null 8 | bc_account: null 9 | bc_num_gpus: 10 | label: "Number of GPUs" 11 | value: 1 12 | -------------------------------------------------------------------------------- /roles/openshift/defaults/main.yml: -------------------------------------------------------------------------------- 1 | deepops_dir: /opt/deepops 2 | deepops_venv: '{{ deepops_dir }}/venv' 3 | epel_package: "https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm" 4 | epel_key_url: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}" 5 | -------------------------------------------------------------------------------- /roles/alertmanager/templates/alertmanager.yml.j2: -------------------------------------------------------------------------------- 1 | global: 2 | slack_api_url: 'https://hooks.slack.com/services/SLACK/API/KEY' 3 | route: 4 | receiver: 'slack-notifications' 5 | repeat_interval: 2m 6 | receivers: 7 | - name: 'slack-notifications' 8 | slack_configs: 9 | - channel: '#my-slack-channel' 10 | send_resolved: true 11 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/rsyslog.d/99-slurm.conf: -------------------------------------------------------------------------------- 1 | input(type="imfile" File="/var/log/slurm/slurmd.log" Tag="slurmd") 2 | input(type="imfile" File="/var/log/slurm/prolog-epilog" Tag="slurm-prolog-epilog") 3 | input(type="imfile" File="/var/log/slurm/slurmctld.log" Tag="slurmctld") 4 | input(type="imfile" File="/var/log/slurm/slurmdbd.log" Tag="slurmdbd") 5 | -------------------------------------------------------------------------------- /playbooks/bootstrap/bootstrap-rook.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: true 4 | tasks: 5 | - name: install xfsprogs for ceph/rook 6 | package: 7 | name: xfsprogs 8 | state: present 9 | - name: install python3-setuptools for ceph/rook 10 | package: 11 | name: python3-setuptools 12 | state: present 13 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/bc_osc_codeserver/submit.yml.erb.j2: -------------------------------------------------------------------------------- 1 | --- 2 | batch_connect: 3 | template: "basic" 4 | script: 5 | native: 6 | - "--gpus=<%= bc_num_gpus.blank? ? 
{{ ood_codeserver_app_def_gpus }} : bc_num_gpus.to_i %>" 7 | - "--cpus-per-gpu={{ ood_codeserver_app_cpus_per_gpu }}" 8 | - "--mem-per-gpu={{ ood_codeserver_app_mem_per_gpu }}" 9 | -------------------------------------------------------------------------------- /src/containers/pxe/dhcp/dnsmasq.conf: -------------------------------------------------------------------------------- 1 | domain-needed 2 | bogus-priv 3 | strict-order 4 | no-resolv 5 | no-poll 6 | expand-hosts 7 | cache-size=2048 8 | bind-interfaces 9 | 10 | server=8.8.8.8 11 | server=8.8.4.4 12 | domain=localdomain 13 | 14 | log-queries 15 | log-dhcp 16 | log-facility=/var/log/dnsmasq.log 17 | 18 | conf-dir=/etc/dnsmasq.d,*.conf 19 | -------------------------------------------------------------------------------- /playbooks/container/nginx-docker-registry-cache-server.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: deploy nginx container caching proxy 9 | include_role: 10 | name: "nginx-docker-registry-cache" 11 | tasks_from: server 12 | -------------------------------------------------------------------------------- /workloads/examples/slurm/mpi-hello/bootstrap-mpi.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Install OpenMPI packages from Ubuntu repos 3 | - hosts: all 4 | become: true 5 | tasks: 6 | - name: install openmpi packages 7 | apt: 8 | name: "{{ item }}" 9 | with_items: 10 | - openmpi-bin 11 | - libopenmpi-dev 12 | - libpmi2-pmix 13 | - libpmi-pmix-dev 14 | -------------------------------------------------------------------------------- /config.example/pxe/dnsmasq.extra.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Additional DNSMASQ configuration 3 | # 4 | 5 | # If the dhcp-ignore flag is specified in this fashion, only hosts configured with dhcp-host will be given DHCP 6 | #dhcp-ignore=tag:!known 7 | 8 | # Example static IP; note this will not work for bonded interfaces 9 | #dhcp-host=12:34:56:78,server-01,192.168.1.23 10 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # vars for lecorguille.singularity 3 | singularity_version: "3.11.4" 4 | singularity_conf_path: "/etc/singularity/singularity.conf" 5 | bind_paths: [] 6 | 7 | # vars for gantsign.golang 8 | golang_version: "1.14.4" 9 | golang_install_dir: "/opt/go/{{ golang_version }}" 10 | golang_gopath: "/opt/go/packages" 11 | -------------------------------------------------------------------------------- /playbooks/container/nginx-docker-registry-cache-client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure nginx container caching proxy client 9 | include_role: 10 | name: "nginx-docker-registry-cache" 11 | tasks_from: client 12 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/nvidia-k8s-gpu-device-plugin.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: kube-master[0] 3 | become: true 
4 | tasks: 5 | - name: install k8s GPU plugin 6 | include_role: 7 | name: nvidia-k8s-gpu-device-plugin 8 | run_once: true 9 | environment: "{{proxy_env if proxy_env is defined else {}}}" 10 | tags: 11 | - k8s_gpu_device_plugin 12 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/bc_osc_codeserver/manifest.yml.j2: -------------------------------------------------------------------------------- 1 | --- 2 | name: VS Code Server 3 | category: Interactive Apps 4 | subcategory: Servers 5 | role: batch_connect 6 | description: | 7 | This app will launch a [VS Code] instance using [Code Server] on a GPU node 8 | 9 | [VS Code]: https://code.visualstudio.com/ 10 | [Code Server]: https://coder.com/ 11 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/prolog-checkmounts: -------------------------------------------------------------------------------- 1 | # 2 | # Check that mounts exist 3 | # 4 | MOUNTS="/raid /gpfs/fs1" 5 | for i in $MOUNTS 6 | do 7 | mount | grep $i &> /dev/null 8 | if [ $? -ne 0 ] 9 | then 10 | echo "$HOSTNAME is missing $i" 11 | echo "scontrol update nodename=$HOSTNAME state=drain reason="Mount missing: $i"" 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /roles/alertmanager/defaults/main.yml: -------------------------------------------------------------------------------- 1 | alertmanager_config_dir: /etc/alertmanager 2 | alertmanager_config_src: templates/alertmanager.yml.j2 3 | alertmanager_container: "prom/alertmanager:v0.23.0" 4 | alertmanager_svc_name: "docker.alertmanager.service" 5 | alertmanager_docker_volume_name: "deepops_alertmanager_metrics" 6 | alertmanager_state: started 7 | alertmanager_enabled: yes 8 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/templates/dgxos5.list.j2: -------------------------------------------------------------------------------- 1 | deb {{ nvidia_dgx_os5_ubuntu_baseurl }}/{{ ansible_distribution_release | lower }}/{{ ansible_architecture }}/ {{ ansible_distribution_release | lower }} common dgx 2 | deb {{ nvidia_dgx_os5_ubuntu_baseurl }}/{{ ansible_distribution_release | lower }}/{{ ansible_architecture }}/ {{ ansible_distribution_release | lower }}-updates common dgx 3 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/templates/z95_nvhpc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export NVARCH="$(uname -s)_$(uname -m)" 4 | export NVCOMPILERS="{{ hpcsdk_install_dir }}" 5 | export MANPATH="${MANPATH:+$MANPATH:}{{ hpcsdk_install_dir }}/${NVCOMPILERS}/${NVARCH}/{{ hpcsdk_version_dir }}/compilers/man" 6 | export PATH="${NVCOMPILERS}/${NVARCH}/{{ hpcsdk_version_dir }}/compilers/bin:${PATH}" 7 | -------------------------------------------------------------------------------- /roles/slurm/tasks/setup-user.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: create slurm user home 3 | file: 4 | path: "{{ slurm_user_home }}" 5 | recurse: yes 6 | state: directory 7 | 8 | - name: create slurm user 9 | user: 10 | name: "{{ slurm_username }}" 11 | state: present 12 | system: yes 13 | home: "{{ slurm_user_home }}" 14 | uid: "{{ slurm_user_uid }}" 15 | -------------------------------------------------------------------------------- 
/roles/slurm/templates/etc/slurm/epilog.d/42-lastuserjob-cleanup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | if [[ -n "$SLURM_JOB_USER" && "$SLURM_JOB_USER" != "root" ]]; then 5 | logger -s -t slurm-epilog 'Removed residual user files' 6 | for dir in /tmp /dev/shm ; do 7 | find "${dir}" -user "${SLURM_JOB_USER}" -print0 | xargs -0 -r rm -fr ||: 8 | done 9 | fi 10 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-test-job.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: cuda-container 8 | image: nvcr.io/nvidia/cuda:10.0-devel 9 | command: ["sleep", "6000"] 10 | args: 11 | resources: 12 | limits: 13 | nvidia.com/gpu: 1 14 | restartPolicy: Never 15 | 16 | -------------------------------------------------------------------------------- /roles/docker-login/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | docker_login_state: present 3 | docker_login_reauth: yes 4 | docker_login_disable_log_password: yes 5 | 6 | #docker_login_registries: 7 | #- registry: docker.io 8 | # username: myuser 9 | # password: mypassword 10 | # email: docker@docker.io 11 | #- registry: nvcr.io 12 | # username: '$oauthtoken' 13 | # password: mypassword 14 | -------------------------------------------------------------------------------- /roles/prometheus/templates/alert_rules.yml.j2: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: default 3 | rules: 4 | - alert: InstanceDown 5 | expr: up{job="cluster"} == 0 6 | for: 5m 7 | labels: 8 | severity: critical 9 | annotations: 10 | summary: "Instance down" 11 | description: "The instance has been down for more than 5 minutes." -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingon: -------------------------------------------------------------------------------- 1 | # Enable hyperthreading if requested 2 | scontrol show job $SLURM_JOBID | grep Comment | grep -i hyperthreading | grep -v nohyperthreading > /dev/null 3 | if [ $? 
-eq 0 ]; then 4 | for i in /sys/devices/system/cpu/*/online ; do 5 | echo 1 > $i 6 | echo Enabling CPU $(echo $i | egrep -o cpu[0-9]+ | tr -d 'cpu') 7 | done 8 | fi 9 | -------------------------------------------------------------------------------- /roles/slurm/tasks/build-cleanup.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: make clean in build directory after install 3 | command: make clean 4 | args: 5 | chdir: "{{ slurm_build_dir }}" 6 | failed_when: false 7 | when: slurm_build_make_clean 8 | 9 | - name: remove build directory 10 | file: 11 | path: "{{ slurm_build_dir }}" 12 | state: absent 13 | when: slurm_build_dir_cleanup 14 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/nvidia-k8s-gpu-feature-discovery.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: kube-master[0] 3 | become: true 4 | tasks: 5 | - name: install k8s GPU feature discovery 6 | include_role: 7 | name: nvidia-k8s-gpu-feature-discovery 8 | run_once: true 9 | environment: "{{proxy_env if proxy_env is defined else {}}}" 10 | tags: 11 | - k8s_gpu_feature_discovery 12 | -------------------------------------------------------------------------------- /roles/nvidia-k8s-gpu-device-plugin/defaults/main.yml: -------------------------------------------------------------------------------- 1 | # Vars needed to install device plugin 2 | k8s_gpu_plugin_helm_repo: "https://nvidia.github.io/k8s-device-plugin" 3 | k8s_gpu_plugin_chart_name: "nvdp/nvidia-device-plugin" 4 | k8s_gpu_plugin_release_name: "nvidia-device-plugin" 5 | k8s_gpu_plugin_chart_version: "0.14.0" 6 | k8s_gpu_plugin_init_error: "false" 7 | k8s_gpu_mig_strategy: "mixed" 8 | -------------------------------------------------------------------------------- /roles/roce_backend/templates/config_dp.j2: -------------------------------------------------------------------------------- 1 | resourceList: 2 | {% for sriov_resource in sriov_resources %} 3 | - resourceName: "{{ sriov_resource.res_name }}" 4 | isRdma: true 5 | selectors: 6 | vendors: 7 | - "{{ vendor }}" 8 | devices: 9 | - "{{ dev_id }}" 10 | pfNames: 11 | - "{{ sriov_resource.pf_name }}" 12 | {% endfor %} 13 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/remote-script-for-slurm-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # 3 | # Test compiling and running a GPU program using NVIDIA HPC SDK 4 | 5 | set -x 6 | set -euo pipefail 7 | 8 | module load nvhpc 9 | nvcc -o "${HOME}/deviceQuery" -I /usr/local/cuda/samples/common/inc /usr/local/cuda/samples/1_Utilities/deviceQuery/deviceQuery.cpp 10 | srun -n1 -G1 "${HOME}/deviceQuery" 11 | -------------------------------------------------------------------------------- /playbooks/generic/hosts.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | - name: set /etc/hostname 6 | hostname: 7 | name: "{{ inventory_hostname }}" 8 | when: deepops_set_hostname | default(true) 9 | 10 | - name: set /etc/hosts 11 | include_role: 12 | name: DeepOps.hosts 13 | vars: 14 | hosts_add_ansible_managed_hosts: true 15 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/gres.conf: 
-------------------------------------------------------------------------------- 1 | {% if slurm_autodetect_nvml -%} 2 | AutoDetect=nvml 3 | {% else -%} 4 | {% set cpu_topology = ansible_local["topology"]["cpu_topology"] -%} 5 | {% set gpu_topology = ansible_local["topology"]["gpu_topology"] -%} 6 | {% for affinity in gpu_topology %} 7 | Name=gpu File=/dev/nvidia{{ loop.index0 }} Cores={{ affinity }} 8 | {% endfor %} 9 | {% endif -%} 10 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/epilog-mps: -------------------------------------------------------------------------------- 1 | # Quit cuda mps if it's running 2 | ps aux | grep nvidia-cuda-mps | grep -v grep > /dev/null 3 | if [ $? -eq 0 ]; then 4 | echo quit | nvidia-cuda-mps-control 5 | fi 6 | 7 | # Test for presence of mps zombie 8 | ps aux | grep nvidia-cuda-mps | grep -v grep > /dev/null 9 | if [ $? -eq 0 ]; then 10 | killall nvidia-cuda-mps-server 11 | fi 12 | -------------------------------------------------------------------------------- /roles/move-home-dirs/tasks/move_user.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: kill user processes for {{ user }} 3 | command: killall -u {{ user }} 4 | register: kill_user_procs 5 | failed_when: false 6 | changed_when: kill_user_procs.rc == 0 7 | 8 | - name: move home directory for {{ user }} 9 | user: 10 | name: "{{ user }}" 11 | home: "{{ move_home_dirs_new_root }}/{{ user }}" 12 | move_home: yes 13 | -------------------------------------------------------------------------------- /roles/nfs/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /config.example/env.sh: -------------------------------------------------------------------------------- 1 | # This file acts as a location to override the default configurations of deepops/scripts/* 2 | # Many of the scripts in this directory define global variables and set reasonable defaults 3 | # Global variables (in all caps) that are defined here will be automatically sourced and used in all scripts 4 | # See deepops/scripts/common.sh for implementation details 5 | 6 | DEEPOPS_EXAMPLE_VAR="" 7 | -------------------------------------------------------------------------------- /roles/nvidia-k8s-gpu-feature-discovery/defaults/main.yml: -------------------------------------------------------------------------------- 1 | # Vars needed to install feature discovery 2 | k8s_gpu_feature_discovery_helm_repo: "https://nvidia.github.io/gpu-feature-discovery" 3 | k8s_gpu_feature_discovery_chart_name: "nvgfd/gpu-feature-discovery" 4 | k8s_gpu_feature_discovery_release_name: "gpu-feature-discovery" 5 | k8s_gpu_feature_discovery_chart_version: "0.8.0" 6 | k8s_gpu_mig_strategy: "mixed" 7 | -------------------------------------------------------------------------------- /roles/rsyslog_client/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure rsyslog is installed 3 | package: 4 | name:
rsyslog 5 | state: present 6 | 7 | - name: configure syslog forwarding 8 | template: 9 | src: "99-forward-syslog.conf" 10 | dest: "/etc/rsyslog.d/99-forward-syslog.conf" 11 | owner: "root" 12 | group: "root" 13 | mode: "0644" 14 | notify: 15 | - restart rsyslog 16 | 17 | -------------------------------------------------------------------------------- /roles/slurm/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/cachefilesd/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/nis_client/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/nvidia-dgx-firmware/tasks/run-diagnostics.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Starting diagnostics step 3 | debug: 4 | msg: "Starting now" 5 | 6 | - name: Check firmware 7 | include_tasks: check-firmware.yml 8 | ignore_errors: true 9 | 10 | - name: Get health 11 | include_tasks: get-health.yml 12 | ignore_errors: true 13 | 14 | - name: Get IB 15 | include_tasks: get-ib.yml 16 | ignore_errors: true 17 | -------------------------------------------------------------------------------- /config.example/README.md: -------------------------------------------------------------------------------- 1 | Example DeepOps configuration 2 | ============================= 3 | 4 | This directory provides an example configuration for NVIDIA DeepOps. 5 | The files in this directory will help determine the behavior of the Ansible playbooks and other scripts that DeepOps uses to set up your systems. 6 | 7 | For more details on how this works, see [how to configure DeepOps](../docs/deepops/configuration.md).
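A minimal usage sketch for this example configuration (the working directory name and the files edited below are assumptions about a typical checkout, not requirements):

    # Copy the example configuration into a working config directory at the repository root
    cp -r config.example config
    # Describe your cluster and override defaults as needed (file names assumed; adjust to your layout)
    vi config/inventory
    vi config/group_vars/all.yml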
8 | -------------------------------------------------------------------------------- /roles/kerberos_client/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/rsyslog_client/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/rsyslog_server/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: verify 3 | hosts: all 4 | tasks: 5 | - name: check for path to singularity 6 | command: which singularity 7 | register: which_singularity 8 | changed_when: which_singularity.rc != 0 9 | 10 | - name: verify path to singularity 11 | assert: 12 | that: 13 | - "'/usr/local/bin/singularity' in which_singularity.stdout" 14 | -------------------------------------------------------------------------------- /workloads/examples/slurm/dask-rapids/files/launch-dask-scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ANACONDA_ROOT="/usr/local/anaconda" 4 | CONDA_ENV="/shared/conda" 5 | export PATH="${CONDA_ENV}/bin:${ANACONDA_ROOT}/bin:${PATH}" 6 | 7 | # shellcheck disable=SC1091 8 | source activate "${CONDA_ENV}" 9 | 10 | echo "Launching dask-scheduler on $(hostname)" 11 | dask-scheduler --host "$(hostname)" || echo "Unable to start scheduler" 12 | -------------------------------------------------------------------------------- /roles/move-home-dirs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: find non-system users who need moving 3 | shell: set -o pipefail && awk '-F:' '($1!="nobody")&&($1!="{{ tmp_user }}")&&($3>=1000){print $6,$1}' /etc/passwd | grep -v "^{{ move_home_dirs_new_root }}" | awk '{print $2}' 4 | changed_when: false 5 | register: user_list 6 | 7 | - name: skip if no work to do 8 | include_tasks: move_users.yml 9 | when: user_list.stdout 10 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/desktop-form.yml.j2:
-------------------------------------------------------------------------------- 1 | --- 2 | attributes: 3 | desktop: "mate" 4 | bc_vnc_idle: 0 5 | bc_vnc_resolution: 6 | required: true 7 | node_type: null 8 | 9 | form: 10 | - bc_vnc_idle 11 | - desktop 12 | - bc_account 13 | - bc_num_hours 14 | - bc_num_slots 15 | - bc_num_gpus 16 | - node_type 17 | - bc_queue 18 | - bc_vnc_resolution 19 | - bc_email_on_started 20 | -------------------------------------------------------------------------------- /roles/rsyslog_client/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # The destination host for TCP forwarding of rsyslog messages. 3 | # 4 | # Note that this isn't defined by default in this role, and must be defined for 5 | # forwarding to take place. 6 | # 7 | # The DeepOps configuration for Slurm or Kubernetes may define this at the 8 | # playbook level. 9 | # 10 | # rsyslog_client_tcp_host: "10.0.0.1" 11 | 12 | rsyslog_client_tcp_port: "514" 13 | -------------------------------------------------------------------------------- /playbooks/utilities/nvidia-set-gpu-clocks.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # set gpu clocks on all worker nodes 3 | - hosts: all 4 | gather_facts: no 5 | become: yes 6 | tasks: 7 | - name: set the gpu clock to a specified amount 8 | shell: "nvidia-smi -lgc {{ gpu_clock_lock }}" 9 | when: not gpu_clock_reset 10 | 11 | - name: reset the gpu clock to the default 12 | shell: "nvidia-smi -rgc" 13 | when: gpu_clock_reset 14 | -------------------------------------------------------------------------------- /workloads/examples/k8s/deep-learning-examples/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ .Values.exampleName }} 5 | labels: 6 | app: {{ .Values.exampleName }} 7 | spec: 8 | type: NodePort 9 | ports: 10 | - name: jupyterlab 11 | nodePort: {{ .Values.jupyterNodePort }} 12 | port: 8888 13 | targetPort: 8888 14 | selector: 15 | app: {{ .Values.exampleName }} 16 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-slurm-enroot-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source workloads/jenkins/scripts/jenkins-common.sh 3 | 4 | # Run a simple one-GPU enroot job 5 | ssh -v \ 6 | -o "StrictHostKeyChecking no" \ 7 | -o "UserKnownHostsFile /dev/null" \ 8 | -l vagrant \ 9 | -i "${HOME}/.ssh/id_rsa" \ 10 | "10.0.0.5${GPU01}" \ 11 | srun -N1 -G1 \ 12 | --container-image="nvcr.io#nvidia/cuda:10.2-base-ubuntu18.04" \ 13 | nvidia-smi -L 14 | -------------------------------------------------------------------------------- /playbooks/utilities/gpu-clocks.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: slurm-node 3 | become: true 4 | tasks: 5 | - name: install custom facts 6 | include_role: 7 | name: facts 8 | 9 | - name: set GPU clocks permissions 10 | command: nvidia-smi -acp UNRESTRICTED 11 | changed_when: false 12 | when: 13 | - ansible_local['gpus']['count'] 14 | environment: "{{proxy_env if proxy_env is defined else{}}}" 15 | 16 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/dhcpd.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 
2 | kind: Pod 3 | metadata: 4 | name: dhcp-server 5 | spec: 6 | hostNetwork: true 7 | containers: 8 | - name: dhcp-server 9 | image: joebiellik/dhcpd 10 | volumeMounts: 11 | - name: config-volume 12 | mountPath: /etc/dhcp 13 | volumes: 14 | - name: config-volume 15 | configMap: 16 | name: dhcpd 17 | restartPolicy: Never 18 | -------------------------------------------------------------------------------- /src/containers/ngc/pytorch/Dockerfile-minimal: -------------------------------------------------------------------------------- 1 | # https://ngc.nvidia.com/catalog/containers/nvidia:pytorch 2 | FROM nvcr.io/nvidia/pytorch:20.12-py3 3 | 4 | # Start Jupyter up by default rather than a shell 5 | ENTRYPOINT ["/bin/sh"] 6 | CMD ["-c", "jupyter lab --notebook-dir=/workspace --ip=0.0.0.0 --no-browser --allow-root --port=8888 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*' --NotebookApp.base_url=${NB_PREFIX}"] 7 | -------------------------------------------------------------------------------- /src/containers/ngc/tensorflow/Dockerfile-minimal: -------------------------------------------------------------------------------- 1 | # https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow 2 | FROM nvcr.io/nvidia/tensorflow:20.12-tf1-py3 3 | 4 | # Start Jupyter up by default rather than a shell 5 | ENTRYPOINT ["/bin/sh"] 6 | CMD ["-c", "jupyter lab --notebook-dir=/workspace --ip=0.0.0.0 --no-browser --allow-root --port=8888 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*' --NotebookApp.base_url=${NB_PREFIX}"] 7 | -------------------------------------------------------------------------------- /roles/grafana/defaults/main.yml: -------------------------------------------------------------------------------- 1 | grafana_config_dir: /etc/grafana 2 | grafana_config_template: templates/grafana.ini.j2 3 | grafana_data_dir: /var/lib/grafana 4 | grafana_user_id: 472 5 | grafana_container: "grafana/grafana:8.5.10" 6 | grafana_svc_name: "docker.grafana.service" 7 | grafana_state: started 8 | grafana_enabled: yes 9 | 10 | grafana_cfg_user: admin 11 | grafana_cfg_pass: deepops 12 | grafana_cfg_dashboard_path: "{{ grafana_data_dir }}/dashboards" 13 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/remote-script-for-registry-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | # Pull nginx container locally 6 | sudo ctr images pull --all-platforms docker.io/library/nginx:1.21 7 | 8 | # Tag docker container for local cluster registry 9 | sudo ctr images tag docker.io/library/nginx:1.21 registry.local:31500/nginx:1.21 10 | 11 | # Push to the local registry 12 | sudo ctr images push --plain-http registry.local:31500/nginx:1.21 13 | -------------------------------------------------------------------------------- /workloads/services/k8s/dgxie/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | -------------------------------------------------------------------------------- /playbooks/generic/software.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: true 4 | tasks: 5 | - name: remove old/undesirable packages 6 | package: 7 | name: "{{ software_remove_packages }}" 8 | state: absent 9 | when: software_remove_packages is defined 10 | 11 | - name: install extra packages 12 | package: 13 | name: "{{ software_extra_packages }}" 14 | state: present 15 | when: software_extra_packages is defined 16 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: apt update 3 | apt: 4 | update_cache: yes 5 | when: ansible_distribution == 'Ubuntu' 6 | 7 | - name: restart cachefilesd 8 | service: 9 | name: cachefilesd 10 | state: restarted 11 | 12 | - name: reboot after driver install 13 | reboot: 14 | when: install_driver.changed and not nvidia_driver_skip_reboot 15 | 16 | - name: restart docker 17 | service: 18 | name: docker 19 | state: restarted 20 | -------------------------------------------------------------------------------- /roles/lmod/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # include some reasonable defaults for module paths 3 | sm_prefix: "/sw" 4 | sm_module_root: "{{ sm_prefix }}/modules" 5 | sm_module_path: "{{ sm_module_root }}/all" 6 | sm_software_path: "{{ sm_prefix }}/software" 7 | 8 | epel_package: "https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm" 9 | epel_key_url: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}" 10 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/bc_osc_codeserver/form.yml.j2: -------------------------------------------------------------------------------- 1 | cluster: "{{ ood_cluster_name }}" 2 | form: 3 | - bc_num_gpus 4 | - bc_num_hours 5 | - working_dir 6 | attributes: 7 | working_dir: 8 | label: "Working Directory" 9 | data-filepicker: true 10 | data-target-file-type: dirs # Valid values are: files, dirs, or both 11 | readonly: false 12 | help: "Select your project directory; defaults to $HOME" 13 | bc_num_gpus: 14 | label: "Number of GPUs" 15 | value: 1 16 | -------------------------------------------------------------------------------- /roles/prometheus-node-exporter/defaults/main.yml: -------------------------------------------------------------------------------- 1 | node_exporter_container: "quay.io/prometheus/node-exporter:v1.3.1" 2 | node_exporter_prom_dir: "/run/prometheus" 3 | node_exporter_svc_name: "docker.node-exporter.service" 4 | node_exporter_state: started 5 | node_exporter_enabled: yes 6 | 7 | prometheus_config_dir: /etc/prometheus 8 | prometheus_cfg_endpoint_dir: "{{ prometheus_config_dir }}/endpoints" 9 | node_exporter_conf_template: "node-exporter.yml.j2" 10 | 11 | node_exporter_max_cpu: "0.5" 12 | -------------------------------------------------------------------------------- 
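The prometheus-node-exporter defaults above run the exporter as a Docker-backed systemd unit; a hedged post-install check might look like the following (it assumes the exporter listens on its stock port 9100, which a deployment could override):

    # Confirm the unit named by node_exporter_svc_name is active
    systemctl status docker.node-exporter.service
    # Scrape a few metrics locally to verify the exporter responds
    curl -s http://localhost:9100/metrics | head -n 5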
/workloads/examples/k8s/services/logging/kibana-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: kibana-logging 5 | namespace: logging 6 | labels: 7 | k8s-app: kibana-logging 8 | kubernetes.io/cluster-service: "true" 9 | addonmanager.kubernetes.io/mode: Reconcile 10 | kubernetes.io/name: "Kibana" 11 | spec: 12 | ports: 13 | - port: 5601 14 | protocol: TCP 15 | targetPort: ui 16 | type: NodePort 17 | selector: 18 | k8s-app: kibana-logging 19 | -------------------------------------------------------------------------------- /playbooks/utilities/mofed.yml: -------------------------------------------------------------------------------- 1 | # Installs NVIDIA Mellanox OFED, a collection of software packages to enable 2 | # high-speed networking with InfiniBand or RoCE on NVIDIA Mellanox networking 3 | # adapters. 4 | # 5 | # This playbook automates the software installation process outlined in the 6 | # MLNX_OFED documentation, here: 7 | # https://community.mellanox.com/s/article/howto-install-mlnx-ofed-driver 8 | --- 9 | - hosts: "{{ hostlist | default('all') }}" 10 | become: yes 11 | roles: 12 | - mofed 13 | -------------------------------------------------------------------------------- /workloads/examples/k8s/deep-learning-examples/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /workloads/examples/slurm/mpi-hello/hello-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -J mpi-hello # Job name 3 | #SBATCH -n 2 # Number of processes 4 | #SBATCH -t 0:10:00 # Max wall time 5 | #SBATCH -o hello-job.out # Output file name 6 | 7 | # Disable the Infiniband transport for OpenMPI (not present on all clusters) 8 | export OMPI_MCA_btl="^openib" 9 | 10 | # Run the job (assumes the batch script is submitted from the same directory) 11 | mpirun -np 2 ./mpi-hello 12 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/vagrant-startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | set -ex 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | cd virtual || exit 1 6 | bash ./vagrant_shutdown.sh || true # Some previous VMs may not have been cleaned; this may fail if the environment is clean; so we proceed regardless 7 | bash ./vagrant_startup.sh # If this fails the entire test should halt 8 | cat virtual_inventory* # We can't look at config/inventory because that is created after this step 9 | cat Vagrantfile 10 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/nvidia-gpu-operator.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ensure OpenShift packages are installed 3 | import_playbook: ../bootstrap/bootstrap-openshift.yml 4 
| 5 | # GPU operator 6 | - hosts: kube-master[0] 7 | become: yes 8 | tasks: 9 | - name: Install helm chart for GPU operator 10 | include_role: 11 | name: nvidia-gpu-operator 12 | run_once: true 13 | environment: "{{proxy_env if proxy_env is defined else {}}}" 14 | tags: 15 | - nvidia 16 | - nvidia-gpu-operator 17 | -------------------------------------------------------------------------------- /roles/slurm/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | slurm_pam_lib_dir: /lib/x86_64-linux-gnu/security 3 | 4 | slurm_build_deps: 5 | - build-essential 6 | - libmunge-dev 7 | - libmariadb-dev 8 | - libmariadbclient-dev-compat 9 | - libpam0g-dev 10 | - libdbus-1-dev 11 | - python3-minimal 12 | - ruby-dev 13 | - wget 14 | 15 | slurm_pmix_deps: 16 | - build-essential 17 | - libev-dev 18 | - libevent-dev 19 | - zlib1g 20 | - zlib1g-dev 21 | - pandoc 22 | 23 | slurm_hwloc_deps: 24 | - build-essential 25 | -------------------------------------------------------------------------------- /scripts/pxe/build_and_restart_dgxie.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | source config/pxe/env 5 | 6 | compose_directory_cmd="" #"--project-directory ." 7 | compose_cmd="docker-compose --env-file ./config/pxe/env ${compose_directory_cmd} -f ${COMPOSE_FILE}" 8 | 9 | 10 | function tear_down() { 11 | ${compose_cmd} down 12 | } 13 | 14 | function build() { 15 | ${compose_cmd} build 16 | } 17 | 18 | function bring_up() { 19 | ${compose_cmd} up -d 20 | } 21 | 22 | 23 | tear_down 24 | build 25 | bring_up 26 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/nfs-client.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: nfs 5 | spec: 6 | capacity: 7 | storage: 500Gi 8 | accessModes: 9 | - ReadWriteMany 10 | nfs: 11 | server: 10.0.0.1 12 | path: "/exports" 13 | --- 14 | apiVersion: v1 15 | kind: PersistentVolumeClaim 16 | metadata: 17 | name: nfs 18 | spec: 19 | accessModes: 20 | - ReadWriteMany 21 | storageClassName: "" 22 | resources: 23 | requests: 24 | storage: 500Gi 25 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-ceph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | # Ensure working directory is root 6 | cd "${ROOT_DIR}" || exit 1 7 | 8 | # Deploy rook, fail if it takes longer than 5 minutes 9 | timeout 300 ./scripts/k8s/deploy_rook.sh -x 10 | 11 | # Poll for completion, fail if it takes longer than 30 minutes 12 | timeout 1800 ./scripts/k8s/deploy_rook.sh -w 13 | 14 | # Print Rook-Ceph information 15 | timeout 60 ./scripts/k8s/deploy_rook.sh -p 16 | -------------------------------------------------------------------------------- /roles/mofed/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mofed_version: "5.6-2.0.9.0" 3 | 4 | mofed_download_dir: "/tmp/mofed-install" 5 | mofed_download_dest: "mofed.tgz" 6 | mofed_download_name: "MLNX_OFED_LINUX-{{ mofed_version }}-{{ mofed_distro }}{{ ansible_distribution_version }}-{{ ansible_architecture }}" 7 | mofed_download_url: "http://www.mellanox.com/downloads/ofed/MLNX_OFED-{{ mofed_version }}/{{ mofed_download_name
}}.tgz" 8 | 9 | mofed_install_flags: "--all --without-fw-update" 10 | 11 | mofed_cleanup_install_dir: true 12 | -------------------------------------------------------------------------------- /scripts/generic/gpu_diag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | diag_level=${1:-1} 4 | 5 | hostname 6 | 7 | # discover GPUs 8 | dcgmi discovery -l 9 | 10 | # remove old groups 11 | for old_group in $(dcgmi group -l | grep "Group ID" | awk '{print $5}') ; do 12 | dcgmi group -d ${old_group} >/dev/null 2>&1 13 | done 14 | 15 | # create new default group and record group number 16 | new_group=$(dcgmi group -c default --default | awk '{print $NF}') 17 | 18 | dcgmi diag -g ${new_group} -r ${diag_level} 19 | 20 | exit 0 21 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-usage/mig-mixed-without-selector.yml: -------------------------------------------------------------------------------- 1 | # This yaml file will launch a container with a 1g.5gb MIG device 2 | # It will deploy onto any GPU where this profile is available 3 | apiVersion: v1 4 | kind: Pod 5 | metadata: 6 | name: gpu-pod 7 | spec: 8 | containers: 9 | - name: gpu-pod 10 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 11 | command: ["/bin/sh"] 12 | args: ["-c", "nvidia-smi"] 13 | resources: 14 | limits: 15 | nvidia.com/mig-1g.5gb: 1 16 | -------------------------------------------------------------------------------- /roles/openmpi/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | openmpi_version: 4.0.3 3 | openmpi_tag: "v{{ openmpi_version.split('.')[0] }}.{{ openmpi_version.split('.')[1] }}" 4 | openmpi_src_url: "https://download.open-mpi.org/release/open-mpi/{{ openmpi_tag }}/openmpi-{{ openmpi_version }}.tar.bz2" 5 | openmpi_build_dir: /tmp/openmpi-build 6 | openmpi_install_prefix: /usr/local 7 | openmpi_configure: "./configure --prefix={{ openmpi_install_prefix }} --disable-dependency-tracking --disable-getpwuid --with-pmix=internal" 8 | openmpi_force_rebuild: no 9 | -------------------------------------------------------------------------------- /workloads/examples/k8s/pytorch-job.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: pytorch-job 5 | spec: 6 | backoffLimit: 5 7 | template: 8 | spec: 9 | containers: 10 | - name: pytorch-container 11 | image: nvcr.io/nvidia/pytorch:19.02-py3 12 | command: ["/bin/sh"] 13 | args: ["-c", "python /workspace/examples/upstream/mnist/main.py"] 14 | resources: 15 | limits: 16 | nvidia.com/gpu: 1 17 | restartPolicy: Never 18 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/logging/es-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: elasticsearch-logging 5 | namespace: logging 6 | labels: 7 | k8s-app: elasticsearch-logging 8 | kubernetes.io/cluster-service: "true" 9 | addonmanager.kubernetes.io/mode: Reconcile 10 | kubernetes.io/name: "Elasticsearch" 11 | spec: 12 | ports: 13 | - port: 9200 14 | protocol: TCP 15 | targetPort: db 16 | type: NodePort 17 | selector: 18 | k8s-app: elasticsearch-logging 19 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-spack-minimal.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | # Install Spack, but do not install any modules 6 | ansible-playbook -i virtual/config/inventory playbooks/slurm-cluster/spack-modules.yml 7 | 8 | # After install, we expect spack to be in our PATH 9 | ssh -v \ 10 | -o "StrictHostKeyChecking no" \ 11 | -o "UserKnownHostsFile /dev/null" \ 12 | -l vagrant \ 13 | -i "${HOME}/.ssh/id_rsa" \ 14 | "10.0.0.5${GPU01}" \ 15 | "which spack" 16 | -------------------------------------------------------------------------------- /src/containers/ngc/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | for dir in `ls -d */ | sed 's:/::g'`; do 3 | cd ${dir} 4 | 5 | echo "Building deepops-${dir}-minimal" 6 | docker build -t deepops-${dir}-minimal -f Dockerfile-minimal . 7 | docker tag deepops-${dir}-minimal deepops-${dir}-minimal:kubeflow 8 | 9 | if [ "${1}" != "minimal" ]; then 10 | echo "Building deepops-${dir}" 11 | docker build -t deepops-${dir} -f Dockerfile . 12 | docker tag deepops-${dir} deepops-${dir}:kubeflow 13 | fi 14 | 15 | cd - 16 | done 17 | -------------------------------------------------------------------------------- /roles/spack/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | spack_repo: "https://github.com/spack/spack.git" 3 | spack_install_dir: "/sw/spack" 4 | spack_version: "v0.18.1" 5 | spack_user: "root" 6 | spack_group: "root" 7 | 8 | spack_ubuntu_deps: 9 | - "gcc-7" 10 | - "gfortran-7" 11 | - "make" 12 | - "git" 13 | 14 | spack_redhat_deps: 15 | - "gcc" 16 | - "gcc-c++" 17 | - "gcc-gfortran" 18 | - "make" 19 | - "git" 20 | 21 | spack_build_packages: false 22 | spack_default_packages: 23 | - "cuda@10.2.89" 24 | - "openmpi@3.1.6 +cuda +pmi schedulers=auto" 25 | -------------------------------------------------------------------------------- /src/containers/dgxie/dnsmasq.conf: -------------------------------------------------------------------------------- 1 | domain-needed 2 | bogus-priv 3 | strict-order 4 | no-resolv 5 | no-poll 6 | expand-hosts 7 | cache-size=2048 8 | bind-interfaces 9 | 10 | server=#DNS1# 11 | server=#DNS2# 12 | domain=#DOMAIN# 13 | interface=#DHCP_INT# 14 | 15 | log-queries 16 | log-dhcp 17 | log-facility=/var/log/dnsmasq.log 18 | 19 | dhcp-authoritative 20 | dhcp-range=#DHCP_START#,#DHCP_END#,#LEASETIME# 21 | dhcp-option=tag:green,option:domain-search,#DOMAIN# 22 | dhcp-option=3,#GATEWAY# 23 | 24 | conf-dir=/etc/dnsmasq.d,*.conf 25 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/50-lastuserjob-all-enroot-dirs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | command -v enroot >/dev/null || exit 0 # enroot not installed 5 | 6 | {% if enroot_runtime_path | default(none) %} 7 | runtime_path="$(sudo -u "$SLURM_JOB_USER" sh -c 'echo "{{ enroot_runtime_path }}"')" 8 | rm -rf "$runtime_path" 9 | {% endif %} 10 | 11 | {% if enroot_data_path | default(none) %} 12 | data_path="$(sudo -u "$SLURM_JOB_USER" sh -c 'echo "{{ enroot_data_path }}"')" 13 | rm -rf "$data_path" 14 | {% endif %} 15 | -------------------------------------------------------------------------------- /workloads/examples/k8s/tensorflow-job.yml: 
-------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: tensorflow-job 5 | spec: 6 | template: 7 | spec: 8 | restartPolicy: Never 9 | containers: 10 | - name: tensorflow-job-container 11 | image: nvcr.io/nvidia/tensorflow:21.03-tf1-py3 12 | command: ["/bin/sh"] 13 | args: ["-c", "python /workspace/nvidia-examples/cnn/resnet.py --layers=50 --batch_size=64"] 14 | resources: 15 | limits: 16 | nvidia.com/gpu: 1 17 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/epilog-dcgmstats: -------------------------------------------------------------------------------- 1 | # Stop DCGM GPU stats collection if requested 2 | scontrol show job $SLURM_JOBID | grep Comment | grep -i dcgmstats > /dev/null 3 | if [ $? -eq 0 ]; then 4 | OUTPUTDIR=`scontrol show job $SLURM_JOBID | grep WorkDir | cut -d = -f 2` 5 | sudo -u $SLURM_JOB_USER dcgmi stats -x $SLURM_JOBID 6 | sudo -u $SLURM_JOB_USER dcgmi stats -v -j $SLURM_JOBID | sudo -u $SLURM_JOB_USER tee $OUTPUTDIR/dcgm-gpu-stats-$HOSTNAME-$SLURM_JOBID.out 7 | sudo -u $SLURM_JOB_USER nv-hostengine -t 8 | fi 9 | -------------------------------------------------------------------------------- /playbooks/container/docker-rootless.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install NVIDIA driver on GPU servers 3 | import_playbook: ../nvidia-software/nvidia-driver.yml 4 | 5 | - name: Install NVIDIA container runtime on GPU servers 6 | import_playbook: nvidia-docker.yml 7 | vars: 8 | nvidia_docker_skip_docker_reload: true 9 | 10 | - name: Install Lmod 11 | import_playbook: ../slurm-cluster/lmod.yml 12 | 13 | - hosts: all 14 | become: yes 15 | tasks: 16 | - name: install rootless docker 17 | include_role: 18 | name: docker-rootless 19 | -------------------------------------------------------------------------------- /roles/nfs/templates/exports.j2: -------------------------------------------------------------------------------- 1 | # /etc/exports: the access control list for filesystems which may be exported 2 | # to NFS clients. See exports(5). 
3 | # 4 | # Example for NFSv2 and NFSv3: 5 | # /srv/homes hostname1(rw,sync,no_subtree_check) hostname2(ro,sync,no_subtree_check) 6 | # 7 | # Example for NFSv4: 8 | # /srv/nfs4 gss/krb5i(rw,sync,fsid=0,crossmnt,no_subtree_check) 9 | # /srv/nfs4/homes gss/krb5i(rw,sync,no_subtree_check) 10 | # 11 | {% for export in nfs_exports %} 12 | {{ export.path }} {{ export.options }} 13 | {% endfor %} 14 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/cluster.yml.j2: -------------------------------------------------------------------------------- 1 | --- 2 | v2: 3 | metadata: 4 | title: "{{ ood_cluster_title }}" 5 | login: 6 | host: "{{ ansible_fqdn }}" 7 | job: 8 | adapter: "slurm" 9 | cluster: "{{ ood_cluster_name }}" 10 | bin: "/usr/local/bin" 11 | conf: "/etc/slurm/slurm.conf" 12 | batch_connect: 13 | basic: 14 | script_wrapper: | 15 | %s 16 | vnc: 17 | script_wrapper: | 18 | export PATH="/opt/TurboVNC/bin:$PATH" 19 | export WEBSOCKIFY_CMD="/usr/bin/websockify" 20 | %s 21 | -------------------------------------------------------------------------------- /roles/grafana/templates/docker.grafana.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Grafana 3 | After=docker.service 4 | Requires=docker.service 5 | 6 | [Service] 7 | TimeoutStartSec=0 8 | Restart=always 9 | ExecStartPre=-/usr/bin/docker stop %n 10 | ExecStartPre=-/usr/bin/docker rm %n 11 | ExecStartPre=/usr/bin/docker pull {{ grafana_container }} 12 | ExecStart=/usr/bin/docker run --rm --network host --name %n -v {{ grafana_config_dir }}:/etc/grafana -v {{ grafana_data_dir }}:/var/lib/grafana {{ grafana_container }} 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Set GPU Operator flags for systems with preinstalled NVIDIA software (DGX, etc). 
3 | set_fact: 4 | gpu_operator_enable_driver: false 5 | gpu_operator_enable_toolkit: true 6 | when: gpu_operator_preinstalled_nvidia_software 7 | 8 | - name: deploy nvidia gpu operator to kubernetes 9 | include_tasks: k8s.yml 10 | when: not gpu_operator_nvaie_enable 11 | 12 | - name: deploy nvidia gpu operator to nvidia ai enterprise 13 | include_tasks: nvaie.yml 14 | when: gpu_operator_nvaie_enable 15 | 16 | -------------------------------------------------------------------------------- /roles/nvidia-network-operator/templates/values.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # 3 | # Default setting for DGX systems with IB networking 4 | # 5 | 6 | nfd: 7 | enabled: true 8 | sriovNetworkOperator: 9 | enabled: true 10 | 11 | # NicClusterPolicy CR values: 12 | deployCR: true 13 | ofedDriver: 14 | deploy: false 15 | rdmaSharedDevicePlugin: 16 | deploy: false 17 | sriovDevicePlugin: 18 | deploy: false 19 | 20 | secondaryNetwork: 21 | deploy: true 22 | multus: 23 | deploy: true 24 | cniPlugins: 25 | deploy: true 26 | ipamPlugin: 27 | deploy: true 28 | -------------------------------------------------------------------------------- /roles/slurm/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | slurm_pam_lib_dir: /lib64/security 3 | 4 | slurm_build_deps: 5 | - "@Development Tools" 6 | - munge-devel 7 | - munge-libs 8 | - python3 9 | - readline-devel 10 | - mariadb-devel 11 | - numactl-devel 12 | - pam-devel 13 | - http-parser-devel 14 | - json-c-devel 15 | - perl-ExtUtils-MakeMaker 16 | - libatomic 17 | 18 | slurm_pmix_deps: 19 | - "@Development Tools" 20 | - libev-devel 21 | - libevent-devel 22 | - zlib 23 | - zlib-devel 24 | 25 | slurm_hwloc_deps: 26 | - "@Development Tools" 27 | -------------------------------------------------------------------------------- /src/repo/githooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "${DEEPOPS_BYPASS_LINT}" ]; then 4 | exit 0 5 | fi 6 | 7 | FAILED=0 8 | 9 | # Lint changed Ansible files 10 | if ! src/githooks/check-ansible.py; then 11 | FAILED=1 12 | echo "Failed Ansible lint" 13 | fi 14 | 15 | # Lint changed shell scripts 16 | if ! src/githooks/check-shell.py; then 17 | FAILED=1 18 | echo "Failed shell lint" 19 | fi 20 | 21 | # Lint changed Python files 22 | if ! 
src/githooks/check-python.py; then 23 | FAILED=1 24 | echo "Failed python lint" 25 | fi 26 | 27 | exit ${FAILED} 28 | -------------------------------------------------------------------------------- /roles/nfs-client-provisioner/defaults/main.yml: -------------------------------------------------------------------------------- 1 | # Vars needed to install nfs-client-provisioner 2 | k8s_nfs_server: "127.0.0.1" 3 | k8s_nfs_export_path: "/export/deepops_nfs" 4 | k8s_nfs_default_sc: "true" 5 | k8s_nfs_client_repo_name: "nfs-subdir-external-provisioner" 6 | k8s_nfs_client_helm_repo: "https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner" 7 | k8s_nfs_client_chart_name: "{{ k8s_nfs_client_repo_name }}/nfs-subdir-external-provisioner" 8 | k8s_nfs_client_release_name: "nfs-subdir-external-provisioner" 9 | k8s_nfs_client_chart_version: "4.0.13" 10 | -------------------------------------------------------------------------------- /workloads/examples/k8s/deep-learning-examples/templates/tests/test-connection.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: "{{ include "deep-learning-examples.fullname" . }}-test-connection" 5 | labels: 6 | {{- include "deep-learning-examples.labels" . | nindent 4 }} 7 | annotations: 8 | "helm.sh/hook": test 9 | spec: 10 | containers: 11 | - name: wget 12 | image: busybox 13 | command: ['wget'] 14 | args: ['{{ include "deep-learning-examples.fullname" . }}:{{ .Values.jupyterNodePort }}'] 15 | restartPolicy: Never 16 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/nfs-dgx-iso.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: nfs-dgx-iso 5 | spec: 6 | capacity: 7 | storage: 5Gi 8 | accessModes: 9 | - ReadOnlyMany 10 | nfs: 11 | server: # 12 | path: "/path/to/iso/DGXServer-3.1.2.170902_f8777e" 13 | --- 14 | apiVersion: v1 15 | kind: PersistentVolumeClaim 16 | metadata: 17 | name: nfs-dgx-iso 18 | spec: 19 | accessModes: 20 | - ReadOnlyMany 21 | storageClassName: "" 22 | resources: 23 | requests: 24 | storage: 5Gi 25 | -------------------------------------------------------------------------------- /roles/nvidia-dgx-firmware/tasks/get-health.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Run NVSM human-readable health show 3 | shell: "nvsm show health > {{ fw_dir }}/nvsm-show-health.log" 4 | become: yes 5 | 6 | - name: Run NVSM dump health 7 | shell: "nvsm dump health" 8 | become: yes 9 | when: nvsm_dump_health 10 | 11 | - name: Run quick DCGM diagnostic 12 | shell: "dcgmi diag -r 1 > {{ fw_dir }}/dcgm_diag_1.log" 13 | become: yes 14 | 15 | - name: Run full DCGM diagnostic 16 | shell: "dcgmi diag -r 3 > {{ fw_dir }}/dcgm_diag_3.log" 17 | become: yes 18 | when: dcgm_stress 19 | -------------------------------------------------------------------------------- /roles/prometheus/defaults/main.yml: -------------------------------------------------------------------------------- 1 | prometheus_config_dir: /etc/prometheus 2 | prometheus_config_src: templates/prometheus.yml.j2 3 | prometheus_alert_rules_src: templates/alert_rules.yml.j2 4 | prometheus_container: "prom/prometheus:v2.37.0" 5 | prometheus_svc_name: "docker.prometheus.service" 6 | prometheus_docker_volume_name: "deepops_prometheus_metrics" 7 | prometheus_state: started 8 | 
prometheus_enabled: yes 9 | 10 | prometheus_cfg_scrape_interval: 15s 11 | prometheus_cfg_evaluation_interval: 15s 12 | prometheus_cfg_endpoint_dir: "{{ prometheus_config_dir }}/endpoints" 13 | -------------------------------------------------------------------------------- /roles/kerberos_client/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | kerberos_client_redhat7_pkg: 3 | - libselinux-python 4 | - krb5-libs 5 | - krb5-workstation 6 | - cyrus-sasl-gssapi 7 | 8 | kerberos_client_redhat8_pkg: 9 | - python3-libselinux 10 | - krb5-libs 11 | - krb5-workstation 12 | - cyrus-sasl-gssapi 13 | 14 | kerberos_client_ubuntu18_pkg: 15 | - python-selinux 16 | - krb5-user 17 | - libsasl2-modules-gssapi-mit 18 | - libpam-krb5 19 | 20 | kerberos_client_ubuntu20_pkg: 21 | - python3-selinux 22 | - krb5-user 23 | - libsasl2-modules-gssapi-mit 24 | - libpam-krb5 25 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator-node-prep/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: update-initramfs (Debian) 3 | command: update-initramfs -u 4 | when: ansible_os_family == "Debian" 5 | listen: update-initramfs 6 | 7 | - name: Backup initramfs (RedHat) 8 | shell: mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r).img.bak 9 | when: ansible_os_family == "RedHat" 10 | listen: update-initramfs 11 | 12 | - name: update-initramfs (RedHat) 13 | shell: dracut /boot/initramfs-$(uname -r).img $(uname -r) 14 | when: ansible_os_family == "RedHat" 15 | listen: update-initramfs 16 | -------------------------------------------------------------------------------- /roles/slurm/tasks/munge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: install munge 4 | package: 5 | name: munge 6 | state: present 7 | notify: 8 | - restart munge 9 | 10 | - name: create munge.key 11 | template: 12 | src: templates/etc/munge/munge.key.j2 13 | dest: /etc/munge/munge.key 14 | mode: 0400 15 | owner: munge 16 | notify: 17 | - restart munge 18 | 19 | - name: start munge 20 | service: 21 | name: munge 22 | enabled: yes 23 | state: started 24 | 25 | - name: flush handlers to ensure munge gets restarted now 26 | meta: flush_handlers 27 | -------------------------------------------------------------------------------- /roles/kerberos_client/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | galaxy_info: 3 | author: "Benno Joy" 4 | company: AnsibleWorks 5 | license: BSD 6 | min_ansible_version: 1.4 7 | role_name: kerberos_client 8 | namespace: deepops 9 | platforms: 10 | - name: EL 11 | versions: 12 | - 5 13 | - 6 14 | - name: Fedora 15 | versions: 16 | - 16 17 | - 17 18 | - 18 19 | - name: Ubuntu 20 | versions: 21 | - precise 22 | - quantal 23 | - raring 24 | - saucy 25 | categories: 26 | - development 27 | dependencies: [] 28 | 29 | -------------------------------------------------------------------------------- /roles/prometheus/templates/docker.prometheus.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Prometheus 3 | After=docker.service 4 | Requires=docker.service 5 | 6 | [Service] 7 | TimeoutStartSec=0 8 | Restart=always 9 | ExecStartPre=-/usr/bin/docker stop %n 10 | ExecStartPre=-/usr/bin/docker rm %n 11 | ExecStartPre=/usr/bin/docker pull {{ prometheus_container }} 12 | 
ExecStart=/usr/bin/docker run --rm --network host --name %n -v {{ prometheus_config_dir }}:/etc/prometheus -v {{ prometheus_docker_volume_name }}:/prometheus {{ prometheus_container }} 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure man directories exist in minimal ubuntu image 7 | file: 8 | path: "/usr/share/man/man1" 9 | owner: "root" 10 | group: "root" 11 | state: "directory" 12 | mode: "0755" 13 | when: ansible_distribution == "Ubuntu" 14 | 15 | - name: Ensure dependencies are present for apt key management 16 | apt: 17 | update_cache: yes 18 | name: "gpg-agent" 19 | state: present 20 | when: ansible_distribution == "Ubuntu" 21 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure man directories exist in minimal ubuntu image 7 | file: 8 | path: "/usr/share/man/man1" 9 | owner: "root" 10 | group: "root" 11 | state: "directory" 12 | mode: "0755" 13 | when: ansible_distribution == "Ubuntu" 14 | 15 | - name: Ensure dependencies are present for apt key management 16 | apt: 17 | update_cache: yes 18 | name: "gpg-agent" 19 | state: present 20 | when: ansible_distribution == "Ubuntu" 21 | -------------------------------------------------------------------------------- /roles/ood-wrapper/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | install_from_src: true 2 | 3 | ood_apache_service_name: apache2 4 | ood_htpasswd_file: /etc/apache2/.htpasswd 5 | 6 | ood_url_turbovnc_pkg: https://downloads.sourceforge.net/project/turbovnc/2.2.4/turbovnc_2.2.4_amd64.deb 7 | 8 | ood_master_sw_deps: 9 | - liblz4-tool 10 | - unzip 11 | - websockify 12 | 13 | ood_client_sw_deps: 14 | - liblz4-tool 15 | - unzip 16 | - nmap 17 | - websockify 18 | - xfce4 19 | - xfce4-terminal 20 | - xfce4-goodies 21 | - jupyter-notebook 22 | - dbus-x11 23 | - firefox 24 | - nvidia-visual-profiler 25 | -------------------------------------------------------------------------------- /workloads/examples/k8s/cluster-gpu-test-job.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: cluster-gpu-tests 5 | spec: 6 | parallelism: 4 # DYNAMIC_PARALLELISM 7 | completions: 4 # DYNAMIC_COMPLETIONS 8 | backoffLimit: 5 9 | template: 10 | spec: 11 | containers: 12 | - name: cluster-gpu-tests 13 | image: nvcr.io/nvidia/cuda:9.0-base 14 | command: ["/bin/bash","-c","nvidia-smi && sleep 10"] 15 | args: 16 | resources: 17 | limits: 18 | nvidia.com/gpu: 1 19 | restartPolicy: Never 20 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-spack-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | # Install Spack, including building default modules 6 | ansible-playbook \ 7 | -i virtual/config/inventory \ 8 | -e '{"spack_build_packages": true}' \ 9 | playbooks/slurm-cluster/spack-modules.yml 10 | 11 | # 
After install, we expect a cuda module to exist 12 | ssh -v \ 13 | -o "StrictHostKeyChecking no" \ 14 | -o "UserKnownHostsFile /dev/null" \ 15 | -l vagrant \ 16 | -i "${HOME}/.ssh/id_rsa" \ 17 | "10.0.0.5${GPU01}" \ 18 | "spack find | grep cuda" 19 | -------------------------------------------------------------------------------- /roles/docker-rootless/templates/rootless-docker/config/nvidia-container-runtime/config.toml: -------------------------------------------------------------------------------- 1 | disable-require = false 2 | #swarm-resource = "DOCKER_RESOURCE_GPU" 3 | 4 | [nvidia-container-cli] 5 | #root = "/run/nvidia/driver" 6 | #path = "/usr/bin/nvidia-container-cli" 7 | environment = [] 8 | #debug = "/var/log/nvidia-container-toolkit.log" 9 | #ldcache = "/etc/ld.so.cache" 10 | load-kmods = true 11 | no-cgroups = true 12 | #no-cgroups = false 13 | #user = "root:video" 14 | ldconfig = "@/sbin/ldconfig.real" 15 | 16 | [nvidia-container-runtime] 17 | #debug = "/var/log/nvidia-container-runtime.log" 18 | -------------------------------------------------------------------------------- /roles/pyxis/defaults/main.yml: -------------------------------------------------------------------------------- 1 | slurm_install_pyxis: true 2 | slurm_install_prefix: /usr/local 3 | slurm_config_dir: /etc/slurm 4 | slurm_pyxis_version: 0.11.1 5 | slurm_pyxis_tarball_url: "https://github.com/NVIDIA/pyxis/archive/v{{ slurm_pyxis_version }}.tar.gz" 6 | 7 | is_controller: no 8 | is_compute: no 9 | 10 | pyxis_ubuntu_deps: 11 | - "bsdmainutils" 12 | 13 | pyxis_el_deps: 14 | - "util-linux" 15 | 16 | # /run is default partition of pyxis runtime_path 17 | resize_run_partition: false 18 | 19 | # /run tmpfs size. ubuntu default is 10% of physical memory 20 | pyxis_run_tmpfs_size: 50% 21 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/ambassador-service.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | labels: 6 | service: ambassador 7 | name: ambassador 8 | annotations: 9 | getambassador.io/config: | 10 | --- 11 | apiVersion: ambassador/v0 12 | kind: Mapping 13 | name: httpbin_mapping 14 | prefix: /httpbin/ 15 | service: httpbin.org:80 16 | host_rewrite: httpbin.org 17 | spec: 18 | type: NodePort 19 | ports: 20 | - name: ambassador 21 | port: 80 22 | targetPort: 80 23 | selector: 24 | service: ambassador 25 | -------------------------------------------------------------------------------- /roles/alertmanager/templates/docker.alertmanager.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Alert Manager 3 | After=docker.service 4 | Requires=docker.service 5 | 6 | [Service] 7 | TimeoutStartSec=0 8 | Restart=always 9 | ExecStartPre=-/usr/bin/docker stop %n 10 | ExecStartPre=-/usr/bin/docker rm %n 11 | ExecStartPre=/usr/bin/docker pull {{ alertmanager_container }} 12 | ExecStart=/usr/bin/docker run --rm --network host --name %n -v {{ alertmanager_config_dir }}:/etc/alertmanager -v {{ alertmanager_docker_volume_name }}:/alertmanager {{ alertmanager_container }} 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- /virtual/vars_files/virt_slurm.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # For virtual cluster, ensure hosts file correctly uses private 
network 3 | hosts_network_interface: "eth1" 4 | 5 | # Ensure vagrant user has SSH access after pam_slurm for debugging 6 | slurm_allow_ssh_user: 7 | - "vagrant" 8 | - "root" 9 | 10 | # Perform cleanup tasks during the install to minimize disk space impact 11 | hpcsdk_clean_up_tarball_after_extract: true 12 | hpcsdk_clean_up_temp_dir: true 13 | slurm_build_dir_cleanup: false 14 | 15 | # Ensure we use the slurm management node for syslog 16 | rsyslog_client_tcp_host: "{{ groups['slurm-master'][0] }}" 17 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/get-slurm-debug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | 6 | # Ensure working directory is virtual, so downstream Ansible picks up the correct inventory 7 | cd "${VIRT_DIR}/virtual" 8 | 9 | # Collect all the standard debug 10 | ${ROOT_DIR}/scripts/slurm/debug.sh 11 | 12 | # The debug script will create a time-stamped log dir 13 | logdir=$(ls -Art ./config | grep log_ | tail -n 1) 14 | 15 | # Iterate over each .log file and print to screen, ignoring the tar 16 | for logfile in $(ls ./config/${logdir}/*log); do 17 | cat ${logfile} 18 | done 19 | -------------------------------------------------------------------------------- /roles/slurm/tasks/undrain.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # un-drain nodes that are down due to an unexpected reboot during install 3 | # sudo scontrol update node=XXX state=idle 4 | # where XXX are the nodes that have changed and are marked as *down* 5 | - name: set nodes to idle 6 | command: "scontrol update node={{ item }} state=idle" 7 | register: undrain_nodes_result 8 | ignore_errors: yes 9 | with_items: 10 | - "{{ groups['slurm-node'] }}" 11 | environment: 12 | PATH: '{{ slurm_install_prefix }}/bin:{{ ansible_env.PATH }}' 13 | run_once: true 14 | tags: 15 | - never 16 | - undrain 17 | changed_when: false 18 | -------------------------------------------------------------------------------- /playbooks/nvidia-dgx/nvidia-dgx-diag.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This playbook is meant to diagnose/debug a cluster of DGX systems 3 | # It is built around DGX-1/DGX-2 but parts of the role will work for any Tesla GPU system 4 | # Because this is built as a diagnostic tool, many tasks have ignore_errors set to true; this allows best-effort debugging 5 | 6 | - hosts: all 7 | become: yes 8 | gather_facts: no 9 | strategy: free 10 | tasks: 11 | - name: Include NVIDIA DGX Firmware role 12 | include_role: 13 | name: nvidia-dgx-firmware 14 | vars: 15 | run_diagnostics: true 16 | update_firmware: false 17 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmstats: -------------------------------------------------------------------------------- 1 | # Start DCGM GPU stats collection if requested 2 | scontrol show job $SLURM_JOBID | grep Comment | grep -i dcgmstats > /dev/null 3 | if [ $?
-eq 0 ]; then 4 | GPULIST=`nvidia-smi | grep Tesla | awk -vORS=, '{print $2}' | sed 's/,$/\n/'` 5 | sudo -u $SLURM_JOB_USER nv-hostengine ~$SLURM_JOB_USER/nvhost.pid 6 | sudo -u $SLURM_JOB_USER dcgmi group -c gpuinfo 7 | sudo -u $SLURM_JOB_USER dcgmi group -g 1 -a $GPULIST 8 | sudo -u $SLURM_JOB_USER dcgmi stats -g 1 --enable 9 | sudo -u $SLURM_JOB_USER dcgmi stats -g 1 -s $SLURM_JOBID 10 | fi 11 | -------------------------------------------------------------------------------- /playbooks/nvidia-dgx/nvidia-dgx-fw-update.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This playbook updates the firmware on DGX nodes 3 | # Refer to the README in roles/nvidia-dgx-firmware for more info 4 | # NOTE: forcing use of the paramiko ssh plugin since running the fw 5 | # update container with the interactive `-it` flag requires a tty and 6 | # the ansible ssh connection is unreliable in `auto` mode. 7 | 8 | - hosts: all 9 | become: yes 10 | connection: paramiko_ssh 11 | tasks: 12 | - name: Include NVIDIA DGX Firmware role 13 | include_role: 14 | name: nvidia-dgx-firmware 15 | vars: 16 | run_diagnostics: true 17 | update_firmware: true 18 | -------------------------------------------------------------------------------- /src/containers/ngc/rapids/Dockerfile-minimal: -------------------------------------------------------------------------------- 1 | # https://ngc.nvidia.com/catalog/containers/nvidia:rapidsai:rapidsai 2 | FROM nvcr.io/nvidia/rapidsai/rapidsai:0.17-cuda10.1-runtime-ubuntu18.04 3 | 4 | # RAPIDS is installed using conda and we need to work from this environment 5 | ENV CONDA_ENV rapids 6 | 7 | # Start using the built-in RAPIDS conda environment 8 | ENTRYPOINT ["/bin/sh"] 9 | CMD ["-c", "/opt/conda/envs/${CONDA_ENV}/bin/jupyter lab --notebook-dir=/rapids --ip=0.0.0.0 --no-browser --allow-root --port=8888 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*' --NotebookApp.base_url=${NB_PREFIX}"] 10 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/prolog-lspci: -------------------------------------------------------------------------------- 1 | # 2 | # Check that all GPUs are present 3 | # 4 | NUMGPUS=`scontrol -a show nodes $HOSTNAME | grep "Gres=gpu" | cut -d : -f 2` 5 | if [ $NUMGPUS -gt 0 ]; then 6 | PCIGPUSFOUND=`lspci | grep "3D controller: NVIDIA Corporation" | wc -l` 7 | if [ $PCIGPUSFOUND -ne $NUMGPUS ]; then 8 | echo "Slurm expects $NUMGPUS GPUs but lspci found: $PCIGPUSFOUND" 9 | scontrol update nodename=$HOSTNAME state=drain reason="Missing GPUs" 10 | exit 0 11 | fi 12 | echo "Slurm expects $NUMGPUS GPUs and lspci found: $PCIGPUSFOUND" 13 | fi 14 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/tasks/configure-raid.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: create raid array mount point 3 | file: 4 | path: "{{ dgx_raid_mount_path }}" 5 | state: directory 6 | 7 | - name: Stop cachefilesd when reconfiguring RAID array 8 | service: 9 | name: cachefilesd 10 | state: stopped 11 | 12 | - name: Configure RAID array 13 | command: /usr/bin/configure_raid_array.py -c -f 14 | 15 | - name: Restore SELinux label on RAID array 16 | command: restorecon /raid 17 | when: 18 | - ansible_os_family == 'RedHat' 19 | - (ansible_selinux is defined) and (ansible_selinux.status != "disabled") 20 | notify: restart cachefilesd 21 | 
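A minimal invocation sketch for the firmware update playbook above (playbooks/nvidia-dgx/nvidia-dgx-fw-update.yml), assuming the DeepOps inventory is already configured; the group name dgx-servers is hypothetical and should be replaced with your own inventory group or host pattern:

    # Hypothetical host pattern; substitute your own inventory group
    ansible-playbook -l dgx-servers playbooks/nvidia-dgx/nvidia-dgx-fw-update.yml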
-------------------------------------------------------------------------------- /roles/slurm/tasks/service-files.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: copy service files 3 | copy: 4 | src: "{{ slurm_build_dir }}/etc/{{ item }}" 5 | dest: "/etc/systemd/system/{{ item }}" 6 | remote_src: yes 7 | mode: "0644" 8 | with_items: 9 | - slurmctld.service 10 | - slurmdbd.service 11 | when: is_controller 12 | failed_when: false 13 | 14 | - name: copy service files 15 | copy: 16 | src: "{{ slurm_build_dir }}/etc/{{ item }}" 17 | dest: "/etc/systemd/system/{{ item }}" 18 | remote_src: yes 19 | mode: "0644" 20 | with_items: 21 | - slurmd.service 22 | when: is_compute 23 | failed_when: false 24 | -------------------------------------------------------------------------------- /roles/standalone-container-registry/templates/config.yml: -------------------------------------------------------------------------------- 1 | version: 0.1 2 | storage: 3 | filesystem: 4 | rootdirectory: /var/lib/registry 5 | http: 6 | addr: "0.0.0.0:{{ standalone_container_registry_port }}" 7 | log: 8 | accesslog: 9 | disabled: false 10 | {% if standalone_container_registry_cache_enable -%} 11 | proxy: 12 | remoteurl: {{ standalone_container_registry_cache_upstream }} 13 | {% if standalone_container_registry_cache_username is defined -%} 14 | username: {{ standalone_container_registry_cache_username }} 15 | password: {{ standalone_container_registry_cache_password }} 16 | {% endif -%} 17 | {% endif -%} 18 | -------------------------------------------------------------------------------- /scripts/pxe/setup_nat.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export HOST_INT_PUB="${1}" 4 | export HOST_INT_PRV="${2}" 5 | 6 | ip a show dev "${HOST_INT_PUB}" 7 | if [ $? -ne 0 ] ; then 8 | exit 1 9 | fi 10 | 11 | ip a show dev "${HOST_INT_PRV}" 12 | if [ $? 
-ne 0 ] ; then 13 | exit 1 14 | fi 15 | 16 | set -x 17 | sudo /sbin/iptables -t nat -A POSTROUTING -o ${HOST_INT_PUB} -j MASQUERADE 18 | sudo /sbin/iptables -A FORWARD -i ${HOST_INT_PUB} -o ${HOST_INT_PRV} -m state --state RELATED,ESTABLISHED -j ACCEPT 19 | sudo /sbin/iptables -A FORWARD -i ${HOST_INT_PRV} -o ${HOST_INT_PUB} -j ACCEPT 20 | sudo sysctl -w net.ipv4.ip_forward=1 21 | set +x 22 | -------------------------------------------------------------------------------- /roles/pyxis/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart munge 3 | service: 4 | name: munge 5 | state: restarted 6 | 7 | - name: restart slurmd 8 | service: 9 | name: slurmd 10 | state: restarted 11 | when: is_compute 12 | 13 | - name: restart slurmdbd 14 | service: 15 | name: slurmdbd 16 | state: restarted 17 | when: is_controller 18 | 19 | - name: restart slurmctld 20 | service: 21 | name: slurmctld 22 | state: restarted 23 | when: is_controller 24 | 25 | - name: restart logind 26 | service: 27 | name: systemd-logind.service 28 | state: restarted 29 | enabled: yes 30 | when: is_compute 31 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-usage/mig-mixed-with-selector.yml: -------------------------------------------------------------------------------- 1 | # This yaml file will launch a container with a 1g.5gb MIG device 2 | # It will deploy only onto the specified GPU type (A100 40GB) even if this profile is available on other systems 3 | apiVersion: v1 4 | kind: Pod 5 | metadata: 6 | name: gpu-pod 7 | spec: 8 | nodeSelector: 9 | nvidia.com/gpu.product: A100-SXM4-40GB 10 | containers: 11 | - name: gpu-pod 12 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 13 | command: ["/bin/sh"] 14 | args: ["-c", "nvidia-smi"] 15 | resources: 16 | limits: 17 | nvidia.com/mig-1g.5gb: 1 18 | -------------------------------------------------------------------------------- /workloads/examples/slurm/dask-rapids/files/launch-dask-cuda-worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ANACONDA_ROOT="/usr/local/anaconda" 4 | CONDA_ENV="/shared/conda" 5 | export PATH="${CONDA_ENV}/bin:${ANACONDA_ROOT}/bin:${PATH}" 6 | 7 | # shellcheck disable=SC1091 8 | source activate "${CONDA_ENV}" 9 | 10 | export CUDA_ROOT=/usr/local/cuda 11 | export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_ROOT/lib64" 12 | export NUMBAPRO_NVVM="$CUDA_ROOT/nvvm/lib64/libnvvm.so" 13 | export NUMBAPRO_LIBDEVICE="$CUDA_ROOT/nvvm/libdevice" 14 | 15 | echo "Launching dask-cuda-worker with scheduler $1 and port $2" 16 | dask-cuda-worker "$1:$2" || echo "Unable to start worker" 17 | -------------------------------------------------------------------------------- /roles/slurm/tasks/setup-role.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: gather os specific variables 3 | include_vars: "{{ item }}" 4 | with_first_found: 5 | - files: 6 | - "{{ ansible_distribution|lower }}.yml" 7 | - "{{ ansible_os_family|lower }}.yml" 8 | paths: 9 | - ../vars 10 | skip: true 11 | tags: 12 | - always 13 | 14 | - name: trust GPG key for EPEL 15 | rpm_key: 16 | key: "{{ epel_key_url }}" 17 | state: present 18 | when: ansible_os_family == "RedHat" 19 | 20 | - name: add epel repo 21 | yum: 22 | name: 23 | - "{{ epel_package }}" 24 | state: present 25 | when: ansible_os_family == "RedHat" 26 | 
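As a usage sketch for the launch-dask-cuda-worker.sh script above, the worker might be started from a Slurm allocation once a scheduler is running; the scheduler host name slurm-node-01, the GPU count, and port 8786 (the Dask default) are placeholders for illustration, not values taken from this repository:

    # Hypothetical Slurm invocation; adjust host, port, and GPU count for your cluster
    srun --nodes=1 --gres=gpu:8 bash launch-dask-cuda-worker.sh slurm-node-01 8786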
-------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-slurm-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source workloads/jenkins/scripts/jenkins-common.sh 3 | 4 | # Upload test script 5 | scp \ 6 | -o "StrictHostKeyChecking no" \ 7 | -o "UserKnownHostsFile /dev/null" \ 8 | -i "${HOME}/.ssh/id_rsa" \ 9 | workloads/jenkins/scripts/remote-script-for-slurm-gpu.sh \ 10 | "vagrant@10.0.0.5${GPU01}:remote-script-for-slurm-gpu.sh" 11 | 12 | # Compile and run CUDA sample 13 | ssh \ 14 | -o "StrictHostKeyChecking no" \ 15 | -o "UserKnownHostsFile /dev/null" \ 16 | -l vagrant \ 17 | -i "${HOME}/.ssh/id_rsa" \ 18 | "10.0.0.5${GPU01}" \ 19 | "bash -l /home/vagrant/remote-script-for-slurm-gpu.sh" 20 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-usage/mig-single.yml: -------------------------------------------------------------------------------- 1 | # This yaml file will launch a container with a 1g.5gb MIG device 2 | # It will only deploy onto a node containing A100 40GB GPUs 3 | # The node must be configured in mig-strategy=single mode with all 1g.5gb profiles created 4 | apiVersion: v1 5 | kind: Pod 6 | metadata: 7 | name: gpu-pod 8 | spec: 9 | nodeSelector: 10 | nvidia.com/gpu.product: A100-SXM4-40GB-MIG-1g.5gb 11 | containers: 12 | - name: gpu-pod 13 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 14 | command: ["/bin/sh"] 15 | args: ["-c", "nvidia-smi"] 16 | resources: 17 | limits: 18 | nvidia.com/gpu: 1 19 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingoff: -------------------------------------------------------------------------------- 1 | # Disable hyperthreading if requested 2 | scontrol show job $SLURM_JOBID | grep Comment | grep -i nohyperthreading > /dev/null 3 | if [ $? -eq 0 ]; then 4 | cat `find /sys/devices/system/cpu -name thread_siblings_list` | sort | uniq > /tmp/thread_siblings_list 5 | for sibs in `cat /tmp/thread_siblings_list` ; do 6 | echo $sibs | grep ',' >& /dev/null # if there is a comma (','), then we need to disable the 2nd sibling 7 | if [ $?
-eq 0 ] ; then 8 | x=`echo $sibs | cut -f 2 -d ','` 9 | echo Disabling CPU $x 10 | echo 0 > /sys/devices/system/cpu/cpu$x/online 11 | fi 12 | done 13 | fi 14 | -------------------------------------------------------------------------------- /roles/ood-wrapper/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: gather os specific variables 3 | include_vars: "{{ item }}" 4 | with_first_found: 5 | - files: 6 | - "{{ ansible_distribution|lower }}.yml" 7 | - "{{ ansible_os_family|lower }}.yml" 8 | paths: 9 | - ../vars 10 | skip: true 11 | tags: vars 12 | 13 | - name: Setup Open OnDemand server 14 | include_tasks: server.yml 15 | when: ood_is_server 16 | 17 | - name: Setup Open OnDemand client 18 | include_tasks: client.yml 19 | when: ood_is_client 20 | 21 | - name: Setup linuxhost adapter 22 | include_tasks: linuxhost-adapter.yml 23 | when: ood_is_server and ood_install_linuxhost_adapter 24 | -------------------------------------------------------------------------------- /roles/nvidia-peer-memory/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Check for DGX packages 3 | stat: 4 | path: /etc/dgx-release 5 | register: is_dgx 6 | 7 | - name: Autoinstall DKMS modules 8 | command: dkms autoinstall 9 | when: 10 | - ansible_local['gpus']['count'] 11 | - is_dgx.stat.exists 12 | 13 | - name: Modprobe nv_peer_mem 14 | modprobe: 15 | name: nv_peer_mem 16 | state: present 17 | when: 18 | - ansible_local['gpus']['count'] 19 | - is_dgx.stat.exists 20 | 21 | - name: Start nv_peer_mem service 22 | service: 23 | name: nv_peer_mem 24 | state: started 25 | when: 26 | - ansible_local['gpus']['count'] 27 | - is_dgx.stat.exists 28 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/tasks/install-dgx.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: DGX | Install CUDA on DGX-1 3 | package: 4 | name: "{{ cuda_dgx_override_version | default(cuda_dgx_1_version) }}" 5 | state: present 6 | when: ansible_product_name is search("DGX-1") 7 | 8 | - name: DGX | Install CUDA on DGX-2 9 | package: 10 | name: "{{ cuda_dgx_override_version | default(cuda_dgx_2_version) }}" 11 | state: present 12 | when: ansible_product_name is search("DGX-2") 13 | 14 | - name: DGX | Install CUDA on DGX A100 15 | package: 16 | name: "{{ cuda_dgx_override_version | default(cuda_dgx_a100_version) }}" 17 | state: present 18 | when: ansible_product_name is search("DGXA100") 19 | -------------------------------------------------------------------------------- /roles/prometheus-node-exporter/templates/docker.node-exporter.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Prometheus Node Exporter 3 | After=docker.service 4 | Requires=docker.service 5 | 6 | [Service] 7 | TimeoutStartSec=0 8 | Restart=always 9 | ExecStartPre=-/usr/bin/docker stop %n 10 | ExecStartPre=-/usr/bin/docker rm %n 11 | ExecStartPre=/usr/bin/docker pull {{ node_exporter_container }} 12 | ExecStart=/usr/bin/docker run --rm --network host --cpus={{ node_exporter_max_cpu }} --pid=host --name %n -v {{ node_exporter_prom_dir }}:/run/prometheus {{ node_exporter_container }} --collector.textfile.directory="{{ node_exporter_prom_dir }}" 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- 
/config.example/playbooks/example.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Your custom playbooks should go in your DeepOps configuration directory, 3 | # under `config/playbooks`. 4 | # 5 | # These playbooks can be used to make any customizations to your cluster 6 | # that aren't already provided by DeepOps! 7 | # For example, this playbook installs `cowsay` on all your Kubernetes nodes. 8 | # 9 | # For more details on how to write Ansible playbooks, see the Ansible 10 | # documentation: 11 | # https://docs.ansible.com/ansible/latest/user_guide/playbooks.html 12 | 13 | - hosts: kube-node 14 | become: yes 15 | tasks: 16 | - name: install cowsay 17 | package: 18 | name: cowsay 19 | state: present 20 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/tasks/install-ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ubuntu | remove old key 3 | apt_key: 4 | id: "{{ old_nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}" 5 | state: "absent" 6 | 7 | - name: Ubuntu | install CUDA keyring 8 | apt: 9 | deb: "{{ nvidia_driver_ubuntu_cuda_keyring_url }}" 10 | state: "present" 11 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 12 | 13 | - name: Ubuntu | force apt update 14 | apt: 15 | update_cache: true 16 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 17 | changed_when: false 18 | 19 | - name: Ubuntu | install package 20 | apt: 21 | name: "{{ dcgm_pkg_name }}" 22 | state: "present" 23 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/molecule/default/molecule.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependency: 3 | name: galaxy 4 | driver: 5 | name: docker 6 | platforms: 7 | - name: nvhpc-ubuntu-1804 8 | image: geerlingguy/docker-ubuntu1804-ansible 9 | pre_build_image: true 10 | - name: nvhpc-ubuntu-2004 11 | image: geerlingguy/docker-ubuntu2004-ansible 12 | pre_build_image: true 13 | - name: nvhpc-centos-7 14 | image: geerlingguy/docker-centos7-ansible 15 | pre_build_image: true 16 | # - name: nvhpc-centos-8 17 | # image: geerlingguy/docker-centos8-ansible 18 | # pre_build_image: true 19 | provisioner: 20 | name: ansible 21 | ansible_args: 22 | - -vv 23 | verifier: 24 | name: ansible 25 | -------------------------------------------------------------------------------- /roles/openmpi/molecule/default/molecule.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependency: 3 | name: galaxy 4 | driver: 5 | name: docker 6 | platforms: 7 | - name: openmpi-ubuntu-1804 8 | image: geerlingguy/docker-ubuntu1804-ansible 9 | pre_build_image: true 10 | - name: openmpi-ubuntu-2004 11 | image: geerlingguy/docker-ubuntu2004-ansible 12 | pre_build_image: true 13 | - name: openmpi-centos-7 14 | image: geerlingguy/docker-centos7-ansible 15 | pre_build_image: true 16 | # - name: openmpi-centos-8 17 | # image: geerlingguy/docker-centos8-ansible 18 | # pre_build_image: true 19 | provisioner: 20 | name: ansible 21 | ansible_args: 22 | - -vv 23 | verifier: 24 | name: ansible 25 | -------------------------------------------------------------------------------- /workloads/examples/k8s/nbody.yml: -------------------------------------------------------------------------------- 1 | # kubectl apply -f tests/nbody.yml 2 | # kubectl scale deploy/cuda-nbody --replicas=2 3 | 
apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: cuda-nbody 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: cuda-nbody 12 | template: 13 | metadata: 14 | labels: 15 | app: cuda-nbody 16 | spec: 17 | containers: 18 | - name: cuda-nbody-container 19 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 20 | command: ["/bin/sh"] 21 | args: ["-c", "nbody -benchmark -numbodies=1000192"] 22 | resources: 23 | limits: 24 | nvidia.com/gpu: 1 25 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/epilog-ecc: -------------------------------------------------------------------------------- 1 | # Make sure ECC is on. 2 | nvidia-smi -a | grep -A1 Ecc | grep -i disabled > /dev/null 3 | if [ $? -eq 0 ]; then 4 | logger -t PROLOG "Enabling ECC" 5 | nvidia-smi -e 1 6 | GPUCOUNT=`nvidia-smi -L | wc -l` 7 | GPUMAXINDEX=`expr $GPUCOUNT - 1` 8 | systemctl stop collectd 9 | logger -t PROLOG "Triggering GPU reset" 10 | for i in `seq 0 $GPUMAXINDEX`; do 11 | e=`nvidia-smi -r -i $i 2>&1` 12 | if [ $? -ne 0 ]; then 13 | logger -t PROLOG "WARNING! GPU $i reset failed" 14 | logger -t PROLOG "GPU $i reset error: $e" 15 | fi 16 | done 17 | logger -t PROLOG "GPU reset done" 18 | fi 19 | -------------------------------------------------------------------------------- /roles/nvidia-dcgm-exporter/templates/docker.dcgm-exporter.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=NVIDIA DCGM Exporter 3 | After=docker.service 4 | Requires=docker.service 5 | 6 | [Service] 7 | TimeoutStartSec=0 8 | Restart=always 9 | ExecStartPre=-/usr/bin/docker stop %n 10 | ExecStartPre=-/usr/bin/docker rm %n 11 | ExecStartPre=/usr/bin/docker pull {{ nvidia_dcgm_container }} 12 | ExecStart=/usr/bin/docker run --rm --gpus all --cap-add=SYS_ADMIN --cpus="{{ nvidia_dcgm_max_cpu }}" --name %n -p 9400:9400 -v "{{ nvidia_dcgm_container_config_dir }}/{{ nvidia_dcgm_container_custom_metrics_file }}:/etc/dcgm-exporter/default-counters.csv" {{ nvidia_dcgm_container }} 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Check for DGX 3 | fail: 4 | msg: "Role supports DGX systems only" 5 | when: ansible_product_name is not search("DGX") 6 | 7 | - name: Ubuntu tasks for DGX OS 4/5 8 | include_tasks: ubuntu.yml 9 | when: 10 | - ansible_distribution == 'Ubuntu' 11 | 12 | - name: redhat family tasks 13 | include_tasks: redhat.yml 14 | when: ansible_os_family == 'RedHat' 15 | 16 | - name: configure raid array 17 | include_tasks: configure-raid.yml 18 | when: dgx_configure_raid_array 19 | 20 | - name: perform full OS upgrade on Ubuntu 21 | include_tasks: ubuntu-upgrade.yml 22 | when: 23 | - dgx_full_upgrade 24 | - ansible_distribution == 'Ubuntu' 25 | -------------------------------------------------------------------------------- /roles/openshift/molecule/default/molecule.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependency: 3 | name: galaxy 4 | driver: 5 | name: docker 6 | platforms: 7 | - name: openshift-ubuntu-1804 8 | image: geerlingguy/docker-ubuntu1804-ansible 9 | pre_build_image: true 10 | - name: openshift-ubuntu-2004 11 | image: geerlingguy/docker-ubuntu2004-ansible 12 | pre_build_image: true 13 | - name: 
openshift-centos-7 14 | image: geerlingguy/docker-centos7-ansible 15 | pre_build_image: true 16 | # - name: openshift-centos-8 17 | # image: geerlingguy/docker-centos8-ansible 18 | # pre_build_image: true 19 | provisioner: 20 | name: ansible 21 | ansible_args: 22 | - -vv 23 | verifier: 24 | name: ansible 25 | -------------------------------------------------------------------------------- /roles/docker-login/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure python prereqs are installed 3 | package: 4 | name: "{{ item }}" 5 | state: present 6 | with_items: 7 | - "python3-setuptools" 8 | - "python3-pip" 9 | 10 | - name: ensure docker pip package is installed 11 | pip: 12 | name: "docker" 13 | state: present 14 | 15 | - name: log into docker registry 16 | docker_login: 17 | state: "{{ docker_login_state }}" 18 | registry: "{{ item.registry }}" 19 | username: "{{ item.username }}" 20 | password: "{{ item.password }}" 21 | reauthorize: "{{ docker_login_reauth }}" 22 | with_items: "{{ docker_login_registries }}" 23 | no_log: "{{ docker_login_disable_log_password }}" 24 | -------------------------------------------------------------------------------- /roles/cachefilesd/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install 3 | package: 4 | name: cachefilesd 5 | state: "{{ cachefilesd_package_state }}" 6 | 7 | - name: enable 8 | template: 9 | src: cachefilesd.j2 10 | dest: /etc/default/cachefilesd 11 | owner: "root" 12 | group: "root" 13 | mode: "0644" 14 | 15 | - name: configure 16 | template: 17 | src: cachefilesd_config.j2 18 | dest: /etc/cachefilesd.conf 19 | owner: "root" 20 | group: "root" 21 | mode: "0644" 22 | 23 | # Service start not tested in molecule as we require a kernel module 24 | - name: start 25 | service: 26 | name: cachefilesd 27 | state: restarted 28 | enabled: yes 29 | tags: 30 | - molecule-notest 31 | -------------------------------------------------------------------------------- /workloads/services/k8s/k8s-dashboard-admin.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: admin-user 6 | namespace: kube-system 7 | --- 8 | apiVersion: v1 9 | kind: Secret 10 | metadata: 11 | name: admin-user-secret 12 | annotations: 13 | kubernetes.io/service-account.name: admin-user 14 | namespace: kube-system 15 | type: kubernetes.io/service-account-token 16 | 17 | --- 18 | apiVersion: rbac.authorization.k8s.io/v1 19 | kind: ClusterRoleBinding 20 | metadata: 21 | name: admin-user 22 | roleRef: 23 | apiGroup: rbac.authorization.k8s.io 24 | kind: ClusterRole 25 | name: cluster-admin 26 | subjects: 27 | - kind: ServiceAccount 28 | name: admin-user 29 | namespace: kube-system 30 | -------------------------------------------------------------------------------- /roles/prometheus-slurm-exporter/defaults/main.yml: -------------------------------------------------------------------------------- 1 | slurm_exporter_container: "deepops/prometheus-slurm-exporter:latest" 2 | slurm_exporter_svc_name: "docker.slurm-exporter.service" 3 | slurm_exporter_state: started 4 | slurm_exporter_enabled: yes 5 | 6 | slurm_install_prefix: /usr/local 7 | 8 | prometheus_config_dir: /etc/prometheus 9 | prometheus_cfg_endpoint_dir: "{{ prometheus_config_dir }}/endpoints" 10 | slurm_exporter_conf_template: "slurm-exporter.yml.j2" 11 | 12 | grafana_svc_name: 
"docker.grafana.service" 13 | grafana_data_dir: /var/lib/grafana 14 | grafana_cfg_dashboard_path: "{{ grafana_data_dir }}/dashboards" 15 | grafana_user_id: 472 16 | 17 | slurm_exporter_host_group: "{{ slurm_monitoring_group | default('slurm-master') }}" 18 | -------------------------------------------------------------------------------- /roles/slurm/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart munge 3 | service: 4 | name: munge 5 | state: restarted 6 | 7 | - name: restart slurmd 8 | service: 9 | name: slurmd 10 | state: restarted 11 | when: is_compute 12 | 13 | - name: restart slurmdbd 14 | service: 15 | name: slurmdbd 16 | state: restarted 17 | when: is_controller 18 | 19 | - name: restart slurmctld 20 | service: 21 | name: slurmctld 22 | state: restarted 23 | when: is_controller 24 | 25 | - name: restart logind 26 | service: 27 | name: systemd-logind.service 28 | state: restarted 29 | enabled: yes 30 | when: is_compute 31 | 32 | - name: restart rsyslog 33 | service: 34 | name: rsyslog 35 | state: restarted 36 | -------------------------------------------------------------------------------- /roles/nvidia-dcgm-exporter/defaults/main.yml: -------------------------------------------------------------------------------- 1 | nvidia_dcgm_container_version: "2.1.8-2.4.0-rc.2-ubuntu20.04" 2 | nvidia_dcgm_container: "nvcr.io/nvidia/k8s/dcgm-exporter:{{ nvidia_dcgm_container_version }}" 3 | nvidia_dcgm_container_config_dir: "/opt/deepops/nvidia-dcgm-exporter" 4 | nvidia_dcgm_container_custom_metrics_file: "dcgm-custom-metrics.csv" 5 | nvidia_dcgm_prom_dir: "/run/prometheus" 6 | nvidia_dcgm_svc_name: "docker.dcgm-exporter.service" 7 | nvidia_dcgm_state: started 8 | nvidia_dcgm_enabled: yes 9 | 10 | prometheus_config_dir: /etc/prometheus 11 | prometheus_cfg_endpoint_dir: "{{ prometheus_config_dir }}/endpoints" 12 | nvidia_dcgm_exporter_conf_template: "dcgm-exporter.yml.j2" 13 | 14 | has_gpus: false 15 | 16 | nvidia_dcgm_max_cpu: "0.5" 17 | -------------------------------------------------------------------------------- /workloads/examples/slurm/dask-rapids/files/conda-requirements.yml: -------------------------------------------------------------------------------- 1 | name: rapids 2 | channels: 3 | - numba 4 | - conda-forge 5 | - nvidia/label/cuda10.0 6 | - rapidsai/label/cuda10.0 7 | - pytorch 8 | - defaults 9 | dependencies: 10 | - arrow-cpp=0.12 11 | - bokeh 12 | - cffi=1.11.5 13 | - cmake=3.12 14 | - cuda100 15 | - cudf=0.5.1 16 | - cuml=0.5.1 17 | - cython=0.29 18 | - dask=1.1.1 19 | - distributed=1.25.3 20 | - faiss-gpu=1.5.0 21 | - jupyterlab 22 | - matplotlib 23 | - numba=0.42 24 | - numpy=1.15.4 25 | - nvstrings 26 | - pandas=0.23.4 27 | - paramiko 28 | - pyarrow=0.12 29 | - pytest 30 | - python=3.7 31 | - scikit-learn 32 | - scipy 33 | - pip: 34 | - setuptools 35 | - cupy-cuda100 36 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/pxe.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: pxe-server 5 | spec: 6 | hostNetwork: true 7 | containers: 8 | - name: pxe-server 9 | image: deepops/provision/pxe # change me 10 | volumeMounts: 11 | - name: config-volume 12 | mountPath: /data 13 | - name: nfs 14 | mountPath: "/iso" 15 | imagePullSecrets: 16 | - name: secret # change me 17 | volumes: 18 | - name: config-volume 19 | configMap: 20 | name: pxe 21 | 
items: 22 | - key: machines.json 23 | path: machines.json 24 | - name: nfs 25 | persistentVolumeClaim: 26 | claimName: nfs-dgx-iso 27 | restartPolicy: Never 28 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/container-registry.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: kube-master 3 | become: true 4 | tasks: 5 | - name: Install helm chart for container registry 6 | include_role: 7 | name: k8s-internal-container-registry 8 | run_once: true 9 | tags: 10 | - container-registry 11 | 12 | - hosts: kube-node 13 | become: true 14 | vars: 15 | container_registry_hostname: registry.local 16 | tasks: 17 | - name: Set registry hostname in /etc/hosts 18 | lineinfile: 19 | path: /etc/hosts 20 | line: "{{ hostvars[groups['kube-master'][0]]['ansible_host'] | default(hostvars[groups['kube-master'][0]]['ansible_default_ipv4']['address']) }} {{ container_registry_hostname }}" 21 | tags: 22 | - container-registry 23 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/.yamllint: -------------------------------------------------------------------------------- 1 | --- 2 | # Based on ansible-lint config 3 | extends: default 4 | 5 | rules: 6 | braces: 7 | max-spaces-inside: 1 8 | level: error 9 | brackets: 10 | max-spaces-inside: 1 11 | level: error 12 | colons: 13 | max-spaces-after: -1 14 | level: error 15 | commas: 16 | max-spaces-after: -1 17 | level: error 18 | comments: disable 19 | comments-indentation: disable 20 | document-start: disable 21 | empty-lines: 22 | max: 3 23 | level: error 24 | hyphens: 25 | level: error 26 | indentation: disable 27 | key-duplicates: enable 28 | line-length: disable 29 | new-line-at-end-of-file: disable 30 | new-lines: 31 | type: unix 32 | trailing-spaces: disable 33 | truthy: disable 34 | -------------------------------------------------------------------------------- /workloads/bit/hpl/syscfg-dgx1v.sh: -------------------------------------------------------------------------------- 1 | GPU_AFFINITY="0:1:2:3:4:5:6:7" 2 | CPU_AFFINITY="0-4:5-9:10-14:15-19:20-24:25-29:30-34:35-39" 3 | CPU_CORES_PER_RANK=4 4 | MEM_AFFINITY="0:0:0:0:1:1:1:1" 5 | UCX_AFFINITY="mlx5_0:mlx5_0:mlx5_1:mlx5_1:mlx5_2:mlx5_2:mlx5_3:mlx5_3" 6 | GPU_CLOCK="877,1275" 7 | 8 | export MONITOR_GPU=1 9 | export TEST_SYSTEM_PARAMS=1 10 | export TEST_LOOPS=1 11 | export GPU_CLOCK_WARNING=$(echo ${GPU_CLOCK} | cut -f2 -d,) 12 | export GPU_POWER_WARNING=300 13 | export GPU_PCIE_GEN_WARNING=3 14 | export GPU_PCIE_WIDTH_WARNING=16 15 | 16 | ## Depending on driver version, you may need to uncomment the following line 17 | # export LD_LIBRARY_PATH="/usr/local/cuda/compat:$LD_LIBRARY_PATH 18 | 19 | export UCX_TLS=all 20 | export OMPI_MCA_pml_ucx_verbose=100 21 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmhealth: -------------------------------------------------------------------------------- 1 | # 2 | # Check that all GPUs are healthy via dcgm 3 | # 4 | if [ $NUMGPUS -gt 0 ]; then 5 | echo "Execute dcgm health check" 6 | GPULIST=`nvidia-smi | grep Tesla | awk -vORS=, '{print $2}' | sed 's/,$/\n/'` 7 | rm /tmp/dcgm.out 2> /dev/null 8 | nv-hostengine 9 | dcgmi group -c gpuinfo 10 | dcgmi group -g 1 -a $GPULIST 11 | dcgmi diag -g 1 -r 1 1> /tmp/dcgm.out 12 | dcgmi group -d 1 13 | nv-hostengine -t 14 | grep -i fail /tmp/dcgm.out > /dev/null 15 | if [ $? 
-gt 0 ]; then 16 | scontrol update nodename=$HOSTNAME state=drain reason="Failed DCGM, see /tmp/dcgm.out" 17 | exit 0 18 | fi 19 | fi 20 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-usage/gpu-without-selector.yml: -------------------------------------------------------------------------------- 1 | # This yaml file will launch a container onto any node that has a nvidia.com/gpu resource 2 | # This container could potentially be deployed on an NVIDIA A100, V100, T4, or any other GPU type 3 | # If a node is configured with mig-strategy=single, this container could potentially run with a MIG device 4 | # 5 | # This deployment style is unpredictable in a heterogeneous cluster and should not be used 6 | apiVersion: v1 7 | kind: Pod 8 | metadata: 9 | name: gpu-pod 10 | spec: 11 | containers: 12 | - name: gpu-pod 13 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 14 | command: ["/bin/sh"] 15 | args: ["-c", "nvidia-smi"] 16 | resources: 17 | limits: 18 | nvidia.com/gpu: 1 19 | -------------------------------------------------------------------------------- /config.example/helm/metallb-resources.yml: -------------------------------------------------------------------------------- 1 | # This was autogenerated by MetalLB's custom resource generator. 2 | apiVersion: metallb.io/v1beta1 3 | kind: IPAddressPool 4 | metadata: 5 | creationTimestamp: null 6 | name: default 7 | namespace: deepops-loadbalancer 8 | # Default address range matches private network for the virtual cluster 9 | # defined in virtual/. 10 | # You should set this address range based on your site's infrastructure. 11 | spec: 12 | addresses: 13 | - 10.0.0.100-10.0.0.110 14 | status: {} 15 | --- 16 | apiVersion: metallb.io/v1beta1 17 | kind: L2Advertisement 18 | metadata: 19 | creationTimestamp: null 20 | name: l2advertisement1 21 | namespace: deepops-loadbalancer 22 | spec: 23 | ipAddressPools: 24 | - default 25 | status: {} 26 | --- 27 | -------------------------------------------------------------------------------- /playbooks/container/pyxis.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: slurm-master 3 | tasks: 4 | - name: set controller fact 5 | set_fact: 6 | is_controller: true 7 | tags: always 8 | 9 | - hosts: slurm-node 10 | tasks: 11 | - name: set compute fact 12 | set_fact: 13 | is_compute: true 14 | tags: always 15 | 16 | - hosts: slurm-cluster 17 | become: yes 18 | tasks: 19 | - name: set enroot DGX config fact 20 | set_fact: 21 | enroot_environ_config_files: "{{ enroot_environ_config_files_dgx }}" 22 | when: ansible_product_name is search("DGX") 23 | 24 | - hosts: slurm-node 25 | become: yes 26 | roles: 27 | - name: nvidia.enroot 28 | 29 | - hosts: slurm-cluster 30 | become: yes 31 | roles: 32 | - name: pyxis 33 | -------------------------------------------------------------------------------- /roles/facts/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: apt install pciutils 3 | apt: name=pciutils update_cache=yes 4 | when: ansible_os_family == 'Debian' 5 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 6 | 7 | - name: yum install pciutils 8 | yum: name=pciutils update_cache=yes 9 | when: ansible_os_family == 'RedHat' 10 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 11 | 12 | - name: create fact directory 13 | file: 14 | path: /etc/ansible/facts.d 15 | state: directory 16 | mode: 0755 17 | 18 | - name: 
custom facts 19 | copy: 20 | src: "{{ item }}" 21 | dest: /etc/ansible/facts.d/ 22 | mode: 0755 23 | with_fileglob: 24 | - '*' 25 | 26 | - name: regather local facts 27 | setup: filter=ansible_local 28 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/.yamllint: -------------------------------------------------------------------------------- 1 | --- 2 | # Based on ansible-lint config 3 | extends: default 4 | 5 | rules: 6 | braces: 7 | max-spaces-inside: 1 8 | level: error 9 | brackets: 10 | max-spaces-inside: 1 11 | level: error 12 | colons: 13 | max-spaces-after: -1 14 | level: error 15 | commas: 16 | max-spaces-after: -1 17 | level: error 18 | comments: disable 19 | comments-indentation: disable 20 | document-start: disable 21 | empty-lines: 22 | max: 3 23 | level: error 24 | hyphens: 25 | level: error 26 | indentation: disable 27 | key-duplicates: enable 28 | line-length: disable 29 | new-line-at-end-of-file: disable 30 | new-lines: 31 | type: unix 32 | trailing-spaces: disable 33 | truthy: disable 34 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/molecule/default/molecule.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependency: 3 | name: galaxy 4 | options: 5 | requirements-file: requirements.yml 6 | driver: 7 | name: docker 8 | platforms: 9 | - name: singularity-ubuntu-1804 10 | image: geerlingguy/docker-ubuntu1804-ansible 11 | pre_build_image: true 12 | - name: singularity-ubuntu-2004 13 | image: geerlingguy/docker-ubuntu2004-ansible 14 | pre_build_image: true 15 | - name: singularity-centos-7 16 | image: geerlingguy/docker-centos7-ansible 17 | pre_build_image: true 18 | # - name: singularity-centos-8 19 | # image: geerlingguy/docker-centos8-ansible 20 | # pre_build_image: true 21 | provisioner: 22 | name: ansible 23 | ansible_args: 24 | - -vv 25 | verifier: 26 | name: ansible 27 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-dashboard.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | # Ensure working directory is root 6 | cd "${ROOT_DIR}" 7 | 8 | # Deploy Dashboard 9 | source ./scripts/k8s/deploy_dashboard_user.sh 10 | 11 | # The deployment script exports the http endpoints, verify it returns a 200 12 | # It typically takes ~1 minutes for all pods and services to start, so we poll 13 | timeout=120 14 | time=0 15 | while [ ${time} -lt ${timeout} ]; do 16 | curl -ks --raw -kL "${dashboard_url}" | grep "Kubernetes Dashboard" && \ 17 | echo "Dashboard URLs are all responding" && exit 0 18 | let time=$time+15 19 | sleep 15 20 | done 21 | 22 | # Dashboard deployment failure 23 | echo "Dashboard did not come up in time" 24 | exit 1 25 | -------------------------------------------------------------------------------- /roles/nhc/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_version: "1.4.3" 3 | nhc_src_url: "https://github.com/mej/nhc/releases/download/{{ nhc_version }}/lbnl-nhc-{{ nhc_version }}.tar.xz" 4 | nhc_install_dir: "/usr" 5 | nhc_config_dir: "/etc" 6 | nhc_libexec_dir: "/usr/libexec" 7 | nhc_build_dir: "/opt/deepops/build/nhc" 8 | nhc_sysconfig_dir: "/etc/sysconfig" 9 | 10 | nhc_extract_dir: "{{ nhc_build_dir }}/lbnl-nhc-{{ nhc_version }}" 11 | 
nhc_configure: "./configure --prefix={{ nhc_install_dir }} --sysconfdir={{ nhc_config_dir }} --libexecdir={{ nhc_libexec_dir }}" 12 | nhc_test: "make test" 13 | nhc_run_test: false 14 | nhc_make: "make install" 15 | 16 | nhc_config_template: "nhc.conf.j2" 17 | nhc_sysconfig_template: "sysconfig_nhc.j2" 18 | 19 | nhc_force_reinstall: false 20 | nhc_config: "nhc.conf.j2" 21 | -------------------------------------------------------------------------------- /workloads/examples/k8s/dask-rapids/k8s/rapids-dask-sa.yml: -------------------------------------------------------------------------------- 1 | kind: Role 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | namespace: rapids 5 | name: dask-scaler 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods"] 9 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 10 | --- 11 | kind: RoleBinding 12 | apiVersion: rbac.authorization.k8s.io/v1 13 | metadata: 14 | name: dask-scaler 15 | namespace: rapids 16 | subjects: 17 | - kind: ServiceAccount 18 | name: default # TODO: Create specific user for jupyter 19 | namespace: rapids 20 | roleRef: 21 | kind: Role 22 | name: dask-scaler 23 | apiGroup: rbac.authorization.k8s.io 24 | --- 25 | apiVersion: v1 26 | kind: ServiceAccount 27 | metadata: 28 | name: dask-rapids 29 | namespace: rapids 30 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-tests/README.md: -------------------------------------------------------------------------------- 1 | Role Name 2 | ========= 3 | # NVIDIA GPU Tests Role 4 | 5 | This role is meant to be a quick tool for system validation or simple system burn in. It should not be used as a comprehensive performance test. 6 | 7 | Running this will perform the following: 8 | 9 | * Install the CUDA toolkit 10 | * Download and build cuda-samples 11 | * Run the Peer2Peer and MatrixMultiply tests 12 | * Run the DCGM diagnostics 13 | * Run a basic Tensorflow DL job 14 | 15 | 16 | # Requirements 17 | 18 | This role can be applied to a heterogeneous cluster of GPU nodes. 
19 | 20 | The following should be installed on the system prior to running this role (these come standard in the DGX Operating System): 21 | 22 | * CUDA toolkit 23 | * dcgmi 24 | * nvidia-docker 25 | * docker 26 | 27 | -------------------------------------------------------------------------------- /roles/nvidia-dgx-firmware/tasks/get-ib.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: get hostname 3 | shell: hostname 4 | register: hostname 5 | 6 | - name: check mlxconfig 7 | become: yes 8 | shell: "mlxconfig query | egrep -e Device\\|LINK_TYPE_P1\\|LINK_TYPE_P2" 9 | register: mlx_config 10 | ignore_errors: yes 11 | 12 | - name: check ibstat 13 | become: yes 14 | shell: "ibstat | egrep -e mlx\\|Link" 15 | register: ibstat 16 | ignore_errors: yes 17 | 18 | - name: save actual hostname 19 | shell: echo "{{ hostname.stdout }}" >> "{{ fw_dir }}/{{ inventory_hostname }}.log" 20 | - name: save mlx_config 21 | shell: echo "{{ mlx_config.stdout }}" >> "{{ fw_dir }}/{{ inventory_hostname }}.log" 22 | - name: save ibstat 23 | shell: echo "{{ ibstat.stdout }}" >> "{{ fw_dir }}/{{ inventory_hostname }}.log" 24 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/tasks/install-redhat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: RedHat | trust GPG key for EPEL 3 | rpm_key: 4 | key: "{{ epel_key_url }}" 5 | state: present 6 | 7 | - name: RedHat | add epel repo 8 | become: yes 9 | yum: 10 | name: 11 | - "{{ epel_package }}" 12 | state: present 13 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 14 | 15 | - name: RedHat | add CUDA repo 16 | yum_repository: 17 | name: cuda 18 | description: NVIDIA CUDA YUM Repo 19 | baseurl: "{{ nvidia_driver_rhel_cuda_repo_baseurl }}" 20 | gpgkey: "{{ nvidia_driver_rhel_cuda_repo_gpgkey }}" 21 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 22 | 23 | - name: RedHat | install cuda 24 | package: 25 | name: "{{ cuda_version }}" 26 | state: present 27 | -------------------------------------------------------------------------------- /roles/ood-wrapper/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | install_from_src: false 2 | 3 | ood_apache_service_name: httpd24-httpd.service 4 | ood_htpasswd_file: /opt/rh/httpd24/root/etc/httpd/.htpasswd 5 | 6 | ood_url_turbovnc_pkg: https://downloads.sourceforge.net/project/turbovnc/2.2.4/turbovnc-2.2.4.x86_64.rpm 7 | 8 | ood_master_sw_deps: 9 | - lz4-devel 10 | - unzip 11 | - python-websockify 12 | 13 | ood_client_sw_deps: 14 | - lz4-devel 15 | - unzip 16 | - nmap 17 | - python-websockify 18 | - '@xfce4' 19 | - xfce4-session 20 | - xfce4-settings 21 | - xfce4-terminal 22 | - xfdesktop 23 | - gtk-xfce-engine 24 | - gtk2-engines 25 | - python2-jupyter-core 26 | - python2-jupyroot 27 | - python36-jupyter-core 28 | - python36-jupyroot 29 | - dbus-x11 30 | - firefox 31 | - cuda-nvvp-10-1 32 | -------------------------------------------------------------------------------- /roles/slurm/tasks/misc-node.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Configure nodes that are neither running controller services, nor slurmd. 3 | # Examples include login nodes or CI nodes. 
4 | 5 | - name: create slurm directories 6 | file: 7 | path: "{{ item }}" 8 | state: directory 9 | owner: slurm 10 | mode: 0755 11 | with_items: 12 | - "{{ slurm_config_dir }}" 13 | 14 | - name: configure slurm.conf 15 | template: 16 | src: "{{ slurm_conf_template }}" 17 | dest: "{{ slurm_config_dir }}/slurm.conf" 18 | mode: "0644" 19 | tags: 20 | - config 21 | 22 | - name: ensure all slurm services are stopped 23 | service: 24 | name: "{{ item }}" 25 | state: stopped 26 | enabled: no 27 | with_items: 28 | - slurmctld 29 | - slurmd 30 | - slurmdbd 31 | failed_when: false 32 | -------------------------------------------------------------------------------- /scripts/deepops/enable_linting.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FAILED=0 4 | if ! which ansible-lint; then 5 | echo "ansible-lint not found in PATH" 6 | FAILED=1 7 | fi 8 | if ! which shellcheck; then 9 | echo "shellcheck not found in PATH" 10 | FAILED=1 11 | fi 12 | if ! which pylint; then 13 | echo "pylint not found in PATH" 14 | FAILED=1 15 | fi 16 | if [ ${FAILED} -ne 0 ]; then 17 | echo 18 | echo 'One or more required linters not found!' 19 | echo 'Please install the missing linter using pip or your system package manager,' 20 | echo 'and try again.' 21 | echo 22 | echo 'Pre-commit hook not enabled.' 23 | exit 1 24 | fi 25 | 26 | echo "Enabling pre-commit hooks to lint Ansible, Shell, and Python" 27 | cp -v src/repo/githooks/pre-commit .git/hooks/pre-commit 28 | chmod +x .git/hooks/pre-commit 29 | -------------------------------------------------------------------------------- /docs/cloud-native/README.md: -------------------------------------------------------------------------------- 1 | # Deprecated 2 | 3 | Up-to-date Ansible playbooks and install guides for Cloud Native Core can now be found in the [dedicated NVIDIA/cloud-native-core repository](https://github.com/NVIDIA/cloud-native-core). 4 | 5 | ## NVIDIA Cloud Native Core 6 | 7 | NVIDIA Cloud Native Core is a collection of software to run cloud native workloads on NVIDIA GPUs. 8 | NVIDIA Cloud Native Core is based on Ubuntu, Kubernetes, Helm, and the NVIDIA GPU and Network Operators. 9 | 10 | This software stack was previously known as "EGX Stack", and is designed to run well on the [NVIDIA EGX Platform](https://www.nvidia.com/en-us/data-center/products/egx/) 11 | 12 | In DeepOps 22.01 and before, we included Ansible playbooks for installing and validating the EGX Platform in `playbooks/nvidia-egx`. 
13 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/tasks/install-redhat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: RedHat | trust GPG key for EPEL 3 | rpm_key: 4 | key: "{{ epel_key_url }}" 5 | state: present 6 | 7 | - name: RedHat | add epel repo 8 | become: yes 9 | yum: 10 | name: 11 | - "{{ epel_package }}" 12 | state: present 13 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 14 | 15 | - name: RedHat | add CUDA repo 16 | yum_repository: 17 | name: cuda 18 | description: NVIDIA CUDA YUM Repo 19 | baseurl: "{{ nvidia_driver_rhel_cuda_repo_baseurl }}" 20 | gpgkey: "{{ nvidia_driver_rhel_cuda_repo_gpgkey }}" 21 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 22 | 23 | - name: RedHat | install package 24 | package: 25 | name: "{{ dcgm_pkg_name }}" 26 | state: "present" 27 | -------------------------------------------------------------------------------- /roles/rsyslog_server/templates/01-deepops-listen.conf: -------------------------------------------------------------------------------- 1 | # {{ ansible_managed }} 2 | # Define ruleset for per-host files 3 | template(name="perhost" type="string" string="{{ rsyslog_log_file_path_pattern }}") 4 | ruleset(name="remote") { 5 | action(type="omfile" dynafile="perhost") 6 | } 7 | {% if rsyslog_enable_journal -%} 8 | # Import journal messages into syslog 9 | module(load="imjournal") 10 | {% endif -%} 11 | {% if rsyslog_server_tcp_port is defined -%} 12 | # Accept syslog messages on TCP 13 | module(load="imtcp") 14 | input(type="imtcp" port="{{ rsyslog_server_tcp_port }}" ruleset="remote") 15 | {% endif -%} 16 | {% if rsyslog_server_udp_port -%} 17 | # Accept syslog messages on UDP 18 | module(load="imudp") 19 | input(type="imudp" port="{{ rsyslog_server_udp_port }}" ruleset="remote") 20 | {% endif -%} 21 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/nvidia-dcgm-exporter.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - name: Install NVIDIA driver 6 | import_playbook: ../nvidia-software/nvidia-driver.yml 7 | 8 | - name: Install NVIDIA container runtime 9 | import_playbook: ../container/nvidia-docker.yml 10 | 11 | - hosts: "{{ hostlist | default('all') }}" 12 | become: yes 13 | tasks: 14 | - name: install custom facts module 15 | include_role: 16 | name: facts 17 | - name: set GPU fact 18 | set_fact: 19 | has_gpus: true 20 | when: ansible_local['gpus']['count'] 21 | - name: configure dcgm exporter 22 | include_role: 23 | name: nvidia-dcgm-exporter 24 | when: ansible_distribution == "Ubuntu" or ansible_os_family == "RedHat" 25 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/tasks/install-ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ubuntu | remove ppa 3 | apt_repository: 4 | repo: ppa:graphics-drivers/ppa 5 | state: absent 6 | 7 | - name: Ubuntu | ensure old key is absent 8 | apt_key: 9 | id: "{{ old_nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}" 10 | state: "absent" 11 | 12 | - name: Ubuntu | install CUDA keyring 13 | apt: 14 | deb: "{{ nvidia_driver_ubuntu_cuda_keyring_url }}" 15 | state: "present" 16 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 17 | 18 | - name: Ubuntu | force 
apt update 19 | apt: 20 | update_cache: true 21 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 22 | changed_when: false 23 | 24 | - name: Ubuntu | install cuda 25 | package: 26 | name: "{{ cuda_version }}" 27 | state: present 28 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/shared/bin/set_gpu_power_levels.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | gpu_count="$(nvidia-smi -L | wc -l)" 5 | 6 | for i in $(seq 0 "$(( gpu_count - 1 ))" ) 7 | do 8 | case "$1" in 9 | max) 10 | next="$(nvidia-smi -i "$i" --query-gpu=power.max_limit --format=csv,noheader,nounits)" 11 | ;; 12 | default) 13 | next="$(nvidia-smi -i "$i" --query-gpu=power.default_limit --format=csv,noheader,nounits)" 14 | ;; 15 | min) 16 | next="$(nvidia-smi -i "$i" --query-gpu=power.min_limit --format=csv,noheader,nounits)" 17 | ;; 18 | *) 19 | echo "Usage: $0 [max,default,min]" 20 | exit 1 21 | ;; 22 | esac 23 | nvidia-smi -i "$i" -pl "$next" 24 | done 25 | -------------------------------------------------------------------------------- /workloads/examples/k8s/ingress-nodeport.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Cluster ingress controller 3 | # An ingress controller routes external traffic to services 4 | # 5 | 6 | # Ingress controller 7 | controller: 8 | # Use host network to listen on ports 80 and 443 9 | hostNetwork: true 10 | # Service type LoadBalancer requires a load balancer to be configured, e.g. 11 | # MetalLB in an on-prem cluster. See metallb.yml for a sample definition. 12 | # NodePort can be used instead where we don't have a load balancer. 13 | service: 14 | type: NodePort 15 | # Always run on control-plane nodes 16 | nodeSelector: 17 | node-role.kubernetes.io/control-plane: "" 18 | 19 | # Ingress back-end 20 | defaultBackend: 21 | # Always run on control-plane nodes 22 | nodeSelector: 23 | node-role.kubernetes.io/control-plane: "" 24 | -------------------------------------------------------------------------------- /workloads/examples/slurm/mpi-hello/mpi-hello.c: -------------------------------------------------------------------------------- 1 | #include <mpi.h> 2 | #include <stdio.h> 3 | #include <unistd.h> 4 | 5 | int main(int argc, char **argv) { 6 | // Initialize MPI 7 | MPI_Init(&argc, &argv); 8 | 9 | // Get the number of processes in the global communicator 10 | int count; 11 | MPI_Comm_size(MPI_COMM_WORLD, &count); 12 | 13 | // Get the rank of the current process 14 | int rank; 15 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 16 | 17 | // Get the current hostname 18 | char hostname[1024]; 19 | gethostname(hostname, sizeof(hostname)); 20 | 21 | // Print a hello world message for this rank 22 | printf("Hello from process %d of %d on host %s\n", rank, count, hostname); 23 | 24 | // Finalize the MPI environment before exiting 25 | MPI_Finalize(); 26 | } 27 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-tests/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Directories to download samples 3 | gpu_test_cuda_dir: "/tmp/cuda-samples" 4 | gpu_test_samples_dir: "{{ gpu_test_cuda_dir }}/Samples" 5 | 6 | # Whether to install the CUDA toolkit package 7 | # Set to no if the CUDA toolkit is installed 8 | gpu_test_install_toolkit: no 9 | 10 | # Whether or not to download and build cuda-samples 11 | # Set to no if 
they have already been built once 12 | gpu_test_build: yes 13 | 14 | # 1 -> quick test (seconds), 2 -> regular test (minute), 3 -> long test (minutes) 15 | gpu_test_dcgm_level: 3 16 | 17 | # Batch size to use for Tensorflow tests 18 | # DGX-2 -> (512, 50); DGX-1 -> (256,50); DGX-Station -> (128, 50) 19 | gpu_test_tf_batch_size: 128 # (choose from 32, 64, 128, 256, 512) 20 | gpu_test_tf_layers: 50 # (choose from 18, 34, 50, 101, 152) 21 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-usage/gpu-with-selector.yml: -------------------------------------------------------------------------------- 1 | # This yaml file will launch a container with a 32GB V100 GPU 2 | # Specifying the nvidia.com/gpu.product label and the nvidia.com/gpu resource type 3 | # restricts the type of GPU this container will run on 4 | # 5 | # Specifying both a nodeSelector and resource type supports clusters with multiple GPU types or MIG configurations 6 | # This is the preferred method of deployment 7 | apiVersion: v1 8 | kind: Pod 9 | metadata: 10 | name: gpu-pod 11 | spec: 12 | nodeSelector: 13 | nvidia.com/gpu.product: Tesla-V100-DGXS-32GB 14 | containers: 15 | - name: gpu-pod 16 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 17 | command: ["/bin/sh"] 18 | args: ["-c", "nvidia-smi"] 19 | resources: 20 | limits: 21 | nvidia.com/gpu: 1 22 | -------------------------------------------------------------------------------- /workloads/examples/k8s/ingress-loadbalancer.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Cluster ingress controller 3 | # An ingress controller routes external traffic to services 4 | # 5 | 6 | # Ingress controller 7 | controller: 8 | # Use host network to listen on ports 80 and 443 9 | hostNetwork: true 10 | # Service type LoadBalancer requires a load balancer to be configured, e.g. 11 | # MetalLB in an on-prem cluster. See metallb.yml for a sample definition. 12 | # NodePort can be used instead where we don't have a load balancer. 13 | service: 14 | type: LoadBalancer 15 | # Always run on control-plane nodes 16 | nodeSelector: 17 | node-role.kubernetes.io/control-plane: "" 18 | 19 | # Ingress back-end 20 | defaultBackend: 21 | # Always run on control-plane nodes 22 | nodeSelector: 23 | node-role.kubernetes.io/control-plane: "" 24 | -------------------------------------------------------------------------------- /roles/prometheus/templates/prometheus.yml.j2: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: {{ prometheus_cfg_scrape_interval }} # Default is every 1 minute. 3 | evaluation_interval: {{ prometheus_cfg_evaluation_interval }} # Default is every 1 minute. 4 | 5 | # Alertmanager configuration 6 | alerting: 7 | alertmanagers: 8 | - scheme: http 9 | static_configs: 10 | - targets: ['localhost:9093'] 11 | 12 | # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. 
13 | rule_files: 14 | - 'rules/alert_rules.yml' 15 | # - 'rules/second_rules.yml' 16 | 17 | scrape_configs: 18 | - job_name: 'cluster' 19 | file_sd_configs: 20 | - files: 21 | - {{ prometheus_cfg_endpoint_dir }}/*.yml 22 | - job_name: 'prometheus' 23 | static_configs: 24 | - targets: ['localhost:9090'] 25 | -------------------------------------------------------------------------------- /workloads/bit/hpl/syscfg-dgx2.sh: -------------------------------------------------------------------------------- 1 | GPU_AFFINITY="0:1:2:3:4:5:6:7:8:9:10:11:12:13:14:15" 2 | CPU_AFFINITY="0-2:3-5:6-8:9-11:12-14:15-17:18-20:21-23:24-26:27-29:30-32:33-35:36-38:39-41:42-44:45-47" 3 | CPU_CORES_PER_RANK=3 4 | MEM_AFFINITY="0:0:0:0:0:0:0:0:1:1:1:1:1:1:1:1" 5 | UCX_AFFINITY="mlx5_1:mlx5_1:mlx5_2:mlx5_2:mlx5_3:mlx5_3:mlx5_4:mlx5_4:mlx5_7:mlx5_7:mlx5_8:mlx5_8:mlx5_9:mlx5_9:mlx5_10:mlx5_10" 6 | GPU_CLOCK="877,1275" 7 | 8 | export MONITOR_GPU=1 9 | export TEST_SYSTEM_PARAMS=1 10 | export TEST_LOOPS=1 11 | export GPU_CLOCK_WARNING=$(echo ${GPU_CLOCK} | cut -f2 -d,) 12 | export GPU_POWER_WARNING=350 13 | export GPU_PCIE_GEN_WARNING=3 14 | export GPU_PCIE_WIDTH_WARNING=16 15 | 16 | ## Depending on driver version, you may need to uncomment the following line 17 | # export LD_LIBRARY_PATH="/usr/local/cuda/compat:$LD_LIBRARY_PATH" 18 | 19 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: check if we are running on DGX 3 | stat: 4 | path: "/etc/dgx-release" 5 | register: is_dgx 6 | 7 | - name: DGX install tasks 8 | include_tasks: install-dgx.yml 9 | when: is_dgx.stat.exists == True 10 | 11 | - name: ubuntu install tasks 12 | include_tasks: install-ubuntu.yml 13 | when: (ansible_distribution == "Ubuntu") and (is_dgx.stat.exists == False) 14 | 15 | - name: redhat family install tasks 16 | include_tasks: install-redhat.yml 17 | when: (ansible_os_family == "RedHat") and (is_dgx.stat.exists == False) 18 | 19 | - name: add profile script to set environment for toolkit 20 | copy: 21 | src: "cuda-vars.sh" 22 | dest: "/etc/profile.d/cuda-vars.sh" 23 | owner: "root" 24 | group: "root" 25 | mode: "0644" 26 | when: cuda_toolkit_add_profile_script 27 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/prolog-ecc: -------------------------------------------------------------------------------- 1 | # Disable ECC if requested 2 | scontrol show job $SLURM_JOBID | grep Comment | grep -i ecc > /dev/null 3 | if [ $? -eq 0 ]; then 4 | logger -t PROLOG "Disabling ECC" 5 | nvidia-smi -e 0 6 | GPUCOUNT=`nvidia-smi -L | wc -l` 7 | GPUMAXINDEX=`expr $GPUCOUNT - 1` 8 | systemctl stop collectd 9 | logger -t PROLOG "Triggering GPU reset" 10 | for i in `seq 0 $GPUMAXINDEX`; do 11 | logger -t PROLOG "Resetting GPU $i" 12 | e=`nvidia-smi -r -i $i 2>&1` 13 | if [ $? -ne 0 ]; then 14 | logger -t PROLOG "WARNING!
GPU $i reset failed" 15 | logger -t PROLOG "GPU $i reset error: $e" 16 | nvidia-smi -e 1 17 | fi 18 | sleep 1 19 | done 20 | logger -t PROLOG "GPU reset done" 21 | systemctl start collectd 22 | fi 23 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/netapp-trident.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Playbook for deploying NetApp Trident 3 | 4 | - name: "Install NFS utils on worker nodes" 5 | hosts: kube-node 6 | become: true 7 | become_method: sudo 8 | tasks: 9 | - name: install nfs utils (Ubuntu) 10 | package: 11 | name: nfs-common 12 | when: ansible_os_family == "Debian" 13 | - name: install nfs utils (Red Hat / CentOS) 14 | package: 15 | name: nfs-utils 16 | when: ansible_os_family == "RedHat" 17 | 18 | - name: "Deploy NetApp Trident" 19 | hosts: kube-master 20 | become: true 21 | vars_files: 22 | - ../../config/group_vars/netapp-trident.yml 23 | environment: 24 | PATH: /usr/local/bin/:{{ ansible_env.PATH }} 25 | tasks: 26 | - name: Include netapp trident role 27 | run_once: true 28 | include_role: 29 | name: netapp-trident 30 | -------------------------------------------------------------------------------- /roles/autofs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: assert that variables are defined 3 | fail: 4 | msg: "Variable '{{ item }}' is not defined" 5 | when: item not in vars 6 | with_items: 7 | - autofs_mount 8 | - autofs_map 9 | 10 | - name: install packages 11 | package: name=autofs 12 | 13 | - name: configure /home 14 | template: 15 | src: templates/master.j2 16 | dest: /etc/auto.master 17 | owner: root 18 | group: root 19 | mode: 0644 20 | notify: "restart autofs" 21 | tags: 22 | - configuration 23 | 24 | - name: ensure mountpoint exists 25 | file: 26 | path: "{{ autofs_mount }}" 27 | state: directory 28 | owner: "root" 29 | group: "root" 30 | mode: "0755" 31 | when: autofs_mount is defined 32 | 33 | - name: make sure autofs is running 34 | service: name=autofs state=started enabled=yes 35 | -------------------------------------------------------------------------------- /roles/nvidia-dgx-firmware/tasks/get-data.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is used to generate a spreadsheet mapping NICs/IPs/Hostnames to a cluster of DGX-2s 3 | - name: Run NVSM human-readable health show 4 | shell: "osversion=`cat /etc/dgx-release | grep 'DGX_SWBUILD_VERSION'`; host=`hostname`; copper_mac=`ip addr | grep -A 1 enp6s0 | grep link | awk '{print $2}'`; bmc_ip=`sudo ipmitool lan print | grep -i 'IP Address :' | awk '{print $4}'`; bmc_mac=`sudo ipmitool lan print | grep -i 'MAC Address' | awk '{print $4}'`;host_mac=`ifconfig {{ nv_mgmt_interface }} | grep 'ether' | awk '{print $2}'`;host_ip=`ifconfig {{ nv_mgmt_interface }} | grep 'inet ' | awk '{print $2}'`;data=\"${host_ip},${host_mac},${bmc_ip},${bmc_mac},${copper_mac},${host},${osversion}\";echo ${data}" 5 | register: command_output 6 | - debug: 7 | msg: "{{ command_output.stdout }}" 8 | ignore_errors: yes 9 | -------------------------------------------------------------------------------- /scripts/generic/install_docker.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Source common libraries and env variables 4 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 5 | 
ROOT_DIR="${SCRIPT_DIR}/../.." 6 | source ${ROOT_DIR}/scripts/common.sh 7 | 8 | DOCKER_COMPOSE_URL="${DOCKER_COMPOSE_URL:-https://github.com/docker/compose/releases/download/1.23.2/docker-compose-$(uname -s)-$(uname -m)}" 9 | 10 | type docker >/dev/null 2>&1 11 | if [ $? -ne 0 ] ; then 12 | get_docker=$(mktemp) 13 | curl -fsSL get.docker.com -o ${get_docker} 14 | sudo sh ${get_docker} 15 | sudo rm -f ${get_docker} 16 | sudo usermod -aG docker $(whoami) 17 | fi 18 | 19 | type docker-compose >/dev/null 2>&1 20 | if [ $? -ne 0 ] ; then 21 | sudo curl -L "${DOCKER_COMPOSE_URL}" -o /usr/local/bin/docker-compose 22 | sudo chmod +x /usr/local/bin/docker-compose 23 | fi 24 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-mpi-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source workloads/jenkins/scripts/jenkins-common.sh 3 | 4 | # Upload MPI source 5 | scp \ 6 | -o "StrictHostKeyChecking no" \ 7 | -o "UserKnownHostsFile /dev/null" \ 8 | -i "${HOME}/.ssh/id_rsa" \ 9 | workloads/examples/slurm/mpi-hello/mpi-hello.c \ 10 | "vagrant@10.0.0.5${GPU01}:mpi-hello.c" 11 | 12 | # Upload test script 13 | scp \ 14 | -o "StrictHostKeyChecking no" \ 15 | -o "UserKnownHostsFile /dev/null" \ 16 | -i "${HOME}/.ssh/id_rsa" \ 17 | workloads/jenkins/scripts/remote-script-for-mpi.sh \ 18 | "vagrant@10.0.0.5${GPU01}:remote-script-for-mpi.sh" \ 19 | 20 | # Compile the program 21 | ssh \ 22 | -o "StrictHostKeyChecking no" \ 23 | -o "UserKnownHostsFile /dev/null" \ 24 | -l vagrant \ 25 | -i "${HOME}/.ssh/id_rsa" \ 26 | "10.0.0.5${GPU01}" \ 27 | "bash -l /home/vagrant/remote-script-for-mpi.sh" 28 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/shared/bin/set_gpu_clocks.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | gpu_count="$(nvidia-smi -L | wc -l)" 5 | 6 | case "$1" in 7 | default) 8 | nvidia-smi -rac # Reset application clocks 9 | nvidia-smi -acp 0 # Reset application clock permissions 10 | nvidia-smi -c DEFAULT # Reset compute mode to default 11 | ;; 12 | max) 13 | for i in $(seq 0 "$(( gpu_count - 1 ))" ) ; do 14 | nextSM="$(nvidia-smi -i "$i" --query-gpu=clocks.max.sm --format=csv,noheader,nounits)" 15 | nextMEM="$(nvidia-smi -i "$i" --query-gpu=clocks.max.mem --format=csv,noheader,nounits)" 16 | nvidia-smi -i "${i}" -ac "${nextMEM}","${nextSM}" 17 | done 18 | ;; 19 | *) 20 | echo "Usage: $0 [default|max]" 21 | exit 1 22 | ;; 23 | esac 24 | -------------------------------------------------------------------------------- /src/repo/githooks/check-python.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Get a list of changed python scripts that are staged for commit. 4 | Run pylint on only those files.
5 | """ 6 | 7 | 8 | from __future__ import print_function 9 | import subprocess 10 | import re 11 | import sys 12 | 13 | 14 | def get_changed_paths(): 15 | git_diff = subprocess.check_output("git diff --name-only --cached".split(), universal_newlines=True) 16 | paths = [] 17 | for f in git_diff.split("\n"): 18 | # Collect staged Python files 19 | if re.match(r".*(\.py)$", f): 20 | paths.append(f) 21 | return paths 22 | 23 | 24 | def run_lint(paths): 25 | cmd = ["pylint", "-rn", "-sn", "-d", "R,C"] + paths 26 | return subprocess.call(cmd) 27 | 28 | 29 | if __name__ == "__main__": 30 | changed = get_changed_paths() 31 | if len(changed) > 0: 32 | sys.exit(run_lint(changed)) 33 | -------------------------------------------------------------------------------- /src/repo/githooks/check-shell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Get a list of changed bash scripts that are staged for commit. 4 | Run shellcheck on only those files. 5 | """ 6 | 7 | 8 | from __future__ import print_function 9 | import subprocess 10 | import re 11 | import sys 12 | 13 | 14 | def get_changed_shell_paths(): 15 | git_diff = subprocess.check_output("git diff --name-only --cached".split(), universal_newlines=True) 16 | paths = [] 17 | for f in git_diff.split("\n"): 18 | # Collect staged shell scripts 19 | if re.match(r".*(\.sh|\.bash)$", f): 20 | paths.append(f) 21 | return paths 22 | 23 | 24 | def run_lint(paths): 25 | cmd = ["shellcheck", "-x"] + paths 26 | return subprocess.call(cmd) 27 | 28 | 29 | if __name__ == "__main__": 30 | changed = get_changed_shell_paths() 31 | if len(changed) > 0: 32 | sys.exit(run_lint(changed)) 33 | -------------------------------------------------------------------------------- /ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | collections_paths = ./collections 3 | roles_path = ./roles/galaxy:./roles:./submodules/kubespray/roles 4 | library = ./submodules/kubespray/library 5 | inventory = ./config/inventory 6 | host_key_checking = False 7 | gathering = smart 8 | fact_caching = jsonfile 9 | fact_caching_connection = /var/tmp/ansible_cache 10 | fact_caching_timeout = 86400 11 | deprecation_warnings = False 12 | #vault_password_file = ./config/.vault-pass 13 | timeout=60 14 | stdout_callback = yaml 15 | bin_ansible_callbacks = True 16 | local_tmp=/tmp 17 | remote_tmp=/tmp 18 | forks = 25 19 | force_valid_group_names = ignore 20 | ansible_python_interpreter = /usr/bin/python3 21 | 22 | [ssh_connection] 23 | pipelining = True 24 | ssh_args = -o ControlMaster=auto -o ControlPersist=5m -o ConnectionAttempts=100 -o UserKnownHostsFile=/dev/null 25 | control_path = ~/.ssh/ansible-%%r@%%h:%%p 26 | -------------------------------------------------------------------------------- /playbooks/nvidia-software/nvidia-driver.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: true 4 | tags: 5 | - nvidia 6 | - nvidia_driver 7 | tasks: 8 | - name: Check for DGX packages 9 | stat: 10 | path: /etc/dgx-release 11 | register: is_dgx 12 | 13 | - name: install custom facts 14 | include_role: 15 | name: facts 16 | 17 | - name: install nvidia driver 18 | include_role: 19 | name: nvidia.nvidia_driver 20 | when: (ansible_local['gpus']['count'] and is_dgx.stat.exists == False) or (nvidia_driver_force_install|default(false)) 21 | 22 | - name: test nvidia-smi 23 | command: nvidia-smi 24 | changed_when: false 25 | when: 26 | -
ansible_local['gpus']['count'] 27 | - is_dgx.stat.exists == False 28 | environment: "{{proxy_env if proxy_env is defined else{}}}" 29 | -------------------------------------------------------------------------------- /.github/workflows/ansible-lint-roles.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: run ansible-lint on deepops roles 3 | on: 4 | - push 5 | - pull_request 6 | jobs: 7 | lint: 8 | runs-on: ubuntu-20.04 9 | steps: 10 | 11 | - name: check out repo 12 | uses: actions/checkout@v2 13 | with: 14 | path: "${{ github.repository }}" 15 | 16 | - name: set up python 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: "3.9" 20 | 21 | - name: install dependencies 22 | run: | 23 | python3 -m pip install --upgrade pip 24 | python3 -m pip install ansible-lint==5.4.0 ansible==4.8.0 25 | 26 | - name: run lint script 27 | env: 28 | ANSIBLE_LINT_EXCLUDE: "nvidia-dgx|nvidia-gpu-tests" 29 | run: | 30 | cd "${{ github.repository }}" 31 | bash ./scripts/deepops/ansible-lint-roles.sh 32 | -------------------------------------------------------------------------------- /roles/nfs-client-provisioner/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # See the GitHub code repo: https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner 3 | 4 | - name: install nfs-client-provisioner helm repo 5 | command: /usr/local/bin/helm repo add --force-update "{{ k8s_nfs_client_repo_name }}" "{{ k8s_nfs_client_helm_repo }}" 6 | changed_when: false 7 | 8 | - name: update helm repos 9 | command: /usr/local/bin/helm repo update 10 | changed_when: false 11 | 12 | - name: install nfs-client-provisioner 13 | command: /usr/local/bin/helm upgrade --install "{{ k8s_nfs_client_release_name }}" "{{ k8s_nfs_client_chart_name }}" --create-namespace --namespace deepops-nfs-client-provisioner --version "{{ k8s_nfs_client_chart_version }}" --set nfs.server="{{ k8s_nfs_server }}" --set nfs.path="{{ k8s_nfs_export_path }}" --set storageClass.defaultClass="{{ k8s_nfs_default_sc }}" --wait 14 | changed_when: false 15 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-slurm-nfs-mount.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source workloads/jenkins/scripts/jenkins-common.sh 5 | 6 | # showmount path is different between centos and ubuntu 7 | if [ "${DEEPOPS_VAGRANT_OS}" == "centos" ]; then 8 | ssh -v \ 9 | -o "StrictHostKeyChecking no" \ 10 | -o "UserKnownHostsFile /dev/null" \ 11 | -l vagrant \ 12 | -i "${HOME}/.ssh/id_rsa" \ 13 | "10.0.0.5${GPU01}" \ 14 | "/usr/sbin/showmount -e | grep home" 15 | else 16 | ssh -v \ 17 | -o "StrictHostKeyChecking no" \ 18 | -o "UserKnownHostsFile /dev/null" \ 19 | -l vagrant \ 20 | -i "${HOME}/.ssh/id_rsa" \ 21 | "10.0.0.5${GPU01}" \ 22 | "showmount -e | grep home" 23 | fi 24 | 25 | 26 | ssh -v \ 27 | -o "StrictHostKeyChecking no" \ 28 | -o "UserKnownHostsFile /dev/null" \ 29 | -l vagrant \ 30 | -i "${HOME}/.ssh/id_rsa" \ 31 | "10.0.0.6${GPU01}" \ 32 | "mount | grep nfs | grep home" 33 | --------------------------------------------------------------------------------
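A minimal usage sketch for two of the examples above, assuming the commands run from the repository root on a deployed cluster: a Slurm login node with an MPI compiler wrapper (mpicc) and Slurm's PMI-based MPI launch configured, plus a kubectl context for the Kubernetes cluster and a node carrying the matching nvidia.com/gpu.product label. The exact launch flags depend on the site's MPI and Slurm configuration.

# Compile and run the MPI hello-world example with 4 ranks
mpicc workloads/examples/slurm/mpi-hello/mpi-hello.c -o mpi-hello
srun --ntasks=4 ./mpi-hello

# Launch the GPU pod example and check that it sees a GPU
kubectl apply -f workloads/examples/k8s/gpu-usage/gpu-with-selector.yml
kubectl logs gpu-pod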