├── config.example ├── helm │ ├── dcgm-exporter.yml │ ├── metallb.yml │ └── metallb-resources.yml ├── containers │ └── dgx-firmware │ │ └── .gitkeep ├── files │ └── kubeflow │ │ └── user-namespace-params.env ├── pxe │ ├── ipmi_host_list │ ├── ipmi.conf │ └── dnsmasq.extra.conf ├── host_vars │ └── gpu01 ├── requirements.yml ├── env.sh ├── README.md └── playbooks │ └── example.yml ├── workloads ├── services │ └── k8s │ │ ├── dgxie │ │ ├── templates │ │ │ └── NOTES.txt │ │ ├── Chart.yaml │ │ └── .helmignore │ │ └── k8s-dashboard-admin.yml ├── examples │ ├── k8s │ │ ├── kubeflow-pipeline-deploy │ │ │ ├── __init__.py │ │ │ ├── triton.py.tar.gz │ │ │ ├── kubeflow-pipelines-0.PNG │ │ │ ├── kubeflow-pipelines-1.PNG │ │ │ ├── kubeflow-pipelines-2.PNG │ │ │ └── kubeflow-pipelines-3.PNG │ │ ├── services │ │ │ ├── logging │ │ │ │ ├── README.md │ │ │ │ ├── kibana-service.yaml │ │ │ │ └── es-service.yaml │ │ │ ├── hello-world.yml │ │ │ ├── dhcpd.yml │ │ │ ├── nfs-client.yml │ │ │ ├── nfs-dgx-iso.yml │ │ │ ├── ambassador-service.yml │ │ │ └── pxe.yml │ │ ├── dask-rapids │ │ │ ├── parallel-sum.png │ │ │ ├── jupyterlab-nvsmi.png │ │ │ └── k8s │ │ │ │ └── rapids-dask-sa.yml │ │ ├── gpu-test-job.yml │ │ ├── deep-learning-examples │ │ │ ├── templates │ │ │ │ ├── service.yaml │ │ │ │ └── tests │ │ │ │ │ └── test-connection.yaml │ │ │ └── .helmignore │ │ ├── gpu-usage │ │ │ ├── mig-mixed-without-selector.yml │ │ │ ├── mig-mixed-with-selector.yml │ │ │ ├── mig-single.yml │ │ │ ├── gpu-without-selector.yml │ │ │ └── gpu-with-selector.yml │ │ ├── pytorch-job.yml │ │ ├── tensorflow-job.yml │ │ ├── cluster-gpu-test-job.yml │ │ ├── nbody.yml │ │ ├── ingress-nodeport.yml │ │ └── ingress-loadbalancer.yml │ └── slurm │ │ ├── mpi-hello │ │ ├── bootstrap-mpi.yml │ │ ├── hello-job.sh │ │ └── mpi-hello.c │ │ └── dask-rapids │ │ └── files │ │ ├── launch-dask-scheduler.sh │ │ ├── launch-dask-cuda-worker.sh │ │ └── conda-requirements.yml ├── bit │ ├── .gitignore │ └── hpl │ │ ├── syscfg-dgx1v.sh │ │ └── syscfg-dgx2.sh └── jenkins │ └── scripts │ ├── test-cluster-up.sh │ ├── test-setup-slurm.sh │ ├── files │ └── nginx-from-local-registry.yml │ ├── remote-script-for-mpi.sh │ ├── remote-script-for-slurm-gpu.sh │ ├── test-slurm-enroot-job.sh │ ├── remote-script-for-registry-test.sh │ ├── vagrant-startup.sh │ ├── test-ceph.sh │ ├── test-spack-minimal.sh │ ├── test-spack-install.sh │ ├── get-slurm-debug.sh │ ├── test-slurm-gpu.sh │ ├── test-dashboard.sh │ ├── test-mpi-job.sh │ └── test-slurm-nfs-mount.sh ├── roles ├── nhc │ ├── vars │ │ ├── main.yml │ │ ├── redhat.yml │ │ ├── ubuntu.yml │ │ ├── ubuntu-20.04.yml │ │ └── ubuntu-22.04.yml │ ├── .ansible-lint │ ├── meta │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ └── verify.yml │ ├── templates │ │ └── sysconfig_nhc.j2 │ └── defaults │ │ └── main.yml ├── mofed │ ├── vars │ │ ├── ubuntu.yml │ │ ├── rhel8.yml │ │ └── rhel7.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ └── verify.yml │ └── defaults │ │ └── main.yml ├── nfs │ ├── vars │ │ ├── redhat.yml │ │ └── ubuntu.yml │ ├── defaults │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── handlers │ │ └── main.yml │ └── templates │ │ └── exports.j2 ├── slurm │ ├── .ansible-lint │ ├── templates │ │ └── etc │ │ │ ├── munge │ │ │ └── munge.key.j2 │ │ │ ├── ld.so.conf.d │ │ │ └── slurm.conf.j2 │ │ │ ├── slurm │ │ │ ├── cgroup.conf │ │ │ ├── epilog.d │ │ │ │ ├── 80-exclusive-cleanup │ │ │ │ ├── 95-lastuserjob-rootless │ │ │ │ ├── 
60-exclusive-cpu │ │ │ │ ├── 40-lastuserjob-processes │ │ │ │ ├── 41-lastuserjob-ssh │ │ │ │ ├── 42-lastuserjob-cleanup │ │ │ │ └── 50-lastuserjob-all-enroot-dirs │ │ │ ├── prolog.d │ │ │ │ ├── 95-all-rootless │ │ │ │ ├── 50-exclusive-gpu │ │ │ │ ├── 50-exclusive-cpu │ │ │ │ └── 50-exclusive-ssh │ │ │ ├── epilog.sh │ │ │ ├── prolog.sh │ │ │ ├── gres.conf │ │ │ └── shared │ │ │ │ └── bin │ │ │ │ ├── set_gpu_power_levels.sh │ │ │ │ └── set_gpu_clocks.sh │ │ │ ├── localusers │ │ │ └── rsyslog.d │ │ │ └── 99-slurm.conf │ ├── meta │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── tasks │ │ ├── shmfix.yml │ │ ├── logging.yml │ │ ├── setup-user.yml │ │ ├── build-cleanup.yml │ │ ├── munge.yml │ │ ├── undrain.yml │ │ ├── service-files.yml │ │ ├── setup-role.yml │ │ └── misc-node.yml │ ├── vars │ │ ├── ubuntu.yml │ │ └── redhat.yml │ └── handlers │ │ └── main.yml ├── dns-config │ ├── defaults │ │ └── main.yml │ └── templates │ │ └── resolv.conf.j2 ├── netapp-trident │ ├── .ansible-lint │ └── templates │ │ └── namespace.j2 ├── autofs │ ├── templates │ │ └── master.j2 │ ├── handlers │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── move-home-dirs │ ├── defaults │ │ └── main.yml │ ├── vars │ │ └── main.yml │ └── tasks │ │ ├── move_user.yml │ │ └── main.yml ├── nvidia-dgx │ ├── templates │ │ ├── sources.list.j2 │ │ ├── cachefilesd.conf.j2 │ │ ├── dgx.list.j2 │ │ └── dgxos5.list.j2 │ ├── vars │ │ ├── dgxa100.yml │ │ └── redhat.yml │ ├── files │ │ └── nvidia-persistenced-override.conf │ ├── tasks │ │ ├── ubuntu-upgrade.yml │ │ ├── configure-raid.yml │ │ └── main.yml │ └── handlers │ │ └── main.yml ├── easy-build │ ├── meta │ │ └── main.yml │ ├── .ansible-lint │ ├── templates │ │ ├── z01_eb.sh │ │ └── z01_eb.csh │ └── defaults │ │ └── main.yml ├── openmpi │ ├── vars │ │ ├── redhat.yml │ │ └── ubuntu.yml │ ├── templates │ │ └── ld-openmpi.conf.j2 │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── prepare.yml │ │ │ ├── verify.yml │ │ │ └── molecule.yml │ └── defaults │ │ └── main.yml ├── nvidia-gpu-operator-node-prep │ ├── files │ │ ├── nvidia-driver.conf │ │ └── blocklist-nouveau.conf │ └── handlers │ │ └── main.yml ├── nvidia-gpu-operator │ ├── .ansible-lint │ ├── meta │ │ └── main.yml │ ├── templates │ │ ├── client_configuration_token.tok │ │ └── gridd.conf │ └── tasks │ │ └── main.yml ├── cachefilesd │ ├── templates │ │ └── cachefilesd.j2 │ ├── defaults │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ └── tasks │ │ └── main.yml ├── easy-build-packages │ ├── meta │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ ├── redhat-pre-install.yml │ │ └── ubuntu-pre-install.yml │ └── .ansible-lint ├── nvidia-k8s-gpu-device-plugin │ ├── meta │ │ └── main.yml │ ├── .ansible-lint │ └── defaults │ │ └── main.yml ├── nvidia-k8s-gpu-feature-discovery │ ├── meta │ │ └── main.yml │ ├── .ansible-lint │ └── defaults │ │ └── main.yml ├── kerberos_client │ ├── meta │ │ ├── .galaxy_install_info │ │ └── main.yml │ ├── .ansible-lint │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── defaults │ │ └── main.yml │ └── vars │ │ └── main.yml ├── ood-wrapper │ ├── vars │ │ ├── main.yml │ │ ├── ubuntu.yml │ │ └── redhat.yml │ ├── templates │ │ ├── desktop-submit.yml.erb.j2 │ │ ├── desktop.yml.j2 │ │ ├── bc_osc_codeserver │ │ │ ├── submit.yml.erb.j2 │ │ │ ├── manifest.yml.j2 │ │ │ └── form.yml.j2 │ │ ├── desktop-form.yml.j2 
│ │ └── cluster.yml.j2 │ └── tasks │ │ └── main.yml ├── pyxis │ ├── templates │ │ └── etc │ │ │ └── slurm │ │ │ ├── plugstack.conf │ │ │ └── plugstack.conf.d │ │ │ └── pyxis.conf │ ├── defaults │ │ └── main.yml │ └── handlers │ │ └── main.yml ├── facts │ ├── files │ │ ├── memory.fact │ │ └── gpus.fact │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ └── verify.yml │ └── tasks │ │ └── main.yml ├── nis_client │ ├── files │ │ └── policy-rc.d │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ └── handlers │ │ └── main.yml ├── roce_backend │ ├── tasks │ │ └── main.yml │ └── templates │ │ └── config_dp.j2 ├── singularity_wrapper │ ├── .ansible-lint │ ├── requirements.yml │ ├── roles.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── molecule.yml │ ├── meta │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ └── .yamllint ├── rsyslog_client │ ├── handlers │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── templates │ │ └── 99-forward-syslog.conf │ ├── tasks │ │ └── main.yml │ └── defaults │ │ └── main.yml ├── rsyslog_server │ ├── handlers │ │ └── main.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── defaults │ │ └── main.yml │ └── templates │ │ └── 01-deepops-listen.conf ├── grafana │ ├── handlers │ │ └── main.yml │ ├── templates │ │ ├── grafana.ini.j2 │ │ ├── prometheus-datasource.yml.j2 │ │ ├── prometheus-dashboard.yml.j2 │ │ └── docker.grafana.service.j2 │ └── defaults │ │ └── main.yml ├── nvidia_hpc_sdk │ ├── templates │ │ ├── z95_nvhpc_modules.csh │ │ ├── z95_nvhpc_modules.sh │ │ ├── z95_nvhpc.csh │ │ └── z95_nvhpc.sh │ └── molecule │ │ └── default │ │ ├── converge.yml │ │ ├── prepare.yml │ │ ├── verify.yml │ │ └── molecule.yml ├── prometheus │ ├── handlers │ │ └── main.yml │ ├── templates │ │ ├── alert_rules.yml.j2 │ │ ├── docker.prometheus.service.j2 │ │ └── prometheus.yml.j2 │ └── defaults │ │ └── main.yml ├── spack │ ├── templates │ │ ├── z00_spack.csh │ │ └── z00_spack.sh │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ └── verify.yml │ └── defaults │ │ └── main.yml ├── alertmanager │ ├── handlers │ │ └── main.yml │ ├── templates │ │ ├── alertmanager.yml.j2 │ │ └── docker.alertmanager.service.j2 │ └── defaults │ │ └── main.yml ├── nvidia-dcgm-exporter │ ├── handlers │ │ └── main.yml │ ├── templates │ │ ├── dcgm-exporter.yml.j2 │ │ └── docker.dcgm-exporter.service.j2 │ └── defaults │ │ └── main.yml ├── nvidia_dcgm │ ├── tasks │ │ ├── install-dgx.yml │ │ ├── install-ubuntu.yml │ │ └── install-redhat.yml │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── vars │ │ └── main.yml │ └── .yamllint ├── nginx-docker-registry-cache │ ├── handlers │ │ └── main.yml │ ├── templates │ │ └── http-proxy.conf │ └── tasks │ │ └── client.yml ├── nvidia-dgx-firmware │ └── tasks │ │ ├── get-time.yml │ │ ├── run-diagnostics.yml │ │ ├── get-health.yml │ │ ├── get-ib.yml │ │ └── get-data.yml ├── prometheus-node-exporter │ ├── handlers │ │ └── main.yml │ ├── templates │ │ ├── node-exporter.yml.j2 │ │ └── docker.node-exporter.service.j2 │ └── defaults │ │ └── main.yml ├── docker-rootless │ ├── templates │ │ ├── z96_rootlessdocker_modules.sh │ │ └── rootless-docker │ │ │ ├── bin │ │ │ └── nvidia-container-runtime-hook │ │ │ └── config │ │ │ └── nvidia-container-runtime │ │ │ └── config.toml │ └── defaults │ │ └── main.yml ├── lmod │ 
├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ └── verify.yml │ └── defaults │ │ └── main.yml ├── openshift │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── prepare.yml │ │ │ ├── verify.yml │ │ │ └── molecule.yml │ └── defaults │ │ └── main.yml ├── nvidia_cuda │ ├── molecule │ │ └── default │ │ │ ├── converge.yml │ │ │ ├── verify.yml │ │ │ └── prepare.yml │ ├── files │ │ └── cuda-vars.sh │ ├── vars │ │ └── main.yml │ └── tasks │ │ ├── install-dgx.yml │ │ ├── install-redhat.yml │ │ ├── install-ubuntu.yml │ │ └── main.yml ├── prometheus-slurm-exporter │ ├── templates │ │ └── slurm-exporter.yml.j2 │ ├── handlers │ │ └── main.yml │ └── defaults │ │ └── main.yml ├── nvidia-mig-manager │ └── defaults │ │ └── main.yml ├── docker-login │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── nvidia-network-operator │ └── templates │ │ └── values.yaml ├── nfs-client-provisioner │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── standalone-container-registry │ └── templates │ │ └── config.yml ├── nvidia-peer-memory │ └── tasks │ │ └── main.yml └── nvidia-gpu-tests │ ├── README.md │ └── defaults │ └── main.yml ├── playbooks ├── slurm-cluster │ ├── files │ │ └── cve_2021_44228.options │ ├── openmpi.yml │ ├── lmod.yml │ ├── open-ondemand.yml │ ├── nhc.yml │ ├── grafana.yml │ ├── spack-modules.yml │ ├── templates │ │ └── filebeat.conf │ ├── prometheus.yml │ ├── alertmanager.yml │ ├── easybuild-modules.yml │ ├── prometheus-node-exporter.yml │ ├── prometheus-slurm-exporter.yml │ └── nvidia-dcgm-exporter.yml ├── generic │ ├── cachefilesd.yml │ ├── dns-config.yml │ ├── anaconda.yml │ ├── authentication.yml │ ├── ntp-client.yml │ ├── nfs-client.yml │ ├── nfs-server.yml │ ├── rsyslog-client.yml │ ├── rsyslog-server.yml │ ├── chrony-client.yml │ ├── hosts.yml │ └── software.yml ├── provisioning │ └── maas.yml ├── container │ ├── singularity.yml │ ├── standalone-container-registry.yml │ ├── docker-login.yml │ ├── nginx-docker-registry-cache-server.yml │ ├── nginx-docker-registry-cache-client.yml │ ├── docker-rootless.yml │ └── pyxis.yml ├── nvidia-software │ ├── nvidia-dcgm.yml │ ├── nvidia-hpc-sdk.yml │ ├── nvidia-peer-memory.yml │ └── nvidia-driver.yml ├── utilities │ ├── nvidia-gpu-tests.yml │ ├── nvidia-set-gpu-clocks.yml │ ├── gpu-clocks.yml │ └── mofed.yml ├── bootstrap │ ├── bootstrap-openshift.yml │ └── bootstrap-rook.yml ├── nvidia-dgx │ ├── nvidia-dgx.yml │ ├── nvidia-dgx-diag.yml │ └── nvidia-dgx-fw-update.yml └── k8s-cluster │ ├── nvidia-network-operator.yaml │ ├── nvidia-k8s-gpu-device-plugin.yml │ ├── nvidia-k8s-gpu-feature-discovery.yml │ ├── nvidia-gpu-operator.yml │ ├── container-registry.yml │ └── netapp-trident.yml ├── virtual ├── .gitignore ├── vars_files │ ├── virt_k8s.yml │ └── virt_slurm.yml └── k8s_environment.sh ├── docs ├── img │ ├── roce_resnet50.PNG │ ├── nccl_latency_ring.PNG │ ├── nccl_bandwidth_ring.PNG │ ├── slurm_monitoring_grafana01.png │ ├── slurm_monitoring_grafana02.png │ ├── slurm_monitoring_grafana03.png │ ├── slurm_monitoring_grafana04.png │ ├── slurm_monitoring_grafana05.png │ ├── slurm_monitoring_grafana06.png │ ├── slurm_monitoring_alertmanager01.png │ ├── slurm_monitoring_alertmanager02.png │ └── slurm_monitoring_prometheus01.png ├── pxe │ └── maas-example-vms.png ├── slurm-cluster │ ├── ood-images │ │ ├── ood-01.png │ │ ├── ood-02.png │ │ ├── ood-03.png │ │ ├── ood-04.png │ │ ├── ood-05.png │ │ ├── ood-06.png │ │ ├── ood-07.png │ │ ├── ood-08.png │ │ └── ood-09.png │ └── slurm-prolog-epilog │ │ ├── prolog-checkmounts │ 
│ ├── hyperthreadingon │ │ ├── epilog-mps │ │ ├── epilog-dcgmstats │ │ ├── prolog-dcgmstats │ │ ├── prolog-lspci │ │ ├── hyperthreadingoff │ │ ├── epilog-ecc │ │ ├── prolog-dcgmhealth │ │ └── prolog-ecc └── cloud-native │ └── README.md ├── src ├── containers │ ├── dgxie │ │ ├── mboot.efi │ │ └── dnsmasq.conf │ ├── pxe │ │ └── dhcp │ │ │ ├── Dockerfile │ │ │ └── dnsmasq.conf │ └── ngc │ │ ├── pytorch │ │ └── Dockerfile-minimal │ │ ├── tensorflow │ │ └── Dockerfile-minimal │ │ ├── build.sh │ │ └── rapids │ │ └── Dockerfile-minimal └── repo │ ├── ansible-lint │ └── githooks │ ├── pre-commit │ ├── check-python.py │ └── check-shell.py ├── .gitignore ├── scripts ├── deepops │ ├── proxy.sh │ └── enable_linting.sh ├── pxe │ ├── build_and_restart_dgxie.sh │ └── setup_nat.sh └── generic │ ├── gpu_diag.sh │ └── install_docker.sh ├── .gitmodules ├── ansible.cfg └── .github └── workflows └── ansible-lint-roles.yml /config.example/helm/dcgm-exporter.yml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /config.example/containers/dgx-firmware/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /workloads/services/k8s/dgxie/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /roles/nhc/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_ssh_daemon: "sshd" 3 | -------------------------------------------------------------------------------- /roles/mofed/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mofed_distro: "ubuntu" 3 | -------------------------------------------------------------------------------- /roles/nfs/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | nfs_server_daemon: nfs-server 2 | -------------------------------------------------------------------------------- /roles/slurm/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info 3 | -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /roles/dns-config/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dns_config_search: [] 3 | -------------------------------------------------------------------------------- /roles/netapp-trident/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - var-naming 3 | -------------------------------------------------------------------------------- /roles/nfs/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | nfs_server_daemon: nfs-kernel-server 2 | -------------------------------------------------------------------------------- /roles/autofs/templates/master.j2: -------------------------------------------------------------------------------- 1 | {{ autofs_mount }} yp:{{ 
autofs_map }} 2 | -------------------------------------------------------------------------------- /roles/move-home-dirs/defaults/main.yml: -------------------------------------------------------------------------------- 1 | move_home_dirs_new_root: /local 2 | -------------------------------------------------------------------------------- /roles/nhc/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_build_deps: 3 | - gcc 4 | - make 5 | -------------------------------------------------------------------------------- /roles/nhc/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_build_deps: 3 | - build-essential 4 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/templates/sources.list.j2: -------------------------------------------------------------------------------- 1 | {{ dgx_default_ubuntu_repos }} 2 | -------------------------------------------------------------------------------- /roles/easy-build/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - { role: lmod } 4 | -------------------------------------------------------------------------------- /roles/move-home-dirs/vars/main.yml: -------------------------------------------------------------------------------- 1 | tmp_user: ansible-tmp-user-move_home_dirs 2 | -------------------------------------------------------------------------------- /roles/openmpi/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | openmpi_deps: 3 | - "@Development Tools" 4 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/files/cve_2021_44228.options: -------------------------------------------------------------------------------- 1 | -Dlog4j2.formatMsgNoLookups=true 2 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator-node-prep/files/nvidia-driver.conf: -------------------------------------------------------------------------------- 1 | i2c_core 2 | ipmi_msghandler 3 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info 3 | - role-name 4 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - role: openshift 4 | -------------------------------------------------------------------------------- /virtual/.gitignore: -------------------------------------------------------------------------------- 1 | /.vagrant/ 2 | /admin.conf 3 | /config 4 | /k8s-config 5 | Vagrantfile 6 | -------------------------------------------------------------------------------- /roles/cachefilesd/templates/cachefilesd.j2: -------------------------------------------------------------------------------- 1 | RUN={{ 'yes' if cachefilesd_enabled else 'no' }} 2 | -------------------------------------------------------------------------------- /roles/easy-build-packages/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - { role: easy-build 
} 4 | -------------------------------------------------------------------------------- /roles/nvidia-k8s-gpu-device-plugin/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - role: openshift 4 | -------------------------------------------------------------------------------- /docs/img/roce_resnet50.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/roce_resnet50.PNG -------------------------------------------------------------------------------- /roles/nvidia-k8s-gpu-feature-discovery/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - role: openshift 4 | -------------------------------------------------------------------------------- /roles/openmpi/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | openmpi_deps: 3 | - build-essential 4 | - libnuma-dev 5 | -------------------------------------------------------------------------------- /docs/pxe/maas-example-vms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/pxe/maas-example-vms.png -------------------------------------------------------------------------------- /roles/kerberos_client/meta/.galaxy_install_info: -------------------------------------------------------------------------------- 1 | {install_date: 'Tue Nov 8 19:19:09 2016', version: ''} 2 | -------------------------------------------------------------------------------- /roles/nhc/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info # meta/main.yml should contain relevant info 3 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator/templates/client_configuration_token.tok: -------------------------------------------------------------------------------- 1 | "{{ gpu_operator_nvaie_nls_token }}" 2 | -------------------------------------------------------------------------------- /roles/ood-wrapper/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | user: "{{ ansible_env.SUDO_USER | default(ansible_env.USER) }}" 3 | -------------------------------------------------------------------------------- /roles/pyxis/templates/etc/slurm/plugstack.conf: -------------------------------------------------------------------------------- 1 | include {{ slurm_config_dir }}/plugstack.conf.d/*.conf 2 | -------------------------------------------------------------------------------- /roles/pyxis/templates/etc/slurm/plugstack.conf.d/pyxis.conf: -------------------------------------------------------------------------------- 1 | required /usr/local/src/pyxis/spank_pyxis.so 2 | -------------------------------------------------------------------------------- /docs/img/nccl_latency_ring.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/nccl_latency_ring.PNG -------------------------------------------------------------------------------- /playbooks/generic/cachefilesd.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - cachefilesd 6 | 
-------------------------------------------------------------------------------- /playbooks/generic/dns-config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: true 4 | roles: 5 | - dns-config 6 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/openmpi.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - openmpi 6 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/munge/munge.key.j2: -------------------------------------------------------------------------------- 1 | {{ slurm_password|password_hash('sha512', slurm_cluster_name) }} 2 | -------------------------------------------------------------------------------- /src/containers/dgxie/mboot.efi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/src/containers/dgxie/mboot.efi -------------------------------------------------------------------------------- /docs/img/nccl_bandwidth_ring.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/nccl_bandwidth_ring.PNG -------------------------------------------------------------------------------- /playbooks/provisioning/maas.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - role: ansible-maas 6 | -------------------------------------------------------------------------------- /roles/facts/files/memory.fact: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "{ \"total_mb\": $(free -m | grep Mem: | awk '{print $2*0.95}') }" -------------------------------------------------------------------------------- /roles/nhc/vars/ubuntu-20.04.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_build_deps: 3 | - build-essential 4 | 5 | nhc_ssh_daemon: "sshd:" 6 | -------------------------------------------------------------------------------- /roles/nhc/vars/ubuntu-22.04.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_build_deps: 3 | - build-essential 4 | 5 | nhc_ssh_daemon: "sshd:" 6 | -------------------------------------------------------------------------------- /roles/nis_client/files/policy-rc.d: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo "All runlevel operations denied by policy" >&2 3 | exit 101 4 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator-node-prep/files/blocklist-nouveau.conf: -------------------------------------------------------------------------------- 1 | blacklist nouveau 2 | options nouveau modeset=0 3 | -------------------------------------------------------------------------------- /roles/openmpi/templates/ld-openmpi.conf.j2: -------------------------------------------------------------------------------- 1 | {{ openmpi_install_prefix }}/lib 2 | {{ openmpi_install_prefix }}/lib64 3 | -------------------------------------------------------------------------------- /roles/roce_backend/tasks/main.yml: 
-------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for roce_backend role in allhosts.yaml and k8smaster.yaml 3 | -------------------------------------------------------------------------------- /playbooks/container/singularity.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - singularity_wrapper 6 | -------------------------------------------------------------------------------- /playbooks/generic/anaconda.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - andrewrothstein.miniconda 6 | -------------------------------------------------------------------------------- /roles/autofs/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart autofs 3 | service: name=autofs state=restarted enabled=yes 4 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info # meta/main.yml should contain relevant info 3 | -------------------------------------------------------------------------------- /workloads/bit/.gitignore: -------------------------------------------------------------------------------- 1 | tmp 2 | results 3 | slurm*.out 4 | xhpl_cuda-10.1-dyn_mkl-static_ompi-3.1.3_gcc4.8.5_3-12-19b 5 | -------------------------------------------------------------------------------- /config.example/helm/metallb.yml: -------------------------------------------------------------------------------- 1 | --- 2 | controller: 3 | nodeSelector: 4 | node-role.kubernetes.io/control-plane: "" 5 | -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana01.png -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana02.png -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana03.png -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana04.png -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana05.png -------------------------------------------------------------------------------- /docs/img/slurm_monitoring_grafana06.png: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_grafana06.png
--------------------------------------------------------------------------------
/roles/nvidia-dgx/vars/dgxa100.yml:
--------------------------------------------------------------------------------
DGX_SWBUILD_DATE: 2020-06-29
DGX_SWBUILD_VERSION: 4.99.9
DGX_COMMIT_ID: 9f56299
--------------------------------------------------------------------------------
/roles/singularity_wrapper/requirements.yml:
--------------------------------------------------------------------------------
---
roles:
  - name: Setup singularity roles
    include_tasks: roles.yml
--------------------------------------------------------------------------------
/roles/slurm/templates/etc/ld.so.conf.d/slurm.conf.j2:
--------------------------------------------------------------------------------
{{ slurm_install_prefix }}/lib
{{ slurm_install_prefix }}/lib64
--------------------------------------------------------------------------------
/config.example/files/kubeflow/user-namespace-params.env:
--------------------------------------------------------------------------------
user=deepops@example.com
profile-name=kubeflow-deepops-example-com
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-01.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-02.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-03.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-04.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-05.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-06.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-07.png
--------------------------------------------------------------------------------
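The etc/ld.so.conf.d/slurm.conf.j2 template above simply expands slurm_install_prefix into two dynamic-linker search paths. A task pair along the following lines is a typical way such a template gets applied and picked up; this is an illustrative sketch only, and the file name and handler wiring are assumptions rather than the slurm role's actual tasks:

- name: add the Slurm libraries to the dynamic linker search path
  ansible.builtin.template:
    src: etc/ld.so.conf.d/slurm.conf.j2
    dest: /etc/ld.so.conf.d/slurm.conf
    mode: "0644"
  notify: run ldconfig          # assumed handler name
# a matching handler would then refresh the linker cache, e.g.:
# - name: run ldconfig
#   ansible.builtin.command: ldconfig
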
/docs/slurm-cluster/ood-images/ood-08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-08.png
--------------------------------------------------------------------------------
/docs/slurm-cluster/ood-images/ood-09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/slurm-cluster/ood-images/ood-09.png
--------------------------------------------------------------------------------
/playbooks/slurm-cluster/lmod.yml:
--------------------------------------------------------------------------------
---
- hosts: "{{ hostlist | default('all') }}"
  become: yes
  roles:
    - lmod
--------------------------------------------------------------------------------
/roles/easy-build/.ansible-lint:
--------------------------------------------------------------------------------
skip_list: # or 'skip_list' to silence them completely
  - meta-no-info
  - role-name
--------------------------------------------------------------------------------
/roles/netapp-trident/templates/namespace.j2:
--------------------------------------------------------------------------------
apiVersion: v1
kind: Namespace
metadata:
  name: {{ trident_namespace }}
--------------------------------------------------------------------------------
/docs/img/slurm_monitoring_alertmanager01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_alertmanager01.png
--------------------------------------------------------------------------------
/docs/img/slurm_monitoring_alertmanager02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_alertmanager02.png
--------------------------------------------------------------------------------
/docs/img/slurm_monitoring_prometheus01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/docs/img/slurm_monitoring_prometheus01.png
--------------------------------------------------------------------------------
/roles/nhc/meta/main.yml:
--------------------------------------------------------------------------------
---
dependencies:
  - role: facts

galaxy_info:
  namespace: deepops
  role_name: nhc
--------------------------------------------------------------------------------
/roles/rsyslog_client/handlers/main.yml:
--------------------------------------------------------------------------------
---
- name: restart rsyslog
  service:
    name: rsyslog
    state: restarted
--------------------------------------------------------------------------------
/roles/rsyslog_server/handlers/main.yml:
--------------------------------------------------------------------------------
---
- name: reload rsyslog
  service:
    name: rsyslog
    state: restarted
--------------------------------------------------------------------------------
/src/repo/ansible-lint:
--------------------------------------------------------------------------------
---
exclude_paths:
  - ./roles/galaxy/
  - ./kubespray/
use_default_rules: true
verbosity: 2
--------------------------------------------------------------------------------
/roles/singularity_wrapper/roles.yml:
--------------------------------------------------------------------------------
- src: abims_sbr.singularity
  version: 3.7.1-1
- src: gantsign.golang
  version: 2.4.0
--------------------------------------------------------------------------------
/workloads/examples/k8s/services/logging/README.md:
--------------------------------------------------------------------------------
https://github.com/kubernetes/kubernetes/tree/master/cluster/addons/fluentd-elasticsearch
--------------------------------------------------------------------------------
/playbooks/nvidia-software/nvidia-dcgm.yml:
--------------------------------------------------------------------------------
---
- hosts: "{{ hostlist | default('all') }}"
  become: true
  roles:
    - nvidia_dcgm
--------------------------------------------------------------------------------
/roles/grafana/handlers/main.yml:
--------------------------------------------------------------------------------
---
- name: restart grafana
  service:
    name: "{{ grafana_svc_name }}"
    state: restarted
--------------------------------------------------------------------------------
/roles/nvidia-k8s-gpu-device-plugin/.ansible-lint:
--------------------------------------------------------------------------------
skip_list:
  - meta-no-info # meta/main.yml should contain relevant info
  - role-name
--------------------------------------------------------------------------------
/roles/nvidia-k8s-gpu-feature-discovery/.ansible-lint:
--------------------------------------------------------------------------------
skip_list:
  - meta-no-info # meta/main.yml should contain relevant info
  - role-name
--------------------------------------------------------------------------------
/roles/nvidia_hpc_sdk/templates/z95_nvhpc_modules.csh:
--------------------------------------------------------------------------------
#!/usr/bin/env csh
setenv MODULEPATH "${MODULEPATH}:{{ hpcsdk_install_dir }}/modulefiles"
--------------------------------------------------------------------------------
/virtual/vars_files/virt_k8s.yml:
--------------------------------------------------------------------------------
---
container_registry_persistence_enabled: false
rsyslog_client_tcp_host: "{{ groups['kube-master'][0] }}"
--------------------------------------------------------------------------------
/workloads/examples/k8s/dask-rapids/parallel-sum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/dask-rapids/parallel-sum.png
--------------------------------------------------------------------------------
/playbooks/nvidia-software/nvidia-hpc-sdk.yml:
--------------------------------------------------------------------------------
---
- hosts: "{{ hostlist | default('all') }}"
  become: true
  roles:
    - nvidia_hpc_sdk
--------------------------------------------------------------------------------
/roles/nfs/defaults/main.yml:
--------------------------------------------------------------------------------
nfs_idmapd_domain: localdomain

nfs_is_server: no
nfs_is_client: no

nfs_exports: []
nfs_mounts: []
--------------------------------------------------------------------------------
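The nfs role defaults above are inert on their own: a host only acts as an NFS server or client when nfs_is_server / nfs_is_client is set (the generic nfs-server.yml and nfs-client.yml playbooks do exactly that), and the empty nfs_exports / nfs_mounts lists are meant to be overridden in the deployment's configuration. A minimal override could look like the following sketch; the per-entry keys shown are assumptions about the role's interface, not copied from config.example:

# group_vars for the NFS server group (illustrative)
nfs_exports:
  - path: /export/shared
    options: "*(rw,sync,no_root_squash)"

# group_vars for the client group (illustrative)
nfs_mounts:
  - mountpoint: /shared
    server: nfs-server-01
    path: /export/shared
    options: async,vers=3
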
/roles/prometheus/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart prometheus 3 | service: 4 | name: "{{ prometheus_svc_name }}" 5 | state: restarted 6 | -------------------------------------------------------------------------------- /roles/spack/templates/z00_spack.csh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env csh 2 | 3 | setenv SPACK_ROOT {{ spack_install_dir }} 4 | source $SPACK_ROOT/share/spack/setup-env.csh 5 | -------------------------------------------------------------------------------- /roles/spack/templates/z00_spack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export SPACK_ROOT="{{ spack_install_dir }}" 4 | . "${SPACK_ROOT}/share/spack/setup-env.sh" 5 | -------------------------------------------------------------------------------- /playbooks/nvidia-software/nvidia-peer-memory.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: true 4 | roles: 5 | - nvidia-peer-memory 6 | -------------------------------------------------------------------------------- /roles/alertmanager/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart alertmanager 3 | service: 4 | name: "{{ alertmanager_svc_name }}" 5 | state: restarted 6 | -------------------------------------------------------------------------------- /roles/nvidia-dcgm-exporter/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart dcgm 3 | service: 4 | name: "{{ nvidia_dcgm_svc_name }}" 5 | state: restarted 6 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/tasks/install-dgx.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install DCGM from repos 3 | package: 4 | name: "datacenter-gpu-manager" 5 | state: present 6 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/templates/z95_nvhpc_modules.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export MODULEPATH="${MODULEPATH:+$MODULEPATH:}{{ hpcsdk_install_dir }}/modulefiles" 3 | -------------------------------------------------------------------------------- /workloads/examples/k8s/dask-rapids/jupyterlab-nvsmi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/dask-rapids/jupyterlab-nvsmi.png -------------------------------------------------------------------------------- /workloads/services/k8s/dgxie/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for Kubernetes 4 | name: dgxie 5 | version: 0.1.2 6 | -------------------------------------------------------------------------------- /roles/easy-build-packages/defaults/main.yml: -------------------------------------------------------------------------------- 1 | sm_files_path: "{{ sm_prefix }}/easybuild_files" 2 | sm_files_repo_version: "4ef7ae6cc2284f69412a8db5e10dddd92024eeab" 3 | 
-------------------------------------------------------------------------------- /roles/slurm/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - role: facts 4 | - role: rsyslog_client 5 | 6 | galaxy_info: 7 | namespace: deepops 8 | role_name: slurm 9 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/cgroup.conf: -------------------------------------------------------------------------------- 1 | CgroupAutomount=yes 2 | 3 | ConstrainCores=yes 4 | ConstrainDevices=yes 5 | ConstrainRAMSpace=yes 6 | #TaskAffinity=yes 7 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/files/nvidia-persistenced-override.conf: -------------------------------------------------------------------------------- 1 | [Service] 2 | ExecStart= 3 | ExecStart=/usr/bin/nvidia-persistenced --user root --persistence-mode --verbose 4 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/80-exclusive-cleanup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | # Cleanup caches 5 | sync 6 | echo 3 > /proc/sys/vm/drop_caches 7 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-cluster-up.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source workloads/jenkins/scripts/jenkins-common.sh 3 | 4 | cd virtual || exit 1 5 | bash ./cluster_up.sh 6 | -------------------------------------------------------------------------------- /roles/nfs/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include nfs" 6 | include_role: 7 | name: "nfs" 8 | -------------------------------------------------------------------------------- /roles/nginx-docker-registry-cache/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart docker 3 | systemd: 4 | name: "docker" 5 | state: "restarted" 6 | daemon_reload: yes 7 | -------------------------------------------------------------------------------- /roles/nhc/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include nhc" 6 | include_role: 7 | name: "nhc" 8 | -------------------------------------------------------------------------------- /roles/nvidia-dgx-firmware/tasks/get-time.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Get current system time 3 | shell: "now=$(date '+%Y%m%d-%H%M%S') && echo $now" 4 | register: current_time 5 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | dgx_repo_dir: "rhel{{ ansible_distribution_major_version }}" 2 | 3 | dgx_extra_packages: 4 | - dgx-conf-cachefilesd 5 | - kernel-headers 6 | -------------------------------------------------------------------------------- /roles/prometheus-node-exporter/handlers/main.yml: -------------------------------------------------------------------------------- 1 
| --- 2 | - name: restart node-exporter 3 | service: 4 | name: "{{ node_exporter_svc_name }}" 5 | state: restarted 6 | -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/triton.py.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/kubeflow-pipeline-deploy/triton.py.tar.gz -------------------------------------------------------------------------------- /playbooks/generic/authentication.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | roles: 5 | - move-home-dirs 6 | - kerberos_client 7 | - nis_client 8 | - autofs 9 | -------------------------------------------------------------------------------- /roles/docker-rootless/templates/z96_rootlessdocker_modules.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export MODULEPATH="${MODULEPATH:+$MODULEPATH:}{{ rootlessdocker_install_dir }}/modulefiles" 3 | -------------------------------------------------------------------------------- /roles/facts/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include facts" 6 | include_role: 7 | name: "facts" 8 | -------------------------------------------------------------------------------- /roles/grafana/templates/grafana.ini.j2: -------------------------------------------------------------------------------- 1 | [security] 2 | admin_user = {{ grafana_cfg_user }} 3 | admin_password = {{ grafana_cfg_pass }} 4 | 5 | [auth.anonymous] 6 | enabled = true 7 | -------------------------------------------------------------------------------- /roles/lmod/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include lmod" 6 | include_role: 7 | name: "lmod" 8 | -------------------------------------------------------------------------------- /roles/mofed/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include mofed" 6 | include_role: 7 | name: "mofed" 8 | -------------------------------------------------------------------------------- /roles/slurm/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include slurm" 6 | include_role: 7 | name: "slurm" 8 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/localusers: -------------------------------------------------------------------------------- 1 | root 2 | {{ ansible_env.SUDO_USER | default(ansible_env.USER) }} 3 | {% for user in slurm_allow_ssh_user %} 4 | {{ user }} 5 | {% endfor %} 6 | -------------------------------------------------------------------------------- /roles/spack/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include spack" 6 | include_role: 7 | name: "spack" 8 | 
-------------------------------------------------------------------------------- /playbooks/generic/ntp-client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: true 4 | tasks: 5 | - name: Configure NTP client 6 | include_role: 7 | name: geerlingguy.ntp 8 | -------------------------------------------------------------------------------- /roles/openmpi/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include openmpi" 6 | include_role: 7 | name: "openmpi" 8 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/prolog.d/95-all-rootless: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | command -v singularity || exit 0 3 | /usr/local/bin/singularity config fakeroot -a "${SLURM_JOB_USER}" 4 | -------------------------------------------------------------------------------- /roles/cachefilesd/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | cachefilesd_package_state: present 3 | cachefilesd_enabled: present 4 | cachefilesd_cache_dir: /var/cache/fscache 5 | cachefilesd_cache_tag: null 6 | -------------------------------------------------------------------------------- /roles/kerberos_client/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info # meta/main.yml should contain relevant info 3 | - meta-no-tags # Tags must contain lowercase letters and digits only 4 | -------------------------------------------------------------------------------- /config.example/pxe/ipmi_host_list: -------------------------------------------------------------------------------- 1 | # This configuration file is used while rebooting DGX servers into PXE boot 2 | # This information is used to connect to the DGX BMC 3 | 10.0.0.1 4 | 10.0.0.2 -------------------------------------------------------------------------------- /playbooks/generic/nfs-client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: yes 4 | roles: 5 | - { role: nfs, nfs_is_client: yes } 6 | tags: 7 | - nfs_client 8 | -------------------------------------------------------------------------------- /playbooks/generic/nfs-server.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: yes 4 | roles: 5 | - { role: nfs, nfs_is_server: yes } 6 | tags: 7 | - nfs_server 8 | -------------------------------------------------------------------------------- /playbooks/generic/rsyslog-client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: yes 4 | roles: 5 | - rsyslog_client 6 | tags: 7 | - rsyslog-client 8 | - rsyslog 9 | -------------------------------------------------------------------------------- /playbooks/generic/rsyslog-server.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: yes 4 | roles: 5 | - rsyslog_server 6 | tags: 7 | - rsyslog-server 8 | - rsyslog 9 | 
-------------------------------------------------------------------------------- /roles/nis_client/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include nis_client" 6 | include_role: 7 | name: "nis_client" 8 | -------------------------------------------------------------------------------- /roles/openshift/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include openshift" 6 | include_role: 7 | name: "openshift" 8 | -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-0.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-0.PNG -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-1.PNG -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-2.PNG -------------------------------------------------------------------------------- /workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/deepops/HEAD/workloads/examples/k8s/kubeflow-pipeline-deploy/kubeflow-pipelines-3.PNG -------------------------------------------------------------------------------- /roles/cachefilesd/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include cachefilesd" 6 | include_role: 7 | name: "cachefilesd" 8 | -------------------------------------------------------------------------------- /roles/easy-build-packages/tasks/redhat-pre-install.yml: -------------------------------------------------------------------------------- 1 | # 2 | # install software modules using EasyBuild: 3 | # 4 | --- 5 | - name: "RHEL install" 6 | debug: 7 | msg: "No installs for RHEL" 8 | 9 | -------------------------------------------------------------------------------- /roles/nginx-docker-registry-cache/templates/http-proxy.conf: -------------------------------------------------------------------------------- 1 | [Service] 2 | Environment="HTTP_PROXY={{ nginx_docker_cache_proxy_url }}" 3 | Environment="HTTPS_PROXY={{ nginx_docker_cache_proxy_url }}" 4 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/templates/cachefilesd.conf.j2: -------------------------------------------------------------------------------- 1 | dir {{ cachefilesd_cache_dir }} 2 | tag {{ cachefilesd_cache_tag }} 3 | brun {{ cachefilesd_cache_brun }} 4 | bcull 
{{ cachefilesd_cache_bcull }} 5 | 6 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/templates/dgx.list.j2: -------------------------------------------------------------------------------- 1 | deb {{ nvidia_dgx_ubuntu_baseurl }}/{{ ansible_distribution_release | lower }} {{ ansible_distribution_release | lower }} main multiverse restricted universe 2 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include nvidia_cuda" 6 | include_role: 7 | name: "nvidia_cuda" 8 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include nvidia_dcgm" 6 | include_role: 7 | name: "nvidia_dcgm" 8 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/95-lastuserjob-rootless: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | command -v singularity || exit 0 3 | /usr/local/bin/singularity config fakeroot -r "${SLURM_JOB_USER}" || true 4 | -------------------------------------------------------------------------------- /roles/mofed/vars/rhel8.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mofed_distro: "rhel" 3 | mofed_pkg_prereqs: 4 | - tcsh 5 | - gcc-gfortran 6 | - numactl-libs 7 | - kernel-modules-extra 8 | - tcl 9 | - tk 10 | - fuse-libs 11 | -------------------------------------------------------------------------------- /roles/nvidia-dcgm-exporter/templates/dcgm-exporter.yml.j2: -------------------------------------------------------------------------------- 1 | - targets: [{{ groups['slurm-node'] | zip_longest([], fillvalue=':9400') | map('join') | join(',') }}] 2 | labels: 3 | module: dcgm-exporter 4 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/files/cuda-vars.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PATH="/usr/local/cuda/bin${PATH:+:${PATH}}" 4 | export LD_LIBRARY_PATH="/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" 5 | -------------------------------------------------------------------------------- /roles/grafana/templates/prometheus-datasource.yml.j2: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: Prometheus 5 | type: prometheus 6 | access: proxy 7 | url: http://localhost:9090 8 | isDefault: true 9 | -------------------------------------------------------------------------------- /roles/kerberos_client/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include kerberos_client" 6 | include_role: 7 | name: "kerberos_client" 8 | -------------------------------------------------------------------------------- /roles/prometheus-node-exporter/templates/node-exporter.yml.j2: -------------------------------------------------------------------------------- 1 | - 
targets: [{{ groups['slurm-node'] | zip_longest([], fillvalue=':9100') | map('join') | join(',') }}] 2 | labels: 3 | module: node-exporter 4 | -------------------------------------------------------------------------------- /roles/rsyslog_client/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include rsyslog_client" 6 | include_role: 7 | name: "rsyslog_client" 8 | -------------------------------------------------------------------------------- /roles/rsyslog_server/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include rsyslog_server" 6 | include_role: 7 | name: "rsyslog_server" 8 | -------------------------------------------------------------------------------- /config.example/pxe/ipmi.conf: -------------------------------------------------------------------------------- 1 | # This configuration file is used while rebooting DGX servers into PXE boot 2 | # This information is used to connect to the DGX BMC 3 | IPMI_USERNAME=dgxuser 4 | IPMI_PASSWORD=dgxuser -------------------------------------------------------------------------------- /roles/easy-build/templates/z01_eb.sh: -------------------------------------------------------------------------------- 1 | export EASYBUILD_PREFIX={{ sm_prefix }} 2 | export EASYBUILD_MODULES_TOOL=Lmod 3 | module purge 4 | unset $(env | grep EBROOT | awk -F'=' '{print $1}') 5 | module load EasyBuild 6 | -------------------------------------------------------------------------------- /roles/slurm/tasks/shmfix.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Fix RemoveIPC 4 | lineinfile: 5 | dest: /etc/systemd/logind.conf 6 | regexp: '^#RemoveIPC=yes' 7 | line: 'RemoveIPC=no' 8 | notify: "restart logind" 9 | -------------------------------------------------------------------------------- /roles/easy-build-packages/.ansible-lint: -------------------------------------------------------------------------------- 1 | skip_list: 2 | - meta-no-info # meta/main.yml should contain relevant info 3 | - no-changed-when # Commands should not change things if nothing needs doing 4 | - role-name 5 | -------------------------------------------------------------------------------- /roles/easy-build/templates/z01_eb.csh: -------------------------------------------------------------------------------- 1 | setenv EASYBUILD_PREFIX {{ sm_prefix }} 2 | setenv EASYBUILD_MODULES_TOOL Lmod 3 | module purge 4 | unset $(env | grep EBROOT | awk -F'=' '{print $1}') 5 | module load EasyBuild 6 | -------------------------------------------------------------------------------- /playbooks/utilities/nvidia-gpu-tests.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: [ kube-node, slurm-node ] 3 | gather_facts: no 4 | tasks: 5 | - name: Include NVIDIA GPU tests role 6 | include_role: 7 | name: nvidia-gpu-tests 8 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | become: yes 5 | tasks: 6 | - name: "Include nvidia_hpc_sdk" 7 | include_role: 8 | name: "nvidia_hpc_sdk" 9 | 
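(Illustration only; this is not a file in the repository.) The dcgm-exporter and node-exporter templates above build their Prometheus target lists with Jinja2's zip_longest filter: the inventory group is zipped against an empty list with the port suffix as fillvalue, each pair is joined into host:port, and the results are joined with commas. Assuming a hypothetical slurm-node group containing node01 and node02, the node-exporter template would render as:

  - targets: [node01:9100,node02:9100]
    labels:
      module: node-exporter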
-------------------------------------------------------------------------------- /roles/prometheus-slurm-exporter/templates/slurm-exporter.yml.j2: -------------------------------------------------------------------------------- 1 | - targets: [{{ groups[slurm_exporter_host_group] | zip_longest([], fillvalue=':8080') | map('join') | join(',') }}] 2 | labels: 3 | module: slurm-exporter 4 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/molecule/default/converge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Converge 3 | hosts: all 4 | tasks: 5 | - name: "Include singularity_wrapper" 6 | include_role: 7 | name: "singularity_wrapper" 8 | -------------------------------------------------------------------------------- /playbooks/bootstrap/bootstrap-openshift.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: kube-master 3 | become: true 4 | tasks: 5 | - name: Install required Python OpenShift packages/libraries 6 | include_role: 7 | name: openshift 8 | -------------------------------------------------------------------------------- /roles/easy-build/defaults/main.yml: -------------------------------------------------------------------------------- 1 | eb_tmp_dir: /tmp/easybuild 2 | eb_bootstrap_url: 'https://raw.githubusercontent.com/easybuilders/easybuild-framework/49533e6ef8f3ca27f984eeb212e157874cae9183/easybuild/scripts/bootstrap_eb.py' 3 | -------------------------------------------------------------------------------- /roles/mofed/vars/rhel7.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mofed_distro: "rhel" 3 | mofed_pkg_prereqs: 4 | - pciutils-libs 5 | - numactl-libs 6 | - gcc-gfortran 7 | - tcsh 8 | - libusbx 9 | - libnl3 10 | - tcl 11 | - fuse-libs 12 | - tk 13 | -------------------------------------------------------------------------------- /roles/docker-rootless/templates/rootless-docker/bin/nvidia-container-runtime-hook: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | /usr/bin/nvidia-container-runtime-hook -config="{{ rootlessdocker_install_dir }}/config/nvidia-container-runtime/config.toml" "$@" 4 | -------------------------------------------------------------------------------- /roles/grafana/templates/prometheus-dashboard.yml.j2: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'default' 5 | folder: '' 6 | type: file 7 | editable: true 8 | options: 9 | path: {{ grafana_cfg_dashboard_path }} 10 | -------------------------------------------------------------------------------- /roles/rsyslog_client/templates/99-forward-syslog.conf: -------------------------------------------------------------------------------- 1 | {% if rsyslog_client_tcp_host is defined -%} 2 | action(type="omfwd" Target="{{ rsyslog_client_tcp_host }}" Port="{{ rsyslog_client_tcp_port }}" Protocol="tcp") 3 | {% endif -%} 4 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/hello-world.yml: -------------------------------------------------------------------------------- 1 | kind: Pod 2 | apiVersion: v1 3 | metadata: 4 | name: hello-world 5 | spec: 6 | containers: 7 | - name: hello-world 8 | image: hello-world 9 | restartPolicy: Never 10 | 
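(Usage sketch; the kubectl commands are standard, but this workflow is not spelled out in the repository.) The hello-world pod manifest above can be exercised directly against a working cluster:

  kubectl apply -f workloads/examples/k8s/services/hello-world.yml
  kubectl logs hello-world        # shows the hello-world image output once the container has run
  kubectl delete pod hello-world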
-------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/prolog.d/50-exclusive-gpu: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | command -v nvidia-smi || exit 0 5 | 6 | /etc/slurm/shared/bin/set_gpu_power_levels.sh max 7 | /etc/slurm/shared/bin/set_gpu_clocks.sh max 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ansible 2 | *.retry 3 | 4 | # misc. 5 | .*.swp 6 | 7 | # project-specific 8 | /admin.conf 9 | /config*/ 10 | !/config.example/ 11 | /roles/galaxy/ 12 | /collections/* 13 | /k8s-config/ 14 | /kubectl 15 | /tridentctl 16 | -------------------------------------------------------------------------------- /config.example/host_vars/gpu01: -------------------------------------------------------------------------------- 1 | --- 2 | # Example of a host-specific variable file 3 | # These variables would only be used for a host named "gpu01" 4 | 5 | # Don't install Singularity on this host 6 | #slurm_cluster_install_singularity: false 7 | -------------------------------------------------------------------------------- /roles/nfs/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart rpcbind 3 | service: 4 | name: rpcbind 5 | state: restarted 6 | 7 | - name: restart nfs 8 | service: 9 | name: "{{ nfs_server_daemon }}" 10 | state: restarted 11 | -------------------------------------------------------------------------------- /roles/openmpi/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | -------------------------------------------------------------------------------- /roles/openshift/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | -------------------------------------------------------------------------------- /scripts/deepops/proxy.sh: -------------------------------------------------------------------------------- 1 | # edit the proxy details 2 | # 3 | # export http_proxy="http://10.0.2.5:3128" 4 | # export https_proxy="http://10.0.2.5:3128" 5 | # export no_proxy="localhost,cluster.local,127.0.0.1,::1,10.0.2.10,10.0.2.20,10.0.2.30" 6 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-setup-slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source workloads/jenkins/scripts/jenkins-common.sh 3 | 4 | cd virtual || exit 1 5 | export DEEPOPS_DISABLE_K8S=true 6 | export DEEPOPS_ENABLE_SLURM=true 7 | bash ./cluster_up.sh 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "kubespray"] 2 | path = submodules/kubespray 3 | url = https://github.com/kubernetes-sigs/kubespray.git 4 | [submodule "packer-maas"] 5 | path = 
submodules/packer-maas 6 | url = https://github.com/DeepOps/packer-maas.git 7 | -------------------------------------------------------------------------------- /playbooks/nvidia-dgx/nvidia-dgx.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: true 4 | tasks: 5 | - name: Include NVIDIA DGX role 6 | include_role: 7 | name: nvidia-dgx 8 | environment: "{{proxy_env if proxy_env is defined else {}}}" 9 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/60-exclusive-cpu: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | command -v cpupower || exit 0 5 | 6 | cpupower frequency-info | grep -e "governors: Not Available" && exit 0 7 | cpupower frequency-set -g powersave 8 | -------------------------------------------------------------------------------- /roles/dns-config/templates/resolv.conf.j2: -------------------------------------------------------------------------------- 1 | # {{ ansible_managed }} 2 | 3 | {% for server in dns_config_servers %} 4 | nameserver {{ server }} 5 | {% endfor %} 6 | {% if dns_config_search %} 7 | search {{ dns_config_search | join(' ') }} 8 | {% endif %} 9 | -------------------------------------------------------------------------------- /roles/facts/files/gpus.fact: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if ! command -v lspci >/dev/null 2>&1; then 3 | echo lspci not installed 4 | exit 1 5 | fi 6 | count=$(lspci | grep -E "(3D|VGA compatible) controller: NVIDIA" --count) 7 | echo "{ \"count\": $count }" 8 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/prolog.d/50-exclusive-cpu: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | command -v cpupower || exit 0 5 | 6 | cpupower frequency-info | grep -e "governors: Not Available" && exit 0 7 | cpupower frequency-set -g performance 8 | -------------------------------------------------------------------------------- /roles/docker-rootless/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Install for rootless docker. 3 | 4 | # Directory to install in 5 | sm_prefix: "/sw" 6 | sm_software_path: "{{ sm_prefix }}/software" 7 | rootlessdocker_install_dir: "{{ sm_software_path }}/rootless-docker" 8 | -------------------------------------------------------------------------------- /roles/facts/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 
3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/mofed/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nfs/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nhc/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/vars/main.yml: -------------------------------------------------------------------------------- 1 | _ubuntu_repo_dir: "{{ ansible_distribution | lower }}{{ ansible_distribution_version | replace('.', '') }}/{{ ansible_architecture }}" 2 | _rhel_repo_dir: "rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}" 3 | -------------------------------------------------------------------------------- /roles/slurm/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/spack/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/cachefilesd/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nis_client/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 
3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/openmpi/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/openshift/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/kerberos_client/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/rsyslog_client/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /roles/rsyslog_server/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 
3 | 4 | - name: Verify 5 | hosts: all 6 | gather_facts: false 7 | tasks: 8 | - name: Example assertion 9 | assert: 10 | that: true 11 | -------------------------------------------------------------------------------- /src/containers/pxe/dhcp/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | MAINTAINER Douglas Holt 4 | 5 | RUN apt-get update && \ 6 | apt-get -y install dnsmasq 7 | 8 | VOLUME /etc/dnsmasq.d 9 | 10 | #ENTRYPOINT ["dnsmasq"] 11 | CMD ["dnsmasq", "-d"] 12 | -------------------------------------------------------------------------------- /config.example/requirements.yml: -------------------------------------------------------------------------------- 1 | # Put custom Ansible Galaxy requirements here 2 | # Galaxy roles that are used by the DeepOps roles can be found in roles/requirements.yml 3 | # https://docs.ansible.com/ansible/latest/galaxy/user_guide.html 4 | --- 5 | collections: 6 | roles: 7 | -------------------------------------------------------------------------------- /playbooks/container/standalone-container-registry.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: true 7 | become_method: sudo 8 | roles: 9 | - standalone-container-registry 10 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/open-ondemand.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: slurm-master[0] 3 | become: yes 4 | roles: 5 | - {role: ood-wrapper, ood_is_server: yes } 6 | 7 | - hosts: slurm-node 8 | become: yes 9 | roles: 10 | - {role: ood-wrapper, ood_is_client: yes } 11 | -------------------------------------------------------------------------------- /roles/nginx-docker-registry-cache/tasks/client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Set up Ubuntu client 3 | include_tasks: client-ubuntu.yml 4 | when: ansible_distribution == 'Ubuntu' 5 | 6 | - name: Set up RHEL client 7 | include_tasks: client-el.yml 8 | when: ansible_os_family == 'RedHat' 9 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/40-lastuserjob-processes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | if [ "$SLURM_JOB_USER" != root ]; then 5 | if killall -9 -u "$SLURM_JOB_USER" ; then 6 | logger -s -t slurm-epilog 'Killed residual user processes' 7 | fi 8 | fi 9 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/41-lastuserjob-ssh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | if grep -q -w "$SLURM_JOB_USER" /etc/slurm/localusers.backup ; then 5 | exit 0 # don't revoke access for these users 6 | fi 7 | 8 | sed -i "/${SLURM_JOB_USER}/d" /etc/localusers 9 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/prolog.d/50-exclusive-ssh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | cp /etc/slurm/localusers.backup /etc/localusers 5 | 6 | if ! 
grep -q -w "$SLURM_JOB_USER" /etc/slurm/localusers.backup ; then 7 | echo "$SLURM_JOB_USER" >>/etc/localusers 8 | fi 9 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/nhc.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: true 4 | tasks: 5 | - name: Gather custom facts 6 | include_role: 7 | name: facts 8 | - name: Install Node Health Check 9 | include_role: 10 | name: nhc 11 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/grafana.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure grafana 9 | include_role: 10 | name: grafana 11 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dcgm_is_dgx: false 3 | _ubuntu_repo_dir: "{{ ansible_distribution | lower }}{{ ansible_distribution_version | replace('.', '') }}/{{ ansible_architecture }}" 4 | _rhel_repo_dir: "rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}" 5 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | galaxy_info: 3 | role_name: singularity_wrapper 4 | namespace: deepops 5 | author: DeepOps Team 6 | company: NVIDIA 7 | description: Wrap lecourguille.singularity role 8 | license: 3-Clause BSD 9 | min_ansible_version: 2.9 10 | -------------------------------------------------------------------------------- /roles/slurm/tasks/logging.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: import slurm logs into rsyslog 3 | template: 4 | src: "etc/rsyslog.d/99-slurm.conf" 5 | dest: "/etc/rsyslog.d/99-slurm.conf" 6 | owner: "root" 7 | group: "root" 8 | mode: "0644" 9 | notify: 10 | - restart rsyslog 11 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/files/nginx-from-local-registry.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: nginx-registry-local 5 | spec: 6 | containers: 7 | - name: nginx 8 | image: registry.local:31500/nginx:1.21 9 | hostNetwork: true 10 | dnsPolicy: Default 11 | -------------------------------------------------------------------------------- /playbooks/container/docker-login.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Ensure Docker is installed and configured 3 | - name: Install docker 4 | import_playbook: docker.yml 5 | 6 | # Log into Docker registries 7 | - hosts: "{{ hostlist | default('all') }}" 8 | become: yes 9 | roles: 10 | - docker-login 11 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/spack-modules.yml: -------------------------------------------------------------------------------- 1 | # Deploy the Spack package manager 2 | --- 3 | - hosts: "{{ hostlist | default('all') }}" 4 | become: yes 5 | roles: 6 | - lmod 7 | 8 | - 
hosts: "{{ hostlist | default('slurm-master[0]') }}" 9 | become: yes 10 | roles: 11 | - spack 12 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/templates/filebeat.conf: -------------------------------------------------------------------------------- 1 | input { 2 | beats { 3 | port => {{ filebeat_port }} 4 | } 5 | } 6 | 7 | output { 8 | elasticsearch { 9 | hosts => ["http://localhost:9200"] 10 | index => "%{[@metadata][beat]}-%{[@metadata][version]}" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /roles/nhc/templates/sysconfig_nhc.j2: -------------------------------------------------------------------------------- 1 | PATH=/sbin:/usr/sbin:/bin:/usr/bin:{{ slurm_install_prefix|default('/usr/local') }}/bin 2 | NHC_RM=slurm 3 | SLURM_SINFO={{ slurm_install_prefix|default('/usr/local') }}/bin/sinfo 4 | SLURM_SCONTROL={{ slurm_install_prefix|default('/usr/local') }}/bin/scontrol 5 | -------------------------------------------------------------------------------- /roles/nvidia-mig-manager/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mig_manager_url_deb: https://github.com/NVIDIA/mig-parted/releases/download/v0.4.2/nvidia-mig-manager_0.4.2-1_amd64.deb 3 | mig_manager_url_rpm: https://github.com/NVIDIA/mig-parted/releases/download/v0.4.2/nvidia-mig-manager-0.4.2-1.x86_64.rpm 4 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/prometheus.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure prometheus 9 | include_role: 10 | name: prometheus 11 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/desktop-submit.yml.erb.j2: -------------------------------------------------------------------------------- 1 | --- 2 | script: 3 | native: 4 | - "--gpus=<%= bc_num_gpus.blank? ? 
{{ ood_desktop_app_def_gpus }} : bc_num_gpus.to_i %>" 5 | - "--cpus-per-gpu={{ ood_desktop_app_cpus_per_gpu }}" 6 | - "--mem-per-gpu={{ ood_desktop_app_mem_per_gpu }}" 7 | -------------------------------------------------------------------------------- /roles/rsyslog_server/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rsyslog_server_tcp_port: 514 3 | rsyslog_server_udp_port: 514 4 | rsyslog_enable_journal: yes 5 | rsyslog_log_file_path: "/var/log/deepops-hosts" 6 | rsyslog_log_file_path_pattern: "{{ rsyslog_log_file_path }}/%HOSTNAME%/%$YEAR%-%$MONTH%-%$DAY%/syslog.log" 7 | -------------------------------------------------------------------------------- /playbooks/generic/chrony-client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Chrony will ensure that the clocks of all your servers are up to sync and can be used to sync with an internal server 3 | - hosts: all 4 | become: true 5 | tasks: 6 | - name: Configure Chrony client 7 | include_role: 8 | name: DeepOps.chrony 9 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/alertmanager.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure alertmanager 9 | include_role: 10 | name: alertmanager 11 | 12 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/tasks/ubuntu-upgrade.yml: -------------------------------------------------------------------------------- 1 | # OTA upgrade stuff 2 | - name: perform OTA upgrade to latest release (this takes a while) 3 | apt: 4 | upgrade: full 5 | update_cache: yes 6 | dpkg_options: "force-confdef,force-confold" 7 | 8 | - name: reboot after full OTA upgrade 9 | reboot: 10 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | logger -s -t slurm-epilog "START user=$SLURM_JOB_USER job=$SLURM_JOB_ID" 5 | {{ slurm_config_dir }}/shared/bin/run-parts.sh {{ slurm_config_dir }}/epilog.d 6 | logger -s -t slurm-epilog "END user=$SLURM_JOB_USER job=$SLURM_JOB_ID" 7 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/prolog.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | logger -s -t slurm-prolog "START user=$SLURM_JOB_USER job=$SLURM_JOB_ID" 5 | {{ slurm_config_dir }}/shared/bin/run-parts.sh {{ slurm_config_dir }}/prolog.d 6 | logger -s -t slurm-prolog "END user=$SLURM_JOB_USER job=$SLURM_JOB_ID" 7 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/nvidia-network-operator.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | ## Playbook for installing nvidia-network-operator 3 | # 4 | - hosts: kube-master[0] 5 | become: true 6 | become_method: sudo 7 | tasks: 8 | - include_role: 9 | name: nvidia-network-operator 10 | tasks_from: main 11 | -------------------------------------------------------------------------------- 
/workloads/jenkins/scripts/remote-script-for-mpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # 3 | # Test compiling and running an MPI program using NVIDIA HPC SDK 4 | 5 | set -x 6 | set -euo pipefail 7 | 8 | module load nvhpc 9 | 10 | mpicc -o "${HOME}/hello" "${HOME}/mpi-hello.c" 11 | 12 | srun --mpi=pmix -n2 "${HOME}/hello" 13 | -------------------------------------------------------------------------------- /roles/lmod/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is an example playbook to execute Ansible tests. 3 | # 4 | # TODO: Add test to check for lmod bug in Ubuntu 5 | 6 | - name: Verify 7 | hosts: all 8 | gather_facts: false 9 | tasks: 10 | - name: Example assertion 11 | assert: 12 | that: true 13 | -------------------------------------------------------------------------------- /roles/prometheus-slurm-exporter/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart slurm-exporter 3 | service: 4 | name: "{{ slurm_exporter_svc_name }}" 5 | state: restarted 6 | 7 | - name: restart grafana 8 | service: 9 | name: "{{ grafana_svc_name }}" 10 | state: restarted 11 | failed_when: false 12 | -------------------------------------------------------------------------------- /roles/easy-build-packages/tasks/ubuntu-pre-install.yml: -------------------------------------------------------------------------------- 1 | # 2 | # install software modules using EasyBuild: 3 | # 4 | --- 5 | - name: "install prerequisite software" 6 | become: true 7 | apt: 8 | name: 9 | - libssl-dev 10 | - build-essential 11 | - libsysfs-dev 12 | - libibverbs-dev 13 | 14 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator/templates/gridd.conf: -------------------------------------------------------------------------------- 1 | # See the official documentation for more details: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html 2 | # Description: Set License Server Address 3 | # Data type: string 4 | # Format: "
" 5 | ServerAddress="{{ vgpu_grid_license_server }}" 6 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/templates/z95_nvhpc.csh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env csh 2 | 3 | setenv NVARCH `uname -s`_`uname -m` 4 | setenv NVCOMPILERS {{ hpcsdk_install_dir }} 5 | setenv MANPATH "$MANPATH":$NVCOMPILERS/$NVARCH/{{ hpcsdk_version_dir }}/compilers/man 6 | set path = ($NVCOMPILERS/$NVARCH/{{ hpcsdk_version_dir }}/compilers/bin $path) 7 | -------------------------------------------------------------------------------- /virtual/k8s_environment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Set up local environment to work with virtual k8s cluster 4 | 5 | K8S_CONFIG_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )/config" 6 | 7 | export KUBECONFIG="${K8S_CONFIG_DIR}/artifacts/admin.conf" 8 | export PATH="${K8S_CONFIG_DIR}/artifacts:${PATH}" 9 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/easybuild-modules.yml: -------------------------------------------------------------------------------- 1 | # 2 | # install software modules using EasyBuild. 3 | # 4 | --- 5 | - hosts: all 6 | roles: 7 | - name: lmod 8 | 9 | - hosts: slurm-master[0] 10 | roles: 11 | - name: easy-build 12 | 13 | - hosts: slurm-master 14 | roles: 15 | - name: easy-build-packages 16 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/prometheus-node-exporter.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure node exporter 9 | include_role: 10 | name: prometheus-node-exporter 11 | -------------------------------------------------------------------------------- /roles/nis_client/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart nis 3 | service: name=ypbind state=restarted enabled=yes 4 | when: (ansible_os_family == "Debian" and ansible_distribution_version in [ "14.04" ]) 5 | 6 | - name: restart nis 7 | service: name=nis state=restarted enabled=yes 8 | when: ansible_os_family == "RedHat" 9 | -------------------------------------------------------------------------------- /roles/kerberos_client/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | kerberos_client_kdc_hostname: kerberos 3 | kerberos_client_admin_hostname: kerberos 4 | kerberos_client_dns_lookup_realm: false 5 | kerberos_client_dns_lookup_kdc: false 6 | kerberos_client_ticket_lifetime: 24h 7 | kerberos_client_renew_lifetime: 7d 8 | kerberos_client_forwardable: true 9 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/prometheus-slurm-exporter.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure prometheus slurm exporter 9 | include_role: 10 | name: prometheus-slurm-exporter 11 | 
-------------------------------------------------------------------------------- /roles/ood-wrapper/templates/desktop.yml.j2: -------------------------------------------------------------------------------- 1 | --- 2 | title: "{{ ood_desktop_app_title }}" 3 | cluster: "{{ ood_cluster_name }}" 4 | submit: "submit/{{ ood_cluster_name }}_desktop.yml.erb" 5 | attributes: 6 | desktop: "xfce" 7 | bc_queue: null 8 | bc_account: null 9 | bc_num_gpus: 10 | label: "Number of GPUs" 11 | value: 1 12 | -------------------------------------------------------------------------------- /roles/openshift/defaults/main.yml: -------------------------------------------------------------------------------- 1 | deepops_dir: /opt/deepops 2 | deepops_venv: '{{ deepops_dir }}/venv' 3 | epel_package: "https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm" 4 | epel_key_url: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}" 5 | -------------------------------------------------------------------------------- /roles/alertmanager/templates/alertmanager.yml.j2: -------------------------------------------------------------------------------- 1 | global: 2 | slack_api_url: 'https://hooks.slack.com/services/SLACK/API/KEY' 3 | route: 4 | receiver: 'slack-notifications' 5 | repeat_interval: 2m 6 | receivers: 7 | - name: 'slack-notifications' 8 | slack_configs: 9 | - channel: '#my-slack-channel' 10 | send_resolved: true 11 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/rsyslog.d/99-slurm.conf: -------------------------------------------------------------------------------- 1 | input(type="imfile" File="/var/log/slurm/slurmd.log" Tag="slurmd") 2 | input(type="imfile" File="/var/log/slurm/prolog-epilog" Tag="slurm-prolog-epilog") 3 | input(type="imfile" File="/var/log/slurm/slurmctld.log" Tag="slurmctld") 4 | input(type="imfile" File="/var/log/slurm/slurmdbd.log" Tag="slurmdbd") 5 | -------------------------------------------------------------------------------- /playbooks/bootstrap/bootstrap-rook.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: true 4 | tasks: 5 | - name: install xfsprogs for ceph/rook 6 | package: 7 | name: xfsprogs 8 | state: present 9 | - name: install python3-setuptools for ceph/rook 10 | package: 11 | name: python3-setuptools 12 | state: present 13 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/bc_osc_codeserver/submit.yml.erb.j2: -------------------------------------------------------------------------------- 1 | --- 2 | batch_connect: 3 | template: "basic" 4 | script: 5 | native: 6 | - "--gpus=<%= bc_num_gpus.blank? ? 
{{ ood_codeserver_app_def_gpus }} : bc_num_gpus.to_i %>" 7 | - "--cpus-per-gpu={{ ood_codeserver_app_cpus_per_gpu }}" 8 | - "--mem-per-gpu={{ ood_codeserver_app_mem_per_gpu }}" 9 | -------------------------------------------------------------------------------- /src/containers/pxe/dhcp/dnsmasq.conf: -------------------------------------------------------------------------------- 1 | domain-needed 2 | bogus-priv 3 | strict-order 4 | no-resolv 5 | no-poll 6 | expand-hosts 7 | cache-size=2048 8 | bind-interfaces 9 | 10 | server=8.8.8.8 11 | server=8.8.4.4 12 | domain=localdomain 13 | 14 | log-queries 15 | log-dhcp 16 | log-facility=/var/log/dnsmasq.log 17 | 18 | conf-dir=/etc/dnsmasq.d,*.conf 19 | -------------------------------------------------------------------------------- /playbooks/container/nginx-docker-registry-cache-server.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: deploy nginx container caching proxy 9 | include_role: 10 | name: "nginx-docker-registry-cache" 11 | tasks_from: server 12 | -------------------------------------------------------------------------------- /workloads/examples/slurm/mpi-hello/bootstrap-mpi.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Install OpenMPI packages from Ubuntu repos 3 | - hosts: all 4 | become: true 5 | tasks: 6 | - name: install openmpi packages 7 | apt: 8 | name: "{{ item }}" 9 | with_items: 10 | - openmpi-bin 11 | - libopenmpi-dev 12 | - libpmi2-pmix 13 | - libpmi-pmix-dev 14 | -------------------------------------------------------------------------------- /config.example/pxe/dnsmasq.extra.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Additional DNSMASQ configuration 3 | # 4 | 5 | # If the dhcp-ignore flag is specified in this fashion, only hosts configured with dhcp-host will be given DHCP 6 | #dhcp-ignore=tag:!known 7 | 8 | # Example static IP; note this will not work for bonded interfaces 9 | #dhcp-host=12:34:56:78,server-01,192.168.1.23 10 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # vars for lecorguille.singularity 3 | singularity_version: "3.11.4" 4 | singularity_conf_path: "/etc/singularity/singularity.conf" 5 | bind_paths: [] 6 | 7 | # vars for gantsign.golang 8 | golang_version: "1.14.4" 9 | golang_install_dir: "/opt/go/{{ golang_version }}" 10 | golang_gopath: "/opt/go/packages" 11 | -------------------------------------------------------------------------------- /playbooks/container/nginx-docker-registry-cache-client.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: docker.yml 4 | 5 | - hosts: "{{ hostlist | default('all') }}" 6 | become: yes 7 | tasks: 8 | - name: configure nginx container caching proxy client 9 | include_role: 10 | name: "nginx-docker-registry-cache" 11 | tasks_from: client 12 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/nvidia-k8s-gpu-device-plugin.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: kube-master[0] 3 | become: true 
4 | tasks: 5 | - name: install k8s GPU plugin 6 | include_role: 7 | name: nvidia-k8s-gpu-device-plugin 8 | run_once: true 9 | environment: "{{proxy_env if proxy_env is defined else {}}}" 10 | tags: 11 | - k8s_gpu_device_plugin 12 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/bc_osc_codeserver/manifest.yml.j2: -------------------------------------------------------------------------------- 1 | --- 2 | name: VS Code Server 3 | category: Interactive Apps 4 | subcategory: Servers 5 | role: batch_connect 6 | description: | 7 | This app will launch a [VS Code] instance using [Code Server] on a GPU node 8 | 9 | [VS Code]: https://code.visualstudio.com/ 10 | [Code Server]: https://coder.com/ 11 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/prolog-checkmounts: -------------------------------------------------------------------------------- 1 | # 2 | # Check that mounts exist 3 | # 4 | MOUNTS="/raid /gpfs/fs1" 5 | for i in $MOUNTS 6 | do 7 | mount | grep $i &> /dev/null 8 | if [ $? -ne 0 ] 9 | then 10 | echo "$HOSTNAME is missing $i" 11 | echo "scontrol update nodename=$HOSTNAME state=drain reason="Mount missing: $i"" 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /roles/alertmanager/defaults/main.yml: -------------------------------------------------------------------------------- 1 | alertmanager_config_dir: /etc/alertmanager 2 | alertmanager_config_src: templates/alertmanager.yml.j2 3 | alertmanager_container: "prom/alertmanager:v0.23.0" 4 | alertmanager_svc_name: "docker.alertmanager.service" 5 | alertmanager_docker_volume_name: "deepops_alertmanager_metrics" 6 | alertmanager_state: started 7 | alertmanager_enabled: yes 8 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/templates/dgxos5.list.j2: -------------------------------------------------------------------------------- 1 | deb {{ nvidia_dgx_os5_ubuntu_baseurl }}/{{ ansible_distribution_release | lower }}/{{ ansible_architecture }}/ {{ ansible_distribution_release | lower }} common dgx 2 | deb {{ nvidia_dgx_os5_ubuntu_baseurl }}/{{ ansible_distribution_release | lower }}/{{ ansible_architecture }}/ {{ ansible_distribution_release | lower }}-updates common dgx 3 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/templates/z95_nvhpc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export NVARCH="$(uname -s)_$(uname -m)" 4 | export NVCOMPILERS="{{ hpcsdk_install_dir }}" 5 | export MANPATH="${MANPATH:+$MANPATH:}{{ hpcsdk_install_dir }}/${NVCOMPILERS}/${NVARCH}/{{ hpcsdk_version_dir }}/compilers/man" 6 | export PATH="${NVCOMPILERS}/${NVARCH}/{{ hpcsdk_version_dir }}/compilers/bin:${PATH}" 7 | -------------------------------------------------------------------------------- /roles/slurm/tasks/setup-user.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: create slurm user home 3 | file: 4 | path: "{{ slurm_user_home }}" 5 | recurse: yes 6 | state: directory 7 | 8 | - name: create slurm user 9 | user: 10 | name: "{{ slurm_username }}" 11 | state: present 12 | system: yes 13 | home: "{{ slurm_user_home }}" 14 | uid: "{{ slurm_user_uid }}" 15 | -------------------------------------------------------------------------------- 
/roles/slurm/templates/etc/slurm/epilog.d/42-lastuserjob-cleanup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | if [[ -n "$SLURM_JOB_USER" && "$SLURM_JOB_USER" != "root" ]]; then 5 | logger -s -t slurm-epilog 'Removed residual user files' 6 | for dir in /tmp /dev/shm ; do 7 | find "${dir}" -user "${SLURM_JOB_USER}" -print0 | xargs -0 -r rm -fr ||: 8 | done 9 | fi 10 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-test-job.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: cuda-container 8 | image: nvcr.io/nvidia/cuda:10.0-devel 9 | command: ["sleep", "6000"] 10 | args: 11 | resources: 12 | limits: 13 | nvidia.com/gpu: 1 14 | restartPolicy: Never 15 | 16 | -------------------------------------------------------------------------------- /roles/docker-login/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | docker_login_state: present 3 | docker_login_reauth: yes 4 | docker_login_disable_log_password: yes 5 | 6 | #docker_login_registries: 7 | #- registry: docker.io 8 | # username: myuser 9 | # password: mypassword 10 | # email: docker@docker.io 11 | #- registry: nvcr.io 12 | # username: '$oauthtoken' 13 | # password: mypassword 14 | -------------------------------------------------------------------------------- /roles/prometheus/templates/alert_rules.yml.j2: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: default 3 | rules: 4 | - alert: InstanceDown 5 | expr: up{job="cluster"} == 0 6 | for: 5m 7 | labels: 8 | severity: critical 9 | annotations: 10 | summary: "Instance down" 11 | description: "The instance has been down for more than 5 minutes." -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingon: -------------------------------------------------------------------------------- 1 | # Enable hyperthreading if requested 2 | scontrol show job $SLURM_JOBID | grep Comment | grep -i hyperthreading | grep -v nohyperthreading > /dev/null 3 | if [ $? 
-eq 0 ]; then 4 | for i in /sys/devices/system/cpu/*/online ; do 5 | echo 1 > $i 6 | echo Enabling CPU $(echo $i | egrep -o cpu[0-9]+ | tr -d 'cpu') 7 | done 8 | fi 9 | -------------------------------------------------------------------------------- /roles/slurm/tasks/build-cleanup.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: make clean in build directory after install 3 | command: make clean 4 | args: 5 | chdir: "{{ slurm_build_dir }}" 6 | failed_when: false 7 | when: slurm_build_make_clean 8 | 9 | - name: remove build directory 10 | file: 11 | path: "{{ slurm_build_dir }}" 12 | state: absent 13 | when: slurm_build_dir_cleanup 14 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/nvidia-k8s-gpu-feature-discovery.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: kube-master[0] 3 | become: true 4 | tasks: 5 | - name: install k8s GPU feature discovery 6 | include_role: 7 | name: nvidia-k8s-gpu-feature-discovery 8 | run_once: true 9 | environment: "{{proxy_env if proxy_env is defined else {}}}" 10 | tags: 11 | - k8s_gpu_feature_discovery 12 | -------------------------------------------------------------------------------- /roles/nvidia-k8s-gpu-device-plugin/defaults/main.yml: -------------------------------------------------------------------------------- 1 | # Vars needed to install device plugin 2 | k8s_gpu_plugin_helm_repo: "https://nvidia.github.io/k8s-device-plugin" 3 | k8s_gpu_plugin_chart_name: "nvdp/nvidia-device-plugin" 4 | k8s_gpu_plugin_release_name: "nvidia-device-plugin" 5 | k8s_gpu_plugin_chart_version: "0.14.0" 6 | k8s_gpu_plugin_init_error: "false" 7 | k8s_gpu_mig_strategy: "mixed" 8 | -------------------------------------------------------------------------------- /roles/roce_backend/templates/config_dp.j2: -------------------------------------------------------------------------------- 1 | resourceList: 2 | {% for sriov_resource in sriov_resources %} 3 | - resourceName: "{{ sriov_resource.res_name }}" 4 | isRdma: true 5 | selectors: 6 | vendors: 7 | - "{{ vendor }}" 8 | devices: 9 | - "{{ dev_id }}" 10 | pfNames: 11 | - "{{ sriov_resource.pf_name }}" 12 | {% endfor %} 13 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/remote-script-for-slurm-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # 3 | # Test compiling and running a GPU program using NVIDIA HPC SDK 4 | 5 | set -x 6 | set -euo pipefail 7 | 8 | module load nvhpc 9 | nvcc -o "${HOME}/deviceQuery" -I /usr/local/cuda/samples/common/inc /usr/local/cuda/samples/1_Utilities/deviceQuery/deviceQuery.cpp 10 | srun -n1 -G1 "${HOME}/deviceQuery" 11 | -------------------------------------------------------------------------------- /playbooks/generic/hosts.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | - name: set /etc/hostname 6 | hostname: 7 | name: "{{ inventory_hostname }}" 8 | when: deepops_set_hostname | default(true) 9 | 10 | - name: set /etc/hosts 11 | include_role: 12 | name: DeepOps.hosts 13 | vars: 14 | hosts_add_ansible_managed_hosts: true 15 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/gres.conf: 
-------------------------------------------------------------------------------- 1 | {% if slurm_autodetect_nvml -%} 2 | AutoDetect=nvml 3 | {% else -%} 4 | {% set cpu_topology = ansible_local["topology"]["cpu_topology"] -%} 5 | {% set gpu_topology = ansible_local["topology"]["gpu_topology"] -%} 6 | {% for affinity in gpu_topology %} 7 | Name=gpu File=/dev/nvidia{{ loop.index0 }} Cores={{ affinity }} 8 | {% endfor %} 9 | {% endif -%} 10 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/epilog-mps: -------------------------------------------------------------------------------- 1 | # Quit cuda mps if it's running 2 | ps aux | grep nvidia-cuda-mps | grep -v grep > /dev/null 3 | if [ $? -eq 0 ]; then 4 | echo quit | nvidia-cuda-mps-control 5 | fi 6 | 7 | # Test for presence of mps zombie 8 | ps aux | grep nvidia-cuda-mps | grep -v grep > /dev/null 9 | if [ $? -eq 0 ]; then 10 | killall nvidia-cuda-mps-server 11 | fi 12 | -------------------------------------------------------------------------------- /roles/move-home-dirs/tasks/move_user.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: kill user processes for {{ user }} 3 | command: killall -u {{ user }} 4 | register: kill_user_procs 5 | failed_when: false 6 | changed_when: kill_user_procs.rc == 0 7 | 8 | - name: move home directory for {{ user }} 9 | user: 10 | name: "{{ user }}" 11 | home: "{{ move_home_dirs_new_root }}/{{ user }}" 12 | move_home: yes 13 | -------------------------------------------------------------------------------- /roles/nfs/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /config.example/env.sh: -------------------------------------------------------------------------------- 1 | # This file acts as a location to override the default configurations of deepops/scripts/* 2 | # Many of the scripts in this directory define global variables and set reasonable defaults 3 | # Global variables (in all caps) that are defined here will be automatically sourced and used in all scripts 4 | # See deepops/scripts/common.sh for implementation details 5 | 6 | DEEPOPS_EXAMPLE_VAR="" 7 | -------------------------------------------------------------------------------- /roles/nvidia-k8s-gpu-feature-discovery/defaults/main.yml: -------------------------------------------------------------------------------- 1 | # Vars needed to install feature discovery 2 | k8s_gpu_feature_discovery_helm_repo: "https://nvidia.github.io/gpu-feature-discovery" 3 | k8s_gpu_feature_discovery_chart_name: "nvgfd/gpu-feature-discovery" 4 | k8s_gpu_feature_discovery_release_name: "gpu-feature-discovery" 5 | k8s_gpu_feature_discovery_chart_version: "0.8.0" 6 | k8s_gpu_mig_strategy: "mixed" 7 | -------------------------------------------------------------------------------- /roles/rsyslog_client/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure rsyslog is installed 3 | package: 4 | name:
rsyslog 5 | state: present 6 | 7 | - name: configure syslog forwarding 8 | template: 9 | src: "99-forward-syslog.conf" 10 | dest: "/etc/rsyslog.d/99-forward-syslog.conf" 11 | owner: "root" 12 | group: "root" 13 | mode: "0644" 14 | notify: 15 | - restart rsyslog 16 | 17 | -------------------------------------------------------------------------------- /roles/slurm/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/cachefilesd/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/nis_client/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/nvidia-dgx-firmware/tasks/run-diagnostics.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Starting diagnostics step 3 | debug: 4 | msg: "Starting now" 5 | 6 | - name: Check firmware 7 | include_tasks: check-firmware.yml 8 | ignore_errors: true 9 | 10 | - name: Get health 11 | include_tasks: get-health.yml 12 | ignore_errors: true 13 | 14 | - name: Get IB 15 | include_tasks: get-ib.yml 16 | ignore_errors: true 17 | -------------------------------------------------------------------------------- /config.example/README.md: -------------------------------------------------------------------------------- 1 | Example DeepOps configuration 2 | ============================= 3 | 4 | This directory provides an example configuration for NVIDIA DeepOps. 5 | The files in this directory will help determine the behavior of the Ansible playbooks and other scripts that DeepOps uses to set up your systems. 6 | 7 | For more details on how this works, see [how to configure DeepOps](../docs/deepops/configuration.md).
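A minimal usage sketch for this example configuration (the working directory name and the files edited below are assumptions about a typical checkout, not requirements):

    # Copy the example configuration into a working config directory at the repository root
    cp -r config.example config
    # Describe your cluster and override defaults as needed (file names assumed; adjust to your layout)
    vi config/inventory
    vi config/group_vars/all.yml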
8 | -------------------------------------------------------------------------------- /roles/kerberos_client/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/rsyslog_client/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/rsyslog_server/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure apt cache is updated 7 | apt: 8 | update_cache: true 9 | when: ansible_distribution == "Ubuntu" 10 | 11 | - name: workaround to enable service in container 12 | shell: printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d 13 | when: ansible_distribution == "Ubuntu" 14 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/molecule/default/verify.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: verify 3 | hosts: all 4 | tasks: 5 | - name: check for path to singularity 6 | command: which singularity 7 | register: which_singularity 8 | changed_when: which_singularity.rc != 0 9 | 10 | - name: verify path to singularity 11 | assert: 12 | that: 13 | - "'/usr/local/bin/singularity' in which_singularity.stdout" 14 | -------------------------------------------------------------------------------- /workloads/examples/slurm/dask-rapids/files/launch-dask-scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ANACONDA_ROOT="/usr/local/anaconda" 4 | CONDA_ENV="/shared/conda" 5 | export PATH="${CONDA_ENV}/bin:${ANACONDA_ROOT}/bin:${PATH}" 6 | 7 | # shellcheck disable=SC1091 8 | source activate "${CONDA_ENV}" 9 | 10 | echo "Launching dask-scheduler on $(hostname)" 11 | dask-scheduler --host "$(hostname)" || echo "Unable to start scheduler" 12 | -------------------------------------------------------------------------------- /roles/move-home-dirs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: find non-system users who need moving 3 | shell: set -o pipefail && awk '-F:' '($1!="nobody")&&($1!="{{ tmp_user }}")&&($3>=1000){print $6,$1}' /etc/passwd | grep -v "^{{ move_home_dirs_new_root }}" | awk '{print $2}' 4 | changed_when: false 5 | register: user_list 6 | 7 | - name: skip if no work to do 8 | include_tasks: move_users.yml 9 | when: user_list.stdout 10 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/desktop-form.yml.j2:
-------------------------------------------------------------------------------- 1 | --- 2 | attributes: 3 | desktop: "mate" 4 | bc_vnc_idle: 0 5 | bc_vnc_resolution: 6 | required: true 7 | node_type: null 8 | 9 | form: 10 | - bc_vnc_idle 11 | - desktop 12 | - bc_account 13 | - bc_num_hours 14 | - bc_num_slots 15 | - bc_num_gpus 16 | - node_type 17 | - bc_queue 18 | - bc_vnc_resolution 19 | - bc_email_on_started 20 | -------------------------------------------------------------------------------- /roles/rsyslog_client/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # The destination host for TCP forwarding of rsyslog messages. 3 | # 4 | # Note that this isn't defined by default in this role, and must be defined for 5 | # forwarding to take place. 6 | # 7 | # The DeepOps configuration for Slurm or Kubernetes may define this at the 8 | # playbook level. 9 | # 10 | # rsyslog_client_tcp_host: "10.0.0.1" 11 | 12 | rsyslog_client_tcp_port: "514" 13 | -------------------------------------------------------------------------------- /playbooks/utilities/nvidia-set-gpu-clocks.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # set gpu clocks on all worker nodes 3 | - hosts: all 4 | gather_facts: no 5 | become: yes 6 | tasks: 7 | - name: set the gpu clock to a specified amount 8 | shell: "nvidia-smi -lgc {{ gpu_clock_lock }}" 9 | when: not gpu_clock_reset 10 | 11 | - name: reset the gpu clock to the default 12 | shell: "nvidia-smi -rgc" 13 | when: gpu_clock_reset 14 | -------------------------------------------------------------------------------- /workloads/examples/k8s/deep-learning-examples/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ .Values.exampleName }} 5 | labels: 6 | app: {{ .Values.exampleName }} 7 | spec: 8 | type: NodePort 9 | ports: 10 | - name: jupyterlab 11 | nodePort: {{ .Values.jupyterNodePort }} 12 | port: 8888 13 | targetPort: 8888 14 | selector: 15 | app: {{ .Values.exampleName }} 16 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-slurm-enroot-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source workloads/jenkins/scripts/jenkins-common.sh 3 | 4 | # Run a simple one-GPU enroot job 5 | ssh -v \ 6 | -o "StrictHostKeyChecking no" \ 7 | -o "UserKnownHostsFile /dev/null" \ 8 | -l vagrant \ 9 | -i "${HOME}/.ssh/id_rsa" \ 10 | "10.0.0.5${GPU01}" \ 11 | srun -N1 -G1 \ 12 | --container-image="nvcr.io#nvidia/cuda:10.2-base-ubuntu18.04" \ 13 | nvidia-smi -L 14 | -------------------------------------------------------------------------------- /playbooks/utilities/gpu-clocks.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: slurm-node 3 | become: true 4 | tasks: 5 | - name: install custom facts 6 | include_role: 7 | name: facts 8 | 9 | - name: set GPU clocks permissions 10 | command: nvidia-smi -acp UNRESTRICTED 11 | changed_when: false 12 | when: 13 | - ansible_local['gpus']['count'] 14 | environment: "{{proxy_env if proxy_env is defined else{}}}" 15 | 16 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/dhcpd.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 
2 | kind: Pod 3 | metadata: 4 | name: dhcp-server 5 | spec: 6 | hostNetwork: true 7 | containers: 8 | - name: dhcp-server 9 | image: joebiellik/dhcpd 10 | volumeMounts: 11 | - name: config-volume 12 | mountPath: /etc/dhcp 13 | volumes: 14 | - name: config-volume 15 | configMap: 16 | name: dhcpd 17 | restartPolicy: Never 18 | -------------------------------------------------------------------------------- /src/containers/ngc/pytorch/Dockerfile-minimal: -------------------------------------------------------------------------------- 1 | # https://ngc.nvidia.com/catalog/containers/nvidia:pytorch 2 | FROM nvcr.io/nvidia/pytorch:20.12-py3 3 | 4 | # Start Jupyter up by default rather than a shell 5 | ENTRYPOINT ["/bin/sh"] 6 | CMD ["-c", "jupyter lab --notebook-dir=/workspace --ip=0.0.0.0 --no-browser --allow-root --port=8888 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*' --NotebookApp.base_url=${NB_PREFIX}"] 7 | -------------------------------------------------------------------------------- /src/containers/ngc/tensorflow/Dockerfile-minimal: -------------------------------------------------------------------------------- 1 | # https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow 2 | FROM nvcr.io/nvidia/tensorflow:20.12-tf1-py3 3 | 4 | # Start Jupyter up by default rather than a shell 5 | ENTRYPOINT ["/bin/sh"] 6 | CMD ["-c", "jupyter lab --notebook-dir=/workspace --ip=0.0.0.0 --no-browser --allow-root --port=8888 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*' --NotebookApp.base_url=${NB_PREFIX}"] 7 | -------------------------------------------------------------------------------- /roles/grafana/defaults/main.yml: -------------------------------------------------------------------------------- 1 | grafana_config_dir: /etc/grafana 2 | grafana_config_template: templates/grafana.ini.j2 3 | grafana_data_dir: /var/lib/grafana 4 | grafana_user_id: 472 5 | grafana_container: "grafana/grafana:8.5.10" 6 | grafana_svc_name: "docker.grafana.service" 7 | grafana_state: started 8 | grafana_enabled: yes 9 | 10 | grafana_cfg_user: admin 11 | grafana_cfg_pass: deepops 12 | grafana_cfg_dashboard_path: "{{ grafana_data_dir }}/dashboards" 13 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/remote-script-for-registry-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | # Pull nginx container locally 6 | sudo ctr images pull --all-platforms docker.io/library/nginx:1.21 7 | 8 | # Tag docker container for local cluster registry 9 | sudo ctr images tag docker.io/library/nginx:1.21 registry.local:31500/nginx:1.21 10 | 11 | # Push to the local registry 12 | sudo ctr images push --plain-http registry.local:31500/nginx:1.21 13 | -------------------------------------------------------------------------------- /workloads/services/k8s/dgxie/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | -------------------------------------------------------------------------------- /playbooks/generic/software.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: true 4 | tasks: 5 | - name: remove old/undesirable packages 6 | package: 7 | name: "{{ software_remove_packages }}" 8 | state: absent 9 | when: software_remove_packages is defined 10 | 11 | - name: install extra packages 12 | package: 13 | name: "{{ software_extra_packages }}" 14 | state: present 15 | when: software_extra_packages is defined 16 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: apt update 3 | apt: 4 | update_cache: yes 5 | when: ansible_distribution == 'Ubuntu' 6 | 7 | - name: restart cachefilesd 8 | service: 9 | name: cachefilesd 10 | state: restarted 11 | 12 | - name: reboot after driver install 13 | reboot: 14 | when: install_driver.changed and not nvidia_driver_skip_reboot 15 | 16 | - name: restart docker 17 | service: 18 | name: docker 19 | state: restarted 20 | -------------------------------------------------------------------------------- /roles/lmod/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # include some reasonable defaults for module paths 3 | sm_prefix: "/sw" 4 | sm_module_root: "{{ sm_prefix }}/modules" 5 | sm_module_path: "{{ sm_module_root }}/all" 6 | sm_software_path: "{{ sm_prefix }}/software" 7 | 8 | epel_package: "https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm" 9 | epel_key_url: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}" 10 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/bc_osc_codeserver/form.yml.j2: -------------------------------------------------------------------------------- 1 | cluster: "{{ ood_cluster_name }}" 2 | form: 3 | - bc_num_gpus 4 | - bc_num_hours 5 | - working_dir 6 | attributes: 7 | working_dir: 8 | label: "Working Directory" 9 | data-filepicker: true 10 | data-target-file-type: dirs # Valid values are: files, dirs, or both 11 | readonly: false 12 | help: "Select your project directory; defaults to $HOME" 13 | bc_num_gpus: 14 | label: "Number of GPUs" 15 | value: 1 16 | -------------------------------------------------------------------------------- /roles/prometheus-node-exporter/defaults/main.yml: -------------------------------------------------------------------------------- 1 | node_exporter_container: "quay.io/prometheus/node-exporter:v1.3.1" 2 | node_exporter_prom_dir: "/run/prometheus" 3 | node_exporter_svc_name: "docker.node-exporter.service" 4 | node_exporter_state: started 5 | node_exporter_enabled: yes 6 | 7 | prometheus_config_dir: /etc/prometheus 8 | prometheus_cfg_endpoint_dir: "{{ prometheus_config_dir }}/endpoints" 9 | node_exporter_conf_template: "node-exporter.yml.j2" 10 | 11 | node_exporter_max_cpu: "0.5" 12 | -------------------------------------------------------------------------------- 
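The prometheus-node-exporter defaults above run the exporter as a Docker-backed systemd unit; a hedged post-install check might look like the following (it assumes the exporter listens on its stock port 9100, which a deployment could override):

    # Confirm the unit named by node_exporter_svc_name is active
    systemctl status docker.node-exporter.service
    # Scrape a few metrics locally to verify the exporter responds
    curl -s http://localhost:9100/metrics | head -n 5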
/workloads/examples/k8s/services/logging/kibana-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: kibana-logging 5 | namespace: logging 6 | labels: 7 | k8s-app: kibana-logging 8 | kubernetes.io/cluster-service: "true" 9 | addonmanager.kubernetes.io/mode: Reconcile 10 | kubernetes.io/name: "Kibana" 11 | spec: 12 | ports: 13 | - port: 5601 14 | protocol: TCP 15 | targetPort: ui 16 | type: NodePort 17 | selector: 18 | k8s-app: kibana-logging 19 | -------------------------------------------------------------------------------- /playbooks/utilities/mofed.yml: -------------------------------------------------------------------------------- 1 | # Installs NVIDIA Mellanox OFED, a collection of software packages to enable 2 | # high-speed networking with InfiniBand or RoCE on NVIDIA Mellanox networking 3 | # adapters. 4 | # 5 | # This playbook automates the software installation process outlined in the 6 | # MLNX_OFED documentation, here: 7 | # https://community.mellanox.com/s/article/howto-install-mlnx-ofed-driver 8 | --- 9 | - hosts: "{{ hostlist | default('all') }}" 10 | become: yes 11 | roles: 12 | - mofed 13 | -------------------------------------------------------------------------------- /workloads/examples/k8s/deep-learning-examples/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /workloads/examples/slurm/mpi-hello/hello-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -J mpi-hello # Job name 3 | #SBATCH -n 2 # Number of processes 4 | #SBATCH -t 0:10:00 # Max wall time 5 | #SBATCH -o hello-job.out # Output file name 6 | 7 | # Disable the Infiniband transport for OpenMPI (not present on all clusters) 8 | export OMPI_MCA_btl="^openib" 9 | 10 | # Run the job (assumes the batch script is submitted from the same directory) 11 | mpirun -np 2 ./mpi-hello 12 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/vagrant-startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | set -ex 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | cd virtual || exit 1 6 | bash ./vagrant_shutdown.sh || true # Some previous VMs may not have been cleaned; this may fail if the environment is clean; so we proceed regardless 7 | bash ./vagrant_startup.sh # If this fails the entire test should halt 8 | cat virtual_inventory* # We can't look at config/inventory because that is created after this step 9 | cat Vagrantfile 10 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/nvidia-gpu-operator.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ensure OpenShift packages are installed 3 | import_playbook: ../bootstrap/bootstrap-openshift.yml 4 
| 5 | # GPU operator 6 | - hosts: kube-master[0] 7 | become: yes 8 | tasks: 9 | - name: Install helm chart for GPU operator 10 | include_role: 11 | name: nvidia-gpu-operator 12 | run_once: true 13 | environment: "{{proxy_env if proxy_env is defined else {}}}" 14 | tags: 15 | - nvidia 16 | - nvidia-gpu-operator 17 | -------------------------------------------------------------------------------- /roles/slurm/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | slurm_pam_lib_dir: /lib/x86_64-linux-gnu/security 3 | 4 | slurm_build_deps: 5 | - build-essential 6 | - libmunge-dev 7 | - libmariadb-dev 8 | - libmariadbclient-dev-compat 9 | - libpam0g-dev 10 | - libdbus-1-dev 11 | - python3-minimal 12 | - ruby-dev 13 | - wget 14 | 15 | slurm_pmix_deps: 16 | - build-essential 17 | - libev-dev 18 | - libevent-dev 19 | - zlib1g 20 | - zlib1g-dev 21 | - pandoc 22 | 23 | slurm_hwloc_deps: 24 | - build-essential 25 | -------------------------------------------------------------------------------- /scripts/pxe/build_and_restart_dgxie.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | source config/pxe/env 5 | 6 | compose_directory_cmd="" #"--project-directory ." 7 | compose_cmd="docker-compose --env-file ./config/pxe/env ${compose_directory_cmd} -f ${COMPOSE_FILE}" 8 | 9 | 10 | function tear_down() { 11 | ${compose_cmd} down 12 | } 13 | 14 | function build() { 15 | ${compose_cmd} build 16 | } 17 | 18 | function bring_up() { 19 | ${compose_cmd} up -d 20 | } 21 | 22 | 23 | tear_down 24 | build 25 | bring_up 26 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/nfs-client.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: nfs 5 | spec: 6 | capacity: 7 | storage: 500Gi 8 | accessModes: 9 | - ReadWriteMany 10 | nfs: 11 | server: 10.0.0.1 12 | path: "/exports" 13 | --- 14 | apiVersion: v1 15 | kind: PersistentVolumeClaim 16 | metadata: 17 | name: nfs 18 | spec: 19 | accessModes: 20 | - ReadWriteMany 21 | storageClassName: "" 22 | resources: 23 | requests: 24 | storage: 500Gi 25 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-ceph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | # Ensure working directory is root 6 | cd "${ROOT_DIR}" || exit 1 7 | 8 | # Deploy rook, fail if it takes longer than 5 minutes 9 | timeout 300 ./scripts/k8s/deploy_rook.sh -x 10 | 11 | # Poll for completion, fail if it takes longer than 30 minutes 12 | timeout 1800 ./scripts/k8s/deploy_rook.sh -w 13 | 14 | # Print Rook-Ceph information 15 | timeout 60 ./scripts/k8s/deploy_rook.sh -p 16 | -------------------------------------------------------------------------------- /roles/mofed/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mofed_version: "5.6-2.0.9.0" 3 | 4 | mofed_download_dir: "/tmp/mofed-install" 5 | mofed_download_dest: "mofed.tgz" 6 | mofed_download_name: "MLNX_OFED_LINUX-{{ mofed_version }}-{{ mofed_distro }}{{ ansible_distribution_version }}-{{ ansible_architecture }}" 7 | mofed_download_url: "http://www.mellanox.com/downloads/ofed/MLNX_OFED-{{ mofed_version }}/{{ mofed_download_name
}}.tgz" 8 | 9 | mofed_install_flags: "--all --without-fw-update" 10 | 11 | mofed_cleanup_install_dir: true 12 | -------------------------------------------------------------------------------- /scripts/generic/gpu_diag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | diag_level=${1:-1} 4 | 5 | hostname 6 | 7 | # discover GPUs 8 | dcgmi discovery -l 9 | 10 | # remove old groups 11 | for old_group in $(dcgmi group -l | grep "Group ID" | awk '{print $5}') ; do 12 | dcgmi group -d ${old_group} >/dev/null 2>&1 13 | done 14 | 15 | # create new default group and record group number 16 | new_group=$(dcgmi group -c default --default | awk '{print $NF}') 17 | 18 | dcgmi diag -g ${new_group} -r ${diag_level} 19 | 20 | exit 0 21 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-usage/mig-mixed-without-selector.yml: -------------------------------------------------------------------------------- 1 | # This yaml file will launch a container with a 1g.5gb MIG device 2 | # It will deploy onto any GPU where this profile is available 3 | apiVersion: v1 4 | kind: Pod 5 | metadata: 6 | name: gpu-pod 7 | spec: 8 | containers: 9 | - name: gpu-pod 10 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 11 | command: ["/bin/sh"] 12 | args: ["-c", "nvidia-smi"] 13 | resources: 14 | limits: 15 | nvidia.com/mig-1g.5gb: 1 16 | -------------------------------------------------------------------------------- /roles/openmpi/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | openmpi_version: 4.0.3 3 | openmpi_tag: "v{{ openmpi_version.split('.')[0] }}.{{ openmpi_version.split('.')[1] }}" 4 | openmpi_src_url: "https://download.open-mpi.org/release/open-mpi/{{ openmpi_tag }}/openmpi-{{ openmpi_version }}.tar.bz2" 5 | openmpi_build_dir: /tmp/openmpi-build 6 | openmpi_install_prefix: /usr/local 7 | openmpi_configure: "./configure --prefix={{ openmpi_install_prefix }} --disable-dependency-tracking --disable-getpwuid --with-pmix=internal" 8 | openmpi_force_rebuild: no 9 | -------------------------------------------------------------------------------- /workloads/examples/k8s/pytorch-job.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: pytorch-job 5 | spec: 6 | backoffLimit: 5 7 | template: 8 | spec: 9 | containers: 10 | - name: pytorch-container 11 | image: nvcr.io/nvidia/pytorch:19.02-py3 12 | command: ["/bin/sh"] 13 | args: ["-c", "python /workspace/examples/upstream/mnist/main.py"] 14 | resources: 15 | limits: 16 | nvidia.com/gpu: 1 17 | restartPolicy: Never 18 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/logging/es-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: elasticsearch-logging 5 | namespace: logging 6 | labels: 7 | k8s-app: elasticsearch-logging 8 | kubernetes.io/cluster-service: "true" 9 | addonmanager.kubernetes.io/mode: Reconcile 10 | kubernetes.io/name: "Elasticsearch" 11 | spec: 12 | ports: 13 | - port: 9200 14 | protocol: TCP 15 | targetPort: db 16 | type: NodePort 17 | selector: 18 | k8s-app: elasticsearch-logging 19 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-spack-minimal.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | # Install Spack, but do not install any modules 6 | ansible-playbook -i virtual/config/inventory playbooks/slurm-cluster/spack-modules.yml 7 | 8 | # After install, we expect spack to be in our PATH 9 | ssh -v \ 10 | -o "StrictHostKeyChecking no" \ 11 | -o "UserKnownHostsFile /dev/null" \ 12 | -l vagrant \ 13 | -i "${HOME}/.ssh/id_rsa" \ 14 | "10.0.0.5${GPU01}" \ 15 | "which spack" 16 | -------------------------------------------------------------------------------- /src/containers/ngc/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | for dir in `ls -d */ | sed 's:/::g'`; do 3 | cd ${dir} 4 | 5 | echo "Building deepops-${dir}-minimal" 6 | docker build -t deepops-${dir}-minimal -f Dockerfile-minimal . 7 | docker tag deepops-${dir}-minimal deepops-${dir}-minimal:kubeflow 8 | 9 | if [ "${1}" != "minimal" ]; then 10 | echo "Building deepops-${dir}" 11 | docker build -t deepops-${dir} -f Dockerfile . 12 | docker tag deepops-${dir} deepops-${dir}:kubeflow 13 | fi 14 | 15 | cd - 16 | done 17 | -------------------------------------------------------------------------------- /roles/spack/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | spack_repo: "https://github.com/spack/spack.git" 3 | spack_install_dir: "/sw/spack" 4 | spack_version: "v0.18.1" 5 | spack_user: "root" 6 | spack_group: "root" 7 | 8 | spack_ubuntu_deps: 9 | - "gcc-7" 10 | - "gfortran-7" 11 | - "make" 12 | - "git" 13 | 14 | spack_redhat_deps: 15 | - "gcc" 16 | - "gcc-c++" 17 | - "gcc-gfortran" 18 | - "make" 19 | - "git" 20 | 21 | spack_build_packages: false 22 | spack_default_packages: 23 | - "cuda@10.2.89" 24 | - "openmpi@3.1.6 +cuda +pmi schedulers=auto" 25 | -------------------------------------------------------------------------------- /src/containers/dgxie/dnsmasq.conf: -------------------------------------------------------------------------------- 1 | domain-needed 2 | bogus-priv 3 | strict-order 4 | no-resolv 5 | no-poll 6 | expand-hosts 7 | cache-size=2048 8 | bind-interfaces 9 | 10 | server=#DNS1# 11 | server=#DNS2# 12 | domain=#DOMAIN# 13 | interface=#DHCP_INT# 14 | 15 | log-queries 16 | log-dhcp 17 | log-facility=/var/log/dnsmasq.log 18 | 19 | dhcp-authoritative 20 | dhcp-range=#DHCP_START#,#DHCP_END#,#LEASETIME# 21 | dhcp-option=tag:green,option:domain-search,#DOMAIN# 22 | dhcp-option=3,#GATEWAY# 23 | 24 | conf-dir=/etc/dnsmasq.d,*.conf 25 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/epilog.d/50-lastuserjob-all-enroot-dirs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | command -v enroot >/dev/null || exit 0 # enroot not installed 5 | 6 | {% if enroot_runtime_path | default(none) %} 7 | runtime_path="$(sudo -u "$SLURM_JOB_USER" sh -c 'echo "{{ enroot_runtime_path }}"')" 8 | rm -rf "$runtime_path" 9 | {% endif %} 10 | 11 | {% if enroot_data_path | default(none) %} 12 | data_path="$(sudo -u "$SLURM_JOB_USER" sh -c 'echo "{{ enroot_data_path }}"')" 13 | rm -rf "$data_path" 14 | {% endif %} 15 | -------------------------------------------------------------------------------- /workloads/examples/k8s/tensorflow-job.yml: 
-------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: tensorflow-job 5 | spec: 6 | template: 7 | spec: 8 | restartPolicy: Never 9 | containers: 10 | - name: tensorflow-job-container 11 | image: nvcr.io/nvidia/tensorflow:21.03-tf1-py3 12 | command: ["/bin/sh"] 13 | args: ["-c", "python /workspace/nvidia-examples/cnn/resnet.py --layers=50 --batch_size=64"] 14 | resources: 15 | limits: 16 | nvidia.com/gpu: 1 17 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/epilog-dcgmstats: -------------------------------------------------------------------------------- 1 | # Stop DCGM GPU stats collection if requested 2 | scontrol show job $SLURM_JOBID | grep Comment | grep -i dcgmstats > /dev/null 3 | if [ $? -eq 0 ]; then 4 | OUTPUTDIR=`scontrol show job $SLURM_JOBID | grep WorkDir | cut -d = -f 2` 5 | sudo -u $SLURM_JOB_USER dcgmi stats -x $SLURM_JOBID 6 | sudo -u $SLURM_JOB_USER dcgmi stats -v -j $SLURM_JOBID | sudo -u $SLURM_JOB_USER tee $OUTPUTDIR/dcgm-gpu-stats-$HOSTNAME-$SLURM_JOBID.out 7 | sudo -u $SLURM_JOB_USER nv-hostengine -t 8 | fi 9 | -------------------------------------------------------------------------------- /playbooks/container/docker-rootless.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install NVIDIA driver on GPU servers 3 | import_playbook: ../nvidia-software/nvidia-driver.yml 4 | 5 | - name: Install NVIDIA container runtime on GPU servers 6 | import_playbook: nvidia-docker.yml 7 | vars: 8 | nvidia_docker_skip_docker_reload: true 9 | 10 | - name: Install Lmod 11 | import_playbook: ../slurm-cluster/lmod.yml 12 | 13 | - hosts: all 14 | become: yes 15 | tasks: 16 | - name: install rootless docker 17 | include_role: 18 | name: docker-rootless 19 | -------------------------------------------------------------------------------- /roles/nfs/templates/exports.j2: -------------------------------------------------------------------------------- 1 | # /etc/exports: the access control list for filesystems which may be exported 2 | # to NFS clients. See exports(5). 
3 | # 4 | # Example for NFSv2 and NFSv3: 5 | # /srv/homes hostname1(rw,sync,no_subtree_check) hostname2(ro,sync,no_subtree_check) 6 | # 7 | # Example for NFSv4: 8 | # /srv/nfs4 gss/krb5i(rw,sync,fsid=0,crossmnt,no_subtree_check) 9 | # /srv/nfs4/homes gss/krb5i(rw,sync,no_subtree_check) 10 | # 11 | {% for export in nfs_exports %} 12 | {{ export.path }} {{ export.options }} 13 | {% endfor %} 14 | -------------------------------------------------------------------------------- /roles/ood-wrapper/templates/cluster.yml.j2: -------------------------------------------------------------------------------- 1 | --- 2 | v2: 3 | metadata: 4 | title: "{{ ood_cluster_title }}" 5 | login: 6 | host: "{{ ansible_fqdn }}" 7 | job: 8 | adapter: "slurm" 9 | cluster: "{{ ood_cluster_name }}" 10 | bin: "/usr/local/bin" 11 | conf: "/etc/slurm/slurm.conf" 12 | batch_connect: 13 | basic: 14 | script_wrapper: | 15 | %s 16 | vnc: 17 | script_wrapper: | 18 | export PATH="/opt/TurboVNC/bin:$PATH" 19 | export WEBSOCKIFY_CMD="/usr/bin/websockify" 20 | %s 21 | -------------------------------------------------------------------------------- /roles/grafana/templates/docker.grafana.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Grafana 3 | After=docker.service 4 | Requires=docker.service 5 | 6 | [Service] 7 | TimeoutStartSec=0 8 | Restart=always 9 | ExecStartPre=-/usr/bin/docker stop %n 10 | ExecStartPre=-/usr/bin/docker rm %n 11 | ExecStartPre=/usr/bin/docker pull {{ grafana_container }} 12 | ExecStart=/usr/bin/docker run --rm --network host --name %n -v {{ grafana_config_dir }}:/etc/grafana -v {{ grafana_data_dir }}:/var/lib/grafana {{ grafana_container }} 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Set GPU Operator flags for systems with preinstalled NVIDIA software (DGX, etc). 
3 | set_fact: 4 | gpu_operator_enable_driver: false 5 | gpu_operator_enable_toolkit: true 6 | when: gpu_operator_preinstalled_nvidia_software 7 | 8 | - name: deploy nvidia gpu operator to kubernetes 9 | include_tasks: k8s.yml 10 | when: not gpu_operator_nvaie_enable 11 | 12 | - name: deploy nvidia gpu operator to nvidia ai enterprise 13 | include_tasks: nvaie.yml 14 | when: gpu_operator_nvaie_enable 15 | 16 | -------------------------------------------------------------------------------- /roles/nvidia-network-operator/templates/values.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # 3 | # Default setting for DGX systems with IB networking 4 | # 5 | 6 | nfd: 7 | enabled: true 8 | sriovNetworkOperator: 9 | enabled: true 10 | 11 | # NicClusterPolicy CR values: 12 | deployCR: true 13 | ofedDriver: 14 | deploy: false 15 | rdmaSharedDevicePlugin: 16 | deploy: false 17 | sriovDevicePlugin: 18 | deploy: false 19 | 20 | secondaryNetwork: 21 | deploy: true 22 | multus: 23 | deploy: true 24 | cniPlugins: 25 | deploy: true 26 | ipamPlugin: 27 | deploy: true 28 | -------------------------------------------------------------------------------- /roles/slurm/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | slurm_pam_lib_dir: /lib64/security 3 | 4 | slurm_build_deps: 5 | - "@Development Tools" 6 | - munge-devel 7 | - munge-libs 8 | - python3 9 | - readline-devel 10 | - mariadb-devel 11 | - numactl-devel 12 | - pam-devel 13 | - http-parser-devel 14 | - json-c-devel 15 | - perl-ExtUtils-MakeMaker 16 | - libatomic 17 | 18 | slurm_pmix_deps: 19 | - "@Development Tools" 20 | - libev-devel 21 | - libevent-devel 22 | - zlib 23 | - zlib-devel 24 | 25 | slurm_hwloc_deps: 26 | - "@Development Tools" 27 | -------------------------------------------------------------------------------- /src/repo/githooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "${DEEPOPS_BYPASS_LINT}" ]; then 4 | exit 0 5 | fi 6 | 7 | FAILED=0 8 | 9 | # Lint changed Ansible files 10 | if ! src/githooks/check-ansible.py; then 11 | FAILED=1 12 | echo "Failed Ansible lint" 13 | fi 14 | 15 | # Lint changed shell scripts 16 | if ! src/githooks/check-shell.py; then 17 | FAILED=1 18 | echo "Failed shell lint" 19 | fi 20 | 21 | # Lint changed Python files 22 | if ! 
src/githooks/check-python.py; then 23 | FAILED=1 24 | echo "Failed python lint" 25 | fi 26 | 27 | exit ${FAILED} 28 | -------------------------------------------------------------------------------- /roles/nfs-client-provisioner/defaults/main.yml: -------------------------------------------------------------------------------- 1 | # Vars needed to install nfs-client-provisioner 2 | k8s_nfs_server: "127.0.0.1" 3 | k8s_nfs_export_path: "/export/deepops_nfs" 4 | k8s_nfs_default_sc: "true" 5 | k8s_nfs_client_repo_name: "nfs-subdir-external-provisioner" 6 | k8s_nfs_client_helm_repo: "https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner" 7 | k8s_nfs_client_chart_name: "{{ k8s_nfs_client_repo_name }}/nfs-subdir-external-provisioner" 8 | k8s_nfs_client_release_name: "nfs-subdir-external-provisioner" 9 | k8s_nfs_client_chart_version: "4.0.13" 10 | -------------------------------------------------------------------------------- /workloads/examples/k8s/deep-learning-examples/templates/tests/test-connection.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: "{{ include "deep-learning-examples.fullname" . }}-test-connection" 5 | labels: 6 | {{- include "deep-learning-examples.labels" . | nindent 4 }} 7 | annotations: 8 | "helm.sh/hook": test 9 | spec: 10 | containers: 11 | - name: wget 12 | image: busybox 13 | command: ['wget'] 14 | args: ['{{ include "deep-learning-examples.fullname" . }}:{{ .Values.jupyterNodePort }}'] 15 | restartPolicy: Never 16 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/nfs-dgx-iso.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: nfs-dgx-iso 5 | spec: 6 | capacity: 7 | storage: 5Gi 8 | accessModes: 9 | - ReadOnlyMany 10 | nfs: 11 | server: # 12 | path: "/path/to/iso/DGXServer-3.1.2.170902_f8777e" 13 | --- 14 | apiVersion: v1 15 | kind: PersistentVolumeClaim 16 | metadata: 17 | name: nfs-dgx-iso 18 | spec: 19 | accessModes: 20 | - ReadOnlyMany 21 | storageClassName: "" 22 | resources: 23 | requests: 24 | storage: 5Gi 25 | -------------------------------------------------------------------------------- /roles/nvidia-dgx-firmware/tasks/get-health.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Run NVSM human-readable health show 3 | shell: "nvsm show health > {{ fw_dir }}/nvsm-show-health.log" 4 | become: yes 5 | 6 | - name: Run NVSM dump health 7 | shell: "nvsm dump health" 8 | become: yes 9 | when: nvsm_dump_health 10 | 11 | - name: Run quick DCGM diagnostic 12 | shell: "dcgmi diag -r 1 > {{ fw_dir }}/dcgm_diag_1.log" 13 | become: yes 14 | 15 | - name: Run full DCGM diagnostic 16 | shell: "dcgmi diag -r 3 > {{ fw_dir }}/dcgm_diag_3.log" 17 | become: yes 18 | when: dcgm_stress 19 | -------------------------------------------------------------------------------- /roles/prometheus/defaults/main.yml: -------------------------------------------------------------------------------- 1 | prometheus_config_dir: /etc/prometheus 2 | prometheus_config_src: templates/prometheus.yml.j2 3 | prometheus_alert_rules_src: templates/alert_rules.yml.j2 4 | prometheus_container: "prom/prometheus:v2.37.0" 5 | prometheus_svc_name: "docker.prometheus.service" 6 | prometheus_docker_volume_name: "deepops_prometheus_metrics" 7 | prometheus_state: started 8 | 
prometheus_enabled: yes 9 | 10 | prometheus_cfg_scrape_interval: 15s 11 | prometheus_cfg_evaluation_interval: 15s 12 | prometheus_cfg_endpoint_dir: "{{ prometheus_config_dir }}/endpoints" 13 | -------------------------------------------------------------------------------- /roles/kerberos_client/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | kerberos_client_redhat7_pkg: 3 | - libselinux-python 4 | - krb5-libs 5 | - krb5-workstation 6 | - cyrus-sasl-gssapi 7 | 8 | kerberos_client_redhat8_pkg: 9 | - python3-libselinux 10 | - krb5-libs 11 | - krb5-workstation 12 | - cyrus-sasl-gssapi 13 | 14 | kerberos_client_ubuntu18_pkg: 15 | - python-selinux 16 | - krb5-user 17 | - libsasl2-modules-gssapi-mit 18 | - libpam-krb5 19 | 20 | kerberos_client_ubuntu20_pkg: 21 | - python3-selinux 22 | - krb5-user 23 | - libsasl2-modules-gssapi-mit 24 | - libpam-krb5 25 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-operator-node-prep/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: update-initramfs (Debian) 3 | command: update-initramfs -u 4 | when: ansible_os_family == "Debian" 5 | listen: update-initramfs 6 | 7 | - name: Backup initramfs (RedHat) 8 | shell: mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r).img.bak 9 | when: ansible_os_family == "RedHat" 10 | listen: update-initramfs 11 | 12 | - name: update-initramfs (RedHat) 13 | shell: dracut /boot/initramfs-$(uname -r).img $(uname -r) 14 | when: ansible_os_family == "RedHat" 15 | listen: update-initramfs 16 | -------------------------------------------------------------------------------- /roles/slurm/tasks/munge.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: install munge 4 | package: 5 | name: munge 6 | state: present 7 | notify: 8 | - restart munge 9 | 10 | - name: create munge.key 11 | template: 12 | src: templates/etc/munge/munge.key.j2 13 | dest: /etc/munge/munge.key 14 | mode: 0400 15 | owner: munge 16 | notify: 17 | - restart munge 18 | 19 | - name: start munge 20 | service: 21 | name: munge 22 | enabled: yes 23 | state: started 24 | 25 | - name: flush handlers to ensure munge gets restarted now 26 | meta: flush_handlers 27 | -------------------------------------------------------------------------------- /roles/kerberos_client/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | galaxy_info: 3 | author: "Benno Joy" 4 | company: AnsibleWorks 5 | license: BSD 6 | min_ansible_version: 1.4 7 | role_name: kerberos_client 8 | namespace: deepops 9 | platforms: 10 | - name: EL 11 | versions: 12 | - 5 13 | - 6 14 | - name: Fedora 15 | versions: 16 | - 16 17 | - 17 18 | - 18 19 | - name: Ubuntu 20 | versions: 21 | - precise 22 | - quantal 23 | - raring 24 | - saucy 25 | categories: 26 | - development 27 | dependencies: [] 28 | 29 | -------------------------------------------------------------------------------- /roles/prometheus/templates/docker.prometheus.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Prometheus 3 | After=docker.service 4 | Requires=docker.service 5 | 6 | [Service] 7 | TimeoutStartSec=0 8 | Restart=always 9 | ExecStartPre=-/usr/bin/docker stop %n 10 | ExecStartPre=-/usr/bin/docker rm %n 11 | ExecStartPre=/usr/bin/docker pull {{ prometheus_container }} 12 | 
ExecStart=/usr/bin/docker run --rm --network host --name %n -v {{ prometheus_config_dir }}:/etc/prometheus -v {{ prometheus_docker_volume_name }}:/prometheus {{ prometheus_container }} 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure man directories exist in minimal ubuntu image 7 | file: 8 | path: "/usr/share/man/man1" 9 | owner: "root" 10 | group: "root" 11 | state: "directory" 12 | mode: "0755" 13 | when: ansible_distribution == "Ubuntu" 14 | 15 | - name: Ensure dependencies are present for apt key management 16 | apt: 17 | update_cache: yes 18 | name: "gpg-agent" 19 | state: present 20 | when: ansible_distribution == "Ubuntu" 21 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/molecule/default/prepare.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | become: yes 4 | tasks: 5 | 6 | - name: ensure man directories exist in minimal ubuntu image 7 | file: 8 | path: "/usr/share/man/man1" 9 | owner: "root" 10 | group: "root" 11 | state: "directory" 12 | mode: "0755" 13 | when: ansible_distribution == "Ubuntu" 14 | 15 | - name: Ensure dependencies are present for apt key management 16 | apt: 17 | update_cache: yes 18 | name: "gpg-agent" 19 | state: present 20 | when: ansible_distribution == "Ubuntu" 21 | -------------------------------------------------------------------------------- /roles/ood-wrapper/vars/ubuntu.yml: -------------------------------------------------------------------------------- 1 | install_from_src: true 2 | 3 | ood_apache_service_name: apache2 4 | ood_htpasswd_file: /etc/apache2/.htpasswd 5 | 6 | ood_url_turbovnc_pkg: https://downloads.sourceforge.net/project/turbovnc/2.2.4/turbovnc_2.2.4_amd64.deb 7 | 8 | ood_master_sw_deps: 9 | - liblz4-tool 10 | - unzip 11 | - websockify 12 | 13 | ood_client_sw_deps: 14 | - liblz4-tool 15 | - unzip 16 | - nmap 17 | - websockify 18 | - xfce4 19 | - xfce4-terminal 20 | - xfce4-goodies 21 | - jupyter-notebook 22 | - dbus-x11 23 | - firefox 24 | - nvidia-visual-profiler 25 | -------------------------------------------------------------------------------- /workloads/examples/k8s/cluster-gpu-test-job.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: cluster-gpu-tests 5 | spec: 6 | parallelism: 4 # DYNAMIC_PARALLELISM 7 | completions: 4 # DYNAMIC_COMPLETIONS 8 | backoffLimit: 5 9 | template: 10 | spec: 11 | containers: 12 | - name: cluster-gpu-tests 13 | image: nvcr.io/nvidia/cuda:9.0-base 14 | command: ["/bin/bash","-c","nvidia-smi && sleep 10"] 15 | args: 16 | resources: 17 | limits: 18 | nvidia.com/gpu: 1 19 | restartPolicy: Never 20 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-spack-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | # Install Spack, including building default modules 6 | ansible-playbook \ 7 | -i virtual/config/inventory \ 8 | -e '{"spack_build_packages": true}' \ 9 | playbooks/slurm-cluster/spack-modules.yml 10 | 11 | # 
After install, we expect a cuda module to exist 12 | ssh -v \ 13 | -o "StrictHostKeyChecking no" \ 14 | -o "UserKnownHostsFile /dev/null" \ 15 | -l vagrant \ 16 | -i "${HOME}/.ssh/id_rsa" \ 17 | "10.0.0.5${GPU01}" \ 18 | "spack find | grep cuda" 19 | -------------------------------------------------------------------------------- /roles/docker-rootless/templates/rootless-docker/config/nvidia-container-runtime/config.toml: -------------------------------------------------------------------------------- 1 | disable-require = false 2 | #swarm-resource = "DOCKER_RESOURCE_GPU" 3 | 4 | [nvidia-container-cli] 5 | #root = "/run/nvidia/driver" 6 | #path = "/usr/bin/nvidia-container-cli" 7 | environment = [] 8 | #debug = "/var/log/nvidia-container-toolkit.log" 9 | #ldcache = "/etc/ld.so.cache" 10 | load-kmods = true 11 | no-cgroups = true 12 | #no-cgroups = false 13 | #user = "root:video" 14 | ldconfig = "@/sbin/ldconfig.real" 15 | 16 | [nvidia-container-runtime] 17 | #debug = "/var/log/nvidia-container-runtime.log" 18 | -------------------------------------------------------------------------------- /roles/pyxis/defaults/main.yml: -------------------------------------------------------------------------------- 1 | slurm_install_pyxis: true 2 | slurm_install_prefix: /usr/local 3 | slurm_config_dir: /etc/slurm 4 | slurm_pyxis_version: 0.11.1 5 | slurm_pyxis_tarball_url: "https://github.com/NVIDIA/pyxis/archive/v{{ slurm_pyxis_version }}.tar.gz" 6 | 7 | is_controller: no 8 | is_compute: no 9 | 10 | pyxis_ubuntu_deps: 11 | - "bsdmainutils" 12 | 13 | pyxis_el_deps: 14 | - "util-linux" 15 | 16 | # /run is default partition of pyxis runtime_path 17 | resize_run_partition: false 18 | 19 | # /run tmpfs size. ubuntu default is 10% of physical memory 20 | pyxis_run_tmpfs_size: 50% 21 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/ambassador-service.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | labels: 6 | service: ambassador 7 | name: ambassador 8 | annotations: 9 | getambassador.io/config: | 10 | --- 11 | apiVersion: ambassador/v0 12 | kind: Mapping 13 | name: httpbin_mapping 14 | prefix: /httpbin/ 15 | service: httpbin.org:80 16 | host_rewrite: httpbin.org 17 | spec: 18 | type: NodePort 19 | ports: 20 | - name: ambassador 21 | port: 80 22 | targetPort: 80 23 | selector: 24 | service: ambassador 25 | -------------------------------------------------------------------------------- /roles/alertmanager/templates/docker.alertmanager.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Alert Manager 3 | After=docker.service 4 | Requires=docker.service 5 | 6 | [Service] 7 | TimeoutStartSec=0 8 | Restart=always 9 | ExecStartPre=-/usr/bin/docker stop %n 10 | ExecStartPre=-/usr/bin/docker rm %n 11 | ExecStartPre=/usr/bin/docker pull {{ alertmanager_container }} 12 | ExecStart=/usr/bin/docker run --rm --network host --name %n -v {{ alertmanager_config_dir }}:/etc/alertmanager -v {{ alertmanager_docker_volume_name }}:/alertmanager {{ alertmanager_container }} 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- /virtual/vars_files/virt_slurm.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # For virtual cluster, ensure hosts file correctly uses private 
network 3 | hosts_network_interface: "eth1" 4 | 5 | # Ensure vagrant user has SSH access after pam_slurm for debugging 6 | slurm_allow_ssh_user: 7 | - "vagrant" 8 | - "root" 9 | 10 | # Perform cleanup tasks during the install to minimize disk space impact 11 | hpcsdk_clean_up_tarball_after_extract: true 12 | hpcsdk_clean_up_temp_dir: true 13 | slurm_build_dir_cleanup: false 14 | 15 | # Ensure we use the slurm management node for syslog 16 | rsyslog_client_tcp_host: "{{ groups['slurm-master'][0] }}" 17 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/get-slurm-debug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | 6 | # Ensure working directory is virtual, so downstream Ansible picks up the correct inventory 7 | cd "${VIRT_DIR}/virtual" 8 | 9 | # Collect all the standard debug 10 | ${ROOT_DIR}/scripts/slurm/debug.sh 11 | 12 | # The debug script will create a time-stamped log dir 13 | logdir=$(ls -Art ./config | grep log_ | tail -n 1) 14 | 15 | # Iterate over each .log file and print to screen, ignoring the tar 16 | for logfile in $(ls ./config/${logdir}/*log); do 17 | cat ${logfile} 18 | done 19 | -------------------------------------------------------------------------------- /roles/slurm/tasks/undrain.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # un-drain nodes that are down due to an unexpected reboot during install 3 | # sudo scontrol update node=XXX state=idle 4 | # where XXX are the nodes that have changed and are marked as *down* 5 | - name: set nodes to idle 6 | command: "scontrol update node={{ item }} state=idle" 7 | register: undrain_nodes_result 8 | ignore_errors: yes 9 | with_items: 10 | - "{{ groups['slurm-node'] }}" 11 | environment: 12 | PATH: '{{ slurm_install_prefix }}/bin:{{ ansible_env.PATH }}' 13 | run_once: true 14 | tags: 15 | - never 16 | - undrain 17 | changed_when: false 18 | -------------------------------------------------------------------------------- /playbooks/nvidia-dgx/nvidia-dgx-diag.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This playbook is meant to diagnose/debug a cluster of DGX systems 3 | # It is built around DGX-1/DGX-2 but parts of the role will work for any Tesla GPU system 4 | # Because this is built as a diagnostic tool, many tasks have ignore_errors set to true; this allows best-effort debugging 5 | 6 | - hosts: all 7 | become: yes 8 | gather_facts: no 9 | strategy: free 10 | tasks: 11 | - name: Include NVIDIA DGX Firmware role 12 | include_role: 13 | name: nvidia-dgx-firmware 14 | vars: 15 | run_diagnostics: true 16 | update_firmware: false 17 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmstats: -------------------------------------------------------------------------------- 1 | # Start DCGM GPU stats collection if requested 2 | scontrol show job $SLURM_JOBID | grep Comment | grep -i dcgmstats > /dev/null 3 | if [ $?
-eq 0 ]; then 4 | GPULIST=`nvidia-smi | grep Tesla | awk -vORS=, '{print $2}' | sed 's/,$/\n/'` 5 | sudo -u $SLURM_JOB_USER nv-hostengine ~$SLURM_JOB_USER/nvhost.pid 6 | sudo -u $SLURM_JOB_USER dcgmi group -c gpuinfo 7 | sudo -u $SLURM_JOB_USER dcgmi group -g 1 -a $GPULIST 8 | sudo -u $SLURM_JOB_USER dcgmi stats -g 1 --enable 9 | sudo -u $SLURM_JOB_USER dcgmi stats -g 1 -s $SLURM_JOBID 10 | fi 11 | -------------------------------------------------------------------------------- /playbooks/nvidia-dgx/nvidia-dgx-fw-update.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This playbook updates the firmware on DGX nodes 3 | # Refer to the README in roles/nvidia-dgx-firmware for more info 4 | # NOTE: forcing use of the paramiko ssh plugin since running the fw 5 | # update container with the interactive `-it` flag requires a tty and 6 | # the ansible ssh connection is unreliable in `auto` mode. 7 | 8 | - hosts: all 9 | become: yes 10 | connection: paramiko_ssh 11 | tasks: 12 | - name: Include NVIDIA DGX Firmware role 13 | include_role: 14 | name: nvidia-dgx-firmware 15 | vars: 16 | run_diagnostics: true 17 | update_firmware: true 18 | -------------------------------------------------------------------------------- /src/containers/ngc/rapids/Dockerfile-minimal: -------------------------------------------------------------------------------- 1 | # https://ngc.nvidia.com/catalog/containers/nvidia:rapidsai:rapidsai 2 | FROM nvcr.io/nvidia/rapidsai/rapidsai:0.17-cuda10.1-runtime-ubuntu18.04 3 | 4 | # RAPIDS is installed using conda and we need to work from this environment 5 | ENV CONDA_ENV rapids 6 | 7 | # Start using the built-in RAPIDS conda environment 8 | ENTRYPOINT ["/bin/sh"] 9 | CMD ["-c", "/opt/conda/envs/${CONDA_ENV}/bin/jupyter lab --notebook-dir=/rapids --ip=0.0.0.0 --no-browser --allow-root --port=8888 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*' --NotebookApp.base_url=${NB_PREFIX}"] 10 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/prolog-lspci: -------------------------------------------------------------------------------- 1 | # 2 | # Check that all GPUs are present 3 | # 4 | NUMGPUS=`scontrol -a show nodes $HOSTNAME | grep "Gres=gpu" | cut -d : -f 2` 5 | if [ $NUMGPUS -gt 0 ]; then 6 | PCIGPUSFOUND=`lspci | grep "3D controller: NVIDIA Corporation" | wc -l` 7 | if [ $PCIGPUSFOUND -ne $NUMGPUS ]; then 8 | echo "Slurm expects $NUMGPUS GPUs but lspci found: $PCIGPUSFOUND" 9 | scontrol update nodename=$HOSTNAME state=drain reason="Missing GPUs" 10 | exit 0 11 | fi 12 | echo "Slurm expects $NUMGPUS GPUs and lspci found: $PCIGPUSFOUND" 13 | fi 14 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/tasks/configure-raid.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: create raid array mount point 3 | file: 4 | path: "{{ dgx_raid_mount_path }}" 5 | state: directory 6 | 7 | - name: Stop cachefilesd when reconfiguring RAID array 8 | service: 9 | name: cachefilesd 10 | state: stopped 11 | 12 | - name: Configure RAID array 13 | command: /usr/bin/configure_raid_array.py -c -f 14 | 15 | - name: Restore SELinux label on RAID array 16 | command: restorecon /raid 17 | when: 18 | - ansible_os_family == 'RedHat' 19 | - (ansible_selinux is defined) and (ansible_selinux.status != "disabled") 20 | notify: restart cachefilesd 21 | 
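A minimal invocation sketch for the firmware update playbook above (playbooks/nvidia-dgx/nvidia-dgx-fw-update.yml), assuming the DeepOps inventory is already configured; the group name dgx-servers is hypothetical and should be replaced with your own inventory group or host pattern:

    # Hypothetical host pattern; substitute your own inventory group
    ansible-playbook -l dgx-servers playbooks/nvidia-dgx/nvidia-dgx-fw-update.yml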
-------------------------------------------------------------------------------- /roles/slurm/tasks/service-files.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: copy service files 3 | copy: 4 | src: "{{ slurm_build_dir }}/etc/{{ item }}" 5 | dest: "/etc/systemd/system/{{ item }}" 6 | remote_src: yes 7 | mode: "0644" 8 | with_items: 9 | - slurmctld.service 10 | - slurmdbd.service 11 | when: is_controller 12 | failed_when: false 13 | 14 | - name: copy service files 15 | copy: 16 | src: "{{ slurm_build_dir }}/etc/{{ item }}" 17 | dest: "/etc/systemd/system/{{ item }}" 18 | remote_src: yes 19 | mode: "0644" 20 | with_items: 21 | - slurmd.service 22 | when: is_compute 23 | failed_when: false 24 | -------------------------------------------------------------------------------- /roles/standalone-container-registry/templates/config.yml: -------------------------------------------------------------------------------- 1 | version: 0.1 2 | storage: 3 | filesystem: 4 | rootdirectory: /var/lib/registry 5 | http: 6 | addr: "0.0.0.0:{{ standalone_container_registry_port }}" 7 | log: 8 | accesslog: 9 | disabled: false 10 | {% if standalone_container_registry_cache_enable -%} 11 | proxy: 12 | remoteurl: {{ standalone_container_registry_cache_upstream }} 13 | {% if standalone_container_registry_cache_username is defined -%} 14 | username: {{ standalone_container_registry_cache_username }} 15 | password: {{ standalone_container_registry_cache_password }} 16 | {% endif -%} 17 | {% endif -%} 18 | -------------------------------------------------------------------------------- /scripts/pxe/setup_nat.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export HOST_INT_PUB="${1}" 4 | export HOST_INT_PRV="${2}" 5 | 6 | ip a show dev "${HOST_INT_PUB}" 7 | if [ $? -ne 0 ] ; then 8 | exit 1 9 | fi 10 | 11 | ip a show dev "${HOST_INT_PRV}" 12 | if [ $? 
-ne 0 ] ; then 13 | exit 1 14 | fi 15 | 16 | set -x 17 | sudo /sbin/iptables -t nat -A POSTROUTING -o ${HOST_INT_PUB} -j MASQUERADE 18 | sudo /sbin/iptables -A FORWARD -i ${HOST_INT_PUB} -o ${HOST_INT_PRV} -m state --state RELATED,ESTABLISHED -j ACCEPT 19 | sudo /sbin/iptables -A FORWARD -i ${HOST_INT_PRV} -o ${HOST_INT_PUB} -j ACCEPT 20 | sudo sysctl -w net.ipv4.ip_forward=1 21 | set +x 22 | -------------------------------------------------------------------------------- /roles/pyxis/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart munge 3 | service: 4 | name: munge 5 | state: restarted 6 | 7 | - name: restart slurmd 8 | service: 9 | name: slurmd 10 | state: restarted 11 | when: is_compute 12 | 13 | - name: restart slurmdbd 14 | service: 15 | name: slurmdbd 16 | state: restarted 17 | when: is_controller 18 | 19 | - name: restart slurmctld 20 | service: 21 | name: slurmctld 22 | state: restarted 23 | when: is_controller 24 | 25 | - name: restart logind 26 | service: 27 | name: systemd-logind.service 28 | state: restarted 29 | enabled: yes 30 | when: is_compute 31 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-usage/mig-mixed-with-selector.yml: -------------------------------------------------------------------------------- 1 | # This yaml file will launch a container with a 1g.5gb MIG device 2 | # It will deploy only onto the specified GPU type (A100 40GB) even if this profile is available on other systems 3 | apiVersion: v1 4 | kind: Pod 5 | metadata: 6 | name: gpu-pod 7 | spec: 8 | nodeSelector: 9 | nvidia.com/gpu.product: A100-SXM4-40GB 10 | containers: 11 | - name: gpu-pod 12 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 13 | command: ["/bin/sh"] 14 | args: ["-c", "nvidia-smi"] 15 | resources: 16 | limits: 17 | nvidia.com/mig-1g.5gb: 1 18 | -------------------------------------------------------------------------------- /workloads/examples/slurm/dask-rapids/files/launch-dask-cuda-worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ANACONDA_ROOT="/usr/local/anaconda" 4 | CONDA_ENV="/shared/conda" 5 | export PATH="${CONDA_ENV}/bin:${ANACONDA_ROOT}/bin:${PATH}" 6 | 7 | # shellcheck disable=SC1091 8 | source activate "${CONDA_ENV}" 9 | 10 | export CUDA_ROOT=/usr/local/cuda 11 | export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_ROOT/lib64" 12 | export NUMBAPRO_NVVM="$CUDA_ROOT/nvvm/lib64/libnvvm.so" 13 | export NUMBAPRO_LIBDEVICE="$CUDA_ROOT/nvvm/libdevice" 14 | 15 | echo "Launching dask-cuda-worker with scheduler $1 and port $2" 16 | dask-cuda-worker "$1:$2" || echo "Unable to start worker" 17 | -------------------------------------------------------------------------------- /roles/slurm/tasks/setup-role.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: gather os specific variables 3 | include_vars: "{{ item }}" 4 | with_first_found: 5 | - files: 6 | - "{{ ansible_distribution|lower }}.yml" 7 | - "{{ ansible_os_family|lower }}.yml" 8 | paths: 9 | - ../vars 10 | skip: true 11 | tags: 12 | - always 13 | 14 | - name: trust GPG key for EPEL 15 | rpm_key: 16 | key: "{{ epel_key_url }}" 17 | state: present 18 | when: ansible_os_family == "RedHat" 19 | 20 | - name: add epel repo 21 | yum: 22 | name: 23 | - "{{ epel_package }}" 24 | state: present 25 | when: ansible_os_family == "RedHat" 26 | 
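As a usage sketch for the launch-dask-cuda-worker.sh script above, the worker might be started from a Slurm allocation once a scheduler is running; the scheduler host name slurm-node-01, the GPU count, and port 8786 (the Dask default) are placeholders for illustration, not values taken from this repository:

    # Hypothetical Slurm invocation; adjust host, port, and GPU count for your cluster
    srun --nodes=1 --gres=gpu:8 bash launch-dask-cuda-worker.sh slurm-node-01 8786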
-------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-slurm-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source workloads/jenkins/scripts/jenkins-common.sh 3 | 4 | # Upload test script 5 | scp \ 6 | -o "StrictHostKeyChecking no" \ 7 | -o "UserKnownHostsFile /dev/null" \ 8 | -i "${HOME}/.ssh/id_rsa" \ 9 | workloads/jenkins/scripts/remote-script-for-slurm-gpu.sh \ 10 | "vagrant@10.0.0.5${GPU01}:remote-script-for-slurm-gpu.sh" 11 | 12 | # Compile and run CUDA sample 13 | ssh \ 14 | -o "StrictHostKeyChecking no" \ 15 | -o "UserKnownHostsFile /dev/null" \ 16 | -l vagrant \ 17 | -i "${HOME}/.ssh/id_rsa" \ 18 | "10.0.0.5${GPU01}" \ 19 | "bash -l /home/vagrant/remote-script-for-slurm-gpu.sh" 20 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-usage/mig-single.yml: -------------------------------------------------------------------------------- 1 | # This yaml file will launch a container with a 1g.5gb MIG device 2 | # It will only deploy onto a node containing A100 40GB GPUs 3 | # The node must be configured in mig-strategy=single mode with all 1g.5gb profiles created 4 | apiVersion: v1 5 | kind: Pod 6 | metadata: 7 | name: gpu-pod 8 | spec: 9 | nodeSelector: 10 | nvidia.com/gpu.product: A100-SXM4-40GB-MIG-1g.5gb 11 | containers: 12 | - name: gpu-pod 13 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 14 | command: ["/bin/sh"] 15 | args: ["-c", "nvidia-smi"] 16 | resources: 17 | limits: 18 | nvidia.com/gpu: 1 19 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingoff: -------------------------------------------------------------------------------- 1 | # Disable hyperthreading if requested 2 | scontrol show job $SLURM_JOBID | grep Comment | grep -i nohyperthreading > /dev/null 3 | if [ $? -eq 0 ]; then 4 | cat `find /sys/devices/system/cpu -name thread_siblings_list` | sort | uniq > /tmp/thread_siblings_list 5 | for sibs in `cat /tmp/thread_siblings_list` ; do 6 | echo $sibs | grep ',' >& /dev/null # if there is a comma (','), then we need to disable the 2nd sibling 7 | if [ $?
-eq 0 ] ; then 8 | x=`echo $sibs | cut -f 2 -d ','` 9 | echo Disabling CPU $x 10 | echo 0 > /sys/devices/system/cpu/cpu$x/online 11 | fi 12 | done 13 | fi 14 | -------------------------------------------------------------------------------- /roles/ood-wrapper/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: gather os specific variables 3 | include_vars: "{{ item }}" 4 | with_first_found: 5 | - files: 6 | - "{{ ansible_distribution|lower }}.yml" 7 | - "{{ ansible_os_family|lower }}.yml" 8 | paths: 9 | - ../vars 10 | skip: true 11 | tags: vars 12 | 13 | - name: Setup Open OnDemand server 14 | include_tasks: server.yml 15 | when: ood_is_server 16 | 17 | - name: Setup Open OnDemand client 18 | include_tasks: client.yml 19 | when: ood_is_client 20 | 21 | - name: Setup linuxhost adapter 22 | include_tasks: linuxhost-adapter.yml 23 | when: ood_is_server and ood_install_linuxhost_adapter 24 | -------------------------------------------------------------------------------- /roles/nvidia-peer-memory/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Check for DGX packages 3 | stat: 4 | path: /etc/dgx-release 5 | register: is_dgx 6 | 7 | - name: Autoinstall DKMS modules 8 | command: dkms autoinstall 9 | when: 10 | - ansible_local['gpus']['count'] 11 | - is_dgx.stat.exists 12 | 13 | - name: Modprobe nv_peer_mem 14 | modprobe: 15 | name: nv_peer_mem 16 | state: present 17 | when: 18 | - ansible_local['gpus']['count'] 19 | - is_dgx.stat.exists 20 | 21 | - name: Start nv_peer_mem service 22 | service: 23 | name: nv_peer_mem 24 | state: started 25 | when: 26 | - ansible_local['gpus']['count'] 27 | - is_dgx.stat.exists 28 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/tasks/install-dgx.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: DGX | Install CUDA on DGX-1 3 | package: 4 | name: "{{ cuda_dgx_override_version | default(cuda_dgx_1_version) }}" 5 | state: present 6 | when: ansible_product_name is search("DGX-1") 7 | 8 | - name: DGX | Install CUDA on DGX-2 9 | package: 10 | name: "{{ cuda_dgx_override_version | default(cuda_dgx_2_version) }}" 11 | state: present 12 | when: ansible_product_name is search("DGX-2") 13 | 14 | - name: DGX | Install CUDA on DGX A100 15 | package: 16 | name: "{{ cuda_dgx_override_version | default(cuda_dgx_a100_version) }}" 17 | state: present 18 | when: ansible_product_name is search("DGXA100") 19 | -------------------------------------------------------------------------------- /roles/prometheus-node-exporter/templates/docker.node-exporter.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Prometheus Node Exporter 3 | After=docker.service 4 | Requires=docker.service 5 | 6 | [Service] 7 | TimeoutStartSec=0 8 | Restart=always 9 | ExecStartPre=-/usr/bin/docker stop %n 10 | ExecStartPre=-/usr/bin/docker rm %n 11 | ExecStartPre=/usr/bin/docker pull {{ node_exporter_container }} 12 | ExecStart=/usr/bin/docker run --rm --network host --cpus={{ node_exporter_max_cpu }} --pid=host --name %n -v {{ node_exporter_prom_dir }}:/run/prometheus {{ node_exporter_container }} --collector.textfile.directory="{{ node_exporter_prom_dir }}" 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- 
/config.example/playbooks/example.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Your custom playbooks should go in your DeepOps configuration directory, 3 | # under `config/playbooks`. 4 | # 5 | # These playbooks can be used to make any customizations to your cluster 6 | # that aren't already provided by DeepOps! 7 | # For example, this playbook installs `cowsay` on all your Kubernetes nodes. 8 | # 9 | # For more details on how to write Ansible playbooks, see the Ansible 10 | # documentation: 11 | # https://docs.ansible.com/ansible/latest/user_guide/playbooks.html 12 | 13 | - hosts: kube-node 14 | become: yes 15 | tasks: 16 | - name: install cowsay 17 | package: 18 | name: cowsay 19 | state: present 20 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/tasks/install-ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ubuntu | remove old key 3 | apt_key: 4 | id: "{{ old_nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}" 5 | state: "absent" 6 | 7 | - name: Ubuntu | install CUDA keyring 8 | apt: 9 | deb: "{{ nvidia_driver_ubuntu_cuda_keyring_url }}" 10 | state: "present" 11 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 12 | 13 | - name: Ubuntu | force apt update 14 | apt: 15 | update_cache: true 16 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 17 | changed_when: false 18 | 19 | - name: Ubuntu | install package 20 | apt: 21 | name: "{{ dcgm_pkg_name }}" 22 | state: "present" 23 | -------------------------------------------------------------------------------- /roles/nvidia_hpc_sdk/molecule/default/molecule.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependency: 3 | name: galaxy 4 | driver: 5 | name: docker 6 | platforms: 7 | - name: nvhpc-ubuntu-1804 8 | image: geerlingguy/docker-ubuntu1804-ansible 9 | pre_build_image: true 10 | - name: nvhpc-ubuntu-2004 11 | image: geerlingguy/docker-ubuntu2004-ansible 12 | pre_build_image: true 13 | - name: nvhpc-centos-7 14 | image: geerlingguy/docker-centos7-ansible 15 | pre_build_image: true 16 | # - name: nvhpc-centos-8 17 | # image: geerlingguy/docker-centos8-ansible 18 | # pre_build_image: true 19 | provisioner: 20 | name: ansible 21 | ansible_args: 22 | - -vv 23 | verifier: 24 | name: ansible 25 | -------------------------------------------------------------------------------- /roles/openmpi/molecule/default/molecule.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependency: 3 | name: galaxy 4 | driver: 5 | name: docker 6 | platforms: 7 | - name: openmpi-ubuntu-1804 8 | image: geerlingguy/docker-ubuntu1804-ansible 9 | pre_build_image: true 10 | - name: openmpi-ubuntu-2004 11 | image: geerlingguy/docker-ubuntu2004-ansible 12 | pre_build_image: true 13 | - name: openmpi-centos-7 14 | image: geerlingguy/docker-centos7-ansible 15 | pre_build_image: true 16 | # - name: openmpi-centos-8 17 | # image: geerlingguy/docker-centos8-ansible 18 | # pre_build_image: true 19 | provisioner: 20 | name: ansible 21 | ansible_args: 22 | - -vv 23 | verifier: 24 | name: ansible 25 | -------------------------------------------------------------------------------- /workloads/examples/k8s/nbody.yml: -------------------------------------------------------------------------------- 1 | # kubectl apply -f tests/nbody.yml 2 | # kubectl scale deploy/cuda-nbody --replicas=2 3 | 
apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: cuda-nbody 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: cuda-nbody 12 | template: 13 | metadata: 14 | labels: 15 | app: cuda-nbody 16 | spec: 17 | containers: 18 | - name: cuda-nbody-container 19 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 20 | command: ["/bin/sh"] 21 | args: ["-c", "nbody -benchmark -numbodies=1000192"] 22 | resources: 23 | limits: 24 | nvidia.com/gpu: 1 25 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/epilog-ecc: -------------------------------------------------------------------------------- 1 | # Make sure ECC is on. 2 | nvidia-smi -a | grep -A1 Ecc | grep -i disabled > /dev/null 3 | if [ $? -eq 0 ]; then 4 | logger -t PROLOG "Enabling ECC" 5 | nvidia-smi -e 1 6 | GPUCOUNT=`nvidia-smi -L | wc -l` 7 | GPUMAXINDEX=`expr $GPUCOUNT - 1` 8 | systemctl stop collectd 9 | logger -t PROLOG "Triggering GPU reset" 10 | for i in `seq 0 $GPUMAXINDEX`; do 11 | e=`nvidia-smi -r -i $i 2>&1` 12 | if [ $? -ne 0 ]; then 13 | logger -t PROLOG "WARNING! GPU $i reset failed" 14 | logger -t PROLOG "GPU $i reset error: $e" 15 | fi 16 | done 17 | logger -t PROLOG "GPU reset done" 18 | fi 19 | -------------------------------------------------------------------------------- /roles/nvidia-dcgm-exporter/templates/docker.dcgm-exporter.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=NVIDIA DCGM Exporter 3 | After=docker.service 4 | Requires=docker.service 5 | 6 | [Service] 7 | TimeoutStartSec=0 8 | Restart=always 9 | ExecStartPre=-/usr/bin/docker stop %n 10 | ExecStartPre=-/usr/bin/docker rm %n 11 | ExecStartPre=/usr/bin/docker pull {{ nvidia_dcgm_container }} 12 | ExecStart=/usr/bin/docker run --rm --gpus all --cap-add=SYS_ADMIN --cpus="{{ nvidia_dcgm_max_cpu }}" --name %n -p 9400:9400 -v "{{ nvidia_dcgm_container_config_dir }}/{{ nvidia_dcgm_container_custom_metrics_file }}:/etc/dcgm-exporter/default-counters.csv" {{ nvidia_dcgm_container }} 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- /roles/nvidia-dgx/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Check for DGX 3 | fail: 4 | msg: "Role supports DGX systems only" 5 | when: ansible_product_name is not search("DGX") 6 | 7 | - name: Ubuntu tasks for DGX OS 4/5 8 | include_tasks: ubuntu.yml 9 | when: 10 | - ansible_distribution == 'Ubuntu' 11 | 12 | - name: redhat family tasks 13 | include_tasks: redhat.yml 14 | when: ansible_os_family == 'RedHat' 15 | 16 | - name: configure raid array 17 | include_tasks: configure-raid.yml 18 | when: dgx_configure_raid_array 19 | 20 | - name: perform full OS upgrade on Ubuntu 21 | include_tasks: ubuntu-upgrade.yml 22 | when: 23 | - dgx_full_upgrade 24 | - ansible_distribution == 'Ubuntu' 25 | -------------------------------------------------------------------------------- /roles/openshift/molecule/default/molecule.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependency: 3 | name: galaxy 4 | driver: 5 | name: docker 6 | platforms: 7 | - name: openshift-ubuntu-1804 8 | image: geerlingguy/docker-ubuntu1804-ansible 9 | pre_build_image: true 10 | - name: openshift-ubuntu-2004 11 | image: geerlingguy/docker-ubuntu2004-ansible 12 | pre_build_image: true 13 | - name: 
openshift-centos-7 14 | image: geerlingguy/docker-centos7-ansible 15 | pre_build_image: true 16 | # - name: openshift-centos-8 17 | # image: geerlingguy/docker-centos8-ansible 18 | # pre_build_image: true 19 | provisioner: 20 | name: ansible 21 | ansible_args: 22 | - -vv 23 | verifier: 24 | name: ansible 25 | -------------------------------------------------------------------------------- /roles/docker-login/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure python prereqs are installed 3 | package: 4 | name: "{{ item }}" 5 | state: present 6 | with_items: 7 | - "python3-setuptools" 8 | - "python3-pip" 9 | 10 | - name: ensure docker pip package is installed 11 | pip: 12 | name: "docker" 13 | state: present 14 | 15 | - name: log into docker registry 16 | docker_login: 17 | state: "{{ docker_login_state }}" 18 | registry: "{{ item.registry }}" 19 | username: "{{ item.username }}" 20 | password: "{{ item.password }}" 21 | reauthorize: "{{ docker_login_reauth }}" 22 | with_items: "{{ docker_login_registries }}" 23 | no_log: "{{ docker_login_disable_log_password }}" 24 | -------------------------------------------------------------------------------- /roles/cachefilesd/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install 3 | package: 4 | name: cachefilesd 5 | state: "{{ cachefilesd_package_state }}" 6 | 7 | - name: enable 8 | template: 9 | src: cachefilesd.j2 10 | dest: /etc/default/cachefilesd 11 | owner: "root" 12 | group: "root" 13 | mode: "0644" 14 | 15 | - name: configure 16 | template: 17 | src: cachefilesd_config.j2 18 | dest: /etc/cachefilesd.conf 19 | owner: "root" 20 | group: "root" 21 | mode: "0644" 22 | 23 | # Service start not tested in molecule as we require a kernel module 24 | - name: start 25 | service: 26 | name: cachefilesd 27 | state: restarted 28 | enabled: yes 29 | tags: 30 | - molecule-notest 31 | -------------------------------------------------------------------------------- /workloads/services/k8s/k8s-dashboard-admin.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: admin-user 6 | namespace: kube-system 7 | --- 8 | apiVersion: v1 9 | kind: Secret 10 | metadata: 11 | name: admin-user-secret 12 | annotations: 13 | kubernetes.io/service-account.name: admin-user 14 | namespace: kube-system 15 | type: kubernetes.io/service-account-token 16 | 17 | --- 18 | apiVersion: rbac.authorization.k8s.io/v1 19 | kind: ClusterRoleBinding 20 | metadata: 21 | name: admin-user 22 | roleRef: 23 | apiGroup: rbac.authorization.k8s.io 24 | kind: ClusterRole 25 | name: cluster-admin 26 | subjects: 27 | - kind: ServiceAccount 28 | name: admin-user 29 | namespace: kube-system 30 | -------------------------------------------------------------------------------- /roles/prometheus-slurm-exporter/defaults/main.yml: -------------------------------------------------------------------------------- 1 | slurm_exporter_container: "deepops/prometheus-slurm-exporter:latest" 2 | slurm_exporter_svc_name: "docker.slurm-exporter.service" 3 | slurm_exporter_state: started 4 | slurm_exporter_enabled: yes 5 | 6 | slurm_install_prefix: /usr/local 7 | 8 | prometheus_config_dir: /etc/prometheus 9 | prometheus_cfg_endpoint_dir: "{{ prometheus_config_dir }}/endpoints" 10 | slurm_exporter_conf_template: "slurm-exporter.yml.j2" 11 | 12 | grafana_svc_name: 
"docker.grafana.service" 13 | grafana_data_dir: /var/lib/grafana 14 | grafana_cfg_dashboard_path: "{{ grafana_data_dir }}/dashboards" 15 | grafana_user_id: 472 16 | 17 | slurm_exporter_host_group: "{{ slurm_monitoring_group | default('slurm-master') }}" 18 | -------------------------------------------------------------------------------- /roles/slurm/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart munge 3 | service: 4 | name: munge 5 | state: restarted 6 | 7 | - name: restart slurmd 8 | service: 9 | name: slurmd 10 | state: restarted 11 | when: is_compute 12 | 13 | - name: restart slurmdbd 14 | service: 15 | name: slurmdbd 16 | state: restarted 17 | when: is_controller 18 | 19 | - name: restart slurmctld 20 | service: 21 | name: slurmctld 22 | state: restarted 23 | when: is_controller 24 | 25 | - name: restart logind 26 | service: 27 | name: systemd-logind.service 28 | state: restarted 29 | enabled: yes 30 | when: is_compute 31 | 32 | - name: restart rsyslog 33 | service: 34 | name: rsyslog 35 | state: restarted 36 | -------------------------------------------------------------------------------- /roles/nvidia-dcgm-exporter/defaults/main.yml: -------------------------------------------------------------------------------- 1 | nvidia_dcgm_container_version: "2.1.8-2.4.0-rc.2-ubuntu20.04" 2 | nvidia_dcgm_container: "nvcr.io/nvidia/k8s/dcgm-exporter:{{ nvidia_dcgm_container_version }}" 3 | nvidia_dcgm_container_config_dir: "/opt/deepops/nvidia-dcgm-exporter" 4 | nvidia_dcgm_container_custom_metrics_file: "dcgm-custom-metrics.csv" 5 | nvidia_dcgm_prom_dir: "/run/prometheus" 6 | nvidia_dcgm_svc_name: "docker.dcgm-exporter.service" 7 | nvidia_dcgm_state: started 8 | nvidia_dcgm_enabled: yes 9 | 10 | prometheus_config_dir: /etc/prometheus 11 | prometheus_cfg_endpoint_dir: "{{ prometheus_config_dir }}/endpoints" 12 | nvidia_dcgm_exporter_conf_template: "dcgm-exporter.yml.j2" 13 | 14 | has_gpus: false 15 | 16 | nvidia_dcgm_max_cpu: "0.5" 17 | -------------------------------------------------------------------------------- /workloads/examples/slurm/dask-rapids/files/conda-requirements.yml: -------------------------------------------------------------------------------- 1 | name: rapids 2 | channels: 3 | - numba 4 | - conda-forge 5 | - nvidia/label/cuda10.0 6 | - rapidsai/label/cuda10.0 7 | - pytorch 8 | - defaults 9 | dependencies: 10 | - arrow-cpp=0.12 11 | - bokeh 12 | - cffi=1.11.5 13 | - cmake=3.12 14 | - cuda100 15 | - cudf=0.5.1 16 | - cuml=0.5.1 17 | - cython=0.29 18 | - dask=1.1.1 19 | - distributed=1.25.3 20 | - faiss-gpu=1.5.0 21 | - jupyterlab 22 | - matplotlib 23 | - numba=0.42 24 | - numpy=1.15.4 25 | - nvstrings 26 | - pandas=0.23.4 27 | - paramiko 28 | - pyarrow=0.12 29 | - pytest 30 | - python=3.7 31 | - scikit-learn 32 | - scipy 33 | - pip: 34 | - setuptools 35 | - cupy-cuda100 36 | -------------------------------------------------------------------------------- /workloads/examples/k8s/services/pxe.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: pxe-server 5 | spec: 6 | hostNetwork: true 7 | containers: 8 | - name: pxe-server 9 | image: deepops/provision/pxe # change me 10 | volumeMounts: 11 | - name: config-volume 12 | mountPath: /data 13 | - name: nfs 14 | mountPath: "/iso" 15 | imagePullSecrets: 16 | - name: secret # change me 17 | volumes: 18 | - name: config-volume 19 | configMap: 20 | name: pxe 21 | 
items: 22 | - key: machines.json 23 | path: machines.json 24 | - name: nfs 25 | persistentVolumeClaim: 26 | claimName: nfs-dgx-iso 27 | restartPolicy: Never 28 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/container-registry.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: kube-master 3 | become: true 4 | tasks: 5 | - name: Install helm chart for container registry 6 | include_role: 7 | name: k8s-internal-container-registry 8 | run_once: true 9 | tags: 10 | - container-registry 11 | 12 | - hosts: kube-node 13 | become: true 14 | vars: 15 | container_registry_hostname: registry.local 16 | tasks: 17 | - name: Set registry hostname in /etc/hosts 18 | lineinfile: 19 | path: /etc/hosts 20 | line: "{{ hostvars[groups['kube-master'][0]]['ansible_host'] | default(hostvars[groups['kube-master'][0]]['ansible_default_ipv4']['address']) }} {{ container_registry_hostname }}" 21 | tags: 22 | - container-registry 23 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/.yamllint: -------------------------------------------------------------------------------- 1 | --- 2 | # Based on ansible-lint config 3 | extends: default 4 | 5 | rules: 6 | braces: 7 | max-spaces-inside: 1 8 | level: error 9 | brackets: 10 | max-spaces-inside: 1 11 | level: error 12 | colons: 13 | max-spaces-after: -1 14 | level: error 15 | commas: 16 | max-spaces-after: -1 17 | level: error 18 | comments: disable 19 | comments-indentation: disable 20 | document-start: disable 21 | empty-lines: 22 | max: 3 23 | level: error 24 | hyphens: 25 | level: error 26 | indentation: disable 27 | key-duplicates: enable 28 | line-length: disable 29 | new-line-at-end-of-file: disable 30 | new-lines: 31 | type: unix 32 | trailing-spaces: disable 33 | truthy: disable 34 | -------------------------------------------------------------------------------- /workloads/bit/hpl/syscfg-dgx1v.sh: -------------------------------------------------------------------------------- 1 | GPU_AFFINITY="0:1:2:3:4:5:6:7" 2 | CPU_AFFINITY="0-4:5-9:10-14:15-19:20-24:25-29:30-34:35-39" 3 | CPU_CORES_PER_RANK=4 4 | MEM_AFFINITY="0:0:0:0:1:1:1:1" 5 | UCX_AFFINITY="mlx5_0:mlx5_0:mlx5_1:mlx5_1:mlx5_2:mlx5_2:mlx5_3:mlx5_3" 6 | GPU_CLOCK="877,1275" 7 | 8 | export MONITOR_GPU=1 9 | export TEST_SYSTEM_PARAMS=1 10 | export TEST_LOOPS=1 11 | export GPU_CLOCK_WARNING=$(echo ${GPU_CLOCK} | cut -f2 -d,) 12 | export GPU_POWER_WARNING=300 13 | export GPU_PCIE_GEN_WARNING=3 14 | export GPU_PCIE_WIDTH_WARNING=16 15 | 16 | ## Depending on driver version, you may need to uncomment the following line 17 | # export LD_LIBRARY_PATH="/usr/local/cuda/compat:$LD_LIBRARY_PATH 18 | 19 | export UCX_TLS=all 20 | export OMPI_MCA_pml_ucx_verbose=100 21 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmhealth: -------------------------------------------------------------------------------- 1 | # 2 | # Check that all GPUs are healthy via dcgm 3 | # 4 | if [ $NUMGPUS -gt 0 ]; then 5 | echo "Execute dcgm health check" 6 | GPULIST=`nvidia-smi | grep Tesla | awk -vORS=, '{print $2}' | sed 's/,$/\n/'` 7 | rm /tmp/dcgm.out 2> /dev/null 8 | nv-hostengine 9 | dcgmi group -c gpuinfo 10 | dcgmi group -g 1 -a $GPULIST 11 | dcgmi diag -g 1 -r 1 1> /tmp/dcgm.out 12 | dcgmi group -d 1 13 | nv-hostengine -t 14 | grep -i fail /tmp/dcgm.out > /dev/null 15 | if [ $? 
-gt 0 ]; then 16 | scontrol update nodename=$HOSTNAME state=drain reason="Failed DCGM, see /tmp/dcgm.out" 17 | exit 0 18 | fi 19 | fi 20 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-usage/gpu-without-selector.yml: -------------------------------------------------------------------------------- 1 | # This yaml file will launch a container onto any node that has a nvidia.com/gpu resource 2 | # This container could potentially be deployed on an NVIDIA A100, V100, T4, or any other GPU type 3 | # If a node is configured with mig-strategy=single, this container could potentially run with a MIG device 4 | # 5 | # This deployment style is unpredictable in a heterogeneous cluster and should not be used 6 | apiVersion: v1 7 | kind: Pod 8 | metadata: 9 | name: gpu-pod 10 | spec: 11 | containers: 12 | - name: gpu-pod 13 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 14 | command: ["/bin/sh"] 15 | args: ["-c", "nvidia-smi"] 16 | resources: 17 | limits: 18 | nvidia.com/gpu: 1 19 | -------------------------------------------------------------------------------- /config.example/helm/metallb-resources.yml: -------------------------------------------------------------------------------- 1 | # This was autogenerated by MetalLB's custom resource generator. 2 | apiVersion: metallb.io/v1beta1 3 | kind: IPAddressPool 4 | metadata: 5 | creationTimestamp: null 6 | name: default 7 | namespace: deepops-loadbalancer 8 | # Default address range matches private network for the virtual cluster 9 | # defined in virtual/. 10 | # You should set this address range based on your site's infrastructure. 11 | spec: 12 | addresses: 13 | - 10.0.0.100-10.0.0.110 14 | status: {} 15 | --- 16 | apiVersion: metallb.io/v1beta1 17 | kind: L2Advertisement 18 | metadata: 19 | creationTimestamp: null 20 | name: l2advertisement1 21 | namespace: deepops-loadbalancer 22 | spec: 23 | ipAddressPools: 24 | - default 25 | status: {} 26 | --- 27 | -------------------------------------------------------------------------------- /playbooks/container/pyxis.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: slurm-master 3 | tasks: 4 | - name: set controller fact 5 | set_fact: 6 | is_controller: true 7 | tags: always 8 | 9 | - hosts: slurm-node 10 | tasks: 11 | - name: set compute fact 12 | set_fact: 13 | is_compute: true 14 | tags: always 15 | 16 | - hosts: slurm-cluster 17 | become: yes 18 | tasks: 19 | - name: set enroot DGX config fact 20 | set_fact: 21 | enroot_environ_config_files: "{{ enroot_environ_config_files_dgx }}" 22 | when: ansible_product_name is search("DGX") 23 | 24 | - hosts: slurm-node 25 | become: yes 26 | roles: 27 | - name: nvidia.enroot 28 | 29 | - hosts: slurm-cluster 30 | become: yes 31 | roles: 32 | - name: pyxis 33 | -------------------------------------------------------------------------------- /roles/facts/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: apt install pciutils 3 | apt: name=pciutils update_cache=yes 4 | when: ansible_os_family == 'Debian' 5 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 6 | 7 | - name: yum install pciutils 8 | yum: name=pciutils update_cache=yes 9 | when: ansible_os_family == 'RedHat' 10 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 11 | 12 | - name: create fact directory 13 | file: 14 | path: /etc/ansible/facts.d 15 | state: directory 16 | mode: 0755 17 | 18 | - name: 
custom facts 19 | copy: 20 | src: "{{ item }}" 21 | dest: /etc/ansible/facts.d/ 22 | mode: 0755 23 | with_fileglob: 24 | - '*' 25 | 26 | - name: regather local facts 27 | setup: filter=ansible_local 28 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/.yamllint: -------------------------------------------------------------------------------- 1 | --- 2 | # Based on ansible-lint config 3 | extends: default 4 | 5 | rules: 6 | braces: 7 | max-spaces-inside: 1 8 | level: error 9 | brackets: 10 | max-spaces-inside: 1 11 | level: error 12 | colons: 13 | max-spaces-after: -1 14 | level: error 15 | commas: 16 | max-spaces-after: -1 17 | level: error 18 | comments: disable 19 | comments-indentation: disable 20 | document-start: disable 21 | empty-lines: 22 | max: 3 23 | level: error 24 | hyphens: 25 | level: error 26 | indentation: disable 27 | key-duplicates: enable 28 | line-length: disable 29 | new-line-at-end-of-file: disable 30 | new-lines: 31 | type: unix 32 | trailing-spaces: disable 33 | truthy: disable 34 | -------------------------------------------------------------------------------- /roles/singularity_wrapper/molecule/default/molecule.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependency: 3 | name: galaxy 4 | options: 5 | requirements-file: requirements.yml 6 | driver: 7 | name: docker 8 | platforms: 9 | - name: singularity-ubuntu-1804 10 | image: geerlingguy/docker-ubuntu1804-ansible 11 | pre_build_image: true 12 | - name: singularity-ubuntu-2004 13 | image: geerlingguy/docker-ubuntu2004-ansible 14 | pre_build_image: true 15 | - name: singularity-centos-7 16 | image: geerlingguy/docker-centos7-ansible 17 | pre_build_image: true 18 | # - name: singularity-centos-8 19 | # image: geerlingguy/docker-centos8-ansible 20 | # pre_build_image: true 21 | provisioner: 22 | name: ansible 23 | ansible_args: 24 | - -vv 25 | verifier: 26 | name: ansible 27 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-dashboard.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | source workloads/jenkins/scripts/jenkins-common.sh 4 | 5 | # Ensure working directory is root 6 | cd "${ROOT_DIR}" 7 | 8 | # Deploy Dashboard 9 | source ./scripts/k8s/deploy_dashboard_user.sh 10 | 11 | # The deployment script exports the http endpoints, verify it returns a 200 12 | # It typically takes ~1 minutes for all pods and services to start, so we poll 13 | timeout=120 14 | time=0 15 | while [ ${time} -lt ${timeout} ]; do 16 | curl -ks --raw -kL "${dashboard_url}" | grep "Kubernetes Dashboard" && \ 17 | echo "Dashboard URLs are all responding" && exit 0 18 | let time=$time+15 19 | sleep 15 20 | done 21 | 22 | # Dashboard deployment failure 23 | echo "Dashboard did not come up in time" 24 | exit 1 25 | -------------------------------------------------------------------------------- /roles/nhc/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nhc_version: "1.4.3" 3 | nhc_src_url: "https://github.com/mej/nhc/releases/download/{{ nhc_version }}/lbnl-nhc-{{ nhc_version }}.tar.xz" 4 | nhc_install_dir: "/usr" 5 | nhc_config_dir: "/etc" 6 | nhc_libexec_dir: "/usr/libexec" 7 | nhc_build_dir: "/opt/deepops/build/nhc" 8 | nhc_sysconfig_dir: "/etc/sysconfig" 9 | 10 | nhc_extract_dir: "{{ nhc_build_dir }}/lbnl-nhc-{{ nhc_version }}" 11 | 
nhc_configure: "./configure --prefix={{ nhc_install_dir }} --sysconfdir={{ nhc_config_dir }} --libexecdir={{ nhc_libexec_dir }}" 12 | nhc_test: "make test" 13 | nhc_run_test: false 14 | nhc_make: "make install" 15 | 16 | nhc_config_template: "nhc.conf.j2" 17 | nhc_sysconfig_template: "sysconfig_nhc.j2" 18 | 19 | nhc_force_reinstall: false 20 | nhc_config: "nhc.conf.j2" 21 | -------------------------------------------------------------------------------- /workloads/examples/k8s/dask-rapids/k8s/rapids-dask-sa.yml: -------------------------------------------------------------------------------- 1 | kind: Role 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | namespace: rapids 5 | name: dask-scaler 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods"] 9 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 10 | --- 11 | kind: RoleBinding 12 | apiVersion: rbac.authorization.k8s.io/v1 13 | metadata: 14 | name: dask-scaler 15 | namespace: rapids 16 | subjects: 17 | - kind: ServiceAccount 18 | name: default # TODO: Create specific user for jupyter 19 | namespace: rapids 20 | roleRef: 21 | kind: Role 22 | name: dask-scaler 23 | apiGroup: rbac.authorization.k8s.io 24 | --- 25 | apiVersion: v1 26 | kind: ServiceAccount 27 | metadata: 28 | name: dask-rapids 29 | namespace: rapids 30 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-tests/README.md: -------------------------------------------------------------------------------- 1 | Role Name 2 | ========= 3 | # NVIDIA GPU Tests Role 4 | 5 | This role is meant to be a quick tool for system validation or simple system burn in. It should not be used as a comprehensive performance test. 6 | 7 | Running this will perform the following: 8 | 9 | * Install the CUDA toolkit 10 | * Download and build cuda-samples 11 | * Run the Peer2Peer and MatrixMultiply tests 12 | * Run the DCGM diagnostics 13 | * Run a basic Tensorflow DL job 14 | 15 | 16 | # Requirements 17 | 18 | This role can be applied to a heterogeneous cluster of GPU nodes. 
19 | 20 | The following should be installed on the system prior to running this role (these come standard in the DGX Operating System): 21 | 22 | * CUDA toolkit 23 | * dcgmi 24 | * nvidia-docker 25 | * docker 26 | 27 | -------------------------------------------------------------------------------- /roles/nvidia-dgx-firmware/tasks/get-ib.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: get hostname 3 | shell: hostname 4 | register: hostname 5 | 6 | - name: check mlxconfig 7 | become: yes 8 | shell: "mlxconfig query | egrep -e Device\\|LINK_TYPE_P1\\|LINK_TYPE_P2" 9 | register: mlx_config 10 | ignore_errors: yes 11 | 12 | - name: check ibstat 13 | become: yes 14 | shell: "ibstat | egrep -e mlx\\|Link" 15 | register: ibstat 16 | ignore_errors: yes 17 | 18 | - name: save actual hostname 19 | shell: echo "{{ hostname.stdout }}" >> "{{ fw_dir }}/{{ inventory_hostname }}.log" 20 | - name: save mlx_config 21 | shell: echo "{{ mlx_config.stdout }}" >> "{{ fw_dir }}/{{ inventory_hostname }}.log" 22 | - name: save ibstat 23 | shell: echo "{{ ibstat.stdout }}" >> "{{ fw_dir }}/{{ inventory_hostname }}.log" 24 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/tasks/install-redhat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: RedHat | trust GPG key for EPEL 3 | rpm_key: 4 | key: "{{ epel_key_url }}" 5 | state: present 6 | 7 | - name: RedHat | add epel repo 8 | become: yes 9 | yum: 10 | name: 11 | - "{{ epel_package }}" 12 | state: present 13 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 14 | 15 | - name: RedHat | add CUDA repo 16 | yum_repository: 17 | name: cuda 18 | description: NVIDIA CUDA YUM Repo 19 | baseurl: "{{ nvidia_driver_rhel_cuda_repo_baseurl }}" 20 | gpgkey: "{{ nvidia_driver_rhel_cuda_repo_gpgkey }}" 21 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 22 | 23 | - name: RedHat | install cuda 24 | package: 25 | name: "{{ cuda_version }}" 26 | state: present 27 | -------------------------------------------------------------------------------- /roles/ood-wrapper/vars/redhat.yml: -------------------------------------------------------------------------------- 1 | install_from_src: false 2 | 3 | ood_apache_service_name: httpd24-httpd.service 4 | ood_htpasswd_file: /opt/rh/httpd24/root/etc/httpd/.htpasswd 5 | 6 | ood_url_turbovnc_pkg: https://downloads.sourceforge.net/project/turbovnc/2.2.4/turbovnc-2.2.4.x86_64.rpm 7 | 8 | ood_master_sw_deps: 9 | - lz4-devel 10 | - unzip 11 | - python-websockify 12 | 13 | ood_client_sw_deps: 14 | - lz4-devel 15 | - unzip 16 | - nmap 17 | - python-websockify 18 | - '@xfce4' 19 | - xfce4-session 20 | - xfce4-settings 21 | - xfce4-terminal 22 | - xfdesktop 23 | - gtk-xfce-engine 24 | - gtk2-engines 25 | - python2-jupyter-core 26 | - python2-jupyroot 27 | - python36-jupyter-core 28 | - python36-jupyroot 29 | - dbus-x11 30 | - firefox 31 | - cuda-nvvp-10-1 32 | -------------------------------------------------------------------------------- /roles/slurm/tasks/misc-node.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Configure nodes that are neither running controller services, nor slurmd. 3 | # Examples include login nodes or CI nodes. 
4 | 5 | - name: create slurm directories 6 | file: 7 | path: "{{ item }}" 8 | state: directory 9 | owner: slurm 10 | mode: 0755 11 | with_items: 12 | - "{{ slurm_config_dir }}" 13 | 14 | - name: configure slurm.conf 15 | template: 16 | src: "{{ slurm_conf_template }}" 17 | dest: "{{ slurm_config_dir }}/slurm.conf" 18 | mode: "0644" 19 | tags: 20 | - config 21 | 22 | - name: ensure all slurm services are stopped 23 | service: 24 | name: "{{ item }}" 25 | state: stopped 26 | enabled: no 27 | with_items: 28 | - slurmctld 29 | - slurmd 30 | - slurmdbd 31 | failed_when: false 32 | -------------------------------------------------------------------------------- /scripts/deepops/enable_linting.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FAILED=0 4 | if ! which ansible-lint; then 5 | echo "ansible-lint not found in PATH" 6 | FAILED=1 7 | fi 8 | if ! which shellcheck; then 9 | echo "shellcheck not found in PATH" 10 | FAILED=1 11 | fi 12 | if ! which pylint; then 13 | echo "pylint not found in PATH" 14 | FAILED=1 15 | fi 16 | if [ ${FAILED} -ne 0 ]; then 17 | echo 18 | echo 'One or more required linters not found!' 19 | echo 'Please install the missing linter using pip or your system package manager,' 20 | echo 'and try again.' 21 | echo 22 | echo 'Pre-commit hook not enabled.' 23 | exit 1 24 | fi 25 | 26 | echo "Enabling pre-commit hooks to lint Ansible, Shell, and Python" 27 | cp -v src/repo/githooks/pre-commit .git/hooks/pre-commit 28 | chmod +x .git/hooks/pre-commit 29 | -------------------------------------------------------------------------------- /docs/cloud-native/README.md: -------------------------------------------------------------------------------- 1 | # Deprecated 2 | 3 | Up-to-date Ansible playbooks and install guides for Cloud Native Core can now be found in the [dedicated NVIDIA/cloud-native-core repository](https://github.com/NVIDIA/cloud-native-core). 4 | 5 | ## NVIDIA Cloud Native Core 6 | 7 | NVIDIA Cloud Native Core is a collection of software to run cloud native workloads on NVIDIA GPUs. 8 | NVIDIA Cloud Native Core is based on Ubuntu, Kubernetes, Helm, and the NVIDIA GPU and Network Operators. 9 | 10 | This software stack was previously known as "EGX Stack", and is designed to run well on the [NVIDIA EGX Platform](https://www.nvidia.com/en-us/data-center/products/egx/) 11 | 12 | In DeepOps 22.01 and before, we included Ansible playbooks for installing and validating the EGX Platform in `playbooks/nvidia-egx`. 
13 | -------------------------------------------------------------------------------- /roles/nvidia_dcgm/tasks/install-redhat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: RedHat | trust GPG key for EPEL 3 | rpm_key: 4 | key: "{{ epel_key_url }}" 5 | state: present 6 | 7 | - name: RedHat | add epel repo 8 | become: yes 9 | yum: 10 | name: 11 | - "{{ epel_package }}" 12 | state: present 13 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 14 | 15 | - name: RedHat | add CUDA repo 16 | yum_repository: 17 | name: cuda 18 | description: NVIDIA CUDA YUM Repo 19 | baseurl: "{{ nvidia_driver_rhel_cuda_repo_baseurl }}" 20 | gpgkey: "{{ nvidia_driver_rhel_cuda_repo_gpgkey }}" 21 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 22 | 23 | - name: RedHat | install package 24 | package: 25 | name: "{{ dcgm_pkg_name }}" 26 | state: "present" 27 | -------------------------------------------------------------------------------- /roles/rsyslog_server/templates/01-deepops-listen.conf: -------------------------------------------------------------------------------- 1 | # {{ ansible_managed }} 2 | # Define ruleset for per-host files 3 | template(name="perhost" type="string" string="{{ rsyslog_log_file_path_pattern }}") 4 | ruleset(name="remote") { 5 | action(type="omfile" dynafile="perhost") 6 | } 7 | {% if rsyslog_enable_journal -%} 8 | # Import journal messages into syslog 9 | module(load="imjournal") 10 | {% endif -%} 11 | {% if rsyslog_server_tcp_port is defined -%} 12 | # Accept syslog messages on TCP 13 | module(load="imtcp") 14 | input(type="imtcp" port="{{ rsyslog_server_tcp_port }}" ruleset="remote") 15 | {% endif -%} 16 | {% if rsyslog_server_udp_port -%} 17 | # Accept syslog messages on UDP 18 | module(load="imudp") 19 | input(type="imudp" port="{{ rsyslog_server_udp_port }}" ruleset="remote") 20 | {% endif -%} 21 | -------------------------------------------------------------------------------- /playbooks/slurm-cluster/nvidia-dcgm-exporter.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install docker 3 | import_playbook: ../container/docker.yml 4 | 5 | - name: Install NVIDIA driver 6 | import_playbook: ../nvidia-software/nvidia-driver.yml 7 | 8 | - name: Install NVIDIA container runtime 9 | import_playbook: ../container/nvidia-docker.yml 10 | 11 | - hosts: "{{ hostlist | default('all') }}" 12 | become: yes 13 | tasks: 14 | - name: install custom facts module 15 | include_role: 16 | name: facts 17 | - name: set GPU fact 18 | set_fact: 19 | has_gpus: true 20 | when: ansible_local['gpus']['count'] 21 | - name: configure dcgm exporter 22 | include_role: 23 | name: nvidia-dcgm-exporter 24 | when: ansible_distribution == "Ubuntu" or ansible_os_family == "RedHat" 25 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/tasks/install-ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ubuntu | remove ppa 3 | apt_repository: 4 | repo: ppa:graphics-drivers/ppa 5 | state: absent 6 | 7 | - name: Ubuntu | ensure old key is absent 8 | apt_key: 9 | id: "{{ old_nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}" 10 | state: "absent" 11 | 12 | - name: Ubuntu | install CUDA keyring 13 | apt: 14 | deb: "{{ nvidia_driver_ubuntu_cuda_keyring_url }}" 15 | state: "present" 16 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 17 | 18 | - name: Ubuntu | force 
apt update 19 | apt: 20 | update_cache: true 21 | environment: "{{ proxy_env if proxy_env is defined else {} }}" 22 | changed_when: false 23 | 24 | - name: Ubuntu | install cuda 25 | package: 26 | name: "{{ cuda_version }}" 27 | state: present 28 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/shared/bin/set_gpu_power_levels.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | gpu_count="$(nvidia-smi -L | wc -l)" 5 | 6 | for i in $(seq 0 "$(( gpu_count - 1 ))" ) 7 | do 8 | case "$1" in 9 | max) 10 | next="$(nvidia-smi -i "$i" --query-gpu=power.max_limit --format=csv,noheader,nounits)" 11 | ;; 12 | default) 13 | next="$(nvidia-smi -i "$i" --query-gpu=power.default_limit --format=csv,noheader,nounits)" 14 | ;; 15 | min) 16 | next="$(nvidia-smi -i "$i" --query-gpu=power.min_limit --format=csv,noheader,nounits)" 17 | ;; 18 | *) 19 | echo "Usage: $0 [max,default,min]" 20 | exit 1 21 | ;; 22 | esac 23 | nvidia-smi -i "$i" -pl "$next" 24 | done 25 | -------------------------------------------------------------------------------- /workloads/examples/k8s/ingress-nodeport.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Cluster ingress controller 3 | # An ingress controller routes external traffic to services 4 | # 5 | 6 | # Ingress controller 7 | controller: 8 | # Use host network to listen on ports 80 and 443 9 | hostNetwork: true 10 | # Service type LoadBalancer requires a load balancer to be configured, e.g. 11 | # MetalLB in an on-prem cluster. See metallb.yml for a sample definition. 12 | # NodePort can be used instead where we don't have a load balancer. 13 | service: 14 | type: NodePort 15 | # Always run on control-plane nodes 16 | nodeSelector: 17 | node-role.kubernetes.io/control-plane: "" 18 | 19 | # Ingress back-end 20 | defaultBackend: 21 | # Always run on control-plane nodes 22 | nodeSelector: 23 | node-role.kubernetes.io/control-plane: "" 24 | -------------------------------------------------------------------------------- /workloads/examples/slurm/mpi-hello/mpi-hello.c: -------------------------------------------------------------------------------- 1 | #include <mpi.h> 2 | #include <stdio.h> 3 | #include <unistd.h> 4 | 5 | int main(int argc, char **argv) { 6 | // Initialize MPI 7 | MPI_Init(&argc, &argv); 8 | 9 | // Get the number of processes in the global communicator 10 | int count; 11 | MPI_Comm_size(MPI_COMM_WORLD, &count); 12 | 13 | // Get the rank of the current process 14 | int rank; 15 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 16 | 17 | // Get the current hostname 18 | char hostname[1024]; 19 | gethostname(hostname, sizeof(hostname)); 20 | 21 | // Print a hello world message for this rank 22 | printf("Hello from process %d of %d on host %s\n", rank, count, hostname); 23 | 24 | // Finalize the MPI environment before exiting 25 | MPI_Finalize(); 26 | } 27 | -------------------------------------------------------------------------------- /roles/nvidia-gpu-tests/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Directories to download samples 3 | gpu_test_cuda_dir: "/tmp/cuda-samples" 4 | gpu_test_samples_dir: "{{ gpu_test_cuda_dir }}/Samples" 5 | 6 | # Whether to install the CUDA toolkit package 7 | # Set to no if the CUDA toolkit is installed 8 | gpu_test_install_toolkit: no 9 | 10 | # Whether or not to download and build cuda-samples 11 | # Set to no if 
they have already been built once 12 | gpu_test_build: yes 13 | 14 | # 1 -> quick test (seconds), 2 -> regular test (minute), 3 -> long test (minutes) 15 | gpu_test_dcgm_level: 3 16 | 17 | # Batch size to use for Tensorflow tests 18 | # DGX-2 -> (512, 50); DGX-1 -> (256,50); DGX-Station -> (128, 50) 19 | gpu_test_tf_batch_size: 128 # (choose from 32, 64, 128, 256, 512) 20 | gpu_test_tf_layers: 50 # (choose from 18, 34, 50, 101, 152) 21 | -------------------------------------------------------------------------------- /workloads/examples/k8s/gpu-usage/gpu-with-selector.yml: -------------------------------------------------------------------------------- 1 | # This yaml file will launch a container with a 32GB V100 GPU 2 | # Specifying the nvidia.com/gpu.product label and the nvidia.com/gpu resource type 3 | # restricts the type of GPU this container will run on 4 | # 5 | # Specifying both a nodeSelector and resource type supports clusters with multiple GPU types or MIG configurations 6 | # This is the preferred method of deployment 7 | apiVersion: v1 8 | kind: Pod 9 | metadata: 10 | name: gpu-pod 11 | spec: 12 | nodeSelector: 13 | nvidia.com/gpu.product: Tesla-V100-DGXS-32GB 14 | containers: 15 | - name: gpu-pod 16 | image: nvcr.io/nvidia/k8s/cuda-sample:nbody 17 | command: ["/bin/sh"] 18 | args: ["-c", "nvidia-smi"] 19 | resources: 20 | limits: 21 | nvidia.com/gpu: 1 22 | -------------------------------------------------------------------------------- /workloads/examples/k8s/ingress-loadbalancer.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Cluster ingress controller 3 | # An ingress controller routes external traffic to services 4 | # 5 | 6 | # Ingress controller 7 | controller: 8 | # Use host network to listen on ports 80 and 443 9 | hostNetwork: true 10 | # Service type LoadBalancer requires a load balancer to be configured, e.g. 11 | # MetalLB in an on-prem cluster. See metallb.yml for a sample definition. 12 | # NodePort can be used instead where we don't have a load balancer. 13 | service: 14 | type: LoadBalancer 15 | # Always run on control-plane nodes 16 | nodeSelector: 17 | node-role.kubernetes.io/control-plane: "" 18 | 19 | # Ingress back-end 20 | defaultBackend: 21 | # Always run on control-plane nodes 22 | nodeSelector: 23 | node-role.kubernetes.io/control-plane: "" 24 | -------------------------------------------------------------------------------- /roles/prometheus/templates/prometheus.yml.j2: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: {{ prometheus_cfg_scrape_interval }} # Default is every 1 minute. 3 | evaluation_interval: {{ prometheus_cfg_evaluation_interval }} # Default is every 1 minute. 4 | 5 | # Alertmanager configuration 6 | alerting: 7 | alertmanagers: 8 | - scheme: http 9 | static_configs: 10 | - targets: ['localhost:9093'] 11 | 12 | # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. 
13 | rule_files: 14 | - 'rules/alert_rules.yml' 15 | # - 'rules/second_rules.yml' 16 | 17 | scrape_configs: 18 | - job_name: 'cluster' 19 | file_sd_configs: 20 | - files: 21 | - {{ prometheus_cfg_endpoint_dir }}/*.yml 22 | - job_name: 'prometheus' 23 | static_configs: 24 | - targets: ['localhost:9090'] 25 | -------------------------------------------------------------------------------- /workloads/bit/hpl/syscfg-dgx2.sh: -------------------------------------------------------------------------------- 1 | GPU_AFFINITY="0:1:2:3:4:5:6:7:8:9:10:11:12:13:14:15" 2 | CPU_AFFINITY="0-2:3-5:6-8:9-11:12-14:15-17:18-20:21-23:24-26:27-29:30-32:33-35:36-38:39-41:42-44:45-47" 3 | CPU_CORES_PER_RANK=3 4 | MEM_AFFINITY="0:0:0:0:0:0:0:0:1:1:1:1:1:1:1:1" 5 | UCX_AFFINITY="mlx5_1:mlx5_1:mlx5_2:mlx5_2:mlx5_3:mlx5_3:mlx5_4:mlx5_4:mlx5_7:mlx5_7:mlx5_8:mlx5_8:mlx5_9:mlx5_9:mlx5_10:mlx5_10" 6 | GPU_CLOCK="877,1275" 7 | 8 | export MONITOR_GPU=1 9 | export TEST_SYSTEM_PARAMS=1 10 | export TEST_LOOPS=1 11 | export GPU_CLOCK_WARNING=$(echo ${GPU_CLOCK} | cut -f2 -d,) 12 | export GPU_POWER_WARNING=350 13 | export GPU_PCIE_GEN_WARNING=3 14 | export GPU_PCIE_WIDTH_WARNING=16 15 | 16 | ## Depending on driver version, you may need to uncomment the following line 17 | # export LD_LIBRARY_PATH="/usr/local/cuda/compat:$LD_LIBRARY_PATH" 18 | 19 | -------------------------------------------------------------------------------- /roles/nvidia_cuda/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: check if we are running on DGX 3 | stat: 4 | path: "/etc/dgx-release" 5 | register: is_dgx 6 | 7 | - name: DGX install tasks 8 | include_tasks: install-dgx.yml 9 | when: is_dgx.stat.exists == True 10 | 11 | - name: ubuntu install tasks 12 | include_tasks: install-ubuntu.yml 13 | when: (ansible_distribution == "Ubuntu") and (is_dgx.stat.exists == False) 14 | 15 | - name: redhat family install tasks 16 | include_tasks: install-redhat.yml 17 | when: (ansible_os_family == "RedHat") and (is_dgx.stat.exists == False) 18 | 19 | - name: add profile script to set environment for toolkit 20 | copy: 21 | src: "cuda-vars.sh" 22 | dest: "/etc/profile.d/cuda-vars.sh" 23 | owner: "root" 24 | group: "root" 25 | mode: "0644" 26 | when: cuda_toolkit_add_profile_script 27 | -------------------------------------------------------------------------------- /docs/slurm-cluster/slurm-prolog-epilog/prolog-ecc: -------------------------------------------------------------------------------- 1 | # Disable ECC if requested 2 | scontrol show job $SLURM_JOBID | grep Comment | grep -i ecc > /dev/null 3 | if [ $? -eq 0 ]; then 4 | logger -t PROLOG "Disabling ECC" 5 | nvidia-smi -e 0 6 | GPUCOUNT=`nvidia-smi -L | wc -l` 7 | GPUMAXINDEX=`expr $GPUCOUNT - 1` 8 | systemctl stop collectd 9 | logger -t PROLOG "Triggering GPU reset" 10 | for i in `seq 0 $GPUMAXINDEX`; do 11 | logger -t PROLOG "Resetting GPU $i" 12 | e=`nvidia-smi -r -i $i 2>&1` 13 | if [ $? -ne 0 ]; then 14 | logger -t PROLOG "WARNING!
GPU $i reset failed" 15 | logger -t PROLOG "GPU $i reset error: $e" 16 | nvidia-smi -e 1 17 | fi 18 | sleep 1 19 | done 20 | logger -t PROLOG "GPU reset done" 21 | systemctl start collectd 22 | fi 23 | -------------------------------------------------------------------------------- /playbooks/k8s-cluster/netapp-trident.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Playbook for deploying NetApp Trident 3 | 4 | - name: "Install NFS utils on worker nodes" 5 | hosts: kube-node 6 | become: true 7 | become_method: sudo 8 | tasks: 9 | - name: install nfs utils (Ubuntu) 10 | package: 11 | name: nfs-common 12 | when: ansible_os_family == "Debian" 13 | - name: install nfs utils (Red Hat / CentOS) 14 | package: 15 | name: nfs-utils 16 | when: ansible_os_family == "RedHat" 17 | 18 | - name: "Deploy NetApp Trident" 19 | hosts: kube-master 20 | become: true 21 | vars_files: 22 | - ../../config/group_vars/netapp-trident.yml 23 | environment: 24 | PATH: /usr/local/bin/:{{ ansible_env.PATH }} 25 | tasks: 26 | - name: Include netapp trident role 27 | run_once: true 28 | include_role: 29 | name: netapp-trident 30 | -------------------------------------------------------------------------------- /roles/autofs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: assert that variables are defined 3 | fail: 4 | msg: "Variable '{{ item }}' is not defined" 5 | when: item not in vars 6 | with_items: 7 | - autofs_mount 8 | - autofs_map 9 | 10 | - name: install packages 11 | package: name=autofs 12 | 13 | - name: configure /home 14 | template: 15 | src: templates/master.j2 16 | dest: /etc/auto.master 17 | owner: root 18 | group: root 19 | mode: 0644 20 | notify: "restart autofs" 21 | tags: 22 | - configuration 23 | 24 | - name: ensure mountpoint exists 25 | file: 26 | path: "{{ autofs_mount }}" 27 | state: directory 28 | owner: "root" 29 | group: "root" 30 | mode: "0755" 31 | when: autofs_mount is defined 32 | 33 | - name: make sure autofs is running 34 | service: name=autofs state=started enabled=yes 35 | -------------------------------------------------------------------------------- /roles/nvidia-dgx-firmware/tasks/get-data.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This is used to generate a spreadsheet mapping NICs/IPs/Hostnames to a cluster of DGX-2s 3 | - name: Run NVSM human-readable health show 4 | shell: "osversion=`cat /etc/dgx-release | grep 'DGX_SWBUILD_VERSION'`; host=`hostname`; copper_mac=`ip addr | grep -A 1 enp6s0 | grep link | awk '{print $2}'`; bmc_ip=`sudo ipmitool lan print | grep -i 'IP Address :' | awk '{print $4}'`; bmc_mac=`sudo ipmitool lan print | grep -i 'MAC Address' | awk '{print $4}'`;host_mac=`ifconfig {{ nv_mgmt_interface }} | grep 'ether' | awk '{print $2}'`;host_ip=`ifconfig {{ nv_mgmt_interface }} | grep 'inet ' | awk '{print $2}'`;data=\"${host_ip},${host_mac},${bmc_ip},${bmc_mac},${copper_mac},${host},${osversion}\";echo ${data}" 5 | register: command_output 6 | - debug: 7 | msg: "{{ command_output.stdout }}" 8 | ignore_errors: yes 9 | -------------------------------------------------------------------------------- /scripts/generic/install_docker.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Source common libraries and env variables 4 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 5 | 
ROOT_DIR="${SCRIPT_DIR}/../.." 6 | source ${ROOT_DIR}/scripts/common.sh 7 | 8 | DOCKER_COMPOSE_URL="${DOCKER_COMPOSE_URL:-https://github.com/docker/compose/releases/download/1.23.2/docker-compose-$(uname -s)-$(uname -m)}" 9 | 10 | type docker >/dev/null 2>&1 11 | if [ $? -ne 0 ] ; then 12 | get_docker=$(mktemp) 13 | curl -fsSL get.docker.com -o ${get_docker} 14 | sudo sh ${get_docker} 15 | sudo rm -f ${get_docker} 16 | sudo usermod -aG docker $(whoami) 17 | fi 18 | 19 | type docker-compose >/dev/null 2>&1 20 | if [ $? -ne 0 ] ; then 21 | sudo curl -L "${DOCKER_COMPOSE_URL}" -o /usr/local/bin/docker-compose 22 | sudo chmod +x /usr/local/bin/docker-compose 23 | fi 24 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-mpi-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source workloads/jenkins/scripts/jenkins-common.sh 3 | 4 | # Upload MPI source 5 | scp \ 6 | -o "StrictHostKeyChecking no" \ 7 | -o "UserKnownHostsFile /dev/null" \ 8 | -i "${HOME}/.ssh/id_rsa" \ 9 | workloads/examples/slurm/mpi-hello/mpi-hello.c \ 10 | "vagrant@10.0.0.5${GPU01}:mpi-hello.c" 11 | 12 | # Upload test script 13 | scp \ 14 | -o "StrictHostKeyChecking no" \ 15 | -o "UserKnownHostsFile /dev/null" \ 16 | -i "${HOME}/.ssh/id_rsa" \ 17 | workloads/jenkins/scripts/remote-script-for-mpi.sh \ 18 | "vagrant@10.0.0.5${GPU01}:remote-script-for-mpi.sh" \ 19 | 20 | # Compile the program 21 | ssh \ 22 | -o "StrictHostKeyChecking no" \ 23 | -o "UserKnownHostsFile /dev/null" \ 24 | -l vagrant \ 25 | -i "${HOME}/.ssh/id_rsa" \ 26 | "10.0.0.5${GPU01}" \ 27 | "bash -l /home/vagrant/remote-script-for-mpi.sh" 28 | -------------------------------------------------------------------------------- /roles/slurm/templates/etc/slurm/shared/bin/set_gpu_clocks.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | gpu_count="$(nvidia-smi -L | wc -l)" 5 | 6 | case "$1" in 7 | default) 8 | nvidia-smi -rac # Reset application clocks 9 | nvidia-smi -acp 0 # Reset application clock permissions 10 | nvidia-smi -c DEFAULT # Reset compute mode to default 11 | ;; 12 | max) 13 | for i in $(seq 0 "$(( gpu_count - 1 ))" ) ; do 14 | nextSM="$(nvidia-smi -i "$i" --query-gpu=clocks.max.sm --format=csv,noheader,nounits)" 15 | nextMEM="$(nvidia-smi -i "$i" --query-gpu=clocks.max.mem --format=csv,noheader,nounits)" 16 | nvidia-smi -i "${i}" -ac "${nextMEM}","${nextSM}" 17 | done 18 | ;; 19 | *) 20 | echo "Usage: $0 [default|max]" 21 | exit 1 22 | ;; 23 | esac 24 | -------------------------------------------------------------------------------- /src/repo/githooks/check-python.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Get a list of changed python scripts that are staged for commit. 4 | Run pylint on only those files.
5 | """ 6 | 7 | 8 | from __future__ import print_function 9 | import subprocess 10 | import re 11 | import sys 12 | 13 | 14 | def get_changed_paths(): 15 | git_diff = subprocess.check_output("git diff --name-only --cached".split(), universal_newlines=True) 16 | paths = [] 17 | for f in git_diff.split("\n"): 18 | # Collect staged Python files 19 | if re.match(r".*(\.py)$", f): 20 | paths.append(f) 21 | return paths 22 | 23 | 24 | def run_lint(paths): 25 | cmd = ["pylint", "-rn", "-sn", "-d", "R,C"] + paths 26 | return subprocess.call(cmd) 27 | 28 | 29 | if __name__ == "__main__": 30 | changed = get_changed_paths() 31 | if len(changed) > 0: 32 | sys.exit(run_lint(changed)) 33 | -------------------------------------------------------------------------------- /src/repo/githooks/check-shell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Get a list of changed bash scripts that are staged for commit. 4 | Run shellcheck on only those files. 5 | """ 6 | 7 | 8 | from __future__ import print_function 9 | import subprocess 10 | import re 11 | import sys 12 | 13 | 14 | def get_changed_shell_paths(): 15 | git_diff = subprocess.check_output("git diff --name-only --cached".split(), universal_newlines=True) 16 | paths = [] 17 | for f in git_diff.split("\n"): 18 | # Collect staged shell scripts 19 | if re.match(r".*(\.sh|\.bash)$", f): 20 | paths.append(f) 21 | return paths 22 | 23 | 24 | def run_lint(paths): 25 | cmd = ["shellcheck", "-x"] + paths 26 | return subprocess.call(cmd) 27 | 28 | 29 | if __name__ == "__main__": 30 | changed = get_changed_shell_paths() 31 | if len(changed) > 0: 32 | sys.exit(run_lint(changed)) 33 | -------------------------------------------------------------------------------- /ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | collections_paths = ./collections 3 | roles_path = ./roles/galaxy:./roles:./submodules/kubespray/roles 4 | library = ./submodules/kubespray/library 5 | inventory = ./config/inventory 6 | host_key_checking = False 7 | gathering = smart 8 | fact_caching = jsonfile 9 | fact_caching_connection = /var/tmp/ansible_cache 10 | fact_caching_timeout = 86400 11 | deprecation_warnings = False 12 | #vault_password_file = ./config/.vault-pass 13 | timeout=60 14 | stdout_callback = yaml 15 | bin_ansible_callbacks = True 16 | local_tmp=/tmp 17 | remote_tmp=/tmp 18 | forks = 25 19 | force_valid_group_names = ignore 20 | ansible_python_interpreter = /usr/bin/python3 21 | 22 | [ssh_connection] 23 | pipelining = True 24 | ssh_args = -o ControlMaster=auto -o ControlPersist=5m -o ConnectionAttempts=100 -o UserKnownHostsFile=/dev/null 25 | control_path = ~/.ssh/ansible-%%r@%%h:%%p 26 | -------------------------------------------------------------------------------- /playbooks/nvidia-software/nvidia-driver.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ hostlist | default('all') }}" 3 | become: true 4 | tags: 5 | - nvidia 6 | - nvidia_driver 7 | tasks: 8 | - name: Check for DGX packages 9 | stat: 10 | path: /etc/dgx-release 11 | register: is_dgx 12 | 13 | - name: install custom facts 14 | include_role: 15 | name: facts 16 | 17 | - name: install nvidia driver 18 | include_role: 19 | name: nvidia.nvidia_driver 20 | when: (ansible_local['gpus']['count'] and is_dgx.stat.exists == False) or (nvidia_driver_force_install|default(false)) 21 | 22 | - name: test nvidia-smi 23 | command: nvidia-smi 24 | changed_when: false 25 | when: 26 | -
ansible_local['gpus']['count'] 27 | - is_dgx.stat.exists == False 28 | environment: "{{proxy_env if proxy_env is defined else{}}}" 29 | -------------------------------------------------------------------------------- /.github/workflows/ansible-lint-roles.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: run ansible-lint on deepops roles 3 | on: 4 | - push 5 | - pull_request 6 | jobs: 7 | lint: 8 | runs-on: ubuntu-20.04 9 | steps: 10 | 11 | - name: check out repo 12 | uses: actions/checkout@v2 13 | with: 14 | path: "${{ github.repository }}" 15 | 16 | - name: set up python 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: "3.9" 20 | 21 | - name: install dependencies 22 | run: | 23 | python3 -m pip install --upgrade pip 24 | python3 -m pip install ansible-lint==5.4.0 ansible==4.8.0 25 | 26 | - name: run lint script 27 | env: 28 | ANSIBLE_LINT_EXCLUDE: "nvidia-dgx|nvidia-gpu-tests" 29 | run: | 30 | cd "${{ github.repository }}" 31 | bash ./scripts/deepops/ansible-lint-roles.sh 32 | -------------------------------------------------------------------------------- /roles/nfs-client-provisioner/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # See the GitHub code repo: https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner 3 | 4 | - name: install nfs-client-provisioner helm repo 5 | command: /usr/local/bin/helm repo add --force-update "{{ k8s_nfs_client_repo_name }}" "{{ k8s_nfs_client_helm_repo }}" 6 | changed_when: false 7 | 8 | - name: update helm repos 9 | command: /usr/local/bin/helm repo update 10 | changed_when: false 11 | 12 | - name: install nfs-client-provisioner 13 | command: /usr/local/bin/helm upgrade --install "{{ k8s_nfs_client_release_name }}" "{{ k8s_nfs_client_chart_name }}" --create-namespace --namespace deepops-nfs-client-provisioner --version "{{ k8s_nfs_client_chart_version }}" --set nfs.server="{{ k8s_nfs_server }}" --set nfs.path="{{ k8s_nfs_export_path }}" --set storageClass.defaultClass="{{ k8s_nfs_default_sc }}" --wait 14 | changed_when: false 15 | -------------------------------------------------------------------------------- /workloads/jenkins/scripts/test-slurm-nfs-mount.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source workloads/jenkins/scripts/jenkins-common.sh 5 | 6 | # showmount path is different between centos and ubuntu 7 | if [ "${DEEPOPS_VAGRANT_OS}" == "centos" ]; then 8 | ssh -v \ 9 | -o "StrictHostKeyChecking no" \ 10 | -o "UserKnownHostsFile /dev/null" \ 11 | -l vagrant \ 12 | -i "${HOME}/.ssh/id_rsa" \ 13 | "10.0.0.5${GPU01}" \ 14 | "/usr/sbin/showmount -e | grep home" 15 | else 16 | ssh -v \ 17 | -o "StrictHostKeyChecking no" \ 18 | -o "UserKnownHostsFile /dev/null" \ 19 | -l vagrant \ 20 | -i "${HOME}/.ssh/id_rsa" \ 21 | "10.0.0.5${GPU01}" \ 22 | "showmount -e | grep home" 23 | fi 24 | 25 | 26 | ssh -v \ 27 | -o "StrictHostKeyChecking no" \ 28 | -o "UserKnownHostsFile /dev/null" \ 29 | -l vagrant \ 30 | -i "${HOME}/.ssh/id_rsa" \ 31 | "10.0.0.6${GPU01}" \ 32 | "mount | grep nfs | grep home" 33 | --------------------------------------------------------------------------------
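A minimal usage sketch for two of the examples above, assuming the commands run from the repository root on a deployed cluster: a Slurm login node with an MPI compiler wrapper (mpicc) and Slurm's PMI-based MPI launch configured, plus a kubectl context for the Kubernetes cluster and a node carrying the matching nvidia.com/gpu.product label. The exact launch flags depend on the site's MPI and Slurm configuration.

# Compile and run the MPI hello-world example with 4 ranks
mpicc workloads/examples/slurm/mpi-hello/mpi-hello.c -o mpi-hello
srun --ntasks=4 ./mpi-hello

# Launch the GPU pod example and check that it sees a GPU
kubectl apply -f workloads/examples/k8s/gpu-usage/gpu-with-selector.yml
kubectl logs gpu-pod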