├── .gitignore ├── LICENSE ├── README.md ├── autoscaling ├── .DS_Store ├── clusters │ └── README ├── credentials │ └── key.sh ├── crontab │ └── autoscale_slurm.sh ├── provider_inst_prin.tpl ├── provider_user.tpl └── tf_init │ ├── cluster-network-configuration.tf │ ├── cluster-network.tf │ ├── compute-cluster.tf │ ├── compute-nodes.tf │ ├── config.controller │ ├── config.hpc │ ├── controller_update.tf │ ├── data.tf │ ├── instance-pool-configuration.tf │ ├── instance-pool.tf │ ├── inventory.tpl │ ├── locals.tf │ ├── marketplace.tf │ ├── network.tf │ ├── outputs.tf │ ├── user_data.tf │ └── versions.tf ├── bin ├── cleanup.sh ├── configure.sh ├── configure_as.sh ├── controller.sh ├── create_cluster.sh ├── delete_cluster.sh ├── find_reachable_hosts.sh ├── initial_monitoring.sh ├── rdma_metrics_collection_config.conf ├── remove_nodes_prompt.txt ├── resize.sh ├── resize │ ├── ociobj.py │ ├── reconfigure.py │ ├── resize.py │ └── utils.py ├── slurm_config.sh ├── upload_rdma_nic_metrics.sh └── wait_for_hosts.sh ├── cluster-network-configuration.tf ├── cluster-network.tf ├── compute-cluster.tf ├── compute-nodes.tf ├── conf ├── queues.conf.example └── variables.tpl ├── config.controller ├── config.hpc ├── configure.tpl ├── controller.tf ├── data.tf ├── fss.tf ├── initial_mon.tpl ├── instance-pool-configuration.tf ├── instance-pool.tf ├── inventory.tpl ├── locals.tf ├── login.tf ├── logs └── README ├── marketplace.tf ├── monitoring.tf ├── mysql.tf ├── network.tf ├── oci_images.tf ├── outputs.tf ├── playbooks ├── destroy.yml ├── group_vars │ └── all.yml ├── monitoring.yml ├── new_nodes.yml ├── resize_add.yml ├── resize_remove.yml ├── resize_remove_unreachable.yml ├── roles │ ├── autoscaling_mon │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ ├── dashboard.json │ │ │ ├── initial.sh │ │ │ ├── initial.sql │ │ │ ├── monitor_oci.sh │ │ │ └── monitor_slurm.sh │ │ ├── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ │ └── templates │ │ │ ├── env.j2 │ │ │ └── mysql_service_initial.j2 │ ├── boot-volume │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── cloud-agent_update │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── cluster-cli │ │ ├── files │ │ │ └── cluster │ │ └── tasks │ │ │ ├── debian.yml │ │ │ ├── el7.yml │ │ │ ├── el8.yml │ │ │ └── main.yml │ ├── cron │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── destroy_unreachable │ │ └── tasks │ │ │ ├── common.yml │ │ │ ├── main.yml │ │ │ ├── slurm-rack-aware.yml │ │ │ └── slurm.yml │ ├── docker │ │ ├── tasks │ │ │ ├── main.yml │ │ │ ├── oraclelinux.yml │ │ │ └── ubuntu.yml │ │ └── templates │ │ │ └── templates.j2 │ ├── etc-hosts │ │ ├── tasks │ │ │ ├── common.yml │ │ │ └── main.yml │ │ └── templates │ │ │ ├── etc-hosts-controller.j2 │ │ │ └── etc-hosts.j2 │ ├── firewall │ │ ├── files │ │ │ └── off-iptables.sh │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── fix_broken │ │ ├── README.md │ │ └── tasks │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── fix_ldap │ │ └── tasks │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── fss-home │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── grafana │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ ├── alert-rules.yaml │ │ │ ├── cluster.json │ │ │ ├── cluster_amd.json │ │ │ ├── cluster_prometheus.json │ │ │ ├── cluster_prometheus_v2.json │ │ │ ├── g.libsonnet │ │ │ ├── main.jsonnet │ │ │ ├── node_exporter.json │ │ │ ├── raw_message_template.txt │ │ │ ├── 
rdma-hw-counters.json │ │ │ └── variables.libsonnet │ │ ├── tasks │ │ │ ├── dashboard.yml │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ │ └── templates │ │ │ ├── ons-webhook.service.j2 │ │ │ └── ons_webhook.py.j2 │ ├── healthchecks │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ ├── check_gpu_setup.py │ │ │ ├── gpu_bw_test.py │ │ │ ├── meshpinger_readme.md │ │ │ ├── rdma_link_flapping.py │ │ │ ├── run_meshpinger.sh │ │ │ ├── shared_logging.py │ │ │ └── xid_checker.py │ │ └── tasks │ │ │ └── main.yml │ ├── home_nfs │ │ └── tasks │ │ │ ├── el.yml │ │ │ └── main.yml │ ├── hostname │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── hyperthreading │ │ ├── files │ │ │ ├── control_hyperthreading.sh │ │ │ ├── control_hyperthreading_ubuntu.sh │ │ │ ├── disable-hyperthreading.service │ │ │ └── disable-hyperthreading_ubuntu.service │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── influxdb │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── config_influxdb.yml │ │ │ ├── el.yml │ │ │ ├── el_install_influxdb.yml │ │ │ ├── main.yml │ │ │ ├── ubuntu.yml │ │ │ └── ubuntu_install_influxdb.yml │ │ └── templates │ │ │ └── influxdb.conf.j2 │ ├── iscsi │ │ └── tasks │ │ │ ├── debian.yml │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── latency_check │ │ ├── tasks │ │ │ ├── el.yml │ │ │ └── main.yml │ │ └── templates │ │ │ └── latency_check.sh.j2 │ ├── limits │ │ ├── tasks │ │ │ ├── common.yml │ │ │ └── main.yml │ │ └── templates │ │ │ └── limits.j2 │ ├── localdisk │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── common.yml │ │ │ ├── debian.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── metrics-exporter │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ ├── critical_process_monitor.py │ │ │ ├── custom_metric_common.py │ │ │ ├── dcgm-counters.csv │ │ │ ├── node_exporter.service │ │ │ ├── nvlink_counters_exporter.py │ │ │ ├── rdma_counters_exporter.py │ │ │ ├── rdma_link_flapping.py │ │ │ ├── shared_logging.py │ │ │ └── xid_checker.py │ │ ├── tasks │ │ │ ├── critical_process_monitor.yml │ │ │ ├── custom_metrics.yml │ │ │ ├── dcgm_exporter.yml │ │ │ ├── main.yml │ │ │ ├── node_exporter_el.yml │ │ │ ├── node_exporter_ubuntu.yml │ │ │ ├── nvlink_exporter.yml │ │ │ └── rdma_exporter.yml │ │ ├── templates │ │ │ ├── critical-process-monitor.service.j2 │ │ │ ├── customMetrics.service.j2 │ │ │ ├── custom_metrics.py.j2 │ │ │ ├── dcgm-exporter.service.j2 │ │ │ ├── nvlink-exporter.service.j2 │ │ │ ├── prometheus.yml.j2 │ │ │ └── rdma-exporter.service.j2 │ │ └── vars │ │ │ └── main.yml │ ├── mpi-hostfiles │ │ ├── tasks │ │ │ ├── common.yml │ │ │ └── main.yml │ │ └── templates │ │ │ ├── hostfile_rdma.j2 │ │ │ └── hostfile_tcp.j2 │ ├── mpivars │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ │ ├── templates │ │ │ └── mpivars.j2 │ │ └── vars │ │ │ └── main.yml │ ├── mysql │ │ ├── files │ │ │ └── innodb.cnf │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── centos.yml │ │ │ ├── debian.yml │ │ │ ├── el.yml │ │ │ └── main.yml │ │ ├── templates │ │ │ ├── debian_mariadb_etc_my.cnf.j2 │ │ │ ├── debian_my.cnf.j2 │ │ │ ├── mariadb_etc_my.cnf.j2 │ │ │ ├── my.cnf.j2 │ │ │ └── mysqld_etc_my.cnf.j2 │ │ └── vars │ │ │ ├── ol_vars.yml │ │ │ ├── ubuntu-2204_vars.yml │ │ │ └── ubuntu_vars.yml │ ├── nccl-conf │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ ├── a100_b4.8 │ │ │ ├── bm.gpu4.8 │ │ │ ├── h100 │ │ │ └── h200 │ ├── nfs-client │ 
│ ├── defaults │ │ │ └── main.yml │ │ ├── meta │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── debian.yml │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ │ └── vars │ │ │ └── main.yml │ ├── nfs-server │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ ├── meta │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── debian.yml │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ │ ├── templates │ │ │ └── exports.j2 │ │ └── vars │ │ │ └── main.yml │ ├── no_instance_principal │ │ ├── defaults │ │ │ └── main.yml │ │ ├── meta │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── common.yml │ │ │ └── main.yml │ │ ├── templates │ │ │ └── config.j2 │ │ └── vars │ │ │ └── main.yml │ ├── nvidia-container │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── main.yml │ │ │ ├── oraclelinux-7.yml │ │ │ └── ubuntu.yml │ │ ├── templates │ │ │ └── templates.j2 │ │ └── vars │ │ │ └── main.yml │ ├── nvidia-enroot │ │ └── tasks │ │ │ ├── main.yml │ │ │ ├── oraclelinux.yml │ │ │ └── ubuntu.yml │ ├── nvidia_peermem │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── common.yml │ │ │ └── main.yml │ │ └── vars │ │ │ └── main.yml │ ├── oci-cloud-agent-updater │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── oci-cloud-agent │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── oci-cn-auth │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── oci-hostname │ │ └── tasks │ │ │ ├── el.yml │ │ │ └── main.yml │ ├── oci-legacy │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── oom-adjust │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ └── oom-adjust.conf │ │ └── tasks │ │ │ └── main.yml │ ├── openldap │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ ├── autoinc.ldif │ │ │ ├── debian_memberof.ldif │ │ │ ├── debian_ppolicy.ldif │ │ │ ├── el_memberof.ldif │ │ │ ├── el_memberof_ol8.ldif │ │ │ ├── el_ppolicy.ldif │ │ │ ├── el_ppolicy_ol8.ldif │ │ │ ├── local_schema.ldif │ │ │ ├── rfc2307bis.ldif │ │ │ └── slapd │ │ ├── handlers │ │ │ └── main.yml │ │ ├── meta │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── debian.yml │ │ │ ├── el.yml │ │ │ └── main.yml │ │ ├── templates │ │ │ ├── config.ldif.j2 │ │ │ ├── debian_local.ldif.j2 │ │ │ ├── el_local.ldif.j2 │ │ │ └── el_local_ol8.ldif.j2 │ │ └── vars │ │ │ ├── debian_vars.yml │ │ │ └── el_vars.yml │ ├── packages │ │ └── tasks │ │ │ ├── centos-7.yml │ │ │ ├── debian.yml │ │ │ ├── el-7.yml │ │ │ ├── main.yml │ │ │ ├── ol-7.yml │ │ │ ├── ol-8.yml │ │ │ ├── ubuntu-2204.yml │ │ │ └── ubuntu.yml │ ├── passwords │ │ └── tasks │ │ │ └── main.yml │ ├── privilege_group │ │ └── tasks │ │ │ ├── common.yml │ │ │ ├── el.yml │ │ │ └── main.yml │ ├── prometheus │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── gather_info.yml │ │ │ └── main.yml │ │ └── templates │ │ │ ├── prometheus.conf.j2 │ │ │ ├── prometheus.service.j2 │ │ │ └── prometheus.yml.j2 │ ├── rack-aware │ │ ├── files │ │ │ └── node_ordering_by_rack.py │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── rdma-interface │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── debian.yml │ │ │ ├── el.yml │ │ │ └── main.yml │ │ ├── templates │ │ │ ├── ifcfg.j2 │ │ │ └── interface.j2 │ │ └── vars │ │ │ └── main.yml │ ├── rttcc │ │ └── tasks │ │ │ └── main.yml │ ├── safe_yum │ │ └── tasks │ │ │ ├── el.yml │ │ │ ├── main.yml │ │ │ └── ubuntu.yml │ ├── slurm │ │ ├── defaults │ │ │ └── main.yml │ │ ├── 
files │ │ │ ├── cgroup.conf │ │ │ ├── healthchecks.sh │ │ │ └── sshd │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── backup_server.yml │ │ │ ├── cleanup.yml │ │ │ ├── common.yml │ │ │ ├── common_pmix.yml │ │ │ ├── common_pyxis.yml │ │ │ ├── compute-rack-aware.yml │ │ │ ├── compute.yml │ │ │ ├── compute_pam.yml │ │ │ ├── controller.yml │ │ │ ├── destroy-rack-aware.yml │ │ │ ├── destroy.yml │ │ │ ├── download.yml │ │ │ ├── el7.yml │ │ │ ├── el8.yml │ │ │ ├── login.yml │ │ │ ├── main.yml │ │ │ ├── move-topology.yml │ │ │ ├── server.yml │ │ │ └── ubuntu.yml │ │ ├── templates │ │ │ ├── gres.conf.j2 │ │ │ ├── pyxis.sh.j2 │ │ │ ├── slurm.conf.j2 │ │ │ ├── slurmdbd.conf.j2 │ │ │ ├── systemd │ │ │ │ ├── munge.service.d │ │ │ │ │ └── unit.conf.j2 │ │ │ │ ├── sackd.service.d │ │ │ │ │ └── unit.conf.j2 │ │ │ │ ├── sackd.service.j2 │ │ │ │ ├── slurm_env.j2 │ │ │ │ ├── slurmctld.service.d │ │ │ │ │ └── unit.conf.j2 │ │ │ │ ├── slurmctld.service.j2 │ │ │ │ ├── slurmctld_backup.service.d │ │ │ │ │ └── unit.conf.j2 │ │ │ │ ├── slurmd.service.d │ │ │ │ │ └── unit.conf.j2 │ │ │ │ ├── slurmd.service.j2 │ │ │ │ ├── slurmdbd.service.d │ │ │ │ │ └── unit.conf.j2 │ │ │ │ └── slurmdbd.service.j2 │ │ │ └── topology.conf.j2 │ │ └── vars │ │ │ ├── centos_vars.yml │ │ │ ├── el_vars.yml │ │ │ └── ubuntu_vars.yml │ ├── spack │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── debian.yml │ │ │ ├── el.yml │ │ │ └── main.yml │ │ └── templates │ │ │ └── spack.j2 │ ├── ssh │ │ ├── files │ │ │ └── ssh_config │ │ └── tasks │ │ │ ├── common.yml │ │ │ └── main.yml │ ├── ssl │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── debian.yml │ │ │ ├── el.yml │ │ │ └── main.yml │ │ └── templates │ │ │ └── san.conf.j2 │ ├── sssd │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── debian.yml │ │ │ ├── el-7.yml │ │ │ ├── el-8.yml │ │ │ └── main.yml │ │ ├── templates │ │ │ ├── sssd.conf.j2 │ │ │ └── sssd_ubuntu.conf.j2 │ │ └── vars │ │ │ └── main.yml │ ├── telegraf │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ ├── telegraf.conf │ │ │ ├── telegraf_amd_gpu.conf │ │ │ └── telegraf_nvidia_gpu.conf │ │ ├── handlers │ │ │ └── main.yml │ │ ├── meta │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── common.yml │ │ │ └── main.yml │ │ └── templates │ │ │ ├── amd_gpu.conf.j2 │ │ │ ├── custom.cnf.j2 │ │ │ ├── ethtool_counters.conf.j2 │ │ │ ├── infiniband.conf.j2 │ │ │ ├── infiniband_mlx5_0_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_10_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_11_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_12_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_13_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_14_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_15_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_16_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_1_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_2_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_3_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_4_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_5_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_6_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_7_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_8_hw_counters.conf.j2 │ │ │ ├── infiniband_mlx5_9_hw_counters.conf.j2 │ │ │ ├── influxdb.conf.j2 │ │ │ ├── net.conf.j2 │ │ │ ├── nvidia_gpu.conf.j2 │ │ │ └── prometheus.conf.j2 │ ├── tuned │ │ ├── files │ │ │ └── tuned.conf │ │ └── tasks │ │ │ ├── el-7.yml │ │ │ └── main.yml │ ├── weka_client │ │ └── tasks │ │ │ └── main.yml │ └── yaml │ │ └── tasks │ │ ├── el.yml │ │ ├── main.yml │ │ └── ubuntu.yml ├── site.yml ├── 
slurm_config.yml └── slurm_config_as.yml ├── provider.tf ├── queues.conf ├── samples ├── NCCL_readme ├── disable_ht.sh ├── gpu │ ├── H100-topology-kubernetes.xml │ ├── H100-topology.xml │ ├── ifdown.sh │ ├── ifup.sh │ ├── nccl_run_allreduce.sbatch │ ├── nccl_run_allreduce.sh │ ├── nccl_run_allreduce_H100_200.sbatch │ ├── nccl_run_allreduce_H100_200.sh │ ├── nccl_run_allreduce_containers_H100_H200.sbatch │ ├── nccl_run_allreduce_containers_with_ordering.sbatch │ ├── nccl_run_allreduce_srun.sbatch │ ├── nccl_run_allreduce_srun.sh │ ├── nccl_run_allreduce_tuner.sbatch │ ├── nccl_run_allreduce_tuner.sh │ ├── nccl_run_alltoall.sh │ ├── no_ncclparam_nccl_run_allreduce.sbatch │ ├── no_ncclparam_nccl_run_allreduce.sh │ ├── no_ncclparam_nccl_run_allreduce_H100_H200.sbatch │ ├── no_ncclparam_nccl_run_allreduce_H100_H200.sh │ ├── notes.txt │ ├── ping.sh │ ├── qfabv1_nccl_run_allreduce.sbatch │ ├── qfabv1_nccl_run_allreduce.sh │ ├── qfabv1_nccl_run_alltoall.sh │ ├── rccl_run_allreduce.sbatch │ ├── srun_examples_with_container.txt │ ├── topo-flattened-b4.xml │ ├── topo-flattened.xml │ ├── update_arp_settings.sh │ └── update_netmask.sh ├── nccl_compile │ └── compile.sh ├── nfs │ ├── README.txt │ └── fix_nfs.sh ├── open-ldap │ ├── add-ldap-users.yml │ └── add-linux-group.yml ├── prep_sample_files.sh ├── rdma-tuning │ ├── check_pcie_max_read.sh │ ├── pcie_max_read.sh │ ├── pcie_max_read_default.sh │ ├── rdma-nic-tuning-1.sh │ ├── rdma-nic-tuning-2.sh │ └── rdma-nic-tuning-validate.sh └── submit │ ├── sleep.sbatch │ └── sleep_gpu.sbatch ├── schema.yaml ├── scripts ├── check_firmware_version.sh ├── collect_logs.py ├── collect_metadata │ ├── collect_metadata.py │ └── requirements.txt ├── create_instance_config.py ├── gpu_throttle.sh ├── h100_script.py ├── ib_write_bw.sh ├── ib_write_lat.sh ├── max_nodes_partition.py ├── ncclscout.py ├── pcie.sh ├── runNCCL_on_hostPair.sh └── validation.py ├── slurm_ha.tf ├── user_data.tf ├── variables.tf └── versions.tf /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /autoscaling/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/autoscaling/.DS_Store -------------------------------------------------------------------------------- /autoscaling/clusters/README: -------------------------------------------------------------------------------- 1 | Each terraform configuration will be located in this folder. This will be used to destroy the cluster; do not remove live clusters manually.
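A minimal shell sketch of what the README above implies: each cluster gets its own Terraform working directory under the clusters folder, and tear-down reuses that directory. The /opt/oci-hpc prefix and the cluster name below are assumptions for illustration; the supported path is presumably the stack's own bin/delete_cluster.sh rather than a manual run.

# Illustration only; paths and cluster name are placeholders, not taken from the repository.
cd /opt/oci-hpc/autoscaling/clusters/<cluster_name>   # per-cluster Terraform configuration and state
terraform destroy -auto-approve                       # removes only this cluster's resources, which is why live clusters must not be deleted by hand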
-------------------------------------------------------------------------------- /autoscaling/credentials/key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sed 's/-----END RSA PRIVATE KEY-----//' $1 | sed 's/ /\n/4g' > $2 4 | echo -----END RSA PRIVATE KEY----- >> $2 5 | chmod 600 $2 -------------------------------------------------------------------------------- /autoscaling/provider_inst_prin.tpl: -------------------------------------------------------------------------------- 1 | provider "oci" { 2 | auth = "InstancePrincipal" 3 | tenancy_ocid = var.tenancy_ocid 4 | #user_ocid = "${api_user_ocid}" 5 | #fingerprint = "${api_fingerprint}" 6 | #private_key_path = "${private_key_path}" 7 | region = var.region 8 | } 9 | -------------------------------------------------------------------------------- /autoscaling/provider_user.tpl: -------------------------------------------------------------------------------- 1 | provider "oci" { 2 | tenancy_ocid = var.tenancy_ocid 3 | user_ocid = "${api_user_ocid}" 4 | fingerprint = "${api_fingerprint}" 5 | private_key_path = "${private_key_path}" 6 | region = var.region 7 | } 8 | -------------------------------------------------------------------------------- /autoscaling/tf_init/compute-cluster.tf: -------------------------------------------------------------------------------- 1 | resource "oci_core_compute_cluster" "compute_cluster" { 2 | count = var.compute_cluster && var.cluster_network && var.node_count > 0 ? 1 : 0 3 | #Required 4 | availability_domain = var.ad 5 | compartment_id = var.targetCompartment 6 | 7 | #Optional 8 | display_name = local.cluster_name 9 | freeform_tags = { 10 | "user" = var.tags 11 | "cluster_name" = local.cluster_name 12 | "parent_cluster" = local.cluster_name 13 | } 14 | } -------------------------------------------------------------------------------- /autoscaling/tf_init/config.controller: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | -------------------------------------------------------------------------------- /autoscaling/tf_init/config.hpc: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | -------------------------------------------------------------------------------- /autoscaling/tf_init/outputs.tf: -------------------------------------------------------------------------------- 1 | output "private_ips" { 2 | value = join(",", local.cluster_instances_ips) 3 | } 4 | output "hostnames" { 5 | value = join(",", local.cluster_instances_names) 6 | } 7 | output "ocids" { 8 | value = join(",", local.cluster_instances_ids) 9 | } 10 | output "cluster_ocid" { 11 | value = var.compute_cluster ? oci_core_compute_cluster.compute_cluster[0].id : var.cluster_network ? 
oci_core_cluster_network.cluster_network[0].id : oci_core_instance_pool.instance_pool[0].id 12 | } 13 | -------------------------------------------------------------------------------- /autoscaling/tf_init/user_data.tf: -------------------------------------------------------------------------------- 1 | data "template_file" "config" { 2 | template = file("config.hpc") 3 | } 4 | 5 | 6 | -------------------------------------------------------------------------------- /autoscaling/tf_init/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.2" 3 | required_providers { 4 | oci = { 5 | source = "oracle/oci" 6 | version = "6.9.0" 7 | } 8 | } 9 | } -------------------------------------------------------------------------------- /bin/cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Cluster destroy script 4 | scripts=`realpath $0` 5 | folder=`dirname $scripts` 6 | playbooks_path=$folder/../playbooks/ 7 | inventory_path=$folder/../autoscaling/clusters/$1 8 | 9 | if [ $EUID -eq 0 ] 10 | then 11 | echo "Run this script as opc or ubuntu and not as root" 12 | exit 13 | fi 14 | 15 | ssh_options="-i ~/.ssh/id_rsa -o StrictHostKeyChecking=no" 16 | iplist=`cat $inventory_path/inventory | awk '{print $2}' | sed 's/ansible_host=//'` 17 | if [[ "$2" == "FORCE" ]] 18 | then 19 | echo Force Deletion 20 | ANSIBLE_HOST_KEY_CHECKING=False timeout 2m ansible-playbook $playbooks_path/destroy.yml -i $inventory_path/inventory -e "force=yes" 21 | status_cleanup=$? 22 | if [ $status_cleanup -ne 0 ] 23 | then 24 | /opt/oci-hpc/bin/resize.py remove_unreachable --nodes $iplist 25 | status_cleanup=$? 26 | fi 27 | exit $status_cleanup 28 | else 29 | ANSIBLE_HOST_KEY_CHECKING=False timeout 2m ansible-playbook $playbooks_path/destroy.yml -i $inventory_path/inventory -e "force=no" 30 | status_cleanup=$? 31 | exit $status_cleanup 32 | fi -------------------------------------------------------------------------------- /bin/configure_as.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Cluster init configuration script 4 | # 5 | 6 | # 7 | # wait for cloud-init completion on the controller host 8 | # 9 | 10 | scripts=`realpath $0` 11 | folder=`dirname $scripts` 12 | execution=1 13 | playbooks_path=$folder/../playbooks/ 14 | inventory_path=$folder/../autoscaling/clusters/$1 15 | 16 | 17 | username=`cat $inventory_path/inventory | grep compute_username= | tail -n 1| awk -F "=" '{print $2}'` 18 | if [ "$username" == "" ] 19 | then 20 | username=$USER 21 | fi 22 | 23 | /opt/oci-hpc/bin/wait_for_hosts.sh $inventory_path/hosts_$1 $username 24 | # 25 | # Ansible will take care of key exchange and learning the host fingerprints, but for the first time we need 26 | # to disable host key checking. 27 | # 28 | 29 | if [[ $execution -eq 1 ]] ; then 30 | ANSIBLE_HOST_KEY_CHECKING=False ansible all -m setup --tree /tmp/ansible > /dev/null 2>&1 31 | ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook $playbooks_path/new_nodes.yml -i $inventory_path/inventory 32 | else 33 | 34 | cat <<- EOF > /tmp/motd 35 | At least one of the cluster nodes has been inaccessible during installation.
Please validate the hosts and re-run: 36 | ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook $playbooks_path/new_nodes.yml -i $inventory_path/inventory 37 | EOF 38 | exit 1 39 | fi 40 | -------------------------------------------------------------------------------- /bin/find_reachable_hosts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # A little waiter function to make sure all the nodes are up before we start configure 5 | # 6 | 7 | echo "Checking For SSH" 8 | 9 | ssh_options="-i ~/.ssh/cluster.key -o StrictHostKeyChecking=no" 10 | rm $2 11 | touch $2 12 | 13 | for host in $(cat $1) ; do 14 | r=0 15 | echo "validating connection to: ${host}" 16 | if [[ `ssh ${ssh_options} -o ConnectTimeout=15 $3@${host} uptime | grep load | wc -l` > 0 ]] ; 17 | then 18 | echo ${host} >> $2 19 | fi 20 | done -------------------------------------------------------------------------------- /bin/rdma_metrics_collection_config.conf: -------------------------------------------------------------------------------- 1 | hoursAgoFromNow=24 2 | metricsCollectionIntervalInMinute=5 3 | parFileName=/opt/oci-hpc/PAR_file_for_metrics -------------------------------------------------------------------------------- /bin/remove_nodes_prompt.txt: -------------------------------------------------------------------------------- 1 | Does your cluster run any file system like Ceph, NFS, etc. on the GPU/HPC nodes themselves using local NVMe SSDs? 2 | If yes, terminating nodes which store your data can result in permanent data loss, so before proceeding make sure any important data is copied to a persistent file system outside of the cluster such as to object storage, file storage, etc. 3 | Once data is backed up or migrated, come back and run the script. Select 2 to exit. 4 | Remember, once the nodes are terminated, all the data is lost forever and you won't be able to recover it.
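A minimal sketch of the backup step the prompt above asks for, assuming a placeholder node-local mount (/mnt/localdisk), a placeholder persistent mount (/nfs/cluster), and a hostfile listing the nodes about to be terminated; none of these names are taken from the repository.

# Illustration only: stage node-local NVMe data onto persistent storage before resizing down.
for host in $(cat hostfile); do
  ssh "$host" 'mkdir -p /nfs/cluster/backup/$(hostname) && rsync -a /mnt/localdisk/ /nfs/cluster/backup/$(hostname)/'
done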
-------------------------------------------------------------------------------- /bin/slurm_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Regenerate Slurm Config 4 | # 5 | # Add --initial as argument if you need to restart slurm from scratch (Removes the current topology file) 6 | 7 | 8 | scripts=`realpath $0` 9 | folder=`dirname $scripts` 10 | autoscaling_folder=$folder/../autoscaling/ 11 | conf_folder=$folder/../conf/ 12 | playbooks_path=$folder/../playbooks/ 13 | 14 | source /etc/os-release 15 | 16 | 17 | if [[ ${@: -1} == "--INITIAL" || ${@: -1} == "--initial" || ${@: -1} == "-INITIAL" || ${@: -1} == "-initial" ]] 18 | then 19 | sudo rm /etc/slurm/topology.conf 20 | sudo /usr/sbin/slurmctld -c 21 | fi 22 | ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook $playbooks_path/slurm_config.yml 23 | if [[ ${@: -1} == "--INITIAL" || ${@: -1} == "--initial" || ${@: -1} == "-INITIAL" || ${@: -1} == "-initial" ]] 24 | then 25 | for inventory in /opt/oci-hpc/autoscaling/clusters/*/inventory ; 26 | do 27 | if [ -f $(dirname $inventory)/currently* ] 28 | then 29 | echo "Cluster is not in running state" 30 | else 31 | ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook $playbooks_path/slurm_config_as.yml -i $inventory 32 | fi 33 | done 34 | fi -------------------------------------------------------------------------------- /bin/wait_for_hosts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # A little waiter function to make sure all the nodes are up before we start configure 5 | # 6 | 7 | echo "Waiting for SSH to come up" 8 | 9 | ssh_options="-i ~/.ssh/cluster.key -o StrictHostKeyChecking=no" 10 | for host in $(cat $1) ; do 11 | r=0 12 | echo "validating connection to: ${host}" 13 | while ! ssh ${ssh_options} -o ConnectTimeout=30 $2@${host} uptime ; do 14 | if [[ $r -eq 10 ]] ; then 15 | execution=0 16 | break 17 | fi 18 | echo "Still waiting for ${host}" 19 | sleep 30 20 | r=$(($r + 1)) 21 | done 22 | done 23 | -------------------------------------------------------------------------------- /compute-cluster.tf: -------------------------------------------------------------------------------- 1 | resource "oci_core_compute_cluster" "compute_cluster" { 2 | count = var.compute_cluster && var.cluster_network && var.node_count > 0 ? 
1 : 0 3 | #Required 4 | availability_domain = var.ad 5 | compartment_id = var.targetCompartment 6 | 7 | #Optional 8 | display_name = local.cluster_name 9 | freeform_tags = { 10 | "cluster_name" = local.cluster_name 11 | "parent_cluster" = local.cluster_name 12 | } 13 | } -------------------------------------------------------------------------------- /config.controller: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | -------------------------------------------------------------------------------- /config.hpc: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | -------------------------------------------------------------------------------- /configure.tpl: -------------------------------------------------------------------------------- 1 | ${configure} -------------------------------------------------------------------------------- /initial_mon.tpl: -------------------------------------------------------------------------------- 1 | ocid = ${cluster_ocid} 2 | queue = ${queue} 3 | shape = ${shape} 4 | cluster_network = ${cluster_network} 5 | private_ips = ${ips} 6 | ocids = ${ocids} 7 | hostnames = ${hostnames} -------------------------------------------------------------------------------- /logs/README: -------------------------------------------------------------------------------- 1 | Each log will be located in this folder. The crontab will be in crontab.log while the Cluster creation and deletion information will be in specific files. -------------------------------------------------------------------------------- /mysql.tf: -------------------------------------------------------------------------------- 1 | resource "oci_mysql_mysql_db_system" "monitoring_mysql_db_system" { 2 | #Required 3 | count = var.autoscaling_monitoring && var.autoscaling_mysql_service ? 1 : 0 4 | admin_password = var.admin_password 5 | admin_username = var.admin_username 6 | availability_domain = var.controller_ad 7 | compartment_id = var.targetCompartment 8 | shape_name = var.monitoring_shape_name 9 | subnet_id = local.subnet_id 10 | display_name = "autoscaling_monitoring" 11 | is_highly_available = false 12 | data_storage_size_in_gb = "50" 13 | backup_policy { 14 | is_enabled = false 15 | } 16 | } -------------------------------------------------------------------------------- /oci_images.tf: -------------------------------------------------------------------------------- 1 | variable "marketplace_source_images" { 2 | type = map(object({ 3 | ocid = string 4 | is_pricing_associated = bool 5 | compatible_shapes = set(string) 6 | })) 7 | default = { 8 | main_mktpl_image = { 9 | ocid = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" 10 | is_pricing_associated = false 11 | compatible_shapes = [] 12 | } 13 | supporting_image = { 14 | ocid = "ocid1.image.oc1..aaaaaaaazeefig7dqaoheiyoufmllolc3tuiv2c4xueecpr33dm3k4xjip3a" 15 | is_pricing_associated = false 16 | compatible_shapes = [] 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /outputs.tf: -------------------------------------------------------------------------------- 1 | output "controller" { 2 | value = local.host 3 | } 4 | 5 | output "private_ips" { 6 | value = join(" ", local.cluster_instances_ips) 7 | } 8 | 9 | output "backup" { 10 | value = var.slurm_ha ? local.host_backup : "No Slurm Backup Defined" 11 | } 12 | 13 | output "login" { 14 | value = var.login_node ? 
local.host_login : "No Login Node Defined" 15 | } 16 | 17 | output "monitoring" { 18 | value = var.monitoring_node ? local.host_monitoring : "No Monitoring Node Defined" 19 | } -------------------------------------------------------------------------------- /playbooks/destroy.yml: -------------------------------------------------------------------------------- 1 | - hosts: compute, slurm_backup 2 | become: true 3 | vars: 4 | destroy: true 5 | initial: false 6 | vars_files: 7 | - "/opt/oci-hpc/conf/queues.conf" 8 | tasks: 9 | - include_role: 10 | name: slurm 11 | when: slurm|default(false)|bool 12 | - hosts: controller, slurm_backup, login, monitoring 13 | become: true 14 | vars: 15 | destroy: true 16 | initial: false 17 | roles: 18 | - etc-hosts 19 | -------------------------------------------------------------------------------- /playbooks/group_vars/all.yml: -------------------------------------------------------------------------------- 1 | ssl_cert_path: '/etc/ssl/certs' 2 | ssl_ca_cert: '{{ ssl_cert_path }}/cluster-ca.crt' 3 | ssl_cert_group: ssl -------------------------------------------------------------------------------- /playbooks/monitoring.yml: -------------------------------------------------------------------------------- 1 | - hosts: all,!monitoring 2 | gather_facts: true 3 | tasks: 4 | - include_role: 5 | name: metrics-exporter 6 | when: cluster_monitoring|default(false)|bool 7 | 8 | - hosts: monitoring 9 | gather_facts: true 10 | tasks: 11 | - include_role: 12 | name: grafana 13 | when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length > 0 ) 14 | 15 | - hosts: controller 16 | tasks: 17 | - include_role: 18 | name: grafana 19 | when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length == 0 ) 20 | 21 | - hosts: controller, monitoring 22 | tasks: 23 | - include_role: 24 | name: prometheus 25 | when: cluster_monitoring|default(false)|bool -------------------------------------------------------------------------------- /playbooks/resize_remove.yml: -------------------------------------------------------------------------------- 1 | - hosts: controller, slurm_backup, compute, login, monitoring 2 | become: true 3 | gather_facts: true 4 | vars: 5 | destroy: false 6 | tasks: 7 | - include_role: 8 | name: mpi-hostfiles 9 | - include_role: 10 | name: etc-hosts 11 | 12 | - hosts: compute_to_destroy, slurm_backup 13 | become: true 14 | vars: 15 | destroy: true 16 | initial: false 17 | vars_files: 18 | - "/opt/oci-hpc/conf/queues.conf" 19 | tasks: 20 | - include_role: 21 | name: slurm 22 | when: slurm|default(false)|bool -------------------------------------------------------------------------------- /playbooks/resize_remove_unreachable.yml: -------------------------------------------------------------------------------- 1 | - hosts: controller, compute, slurm_backup, login, monitoring 2 | become: true 3 | gather_facts: true 4 | vars: 5 | destroy: false 6 | slurm_conf_path: "/etc/slurm" 7 | unreachable_nodes: "{{ lookup('env', 'unreachable_node_list').split(',') | list}}" 8 | vars_files: 9 | - "/opt/oci-hpc/conf/queues.conf" 10 | tasks: 11 | - include_role: 12 | name: destroy_unreachable 13 | 14 | 15 | - hosts: controller 16 | become: true 17 | tasks: 18 | - include_role: 19 | name: prometheus -------------------------------------------------------------------------------- /playbooks/roles/autoscaling_mon/defaults/main.yml: -------------------------------------------------------------------------------- 1 | grafana_api_keys_dir: 
/etc/opt/oci-hpc/passwords/grafana 2 | grafana_api_url: "http://localhost:3000" 3 | grafana_security: 4 | admin_user: admin 5 | admin_password: admin 6 | grafana_api_keys: 7 | - name: "admin" 8 | role: "Admin" 9 | -------------------------------------------------------------------------------- /playbooks/roles/autoscaling_mon/files/initial.sh: -------------------------------------------------------------------------------- 1 | wget https://dl.grafana.com/oss/release/grafana-7.5.0-1.x86_64.rpm 2 | sudo yum install -y grafana-7.5.0-1.x86_64.rpm 3 | sudo yum install -y https://dev.mysql.com/get/mysql80-community-release-el7-3.noarch.rpm 4 | sudo yum install -y mysql-shell 5 | sudo pip3 install protobuf==3.19.4 6 | sudo pip3 install mysql-connector-python==8.0.31 7 | sudo systemctl daemon-reload 8 | sudo systemctl start grafana-server 9 | sudo systemctl status grafana-server 10 | sudo systemctl enable grafana-server 11 | echo OK >> /opt/oci-hpc/monitoring/activated -------------------------------------------------------------------------------- /playbooks/roles/autoscaling_mon/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' 5 | -------------------------------------------------------------------------------- /playbooks/roles/autoscaling_mon/templates/env.j2: -------------------------------------------------------------------------------- 1 | export ENV_MYSQL_HOST={{ monitoring_mysql_ip }} 2 | export ENV_MYSQL_USER=logger 3 | export ENV_MYSQL_PASS=Monitor1234! 4 | export ENV_MYSQL_DATABASE_NAME=cluster_log 5 | export ENV_MYSQL_PORT=3306 -------------------------------------------------------------------------------- /playbooks/roles/autoscaling_mon/templates/mysql_service_initial.j2: -------------------------------------------------------------------------------- 1 | CREATE DATABASE cluster_log; 2 | 3 | CREATE USER 'logger' IDENTIFIED WITH mysql_native_password BY '{{ mysql_root_pwd }}'; 4 | GRANT SELECT ON cluster_log.* TO 'logger'@'%'; 5 | GRANT INSERT ON cluster_log.* TO 'logger'@'%'; 6 | GRANT UPDATE ON cluster_log.* TO 'logger'@'%'; -------------------------------------------------------------------------------- /playbooks/roles/boot-volume/tasks/el.yml: -------------------------------------------------------------------------------- 1 | - name: grow boot volume 2 | command: bash -c 'sudo /usr/libexec/oci-growfs -y | egrep "NOCHANGE:|CHANGED:"' 3 | 4 | -------------------------------------------------------------------------------- /playbooks/roles/boot-volume/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' 3 | -------------------------------------------------------------------------------- /playbooks/roles/boot-volume/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | # no extra steps required for ubuntu. 
2 | -------------------------------------------------------------------------------- /playbooks/roles/cloud-agent_update/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' 5 | -------------------------------------------------------------------------------- /playbooks/roles/cluster-cli/tasks/debian.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install required packages 3 | vars: 4 | package_name: 5 | - python3-click 6 | - python3-ldap3 7 | package_state: present 8 | package_cache: true 9 | include_role: 10 | name: safe_yum 11 | 12 | - name: copy cluster cli 13 | copy: 14 | src: cluster 15 | dest: /usr/bin/ 16 | owner: root 17 | group: root 18 | mode: '0755' -------------------------------------------------------------------------------- /playbooks/roles/cluster-cli/tasks/el7.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: install required packages 4 | vars: 5 | package_name: 6 | - python36-click 7 | - python36-ldap3 8 | package_repo: "epel,ol7_developer_EPEL" 9 | include_role: 10 | name: safe_yum 11 | 12 | - name: copy cluster cli 13 | copy: 14 | src: cluster 15 | dest: /usr/bin/ 16 | owner: root 17 | group: root 18 | mode: '0755' -------------------------------------------------------------------------------- /playbooks/roles/cluster-cli/tasks/el8.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: install required packages 4 | vars: 5 | package_name: 6 | - python3-click 7 | - python3-ldap 8 | package_repo: "ol8_developer_EPEL,ol8_appstream" 9 | include_role: 10 | name: safe_yum 11 | 12 | - name: copy cluster cli 13 | copy: 14 | src: cluster 15 | dest: /usr/bin/ 16 | owner: root 17 | group: root 18 | mode: '0755' 19 | -------------------------------------------------------------------------------- /playbooks/roles/cluster-cli/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el7.yml 2 | when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' 3 | 4 | - include_tasks: el8.yml 5 | when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '8' 6 | 7 | - include_tasks: debian.yml 8 | when: ansible_distribution == 'Ubuntu' -------------------------------------------------------------------------------- /playbooks/roles/cron/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' 5 | -------------------------------------------------------------------------------- /playbooks/roles/destroy_unreachable/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: common.yml 2 | 3 | - include_tasks: slurm-rack-aware.yml 4 | when: rack_aware | bool 5 | 6 | - include_tasks: slurm.yml 7 | when: not rack_aware | bool -------------------------------------------------------------------------------- /playbooks/roles/docker/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - include_tasks: 
oraclelinux.yml 4 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' 5 | 6 | #- include_tasks: centos-7.yml 7 | # when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' and ansible_distribution_major_version == '7' 8 | 9 | - include_tasks: ubuntu.yml 10 | when: ansible_os_family == 'Debian' 11 | -------------------------------------------------------------------------------- /playbooks/roles/docker/templates/templates.j2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/docker/templates/templates.j2 -------------------------------------------------------------------------------- /playbooks/roles/etc-hosts/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: common.yml 2 | -------------------------------------------------------------------------------- /playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2: -------------------------------------------------------------------------------- 1 | {% for item in groups['controller'] %} 2 | {% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} 3 | {{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} controller 4 | {% endfor %} 5 | {% for item in groups['slurm_backup'] %} 6 | {% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} 7 | {{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} backup 8 | {% endfor %} 9 | {% for item in groups['login'] %} 10 | {% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} 11 | {{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} login 12 | {% endfor %} 13 | {% for item in groups['monitoring'] %} 14 | {% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} 15 | {{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} monitoring 16 | {% endfor %} -------------------------------------------------------------------------------- /playbooks/roles/etc-hosts/templates/etc-hosts.j2: -------------------------------------------------------------------------------- 1 | {% for item in groups['compute'] %} 2 | {%- set rdma_subnet = hostvars[item]['rdma_network'] + '/' + hostvars[item]['rdma_netmask'] -%} 3 | {%- set rdma_subnet_prefix = rdma_subnet | ansible.netcommon.ipaddr('prefix') -%} 4 | {%- set rdma_subnet_cidr = rdma_subnet | ansible.netcommon.ipaddr('network/prefix') -%} 5 | {%- set host_index = hostvars[item]['ansible_default_ipv4']['address'] | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) -%} 6 | {# THIS WHOLE ROUTINE IS USED TO FIND THE NEXT SUBNET #} 7 | {%- set subnets = [rdma_subnet_cidr] -%} 8 | {%- for i in range(0) -%} 9 | {%- set nsip = subnets|last | ansible.netcommon.ipaddr('net') | ansible.netcommon.ipaddr('-1') | ansible.netcommon.ipmath(1) -%} 10 | {%- set ns = nsip + '/' + rdma_subnet_prefix|string -%} 11 | {{ subnets.append(ns) }} 12 | {%- endfor -%} 13 | {%- set rdma_address = subnets|last | ansible.netcommon.ipmath(host_index | int -1) -%} 14 | {% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} 15 | {{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_hostname'] }} {{ hostvars[item]['ansible_hostname'] }}.local.vcn {{ hostvars[item]['inventory_hostname'] }} {{ hostvars[item]['ansible_fqdn'] 
}} {{ short_name[0] }} 16 | {% if cluster_network | bool %} 17 | {{rdma_address}} {{ hostvars[item]['ansible_hostname'] }}-rdma.local.rdma {{ hostvars[item]['ansible_hostname'] }}-rdma {{ hostvars[item]['inventory_hostname'] }}-rdma {{ hostvars[item]['inventory_hostname'] }}-rdma.local.rdma 18 | {% endif %} 19 | {% endfor %} 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /playbooks/roles/firewall/files/off-iptables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Turning off the Firewall..." 4 | which apt-get &> /dev/null 5 | if [ $? -eq 0 ] ; then 6 | echo "" > /etc/iptables/rules.v4 7 | echo "" > /etc/iptables/rules.v6 8 | 9 | iptables -F 10 | iptables -X 11 | iptables -t nat -F 12 | iptables -t nat -X 13 | iptables -t mangle -F 14 | iptables -t mangle -X 15 | iptables -P INPUT ACCEPT 16 | iptables -P OUTPUT ACCEPT 17 | iptables -P FORWARD ACCEPT 18 | else 19 | service firewalld stop 20 | chkconfig firewalld off 21 | fi 22 | -------------------------------------------------------------------------------- /playbooks/roles/firewall/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: disable firewalld 4 | service: 5 | name: firewalld 6 | state: stopped 7 | enabled: no 8 | -------------------------------------------------------------------------------- /playbooks/roles/firewall/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' 5 | -------------------------------------------------------------------------------- /playbooks/roles/firewall/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: disable ufw 4 | service: 5 | name: ufw 6 | state: stopped 7 | enabled: no 8 | ignore_errors: true 9 | 10 | # it still need update to iptables rules to accept traffic or else mpi traffic fails 11 | 12 | - name: Copy off-iptables.sh 13 | copy: 14 | src: off-iptables.sh 15 | dest: '/tmp/off-iptables.sh' 16 | owner: '{{ ansible_user }}' 17 | group: '{{ ansible_user }}' 18 | mode: '0755' 19 | 20 | - name: execute off-iptables.sh 21 | command: "sudo /tmp/off-iptables.sh" 22 | 23 | -------------------------------------------------------------------------------- /playbooks/roles/fix_broken/README.md: -------------------------------------------------------------------------------- 1 | Role Name 2 | ========= 3 | 4 | A brief description of the role goes here. 5 | 6 | Requirements 7 | ------------ 8 | 9 | Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 
15 | 16 | Dependencies 17 | ------------ 18 | 19 | A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 20 | 21 | Example Playbook 22 | ---------------- 23 | 24 | Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: 25 | 26 | - hosts: servers 27 | roles: 28 | - { role: username.rolename, x: 42 } 29 | 30 | License 31 | ------- 32 | 33 | BSD 34 | 35 | Author Information 36 | ------------------ 37 | 38 | An optional section for the role authors to include contact information, or a website (HTML is not allowed). 39 | -------------------------------------------------------------------------------- /playbooks/roles/fix_broken/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for fix_broken 3 | 4 | # to resolve error for not able to install nfs-kernel-server. seeing the same error for compute nodes while installing other packages. so adding this to run on all compute hosts the first time itself. 5 | - include_tasks: ubuntu.yml 6 | when: ansible_distribution == 'Ubuntu' -------------------------------------------------------------------------------- /playbooks/roles/fix_ldap/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: ubuntu.yml 2 | when: ansible_distribution == 'Ubuntu' -------------------------------------------------------------------------------- /playbooks/roles/fix_ldap/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart nscd 3 | become: true 4 | systemd: 5 | name: nscd 6 | state: restarted 7 | daemon_reload: true 8 | enabled: true 9 | 10 | - name: restart sssd 11 | become: true 12 | service: 13 | name: sssd 14 | state: restarted 15 | daemon_reload: true 16 | enabled: true -------------------------------------------------------------------------------- /playbooks/roles/fss-home/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Make sure the /home directory exist 3 | file: 4 | path: "{{tmp_home}}" 5 | state: directory 6 | mode: '0755' 7 | 8 | - name: Make sure the remote home directory exist 9 | file: 10 | path: "{{tmp_home}}/home" 11 | state: directory 12 | mode: '0755' 13 | when: not create_fss|bool 14 | 15 | - name: Report if opc dir exist 16 | file: 17 | path: "{{tmp_home}}{% if not create_fss|bool %}/home{% endif %}/opc/" 18 | state: directory 19 | mode: '0700' 20 | owner: opc 21 | group: opc 22 | register: opc_directory 23 | 24 | - name: copy opc/ dir 25 | command: "cp -rpT /home/opc {{tmp_home}}{% if not create_fss|bool %}/home{% endif %}/opc" 26 | when: opc_directory.changed 27 | 28 | 29 | - name: Unmount {{tmp_home}} mount 30 | mount: 31 | path: "{{tmp_home}}" 32 | state: absent 33 | -------------------------------------------------------------------------------- /playbooks/roles/fss-home/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' 5 | -------------------------------------------------------------------------------- /playbooks/roles/fss-home/tasks/ubuntu.yml: 
-------------------------------------------------------------------------------- 1 | --- 2 | - name: Make sure the /home directory exist 3 | file: 4 | path: "{{tmp_home}}" 5 | state: directory 6 | mode: '0755' 7 | 8 | - name: Make sure the remote home directory exist 9 | file: 10 | path: "{{tmp_home}}/home" 11 | state: directory 12 | mode: '0755' 13 | when: not create_fss|bool 14 | 15 | - name: Report if ubuntu dir exist 16 | file: 17 | path: "{{tmp_home}}{% if not create_fss|bool %}/home{% endif %}/ubuntu/" 18 | state: directory 19 | mode: '0700' 20 | owner: ubuntu 21 | group: ubuntu 22 | register: ubuntu_directory 23 | 24 | - name: copy ubuntu/ dir 25 | command: "cp -rpT /home/ubuntu {{tmp_home}}{% if not create_fss|bool %}/home{% endif %}/ubuntu" 26 | when: ubuntu_directory.changed 27 | ignore_errors: true 28 | 29 | 30 | - name: Unmount {{tmp_home}} mount 31 | mount: 32 | path: "{{tmp_home}}" 33 | state: absent 34 | -------------------------------------------------------------------------------- /playbooks/roles/grafana/defaults/main.yml: -------------------------------------------------------------------------------- 1 | user: "{{ ol_user if ansible_os_family == 'RedHat' else ubuntu_user }}" 2 | ol_user: "opc" 3 | ubuntu_user: "ubuntu" 4 | dashboard_build_dir: "/tmp/dashboard-build" 5 | grafonnet_lib_repo_url: "github.com/grafana/grafonnet-lib/grafonnet@master" 6 | grafonnet_gen_repo_url: "github.com/grafana/grafonnet/gen/grafonnet-latest@main" 7 | grafana_api_keys_dir: /etc/opt/oci-hpc/passwords/grafana 8 | grafana_api_url: "http://localhost:3000" 9 | grafana_security: 10 | admin_user: admin 11 | admin_password: admin 12 | grafana_api_keys: 13 | - name: "admin" 14 | role: "Admin" 15 | -------------------------------------------------------------------------------- /playbooks/roles/grafana/files/g.libsonnet: -------------------------------------------------------------------------------- 1 | import 'grafonnet-latest/main.libsonnet' 2 | -------------------------------------------------------------------------------- /playbooks/roles/grafana/files/raw_message_template.txt: -------------------------------------------------------------------------------- 1 | ************************************ 2 | Alert: {{ alert_name }} 3 | Status: {{ alert_status }} 4 | {{ alert_message }} 5 | Starts At: {{ starts_at }} 6 | Ends At: {{ ends_at }} 7 | ----------------------------------- 8 | Labels: 9 | {% for key, value in labels.items() %} 10 | {{ key }}: {{ value }} 11 | {% endfor %} 12 | ----------------------------------- 13 | Annotations: 14 | {% for key, value in annotations.items() %} 15 | {{ key }}: {{ value }} 16 | {% endfor %} 17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | -------------------------------------------------------------------------------- /playbooks/roles/grafana/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' 5 | - include_tasks: dashboard.yml 6 | when: ansible_os_family == 'RedHat' or ansible_os_family == 'Debian' 7 | -------------------------------------------------------------------------------- /playbooks/roles/grafana/templates/ons-webhook.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=OCI Webhook Service 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | User={{ ansible_user }} 8 | 
Group={{ ansible_user }} 9 | Type=simple 10 | ExecStart=/usr/bin/python3 /usr/local/bin/ons_webhook.py 11 | Restart=on-failure 12 | 13 | [Install] 14 | WantedBy=multi-user.target 15 | -------------------------------------------------------------------------------- /playbooks/roles/healthchecks/defaults/main.yml: -------------------------------------------------------------------------------- 1 | mp_download_link: "https://objectstorage.us-phoenix-1.oraclecloud.com/p/f0EgpgGOU5FlPzDwbSHuxdVQc7A8VgOaThd5KARkDG8Y60QJc53xhg-6m6nlyxRI/n/iding8g8fv8l/b/bm_meshpinger_artifacts/o/meshpinger_bm.tar.gz" 2 | mp_filename: "meshpinger_bm.tar.gz" -------------------------------------------------------------------------------- /playbooks/roles/healthchecks/files/run_meshpinger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WRAPPER_BIN="$0" 4 | export WRAPPER_ENV="OCI_HPC_STACK" 5 | date 6 | 7 | eval "$(ssh-agent -s)" >/dev/null ; ssh-add ~/.ssh/id_rsa >/dev/null 8 | 9 | # Run meshpinger 10 | /opt/oci-hpc/healthchecks/meshpinger_bm/run_meshpinger "$@" -------------------------------------------------------------------------------- /playbooks/roles/healthchecks/files/shared_logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import logging 4 | logging.basicConfig(level="INFO", format='%(asctime)s - %(levelname)s - %(message)s') 5 | logger = logging.getLogger('nhc') 6 | -------------------------------------------------------------------------------- /playbooks/roles/healthchecks/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Create systemd unit dirs 2 | become: true 3 | file: 4 | name: '/opt/oci-hpc/healthchecks' 5 | state: directory 6 | owner: '{{ ansible_user }}' 7 | group: '{{ ansible_user }}' 8 | 9 | - name: Copy files 10 | become: true 11 | copy: 12 | src: '{{ item }}' 13 | dest: '/opt/oci-hpc/healthchecks/{{ item }}' 14 | force: no 15 | owner: '{{ ansible_user }}' 16 | group: '{{ ansible_user }}' 17 | with_items: 18 | - check_gpu_setup.py 19 | - gpu_bw_test.py 20 | - rdma_link_flapping.py 21 | - xid_checker.py 22 | - shared_logging.py 23 | 24 | - name: Download oci-mesh-pinger 25 | get_url: 26 | url: "{{mp_download_link}}" 27 | dest: "/tmp/" 28 | when: ('controller' in group_names) 29 | 30 | - name: untar meshpinger 31 | unarchive: 32 | src: "/tmp/{{mp_filename}}" 33 | dest: "/opt/oci-hpc/healthchecks" 34 | when: ('controller' in group_names) 35 | 36 | - name: Copy files 37 | become: true 38 | copy: 39 | src: '{{ item }}' 40 | dest: '/opt/oci-hpc/healthchecks/{{ item }}' 41 | force: no 42 | owner: '{{ ansible_user }}' 43 | group: '{{ ansible_user }}' 44 | mode: 0755 45 | with_items: 46 | - run_meshpinger.sh 47 | when: ('controller' in group_names) 48 | 49 | - name: Make sure meshpinger dependencies are installed. 
50 | vars: 51 | package_name: 52 | - fping 53 | - jq 54 | - lshw 55 | - ethtool 56 | package_state: latest 57 | include_role: 58 | name: safe_yum 59 | ignore_errors: true 60 | -------------------------------------------------------------------------------- /playbooks/roles/home_nfs/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: set use_nfs_home_dirs if SELINUX is enabled 3 | ansible.posix.seboolean: 4 | name: use_nfs_home_dirs 5 | state: yes 6 | persistent: yes 7 | when: ansible_selinux.status == "enabled" 8 | 9 | 10 | -------------------------------------------------------------------------------- /playbooks/roles/home_nfs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | -------------------------------------------------------------------------------- /playbooks/roles/hostname/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: update hostname for HPC cluster 3 | vars: 4 | - index: "{{ hostvars[inventory_hostname]['ansible_default_ipv4']['address'] | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) }}" 5 | hostname: 6 | name: "{{hostname_convention}}-{{index}}" 7 | when: ('compute' in group_names ) 8 | 9 | - name: Check Hostname 10 | shell: 11 | cmd: "hostname" 12 | register: hostname_output 13 | when: ('compute' in group_names ) 14 | 15 | - name: update hostname for HPC cluster 16 | vars: 17 | - index: "{{ hostvars[inventory_hostname]['ansible_default_ipv4']['address'] | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) }}" 18 | hostname: 19 | name: "{{hostname_convention}}-{{index}}" 20 | when: ('compute' in group_names ) and ( hostname_output.stdout != ansible_fqdn.split('.')[0] ) 21 | -------------------------------------------------------------------------------- /playbooks/roles/hostname/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' 5 | -------------------------------------------------------------------------------- /playbooks/roles/hostname/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: update hostname for HPC cluster 3 | vars: 4 | - index: "{{ hostvars[inventory_hostname]['ansible_default_ipv4']['address'] | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) }}" 5 | hostname: 6 | name: "{{hostname_convention}}-{{index}}" 7 | when: ('compute' in group_names ) -------------------------------------------------------------------------------- /playbooks/roles/hyperthreading/files/control_hyperthreading_ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ `id -u` -ne 0 ] 3 | then 4 | echo $0: you need to be root 5 | exit 1 6 | fi 7 | disable_ht() { 8 | echo -n $0: disabling 9 | echo off | sudo tee /sys/devices/system/cpu/smt/control 10 | } 11 | 12 | enable_ht() { 13 | echo -n $0: enabling 14 | echo on | sudo tee /sys/devices/system/cpu/smt/control 15 | } 16 | 17 | case "$1" in 18 | "1"|"on") 19 | enable_ht 20 | ;; 21 | "0"|"off") 22 | disable_ht 23 | ;; 24 | "show") 25 | ;; 26 | *) 27 | echo $0: wrong argument "$1" 28 | exit 2 29 | ;; 30 | esac 31 
| 32 | echo '' 33 | lscpu | egrep "On-line|Off-line" 34 | 35 | exit 0 -------------------------------------------------------------------------------- /playbooks/roles/hyperthreading/files/disable-hyperthreading.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Start this service to disable Hyperthreading, stop it to enable Hyperthreading. 3 | After=syslog.target irqbalance.service tuned.service 4 | 5 | [Service] 6 | Type=oneshot 7 | RemainAfterExit=true 8 | ExecStart=/opt/oci-hpc/sbin/control_hyperthreading.sh off 9 | ExecStop=/opt/oci-hpc/sbin/control_hyperthreading.sh on 10 | 11 | [Install] 12 | WantedBy=multi-user.target 13 | -------------------------------------------------------------------------------- /playbooks/roles/hyperthreading/files/disable-hyperthreading_ubuntu.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Start this service to disable Hyperthreading, stop it to enable Hyperthreading. 3 | After=syslog.target irqbalance.service tuned.service 4 | 5 | [Service] 6 | Type=oneshot 7 | RemainAfterExit=true 8 | ExecStart=/opt/oci-hpc/sbin/control_hyperthreading_ubuntu.sh off 9 | ExecStop=/opt/oci-hpc/sbin/control_hyperthreading_ubuntu.sh on 10 | 11 | [Install] 12 | WantedBy=multi-user.target 13 | -------------------------------------------------------------------------------- /playbooks/roles/hyperthreading/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Make sure directory exist 3 | become: true 4 | file: 5 | path: /opt/oci-hpc/sbin 6 | state: directory 7 | mode: '0755' 8 | 9 | - name: Copy script 10 | become: true 11 | copy: 12 | src: control_hyperthreading.sh 13 | dest: /opt/oci-hpc/sbin/control_hyperthreading.sh 14 | mode: '0755' 15 | 16 | - name: Copy service unit 17 | become: true 18 | copy: 19 | src: disable-hyperthreading.service 20 | dest: /etc/systemd/system/disable-hyperthreading.service 21 | 22 | - name: Create a unit file 23 | become: true 24 | copy: 25 | src: disable-hyperthreading.service 26 | dest: /etc/systemd/system/disable-hyperthreading.service 27 | 28 | - name: Force systemd to reread configs 29 | ansible.builtin.systemd: 30 | daemon_reload: yes 31 | 32 | - name: Start HT service 33 | ansible.builtin.systemd: 34 | name: disable-hyperthreading.service 35 | state: started 36 | enabled: yes 37 | when: not hyperthreading|default(true)|bool 38 | 39 | #- name: disable Hyperthreading 40 | # become: true 41 | # shell: "echo off | sudo tee /sys/devices/system/cpu/smt/control" 42 | -------------------------------------------------------------------------------- /playbooks/roles/hyperthreading/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | 4 | - include_tasks: ubuntu.yml 5 | when: ansible_distribution == 'Ubuntu' 6 | -------------------------------------------------------------------------------- /playbooks/roles/hyperthreading/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Make sure directory exist 3 | become: true 4 | file: 5 | path: /opt/oci-hpc/sbin 6 | state: directory 7 | mode: '0755' 8 | 9 | - name: Copy script 10 | become: true 11 | copy: 12 | src: control_hyperthreading_ubuntu.sh 13 | dest: /opt/oci-hpc/sbin/control_hyperthreading_ubuntu.sh 14 | mode: '0755' 15 | 16 | - 
name: Copy service unit 17 | become: true 18 | copy: 19 | src: disable-hyperthreading_ubuntu.service 20 | dest: /etc/systemd/system/disable-hyperthreading_ubuntu.service 21 | 22 | - name: Create a unit file 23 | become: true 24 | copy: 25 | src: disable-hyperthreading_ubuntu.service 26 | dest: /etc/systemd/system/disable-hyperthreading.service 27 | 28 | - name: Force systemd to reread configs 29 | ansible.builtin.systemd: 30 | daemon_reload: yes 31 | 32 | - name: Start HT service 33 | ansible.builtin.systemd: 34 | name: disable-hyperthreading_ubuntu.service 35 | state: started 36 | enabled: yes 37 | when: not hyperthreading|default(true)|bool -------------------------------------------------------------------------------- /playbooks/roles/influxdb/defaults/main.yml: -------------------------------------------------------------------------------- 1 | influxdb_configuration_dir: /etc/influxdb 2 | -------------------------------------------------------------------------------- /playbooks/roles/influxdb/handlers/main.yml: -------------------------------------------------------------------------------- 1 | - name: restart influxdb 2 | become: true 3 | service: 4 | name: influxdb 5 | state: started 6 | enabled: yes 7 | when: influx_config.changed 8 | 9 | -------------------------------------------------------------------------------- /playbooks/roles/influxdb/tasks/config_influxdb.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Create /etc/opt/oci-hpc/passwords/influxdb 3 | become: true 4 | file: 5 | path: /etc/opt/oci-hpc/passwords/influxdb 6 | state: directory 7 | owner: '{{ ansible_user }}' 8 | mode: 0770 9 | group: '{{ ansible_user }}' 10 | recurse: yes 11 | 12 | - name: Generate password for Influx admin and save it to /etc/opt/oci-hpc/passwords 13 | set_fact: 14 | tmp_pwd: "{{ lookup('password', 15 | '/etc/opt/oci-hpc/passwords/influxdb/root.txt 16 | chars=ascii_letters,digits,hexdigits') }}" 17 | 18 | - name: Get influx password from /etc/opt/oci-hpc/passwords 19 | set_fact: 20 | influx_admin_pwd: "{{ lookup('password', 21 | '/etc/opt/oci-hpc/passwords/influxdb/root.txt 22 | chars=ascii_letters,digits,hexdigits') }}" 23 | 24 | - name: Start InfluxDB 25 | become: true 26 | service: 27 | name: influxdb 28 | state: started 29 | enabled: true 30 | 31 | 32 | - name: Set configuration directory path 33 | become: true 34 | file: 35 | path: "{{ influxdb_configuration_dir }}" 36 | state: directory 37 | 38 | - name: Set templatized InfluxDB configuration 39 | become: true 40 | template: 41 | src: influxdb.conf.j2 42 | dest: "{{ influxdb_configuration_dir }}/influxdb.conf" 43 | force: yes 44 | backup: yes 45 | owner: influxdb 46 | group: influxdb 47 | mode: 0744 48 | register: influx_config 49 | notify: restart influxdb 50 | 51 | 52 | -------------------------------------------------------------------------------- /playbooks/roles/influxdb/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install influxdb 3 | include_tasks: el_install_influxdb.yml 4 | 5 | - name: configure influxdb on controller 6 | include_tasks: config_influxdb.yml 7 | when: "'controller' in group_names" -------------------------------------------------------------------------------- /playbooks/roles/influxdb/tasks/el_install_influxdb.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Add influxdb repository 3 | become: true 4 | yum_repository: 5 | name: 
influxdb 6 | description: InfluxDB Repository - RHEL $releasever 7 | baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable 8 | enabled: 1 9 | gpgcheck: 1 10 | gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key 11 | 12 | - name: Install InfluxDB 13 | vars: 14 | package_name: 15 | - influxdb 16 | package_state: latest 17 | include_role: 18 | name: safe_yum 19 | 20 | - name: install influx pip 21 | become: true 22 | pip: 23 | name: influxdb 24 | executable: pip3 -------------------------------------------------------------------------------- /playbooks/roles/influxdb/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' 5 | -------------------------------------------------------------------------------- /playbooks/roles/influxdb/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install influxdb 3 | include_tasks: ubuntu_install_influxdb.yml 4 | 5 | - name: configure influxdb on controller 6 | include_tasks: config_influxdb.yml 7 | when: "'controller' in group_names" -------------------------------------------------------------------------------- /playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Add InfluxData's key 3 | become: true 4 | apt_key: 5 | state: present 6 | url: https://repos.influxdata.com/influxdata-archive_compat.key 7 | 8 | - name: Manage InfluxData APT repositories 9 | become: true 10 | apt_repository: 11 | repo: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable 12 | state: present 13 | 14 | - name: Install InfluxDB 15 | vars: 16 | package_name: 17 | - influxdb 18 | package_state: latest 19 | include_role: 20 | name: safe_yum 21 | 22 | - name: force influxdb gid 997 23 | become: true 24 | lineinfile: 25 | path: /etc/group 26 | state: present 27 | regexp: '^influxdb:x:(.*)$' 28 | line: 'influxdb:x:997:' 29 | backrefs: yes 30 | 31 | - name: force influxd uid 997 32 | become: true 33 | lineinfile: 34 | path: /etc/passwd 35 | state: present 36 | regexp: '^influxdb:x:(.*)$' 37 | line: 'influxdb:x:997:997::/var/lib/influxdb:/bin/false' 38 | backrefs: yes 39 | 40 | - name: restart influxdb 41 | become: true 42 | service: 43 | name: influxdb 44 | state: restarted 45 | enabled: yes 46 | 47 | - name: install influx pip 48 | become: true 49 | vars: 50 | ansible_python_interpreter: /usr/bin/python3 51 | pip: 52 | name: influxdb 53 | executable: pip3 54 | -------------------------------------------------------------------------------- /playbooks/roles/iscsi/tasks/debian.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: "Perform a discovery on {{ iscsi_ip }} and show available target nodes" 3 | community.general.open_iscsi: 4 | show_nodes: yes 5 | discover: yes 6 | portal: '{{ iscsi_ip }}' 7 | register: nodes 8 | 9 | - name: Connect to the named target, after updating the local persistent database (cache) 10 | community.general.open_iscsi: 11 | login: yes 12 | target: '{{ nodes["nodes"][0] }}' 13 | register: target 14 | 15 | - name: create local storage directory 16 | file: 17 | path: "{{ local_path }}" 18 | state: directory 19 | owner: debian 20 | group: 
debian 21 | 22 | - name: create a filesystem 23 | filesystem: 24 | dev: '{{ target["devicenodes"][0] }}' 25 | fstype: ext4 26 | 27 | - name: Mount local volume 28 | mount: 29 | path: "{{ local_path }}" 30 | src: '{{ target["devicenodes"][0] }}' 31 | fstype: ext4 32 | opts: defaults,noatime,_netdev 33 | state: mounted 34 | -------------------------------------------------------------------------------- /playbooks/roles/iscsi/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: "Perform a discovery on {{ iscsi_ip }} and show available target nodes" 3 | community.general.open_iscsi: 4 | show_nodes: yes 5 | discover: yes 6 | portal: '{{ iscsi_ip }}' 7 | register: nodes 8 | 9 | - name: Connect to the named target, after updating the local persistent database (cache) 10 | community.general.open_iscsi: 11 | login: yes 12 | target: '{{ nodes["nodes"][0] }}' 13 | register: target 14 | 15 | - name: create local storage directory 16 | file: 17 | path: "{{ local_path }}" 18 | state: directory 19 | owner: opc 20 | group: opc 21 | 22 | - name: create a filesystem 23 | filesystem: 24 | dev: '{{ target["devicenodes"][0] }}' 25 | fstype: xfs 26 | when: not ('slurm_backup' in group_names) 27 | 28 | - name: Mount local volume 29 | mount: 30 | path: "{{ local_path }}" 31 | src: '{{ target["devicenodes"][0] }}' 32 | fstype: xfs 33 | opts: defaults,noatime,_netdev 34 | state: mounted 35 | -------------------------------------------------------------------------------- /playbooks/roles/iscsi/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | 4 | - include_tasks: ubuntu.yml 5 | when: ansible_distribution == 'Ubuntu' 6 | 7 | - include_tasks: debian.yml 8 | when: ansible_distribution == 'Debian' 9 | -------------------------------------------------------------------------------- /playbooks/roles/iscsi/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: "Perform a discovery on {{ iscsi_ip }} and show available target nodes" 3 | community.general.open_iscsi: 4 | show_nodes: yes 5 | discover: yes 6 | portal: '{{ iscsi_ip }}' 7 | register: nodes 8 | 9 | - name: Connect to the named target, after updating the local persistent database (cache) 10 | community.general.open_iscsi: 11 | login: yes 12 | target: '{{ nodes["nodes"][0] }}' 13 | register: target 14 | 15 | - name: create local storage directory 16 | file: 17 | path: "{{ local_path }}" 18 | state: directory 19 | owner: ubuntu 20 | group: ubuntu 21 | 22 | - name: create a filesystem 23 | filesystem: 24 | dev: '{{ target["devicenodes"][0] }}' 25 | fstype: ext4 26 | 27 | - name: Mount local volume 28 | mount: 29 | path: "{{ local_path }}" 30 | src: '{{ target["devicenodes"][0] }}' 31 | fstype: ext4 32 | opts: defaults,noatime,_netdev 33 | state: mounted 34 | -------------------------------------------------------------------------------- /playbooks/roles/latency_check/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: copy script 3 | template: 4 | src: latency_check.sh.j2 5 | dest: '/tmp/latency_check.sh' 6 | backup: yes 7 | owner: opc 8 | group: opc 9 | mode: 0775 10 | run_once: true 11 | 12 | - name: Run Latency test 13 | command: "/tmp/latency_check.sh" 14 | register: latency_check 15 | failed_when: "'KO' in latency_check.stdout" 16 | run_once: true 17 | 
retries: 3 18 | delay: 10 19 | until: latency_check is not failed -------------------------------------------------------------------------------- /playbooks/roles/latency_check/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' -------------------------------------------------------------------------------- /playbooks/roles/latency_check/templates/latency_check.sh.j2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | {% if shape == "BM.Optimized3.36" %} 4 | RDMA_OPTIONS="-mca pml ucx -x UCX_NET_DEVICES=mlx5_2:1 -x UCX_IB_TRAFFIC_CLASS=105 -x UCX_IB_GID_INDEX=3 -x HCOLL_ENABLE_MCAST_ALL=0 -mca coll_hcoll_enable 0" 5 | {% elif shape == "BM.HPC2.36" %} 6 | RDMA_OPTIONS="-mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 -x UCX_IB_TRAFFIC_CLASS=105 -x UCX_IB_GID_INDEX=3 -x HCOLL_ENABLE_MCAST_ALL=0 -mca coll_hcoll_enable 0" 7 | {% else %} 8 | echo OK 9 | exit 10 | {% endif%} 11 | rm /tmp/latency_hostfile 12 | for i in `cat /etc/hosts | grep .local.vcn | awk '{print $1}'`; do echo $i cpu=1 >> /tmp/latency_hostfile;done 13 | source `ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` 14 | mpirun -hostfile /tmp/latency_hostfile $RDMA_OPTIONS /usr/mpi/gcc/openmpi-*/tests/imb/IMB-MPI1 alltoall > /tmp/latency.out 2>&1 15 | latency=`cat /tmp/latency.out | grep ' 64 ' | tail -n 1 | awk '{print $NF}' | cut -d. -f1 ` 16 | nodes=`cat /tmp/latency.out | grep ' #processes' | tail -n 1 | awk '{print $NF}'` 17 | result=KO 18 | 19 | one_node=`cat /tmp/latency.out | grep 'cannot run this test, minimum number of nodes is 2' | wc -l ` 20 | echo $one_node 21 | if (( $one_node == 1 )); 22 | then 23 | result=OK 24 | elif (( $latency/$nodes < 5 )); 25 | then 26 | result=OK 27 | fi 28 | echo $result -------------------------------------------------------------------------------- /playbooks/roles/limits/tasks/common.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - block: 3 | - name: Install limits.conf 4 | template: 5 | src: "templates/limits.j2" 6 | dest: "/etc/security/limits.d/50-hpc-limits.conf" 7 | owner: root 8 | group: root 9 | mode: 0644 10 | -------------------------------------------------------------------------------- /playbooks/roles/limits/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - include_tasks: common.yml 4 | when: ansible_os_family == 'RedHat' 5 | 6 | - include_tasks: common.yml 7 | when: ansible_distribution == 'Ubuntu' 8 | 9 | - include_tasks: common.yml 10 | when: ansible_distribution == 'Debian' 11 | -------------------------------------------------------------------------------- /playbooks/roles/limits/templates/limits.j2: -------------------------------------------------------------------------------- 1 | # 2 | ## HPC limits increased per recommendations in: 3 | ## 4 | ## https://windows.epfl.ch/info/linux.pdf 5 | ## http://openhpc.community/wp-content/uploads/Install_guide-CentOS7.1-1.0.pdf 6 | ## 7 | ## Also increased max number of open files 8 | ## 9 | * hard memlock unlimited 10 | * soft memlock unlimited 11 | #### 12 | #### these numbers yield very jittery results -- more testing is needed before enabling them 13 | #### 14 | ####* hard nofile 16384 15 | ####* soft nofile 16384 16 | ####* hard stack 1048576 17 | ####* soft stack 1048576 18 | #### 19 | 20 | {% if shape == 
"BM.GPU.B4.8" or shape == "BM.GPU4.8" or shape == "BM.GPU.A100-v2.8" or shape == "BM.GPU.H100.8" or shape == "BM.GPU.H200.8" or shape == "BM.GPU.MI300X.8" or shape == "BM.GPU.L40S.4" or shape == "BM.GPU.L40S-NC.4"%} 21 | * soft nproc 40960 22 | * hard nproc 40960 23 | * soft nofile 20480 24 | * hard nofile 20480 25 | {% else %} 26 | # 27 | {% endif %} 28 | -------------------------------------------------------------------------------- /playbooks/roles/localdisk/defaults/main.yml: -------------------------------------------------------------------------------- 1 | nvme_path: "/scratch" 2 | -------------------------------------------------------------------------------- /playbooks/roles/localdisk/tasks/debian.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Create a new primary partition 3 | parted: 4 | device: /dev/nvme0n1 5 | number: 1 6 | state: present 7 | label: gpt 8 | when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" 9 | - name: create a filesystem 10 | filesystem: 11 | dev: /dev/nvme0n1p1 12 | fstype: ext4 13 | opts: -L localscratch 14 | when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" 15 | - name: Mount local volume 16 | mount: 17 | path: "{{ nvme_path }}" 18 | src: LABEL=localscratch 19 | fstype: ext4 20 | opts: defaults,noatime 21 | state: mounted 22 | when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" 23 | -------------------------------------------------------------------------------- /playbooks/roles/localdisk/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: common.yml 2 | -------------------------------------------------------------------------------- /playbooks/roles/localdisk/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Create a new primary partition 3 | parted: 4 | device: /dev/nvme0n1 5 | number: 1 6 | state: present 7 | label: gpt 8 | when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" 9 | - name: create a filesystem 10 | filesystem: 11 | dev: /dev/nvme0n1p1 12 | fstype: ext4 13 | opts: -L localscratch 14 | when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" 15 | - name: Mount local volume 16 | mount: 17 | path: "{{ nvme_path }}" 18 | src: LABEL=localscratch 19 | fstype: ext4 20 | opts: defaults,noatime 21 | state: mounted 22 | when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" 23 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/defaults/main.yml: -------------------------------------------------------------------------------- 1 | node_exporter: 1.8.2 2 | cpu_ports: 9100 3 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/files/node_exporter.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Node Exporter 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | User=prometheus 8 | ExecStart=/usr/bin/node_exporter --collector.nfs --collector.mountstats --collector.systemd --collector.processes --collector.textfile.directory=/var/lib/node_exporter/textfile_collector 9 | 10 | [Install] 11 | WantedBy=default.target 12 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/files/shared_logging.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import logging 4 | logging.basicConfig(level="INFO", format='%(asctime)s - %(levelname)s - %(message)s') 5 | logger = logging.getLogger('nem') 6 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/tasks/critical_process_monitor.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install prometheus_client python package 3 | ansible.builtin.pip: 4 | name: prometheus_client 5 | executable: /usr/bin/pip3 6 | become: yes 7 | 8 | - name: Copy service file to scripts directory 9 | copy: 10 | src: critical_process_monitor.py 11 | dest: /usr/local/bin 12 | mode: 0755 13 | become: yes 14 | 15 | - name: Render systemd service file 16 | become: true 17 | template: 18 | src: critical-process-monitor.service.j2 19 | dest: /etc/systemd/system/critical-process-monitor.service 20 | force: yes 21 | backup: yes 22 | owner: prometheus 23 | group: prometheus 24 | mode: 0744 25 | 26 | - name: Restart critical process monitor exporter 27 | become: true 28 | service: 29 | name: critical-process-monitor 30 | state: restarted 31 | enabled: yes 32 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: node_exporter_el.yml 2 | when: ansible_os_family == 'RedHat' 3 | 4 | - include_tasks: node_exporter_ubuntu.yml 5 | when: ansible_distribution == 'Ubuntu' 6 | 7 | - include_tasks: dcgm_exporter.yml 8 | when: ('compute' in group_names) and 'GPU' in shape 9 | 10 | - include_tasks: rdma_exporter.yml 11 | when: ('compute' in group_names) and cluster_network|bool 12 | 13 | - include_tasks: nvlink_exporter.yml 14 | when: ('compute' in group_names) and 'GPU' in shape 15 | 16 | - include_tasks: custom_metrics.yml 17 | when: ('compute' in group_names) and cluster_network|bool 18 | 19 | - include_tasks: critical_process_monitor.yml 20 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/tasks/nvlink_exporter.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install prometheus_client python package 3 | ansible.builtin.pip: 4 | name: prometheus_client 5 | executable: /usr/bin/pip3 6 | become: yes 7 | 8 | - name: Copy service file to scripts directory 9 | copy: 10 | src: nvlink_counters_exporter.py 11 | dest: /usr/local/bin 12 | mode: 0755 13 | become: yes 14 | 15 | - name: Render systemd service file 16 | become: true 17 | template: 18 | src: nvlink-exporter.service.j2 19 | dest: /etc/systemd/system/nvlink-exporter.service 20 | force: yes 21 | backup: yes 22 | owner: prometheus 23 | group: prometheus 24 | mode: 0744 25 | 26 | - name: Restart nvlink exporter 27 | become: true 28 | service: 29 | name: nvlink-exporter 30 | state: restarted 31 | enabled: yes 32 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/tasks/rdma_exporter.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install prometheus_client python package 3 | ansible.builtin.pip: 4 | name: prometheus_client 5 | executable: /usr/bin/pip3 6 | become: yes 7 | 8 | - name: Copy service file to scripts directory 9 | copy: 10 | src: 
rdma_counters_exporter.py 11 | dest: /usr/local/bin 12 | mode: 0755 13 | become: yes 14 | 15 | - name: Render systemd service file 16 | become: true 17 | template: 18 | src: rdma-exporter.service.j2 19 | dest: /etc/systemd/system/rdma-exporter.service 20 | force: yes 21 | backup: yes 22 | owner: prometheus 23 | group: prometheus 24 | mode: 0755 25 | 26 | - name: Restart rdma exporter 27 | become: true 28 | service: 29 | name: rdma-exporter 30 | state: restarted 31 | enabled: yes 32 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/templates/critical-process-monitor.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Critical Systemd Service monitor 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | User={{ ansible_user }} 8 | Group={{ ansible_user }} 9 | Type=simple 10 | ExecStart=/usr/bin/env python3 /usr/local/bin/critical_process_monitor.py 11 | 12 | [Install] 13 | WantedBy=multi-user.target 14 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/templates/customMetrics.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Captures custom metrics for node exporter 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | User=root 8 | Group=root 9 | Type=simple 10 | ExecStart={{ ne_script_dir }}/custom_metrics.py 11 | 12 | [Install] 13 | WantedBy=multi-user.target 14 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/templates/dcgm-exporter.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=NVIDIA DCGM Exporter 3 | After=network.target 4 | 5 | [Service] 6 | ExecStart=/usr/bin/dcgm-exporter -f /etc/dcgm-counters.csv 7 | 8 | [Install] 9 | WantedBy=multi-user.target 10 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/templates/nvlink-exporter.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=NVLink bandwidth tracker for all GPUs 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | User={{ prometheus_user }} 8 | Group={{ prometheus_user }} 9 | Type=simple 10 | ExecStart=/usr/bin/env python3 /usr/local/bin/nvlink_counters_exporter.py 11 | 12 | [Install] 13 | WantedBy=multi-user.target 14 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/templates/rdma-exporter.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=RDMA Hardware counter exporter for RDMA NICs 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | User={{ prometheus_user }} 8 | Group={{ prometheus_user }} 9 | Type=simple 10 | ExecStart=/usr/bin/env python3 /usr/local/bin/rdma_counters_exporter.py 11 | 12 | [Install] 13 | WantedBy=multi-user.target 14 | -------------------------------------------------------------------------------- /playbooks/roles/metrics-exporter/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Variables 3 | gpu_ports: 9400 4 | prometheus_user: prometheus 5 | 6 | 
go_version: "1.23.0" 7 | go_tarball: "go{{ go_version }}.linux-amd64.tar.gz" 8 | go_url: "https://go.dev/dl/{{ go_tarball }}" 9 | go_install_dir: "/usr/local" 10 | go_path: "/usr/local/go/bin" 11 | dcgm_repo: "https://github.com/NVIDIA/dcgm-exporter.git" 12 | dcgm_service_path: "/etc/systemd/system/dcgm-exporter.service" 13 | dcgm_counter_config_path: "/etc" 14 | dependencies: "{{ rhel_dependencies if ansible_os_family == 'RedHat' else ubuntu_dependencies }}" 15 | ubuntu_dependencies: 16 | - wget 17 | - git 18 | - make 19 | - build-essential 20 | rhel_dependencies: 21 | - wget 22 | - git 23 | - make 24 | - gcc 25 | -------------------------------------------------------------------------------- /playbooks/roles/mpi-hostfiles/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: common.yml 2 | # when: ansible_os_family == 'RedHat' 3 | # 4 | #- include_tasks: ubuntu.yml 5 | # when: ansible_distribution == 'Ubuntu' 6 | # 7 | #- include_tasks: debian.yml 8 | # when: ansible_distribution == 'Debian' 9 | -------------------------------------------------------------------------------- /playbooks/roles/mpi-hostfiles/templates/hostfile_rdma.j2: -------------------------------------------------------------------------------- 1 | {% for item in groups['compute'] %} 2 | {{ hostvars[item]['ansible_hostname'] }}-rdma.local.rdma 3 | {% endfor %} 4 | -------------------------------------------------------------------------------- /playbooks/roles/mpi-hostfiles/templates/hostfile_tcp.j2: -------------------------------------------------------------------------------- 1 | {% for item in groups['compute'] %} 2 | {{ hostvars[item]['ansible_hostname'] }} 3 | {% endfor %} 4 | -------------------------------------------------------------------------------- /playbooks/roles/mpivars/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for mpivars 3 | -------------------------------------------------------------------------------- /playbooks/roles/mpivars/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for mpivars 3 | - include_tasks: ubuntu.yml 4 | when: ansible_distribution == 'Ubuntu' 5 | -------------------------------------------------------------------------------- /playbooks/roles/mpivars/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Get the openmpi version 3 | shell: 4 | cmd: ls /usr/mpi/gcc/ 5 | register: openmpi 6 | failed_when: false 7 | 8 | 9 | - name: Check if mpivars.sh exists 10 | stat: 11 | path: /usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/bin/mpivars.sh 12 | register: mpivars 13 | when: openmpi.stdout_lines | length > 0 14 | 15 | 16 | - name: Create mpivars.sh 17 | become: true 18 | template: 19 | src: mpivars.j2 20 | dest: /usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/bin/mpivars.sh 21 | force: yes 22 | owner: root 23 | group: root 24 | when: openmpi.stdout_lines | length > 0 and not mpivars.stat.exists -------------------------------------------------------------------------------- /playbooks/roles/mpivars/templates/mpivars.j2: -------------------------------------------------------------------------------- 1 | # NOTE: This is an automatically-generated file! (generated by the 2 | # Open MPI/SHMEM RPM). Any changes made here will be lost if the RPM is 3 | # uninstalled or upgraded. 
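# Each guard below only prepends this Open MPI tree to PATH, LD_LIBRARY_PATH and MANPATH when it is not already present.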
4 | 5 | # PATH 6 | if test -z "`echo $PATH | grep /usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/bin`"; then 7 | PATH=/usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/bin:${PATH} 8 | export PATH 9 | fi 10 | 11 | # LD_LIBRARY_PATH 12 | if test -z "`echo $LD_LIBRARY_PATH | grep /usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/lib`"; then 13 | LD_LIBRARY_PATH=/usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/lib${LD_LIBRARY_PATH:+:}${LD_LIBRARY_PATH} 14 | export LD_LIBRARY_PATH 15 | fi 16 | 17 | # MANPATH 18 | if test -z "`echo $MANPATH | grep /usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/share/man`"; then 19 | MANPATH=/usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/share/man:${MANPATH} 20 | export MANPATH 21 | fi 22 | 23 | # MPI_ROOT 24 | MPI_ROOT=/usr/mpi/gcc/{{ openmpi.stdout_lines[0] }} 25 | export MPI_ROOT -------------------------------------------------------------------------------- /playbooks/roles/mpivars/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # vars file for mpivars -------------------------------------------------------------------------------- /playbooks/roles/mysql/files/innodb.cnf: -------------------------------------------------------------------------------- 1 | [mysqld] 2 | innodb_buffer_pool_size=1024M 3 | innodb_log_file_size=64M 4 | innodb_lock_wait_timeout=900 5 | -------------------------------------------------------------------------------- /playbooks/roles/mysql/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # handler file for mariadb 3 | 4 | - name: restart mysqld 5 | become: true 6 | service: 7 | name: mysqld 8 | state: restarted 9 | 10 | - name: restart mariadb 11 | become: true 12 | service: 13 | name: mariadb 14 | state: restarted 15 | -------------------------------------------------------------------------------- /playbooks/roles/mysql/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: centos.yml 2 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' 3 | 4 | - include_tasks: el.yml 5 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' 6 | 7 | - include_tasks: debian.yml 8 | when: ansible_distribution == 'Ubuntu' 9 | 10 | - include_tasks: debian.yml 11 | when: ansible_distribution == 'Debian' 12 | -------------------------------------------------------------------------------- /playbooks/roles/mysql/templates/debian_mariadb_etc_my.cnf.j2: -------------------------------------------------------------------------------- 1 | [mysqld] 2 | datadir= {{ mariadb_db_path }} 3 | socket=/var/lib/mysql/mysql.sock 4 | symbolic-links=0 5 | default-authentication-plugin=mysql_native_password 6 | innodb_file_per_table=1 7 | 8 | [mysqld_safe] 9 | log-error=/var/log/mysql/error.log 10 | pid-file=/var/run/mysqld/mysqld.pid 11 | 12 | # 13 | # include all files from the config directory 14 | # 15 | !includedir /etc/my.cnf.d 16 | 17 | -------------------------------------------------------------------------------- /playbooks/roles/mysql/templates/debian_my.cnf.j2: -------------------------------------------------------------------------------- 1 | [client] 2 | socket=/var/run/mysqld/mysqld.sock 3 | user=root 4 | password={{ mysql_root_pwd }} 5 | 6 | [mysql] 7 | socket=/var/run/mysqld/mysqld.sock 8 | user=root 9 | password={{ mysql_root_pwd }} 10 | 11 | [mysqldump] 12 | socket=/var/run/mysqld/mysqld.sock 13 | user=root 14 | password={{ mysql_root_pwd }} 15 | 
-------------------------------------------------------------------------------- /playbooks/roles/mysql/templates/mariadb_etc_my.cnf.j2: -------------------------------------------------------------------------------- 1 | [mysqld] 2 | datadir= {{ mariadb_db_path }} 3 | socket=/var/lib/mysql/mysql.sock 4 | symbolic-links=0 5 | default-authentication-plugin=mysql_native_password 6 | innodb_file_per_table=1 7 | 8 | [mysqld_safe] 9 | log-error=/var/log/mariadb/mariadb.log 10 | pid-file=/var/run/mariadb/mariadb.pid 11 | 12 | # 13 | # include all files from the config directory 14 | # 15 | !includedir /etc/my.cnf.d 16 | 17 | -------------------------------------------------------------------------------- /playbooks/roles/mysql/templates/my.cnf.j2: -------------------------------------------------------------------------------- 1 | [client] 2 | socket=/var/lib/mysql/mysql.sock 3 | user=root 4 | password={{ mysql_root_pwd }} 5 | 6 | [mysql] 7 | socket=/var/lib/mysql/mysql.sock 8 | user=root 9 | password={{ mysql_root_pwd }} 10 | 11 | [mysqldump] 12 | socket=/var/lib/mysql/mysql.sock 13 | user=root 14 | password={{ mysql_root_pwd }} 15 | -------------------------------------------------------------------------------- /playbooks/roles/mysql/templates/mysqld_etc_my.cnf.j2: -------------------------------------------------------------------------------- 1 | [mysqld] 2 | symbolic-links=0 3 | default-authentication-plugin=mysql_native_password 4 | innodb_file_per_table=1 5 | 6 | datadir=/var/lib/mysql 7 | socket=/var/lib/mysql/mysql.sock 8 | 9 | log-error=/var/log/mysqld.log 10 | pid-file=/var/run/mysqld/mysqld.pid 11 | 12 | # 13 | # include all files from the config directory 14 | # 15 | !includedir /etc/my.cnf.d 16 | 17 | -------------------------------------------------------------------------------- /playbooks/roles/mysql/vars/ol_vars.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mariadb_packages: 3 | - mariadb 4 | - mariadb-server 5 | - MySQL-python 6 | 7 | mysql_packages: 8 | - mysql-community-server 9 | - mysql-community-client 10 | - MySQL-python 11 | 12 | mysql_packages_ol8: 13 | - mysql-server 14 | - mysql 15 | 16 | deb_mariadb_packages: 17 | - mariadb-server 18 | - mariadb-common 19 | - python-mysqldb 20 | - python3-pymysql 21 | # added above for mariadb 22 | 23 | mariadb_db_path: '/var/lib/mysql' 24 | mysql_db_path: '/var/lib/mysql' 25 | -------------------------------------------------------------------------------- /playbooks/roles/mysql/vars/ubuntu-2204_vars.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mariadb_packages: 3 | - mariadb 4 | - mariadb-server 5 | - MySQL-python 6 | 7 | mysql_packages: 8 | - mysql-community-server 9 | - mysql-community-client 10 | - MySQL-python 11 | 12 | mysql_packages_ol8: 13 | - mysql-server 14 | - mysql 15 | 16 | deb_mariadb_packages: 17 | - mariadb-server 18 | - mariadb-common 19 | - python3-mysqldb 20 | - python3-pymysql 21 | # added above for mariadb 22 | 23 | mariadb_db_path: '/var/lib/mysql' 24 | mysql_db_path: '/var/lib/mysql' 25 | -------------------------------------------------------------------------------- /playbooks/roles/mysql/vars/ubuntu_vars.yml: -------------------------------------------------------------------------------- 1 | --- 2 | mariadb_packages: 3 | - mariadb 4 | - mariadb-server 5 | - MySQL-python 6 | 7 | mysql_packages: 8 | - mysql-community-server 9 | - mysql-community-client 10 | - MySQL-python 11 | 12 | mysql_packages_ol8: 
13 | - mysql-server 14 | - mysql 15 | 16 | deb_mariadb_packages: 17 | - mariadb-server 18 | - mariadb-common 19 | - python-mysqldb 20 | - python3-pymysql 21 | # added above for mariadb 22 | 23 | mariadb_db_path: '/var/lib/mysql' 24 | mysql_db_path: '/var/lib/mysql' 25 | -------------------------------------------------------------------------------- /playbooks/roles/nccl-conf/templates/a100_b4.8: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=WARN 2 | NCCL_IGNORE_CPU_AFFINITY=1 3 | NCCL_IB_SL=0 4 | NCCL_IB_TC=41 5 | NCCL_IB_QPS_PER_CONNECTION=4 6 | NCCL_IB_GID_INDEX=3 7 | NCCL_IB_HCA==mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12 -------------------------------------------------------------------------------- /playbooks/roles/nccl-conf/templates/bm.gpu4.8: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=WARN 2 | NCCL_IGNORE_CPU_AFFINITY=1 3 | NCCL_IB_SL=0 4 | NCCL_IB_TC=41 5 | NCCL_IB_QPS_PER_CONNECTION=4 6 | NCCL_IB_GID_INDEX=3 7 | NCCL_IB_HCA==mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17 -------------------------------------------------------------------------------- /playbooks/roles/nccl-conf/templates/h100: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=WARN 2 | NCCL_CUMEM_ENABLE=0 3 | NCCL_IB_SPLIT_DATA_ON_QPS=0 4 | NCCL_IB_QPS_PER_CONNECTION=1 5 | NCCL_IB_GID_INDEX=3 6 | NCCL_IB_TC=41 7 | NCCL_IB_SL=0 8 | NCCL_IB_TIMEOUT=22 9 | NCCL_NET_PLUGIN={{hpcx_path}} 10 | NCCL_SOCKET_IFNAME=eth0 11 | NCCL_IGNORE_CPU_AFFINITY=1 12 | NCCL_IB_HCA==mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17 -------------------------------------------------------------------------------- /playbooks/roles/nccl-conf/templates/h200: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=WARN 2 | NCCL_CUMEM_ENABLE=0 3 | NCCL_IB_SPLIT_DATA_ON_QPS=0 4 | NCCL_IB_QPS_PER_CONNECTION=1 5 | NCCL_IB_GID_INDEX=3 6 | NCCL_IB_TC=41 7 | NCCL_IB_SL=0 8 | NCCL_IB_TIMEOUT=22 9 | NCCL_NET_PLUGIN={{hpcx_path}} 10 | NCCL_SOCKET_IFNAME=eth0 11 | NCCL_IGNORE_CPU_AFFINITY=1 12 | NCCL_IB_HCA==mlx5_0,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_9,mlx5_10,mlx5_11 -------------------------------------------------------------------------------- /playbooks/roles/nfs-client/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nfs_rpm_packages: 3 | - nfs-utils 4 | 5 | nfs_deb_packages: 6 | - nfs-common 7 | -------------------------------------------------------------------------------- /playbooks/roles/nfs-client/meta/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/nfs-client/meta/main.yml -------------------------------------------------------------------------------- /playbooks/roles/nfs-client/tasks/debian.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install nfs packages 3 | ansible.builtin.package: 4 | name: '{{ deb_packages }}' 5 | state: present 6 | 7 | - name: create share directory 8 | become: true 9 | file: 10 | path: "{{ local_path }}" 11 | state: directory 12 | owner: 
debian 13 | group: "{{privilege_group_name}}" 14 | mode: 0775 15 | 16 | - name: Mount NFS with options 17 | mount: 18 | path: "{{ local_path }}" 19 | src: "{{ export_host }}:{{ export_path | trim() }}" 20 | fstype: nfs 21 | opts: "{{ options }}" 22 | state: mounted 23 | when: options!="" 24 | - name: Mount NFS without options 25 | mount: 26 | path: "{{ local_path }}" 27 | src: "{{ export_host }}:{{ export_path | trim() }}" 28 | fstype: nfs 29 | state: mounted 30 | when: options=="" 31 | 32 | - name: make sure the permissions of the share directory are right 33 | become: true 34 | file: 35 | path: "{{ local_path }}" 36 | state: directory 37 | owner: debian 38 | group: "{{privilege_group_name}}" 39 | mode: 0775 -------------------------------------------------------------------------------- /playbooks/roles/nfs-client/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install nfs packages 3 | vars: 4 | package_name: '{{ nfs_rpm_packages }}' 5 | include_role: 6 | name: safe_yum 7 | 8 | - name: create share directory 9 | become: true 10 | file: 11 | path: "{{ local_path }}" 12 | state: directory 13 | owner: opc 14 | group: "{{privilege_group_name}}" 15 | mode: 0775 16 | 17 | - name: Mount NFS with options 18 | mount: 19 | path: "{{ local_path }}" 20 | src: "{{ export_host }}:{{ export_path | trim() }}" 21 | fstype: nfs 22 | opts: "{{ options }}" 23 | state: mounted 24 | when: options!="" 25 | - name: Mount NFS without options 26 | mount: 27 | path: "{{ local_path }}" 28 | src: "{{ export_host }}:{{ export_path | trim() }}" 29 | fstype: nfs 30 | state: mounted 31 | when: options=="" 32 | 33 | - name: make sure the permissions of the share directory are right 34 | become: true 35 | file: 36 | path: "{{ local_path }}" 37 | state: directory 38 | owner: opc 39 | group: "{{privilege_group_name}}" 40 | mode: 0775 -------------------------------------------------------------------------------- /playbooks/roles/nfs-client/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | 4 | - include_tasks: ubuntu.yml 5 | when: ansible_distribution == 'Ubuntu' 6 | 7 | - include_tasks: debian.yml 8 | when: ansible_distribution == 'Debian' 9 | -------------------------------------------------------------------------------- /playbooks/roles/nfs-client/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install nfs packages 3 | ansible.builtin.package: 4 | name: "{{ nfs_deb_packages }}" 5 | state: present 6 | 7 | - name: create share directory 8 | become: true 9 | file: 10 | path: "{{ local_path }}" 11 | state: directory 12 | owner: ubuntu 13 | group: "{{privilege_group_name}}" 14 | mode: 0775 15 | 16 | - name: Mount NFS with options 17 | mount: 18 | path: "{{ local_path }}" 19 | src: "{{ export_host }}:{{ export_path | trim() }}" 20 | fstype: nfs 21 | opts: "{{ options }}" 22 | state: mounted 23 | when: options!="" 24 | - name: Mount NFS without options 25 | mount: 26 | path: "{{ local_path }}" 27 | src: "{{ export_host }}:{{ export_path | trim() }}" 28 | fstype: nfs 29 | state: mounted 30 | when: options=="" 31 | 32 | - name: make sure the permissions of the share directory are right 33 | become: true 34 | file: 35 | path: "{{ local_path }}" 36 | state: directory 37 | owner: ubuntu 38 | group: "{{privilege_group_name}}" 39 | mode: 0775 
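For reference, a minimal sketch of a play that might consume this role on Ubuntu hosts; only the variable names (local_path, export_host, export_path, options, privilege_group_name) come from the tasks above, and the values shown are illustrative assumptions rather than settings shipped with the stack.

- hosts: compute
  become: true
  vars:
    export_host: controller            # assumed NFS server hostname
    export_path: /export/cluster       # assumed export path on that server
    local_path: /nfs/cluster           # mount point the role creates and mounts
    options: "defaults,noatime,bg"     # set to "" to use the no-options mount task
    privilege_group_name: privilege    # assumed group owning the mount point
  roles:
    - nfs-client

Because the tasks branch on options!="" versus options=="", the variable must always be defined, even if only as an empty string.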
-------------------------------------------------------------------------------- /playbooks/roles/nfs-client/vars/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/nfs-client/vars/main.yml -------------------------------------------------------------------------------- /playbooks/roles/nfs-server/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nfs_packages: 3 | - nfs-utils 4 | 5 | deb_packages: 6 | - nfs-kernel-server 7 | 8 | nfs_rpcbind_state: started 9 | nfs_rpcbind_enabled: true 10 | 11 | home_nfs_options: "rw,sync,no_root_squash" 12 | 13 | -------------------------------------------------------------------------------- /playbooks/roles/nfs-server/handlers/main.yml: -------------------------------------------------------------------------------- 1 | - name: reload_nfs 2 | service: 3 | name: nfs-server 4 | state: reloaded 5 | enabled: true 6 | 7 | - name: debian_reload_nfs 8 | service: 9 | name: nfs-kernel-server 10 | state: reloaded 11 | enabled: true 12 | 13 | -------------------------------------------------------------------------------- /playbooks/roles/nfs-server/meta/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/nfs-server/meta/main.yml -------------------------------------------------------------------------------- /playbooks/roles/nfs-server/tasks/debian.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ensure NFS utilities are installed. 3 | package: 4 | name: "{{ deb_packages }}" 5 | state: present 6 | 7 | - name: Start NFS server 8 | service: 9 | name: nfs-kernel-server 10 | state: started 11 | enabled: true 12 | 13 | - name: Ensure rpcbind is running as configured. 14 | service: 15 | name: rpcbind 16 | state: "{{ nfs_rpcbind_state }}" 17 | enabled: "{{ nfs_rpcbind_enabled }}" 18 | 19 | - name: create local storage directory 20 | file: 21 | path: "{{ local_path }}" 22 | state: directory 23 | owner: "{{ ansible_user }}" 24 | group: "{{ privilege_group_name }}" 25 | 26 | - name: create exports.d 27 | file: 28 | path: /etc/exports.d 29 | owner: root 30 | group: root 31 | state: directory 32 | 33 | - name: create exports file 34 | register: nfs_config 35 | notify: debian_reload_nfs 36 | template: 37 | src: exports.j2 38 | dest: "/etc/exports.d/{{ export_name }}.exports" 39 | 40 | - name: export folders 41 | command: 'exportfs -ra' 42 | 43 | -------------------------------------------------------------------------------- /playbooks/roles/nfs-server/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ensure NFS utilities are installed. 3 | vars: 4 | package_name: 5 | - nfs-utils 6 | package_state: present 7 | include_role: 8 | name: safe_yum 9 | ignore_errors: true 10 | 11 | - name: Start NFS server 12 | service: 13 | name: nfs-server 14 | state: started 15 | enabled: true 16 | 17 | - name: Ensure rpcbind is running as configured. 
18 | service: 19 | name: rpcbind 20 | state: "{{ nfs_rpcbind_state }}" 21 | enabled: "{{ nfs_rpcbind_enabled }}" 22 | 23 | - name: create local storage directory 24 | file: 25 | path: "{{ local_path }}" 26 | state: directory 27 | owner: "{{ ansible_user }}" 28 | group: "{{privilege_group_name}}" 29 | mode: '0775' 30 | 31 | - name: create exports file 32 | register: nfs_config 33 | notify: reload_nfs 34 | template: 35 | src: exports.j2 36 | dest: "/etc/exports.d/{{ export_name }}.exports" 37 | 38 | - name: export folders 39 | command: 'exportfs -ra' 40 | -------------------------------------------------------------------------------- /playbooks/roles/nfs-server/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | 4 | - include_tasks: ubuntu.yml 5 | when: ansible_distribution == 'Ubuntu' 6 | 7 | - include_tasks: debian.yml 8 | when: ansible_distribution == 'Debian' 9 | -------------------------------------------------------------------------------- /playbooks/roles/nfs-server/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ensure NFS utilities are installed. 3 | vars: 4 | package_name: 5 | - nfs-kernel-server 6 | include_role: 7 | name: safe_yum 8 | 9 | - name: Start NFS server 10 | service: 11 | name: nfs-kernel-server 12 | state: started 13 | enabled: true 14 | 15 | - name: Ensure rpcbind is running as configured. 16 | service: 17 | name: rpcbind 18 | state: "{{ nfs_rpcbind_state }}" 19 | enabled: "{{ nfs_rpcbind_enabled }}" 20 | 21 | - name: create local storage directory 22 | file: 23 | path: "{{ local_path }}" 24 | state: directory 25 | owner: "{{ ansible_user }}" 26 | group: "{{ privilege_group_name }}" 27 | 28 | - name: create exports.d 29 | file: 30 | path: /etc/exports.d 31 | owner: root 32 | group: root 33 | state: directory 34 | 35 | - name: create exports file 36 | register: nfs_config 37 | notify: reload_nfs 38 | template: 39 | src: exports.j2 40 | dest: "/etc/exports.d/{{ export_name }}.exports" 41 | 42 | - name: export folders 43 | command: 'exportfs -ra' 44 | -------------------------------------------------------------------------------- /playbooks/roles/nfs-server/templates/exports.j2: -------------------------------------------------------------------------------- 1 | {% if public_subnet is not defined %} 2 | {% set address = ansible_default_ipv4.network %} 3 | {% set mask = ansible_default_ipv4.netmask %} 4 | {% set network = address + '/' + mask %} 5 | {% set public_subnet = network | ipaddr('net') %} 6 | {% endif %} 7 | 8 | {{ local_path }} {{ public_subnet }}({{ home_nfs_options }}) 9 | {% if private_subnet is defined and private_subnet != public_subnet %} 10 | {{ local_path }} {{ private_subnet }}({{ home_nfs_options }}) 11 | {% endif %} 12 | -------------------------------------------------------------------------------- /playbooks/roles/nfs-server/vars/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/nfs-server/vars/main.yml -------------------------------------------------------------------------------- /playbooks/roles/no_instance_principal/defaults/main.yml: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/no_instance_principal/defaults/main.yml -------------------------------------------------------------------------------- /playbooks/roles/no_instance_principal/meta/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/no_instance_principal/meta/main.yml -------------------------------------------------------------------------------- /playbooks/roles/no_instance_principal/tasks/common.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: create .oci directory 3 | become: true 4 | file: 5 | path: /home/{{ ansible_user }}/.oci 6 | state: directory 7 | owner: "{{ ansible_user }}" 8 | group: "{{ ansible_user }}" 9 | mode: 0775 10 | 11 | - name: Generate config file 12 | become: true 13 | template: 14 | src: 'config.j2' 15 | dest: '/home/{{ ansible_user }}/.oci/config' 16 | mode: 0600 17 | owner: "{{ ansible_user }}" 18 | group: "{{ ansible_user }}" 19 | 20 | - name: delete --auth in create_cluster.sh 21 | become: true 22 | replace: 23 | path: /opt/oci-hpc/bin/create_cluster.sh 24 | regexp: '--auth instance_principal' 25 | replace: '' -------------------------------------------------------------------------------- /playbooks/roles/no_instance_principal/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: common.yml 2 | 3 | 4 | -------------------------------------------------------------------------------- /playbooks/roles/no_instance_principal/templates/config.j2: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | user={{ api_user_ocid }} 3 | fingerprint={{ api_fingerprint }} 4 | tenancy={{ tenancy_ocid}} 5 | region={{ region }} 6 | key_file=/opt/oci-hpc/autoscaling/credentials/key.pem -------------------------------------------------------------------------------- /playbooks/roles/no_instance_principal/vars/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/no_instance_principal/vars/main.yml -------------------------------------------------------------------------------- /playbooks/roles/nvidia-container/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | libnvidia_container_tools_package_version: '' 3 | libnvidia_container_tools_package_state: present 4 | 5 | libnvidia_container_repo_base_url: "https://nvidia.github.io/libnvidia-container" 6 | libnvidia_container_repo_gpg_url: "{{ libnvidia_container_repo_base_url }}/gpgkey" 7 | -------------------------------------------------------------------------------- /playbooks/roles/nvidia-container/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - include_tasks: oraclelinux-7.yml 4 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '7' 5 | 6 | #- include_tasks: centos-7.yml 7 | # when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' and ansible_distribution_major_version == '7' 8 | 9 | - include_tasks: ubuntu.yml 10 | when: ansible_distribution == 
'Ubuntu' 11 | -------------------------------------------------------------------------------- /playbooks/roles/nvidia-container/tasks/oraclelinux-7.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - block: 3 | 4 | 5 | - name: set mydistribution 6 | ansible.builtin.set_fact: 7 | mydistribution: "{{ 'rhel' if (ansible_distribution == 'OracleLinux') else (ansible_distribution | lower) }}{{ ansible_distribution_version }}" 8 | 9 | - name: mydistribution 10 | debug: 11 | msg: "{{ mydistribution }}" 12 | 13 | - name: Download nvidia-docker.repo. 14 | get_url: 15 | url: "https://nvidia.github.io/nvidia-docker/{{ mydistribution }}/nvidia-docker.repo " 16 | dest: "/etc/yum.repos.d/nvidia-docker.repo" 17 | mode: '0644' 18 | owner: root 19 | group: root 20 | force: yes 21 | 22 | - name: clean 23 | command: yum clean expire-cache 24 | args: 25 | warn: no 26 | 27 | - name: Install nvidia-container-toolkit 28 | vars: 29 | package_name: 30 | - nvidia-container-toolkit 31 | package_state: latest 32 | package_cache: true 33 | include_role: 34 | name: safe_yum 35 | 36 | - name: Start Docker 37 | ansible.builtin.service: 38 | name: "docker" 39 | state: restarted 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /playbooks/roles/nvidia-container/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: add key 4 | apt_key: 5 | url: "{{ libnvidia_container_repo_gpg_url }}" 6 | state: present 7 | 8 | - name: add repo 9 | get_url: 10 | url: "{{ libnvidia_container_repo_base_url }}/{{ ubuntu_repo_dist_name }}/{{ ubuntu_repo_file_name }}" 11 | dest: "{{ ubuntu_repo_file_path }}" 12 | mode: 0644 13 | owner: root 14 | group: root 15 | 16 | 17 | - name: install packages 18 | vars: 19 | package_name: 20 | - libnvidia-container-tools{{ libnvidia_container_tools_package_version | ternary("="+libnvidia_container_tools_package_version, "") }} 21 | package_state: "{{ libnvidia_container_tools_package_state }}" 22 | package_cache: true 23 | include_role: 24 | name: safe_yum 25 | 26 | - name: Install nvidia-container-toolkit 27 | vars: 28 | package_name: 29 | - nvidia-container-toolkit 30 | package_state: latest 31 | package_cache: true 32 | include_role: 33 | name: safe_yum 34 | 35 | - name: Start Docker 36 | ansible.builtin.service: 37 | name: "docker" 38 | state: restarted 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /playbooks/roles/nvidia-container/templates/templates.j2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/nvidia-container/templates/templates.j2 -------------------------------------------------------------------------------- /playbooks/roles/nvidia-container/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | ubuntu_repo_dist_name: "{{ ansible_distribution | lower }}{{ ansible_distribution_version }}" 3 | ubuntu_repo_file_name: "libnvidia-container.list" 4 | ubuntu_repo_file_path: "/etc/apt/sources.list.d/{{ ubuntu_repo_file_name }}" 5 | -------------------------------------------------------------------------------- /playbooks/roles/nvidia-enroot/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - include_tasks: 
oraclelinux.yml 4 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' 5 | 6 | #- include_tasks: centos-7.yml 7 | # when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' and ansible_distribution_major_version == '7' 8 | 9 | - include_tasks: ubuntu.yml 10 | when: ansible_os_family == 'Debian' and ansible_distribution == 'Ubuntu' -------------------------------------------------------------------------------- /playbooks/roles/nvidia_peermem/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for nvidia_peermem 3 | -------------------------------------------------------------------------------- /playbooks/roles/nvidia_peermem/tasks/common.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Check if it's a GPU shape 3 | shell: 4 | cmd: "curl -sH \"Authorization: Bearer Oracle\" -L http://169.254.169.254/opc/v2/instance/ | jq .shape | grep GPU" 5 | register: shape_gpu 6 | failed_when: false 7 | 8 | - name: Check if nvidia drivers are installed 9 | shell: cat /sys/module/nvidia/version | wc -l 10 | register: nvidia 11 | when: shape_gpu.stdout != "" 12 | 13 | - name: Check if nvidia_peermem module is loaded 14 | shell: lsmod | grep nvidia_peermem | wc -l 15 | register: result 16 | when: shape_gpu.stdout != "" and nvidia.stdout == '1' 17 | 18 | - name: Check ofed version 19 | shell: 20 | cmd: | 21 | /usr/bin/ofed_info |grep MLNX_OFED_LINUX|grep -v rpm|awk -F "(" '{print $2}'|cut -c 6-|awk -F "-" '{print $1}' 22 | register: ofed_version_local 23 | when: shape_gpu.stdout != "" and nvidia.stdout == '1' 24 | 25 | - name: Load nvidia_peermem module 26 | become: true 27 | shell: modprobe nvidia_peermem 28 | when: shape_gpu.stdout != "" and nvidia.stdout == '1' and result.stdout != '3' and ofed_version_local.stdout is version('5.1', '>=') -------------------------------------------------------------------------------- /playbooks/roles/nvidia_peermem/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for nvidia_peermem 3 | - include_tasks: common.yml -------------------------------------------------------------------------------- /playbooks/roles/nvidia_peermem/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # vars file for nvidia_peermem 3 | -------------------------------------------------------------------------------- /playbooks/roles/oci-cloud-agent-updater/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install yum-plugin-versionlock for OL8 3 | yum: 4 | name: python3-dnf-plugin-versionlock 5 | state: latest 6 | disable_gpg_check: yes 7 | when: 8 | - ansible_os_family == 'RedHat' and ansible_distribution_major_version == '8' 9 | 10 | - name: Hold latest OCA for OL 11 | community.general.yum_versionlock: 12 | name: oracle-cloud-agent 13 | state: present 14 | when: 15 | - ansible_os_family == 'RedHat' 16 | 17 | - name: Check if the updater file exists 18 | stat: 19 | path: /etc/oracle-cloud-agent/updater.yml 20 | register: updater_exist 21 | 22 | - name: Disable OCA updater 23 | replace: 24 | path: /etc/oracle-cloud-agent/updater.yml 25 | regexp: 'upgrade_interval: 3600' 26 | replace: 'upgrade_interval: -1' 27 | when: updater_exist.stat.exists 28 | 29 | - name: Restart cloud agent updater 30 | service: 31 | name: oracle-cloud-agent-updater 32 |
state: restarted 33 | retries: 5 34 | register: restart_cloud_agent_updater 35 | until: restart_cloud_agent_updater is not failed 36 | when: updater_exist.stat.exists 37 | ignore_errors: yes 38 | -------------------------------------------------------------------------------- /playbooks/roles/oci-cloud-agent-updater/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' -------------------------------------------------------------------------------- /playbooks/roles/oci-cloud-agent-updater/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Hold Oracle cloud-agent 3 | shell: "sudo snap refresh --hold=forever oracle-cloud-agent" 4 | when : ansible_distribution == 'Ubuntu' 5 | ignore_errors: yes 6 | 7 | - name: Check if the updater file exists 8 | stat: 9 | path: /etc/oracle-cloud-agent/updater.yml 10 | register: updater_exist 11 | 12 | - name: Disable OCA updater 13 | replace: 14 | path: /etc/oracle-cloud-agent/updater.yml 15 | regexp: 'upgrade_interval: 3600' 16 | replace: 'upgrade_interval: -1' 17 | when: updater_exist.stat.exists 18 | 19 | 20 | - name: Restart cloud agent updater 21 | service: 22 | name: snap.oracle-cloud-agent.oracle-cloud-agent-updater.service 23 | state: restarted 24 | retries: 5 25 | register: restart_cloud_agent_updater 26 | until: restart_cloud_agent_updater is not failed 27 | when: updater_exist.stat.exists 28 | ignore_errors: yes 29 | 30 | # oracle-cloud-agent runs by default on ubuntu 31 | # However it doesn't have osms (OS Management Service) which we had to disable on Oracle Linux 32 | # sudo snap list | grep oracle-cloud-agent 33 | # to check for version 34 | # snap info oracle-cloud-agent -------------------------------------------------------------------------------- /playbooks/roles/oci-cloud-agent/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' -------------------------------------------------------------------------------- /playbooks/roles/oci-cloud-agent/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: call oracle cloud agent updater 3 | include_role: 4 | name: oci-cloud-agent-updater 5 | 6 | # oracle-cloud-agent runs by default on ubuntu 7 | # However it doesn't have osms (OS Management Service) which we had to disable on Oracle Linux 8 | # sudo snap list | grep oracle-cloud-agent 9 | # to check for version 10 | # snap info oracle-cloud-agent -------------------------------------------------------------------------------- /playbooks/roles/oci-cn-auth/defaults/main.yml: -------------------------------------------------------------------------------- 1 | version: 2.1.4 2 | download_link: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/F7gihhVuJbrnsV8KjAMA7XblkZYRBYJ2xAH2FPmaIJrgtYcuy5wJRWAQXMfw9hLD/n/hpc/b/source/o/ -------------------------------------------------------------------------------- /playbooks/roles/oci-cn-auth/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Check the oci-cn-auth version 
3 | shell: cat /opt/oci-hpc/oci-cn-auth/.version-oci_cn_auth | awk -F- '{print $1}' 4 | register: current_version 5 | 6 | - name: Download oci-cn-auth .rpm if the current version is lower 7 | get_url: 8 | url: "{{download_link}}oci-cn-auth-{{version}}-compute.el{{ansible_distribution_major_version}}.noarch.rpm" 9 | dest: "/tmp/" 10 | when: current_version.stdout < version 11 | 12 | - name: Install oci-cn-auth .rpm if the current version is lower 13 | vars: 14 | package_name: 15 | - "/tmp/oci-cn-auth-{{version}}-compute.el{{ansible_distribution_major_version}}.noarch.rpm" 16 | package_state: present 17 | include_role: 18 | name: safe_yum 19 | when: current_version.stdout < version 20 | 21 | - name: Restart the OCI CN AUTH service 22 | become: true 23 | service: 24 | name: oci-cn-auth 25 | state: restarted 26 | enabled: yes 27 | when: current_version.stdout < version -------------------------------------------------------------------------------- /playbooks/roles/oci-cn-auth/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - include_tasks: el.yml 3 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' 4 | 5 | - include_tasks: ubuntu.yml 6 | when: ansible_os_family == 'Debian' and ansible_distribution == 'Ubuntu' -------------------------------------------------------------------------------- /playbooks/roles/oci-cn-auth/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Check the oci-cn-auth version 3 | shell: cat /opt/oci-hpc/oci-cn-auth/.version-oci_cn_auth | awk -F- '{print $1}' 4 | register: current_version 5 | 6 | - name: Download oci-cn-auth .deb if the current version is lower 7 | get_url: 8 | url: "{{download_link}}oci-cn-auth_{{version}}-compute_all.deb" 9 | dest: "/tmp/" 10 | when: current_version.stdout < version 11 | 12 | - name: Install oci-cn-auth .deb if the current version is lower 13 | vars: 14 | deb_name: 15 | - "/tmp/oci-cn-auth_{{version}}-compute_all.deb" 16 | package_state: present 17 | include_role: 18 | name: safe_yum 19 | when: current_version.stdout < version 20 | 21 | - name: Restart the OCI CN AUTH service 22 | become: true 23 | service: 24 | name: oci-cn-auth 25 | state: restarted 26 | enabled: yes 27 | when: current_version.stdout < version -------------------------------------------------------------------------------- /playbooks/roles/oci-hostname/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Update /etc/oci-hostname.conf 4 | replace: 5 | path: /etc/oci-hostname.conf 6 | regexp: '^PRESERVE_HOSTINFO=.*' 7 | replace: 'PRESERVE_HOSTINFO=2' 8 | async: 30 9 | poll: 2 10 | -------------------------------------------------------------------------------- /playbooks/roles/oci-hostname/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | -------------------------------------------------------------------------------- /playbooks/roles/oci-legacy/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Check if oci-hpc bashrc config exists 3 | stat: 4 | path: /etc/opt/oci-hpc/bashrc/.bashrc_config 5 | register: bashrc_config 6 | 7 | - name: update NFS_CONFIGURED if needed 8 | lineinfile: 9 | dest: /etc/opt/oci-hpc/bashrc/.bashrc_config 10 | state: present 11 | regexp: 'export 
NFS_CONFIGURED="no"' 12 | line: 'export NFS_CONFIGURED="yes"' 13 | when: bashrc_config.stat.exists|bool and cluster_nfs|bool 14 | 15 | - name: Update NFS_MOUNT_PATH 16 | lineinfile: 17 | path: /etc/opt/oci-hpc/bashrc/.bashrc_config 18 | regexp: '^export NFS_MOUNT_PATH' 19 | line: 'export NFS_MOUNT_PATH="{{ cluster_nfs_path }}"' 20 | when: bashrc_config.stat.exists|bool and cluster_nfs|bool 21 | -------------------------------------------------------------------------------- /playbooks/roles/oci-legacy/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' -------------------------------------------------------------------------------- /playbooks/roles/oci-legacy/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Check if oci-hpc bashrc config exists 3 | stat: 4 | path: /etc/opt/oci-hpc/bashrc/.bashrc_config 5 | register: bashrc_config 6 | 7 | - name: update NFS_CONFIGURED if needed 8 | lineinfile: 9 | dest: /etc/opt/oci-hpc/bashrc/.bashrc_config 10 | state: present 11 | regexp: 'export NFS_CONFIGURED="no"' 12 | line: 'export NFS_CONFIGURED="yes"' 13 | when: bashrc_config.stat.exists|bool and cluster_nfs|bool 14 | 15 | - name: Update NFS_MOUNT_PATH 16 | lineinfile: 17 | path: /etc/opt/oci-hpc/bashrc/.bashrc_config 18 | regexp: '^export NFS_MOUNT_PATH' 19 | line: 'export NFS_MOUNT_PATH="{{ cluster_nfs_path }}"' 20 | when: bashrc_config.stat.exists|bool and cluster_nfs|bool 21 | -------------------------------------------------------------------------------- /playbooks/roles/oom-adjust/defaults/main.yml: -------------------------------------------------------------------------------- 1 | oom_services: 2 | - nvidia-dcgm.service 3 | - nvidia-fabricmanager.service 4 | - nvidia-hibernate.service 5 | - nvidia-persistenced.service 6 | - nvidia-resume.service 7 | - nvidia-suspend.service 8 | - oci-hpc-nvidia-gpu-configure.service 9 | - slurmd.service 10 | - slurmctld.service 11 | - slurmdbd.service 12 | - munge.service 13 | - sshd.service 14 | - sssd.service 15 | -------------------------------------------------------------------------------- /playbooks/roles/oom-adjust/files/oom-adjust.conf: -------------------------------------------------------------------------------- 1 | [Service] 2 | OOMScoreAdjust=-500 3 | 4 | -------------------------------------------------------------------------------- /playbooks/roles/oom-adjust/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Gather service facts 3 | ansible.builtin.service_facts: 4 | 5 | - name: Filter existing services 6 | ansible.builtin.set_fact: 7 | existing_services: "{{ oom_services | select('in', ansible_facts.services.keys()) | list }}" 8 | 9 | - name: Ensure directories exist for systemd overrides 10 | become: true 11 | ansible.builtin.file: 12 | path: "/etc/systemd/system/{{ item }}.d" 13 | state: directory 14 | owner: root 15 | group: root 16 | mode: '0755' 17 | with_items: "{{ existing_services }}" 18 | 19 | - name: Copy oom-adjust.conf to the override directory 20 | become: true 21 | ansible.builtin.copy: 22 | src: oom-adjust.conf 23 | dest: "/etc/systemd/system/{{ item }}.d/override.conf" 24 | owner: root 25 | group: root 26 | mode: '0644' 27 | with_items: "{{ existing_services }}" 28 | 29 | - name: Reload systemd daemon 30 | 
become: true 31 | ansible.builtin.systemd: 32 | daemon_reload: true 33 | -------------------------------------------------------------------------------- /playbooks/roles/openldap/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # common vars file for openldap 3 | 4 | openldap_server_dir_path: /var/lib/ldap 5 | 6 | openldap_endpoints: ldaps:/// ldapi:/// 7 | openldap_host: 'controller.cluster' 8 | 9 | ssl_cert_path: '/etc/ssl/certs' 10 | ssl_cert_group: '{{ openldap_default_user }}' 11 | 12 | openldap_tls_cacrt: '{{ ssl_ca_cert }}' 13 | openldap_tls_crt: '{{ ssl_cert_path }}/{{ ansible_fqdn }}.crt' 14 | openldap_tls_key: '{{ ssl_cert_path }}/{{ ansible_fqdn }}.key' 15 | 16 | openldap_schemas: 17 | - cosine 18 | - inetorgperson 19 | - rfc2307bis 20 | - autoinc 21 | - ppolicy 22 | 23 | openldap_schemas_22: 24 | - cosine 25 | - inetorgperson 26 | - rfc2307bis 27 | - autoinc 28 | -------------------------------------------------------------------------------- /playbooks/roles/openldap/files/autoinc.ldif: -------------------------------------------------------------------------------- 1 | 2 | # import required schemas 3 | # uidNumber Attribute Auto-Incrementing 4 | # http://www.rexconsulting.net/ldap-protocol-uidNumber.html 5 | dn: cn=autoinc,cn=schema,cn=config 6 | changetype: add 7 | objectClass: olcSchemaConfig 8 | cn: autoinc 9 | olcObjectClasses: ( 1.3.6.1.4.1.23277.2.2.2.8 10 | NAME 'uidNext' 11 | DESC 'Where we get the next uidNumber from' 12 | STRUCTURAL MUST ( cn $ uidNumber ) ) 13 | 14 | -------------------------------------------------------------------------------- /playbooks/roles/openldap/files/debian_memberof.ldif: -------------------------------------------------------------------------------- 1 | dn: cn=module{0},cn=config 2 | changetype: modify 3 | add: olcModuleLoad 4 | olcModuleLoad: memberof 5 | 6 | dn: olcOverlay={0}memberof,olcDatabase={1}mdb,cn=config 7 | changetype: add 8 | objectClass: olcConfig 9 | objectClass: olcMemberOf 10 | objectClass: olcOverlayConfig 11 | objectClass: top 12 | olcOverlay: {0}memberof 13 | olcMemberOfDangling: ignore 14 | olcMemberOfRefInt: TRUE 15 | olcMemberOfGroupOC: groupOfMembers 16 | olcMemberOfMemberAD: member 17 | olcMemberOfMemberOfAD: memberOf 18 | 19 | dn: cn=module{0},cn=config 20 | changetype: modify 21 | add: olcModuleLoad 22 | olcModuleLoad: refint 23 | 24 | dn: olcOverlay={1}refint,olcDatabase={1}mdb,cn=config 25 | changetype: add 26 | objectClass: olcConfig 27 | objectClass: olcOverlayConfig 28 | objectClass: olcRefintConfig 29 | objectClass: top 30 | olcOverlay: {1}refint 31 | olcRefintAttribute: memberof member 32 | 33 | -------------------------------------------------------------------------------- /playbooks/roles/openldap/files/debian_ppolicy.ldif: -------------------------------------------------------------------------------- 1 | dn: cn=module{0},cn=config 2 | changetype: modify 3 | add: olcModuleLoad 4 | olcModuleLoad: ppolicy 5 | 6 | dn: olcOverlay={0}ppolicy,olcDatabase={1}mdb,cn=config 7 | changetype: add 8 | objectClass: olcConfig 9 | objectClass: olcPPolicyConfig 10 | objectClass: olcOverlayConfig 11 | objectClass: top 12 | olcOverlay: ppolicy 13 | olcPPolicyDefault: cn=pwdDefaultPolicy,ou=Policies,dc=local 14 | olcPPolicyHashCleartext: FALSE 15 | olcPPolicyUseLockout: FALSE 16 | olcPPolicyForwardUpdates: FALSE -------------------------------------------------------------------------------- /playbooks/roles/openldap/files/el_memberof.ldif: 
-------------------------------------------------------------------------------- 1 | dn: cn=module{0},cn=config 2 | changetype: modify 3 | add: olcModuleLoad 4 | olcModuleLoad: memberof 5 | 6 | dn: olcOverlay={0}memberof,olcDatabase={2}hdb,cn=config 7 | changetype: add 8 | objectClass: olcConfig 9 | objectClass: olcMemberOf 10 | objectClass: olcOverlayConfig 11 | objectClass: top 12 | olcOverlay: {0}memberof 13 | olcMemberOfDangling: ignore 14 | olcMemberOfRefInt: TRUE 15 | olcMemberOfGroupOC: groupOfMembers 16 | olcMemberOfMemberAD: member 17 | olcMemberOfMemberOfAD: memberOf 18 | 19 | dn: cn=module{0},cn=config 20 | changetype: modify 21 | add: olcModuleLoad 22 | olcModuleLoad: refint 23 | 24 | dn: olcOverlay={1}refint,olcDatabase={2}hdb,cn=config 25 | changetype: add 26 | objectClass: olcConfig 27 | objectClass: olcOverlayConfig 28 | objectClass: olcRefintConfig 29 | objectClass: top 30 | olcOverlay: {1}refint 31 | olcRefintAttribute: memberof member -------------------------------------------------------------------------------- /playbooks/roles/openldap/files/el_memberof_ol8.ldif: -------------------------------------------------------------------------------- 1 | dn: cn=module{0},cn=config 2 | changetype: modify 3 | add: olcModuleLoad 4 | olcModuleLoad: memberof 5 | 6 | dn: olcOverlay={0}memberof,olcDatabase={2}mdb,cn=config 7 | changetype: add 8 | objectClass: olcConfig 9 | objectClass: olcMemberOf 10 | objectClass: olcOverlayConfig 11 | objectClass: top 12 | olcOverlay: {0}memberof 13 | olcMemberOfDangling: ignore 14 | olcMemberOfRefInt: TRUE 15 | olcMemberOfGroupOC: groupOfMembers 16 | olcMemberOfMemberAD: member 17 | olcMemberOfMemberOfAD: memberOf 18 | 19 | dn: cn=module{0},cn=config 20 | changetype: modify 21 | add: olcModuleLoad 22 | olcModuleLoad: refint 23 | 24 | dn: olcOverlay={1}refint,olcDatabase={2}mdb,cn=config 25 | changetype: add 26 | objectClass: olcConfig 27 | objectClass: olcOverlayConfig 28 | objectClass: olcRefintConfig 29 | objectClass: top 30 | olcOverlay: {1}refint 31 | olcRefintAttribute: memberof member 32 | 33 | -------------------------------------------------------------------------------- /playbooks/roles/openldap/files/el_ppolicy.ldif: -------------------------------------------------------------------------------- 1 | dn: cn=module{0},cn=config 2 | changetype: add 3 | objectClass: olcModuleList 4 | cn: module{0} 5 | olcModuleLoad: ppolicy 6 | 7 | dn: olcOverlay={0}ppolicy,olcDatabase={2}hdb,cn=config 8 | changetype: add 9 | objectClass: olcConfig 10 | objectClass: olcPPolicyConfig 11 | objectClass: olcOverlayConfig 12 | objectClass: top 13 | olcOverlay: ppolicy 14 | olcPPolicyDefault: cn=pwdDefaultPolicy,ou=Policies,dc=local 15 | olcPPolicyHashCleartext: FALSE 16 | olcPPolicyUseLockout: FALSE 17 | olcPPolicyForwardUpdates: FALSE -------------------------------------------------------------------------------- /playbooks/roles/openldap/files/el_ppolicy_ol8.ldif: -------------------------------------------------------------------------------- 1 | dn: cn=module{0},cn=config 2 | changetype: add 3 | objectClass: olcModuleList 4 | cn: module{0} 5 | olcModuleLoad: ppolicy 6 | 7 | dn: olcOverlay={0}ppolicy,olcDatabase={2}mdb,cn=config 8 | changetype: add 9 | objectClass: olcPPolicyConfig 10 | objectClass: olcOverlayConfig 11 | objectClass: top 12 | olcOverlay: ppolicy 13 | olcPPolicyDefault: cn=pwdDefaultPolicy,ou=Policies,dc=local 14 | olcPPolicyHashCleartext: FALSE 15 | olcPPolicyUseLockout: FALSE 16 | olcPPolicyForwardUpdates: FALSE 17 | 18 | 
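The memberof and ppolicy LDIFs above all target the cn=config database (the {2}hdb backend on EL7, {2}mdb on OL8, {1}mdb on Debian/Ubuntu). The role tasks that actually load them (el.yml/debian.yml) are not inlined here, so the following is only an illustrative sketch of how such overlay LDIFs are conventionally applied on the LDAP server; the file path is an assumption, not taken from the role.
# Sketch only: apply an overlay LDIF to the local slapd config database as root.
# ldapmodify honours the changetype directives in the file; -Y EXTERNAL over the
# ldapi:/// socket authenticates as the local root identity.
sudo ldapmodify -Y EXTERNAL -H ldapi:/// -f /path/to/el_memberof_ol8.ldif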
-------------------------------------------------------------------------------- /playbooks/roles/openldap/files/local_schema.ldif: -------------------------------------------------------------------------------- 1 | dn: dc=local 2 | dc: local 3 | objectClass: domain 4 | 5 | dn: ou=People,dc=local 6 | ou: People 7 | objectClass: top 8 | objectClass: organizationalUnit 9 | 10 | dn: ou=Group,dc=local 11 | ou: Group 12 | objectClass: top 13 | objectClass: organizationalUnit 14 | 15 | dn: cn=uid,dc=local 16 | cn: uid 17 | objectClass: uidNext 18 | uidNumber: 1050 19 | 20 | dn: cn=gid,dc=local 21 | cn: gid 22 | objectClass: uidNext 23 | uidNumber: 150 24 | 25 | dn: ou=Policies,dc=local 26 | ou: Policies 27 | objectClass: organizationalUnit 28 | 29 | dn: cn=pwdDefaultPolicy,ou=Policies,dc=local 30 | objectClass: pwdPolicy 31 | objectClass: person 32 | objectClass: top 33 | cn: pwdDefaultPolicy 34 | sn: pwdDefaultPolicy 35 | pwdAttribute: userPassword 36 | pwdCheckQuality: 1 37 | pwdMinAge: 0 38 | pwdMaxAge: 0 39 | pwdMinLength: 8 40 | pwdInHistory: 2 41 | pwdMaxFailure: 3 42 | pwdFailureCountInterval: 0 43 | pwdLockout: FALSE 44 | pwdLockoutDuration: 0 45 | pwdAllowUserChange: TRUE 46 | pwdExpireWarning: 0 47 | pwdGraceAuthNLimit: 0 48 | pwdMustChange: FALSE 49 | pwdSafeModify: FALSE 50 | 51 | -------------------------------------------------------------------------------- /playbooks/roles/openldap/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # handler file for openldap 3 | 4 | - name: restart openldap 5 | service: 6 | name: slapd 7 | state: restarted 8 | enabled: yes 9 | when: primary|default(True) -------------------------------------------------------------------------------- /playbooks/roles/openldap/meta/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/openldap/meta/main.yml -------------------------------------------------------------------------------- /playbooks/roles/openldap/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_vars: el_vars.yml 2 | when: ansible_os_family == 'RedHat' 3 | 4 | - include_vars: debian_vars.yml 5 | when: ansible_distribution == 'Ubuntu' 6 | 7 | - include_tasks: el.yml 8 | when: ansible_os_family == 'RedHat' 9 | 10 | #- include_tasks: el-8.yml 11 | # when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '8' 12 | 13 | - include_tasks: debian.yml 14 | when: ansible_distribution == 'Ubuntu' -------------------------------------------------------------------------------- /playbooks/roles/openldap/templates/config.ldif.j2: -------------------------------------------------------------------------------- 1 | # Global parameters 2 | dn: cn=config 3 | changetype: modify 4 | replace: olcTLSCACertificateFile 5 | olcTLSCACertificateFile: {{ openldap_tls_cacrt }} 6 | - 7 | replace: olcTLSCertificateKeyFile 8 | olcTLSCertificateKeyFile: {{ openldap_tls_key }} 9 | - 10 | replace: olcTLSCertificateFile 11 | olcTLSCertificateFile: {{ openldap_tls_crt }} 12 | - 13 | replace: olcConnMaxPendingAuth 14 | olcConnMaxPendingAuth: 20000 15 | - 16 | replace: olcConnMaxPending 17 | olcConnMaxPending: 5000 18 | - 19 | replace: olcIdleTimeout 20 | olcIdleTimeout: 120 21 | - 22 | replace: olcWriteTimeout 23 | olcWriteTimeout: 120 24 | - 25 | replace: olcTimeLimit 26 | olcTimeLimit: 60 
27 | 28 | # config backend: access configuration 29 | dn: olcDatabase={0}config,cn=config 30 | changetype: modify 31 | replace: olcRootDN 32 | olcRootDN: cn=config 33 | - 34 | replace: olcRootPW 35 | olcRootPW: {{ openldap_root_pwd_hash.stdout }} 36 | -------------------------------------------------------------------------------- /playbooks/roles/openldap/vars/debian_vars.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # vars file for openldap 3 | 4 | openldap_packages: 5 | - slapd 6 | - ldap-utils 7 | - openssl 8 | - libsasl2-dev 9 | - libldap2-dev 10 | - libssl-dev 11 | - python3-pexpect 12 | - rpcbind 13 | - nscd 14 | - libpam-ldap 15 | - libnss-ldapd 16 | 17 | openldap_default_user: openldap 18 | openldap_default_group: openldap 19 | 20 | openldap_server_conf_path: /etc/ldap/slapd.d 21 | openldap_server_defaults_file: /etc/default/slapd -------------------------------------------------------------------------------- /playbooks/roles/openldap/vars/el_vars.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # vars file for openldap 3 | 4 | openldap_packages: 5 | - openldap-servers 6 | - openldap-clients 7 | - rsync 8 | 9 | openldap_packages_ol8: 10 | - openldap-2.4.46-18.el8.x86_64 11 | - openldap-clients-2.4.46-18.el8.x86_64 12 | - rsync 13 | 14 | openldap_default_user: ldap 15 | openldap_default_group: ldap 16 | 17 | openldap_server_conf_path: /etc/openldap/slapd.d 18 | openldap_server_defaults_file: /etc/sysconfig/slapd 19 | -------------------------------------------------------------------------------- /playbooks/roles/packages/tasks/centos-7.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Make sure python OpenSSL and parallel ssh is installed 3 | vars: 4 | package_name: 5 | - pyOpenSSL 6 | - python2-cryptography 7 | - pssh 8 | - pdsh 9 | - python3-pip 10 | package_state: latest 11 | include_role: 12 | name: safe_yum 13 | 14 | - name: Upgrade Pip3 15 | become: true 16 | pip: 17 | name: [pip] 18 | state: latest 19 | executable: pip3 20 | ignore_errors: yes -------------------------------------------------------------------------------- /playbooks/roles/packages/tasks/debian.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Make sure python OpenSSL and parallel ssh is installed 3 | vars: 4 | package_name: 5 | - python-openssl 6 | - python-cryptography 7 | - parted 8 | - pssh 9 | - pdsh 10 | - jq 11 | - python3-pip 12 | package_state: latest 13 | include_role: 14 | name: safe_yum 15 | 16 | - name: Upgrade Pip3 17 | become: true 18 | pip: 19 | name: [pip] 20 | state: latest 21 | executable: pip3 22 | ignore_errors: yes -------------------------------------------------------------------------------- /playbooks/roles/packages/tasks/el-7.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Make sure python OpenSSL and parallel ssh is installed 3 | vars: 4 | package_name: 5 | - pyOpenSSL 6 | - python2-cryptography 7 | - python3-oci-cli 8 | - pssh 9 | - pdsh 10 | - python3-pip 11 | package_state: latest 12 | include_role: 13 | name: safe_yum 14 | ignore_errors: true 15 | 16 | - name: Upgrade Pip3 17 | become: true 18 | pip: 19 | name: [pip] 20 | state: latest 21 | executable: pip3 22 | ignore_errors: yes 23 | 24 | - name: install oci-cli latest version 25 | become: true 26 | pip: 27 | name: [oci-cli] 28 | state: latest 29 | 
executable: pip3 30 | ignore_errors: yes 31 | when: ('controller' in group_names) 32 | -------------------------------------------------------------------------------- /playbooks/roles/packages/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: ol-7.yml 2 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '7' 3 | 4 | - include_tasks: ol-8.yml 5 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '8' 6 | 7 | - include_tasks: centos-7.yml 8 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' and ansible_distribution_major_version == '7' 9 | 10 | - include_tasks: ubuntu.yml 11 | when: ansible_distribution == 'Ubuntu' and ansible_distribution_major_version < '22' 12 | 13 | - include_tasks: ubuntu-2204.yml 14 | when: ansible_distribution == 'Ubuntu' and ansible_distribution_major_version == '22' 15 | 16 | - include_tasks: debian.yml 17 | when: ansible_distribution == 'Debian' -------------------------------------------------------------------------------- /playbooks/roles/packages/tasks/ol-7.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Enable ol7_developer_EPEL repo 3 | shell: yum-config-manager --enable ol7_developer_EPEL 4 | 5 | - name: Make sure python OpenSSL and parallel ssh is installed 6 | vars: 7 | package_name: 8 | - pyOpenSSL 9 | - python2-cryptography 10 | - python3-oci-cli 11 | - pssh 12 | - pdsh 13 | - python3-pip 14 | package_state: latest 15 | package_repo: "ol7_developer_EPEL" 16 | include_role: 17 | name: safe_yum 18 | 19 | - name: Upgrade Pip3 20 | become: true 21 | pip: 22 | name: [pip] 23 | state: latest 24 | executable: pip3 25 | ignore_errors: yes 26 | 27 | - name: install oci-cli latest version 28 | become: true 29 | pip: 30 | name: [oci-cli] 31 | state: latest 32 | executable: pip3 33 | ignore_errors: yes 34 | when: ('controller' in group_names) -------------------------------------------------------------------------------- /playbooks/roles/packages/tasks/ol-8.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Enable ol8_developer_EPEL repo 3 | shell: yum-config-manager --enable ol8_developer_EPEL 4 | 5 | - name: Make sure python OpenSSL and parallel ssh is installed 6 | vars: 7 | package_name: 8 | #- pyOpenSSL 9 | #- python2-cryptography 10 | - python3-oci-cli 11 | - pssh 12 | - pdsh 13 | - python3-pip 14 | package_state: latest 15 | package_repo: "ol8_developer_EPEL" 16 | include_role: 17 | name: safe_yum 18 | ignore_errors: true 19 | 20 | 21 | - name: Upgrade Pip3 22 | become: true 23 | pip: 24 | name: [pip] 25 | state: latest 26 | executable: pip3 27 | ignore_errors: yes 28 | 29 | - name: install oci-cli latest version 30 | become: true 31 | pip: 32 | name: [oci-cli] 33 | state: latest 34 | executable: pip3 35 | ignore_errors: yes 36 | when: ('controller' in group_names) 37 | 38 | -------------------------------------------------------------------------------- /playbooks/roles/packages/tasks/ubuntu-2204.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - block: 3 | - name: Automatically restart the services 4 | become: true 5 | replace: 6 | path: /etc/needrestart/needrestart.conf 7 | regexp: "#$nrconf{restart} = 'i';" 8 | replace: "$nrconf{restart} = 'a';" 9 | - name: Make sure 
python OpenSSL and parallel ssh is installed 10 | vars: 11 | package_name: 12 | - python3-openssl 13 | - python3-cryptography 14 | - parted 15 | - pssh 16 | - pdsh 17 | - python3-netaddr 18 | - jq 19 | - python3-pip 20 | package_state: latest 21 | include_role: 22 | name: safe_yum 23 | ignore_errors: true 24 | 25 | - name: Upgrade Pip3 26 | become: true 27 | pip: 28 | name: [pip] 29 | state: latest 30 | executable: pip3 31 | ignore_errors: yes -------------------------------------------------------------------------------- /playbooks/roles/packages/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - block: 3 | - name: Make sure python OpenSSL and parallel ssh is installed 4 | vars: 5 | package_name: 6 | - python-openssl 7 | - python-cryptography 8 | - parted 9 | - pssh 10 | - pdsh 11 | - python3-netaddr 12 | - jq 13 | - python3-pip 14 | package_state: latest 15 | include_role: 16 | name: safe_yum 17 | ignore_errors: true 18 | 19 | - name: Upgrade Pip3 20 | become: true 21 | pip: 22 | name: [pip] 23 | state: latest 24 | executable: pip3 25 | ignore_errors: yes -------------------------------------------------------------------------------- /playbooks/roles/passwords/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Create /etc/opt/oci-hpc/passwords/ 3 | become: true 4 | file: 5 | path: /etc/opt/oci-hpc/passwords/ 6 | state: directory 7 | owner: "{{ ansible_user }}" 8 | mode: 0770 9 | group: "{{ ansible_user }}" 10 | recurse: yes 11 | -------------------------------------------------------------------------------- /playbooks/roles/privilege_group/tasks/common.yml: -------------------------------------------------------------------------------- 1 | - name: create groupname group (gid 9876) 2 | become: true 3 | group: 4 | name: "{{privilege_group_name}}" 5 | gid: 9876 6 | state: present 7 | 8 | - name: allow groupname group to have passwordless sudo 9 | become: true 10 | lineinfile: 11 | dest: /etc/sudoers 12 | state: present 13 | regexp: '^%{{privilege_group_name}}' 14 | line: '%{{privilege_group_name}} ALL=(ALL) NOPASSWD: ALL' 15 | validate: 'visudo -cf %s' 16 | when: privilege_sudo | bool 17 | 18 | - name: "add opc/ubuntu user to {{privilege_group_name}} group" 19 | become: true 20 | user: 21 | name: "{{ansible_user}}" 22 | groups: "{{privilege_group_name}}" 23 | append: yes 24 | -------------------------------------------------------------------------------- /playbooks/roles/privilege_group/tasks/el.yml: -------------------------------------------------------------------------------- 1 | - name: create groupname group (gid 9876) 2 | become: true 3 | group: 4 | name: "{{privilege_group_name}}" 5 | gid: 9876 6 | state: present 7 | 8 | - name: allow groupname group to have passwordless sudo 9 | become: true 10 | lineinfile: 11 | dest: /etc/sudoers 12 | state: present 13 | regexp: '^%{{privilege_group_name}}' 14 | line: '%{{privilege_group_name}} ALL=(ALL) NOPASSWD: ALL' 15 | validate: 'visudo -cf %s' 16 | when: privilege_sudo | bool 17 | 18 | - name: "add opc user to {{privilege_group_name}} group" 19 | become: true 20 | user: 21 | name: opc 22 | groups: "{{privilege_group_name}}" 23 | append: yes -------------------------------------------------------------------------------- /playbooks/roles/privilege_group/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: common.yml 2 | 3 | #- include_tasks: 
el.yml 4 | # when: ansible_os_family == 'RedHat' 5 | 6 | -------------------------------------------------------------------------------- /playbooks/roles/prometheus/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # vars file for prometheus 3 | prometheus_user: prometheus 4 | prometheus_dest_dir: /etc/prometheus 5 | prometheus_data_dir: /var/lib/prometheus 6 | service_dest_dir: /etc/systemd/system 7 | prometheus_download_dir: /tmp/prometheus-2.53.1.linux-amd64 8 | prometheus_download_url: https://github.com/prometheus/prometheus/releases/download/v2.53.1/prometheus-2.53.1.linux-amd64.tar.gz 9 | # vars file for prometheus 10 | 11 | exporter_ports: 12 | - "9100" 13 | - "9400" 14 | - "9500" 15 | - "9600" 16 | - "9700" 17 | -------------------------------------------------------------------------------- /playbooks/roles/prometheus/templates/prometheus.conf.j2: -------------------------------------------------------------------------------- 1 | --- 2 | # handlers file for prometheus 3 | - name: restart prometheus 4 | service: 5 | name: prometheus 6 | state: restarted 7 | daemon_reload: yes 8 | enabled: yes -------------------------------------------------------------------------------- /playbooks/roles/prometheus/templates/prometheus.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Prometheus 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | User={{ prometheus_user }} 8 | Group={{ prometheus_user }} 9 | Type=simple 10 | ExecStart=/usr/local/bin/prometheus \ 11 | --config.file /etc/prometheus/prometheus.yml \ 12 | --storage.tsdb.path /var/lib/prometheus/ \ 13 | --web.console.templates=/etc/prometheus/consoles \ 14 | --web.console.libraries=/etc/prometheus/console_libraries \ 15 | --web.listen-address=:9090 16 | 17 | [Install] 18 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /playbooks/roles/prometheus/templates/prometheus.yml.j2: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 60s 3 | external_labels: 4 | monitor: 'prometheus' 5 | 6 | scrape_configs: 7 | - job_name: 'prometheus' 8 | static_configs: 9 | {% for host, info in host_info.items() %} 10 | - targets: [{% for port in exporter_ports %}{{ host }}:{{port}}{{ "," if not loop.last }}{%- endfor %}] 11 | labels: 12 | serial: {{ info.serial_number }} 13 | hostname: {{ host }} 14 | cluster_name: {{ info.cluster_name }} 15 | ocid: {{ info.ocid }} 16 | oci_name: {{ info.oci_name }} 17 | AD: {{ info.availabilityDomain }} 18 | compartment: {{ info.compartmentId }} 19 | rackID: {{ info.rackID }} 20 | networkBlockId: {{ info.networkBlockId }} 21 | rail_id: {{ info.rail_id }} 22 | hpc_island: {{ info.hpc_island }} 23 | fss_mount: {{ info.fss_ip }} 24 | {% endfor %} 25 | -------------------------------------------------------------------------------- /playbooks/roles/rack-aware/tasks/el.yml: -------------------------------------------------------------------------------- 1 | - name: Install and upgrade pip 2 | shell: "python3 -m ensurepip && sudo python3 -m pip install --upgrade pip" 3 | 4 | - name: install pssh and parallel-ssh 5 | become: true 6 | pip: 7 | name: ['pssh', 'parallel-ssh'] 8 | executable: pip3 9 | state: latest 10 | with_items: 11 | - pssh 12 | - parallel-ssh 13 | ignore_errors: yes 14 | 15 | - name: Make sure /opt/oci-hpc/bin/ exists 16 |
become: true 17 | file: 18 | path: /opt/oci-hpc/bin/ 19 | state: directory 20 | recurse: yes 21 | mode: '0755' 22 | owner: "{{ ansible_user }}" 23 | group: "{{ privilege_group_name }}" 24 | 25 | - name: Copy node_ordering_by_rack.py 26 | block: 27 | - name: copy node_ordering_by_rack.py 28 | become: true 29 | copy: 30 | src: node_ordering_by_rack.py 31 | dest: /opt/oci-hpc/bin/ 32 | owner: "{{ ansible_user }}" 33 | group: "{{privilege_group_name}}" 34 | mode: '0755' 35 | rescue: 36 | - name: copy node_ordering_by_rack.py 37 | become: true 38 | copy: 39 | src: node_ordering_by_rack.py 40 | dest: /opt/oci-hpc/bin/ 41 | owner: "{{ ansible_user }}" 42 | group: "{{privilege_group_name}}" 43 | mode: '0755' 44 | ignore_errors: yes 45 | -------------------------------------------------------------------------------- /playbooks/roles/rack-aware/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | 4 | - include_tasks: ubuntu.yml 5 | when: ansible_distribution == 'Ubuntu' 6 | 7 | -------------------------------------------------------------------------------- /playbooks/roles/rack-aware/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | - name: install pssh and parallel-ssh 2 | become: true 3 | vars: 4 | ansible_python_interpreter: /usr/bin/python3 5 | pip: 6 | name: ['pssh', 'parallel-ssh'] 7 | executable: pip3 8 | state: latest 9 | ignore_errors: yes 10 | 11 | - name: Make sure /opt/oci-hpc/bin/ exists 12 | become: true 13 | file: 14 | path: /opt/oci-hpc/bin/ 15 | state: directory 16 | recurse: yes 17 | mode: '0755' 18 | owner: "{{ ansible_user }}" 19 | group: "{{ privilege_group_name }}" 20 | 21 | - name: Copy node_ordering_by_rack.py 22 | block: 23 | - name: copy node_ordering_by_rack.py 24 | become: true 25 | copy: 26 | src: node_ordering_by_rack.py 27 | dest: /opt/oci-hpc/bin/ 28 | owner: "{{ ansible_user }}" 29 | group: "{{privilege_group_name}}" 30 | mode: '0755' 31 | rescue: 32 | - name: copy node_ordering_by_rack.py 33 | become: true 34 | copy: 35 | src: node_ordering_by_rack.py 36 | dest: /opt/oci-hpc/bin/ 37 | owner: "{{ ansible_user }}" 38 | group: "{{privilege_group_name}}" 39 | mode: '0755' 40 | ignore_errors: yes 41 | -------------------------------------------------------------------------------- /playbooks/roles/rdma-interface/defaults/main.yml: -------------------------------------------------------------------------------- 1 | pci_id: '0000:5e:00.0' 2 | -------------------------------------------------------------------------------- /playbooks/roles/rdma-interface/handlers/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/rdma-interface/handlers/main.yml -------------------------------------------------------------------------------- /playbooks/roles/rdma-interface/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | 4 | - include_tasks: debian.yml 5 | when: ansible_distribution == 'Ubuntu' 6 | 7 | - include_tasks: debian.yml 8 | when: ansible_distribution == 'Debian' 9 | 10 | 11 | -------------------------------------------------------------------------------- /playbooks/roles/rdma-interface/templates/ifcfg.j2: 
-------------------------------------------------------------------------------- 1 | {%- set rdma_subnet = hostvars[inventory_hostname]['rdma_network'] + '/' + hostvars[inventory_hostname]['rdma_netmask'] -%} 2 | {%- set rdma_subnet_prefix = rdma_subnet | ansible.netcommon.ipaddr('prefix') -%} 3 | {%- set rdma_subnet_cidr = rdma_subnet | ansible.netcommon.ipaddr('network/prefix') -%} 4 | {%- set host_index = hostvars[inventory_hostname]['ansible_default_ipv4']['address'] | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) -%} 5 | {# THIS WHOLE ROUTINE IS USED TO FIND THE NEXT SUBNET #} 6 | {%- set subnets = [rdma_subnet_cidr] -%} 7 | {%- for i in range(index) -%} 8 | {%- set nsip = subnets|last | ansible.netcommon.ipaddr('net') | ansible.netcommon.ipaddr('-1') | ansible.netcommon.ipmath(1) -%} 9 | {%- set ns = nsip + '/' + rdma_subnet_prefix|string -%} 10 | {{ subnets.append(ns) }} 11 | {%- endfor -%} 12 | {%- set rdma_address = subnets|last | ansible.netcommon.ipmath(host_index | int -1) -%} 13 | TYPE="Ethernet" 14 | BOOTPROTO="none" 15 | IPADDR={{ rdma_address }} 16 | NETMASK={{ hostvars[inventory_hostname]['rdma_netmask'] }} 17 | DEFROUTE="no" 18 | PEERDNS="no" 19 | PEERROUTES="no" 20 | IPV4_FAILURE_FATAL="no" 21 | IPV6INIT="no" 22 | IPV6_FAILURE_FATAL="no" 23 | NAME="System {{ item['device'] }}" 24 | UUID="{{ hostvars[inventory_hostname]['ansible_fqdn'] | to_uuid }}" 25 | DEVICE="{{ item['device'] }}" 26 | ONBOOT="yes" 27 | NM_CONTROLLED="no" 28 | NOZEROCONF="yes" 29 | -------------------------------------------------------------------------------- /playbooks/roles/rdma-interface/templates/interface.j2: -------------------------------------------------------------------------------- 1 | {%- set rdma_subnet = hostvars[inventory_hostname]['rdma_network'] + '/' + hostvars[inventory_hostname]['rdma_netmask'] -%} 2 | {%- set rdma_subnet_prefix = rdma_subnet | ansible.netcommon.ipaddr('prefix') -%} 3 | {%- set rdma_subnet_cidr = rdma_subnet | ansible.netcommon.ipaddr('network/prefix') -%} 4 | {%- set host_index = hostvars[inventory_hostname]['ansible_default_ipv4']['address'] | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) -%} 5 | {# THIS WHOLE ROUTINE IS USED TO FIND THE NEXT SUBNET #} 6 | {%- set subnets = [rdma_subnet_cidr] -%} 7 | {%- for i in range(index) -%} 8 | {%- set nsip = subnets|last | ansible.netcommon.ipaddr('net') | ansible.netcommon.ipaddr('-1') | ansible.netcommon.ipmath(1) -%} 9 | {%- set ns = nsip + '/' + rdma_subnet_prefix|string -%} 10 | {{ subnets.append(ns) }} 11 | {%- endfor -%} 12 | {%- set rdma_address = subnets|last | ansible.netcommon.ipmath(host_index | int -1) -%} 13 | auto {{ item['device'] }} 14 | iface {{ item['device'] }} inet static 15 | address {{ rdma_address }} 16 | netmask {{ hostvars[inventory_hostname]['rdma_netmask'] }} 17 | 18 | -------------------------------------------------------------------------------- /playbooks/roles/rdma-interface/vars/main.yml: -------------------------------------------------------------------------------- 1 | rdma_pci_ids: 2 | BM.GPU4.8: 3 | - "0000:48:00.0" 4 | - "0000:48:00.1" 5 | - "0000:4c:00.0" 6 | - "0000:4c:00.1" 7 | - "0000:0c:00.0" 8 | - "0000:0c:00.1" 9 | - "0000:16:00.0" 10 | - "0000:16:00.1" 11 | - "0000:c3:00.0" 12 | - "0000:c3:00.1" 13 | - "0000:d1:00.0" 14 | - "0000:d1:00.1" 15 | - "0000:8a:00.0" 16 | - "0000:8a:00.1" 17 | - "0000:94:00.0" 18 | - "0000:94:00.1" 19 | BM.HPC2.36: 20 | - "0000:5e:00.0" 21 | BM.Optimized3.36: 22 | - "0000:98:00.0" 23 | 
BM.GPU.B4.8: 24 | - "0000:0c:00.0" 25 | - "0000:0c:00.1" 26 | - "0000:16:00.0" 27 | - "0000:16:00.1" 28 | - "0000:47:00.0" 29 | - "0000:47:00.1" 30 | - "0000:4b:00.0" 31 | - "0000:4b:00.1" 32 | - "0000:89:00.0" 33 | - "0000:89:00.1" 34 | - "0000:93:00.0" 35 | - "0000:93:00.1" 36 | - "0000:c3:00.0" 37 | - "0000:c3:00.1" 38 | - "0000:d1:00.0" 39 | - "0000:d1:00.1" 40 | BM.GPU.A100-v2.8: 41 | - "0000:0c:00.0" 42 | - "0000:0c:00.1" 43 | - "0000:16:00.0" 44 | - "0000:16:00.1" 45 | - "0000:47:00.0" 46 | - "0000:47:00.1" 47 | - "0000:4b:00.0" 48 | - "0000:4b:00.1" 49 | - "0000:89:00.0" 50 | - "0000:89:00.1" 51 | - "0000:93:00.0" 52 | - "0000:93:00.1" 53 | - "0000:c3:00.0" 54 | - "0000:c3:00.1" 55 | - "0000:d1:00.0" 56 | - "0000:d1:00.1" 57 | BM.GPU.T1.2: 58 | - "0000:50:00.0" 59 | - "0000:50:00.1" 60 | -------------------------------------------------------------------------------- /playbooks/roles/safe_yum/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_distribution == 'Ubuntu' -------------------------------------------------------------------------------- /playbooks/roles/safe_yum/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ensure apt process is completed 3 | become: true 4 | shell: ps aux | grep "apt update" | grep -v grep | wc -l 5 | register: result 6 | retries: 30 7 | delay: 10 8 | until: result.stdout | int == 0 9 | 10 | - name: "Installing/Removing {{package_name}}" 11 | become: true 12 | apt: 13 | name: "{{package_name}}" 14 | state: "{{package_state | default('latest')}}" 15 | purge: "{{package_purge | default('false')}}" 16 | update_cache: "{{package_cache | default('false')}}" 17 | register: result 18 | until: result is not failed 19 | retries: 5 20 | delay: 5 21 | when: not deb_name is defined 22 | 23 | - name: "Installing/Removing {{deb_name}}" 24 | become: true 25 | apt: 26 | deb: "{{item}}" 27 | state: "{{package_state | default('latest')}}" 28 | purge: "{{package_purge | default('false')}}" 29 | update_cache: "{{package_cache | default('false')}}" 30 | register: result 31 | until: result is not failed 32 | retries: 5 33 | delay: 5 34 | when: deb_name is defined 35 | with_items: "{{deb_name}}" 36 | 37 | 38 | - name: Ensure apt process is completed 39 | become: true 40 | shell: ps aux | grep "apt update" | grep -v grep | wc -l 41 | register: result 42 | retries: 30 43 | delay: 10 44 | until: result.stdout | int == 0 -------------------------------------------------------------------------------- /playbooks/roles/slurm/defaults/main.yml: -------------------------------------------------------------------------------- 1 | munge_conf_path: '/etc/munge' 2 | slurm_spool_path: '/var/spool/slurmd' 3 | munge_log_path: '/var/log/munge' 4 | munge_run_path: '/var/run/munge' 5 | slurm_db_user: 'slurm_accounting' 6 | slurm_db_name: 'slurm_accounting' 7 | slurm_gid: 1501 8 | slurm_uid: 1501 9 | munge_gid: 1500 10 | munge_uid: 1500 11 | rack_aware_playbook_suffix: "{% if rack_aware|bool %}-rack-aware{% endif%}" 12 | slurm_version: "24.05.1-1" -------------------------------------------------------------------------------- /playbooks/roles/slurm/files/cgroup.conf: -------------------------------------------------------------------------------- 1 | CgroupMountpoint="/sys/fs/cgroup" 2 | ConstrainDevices=yes 3 | ConstrainCores=yes 
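The rdma_pci_ids map above ties each bare-metal shape to the PCI functions of its RDMA NICs. A quick way to sanity-check that mapping on a running node is sketched below; this is not part of the repository, and it assumes jq and lspci are available (the metadata query is the same one used elsewhere in these playbooks).
# Sketch: confirm the shape reported by the instance metadata service and list the
# RDMA NIC PCI functions actually present; they should match the entries under the
# corresponding shape key in rdma-interface/vars/main.yml.
shape=$(curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq -r .shape)
echo "shape: ${shape}"
lspci -D | grep -i mellanox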
-------------------------------------------------------------------------------- /playbooks/roles/slurm/files/healthchecks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Fetch the instance shape 4 | shape=$(curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq -r .shape) 5 | 6 | # Check if the shape matches the desired GPU types 7 | if [ ${shape} = "BM.GPU.H100.8" ] || \ 8 | [ ${shape} = "BM.GPU.A100-v2.8" ] || \ 9 | [ ${shape} = "BM.GPU4.8" ] || \ 10 | [ ${shape} = "BM.GPU.B4.8" ] || \ 11 | [ ${shape} = "BM.GPU.H200.8" ]; then 12 | 13 | FILE="/tmp/latest_healthcheck.log" 14 | if [ -e "$FILE" ]; then 15 | # Get the current time and the file's last modification time 16 | CURRENT_TIME=$(date +%s) 17 | FILE_MOD_TIME=$(stat -c %Y "$FILE") 18 | 19 | # Calculate the time difference in seconds 20 | TIME_DIFF=$((CURRENT_TIME - FILE_MOD_TIME)) 21 | else 22 | TIME_DIFF=600 23 | fi 24 | 25 | # Check if the file is older than 60 seconds (1 minute) 26 | if [ $TIME_DIFF -gt 60 ]; then 27 | sudo python3 /opt/oci-hpc/healthchecks/check_gpu_setup.py --slurm > /tmp/latest_healthcheck.log 2>&1 28 | fi 29 | 30 | # Check for healthcheck messages 31 | DRAIN_MSG=$(grep "Healthcheck::" /tmp/latest_healthcheck.log) 32 | if [ -n "$DRAIN_MSG" ]; then 33 | if [ -n "$SLURM_JOB_ID" ]; then 34 | echo "${DRAIN_MSG}" 35 | exit 1 36 | else 37 | scontrol update nodename=$(hostname) state=drain reason="${DRAIN_MSG}" 38 | fi 39 | fi 40 | fi -------------------------------------------------------------------------------- /playbooks/roles/slurm/files/sshd: -------------------------------------------------------------------------------- 1 | #%PAM-1.0 2 | auth required pam_nologin.so 3 | auth include password-auth 4 | # Used with polkit to reauthorize users in remote sessions 5 | -auth optional pam_reauthorize.so prepare 6 | account required pam_nologin.so 7 | account include password-auth 8 | password include password-auth 9 | -account required pam_slurm_adopt.so 10 | # pam_selinux.so close should be the first session rule 11 | session required pam_selinux.so close 12 | session required pam_loginuid.so 13 | # pam_selinux.so open should only be followed by sessions to be executed in the user context 14 | session required pam_selinux.so open env_params 15 | session required pam_namespace.so 16 | session optional pam_keyinit.so force revoke 17 | session include password-auth 18 | session include postlogin 19 | # Used with polkit to reauthorize users in remote sessions 20 | -session optional pam_reauthorize.so prepare 21 | -------------------------------------------------------------------------------- /playbooks/roles/slurm/handlers/main.yml: -------------------------------------------------------------------------------- 1 | 2 | - name: restart slurm server 3 | become: true 4 | service: 5 | name: '{{ item }}' 6 | state: restarted 7 | enabled: true 8 | with_items: 9 | - slurmdbd 10 | - slurmctld 11 | register: result 12 | until: result is not failed 13 | retries: 5 14 | delay: 5 15 | 16 | - name: restart slurm 17 | become: true 18 | service: 19 | name: '{{ item }}' 20 | state: restarted 21 | enabled: true 22 | with_items: 23 | - slurmd 24 | register: result 25 | until: result is not failed 26 | retries: 5 27 | delay: 5 28 | 29 | - name: restart munge 30 | become: true 31 | service: 32 | name: munge 33 | state: restarted 34 | enabled: true 35 | 36 | # seeing this error stderr: 'slurm_reconfigure error: Operation now in progress', that's 
why added retries 37 | - name: reconfigure slurm 38 | become: true 39 | command: "scontrol reconfigure" 40 | delegate_to: 127.0.0.1 41 | register: result 42 | until: result is not failed 43 | retries: 5 44 | delay: 5 -------------------------------------------------------------------------------- /playbooks/roles/slurm/tasks/cleanup.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: delete temporary key 3 | file: 4 | path: /tmp/munge.key 5 | state: absent 6 | 7 | - name: Reconfigure Slurm 8 | become: true 9 | command: "scontrol reconfigure" 10 | delegate_to: 127.0.0.1 11 | run_once: true 12 | ignore_errors: yes -------------------------------------------------------------------------------- /playbooks/roles/slurm/tasks/controller.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - block: 3 | - name: include common tasks 4 | include_tasks: common.yml 5 | vars: 6 | slurm_repos: "epel,ol7_developer_EPEL" 7 | when: (not destroy|bool) and ((initial|bool) or (not initial|bool and ('compute' in group_names))) 8 | 9 | - name: run server directives ol7 controller 10 | include_tasks: server.yml 11 | vars: 12 | slurm_repos: "epel,ol7_developer_EPEL" 13 | when: ('controller' in group_names) and (not destroy|bool) and (initial| bool) 14 | when: ansible_distribution_major_version == '7' 15 | 16 | - block: 17 | - name: include common tasks 18 | include_tasks: common.yml 19 | vars: 20 | slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" 21 | when: (not destroy|bool) and ((initial|bool) or (not initial|bool and ('compute' in group_names))) 22 | 23 | - name: run server directives ol8 controller 24 | include_tasks: server.yml 25 | vars: 26 | slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" 27 | when: ('controller' in group_names) and (not destroy|bool) and (initial| bool) 28 | when: ansible_distribution_major_version == '8' -------------------------------------------------------------------------------- /playbooks/roles/slurm/tasks/download.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | #- name: Download Packages 4 | #block: 5 | - name: Create Slurm directory 6 | file: 7 | path: "{{ download_path }}/slurm_rpms" 8 | state: directory 9 | - name: Download slurm RPMs. 
10 | get_url: 11 | url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/{{ item }}" 12 | dest: "{{ download_path }}/slurm_rpms" 13 | with_items: 14 | - "slurm-{{slurm_version}}.el7.x86_64.rpm" 15 | - "slurm-contribs-{{slurm_version}}.el7.x86_64.rpm" 16 | - "slurm-devel-{{slurm_version}}.el7.x86_64.rpm" 17 | - "slurm-example-configs-{{slurm_version}}.el7.x86_64.rpm" 18 | - "slurm-libpmi-{{slurm_version}}.el7.x86_64.rpm" 19 | - "slurm-openlava-{{slurm_version}}.el7.x86_64.rpm" 20 | - "slurm-pam_slurm-{{slurm_version}}.el7.x86_64.rpm" 21 | - "slurm-perlapi-{{slurm_version}}.el7.x86_64.rpm" 22 | - "slurm-slurmctld-{{slurm_version}}.el7.x86_64.rpm" 23 | - "slurm-slurmd-{{slurm_version}}.el7.x86_64.rpm" 24 | - "slurm-slurmdbd-{{slurm_version}}.el7.x86_64.rpm" 25 | - "slurm-torque-{{slurm_version}}.el7.x86_64.rpm" 26 | run_once: true 27 | 28 | # rescue: 29 | # - name: Repository 30 | # become: true 31 | # yum_repository: 32 | # name: oci-hpc 33 | # description: oci-hpc 34 | # baseurl: https://objectstorage.us-ashburn-1.oraclecloud.com/n/hpc/b/rpms/o/ 35 | # gpgcheck: no 36 | # enabled: yes 37 | # retries: 3 -------------------------------------------------------------------------------- /playbooks/roles/slurm/tasks/el7.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: run compute directives 3 | vars: 4 | slurm_repos: "epel,ol7_developer_EPEL" 5 | include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" 6 | when: ('compute' in group_names) and (not destroy|bool) 7 | 8 | - name: run login/monitoring server directives 9 | vars: 10 | slurm_repos: "epel,ol7_developer_EPEL" 11 | include_tasks: login.yml 12 | when: (('login' in group_names) or ('monitoring' in group_names) )and (not destroy|bool) and (initial| bool) 13 | 14 | - name: run backup server directives 15 | vars: 16 | slurm_repos: "epel,ol7_developer_EPEL" 17 | include_tasks: backup_server.yml 18 | when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) 19 | 20 | - name: cleanup 21 | include_tasks: cleanup.yml 22 | when: ('compute' in group_names) and (not destroy|bool) 23 | 24 | - name: destroy 25 | include_tasks: destroy{{rack_aware_playbook_suffix}}.yml 26 | when: ('compute' in group_names or 'compute_to_destroy' in group_names) and (destroy|bool) 27 | 28 | - name: move topology.conf on backup slurm controller 29 | include_tasks: move-topology.yml 30 | when: ('slurm_backup' in group_names) and (not initial| bool) 31 | -------------------------------------------------------------------------------- /playbooks/roles/slurm/tasks/el8.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: run compute directives 3 | vars: 4 | slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" 5 | include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" 6 | when: ('compute' in group_names) and (not destroy|bool) 7 | 8 | - name: run login/monitoring server directives 9 | vars: 10 | slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" 11 | include_tasks: login.yml 12 | when: (('login' in group_names) or ('monitoring' in group_names) ) and (not destroy|bool) and (initial| bool) 13 | 14 | - name: run backup server directives 15 | vars: 16 | slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" 17 | include_tasks: backup_server.yml 18 | when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) 19 | 20 | - 
name: cleanup 21 | include_tasks: cleanup.yml 22 | when: ('compute' in group_names) and (not destroy|bool) 23 | 24 | - name: destroy 25 | include_tasks: destroy{{rack_aware_playbook_suffix}}.yml 26 | when: ('compute' in group_names or 'compute_to_destroy' in group_names) and (destroy|bool) 27 | 28 | - name: move topology.conf on backup slurm controller 29 | include_tasks: move-topology.yml 30 | when: ('slurm_backup' in group_names) and (not initial| bool) 31 | -------------------------------------------------------------------------------- /playbooks/roles/slurm/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_vars: el_vars.yml 2 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' 3 | 4 | - include_vars: centos_vars.yml 5 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' 6 | 7 | - include_vars: ubuntu_vars.yml 8 | when: ansible_distribution == 'Ubuntu' 9 | 10 | - include_tasks: controller.yml 11 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' 12 | 13 | - include_tasks: el7.yml 14 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '7' 15 | 16 | - include_tasks: el7.yml 17 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' and ansible_distribution_major_version == '7' 18 | 19 | - include_tasks: el8.yml 20 | when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '8' 21 | 22 | - include_tasks: ubuntu.yml 23 | when: ansible_distribution == 'Ubuntu' -------------------------------------------------------------------------------- /playbooks/roles/slurm/tasks/move-topology.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: move topology.conf on backup servers 4 | become: true 5 | copy: 6 | dest: '{{ slurm_conf_path }}/topology.conf' 7 | src: '{{ slurm_conf_path }}/topology.conf' 8 | force: yes 9 | register: topology_copied 10 | until: topology_copied is not failed 11 | retries: 10 12 | delay: 5 -------------------------------------------------------------------------------- /playbooks/roles/slurm/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | - name: include common tasks 2 | include_tasks: common.yml 3 | when: (not destroy|bool) and ((initial|bool) or (not initial|bool and ('compute' in group_names))) 4 | 5 | - name: run server directives 6 | include_tasks: server.yml 7 | when: ('controller' in group_names) and (not destroy|bool) and (initial| bool) 8 | 9 | - name: run compute directives 10 | include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" 11 | when: ('compute' in group_names) and (not destroy|bool) 12 | 13 | - name: run login/monitoring server directives 14 | include_tasks: login.yml 15 | when: (('login' in group_names) or ('monitoring' in group_names) ) and (not destroy|bool) and (initial| bool) 16 | 17 | - name: run backup server directives 18 | include_tasks: backup_server.yml 19 | when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) 20 | 21 | - name: cleanup 22 | include_tasks: cleanup.yml 23 | when: ('compute' in group_names) and (not destroy|bool) 24 | 25 | - name: destroy 26 | include_tasks: destroy{{rack_aware_playbook_suffix}}.yml 27 | when: ('compute' in group_names or 'compute_to_destroy' in group_names) and (destroy|bool) 28 | 29 | 
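# Task selection in this file is driven by the booleans passed in by the calling playbooks
# (slurm_config.yml sets initial: true and destroy: false for the first full configuration,
# slurm_config_as.yml sets initial: false when reconfiguring autoscaled nodes) together with
# the inventory group of each host (controller, compute, login/monitoring, slurm_backup).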
- name: move topology.conf on backup slurm controller 30 | include_tasks: move-topology.yml 31 | when: ('slurm_backup' in group_names) and (not initial| bool) 32 | -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/pyxis.sh.j2: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | runtime_path="$(sudo -u "$SLURM_JOB_USER" sh -c 'echo "{{enroot_top_path_checked}}/enroot_runtime/user-$(id -u)"')" 3 | mkdir -p "$runtime_path" 4 | chown "$SLURM_JOB_USER:$(id -g "$SLURM_JOB_USER")" "$runtime_path" 5 | #chmod 777 -R /tmp 6 | chmod 0700 "$runtime_path" -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/slurmdbd.conf.j2: -------------------------------------------------------------------------------- 1 | 2 | # See the slurmdbd.conf man page for more information. 3 | # 4 | 5 | # Authentication info 6 | AuthType=auth/munge 7 | #AuthInfo=/var/run/munge/munge.socket.2 8 | 9 | # slurmDBD info 10 | DbdHost=localhost 11 | SlurmUser=slurm 12 | #MessageTimeout=300 13 | DebugLevel=4 14 | LogFile=/var/log/slurm/slurmdbd.log 15 | PidFile=/var/run/slurmdbd.pid 16 | #PrivateData=accounts,users,usage,jobs 17 | 18 | # Database info 19 | StorageType=accounting_storage/mysql 20 | StorageHost=localhost 21 | StoragePort=3306 22 | StoragePass={{ slurmdbd_sql_pwd }} 23 | StorageUser={{ slurm_db_user }} 24 | StorageLoc={{ slurm_db_name }} 25 | 26 | {% if sacct_limits|bool %} 27 | TrackWckey=no 28 | {% endif %} -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/systemd/munge.service.d/unit.conf.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | After=remote-fs.target 3 | Requires=remote-fs.target 4 | 5 | [Service] 6 | ExecStart= 7 | ExecStart=/usr/sbin/munged --key-file {{ munge_conf_path }}/munge.key 8 | 9 | -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/systemd/sackd.service.d/unit.conf.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Requires=munge.service 3 | 4 | [Service] 5 | Restart=always 6 | ExecStart= 7 | ExecStart={{slurm_exec}}/sbin/sackd --systemd --conf-server {{ hostvars[groups['controller'][0]]['ansible_fqdn'].split('.')[0] }}{% if (groups['slurm_backup']| length ) > 0 %},{{ hostvars[groups['slurm_backup'][0]]['ansible_fqdn'].split('.')[0] }}{% endif %} $SACKD_OPTIONS -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/systemd/sackd.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Slurm auth and cred kiosk daemon 3 | After=network-online.target remote-fs.target 4 | Wants=network-online.target 5 | 6 | [Service] 7 | Type=notify 8 | EnvironmentFile=-/etc/sysconfig/sackd 9 | EnvironmentFile=-/etc/default/sackd 10 | User=slurm 11 | Group=slurm 12 | RuntimeDirectory=slurm 13 | RuntimeDirectoryMode=0755 14 | ExecStart={{slurm_exec}}/sbin/sackd --systemd --conf-server {{ hostvars[groups['controller'][0]]['ansible_fqdn'].split('.')[0] }}{% if (groups['slurm_backup']| length ) > 0 %},{{ hostvars[groups['slurm_backup'][0]]['ansible_fqdn'].split('.')[0] }}{% endif %} $SACKD_OPTIONS 15 | ExecReload=/bin/kill -HUP $MAINPID 16 | KillMode=process 17 | LimitNOFILE=131072 18 | LimitMEMLOCK=infinity 
19 | LimitSTACK=infinity 20 | TasksMax=infinity 21 | 22 | # Uncomment the following lines to disable logging through journald. 23 | # NOTE: It may be preferable to set these through an override file instead. 24 | #StandardOutput=null 25 | #StandardError=null 26 | 27 | [Install] 28 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/systemd/slurm_env.j2: -------------------------------------------------------------------------------- 1 | LD_LIBRARY_PATH=/usr/local/lib/slurm:/usr/local/lib:$LD_LIBRARY_PATH -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/systemd/slurmctld.service.d/unit.conf.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Requires=munge.service slurmdbd.service 3 | 4 | [Service] 5 | Environment="SLURMCTLD_OPTIONS=-R" 6 | Restart=always 7 | -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/systemd/slurmctld.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Slurm controller daemon 3 | After=network.target munge.service 4 | ConditionPathExists={{slurm_conf_path}}/slurm.conf 5 | Documentation=man:slurmctld(8) 6 | 7 | [Service] 8 | Type=forking 9 | EnvironmentFile=-/etc/default/slurm 10 | ExecStart={{slurm_exec}}/sbin/slurmctld $SLURMCTLD_OPTIONS 11 | ExecReload=/bin/kill -HUP $MAINPID 12 | PIDFile=/run/slurmctld.pid 13 | LimitNOFILE=65536 14 | TasksMax=infinity 15 | 16 | [Install] 17 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/systemd/slurmctld_backup.service.d/unit.conf.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Requires=munge.service 3 | 4 | [Service] 5 | Environment="SLURMCTLD_OPTIONS=-R" 6 | Restart=always 7 | -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/systemd/slurmd.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Slurm node daemon 3 | After=munge.service network-online.target remote-fs.target sssd.service 4 | Wants=network-online.target 5 | 6 | [Service] 7 | Type=notify 8 | EnvironmentFile=-/etc/sysconfig/slurmd 9 | EnvironmentFile=-/etc/default/slurmd 10 | RuntimeDirectory=slurm 11 | RuntimeDirectoryMode=0755 12 | ExecStart={{slurm_exec}}/sbin/slurmd --systemd --conf-server {{ hostvars[groups['controller'][0]]['ansible_fqdn'].split('.')[0] }}{% if (groups['slurm_backup']| length ) > 0 %},{{ hostvars[groups['slurm_backup'][0]]['ansible_fqdn'].split('.')[0] }}{% endif %} $SLURMD_OPTIONS 13 | ExecReload=/bin/kill -HUP $MAINPID 14 | KillMode=process 15 | LimitNOFILE=131072 16 | LimitMEMLOCK=infinity 17 | LimitSTACK=infinity 18 | Delegate=yes 19 | TasksMax=infinity 20 | 21 | # Uncomment the following lines to disable logging through journald. 22 | # NOTE: It may be preferable to set these through an override file instead. 
23 | #StandardOutput=null 24 | #StandardError=null 25 | 26 | [Install] 27 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/systemd/slurmdbd.service.d/unit.conf.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Requires=munge.service 3 | 4 | [Service] 5 | Restart=always 6 | -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/systemd/slurmdbd.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Slurm DBD accounting daemon 3 | After=network.target munge.service 4 | ConditionPathExists={{slurm_conf_path}}/slurmdbd.conf 5 | Documentation=man:slurmdbd(8) 6 | 7 | [Service] 8 | Type=forking 9 | EnvironmentFile=-/etc/default/slurm 10 | ExecStart={{slurm_exec}}/sbin/slurmdbd $SLURMDBD_OPTIONS 11 | ExecReload=/bin/kill -HUP $MAINPID 12 | PIDFile=/run/slurmdbd.pid 13 | LimitNOFILE=65536 14 | TasksMax=infinity 15 | 16 | [Install] 17 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /playbooks/roles/slurm/templates/topology.conf.j2: -------------------------------------------------------------------------------- 1 | ### Topology File -------------------------------------------------------------------------------- /playbooks/roles/slurm/vars/ubuntu_vars.yml: -------------------------------------------------------------------------------- 1 | slurm_conf_path: '/etc/slurm' 2 | slurm_log_path: '/var/log/slurm' 3 | slurm_exec: '/usr/local' 4 | slurm_conf_file: slurm.conf.j2 5 | slurmdbd_conf_file: slurmdbd.conf.j2 6 | 7 | cgroup_conf_file: cgroup.conf 8 | 9 | munge_packages: 10 | - munge 11 | - libmunge2 12 | - libmunge-dev 13 | - libpmix-dev 14 | 15 | slurm_common_packages: [] 16 | 17 | slurm_server_packages: 18 | - libjwt-dev 19 | 20 | slurm_compute_packages: 21 | - libpmi0 22 | 23 | slurm_backup_server_packages: 24 | - libpmi0 25 | 26 | slurm_login_packages: 27 | - libpmi0 -------------------------------------------------------------------------------- /playbooks/roles/spack/defaults/main.yml: -------------------------------------------------------------------------------- 1 | spack_repo: https://github.com/spack/spack.git 2 | -------------------------------------------------------------------------------- /playbooks/roles/spack/tasks/debian.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install GIT 3 | vars: 4 | package_name: 5 | - git 6 | package_state: latest 7 | include_role: 8 | name: safe_yum 9 | when: cluster_nfs 10 | 11 | - name: Development Tools" 12 | vars: 13 | package_name: 14 | - build-essential 15 | package_state: latest 16 | include_role: 17 | name: safe_yum 18 | when: cluster_nfs 19 | 20 | - name: Clone SPACK 21 | git: 22 | repo: "{{ spack_repo }}" 23 | dest: "{{ cluster_nfs_path }}/spack" 24 | update: no 25 | run_once: true 26 | when: cluster_nfs 27 | 28 | - name: Add SPACK initialization 29 | template: 30 | src: templates/spack.j2 31 | dest: /etc/profile.d/spack.sh 32 | owner: root 33 | group: root 34 | mode: '0755' 35 | become: true 36 | when: cluster_nfs 37 | 38 | -------------------------------------------------------------------------------- /playbooks/roles/spack/tasks/el.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install GIT 3 | vars: 
4 | package_name: 5 | - git 6 | include_role: 7 | name: safe_yum 8 | 9 | - name: Development Tools" 10 | vars: 11 | package_name: 12 | - "@Development Tools" 13 | include_role: 14 | name: safe_yum 15 | 16 | - name: Clone SPACK 17 | git: 18 | repo: "{{ spack_repo }}" 19 | dest: "{{ cluster_nfs_path }}/spack" 20 | update: no 21 | run_once: true 22 | 23 | - name: Add SPACK initialization 24 | template: 25 | src: templates/spack.j2 26 | dest: /etc/profile.d/spack.sh 27 | owner: root 28 | group: root 29 | mode: '0755' 30 | become: true 31 | -------------------------------------------------------------------------------- /playbooks/roles/spack/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | 4 | - include_tasks: debian.yml 5 | when: ansible_distribution == 'Ubuntu' 6 | 7 | - include_tasks: debian.yml 8 | when: ansible_distribution == 'Debian' 9 | -------------------------------------------------------------------------------- /playbooks/roles/spack/templates/spack.j2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SPACK_ROOT={{ cluster_nfs_path }}/spack 3 | source $SPACK_ROOT/share/spack/setup-env.sh 4 | 5 | -------------------------------------------------------------------------------- /playbooks/roles/ssh/tasks/common.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Update ssh configuration 3 | copy: 4 | src: files/ssh_config 5 | dest: /etc/ssh/ssh_config 6 | owner: root 7 | group: root 8 | mode: '0644' 9 | 10 | - name: Install private ssh key on all nodes 11 | copy: 12 | dest: "/home/{{ ansible_user }}/.ssh/id_rsa" 13 | src: "/home/{{ controller_username }}/.ssh/{{ item }}" 14 | owner: "{{ ansible_user }}" 15 | group: "{{ ansible_user }}" 16 | mode: '0600' 17 | with_items: 18 | - cluster.key 19 | 20 | - name: Install public ssh key on all nodes 21 | copy: 22 | dest: "/home/{{ ansible_user }}/.ssh/id_rsa.pub" 23 | src: "/home/{{ controller_username }}/.ssh/{{ item }}" 24 | owner: "{{ ansible_user }}" 25 | group: "{{ ansible_user }}" 26 | mode: '0644' 27 | with_items: 28 | - id_rsa.pub -------------------------------------------------------------------------------- /playbooks/roles/ssh/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: common.yml 2 | -------------------------------------------------------------------------------- /playbooks/roles/ssl/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | ssl_cert_path: '/etc/ssl/certs' 3 | ssl_cert_country: 'US' 4 | ssl_cert_locality: 'Seattle' 5 | ssl_cert_organization: 'Oracle Cloud' 6 | ssl_cert_state: 'WA' 7 | ssl_cert_altname: 'controller.cluster' 8 | 9 | ssl_cert_days: '3650' 10 | 11 | ssl_cert_owner: 'root' 12 | ssl_cert_owner_id: 0 13 | 14 | ssl_cert_group: 'ssl' 15 | ssl_cert_group_id: 1502 16 | -------------------------------------------------------------------------------- /playbooks/roles/ssl/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | 4 | - include_tasks: debian.yml 5 | when: ansible_distribution == 'Ubuntu' -------------------------------------------------------------------------------- /playbooks/roles/ssl/templates/san.conf.j2: 
-------------------------------------------------------------------------------- 1 | [ req ] 2 | distinguished_name = req_distinguished_name 3 | req_extensions = req_ext 4 | [ req_distinguished_name ] 5 | countryName = Country Name (2 letter code) 6 | stateOrProvinceName = State or Province Name (full name) 7 | localityName = Locality Name (eg, city) 8 | organizationName = Organization Name (eg, company) 9 | commonName = Common Name (e.g. server FQDN) 10 | 11 | [ req_ext ] 12 | subjectAltName = @alt_names 13 | 14 | [alt_names] 15 | DNS.1 = {{ ssl_cert_altname }} 16 | -------------------------------------------------------------------------------- /playbooks/roles/sssd/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart sshd 3 | listen: restart login services 4 | service: 5 | name: sshd 6 | state: restarted 7 | enabled: yes 8 | 9 | - name: restart sssd 10 | listen: restart ns daemons 11 | service: 12 | name: sssd 13 | state: restarted 14 | enabled: yes 15 | 16 | - name: restart nscd 17 | listen: restart ns daemons 18 | systemd: 19 | name: nscd 20 | state: restarted 21 | daemon_reload: yes 22 | enabled: yes 23 | 24 | - name: restart_nslcd 25 | listen: restart ns daemons 26 | service: 27 | name: nslcd 28 | state: restarted 29 | enabled: yes 30 | 31 | - name: restart_systemd_logind 32 | listen: restart login services 33 | service: 34 | name: systemd-logind 35 | state: restarted 36 | enabled: yes -------------------------------------------------------------------------------- /playbooks/roles/sssd/tasks/el-7.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install sssd packages 3 | vars: 4 | package_name: 5 | - sssd 6 | - authconfig 7 | include_role: 8 | name: safe_yum 9 | 10 | - name: Add configuration file to /etc/sssd/sssd.conf 11 | template: 12 | src: 'sssd.conf.j2' 13 | dest: '/etc/sssd/sssd.conf' 14 | owner: 'root' 15 | group: 'root' 16 | mode: '0600' 17 | notify: restart sssd 18 | 19 | - name: Copy CA certificate 20 | copy: 21 | src: "{{ ssl_ca_cert }}" 22 | dest: /etc/openldap/certs/cluster-ca.crt 23 | 24 | - name: Adjust OpenLDAP client TLS configuration 25 | lineinfile: 26 | path: '/etc/openldap/ldap.conf' 27 | line: 'TLS_CACERT /etc/openldap/certs/cluster-ca.crt' 28 | 29 | - name: Enable sssd service 30 | systemd: 31 | name: sssd 32 | enabled: "yes" 33 | 34 | - name: Start sssd service 35 | systemd: 36 | name: sssd 37 | state: started 38 | 39 | - name: Update sshd configuration 40 | lineinfile: 41 | path: /etc/ssh/sshd_config 42 | regexp: '^PasswordAuthentication' 43 | line: PasswordAuthentication no 44 | notify: restart sshd 45 | 46 | - name: Setting up the system to use sssd for authentication 47 | command: authconfig --enablemkhomedir --enablesssd --enablesssdauth --update 48 | changed_when: false 49 | -------------------------------------------------------------------------------- /playbooks/roles/sssd/tasks/el-8.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install sssd packages 3 | vars: 4 | package_name: 5 | - sssd 6 | - authconfig 7 | include_role: 8 | name: safe_yum 9 | 10 | - name: Add configuration file to /etc/sssd/sssd.conf 11 | template: 12 | src: 'sssd.conf.j2' 13 | dest: '/etc/sssd/sssd.conf' 14 | owner: 'root' 15 | group: 'root' 16 | mode: '0600' 17 | notify: restart sssd 18 | 19 | - name: Update /etc/sssd/sssd.conf 20 | lineinfile: 21 | path: /etc/sssd/sssd.conf 22 | line: 
"ldap_tls_reqcert = allow" 23 | state: present 24 | notify: restart sssd 25 | 26 | - name: Copy CA certificate 27 | copy: 28 | src: "{{ ssl_ca_cert }}" 29 | dest: /etc/openldap/certs/cluster-ca.crt 30 | 31 | - name: Adjust OpenLDAP client TLS configuration 32 | lineinfile: 33 | path: '/etc/openldap/ldap.conf' 34 | line: 'TLS_CACERT /etc/openldap/certs/cluster-ca.crt' 35 | 36 | - name: Enable sssd service 37 | systemd: 38 | name: sssd 39 | enabled: "yes" 40 | 41 | - name: Start sssd service 42 | systemd: 43 | name: sssd 44 | state: started 45 | 46 | - name: Update sshd configuration 47 | lineinfile: 48 | path: /etc/ssh/sshd_config 49 | regexp: '^PasswordAuthentication' 50 | line: PasswordAuthentication no 51 | notify: restart sshd 52 | 53 | - name: Setting up the system to use sssd for authentication 54 | command: authconfig --enablemkhomedir --enablesssd --enablesssdauth --update 55 | changed_when: false 56 | -------------------------------------------------------------------------------- /playbooks/roles/sssd/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_vars: /opt/oci-hpc/playbooks/roles/openldap/vars/debian_vars.yml 2 | when: ansible_distribution == 'Ubuntu' 3 | 4 | - include_tasks: el-7.yml 5 | when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' 6 | 7 | - include_tasks: el-8.yml 8 | when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '8' 9 | 10 | - include_tasks: debian.yml 11 | when: ansible_distribution == 'Ubuntu' -------------------------------------------------------------------------------- /playbooks/roles/sssd/templates/sssd.conf.j2: -------------------------------------------------------------------------------- 1 | [sssd] 2 | 3 | config_file_version = 2 4 | services = nss, pam 5 | domains = cluster 6 | 7 | [nss] 8 | filter_users = root 9 | entry_negative_timeout = 5 10 | 11 | [pam] 12 | pam_verbosity = 2 13 | pam_account_expired_message = 'Your account has expired. 
Please contact a system administrator' 14 | 15 | [domain/cluster] 16 | ldap_schema = rfc2307bis 17 | id_provider = ldap 18 | auth_provider = ldap 19 | access_provider = ldap 20 | chpass_provider = ldap 21 | cache_credentials = true 22 | entry_cache_timeout = 600 23 | ldap_uri = ldaps://{{ hostvars[groups['controller'][0]]['ansible_fqdn'] }} 24 | ldap_search_base = dc=local 25 | ldap_network_timeout = 30 26 | ldap_access_order = expire 27 | ldap_access_filter = (&(objectclass=inetOrgPerson)) 28 | ldap_account_expire_policy = shadow 29 | enumerate = true -------------------------------------------------------------------------------- /playbooks/roles/sssd/templates/sssd_ubuntu.conf.j2: -------------------------------------------------------------------------------- 1 | [sssd] 2 | config_file_version = 2 3 | domains = cluster 4 | 5 | [domain/cluster] 6 | ldap_schema = rfc2307bis 7 | id_provider = ldap 8 | auth_provider = ldap 9 | access_provider = ldap 10 | chpass_provider = ldap 11 | cache_credentials = true 12 | entry_cache_timeout = 600 13 | ldap_uri = ldaps://{{ hostvars[groups['controller'][0]]['ansible_fqdn'] }} 14 | ldap_search_base = dc=local 15 | ldap_network_timeout = 30 16 | ldap_access_order = expire 17 | ldap_access_filter = (&(objectclass=inetOrgPerson)) 18 | ldap_account_expire_policy = shadow 19 | enumerate = true 20 | -------------------------------------------------------------------------------- /playbooks/roles/sssd/vars/main.yml: -------------------------------------------------------------------------------- 1 | ssl_cert_path: '/etc/ssl/certs' 2 | ssl_cert_group: 'ldap' 3 | 4 | openldap_tls_cacrt: '{{ ssl_ca_cert }}' 5 | openldap_tls_crt: '{{ ssl_cert_path }}/{{ ansible_fqdn }}.crt' 6 | openldap_tls_key: '{{ ssl_cert_path }}/{{ ansible_fqdn }}.key' 7 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/defaults/main.yml: -------------------------------------------------------------------------------- 1 | influxdb_configuration_dir: /etc/influxdb 2 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/handlers/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/telegraf/handlers/main.yml -------------------------------------------------------------------------------- /playbooks/roles/telegraf/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: common.yml 2 | when: ansible_os_family == 'RedHat' or ansible_os_family == 'Debian' 3 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/amd_gpu.conf.j2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-hpc/a9f4a4dd44ce56059c26e60443c136cca0460e3a/playbooks/roles/telegraf/templates/amd_gpu.conf.j2 -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/custom.cnf.j2: -------------------------------------------------------------------------------- 1 | [[inputs.exec]] 2 | commands = 
["echo {}"] 3 | data_format = "json" -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/ethtool_counters.conf.j2: -------------------------------------------------------------------------------- 1 | # Returns ethtool statistics for given interfaces 2 | [[inputs.ethtool]] 3 | 4 | interface_include = ["rdma*","enp*"] 5 | interval = "300s" 6 | fieldpass = ["tx_pci_signal_integrity","rx_steer_missed_packets","rx_vport_multicast_bytes","rx_vport_rdma_unicast_packets", 7 | "rx_vport_rdma_unicast_bytes","tx_vport_rdma_unicast_packets","tx_vport_rdma_unicast_bytes","tx_packets_phy","rx_packets_phy", 8 | "tx_bytes_phy","rx_bytes_phy", "rx_multicast_phy","rx_65_to_127_bytes_phy","rx_2048_to_4095_bytes_phy", "rx_4096_to_8191_bytes_phy", 9 | "rx_crc_errors_phy","rx_symbol_err_phy", "rx_discards_phy","tx_discards_phy","tx_errors_phy","rx_64_bytes_phy","link_down_events_phy", 10 | "rx_out_of_buffer","module_bus_stuck","module_high_temp","rx_buffer_passed_thres_phy","tx_pause_storm_warning_events","tx_pause_storm_error_events", 11 | "rx_pcs_symbol_err_phy","rx_pci_signal_integrity","tx_pci_signal_integrity","rx_prio0_bytes","rx_prio0_packets","tx_prio0_bytes", 12 | "tx_prio0_packets","rx_prio0_buf_discard","rx_prio0_cong_discard","rx_prio0_marked","outbound_pci_buffer_overflow"] 13 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/infiniband.conf.j2: -------------------------------------------------------------------------------- 1 | # # Gets counters from all InfiniBand cards and ports installed 2 | [[inputs.infiniband]] 3 | # # no configuration 4 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/infiniband_mlx5_0_hw_counters.conf.j2: -------------------------------------------------------------------------------- 1 | [[inputs.multifile]] 2 | name_override = "infiniband_mlx5_0_hw_counters" 3 | base_dir = "/sys/class/infiniband" 4 | interval = "300s" 5 | 6 | [[inputs.multifile.tags]] 7 | device="mlx5_0" 8 | port="1" 9 | type="hw_counters" 10 | 11 | [[inputs.multifile.file]] 12 | file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets" 13 | conversion = "int" 14 | 15 | [[inputs.multifile.file]] 16 | file = "mlx5_0/ports/1/hw_counters/out_of_sequence" 17 | conversion = "int" 18 | 19 | [[inputs.multifile.file]] 20 | file = "mlx5_0/ports/1/hw_counters/packet_seq_err" 21 | conversion = "int" 22 | 23 | [[inputs.multifile.file]] 24 | file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err" 25 | conversion = "int" 26 | 27 | [[inputs.multifile.file]] 28 | file = "mlx5_0/ports/1/hw_counters/roce_adp_retrans" 29 | conversion = "int" 30 | 31 | [[inputs.multifile.file]] 32 | file = "mlx5_0/ports/1/hw_counters/np_cnp_sent" 33 | conversion = "int" 34 | 35 | [[inputs.multifile.file]] 36 | file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled" 37 | conversion = "int" 38 | 39 | [[inputs.multifile.file]] 40 | file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored" 41 | conversion = "int" 42 | 43 | [[inputs.multifile.file]] 44 | file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated" 45 | conversion = "int" 46 | 47 | [[inputs.multifile.file]] 48 | file = "mlx5_0/ports/1/hw_counters/roce_slow_restart" 49 | conversion = "int" 50 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/infiniband_mlx5_1_hw_counters.conf.j2: 
-------------------------------------------------------------------------------- 1 | [[inputs.multifile]] 2 | name_override = "infiniband_mlx5_1_hw_counters" 3 | base_dir = "/sys/class/infiniband" 4 | interval = "300s" 5 | 6 | [[inputs.multifile.tags]] 7 | device="mlx5_1" 8 | port="1" 9 | type="hw_counters" 10 | 11 | [[inputs.multifile.file]] 12 | file = "mlx5_1/ports/1/hw_counters/np_ecn_marked_roce_packets" 13 | conversion = "int" 14 | 15 | [[inputs.multifile.file]] 16 | file = "mlx5_1/ports/1/hw_counters/out_of_sequence" 17 | conversion = "int" 18 | 19 | [[inputs.multifile.file]] 20 | file = "mlx5_1/ports/1/hw_counters/packet_seq_err" 21 | conversion = "int" 22 | 23 | [[inputs.multifile.file]] 24 | file = "mlx5_1/ports/1/hw_counters/local_ack_timeout_err" 25 | conversion = "int" 26 | 27 | [[inputs.multifile.file]] 28 | file = "mlx5_1/ports/1/hw_counters/roce_adp_retrans" 29 | conversion = "int" 30 | 31 | [[inputs.multifile.file]] 32 | file = "mlx5_1/ports/1/hw_counters/np_cnp_sent" 33 | conversion = "int" 34 | 35 | [[inputs.multifile.file]] 36 | file = "mlx5_1/ports/1/hw_counters/rp_cnp_handled" 37 | conversion = "int" 38 | 39 | [[inputs.multifile.file]] 40 | file = "mlx5_1/ports/1/hw_counters/rp_cnp_ignored" 41 | conversion = "int" 42 | 43 | [[inputs.multifile.file]] 44 | file = "mlx5_1/ports/1/hw_counters/rx_icrc_encapsulated" 45 | conversion = "int" 46 | 47 | [[inputs.multifile.file]] 48 | file = "mlx5_1/ports/1/hw_counters/roce_slow_restart" 49 | conversion = "int" 50 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/infiniband_mlx5_3_hw_counters.conf.j2: -------------------------------------------------------------------------------- 1 | [[inputs.multifile]] 2 | name_override = "infiniband_mlx5_3_hw_counters" 3 | base_dir = "/sys/class/infiniband" 4 | interval = "300s" 5 | 6 | [[inputs.multifile.tags]] 7 | device="mlx5_3" 8 | port="1" 9 | type="hw_counters" 10 | 11 | [[inputs.multifile.file]] 12 | file = "mlx5_3/ports/1/hw_counters/np_ecn_marked_roce_packets" 13 | conversion = "int" 14 | 15 | [[inputs.multifile.file]] 16 | file = "mlx5_3/ports/1/hw_counters/out_of_sequence" 17 | conversion = "int" 18 | 19 | [[inputs.multifile.file]] 20 | file = "mlx5_3/ports/1/hw_counters/packet_seq_err" 21 | conversion = "int" 22 | 23 | [[inputs.multifile.file]] 24 | file = "mlx5_3/ports/1/hw_counters/local_ack_timeout_err" 25 | conversion = "int" 26 | 27 | [[inputs.multifile.file]] 28 | file = "mlx5_3/ports/1/hw_counters/roce_adp_retrans" 29 | conversion = "int" 30 | 31 | [[inputs.multifile.file]] 32 | file = "mlx5_3/ports/1/hw_counters/np_cnp_sent" 33 | conversion = "int" 34 | 35 | [[inputs.multifile.file]] 36 | file = "mlx5_3/ports/1/hw_counters/rp_cnp_handled" 37 | conversion = "int" 38 | 39 | [[inputs.multifile.file]] 40 | file = "mlx5_3/ports/1/hw_counters/rp_cnp_ignored" 41 | conversion = "int" 42 | 43 | [[inputs.multifile.file]] 44 | file = "mlx5_3/ports/1/hw_counters/rx_icrc_encapsulated" 45 | conversion = "int" 46 | 47 | [[inputs.multifile.file]] 48 | file = "mlx5_3/ports/1/hw_counters/roce_slow_restart" 49 | conversion = "int" 50 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/infiniband_mlx5_4_hw_counters.conf.j2: -------------------------------------------------------------------------------- 1 | [[inputs.multifile]] 2 | name_override = "infiniband_mlx5_4_hw_counters" 3 | base_dir = "/sys/class/infiniband" 4 | interval = "300s" 5 | 6 | 
[[inputs.multifile.tags]] 7 | device="mlx5_4" 8 | port="1" 9 | type="hw_counters" 10 | 11 | [[inputs.multifile.file]] 12 | file = "mlx5_4/ports/1/hw_counters/np_ecn_marked_roce_packets" 13 | conversion = "int" 14 | 15 | [[inputs.multifile.file]] 16 | file = "mlx5_4/ports/1/hw_counters/out_of_sequence" 17 | conversion = "int" 18 | 19 | [[inputs.multifile.file]] 20 | file = "mlx5_4/ports/1/hw_counters/packet_seq_err" 21 | conversion = "int" 22 | 23 | [[inputs.multifile.file]] 24 | file = "mlx5_4/ports/1/hw_counters/local_ack_timeout_err" 25 | conversion = "int" 26 | 27 | [[inputs.multifile.file]] 28 | file = "mlx5_4/ports/1/hw_counters/roce_adp_retrans" 29 | conversion = "int" 30 | 31 | [[inputs.multifile.file]] 32 | file = "mlx5_4/ports/1/hw_counters/np_cnp_sent" 33 | conversion = "int" 34 | 35 | [[inputs.multifile.file]] 36 | file = "mlx5_4/ports/1/hw_counters/rp_cnp_handled" 37 | conversion = "int" 38 | 39 | [[inputs.multifile.file]] 40 | file = "mlx5_4/ports/1/hw_counters/rp_cnp_ignored" 41 | conversion = "int" 42 | 43 | [[inputs.multifile.file]] 44 | file = "mlx5_4/ports/1/hw_counters/rx_icrc_encapsulated" 45 | conversion = "int" 46 | 47 | [[inputs.multifile.file]] 48 | file = "mlx5_4/ports/1/hw_counters/roce_slow_restart" 49 | conversion = "int" 50 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/infiniband_mlx5_5_hw_counters.conf.j2: -------------------------------------------------------------------------------- 1 | [[inputs.multifile]] 2 | name_override = "infiniband_mlx5_5_hw_counters" 3 | base_dir = "/sys/class/infiniband" 4 | interval = "300s" 5 | 6 | [[inputs.multifile.tags]] 7 | device="mlx5_5" 8 | port="1" 9 | type="hw_counters" 10 | 11 | [[inputs.multifile.file]] 12 | file = "mlx5_5/ports/1/hw_counters/np_ecn_marked_roce_packets" 13 | conversion = "int" 14 | 15 | [[inputs.multifile.file]] 16 | file = "mlx5_5/ports/1/hw_counters/out_of_sequence" 17 | conversion = "int" 18 | 19 | [[inputs.multifile.file]] 20 | file = "mlx5_5/ports/1/hw_counters/packet_seq_err" 21 | conversion = "int" 22 | 23 | [[inputs.multifile.file]] 24 | file = "mlx5_5/ports/1/hw_counters/local_ack_timeout_err" 25 | conversion = "int" 26 | 27 | [[inputs.multifile.file]] 28 | file = "mlx5_5/ports/1/hw_counters/roce_adp_retrans" 29 | conversion = "int" 30 | 31 | [[inputs.multifile.file]] 32 | file = "mlx5_5/ports/1/hw_counters/np_cnp_sent" 33 | conversion = "int" 34 | 35 | [[inputs.multifile.file]] 36 | file = "mlx5_5/ports/1/hw_counters/rp_cnp_handled" 37 | conversion = "int" 38 | 39 | [[inputs.multifile.file]] 40 | file = "mlx5_5/ports/1/hw_counters/rp_cnp_ignored" 41 | conversion = "int" 42 | 43 | [[inputs.multifile.file]] 44 | file = "mlx5_5/ports/1/hw_counters/rx_icrc_encapsulated" 45 | conversion = "int" 46 | 47 | [[inputs.multifile.file]] 48 | file = "mlx5_5/ports/1/hw_counters/roce_slow_restart" 49 | conversion = "int" 50 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/influxdb.conf.j2: -------------------------------------------------------------------------------- 1 | [[outputs.influxdb]] 2 | urls = ["http://{{ hostvars[groups['controller'][0]]['ansible_fqdn'] }}:8086"] 3 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/net.conf.j2: -------------------------------------------------------------------------------- 1 | [[inputs.net]] 2 | # ## By default, telegraf gathers stats from any up 
interface (excluding loopback) 3 | # ## Setting interfaces will tell it to gather these explicit interfaces, 4 | # ## regardless of status. 5 | # ## 6 | # # interfaces = ["eth0"] 7 | # ## 8 | # ## On linux systems telegraf also collects protocol stats. 9 | # ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. 10 | # ## 11 | ignore_protocol_stats = true 12 | # ## 13 | -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/nvidia_gpu.conf.j2: -------------------------------------------------------------------------------- 1 | [[inputs.nvidia_smi]] -------------------------------------------------------------------------------- /playbooks/roles/telegraf/templates/prometheus.conf.j2: -------------------------------------------------------------------------------- 1 | [[outputs.prometheus_client]] 2 | listen = ":9273" 3 | expiration_interval = "60s" -------------------------------------------------------------------------------- /playbooks/roles/tuned/files/tuned.conf: -------------------------------------------------------------------------------- 1 | [main] 2 | summary=Perf tuning for common GPU workloads 3 | 4 | [cpu] 5 | force_latency=1 6 | governor=performance 7 | energy_perf_bias=performance 8 | min_perf_pct=100 9 | 10 | [vm] 11 | transparent_huge_pages=never 12 | 13 | [sysctl] 14 | net.ipv4.tcp_timestamps=1 15 | net.ipv4.tcp_sack=1 16 | net.ipv4.tcp_dsack=1 17 | net.ipv4.tcp_low_latency=1 18 | net.ipv4.tcp_adv_win_scale=2 19 | net.ipv4.tcp_window_scaling=1 20 | net.ipv4.tcp_slow_start_after_idle=0 21 | net.ipv4.tcp_syn_retries=8 22 | net.ipv4.tcp_rmem=4096 87380 16777216 23 | net.ipv4.tcp_wmem=4096 65536 16777216 24 | net.core.rmem_max=16777216 25 | net.core.wmem_max=16777216 26 | net.core.rmem_default=16777216 27 | net.core.wmem_default=16777216 28 | net.core.optmem_max=16777216 29 | net.core.somaxconn = 8192 30 | net.core.netdev_max_backlog=250000 31 | sunrpc.udp_slot_table_entries=128 32 | sunrpc.tcp_slot_table_entries=128 33 | kernel.sysrq = 1 34 | kernel.sched_min_granularity_ns = 10000000 35 | kernel.sched_wakeup_granularity_ns = 15000000 36 | vm.min_free_kbytes = 16777216 37 | vm.dirty_ratio = 30 38 | vm.dirty_background_ratio = 10 39 | vm.swappiness=30 40 | -------------------------------------------------------------------------------- /playbooks/roles/tuned/tasks/el-7.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Ensure tuned profile directory exists 4 | become: true 5 | file: 6 | path='/usr/lib/tuned/oci-network-performance' 7 | state=directory 8 | 9 | - name: Copy profile file 10 | become: true 11 | copy: 12 | src: tuned.conf 13 | dest: "/usr/lib/tuned/oci-network-performance/tuned.conf" 14 | 15 | - name: Start profile 16 | become: true 17 | shell: tuned-adm profile oci-network-performance 18 | -------------------------------------------------------------------------------- /playbooks/roles/tuned/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el-7.yml 2 | when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' and (shape == 'BM.GPU.B4.8' or shape == 'BM.GPU4.8' or shape == 'BM.GPU.A100-v2.8' or shape == 'BM.GPU.H100.8' or shape == 'BM.GPU.H200.8') 3 | -------------------------------------------------------------------------------- /playbooks/roles/yaml/tasks/el.yml: 
-------------------------------------------------------------------------------- 1 | --- 2 | - name: Download the yq release archive 3 | get_url: 4 | url: "https://github.com/mikefarah/yq/releases/download/v4.9.6/yq_linux_amd64.tar.gz" 5 | dest: "/tmp" 6 | retries: 10 7 | delay: 10 8 | register: result 9 | until: result is not failed 10 | - name: untar yq 11 | unarchive: 12 | src: "/tmp/yq_linux_amd64.tar.gz" 13 | dest: "/tmp" 14 | - name: Install yq binary 15 | become: true 16 | copy: 17 | src: "/tmp/yq_linux_amd64" 18 | dest: "/usr/bin/yq" 19 | mode: "0755" 20 | 21 | -------------------------------------------------------------------------------- /playbooks/roles/yaml/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: el.yml 2 | when: ansible_os_family == 'RedHat' 3 | - include_tasks: ubuntu.yml 4 | when: ansible_os_family == 'Debian' -------------------------------------------------------------------------------- /playbooks/roles/yaml/tasks/ubuntu.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Download the yq release archive 3 | get_url: 4 | url: "https://github.com/mikefarah/yq/releases/download/v4.9.6/yq_linux_amd64.tar.gz" 5 | dest: "/tmp" 6 | retries: 10 7 | delay: 10 8 | register: result 9 | until: result is not failed 10 | - name: untar yq 11 | unarchive: 12 | src: "/tmp/yq_linux_amd64.tar.gz" 13 | dest: "/tmp" 14 | - name: Install yq binary 15 | become: true 16 | copy: 17 | src: "/tmp/yq_linux_amd64" 18 | dest: "/usr/bin/yq" 19 | mode: "0755" 20 | 21 | -------------------------------------------------------------------------------- /playbooks/slurm_config.yml: -------------------------------------------------------------------------------- 1 | - hosts: controller,slurm_backup,compute,login, monitoring 2 | gather_facts: true 3 | vars: 4 | destroy: false 5 | initial: true 6 | download_path: "{{ nfs_target_path if create_fss | bool else ( cluster_nfs_path if cluster_nfs|bool else '/tmp') }}" 7 | enroot_top_path: "{{ nvme_path }}/enroot/" 8 | vars_files: 9 | - "/opt/oci-hpc/conf/queues.conf" 10 | tasks: 11 | - include_role: 12 | name: slurm 13 | when: slurm|default(true)|bool -------------------------------------------------------------------------------- /playbooks/slurm_config_as.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | gather_facts: true 3 | tasks: 4 | - debug: 5 | msg: "Gathering facts" 6 | - hosts: compute, slurm_backup 7 | gather_facts: true 8 | vars: 9 | destroy: false 10 | initial: false 11 | download_path: "{{ cluster_nfs_path if cluster_nfs|bool else '/tmp' }}" 12 | enroot_top_path: "{{ nvme_path }}/enroot/" 13 | vars_files: 14 | - "/opt/oci-hpc/conf/queues.conf" 15 | tasks: 16 | - include_role: 17 | name: slurm 18 | when: slurm|default(false)|bool -------------------------------------------------------------------------------- /provider.tf: -------------------------------------------------------------------------------- 1 | #provider "oci" { 2 | #tenancy_ocid = "${var.tenancy_ocid}" 3 | #user_ocid = "${var.user_ocid}" 4 | #fingerprint = "${var.fingerprint}" 5 | #private_key_path = "${var.private_key_path}" 6 | #region = "${var.region}" 7 | #} 8 | -------------------------------------------------------------------------------- /samples/NCCL_readme: -------------------------------------------------------------------------------- 1 | To run an NCCL test, run the following commands: 2 | chmod 775 /opt/oci-hpc/samples/prep_sample_files.sh 3 |
/opt/oci-hpc/samples/prep_sample_files.sh 4 | 5 | SSH to one of the compute nodes and run: ~/compile.sh 6 | 7 | From the controller, you can edit the third line of /home/opc/nccl_run_allreduce.sbatch with the number of nodes that you would like to test on: 8 | sbatch /home/opc/nccl_run_allreduce.sbatch 9 | 10 | Look at the last line of the log for bandwidth. 11 | 12 | -------------------------------------------------------------------------------- /samples/disable_ht.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # argument: 1="0" turn off hyper threading, "1" turn it on. 4 | THREADS=`lscpu | grep -E '^Thread|^Core|^Socket|^CPU\(' | head -1 | awk '{ print $2 }'` 5 | CORES=`expr $THREADS / 2` 6 | 7 | if [[ $# -ne 1 ]]; then 8 | echo 'One argument required. 0 to turn off hyper-threading or' 9 | echo '1 to turn hyper-threading back on' 10 | exit 1 11 | fi 12 | 13 | echo Thread pairs before change 14 | cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | sort --unique --numeric-sort 15 | echo 16 | 17 | for k in `seq $CORES $THREADS`; do 18 | echo $1 > /sys/devices/system/cpu/cpu$k/online; 19 | done 20 | 21 | grep "" /sys/devices/system/cpu/cpu*/topology/core_id 22 | 23 | grep -q '^flags.*[[:space:]]ht[[:space:]]' /proc/cpuinfo && \ 24 | echo "Hyper-threading is supported" 25 | 26 | grep -E 'model|stepping' /proc/cpuinfo | sort -u 27 | 28 | echo Thread pairs after change 29 | cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | sort --unique --numeric-sort 30 | echo -------------------------------------------------------------------------------- /samples/gpu/ifdown.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NICS="" 4 | shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` 5 | if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] 6 | then 7 | NICS="enp71s0f0 enp71s0f1 enp75s0f0 enp75s0f1 enp12s0f0 enp12s0f1 enp22s0f0 enp22s0f1 enp195s0f0 enp195s0f1 enp209s0f0 enp209s0f1 enp137s0f0 enp137s0f1 enp147s0" 8 | elif [ $shape == \"BM.GPU4.8\" ] 9 | then 10 | NICS="enp72s0f0 enp72s0f1 enp76s0f0 enp76s0f1 enp12s0f0 enp12s0f1 enp22s0f0 enp22s0f1 enp195s0f0 enp195s0f1 enp209s0f0 enp209s0f1 enp138s0f0 enp138s0f1 enp148s0f0 enp148s0f1" 11 | fi 12 | echo $NICS 13 | 14 | for NIC in $NICS; do 15 | echo "running ifdown $NIC ..." 16 | sudo ifdown $NIC 17 | done 18 | 19 | -------------------------------------------------------------------------------- /samples/gpu/ifup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NICS="" 4 | shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` 5 | if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] 6 | then 7 | NICS="enp71s0f0 enp71s0f1 enp75s0f0 enp75s0f1 enp12s0f0 enp12s0f1 enp22s0f0 enp22s0f1 enp195s0f0 enp195s0f1 enp209s0f0 enp209s0f1 enp137s0f0 enp137s0f1 enp147s0" 8 | elif [ $shape == \"BM.GPU4.8\" ] 9 | then 10 | NICS="enp72s0f0 enp72s0f1 enp76s0f0 enp76s0f1 enp12s0f0 enp12s0f1 enp22s0f0 enp22s0f1 enp195s0f0 enp195s0f1 enp209s0f0 enp209s0f1 enp138s0f0 enp138s0f1 enp148s0f0 enp148s0f1" 11 | fi 12 | echo $NICS 13 | 14 | for NIC in $NICS; do 15 | echo "running ifup $NIC ..." 
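  # Assumes the legacy network-scripts / ifupdown tooling provides ifup and ifdown on this
  # image; hosts that only use NetworkManager or netplan need the equivalent command instead.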
16 | sudo ifup $NIC 17 | done 18 | 19 | -------------------------------------------------------------------------------- /samples/gpu/no_ncclparam_nccl_run_allreduce_H100_H200.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=nccl-allreduce-slurm 3 | #SBATCH --nodes=2 4 | #SBATCH --gpus-per-node=8 5 | #SBATCH --ntasks-per-node=8 6 | #SBATCH --exclusive 7 | export PMI_DEBUG=1 8 | 9 | 10 | cd /nfs/cluster 11 | mkdir $SLURM_JOB_ID 12 | cd $SLURM_JOB_ID 13 | 14 | MACHINEFILE="hostfile" 15 | 16 | scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE 17 | echo MACHINEFILE 18 | cat $MACHINEFILE 19 | 20 | source /etc/os-release 21 | 22 | mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` 23 | 24 | if [[ "$mpivars_path" == "" ]]; then 25 | mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` 26 | fi 27 | 28 | if [[ "$mpivars_path" == "" ]]; then 29 | echo "Could not find MPIPATH"; exit; fi 30 | 31 | source $mpivars_path 32 | 33 | shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` 34 | if [ $shape == \"BM.GPU.H100.8\" ] || [ $shape == \"BM.GPU.H200.8\" ] 35 | then 36 | var_UCX_NET_DEVICES=eth0 37 | else 38 | echo "Use the appropriate nccl test run script for non H100 nodes" 39 | fi 40 | 41 | # all NCCL parameters are at /etc/nccl.conf on each compute node. 42 | mpirun --mca pml ucx \ 43 | --bind-to numa \ 44 | -npernode 8 \ 45 | --mca coll ^hcoll \ 46 | -x HCOLL_ENABLE_MCAST_ALL=0 \ 47 | -x coll_hcoll_enable=0 \ 48 | -x UCX_TLS=tcp \ 49 | -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ 50 | -x RX_QUEUE_LEN=8192 \ 51 | -x IB_RX_QUEUE_LEN=8192 \ 52 | --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 -------------------------------------------------------------------------------- /samples/gpu/notes.txt: -------------------------------------------------------------------------------- 1 | # QFAB1.0 2 | 3 | mpirun --mca pml ucx \ 4 | --bind-to numa \ 5 | -x NCCL_DEBUG=WARN \ 6 | -x NCCL_IB_SL=0 \ 7 | -x NCCL_IB_TC=41 \ 8 | -x NCCL_IB_QPS_PER_CONNECTION=16 \ 9 | -x UCX_TLS=ud,self,sm \ 10 | -x UCX_NET_DEVICES=mlx5_4:1 \ 11 | -x HCOLL_ENABLE_MCAST_ALL=0 \ 12 | -x coll_hcoll_enable=0 \ 13 | -x NCCL_IB_GID_INDEX=3 \ 14 | -x NCCL_ALGO=Ring \ 15 | -x NCCL_TOPO_FILE=/home/opc/topo-flattened.xml \ 16 | -x NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16" \ 17 | --np $np --hostfile $hostfile -N 8 /home/opc/nccl-tests/build/all_reduce_perf -b8 -e 4G -f 2 -n $iter >> $logfile 18 | 19 | -------------------------------------------------------------------------------- /samples/gpu/ping.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | f=$(mktemp) 3 | HOST=$1 4 | ssh $HOST /usr/sbin/ip -j addr | jq -r '.[] | select(.ifname | test("rdma")) | .ifname + " " + .addr_info[0].local' > $f 5 | while read -r l ; do 6 | i=$(echo $l | awk '{print $1}') 7 | ip=$(echo $l | awk '{print $2}') 8 | ping -qI $i $ip -c1 ; done < $f 9 | rm -rf $f -------------------------------------------------------------------------------- /samples/gpu/rccl_run_allreduce.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=nccl-allreduce-slurm 3 | #SBATCH --nodes=2 4 | #SBATCH --gpus-per-node=8 5 | #SBATCH --ntasks-per-node=8 6 | #SBATCH --exclusive 7 | export PMI_DEBUG=1 8 | 9 | 10 
| cd /nfs/cluster 11 | mkdir $SLURM_JOB_ID 12 | cd $SLURM_JOB_ID 13 | 14 | MACHINEFILE="hostfile" 15 | 16 | scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE 17 | echo MACHINEFILE 18 | cat $MACHINEFILE 19 | 20 | source /etc/os-release 21 | 22 | mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` 23 | 24 | if [[ "$mpivars_path" == "" ]]; then 25 | mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` 26 | fi 27 | 28 | if [[ "$mpivars_path" == "" ]]; then 29 | echo "Could not find MPIPATH"; exit; fi 30 | 31 | source $mpivars_path 32 | 33 | var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_7,mlx5_8,mlx5_9" 34 | 35 | 36 | mpirun --mca pml ucx \ 37 | --bind-to numa \ 38 | -x UCX_NET_DEVICES=mlx5_0:1 \ 39 | -x NCCL_SOCKET_IFNAME=eth0 \ 40 | -x NCCL_IB_SL=0 \ 41 | -x NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_7,mlx5_8,mlx5_9" \ 42 | -x coll_hcoll_enable=0 \ 43 | -x HCOLL_ENABLE_MCAST_ALL=0 \ 44 | -x NCCL_IGNORE_CPU_AFFINITY=1 \ 45 | -x NCCL_IB_QPS_PER_CONNECTION=4 \ 46 | -x RX_QUEUE_LEN=8192 \ 47 | -x IB_RX_QUEUE_LEN=8192 \ 48 | -np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/rccl-tests/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 -------------------------------------------------------------------------------- /samples/gpu/srun_examples_with_container.txt: -------------------------------------------------------------------------------- 1 | # assuming nodes have enroot and pyxis configured and ngc container downloaded and sqsh created. 2 | srun --container-name=pytorch --container-image=nvcr.io+nvidia+pytorch+22.04-py3.sqsh bash -c "cat /etc/*rel*" 3 | 4 | # non-NGC container 5 | enroot create -n pyxis_ubuntu /home/opc/ubuntu.sqsh 6 | enroot start pyxis_ubuntu 7 | srun --container-name=ubuntu bash -c "cat /etc/*rel*" 8 | 9 | -------------------------------------------------------------------------------- /samples/gpu/update_arp_settings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # arp change for single subnet for all RoCE NICs only. 4 | links=$(/sbin/ip link | grep -e "enp.*s0f*" | grep -v enp45 | awk -F":" "{print \$2}") 5 | parameters="rp_filter=2 arp_ignore=2 arp_announce=1" 6 | 7 | for link in ${links} 8 | do 9 | for param in ${parameters} 10 | do 11 | echo "${link} current: " $(sudo sysctl net.ipv4.conf.${link}.${param}) 12 | sudo sysctl -w net.ipv4.conf.${link}.${param} 13 | done 14 | done 15 | 16 | 17 | # Permanent/Persistent change 18 | # arp change for single subnet for all RoCE NICs only. 19 | links=$(/sbin/ip link | grep -e "enp.*s0f*" | grep -v enp45 | awk -F":" "{print \$2}") 20 | parameters="rp_filter=2 arp_ignore=2 arp_announce=1" 21 | 22 | sysctl_network_conf="/etc/sysctl.d/80-network.conf" 23 | less $sysctl_network_conf | grep "rp_filter=2" 24 | if [ $? 
-------------------------------------------------------------------------------- /samples/gpu/update_arp_settings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # arp change for single subnet for all RoCE NICs only. 4 | links=$(/sbin/ip link | grep -e "enp.*s0f*" | grep -v enp45 | awk -F":" "{print \$2}") 5 | parameters="rp_filter=2 arp_ignore=2 arp_announce=1" 6 | 7 | for link in ${links} 8 | do 9 | for param in ${parameters} 10 | do 11 | echo "${link} current: " $(sudo sysctl net.ipv4.conf.${link}.${param}) 12 | sudo sysctl -w net.ipv4.conf.${link}.${param} 13 | done 14 | done 15 | 16 | 17 | # Permanent/Persistent change 18 | # arp change for single subnet for all RoCE NICs only. 19 | links=$(/sbin/ip link | grep -e "enp.*s0f*" | grep -v enp45 | awk -F":" "{print \$2}") 20 | parameters="rp_filter=2 arp_ignore=2 arp_announce=1" 21 | 22 | sysctl_network_conf="/etc/sysctl.d/80-network.conf" 23 | grep "rp_filter=2" $sysctl_network_conf 24 | if [ $? -ne 0 ]; then 25 | for link in ${links} 26 | do 27 | for param in ${parameters} 28 | do 29 | echo "net.ipv4.conf.${link}.${param}" | sudo tee -a $sysctl_network_conf 30 | done 31 | done 32 | fi 33 | 34 | -------------------------------------------------------------------------------- /samples/gpu/update_netmask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sed -i 's|NETMASK=255.255.252.0|NETMASK=255.255.0.0|g' /etc/sysconfig/network-scripts/ifcfg-enp* 4 | 5 | -------------------------------------------------------------------------------- /samples/nccl_compile/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run on 1 GPU node only 4 | 5 | mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` 6 | 7 | if [[ "$mpivars_path" == "" ]]; then 8 | mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` 9 | fi 10 | 11 | if [[ "$mpivars_path" == "" ]]; then 12 | echo "Could not find mpivars.sh"; exit 1; fi 13 | 14 | source $mpivars_path 15 | MPI_HOME=${mpivars_path%%/bin*} 16 | 17 | USER=`whoami` 18 | 19 | cd /home/$USER 20 | rm -rf nccl-tests 21 | git clone https://github.com/NVIDIA/nccl-tests.git 22 | cd nccl-tests/ 23 | make MPI=1 MPI_HOME=$MPI_HOME CUDA_HOME=/usr/local/cuda 24 | 25 | -------------------------------------------------------------------------------- /samples/nfs/README.txt: -------------------------------------------------------------------------------- 1 | Problem: 2 | When the node running the NFS server has to be terminated due to a hardware failure, the site.yml playbook fails because sudo umount /nfs/scratch hangs. 3 | 4 | Solution: 5 | 1. Manually change the ansible inventory file (/etc/ansible/hosts) on the controller. You will need to use sudo. 6 | a. Replace the hostname in the [nfs] group with another node of the cluster that will act as the NFS server. Example: <hostname> ansible_user=opc role=nfs 7 | b. If the node that was deleted is still listed in the [compute_configured] group, remove it. 8 | 2. Run the script fix_nfs.sh. 9 | 10 | 
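A hypothetical illustration of step 1a above (the hostname is a placeholder, not taken from the repository): if compute-1-2 is the node chosen to take over as the NFS server, the [nfs] group in /etc/ansible/hosts would read:
[nfs]
compute-1-2 ansible_user=opc role=nfs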
-------------------------------------------------------------------------------- /samples/nfs/fix_nfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /opt/oci-hpc/samples/nfs 4 | sinfo -hNr -o "%N" > machinefile 5 | 6 | sudo umount -l /nfs/scratch 7 | 8 | pssh -i -h /opt/oci-hpc/samples/nfs/machinefile 'sudo umount -l /nfs/scratch' 9 | 10 | sudo sed -i_bak -e "/ \/nfs\/scratch/d" /etc/fstab 11 | 12 | pssh -i -h /opt/oci-hpc/samples/nfs/machinefile 'sudo sed -i_bak -e "/ \/nfs\/scratch/d" /etc/fstab' 13 | 14 | ansible-playbook /opt/oci-hpc/playbooks/site.yml 15 | 16 | -------------------------------------------------------------------------------- /samples/open-ldap/add-linux-group.yml: -------------------------------------------------------------------------------- 1 | 2 | - hosts: all 3 | become: true 4 | tasks: 5 | - name: create groupname group (gid 9876) 6 | group: 7 | name: groupname 8 | gid: 9876 9 | state: present 10 | - name: allow groupname group to have passwordless sudo 11 | lineinfile: 12 | dest: /etc/sudoers 13 | state: present 14 | regexp: '^%groupname' 15 | line: '%groupname ALL=(ALL) NOPASSWD: ALL' 16 | validate: 'visudo -cf %s' 17 | - name: add opc user to groupname group 18 | user: 19 | name: opc 20 | groups: groupname 21 | append: yes 22 | 23 | - hosts: compute 24 | become: true 25 | tasks: 26 | - name: update permission to allow groupname rw access /tmp/enroot-data directory 27 | file: 28 | path: "/tmp/enroot-data" 29 | state: directory 30 | mode: '0775' 31 | owner: opc 32 | group: groupname 33 | recurse: no 34 | -------------------------------------------------------------------------------- /samples/prep_sample_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /opt/oci-hpc/samples/ 4 | for directory in `ls -d */ ` ; 5 | do 6 | echo $directory 7 | sudo chmod +x $directory/*.sh 8 | done; 9 | 10 | cp nccl_compile/compile.sh ~ 11 | cp gpu/*.sbatch ~ 12 | cp gpu/H100* ~ 13 | cp /opt/oci-hpc/bin/node_ordering_by_rack.py ~ 14 | 15 | -------------------------------------------------------------------------------- /samples/rdma-tuning/check_pcie_max_read.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # https://support.mellanox.com/s/article/understanding-pcie-configuration-for-maximum-performance 4 | # 5 | 6 | PCI_DEVICES_48="" 7 | shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` 8 | if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] 9 | then 10 | PCI_DEVICES_48="0c:00.0 0c:00.1 16:00.0 16:00.1 47:00.0 47:00.1 4b:00.0 4b:00.1 89:00.0 89:00.1 93:00.0 93:00.1 c3:00.0 c3:00.1 d1:00.0 d1:00.1" 11 | elif [ $shape == \"BM.GPU4.8\" ] 12 | then 13 | PCI_DEVICES_48="0c:00.0 0c:00.1 16:00.0 16:00.1 48:00.0 48:00.1 4c:00.0 4c:00.1 8a:00.0 8a:00.1 94:00.0 94:00.1 c3:00.0 c3:00.1 d1:00.0 d1:00.1" 14 | fi 15 | 16 | # 17 | for d in ${PCI_DEVICES_48} 18 | do 19 | echo ${d} 20 | sudo lspci -s ${d} -vvv | grep MaxRead 21 | done -------------------------------------------------------------------------------- /samples/rdma-tuning/pcie_max_read.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # https://support.mellanox.com/s/article/understanding-pcie-configuration-for-maximum-performance 4 | # 5 | PCI_DEVICES_48="" 6 | shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` 7 | if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] 8 | then 9 | PCI_DEVICES_48="0c:00.0 0c:00.1 16:00.0 16:00.1 47:00.0 47:00.1 4b:00.0 4b:00.1 89:00.0 89:00.1 93:00.0 93:00.1 c3:00.0 c3:00.1 d1:00.0 d1:00.1" 10 | elif [ $shape == \"BM.GPU4.8\" ] 11 | then 12 | PCI_DEVICES_48="0c:00.0 0c:00.1 16:00.0 16:00.1 48:00.0 48:00.1 4c:00.0 4c:00.1 8a:00.0 8a:00.1 94:00.0 94:00.1 c3:00.0 c3:00.1 d1:00.0 d1:00.1" 13 | fi 14 | 15 | # 16 | for d in ${PCI_DEVICES_48} 17 | do 18 | echo
${d} 19 | sudo lspci -s ${d} -vvv | grep MaxRead 20 | sudo setpci -s ${d} 68.w 21 | # 2937 is the default 22 | # 2 is 512B 23 | # 5 is 4096B 24 | OUT=`sudo setpci -s ${d} 68.w | cut -c2-5` 25 | echo sudo setpci -s ${d} 68.w=5${OUT} 26 | sudo setpci -s ${d} 68.w=5${OUT} 27 | sudo lspci -s ${d} -vvv | grep MaxRead 28 | sudo setpci -s ${d} 68.w 29 | echo 30 | done -------------------------------------------------------------------------------- /samples/rdma-tuning/pcie_max_read_default.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # https://support.mellanox.com/s/article/understanding-pcie-configuration-for-maximum-performance 4 | # 5 | 6 | PCI_DEVICES_48="" 7 | shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` 8 | if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] 9 | then 10 | PCI_DEVICES_48="0c:00.0 0c:00.1 16:00.0 16:00.1 47:00.0 47:00.1 4b:00.0 4b:00.1 89:00.0 89:00.1 93:00.0 93:00.1 c3:00.0 c3:00.1 d1:00.0 d1:00.1" 11 | elif [ $shape == \"BM.GPU4.8\" ] 12 | then 13 | PCI_DEVICES_48="0c:00.0 0c:00.1 16:00.0 16:00.1 48:00.0 48:00.1 4c:00.0 4c:00.1 8a:00.0 8a:00.1 94:00.0 94:00.1 c3:00.0 c3:00.1 d1:00.0 d1:00.1" 14 | fi 15 | 16 | # 17 | for d in ${PCI_DEVICES_48} 18 | do 19 | echo ${d} 20 | sudo lspci -s ${d} -vvv | grep MaxRead 21 | sudo setpci -s ${d} 68.w 22 | # 2937 is the default 23 | # 2 is 512B 24 | # 5 is 4096B 25 | OUT=`sudo setpci -s ${d} 68.w | cut -c2-5` 26 | echo sudo setpci -s ${d} 68.w=2${OUT} 27 | sudo setpci -s ${d} 68.w=2${OUT} 28 | sudo lspci -s ${d} -vvv | grep MaxRead 29 | sudo setpci -s ${d} 68.w 30 | echo 31 | done -------------------------------------------------------------------------------- /samples/rdma-tuning/rdma-nic-tuning-1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | sudo mst start 4 | 5 | BUSIDS="" 6 | 7 | shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` 8 | if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] 9 | then 10 | BUSIDS="0000:0c:00.0 0000:16:00.0 0000:47:00.0 0000:4b:00.0 0000:89:00.0 0000:93:00.0 0000:c3:00.0 0000:d1:00.0" 11 | elif [ $shape == \"BM.GPU4.8\" ] 12 | then 13 | BUSIDS="0000:48:00.0 0000:c3:00.0 0000:d1:00.0 0000:8a:00.0 0000:94:00.0 0000:4c:00.0 0000:0c:00.0 0000:16:00.0" 14 | fi 15 | 16 | for BUS in $BUSIDS; do 17 | sudo mlxconfig -d $BUS -y set ADVANCED_PCI_SETTINGS=1 18 | sudo mstfwreset -d $BUS r -y -l 3 19 | sudo mlxconfig -d $BUS query ADVANCED_PCI_SETTINGS 20 | done 21 | 22 | -------------------------------------------------------------------------------- /samples/rdma-tuning/rdma-nic-tuning-2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | sudo mst start 4 | 5 | BUSIDS="" 6 | 7 | shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` 8 | if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] 9 | then 10 | BUSIDS="0000:0c:00.0 0000:16:00.0 0000:47:00.0 0000:4b:00.0 0000:89:00.0 0000:93:00.0 0000:c3:00.0 0000:d1:00.0" 11 | elif [ $shape == \"BM.GPU4.8\" ] 12 | then 13 | BUSIDS="0000:48:00.0 0000:c3:00.0 0000:d1:00.0 0000:8a:00.0 0000:94:00.0 0000:4c:00.0 0000:0c:00.0 0000:16:00.0" 14 | fi 15 | 16 | #echo "********************** Updating firmware settings **********************" 17 | for BUS in $BUSIDS; do 18 | sudo 
mlxconfig -d $BUS -y set MAX_ACC_OUT_READ=44 19 | sudo mlxconfig -d $BUS query MAX_ACC_OUT_READ 20 | done 21 | -------------------------------------------------------------------------------- /samples/rdma-tuning/rdma-nic-tuning-validate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | sudo mst start 4 | 5 | BUSIDS="" 6 | 7 | shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` 8 | if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] 9 | then 10 | BUSIDS="0000:0c:00.0 0000:16:00.0 0000:47:00.0 0000:4b:00.0 0000:89:00.0 0000:93:00.0 0000:c3:00.0 0000:d1:00.0" 11 | elif [ $shape == \"BM.GPU4.8\" ] 12 | then 13 | BUSIDS="0000:48:00.0 0000:c3:00.0 0000:d1:00.0 0000:8a:00.0 0000:94:00.0 0000:4c:00.0 0000:0c:00.0 0000:16:00.0" 14 | fi 15 | 16 | #echo "********************** Updating firmware settings **********************" 17 | for BUS in $BUSIDS; do 18 | sudo mlxconfig -d $BUS query ADVANCED_PCI_SETTINGS 19 | sudo mlxconfig -d $BUS query MAX_ACC_OUT_READ 20 | done 21 | -------------------------------------------------------------------------------- /samples/submit/sleep.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #SBATCH -n 72 3 | #SBATCH --ntasks-per-node 36 4 | #SBATCH --exclusive 5 | #SBATCH --job-name sleep_job 6 | #SBATCH --constraint hpc-default 7 | 8 | 9 | cd /nfs/scratch 10 | mkdir $SLURM_JOB_ID 11 | cd $SLURM_JOB_ID 12 | MACHINEFILE="hostfile" 13 | 14 | # Generate Machinefile for mpi such that hosts are in the same 15 | # order as if run via srun 16 | # 17 | #srun -N$SLURM_NNODES -n$SLURM_NNODES hostname > $MACHINEFILE 18 | scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE 19 | sed -i "s/$/:${SLURM_NTASKS_PER_NODE}/" $MACHINEFILE 20 | 21 | cat $MACHINEFILE 22 | # Run using generated Machine file: 23 | sleep 1000 24 | -------------------------------------------------------------------------------- /samples/submit/sleep_gpu.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #SBATCH --gpus 2 3 | #SBATCH --job-name sleep_gpu 4 | #SBATCH --constraint p100 5 | #SBATCH --partition compute2 6 | 7 | cd /nfs/scratch 8 | mkdir $SLURM_JOB_ID 9 | cd $SLURM_JOB_ID 10 | 11 | for i in 1 2 3 4 5; do 12 | srun --gpus 1 sleep 5 & 13 | done 14 | # Run using generated Machine file: 15 | wait 16 | 17 | sleep 5 -------------------------------------------------------------------------------- /scripts/check_firmware_version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # check_firmware_version.sh 3 | 4 | # Script to check the firmware version on the nodes. 5 | # Needs one argument which is a hostfile (one host per line). 
6 | # Example: ./check_firmware_version.sh hosts 7 | 8 | # check if host file is passed 9 | if [ -n "$1" ]; then 10 | HOST_FILE=$1 11 | else 12 | echo "Usage: $0 <hostfile>" 13 | echo "host file is missing, pass a file with a list of hostnames, one host per line" 14 | exit 1; 15 | fi 16 | 17 | # check if ubuntu or oracle 18 | source /etc/os-release 19 | 20 | if [ $ID == "ol" ] ; then 21 | echo "oracle" 22 | USERNAME=opc 23 | fi 24 | 25 | if [ $ID == "ubuntu" ] ; then 26 | echo "ubuntu" 27 | USERNAME=ubuntu 28 | fi 29 | 30 | for h in `cat $HOST_FILE` ; 31 | do 32 | echo $h 33 | ssh $USERNAME@$h "/usr/sbin/ibstat | grep 'Firmware version'" 34 | done -------------------------------------------------------------------------------- /scripts/collect_metadata/requirements.txt: -------------------------------------------------------------------------------- 1 | paramiko 2 | -------------------------------------------------------------------------------- /scripts/gpu_throttle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | /usr/bin/nvidia-smi --query-gpu=timestamp,pci.bus,utilization.gpu,utilization.memory,temperature.gpu,power.draw,clocks.mem,clocks.gr,clocks_throttle_reasons.sw_power_cap,clocks_throttle_reasons.hw_thermal_slowdown,clocks_throttle_reasons.hw_power_brake_slowdown,clocks_throttle_reasons.sw_thermal_slowdown,clocks_throttle_reasons.sync_boost,clocks_throttle_reasons.active --format=csv 5 | 6 | -------------------------------------------------------------------------------- /scripts/pcie.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /etc/os-release 4 | if [ $ID == "ol" ] || [ $ID == "centos" ] ; then 5 | for dev in `/usr/sbin/lspci | grep ConnectX-5 | awk '{print $1}'` 6 | do 7 | echo ${dev} 8 | sudo lspci -vvv -s ${dev} | grep LnkSta: 9 | done 10 | elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then 11 | for dev in `/usr/bin/lspci | grep ConnectX-5 | awk '{print $1}'` 12 | do 13 | echo ${dev} 14 | sudo lspci -vvv -s ${dev} | grep LnkSta: 15 | done 16 | fi 17 | 18 | -------------------------------------------------------------------------------- /user_data.tf: -------------------------------------------------------------------------------- 1 | data "template_file" "controller_config" { 2 | template = file("config.controller") 3 | vars = { 4 | key = tls_private_key.ssh.private_key_pem 5 | } 6 | } 7 | 8 | data "template_file" "config" { 9 | template = file("config.hpc") 10 | } 11 | 12 | 13 | -------------------------------------------------------------------------------- /versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.2" 3 | required_providers { 4 | oci = { 5 | source = "oracle/oci" 6 | version = ">= 6.9.0" 7 | } 8 | local = { 9 | source = "hashicorp/local" 10 | version = ">= 2.1.0" 11 | } 12 | tls = { 13 | source = "hashicorp/tls" 14 | version = ">= 3.0.0" 15 | } 16 | random = { 17 | source = "hashicorp/random" 18 | version = ">= 3.0.0" 19 | } 20 | null = { 21 | source = "hashicorp/null" 22 | version = ">= 3.0.0" 23 | } 24 | template = { 25 | source = "hashicorp/template" 26 | version = ">= 2.2.0" 27 | } 28 | } 29 | } --------------------------------------------------------------------------------