├── .github
    ├── CODEOWNERS
    ├── bin
    │   ├── create-merge-branch.sh
    │   └── get-s3-image.sh
    └── workflows
    │   ├── extra.yml
    │   ├── fatimage.yml
    │   ├── nightly-cleanup.yml
    │   ├── nightlybuild.yml
    │   ├── release-image.yml
    │   ├── s3-image-sync.yml
    │   ├── stackhpc.yml
    │   ├── trivyscan.yml
    │   ├── upgrade-check.yml.sample
    │   └── upload-release-image.yml.sample
├── .gitignore
├── README.md
├── ansible.cfg
├── ansible
    ├── .gitignore
    ├── adhoc
    │   ├── backup-keytabs.yml
    │   ├── cudatests.yml
    │   ├── deploy-pulp.yml
    │   ├── generate-passwords.yml
    │   ├── hpctests.yml
    │   ├── rebuild-via-slurm.yml
    │   ├── rebuild.yml
    │   ├── restart-slurm.yml
    │   ├── sync-pulp.yml
    │   └── update-packages.yml
    ├── bootstrap.yml
    ├── ci
    │   ├── check_eessi.yml
    │   ├── check_grafana.yml
    │   ├── check_sacct_hpctests.yml
    │   ├── check_slurm.yml
    │   ├── delete_images.yml
    │   ├── get_image_ids.yml
    │   ├── library
    │   │   └── grafana_elasticsearch_query.py
    │   ├── output_vars.yml
    │   ├── retrieve_inventory.yml
    │   └── update_timestamps.yml
    ├── cleanup.yml
    ├── disable-repos.yml
    ├── extras.yml
    ├── fatimage.yml
    ├── filesystems.yml
    ├── filter_plugins
    │   └── utils.py
    ├── iam.yml
    ├── library
    │   ├── latest_timestamps.py
    │   └── user_namespace_facts.py
    ├── monitoring.yml
    ├── noop.yml
    ├── portal.yml
    ├── roles
    │   ├── alertmanager
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── handlers
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   ├── configure.yml
    │   │   │   └── install.yml
    │   │   └── templates
    │   │   │   ├── alertmanager-web.yml.j2
    │   │   │   ├── alertmanager.service.j2
    │   │   │   └── alertmanager.yml.j2
    │   ├── basic_users
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── filter_plugins
    │   │   │   └── filter_keys.py
    │   │   ├── library
    │   │   │   └── terminate_user_sessions.py
    │   │   └── tasks
    │   │   │   └── main.yml
    │   ├── block_devices
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── library
    │   │   │   └── block_devices.py
    │   │   └── tasks
    │   │   │   └── main.yml
    │   ├── cacerts
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   ├── configure.yml
    │   │   │   ├── export.yml
    │   │   │   └── main.yml
    │   ├── cluster_infra
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   └── main.yml
    │   │   └── templates
    │   │   │   ├── outputs.tf.j2
    │   │   │   ├── providers.tf.j2
    │   │   │   └── resources.tf.j2
    │   ├── compute_init
    │   │   ├── README.md
    │   │   ├── files
    │   │   │   └── compute-init.yml
    │   │   ├── tasks
    │   │   │   ├── export.yml
    │   │   │   └── install.yml
    │   │   └── templates
    │   │   │   └── hostvars.yml.j2
    │   ├── cuda
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   ├── install.yml
    │   │   │   ├── runtime.yml
    │   │   │   └── samples.yml
    │   ├── dnf_repos
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   ├── disable_repos.yml
    │   │   │   └── set_repos.yml
    │   ├── doca
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   ├── install-kernel-devel.yml
    │   │   │   ├── install.yml
    │   │   │   └── main.yml
    │   ├── eessi
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yaml
    │   │   └── tasks
    │   │   │   └── main.yaml
    │   ├── etc_hosts
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   └── main.yml
    │   │   └── templates
    │   │   │   └── hosts.j2
    │   ├── fail2ban
    │   │   ├── README.md
    │   │   ├── handlers
    │   │   │   └── main.yml
    │   │   ├── meta
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   └── main.yml
    │   │   └── templates
    │   │   │   └── jail.local.j2
    │   ├── filebeat
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── handlers
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   ├── install.yml
    │   │   │   ├── main.yml
    │   │   │   ├── runtime.yml
    │   │   │   └── validate.yml
    │   │   └── templates
    │   │   │   └── filebeat.service.j2
    │   ├── firewalld
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── handlers
    │   │   │   └── main.yml
    │   │   ├── meta
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   ├── install.yml
    │   │   │   ├── main.yml
    │   │   │   └── runtime.yml
    │   │   └── vars
    │   │   │   └── main.yml
    │   ├── freeipa
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   ├── addhost.yml
    │   │   │   ├── backup-keytabs.yml
    │   │   │   ├── client-install.yml
    │   │   │   ├── enrol.yml
    │   │   │   ├── server.yml
    │   │   │   ├── users.yml
    │   │   │   └── validate.yml
    │   ├── gateway
    │   │   ├── README.md
    │   │   ├── files
    │   │   │   └── gateway-init.yml
    │   │   └── tasks
    │   │   │   └── main.yml
    │   ├── grafana-dashboards
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── files
    │   │   │   ├── openhpc-slurm.json
    │   │   │   └── slurm-jobs.json
    │   │   └── tasks
    │   │   │   └── main.yml
    │   ├── hpctests
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── files
    │   │   │   ├── mpi_nxnlatbw.c
    │   │   │   └── plot_imb_pingpong.py
    │   │   ├── handlers
    │   │   │   └── main.yml
    │   │   ├── library
    │   │   │   ├── hpl_pq.py
    │   │   │   ├── plot_nxnlatbw.py
    │   │   │   ├── read_imb_pingpong.py
    │   │   │   └── slurm_node_info.py
    │   │   ├── meta
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   ├── build-hpl.yml
    │   │   │   ├── hpl-solo.yml
    │   │   │   ├── main.yml
    │   │   │   ├── pingmatrix.yml
    │   │   │   ├── pingpong.yml
    │   │   │   └── setup.yml
    │   │   ├── templates
    │   │   │   ├── HPL.dat.j2
    │   │   │   ├── hpl-build.sh.j2
    │   │   │   ├── hpl-solo.sh.j2
    │   │   │   ├── pingmatrix.sh.j2
    │   │   │   └── pingpong.sh.j2
    │   │   ├── tests
    │   │   │   ├── inventory
    │   │   │   └── test.yml
    │   │   └── vars
    │   │   │   └── main.yml
    │   ├── k3s
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   ├── agent-runtime.yml
    │   │   │   ├── install.yml
    │   │   │   └── server-runtime.yml
    │   │   └── templates
    │   │   │   ├── k3s-agent.service.env.j2
    │   │   │   └── k3s.service.env.j2
    │   ├── k9s
    │   │   └── tasks
    │   │   │   └── main.yml
    │   ├── lustre
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   ├── configure.yml
    │   │   │   ├── install.yml
    │   │   │   └── validate.yml
    │   │   └── templates
    │   │   │   └── lnet.conf.j2
    │   ├── mysql
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   ├── configure.yml
    │   │   │   ├── install.yml
    │   │   │   └── main.yml
    │   │   └── templates
    │   │   │   └── mysql.service.j2
    │   ├── ofed
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   ├── install.yml
    │   │   │   └── main.yml
    │   ├── openondemand
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── files
    │   │   │   ├── jupyter_requirements.txt
    │   │   │   └── missing_home_directory.html
    │   │   ├── tasks
    │   │   │   ├── config_changes.yml
    │   │   │   ├── exporter.yml
    │   │   │   ├── jupyter_compute.yml
    │   │   │   ├── main.yml
    │   │   │   ├── pam_auth.yml
    │   │   │   ├── validate.yml
    │   │   │   └── vnc_compute.yml
    │   │   └── templates
    │   │   │   ├── dashboard_app_links.yml.j2
    │   │   │   ├── files_shortcuts.rb.j2
    │   │   │   └── grid-mapfile.j2
    │   ├── opensearch
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── handlers
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   ├── archive_data.yml
    │   │   │   ├── certs.yml
    │   │   │   ├── install.yml
    │   │   │   ├── migrate-opendistro.yml
    │   │   │   └── runtime.yml
    │   │   └── templates
    │   │   │   ├── opensearch.service.j2
    │   │   │   └── opensearch.yml.j2
    │   ├── passwords
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   ├── main.yml
    │   │   │   └── validate.yml
    │   │   └── templates
    │   │   │   └── passwords.yml
    │   ├── persist_hostkeys
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   └── main.yml
    │   ├── persist_openhpc_secrets
    │   │   ├── tasks
    │   │   │   └── main.yml
    │   │   └── templates
    │   │   │   └── openhpc_secrets.fact
    │   ├── podman
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   ├── config.yml
    │   │   │   └── prereqs.yml
    │   ├── proxy
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   └── main.yml
    │   ├── pulp_site
    │   │   ├── .gitignore
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── filter_plugins
    │   │   │   └── pulp-list-filters.py
    │   │   ├── tasks
    │   │   │   ├── install.yml
    │   │   │   └── sync.yml
    │   │   └── templates
    │   │   │   ├── cli.toml.j2
    │   │   │   └── settings.py.j2
    │   ├── rebuild
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   ├── main.yml
    │   │   │   ├── rebuild.yml
    │   │   │   └── rebuild_partition.yml
    │   ├── resolv_conf
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── files
    │   │   │   └── NetworkManager-dns-none.conf
    │   │   ├── tasks
    │   │   │   └── main.yml
    │   │   └── templates
    │   │   │   └── resolv.conf.j2
    │   ├── slurm_exporter
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── handlers
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   ├── install.yml
    │   │   │   └── main.yml
    │   ├── slurm_stats
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   └── main.yml
    │   ├── slurm_tools
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   └── main.yml
    │   ├── squid
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── handlers
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   ├── configure.yml
    │   │   │   ├── install.yml
    │   │   │   └── main.yml
    │   │   └── templates
    │   │   │   └── squid.conf.j2
    │   ├── sshd
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── handlers
    │   │   │   └── main.yml
    │   │   ├── tasks
    │   │   │   ├── configure.yml
    │   │   │   ├── export.yml
    │   │   │   └── main.yml
    │   │   └── templates
    │   │   │   └── sshd.conf.j2
    │   ├── sssd
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   ├── handlers
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   ├── configure.yml
    │   │   │   ├── export.yml
    │   │   │   ├── install.yml
    │   │   │   └── main.yml
    │   ├── systemd
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   └── main.yml
    │   ├── tuned
    │   │   ├── README.md
    │   │   ├── defaults
    │   │   │   └── main.yml
    │   │   └── tasks
    │   │   │   ├── configure.yml
    │   │   │   ├── install.yml
    │   │   │   └── main.yml
    │   └── zenith_proxy
    │   │   ├── defaults
    │   │       └── main.yml
    │   │   ├── files
    │   │       └── podman-pod-infra-attach.sh
    │   │   ├── tasks
    │   │       └── main.yml
    │   │   └── templates
    │   │       ├── client.service.j2
    │   │       ├── mitm.service.j2
    │   │       ├── pod.service.j2
    │   │       └── zenith-client.yaml.j2
    ├── site.yml
    ├── slurm.yml
    └── validate.yml
├── dev
    ├── ansible-ssh
    ├── delete-cluster.py
    ├── extract_logs.py
    ├── image-share.sh
    ├── output_manifest.py
    └── setup-env.sh
├── docs
    ├── README.md
    ├── adding-functionality.md
    ├── alerting.md
    ├── chrony.md
    ├── ci.md
    ├── environments.md
    ├── experimental
    │   ├── compute-init.md
    │   ├── pulp.md
    │   └── slurm-controlled-rebuild.md
    ├── image-build.md
    ├── k3s.README.md
    ├── monitoring-and-logging.md
    ├── networks.md
    ├── openondemand.md
    ├── operations.md
    ├── persistent-state.md
    ├── production.md
    ├── screenshots
    │   └── grafana
    │   │   ├── dashboard-node-exporter.png
    │   │   ├── dashboard-openhpc-slurm-jobs.png
    │   │   ├── dashboard-openhpc-slurm.png
    │   │   └── grafana-slurm-jobs-linking-to-node-exporter.png
    ├── sequence.md
    ├── site
    │   └── README.md
    └── upgrades.md
├── environments
    ├── .caas
    │   ├── README.md
    │   ├── ansible.cfg
    │   ├── assets
    │   │   └── ood-icon.png
    │   ├── hooks
    │   │   ├── .gitkeep
    │   │   ├── post.yml
    │   │   └── pre.yml
    │   ├── inventory
    │   │   ├── group_vars
    │   │   │   ├── all
    │   │   │   │   ├── .gitkeep
    │   │   │   │   ├── basic_users.yml
    │   │   │   │   ├── cluster.yml
    │   │   │   │   ├── grafana.yml
    │   │   │   │   ├── hpctests.yml
    │   │   │   │   ├── manila.yml
    │   │   │   │   ├── nfs.yml
    │   │   │   │   ├── openhpc.yml
    │   │   │   │   ├── openondemand.yml
    │   │   │   │   ├── prometheus.yml
    │   │   │   │   └── zenith.yml
    │   │   │   └── openstack.yml
    │   │   ├── groups
    │   │   └── hosts
    │   └── ui-meta
    │   │   ├── slurm-infra-fast-volume-type.yml
    │   │   ├── slurm-infra-manila-home.yml
    │   │   └── slurm-infra.yml
    ├── .stackhpc
    │   ├── .gitignore
    │   ├── ARCUS.pkrvars.hcl
    │   ├── LEAFCLOUD.pkrvars.hcl
    │   ├── SMS.pkrvars.hcl
    │   ├── activate
    │   ├── ansible.cfg
    │   ├── bastion_fingerprints
    │   ├── cacerts
    │   │   └── myCA.pem
    │   ├── hooks
    │   │   ├── post-bootstrap.yml
    │   │   └── pre.yml
    │   ├── inventory
    │   │   ├── everything
    │   │   ├── extra_groups
    │   │   └── group_vars
    │   │   │   ├── all
    │   │   │       ├── .gitkeep
    │   │   │       ├── basic_users.yml
    │   │   │       ├── bastion.yml
    │   │   │       ├── freeipa.yml
    │   │   │       ├── grafana.yml
    │   │   │       ├── hpctests.yml
    │   │   │       ├── manila.yml
    │   │   │       ├── openhpc.yml
    │   │   │       ├── openondemand.yml
    │   │   │       └── podman.yml
    │   │   │   └── builder.yml
    │   └── tofu
    │   │   ├── ARCUS.tfvars
    │   │   ├── LEAFCLOUD-dev.tfvars
    │   │   ├── LEAFCLOUD.tfvars
    │   │   ├── SMS.tfvars
    │   │   ├── cluster_image.auto.tfvars.json
    │   │   └── main.tf
    ├── README.md
    ├── common
    │   ├── .gitignore
    │   ├── README.md
    │   ├── files
    │   │   ├── filebeat
    │   │   │   └── filebeat.yml
    │   │   ├── grafana
    │   │   │   └── grafana.repo.j2
    │   │   ├── opensearch
    │   │   │   └── internal_users.yml.j2
    │   │   └── prometheus
    │   │   │   └── rules
    │   │   │       ├── node-exporter.rules
    │   │   │       ├── precompute.rules
    │   │   │       └── slurm.rules
    │   ├── inventory
    │   │   ├── group_vars
    │   │   │   ├── all
    │   │   │   │   ├── alertmanager.yml
    │   │   │   │   ├── ansible_init.yml
    │   │   │   │   ├── basic_users.yml
    │   │   │   │   ├── defaults.yml
    │   │   │   │   ├── filebeat.yml
    │   │   │   │   ├── firewalld.yml
    │   │   │   │   ├── freeipa_server.yml
    │   │   │   │   ├── grafana.yml
    │   │   │   │   ├── hpctests.yml
    │   │   │   │   ├── k3s.yml
    │   │   │   │   ├── manila.yml
    │   │   │   │   ├── mysql.yml
    │   │   │   │   ├── nfs.yml
    │   │   │   │   ├── openhpc.yml
    │   │   │   │   ├── openondemand.yml
    │   │   │   │   ├── opensearch.yml
    │   │   │   │   ├── os-manila-mount.yml
    │   │   │   │   ├── podman.yml
    │   │   │   │   ├── prometheus.yml
    │   │   │   │   ├── proxy.yml
    │   │   │   │   ├── pulp.yml
    │   │   │   │   ├── selinux.yml
    │   │   │   │   ├── slurm_exporter.yml
    │   │   │   │   ├── squid.yml
    │   │   │   │   ├── sshd.yaml
    │   │   │   │   ├── systemd.yml
    │   │   │   │   ├── timestamps.yml
    │   │   │   │   └── update.yml
    │   │   │   └── builder
    │   │   │   │   └── defaults.yml
    │   │   └── groups
    │   └── layouts
    │   │   ├── README.md
    │   │   ├── everything
    │   │   └── minimal
    └── skeleton
    │   ├── cookiecutter.json
    │   └── {{cookiecutter.environment}}
    │       ├── README.md
    │       ├── activate
    │       ├── ansible.cfg
    │       ├── hooks
    │           └── .gitkeep
    │       ├── inventory
    │           ├── group_vars
    │           │   └── all
    │           │   │   ├── .gitkeep
    │           │   │   ├── alertmanager.yml
    │           │   │   ├── basic_users.yml
    │           │   │   ├── grafana.yml
    │           │   │   ├── hpctests.yml
    │           │   │   └── vault_alertmanager.yml
    │           └── groups
    │       └── tofu
    │           ├── baremetal-node-list.py
    │           ├── compute.tf
    │           ├── control.tf
    │           ├── data.tf
    │           ├── inventory.tf
    │           ├── inventory.tpl
    │           ├── login.tf
    │           ├── main.tf
    │           ├── network.tf
    │           ├── node_group
    │               ├── main.tf
    │               ├── network.tf
    │               ├── nodes.tf
    │               └── variables.tf
    │           ├── read-inventory-secrets.py
    │           ├── variables.tf
    │           └── volumes.tf
├── packer
    ├── .gitignore
    ├── ansible-inventory.sh
    ├── openhpc_extravars.yml
    └── openstack.pkr.hcl
├── requirements.txt
└── requirements.yml


/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @stackhpc/batch
2 | 


--------------------------------------------------------------------------------
/.github/bin/get-s3-image.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #####
 4 | # This script looks for an image in OpenStack and if not found, downloads from
 5 | # S3 bucket, and then uploads to OpenStack
 6 | #####
 7 | 
 8 | set -ex
 9 | 
10 | image_name=$1
11 | bucket_name=$2
12 | echo "Checking if image $image_name exists in OpenStack"
13 | image_exists=$(openstack image list --name "$image_name" -f value -c Name)
14 | 
15 | if [ -n "$image_exists" ]; then
16 |     echo "Image $image_name already exists in OpenStack."
17 | else
18 |     echo "Image $image_name not found in OpenStack. Getting it from S3."
19 | 
20 |     wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/$bucket_name/$image_name --progress=dot:giga
21 | 
22 |     echo "Uploading image $image_name to OpenStack..."
23 |     openstack image create --file $image_name --disk-format qcow2 $image_name --progress
24 | 
25 |     echo "Image $image_name has been uploaded to OpenStack."
26 | fi


--------------------------------------------------------------------------------
/.github/workflows/release-image.yml:
--------------------------------------------------------------------------------
 1 | name: Release images
 2 | on:
 3 |   workflow_dispatch:
 4 |   release:
 5 |     types:
 6 |       - published # should work for both pre-releases and releases
 7 | env:
 8 |   IMAGE_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
 9 | jobs:
10 |   ci-image-release:
11 |     name: ci-image-release
12 |     runs-on: ubuntu-22.04
13 |     concurrency: ${{ github.workflow }}-${{ github.ref }}
14 |     strategy:
15 |       fail-fast: false
16 |       matrix:
17 |         build:
18 |           - RL8
19 |           - RL9
20 |     steps:
21 |       - uses: actions/checkout@v2
22 | 
23 |       - name: Write s3cmd configuration
24 |         run: echo "${{ secrets.ARCUS_S3_CFG }}" > ~/.s3cfg
25 | 
26 |       - name: Install s3cmd
27 |         run: |
28 |           sudo apt-get update
29 |           sudo apt-get --yes install s3cmd
30 |       
31 |       - name: Retrieve image name
32 |         run: |
33 |           TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}")
34 |           echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV"
35 |       
36 |       - name: Copy image from pre-release to release bucket
37 |         run: s3cmd cp s3://openhpc-images-prerelease/${{ env.TARGET_IMAGE }} s3://openhpc-images
38 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | terraform.tfstate*
2 | .terraform
3 | config-drive.iso
4 | venv
5 | *.pyc
6 | packer/openhpc2
7 | .vscode
8 | 


--------------------------------------------------------------------------------
/ansible.cfg:
--------------------------------------------------------------------------------
 1 | # Only used for Azimuth running the caas environment
 2 | [defaults]
 3 | any_errors_fatal = True
 4 | gathering = smart
 5 | forks = 30
 6 | host_key_checking = False
 7 | remote_tmp = /tmp
 8 | collections_path = ansible/collections
 9 | roles_path = ansible/roles
10 | filter_plugins = ansible/filter_plugins
11 | callbacks_enabled = ansible.posix.profile_tasks
12 | 
13 | [ssh_connection]
14 | ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
15 | pipelining = True
16 | # This is important because we are using one of the hosts in the play as a jump host
17 | # This ensures that if the proxy connection is interrupted, rendering the other hosts
18 | # unreachable, the connection is retried instead of failing the entire play
19 | retries = 10
20 | 


--------------------------------------------------------------------------------
/ansible/adhoc/backup-keytabs.yml:
--------------------------------------------------------------------------------
 1 | # Use ONE of the following tags on this playbook:
 2 | #   - retrieve: copies keytabs out of the state volume to the environment
 3 | #   - deploy: copies keytabs from the environment to the state volume
 4 | 
 5 | - hosts: freeipa_client
 6 |   become: yes
 7 |   gather_facts: no
 8 |   tasks:
 9 |     - import_role:
10 |         name: freeipa
11 |         tasks_from: backup-keytabs.yml
12 | 


--------------------------------------------------------------------------------
/ansible/adhoc/cudatests.yml:
--------------------------------------------------------------------------------
1 | - hosts: cuda
2 |   become: yes
3 |   gather_facts: yes
4 |   tags: cuda_samples
5 |   tasks:
6 |     - import_role:
7 |         name: cuda
8 |         tasks_from: samples.yml
9 | 


--------------------------------------------------------------------------------
/ansible/adhoc/deploy-pulp.yml:
--------------------------------------------------------------------------------
 1 | # Usage: ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server=<pulp server hostname>"
 2 | 
 3 | - name: Add temporary pulp server host
 4 |   hosts: localhost
 5 |   tasks:
 6 |   - ansible.builtin.add_host:
 7 |       name: "{{ pulp_server }}"
 8 |       group: "_pulp_host"
 9 | 
10 | - name: Install pulp on server and add to config
11 |   become: yes
12 |   hosts: _pulp_host
13 |   tasks:
14 |   - name: Install pulp
15 |     ansible.builtin.include_role:
16 |       name: pulp_site
17 |       tasks_from: install.yml
18 |       public: true
19 | 
20 |   - name: Print Pulp endpoint
21 |     become: no
22 |     debug:
23 |       msg: | 
24 |         Server configured, override 'appliances_pulp_url' with
25 |           appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}"
26 |         in your environments
27 | 


--------------------------------------------------------------------------------
/ansible/adhoc/generate-passwords.yml:
--------------------------------------------------------------------------------
1 | ---
2 | 
3 | - name: Generate passwords.yml
4 |   hosts: localhost
5 |   gather_facts: false
6 |   tasks:
7 |     - name: Include password generation role
8 |       include_role:
9 |         name: passwords


--------------------------------------------------------------------------------
/ansible/adhoc/hpctests.yml:
--------------------------------------------------------------------------------
 1 | # An MPI-based test suite for Slurm appliance clusters. Safe to use on in-production clusters.
 2 | # See ansible/roles/hpctests/README.md for details and options.
 3 | # Relies on installed packages in appliance defaults - see openhpc variables.
 4 | 
 5 | ---
 6 | 
 7 | - hosts: hpctests[0] # TODO: might want to make which node is used selectable?
 8 |   become: false
 9 |   gather_facts: false
10 |   tasks:
11 |     - import_role:
12 |         name: hpctests
13 | 


--------------------------------------------------------------------------------
/ansible/adhoc/rebuild-via-slurm.yml:
--------------------------------------------------------------------------------
 1 | # Rebuild compute nodes via slurm.
 2 | # Nodes will be rebuilt if `image_id` in inventory is different to the
 3 | # currently-provisioned image. Otherwise they are rebooted.
 4 | 
 5 | # Example:
 6 | #   ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml
 7 | 
 8 | # See docs/slurm-controlled-rebuild.md.
 9 | 
10 | - hosts: login
11 |   run_once: true
12 |   gather_facts: no
13 |   tasks:
14 |     - name: Run slurm-controlled rebuild
15 |       import_role:
16 |         name: rebuild
17 |         tasks_from: rebuild.yml
18 | 


--------------------------------------------------------------------------------
/ansible/adhoc/rebuild.yml:
--------------------------------------------------------------------------------
 1 | # Rebuild hosts with a specified image from OpenStack.
 2 | # 
 3 | # Use ansible's -v output to see output.
 4 | # Use --limit to control which hosts to rebuild (either specific hosts or the <cluster_name>_<partition_name> groups defining partitions).
 5 | # Optionally, supply `-e rebuild_image=<image_name_or_id>` to define a specific image, otherwise the current image is reused.
 6 | #
 7 | # NOTE: If a hostvar `instance_id` is defined this is used to select hosts. Otherwise the hostname is used and this must be unique, which may not be the case e.g. if using identically-named staging and production hosts.
 8 | #
 9 | # Example:
10 | #   ansible-playbook -v --limit ohpc_compute ansible/adhoc/rebuild.yml -e rebuild_image=openhpc_v2.3
11 | 
12 | - hosts: cluster
13 |   become: no
14 |   gather_facts: no
15 |   tasks:
16 |     - command: "openstack server rebuild {{ instance_id | default(inventory_hostname) }}{% if rebuild_image is defined %} --image {{ rebuild_image }}{% endif %}"
17 |       delegate_to: localhost
18 |     - wait_for_connection:
19 |         delay: 60
20 |         timeout: 600
21 | 
22 | 


--------------------------------------------------------------------------------
/ansible/adhoc/restart-slurm.yml:
--------------------------------------------------------------------------------
 1 | # Restart all slurm daemons e.g. after changing configuration. Note that:
 2 | # - `scontrol reconfigure` will handle most reconfiguration - see https://slurm.schedmd.com/scontrol.html#OPT_reconfigure
 3 | #   for which options need a restart
 4 | # - Adding or removing nodes by changing the `openhpc_` configuration and rerunning ansible/site.yml will automatically
 5 | #   restart daemons as required.
 6 | 
 7 | - hosts: compute,login
 8 |   become: yes
 9 |   gather_facts: no
10 |   tasks:
11 |     - service:
12 |         name: slurmd
13 |         state: stopped
14 | 
15 | - hosts: control
16 |   become: yes
17 |   gather_facts: no
18 |   tasks:
19 |     - service:
20 |         name: slurmctld
21 |         state: restarted
22 | 
23 | - hosts: compute,login
24 |   become: yes
25 |   gather_facts: no
26 |   tasks:
27 |     - service:
28 |         name: slurmd
29 |         state: started
30 | 


--------------------------------------------------------------------------------
/ansible/adhoc/sync-pulp.yml:
--------------------------------------------------------------------------------
 1 | - hosts: localhost
 2 |   tasks:
 3 |     - ansible.builtin.include_role:
 4 |         name: pulp_site
 5 |         tasks_from: sync.yml
 6 |       vars:
 7 |         pulp_site_target_arch: "x86_64"
 8 |         pulp_site_target_distribution: "rocky"
 9 |         pulp_site_target_distribution_version: "9.5"
10 |         pulp_site_target_distribution_version_major: "9"
11 | 


--------------------------------------------------------------------------------
/ansible/adhoc/update-packages.yml:
--------------------------------------------------------------------------------
 1 | - hosts: update
 2 |   become: yes
 3 |   gather_facts: false
 4 |   tasks:
 5 |     - name: Update selected packages
 6 |       yum:
 7 |         name: "{{ update_name }}"
 8 |         state: "{{ update_state }}"
 9 |         exclude: "{{ update_exclude }}"
10 |         disablerepo: "{{ update_disablerepo }}"
11 |       register: updates
12 |     - name: Log updated packages
13 |       copy:
14 |         content: "{{ updates.results | join('\n') }}"
15 |         dest: "{{ update_log_path }}"
16 |       delegate_to: localhost
17 |     - debug:
18 |         msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details"
19 | 


--------------------------------------------------------------------------------
/ansible/ci/check_grafana.yml:
--------------------------------------------------------------------------------
 1 | # Checks Slurm jobs from hpctests are shown in Grafana.
 2 | # Can't actually check the dashboard programatically so this queries the datasource used by the dashboard instead.
 3 | 
 4 | - hosts: control # so proxying etc is irrelevant
 5 |   gather_facts: no
 6 |   become: no
 7 |   tasks:
 8 |     - name: Wait for slurm-stats file to exist (run by cron)
 9 |       ansible.builtin.wait_for:
10 |         path: /var/log/slurm-stats/finished_jobs.json
11 |         timeout: 315 # slurm stats cron job runs every 5 mins
12 |       
13 |     - name: Query grafana for expected hpctests jobs
14 |       grafana_elasticsearch_query:
15 |         grafana_url: http://{{ grafana_api_address }}:{{ grafana_port }}
16 |         grafana_username: grafana
17 |         grafana_password: "{{ vault_grafana_admin_password }}"
18 |         datasource: slurmstats
19 |         index_pattern: filebeat-*
20 |       register: _slurm_stats_jobs
21 |       until: _expected_jobs | difference(_found_jobs) == []
22 |       retries: 60
23 |       delay: 5
24 |       vars:
25 |         _found_jobs: "{{ _slurm_stats_jobs.docs | map(attribute='JobName', default='(json error in slurmstats data)') }}"
26 |         _expected_jobs: ['pingpong.sh']
27 | 


--------------------------------------------------------------------------------
/ansible/ci/check_sacct_hpctests.yml:
--------------------------------------------------------------------------------
 1 | - hosts: control
 2 |   gather_facts: false
 3 |   become: true
 4 |   vars:
 5 |     sacct_stdout_expected: |- # based on CI running hpctests as the first job
 6 |       JobID,JobName,State
 7 |       1,pingpong.sh,COMPLETED
 8 |   tasks:
 9 |     - name: Get info for ended jobs
10 |       shell:
11 |         cmd: sacct --format=jobid,jobname,state --allocations --parsable2 --delimiter=, --starttime=now-1days --endtime=now
12 |         # by default start/end time is midnight/now which is not robust
13 |       changed_when: false
14 |       register: sacct
15 |     - name: Check info for ended jobs
16 |       assert:
17 |         that: sacct_stdout_expected in sacct.stdout
18 |         fail_msg: |
19 |           Expected:
20 |           --{{ sacct_stdout_expected }}--
21 |           Got:
22 |           --{{ sacct.stdout }}--
23 |         success_msg: sacct shows hpctests jobs as first jobs in list
24 | 


--------------------------------------------------------------------------------
/ansible/ci/check_slurm.yml:
--------------------------------------------------------------------------------
 1 | - hosts: login:!builder # won't have a slurm control daemon when in build
 2 |   become: no
 3 |   gather_facts: false
 4 |   tasks:
 5 |     - name: Run sinfo
 6 |       shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name
 7 |       register: sinfo
 8 |       changed_when: false
 9 |       until: sinfo.stdout_lines == expected_sinfo
10 |       retries: 200
11 |       delay: 5
12 |       vars:
13 |         expected_sinfo:
14 |           - " extra up 60-00:00:00 0 n/a" # empty partition
15 |           - "{{ openhpc_cluster_name }}-compute-[0-1] standard* up 60-00:00:00 2 idle"
16 | 


--------------------------------------------------------------------------------
/ansible/ci/delete_images.yml:
--------------------------------------------------------------------------------
 1 | - hosts: login:!builder
 2 |   become: no
 3 |   gather_facts: no
 4 |   tasks:
 5 |     - import_tasks: get_image_ids.yml
 6 |     
 7 |     - name: Delete images
 8 |       shell:
 9 |         cmd: |
10 |           openstack image delete {{ item.artifact_id }}
11 |       delegate_to: localhost
12 |       loop: "{{ manifest['builds'] }}"
13 | 


--------------------------------------------------------------------------------
/ansible/ci/get_image_ids.yml:
--------------------------------------------------------------------------------
 1 | - name: Read packer build manifest
 2 |   set_fact:
 3 |     manifest: "{{ lookup('file', manifest_path) | from_json }}"
 4 |   vars:
 5 |     manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json"
 6 |   delegate_to: localhost
 7 | 
 8 | - name: Get latest image builds
 9 |   set_fact:
10 |     login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}"
11 |     compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}"
12 |     control_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'control'}) | last }}"
13 | 


--------------------------------------------------------------------------------
/ansible/ci/output_vars.yml:
--------------------------------------------------------------------------------
 1 | # Output specific hostvars to a file in a form which can be sourced by bash
 2 | # NB: obviously the keys and values for the hostvars need to be suitable bash variables
 3 | - hosts: "{{ output_vars_hosts }}"
 4 |   gather_facts: no
 5 |   tasks:
 6 |     - copy:
 7 |         dest: "{{ output_vars_path }}"
 8 |         content: |
 9 |           {% for item in output_vars_items.split(',') %}
10 |           export {{output_vars_prefix | default('') }}{{ item }}={{ lookup('vars', item) }}
11 |           {% endfor %}
12 |       delegate_to: localhost
13 | 


--------------------------------------------------------------------------------
/ansible/ci/retrieve_inventory.yml:
--------------------------------------------------------------------------------
 1 | # Retrieve inventory from a deployed CI arcus environment by reversing arcus/inventory/hooks/pre.yml
 2 | # Usage example:
 3 | #   ansible-playbook ansible/ci/retrieve_inventory.yml -e cluster_prefix=ci4005969475
 4 | #
 5 | - hosts: localhost
 6 |   become: no
 7 |   gather_facts: no
 8 |   vars:
 9 |     cluster_prefix: "{{ undef(hint='cluster_prefix must be defined') }}" # e.g. ci4005969475
10 |     ci_vars_file: "{{ appliances_environment_root + '/tofu/' + lookup('env', 'CI_CLOUD') }}.tfvars"
11 |     cluster_network: "{{ lookup('ansible.builtin.ini', 'cluster_net', file=ci_vars_file, type='properties') | trim('\"') }}"
12 |   tasks:
13 |     - name: Get control host IP
14 |       set_fact:
15 |         control_ip: "{{ (lookup('pipe', 'openstack server show -f json ' + cluster_prefix + '-control') | from_json)['addresses'][cluster_network][0] }}"
16 |     - name: Add host into in-memory inventory
17 |       add_host:
18 |         name: cluster_control
19 |         groups: control
20 |         ansible_host: "{{ control_ip }}"
21 | 
22 | - hosts: control
23 |   become: yes
24 |   gather_facts: no
25 |   tasks:
26 |     - ansible.builtin.fetch:
27 |         src: "/etc/ci-config/{{ item | basename }}"
28 |         dest: "{{ item }}"
29 |         flat: true
30 |       loop:
31 |         - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts"
32 |         - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml"
33 |         - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml"
34 | 


--------------------------------------------------------------------------------
/ansible/ci/update_timestamps.yml:
--------------------------------------------------------------------------------
 1 | - hosts: localhost
 2 |   tasks:
 3 |     - name: Get latest timestamps from sources
 4 |       latest_timestamps:
 5 |         repos_dict: "{{ appliances_pulp_repos }}"
 6 |         content_url: "https://ark.stackhpc.com/pulp/content"
 7 |       register: _result
 8 | 
 9 |     - name: Overwrite repo timestamps with latest
10 |       ansible.builtin.copy:
11 |         dest: "{{ appliances_repository_root }}/environments/common/inventory/group_vars/all/timestamps.yml"
12 |         content: "{{ repo_template | to_nice_yaml(indent=2) }}"
13 |         backup: true
14 |       vars:
15 |         repo_template:
16 |           appliances_pulp_repos: "{{ _result.timestamps }}"
17 | 


--------------------------------------------------------------------------------
/ansible/disable-repos.yml:
--------------------------------------------------------------------------------
1 | - hosts: dnf_repos
2 |   become: yes
3 |   tasks:
4 |     - name: Disable pulp repos
5 |       ansible.builtin.include_role:
6 |         name: dnf_repos
7 |         tasks_from: disable_repos.yml
8 |       when: not dnf_repos_enabled | default(false) | bool
9 | 


--------------------------------------------------------------------------------
/ansible/filesystems.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Setup block devices
 4 |   hosts: block_devices
 5 |   become: yes
 6 |   tags: block_devices
 7 |   tasks:
 8 |     - include_role:
 9 |         name: block_devices
10 | 
11 | - name: Setup NFS
12 |   hosts: nfs
13 |   become: true
14 |   tags:
15 |     - nfs
16 |   tasks:
17 |     - include_role:
18 |         name: stackhpc.nfs
19 | 
20 | - name: Setup Manila share mounts
21 |   hosts: manila
22 |   become: true
23 |   tags: manila
24 |   tasks:
25 |     - include_role:
26 |         name: stackhpc.os-manila-mount
27 | 
28 | - name: Setup Lustre clients
29 |   hosts: lustre
30 |   become: true
31 |   tags: lustre
32 |   tasks:
33 |     - include_role:
34 |         name: lustre
35 |         # NB install is ONLY run in builder
36 |         tasks_from: configure.yml
37 | 


--------------------------------------------------------------------------------
/ansible/iam.yml:
--------------------------------------------------------------------------------
 1 | - hosts: freeipa_client
 2 |   tags:
 3 |     - freeipa
 4 |     - freeipa_server # as this is only relevant if using freeipa_server
 5 |     - freeipa_host
 6 |   gather_facts: no
 7 |   become: yes
 8 |   tasks:
 9 |     - name: Ensure FreeIPA client hosts are added to the FreeIPA server
10 |       import_role:
11 |         name: freeipa
12 |         tasks_from: addhost.yml
13 |       when: groups['freeipa_server'] | length > 0
14 | 
15 | - hosts: freeipa_client
16 |   tags:
17 |     - freeipa
18 |     - freeipa_client
19 |   gather_facts: yes
20 |   become: yes
21 |   tasks:
22 |     - name: Install FreeIPA client
23 |       import_role:
24 |         name: freeipa
25 |         tasks_from: client-install.yml
26 |     - name: Enrol FreeIPA client
27 |       import_role:
28 |         name: freeipa
29 |         tasks_from: enrol.yml
30 | 
31 | - hosts: freeipa_server
32 |   tags:
33 |     - freeipa
34 |     - freeipa_server
35 |     - users
36 |   gather_facts: yes
37 |   become: yes
38 |   tasks:
39 |     - name: Add FreeIPA users
40 |       import_role:
41 |         name: freeipa
42 |         tasks_from: users.yml
43 | 
44 | - hosts: sssd
45 |   become: yes
46 |   gather_facts: no
47 |   tags: sssd
48 |   tasks:
49 |     - name: Configure sssd
50 |       import_role:
51 |         name: sssd
52 | 


--------------------------------------------------------------------------------
/ansible/noop.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | # This file exists so that we can conditionally import a playbook. The path
 4 | # must exist, but we can use a when conditional so that it is not actually
 5 | # run
 6 | 
 7 | - hosts: localhost
 8 |   gather_facts: false
 9 |   tasks: []
10 | 


--------------------------------------------------------------------------------
/ansible/portal.yml:
--------------------------------------------------------------------------------
 1 | - hosts: openondemand
 2 |   tags:
 3 |     - openondemand
 4 |     - openondemand_server
 5 |   become: yes
 6 |   gather_facts: yes # TODO
 7 |   tasks:
 8 |     - import_role:
 9 |         name: openondemand
10 |         tasks_from: main.yml
11 | 
12 | - hosts: openondemand_desktop
13 |   tags:
14 |     - openondemand
15 |     - openondemand_desktop
16 |   become: yes
17 |   gather_facts: yes
18 |   tasks:
19 |     - import_role:
20 |         name: openondemand
21 |         tasks_from: vnc_compute.yml
22 | 
23 | - hosts: openondemand_jupyter
24 |   tags:
25 |     - openondemand
26 |     - openondemand_jupyter
27 |   become: yes
28 |   gather_facts: yes
29 |   tasks:
30 |     - import_role:
31 |         name: openondemand
32 |         tasks_from: jupyter_compute.yml
33 | 


--------------------------------------------------------------------------------
/ansible/roles/alertmanager/handlers/main.yml:
--------------------------------------------------------------------------------
1 | - name: Restart alertmanager
2 |   systemd:
3 |     name: alertmanager
4 |     state: restarted
5 |     daemon_reload: "{{ _alertmanager_service.changed | default(false) }}"
6 |   when: alertmanager_started | bool
7 | 


--------------------------------------------------------------------------------
/ansible/roles/alertmanager/tasks/configure.yml:
--------------------------------------------------------------------------------
 1 | - name: Create alertmanager directories
 2 |   ansible.builtin.file:
 3 |     path: "{{ item }}"
 4 |     state: directory
 5 |     owner: "{{ alertmanager_system_user }}"
 6 |     group: "{{ alertmanager_system_group }}"
 7 |     mode: u=rwX,go=rX
 8 |   loop:
 9 |     - "{{ alertmanager_config_file | dirname }}"
10 |     - "{{ alertmanager_web_config_file | dirname }}"
11 |     - "{{ alertmanager_storage_path }}"
12 | 
13 | - name: Create alertmanager service file with immutable options
14 |   template:
15 |     src: alertmanager.service.j2
16 |     dest: /usr/lib/systemd/system/alertmanager.service
17 |     owner: root
18 |     group: root
19 |     mode: u=rw,go=r
20 |   register: _alertmanager_service
21 |   notify: Restart alertmanager
22 | 
23 | - name: Template alertmanager config
24 |   ansible.builtin.template:
25 |     src: "{{ alertmanager_config_template }}"
26 |     dest: "{{ alertmanager_config_file }}"
27 |     owner: "{{ alertmanager_system_user }}"
28 |     group: "{{ alertmanager_system_group }}"
29 |     mode: u=rw,go=
30 |   notify: Restart alertmanager
31 | 
32 | - name: Template alertmanager web config
33 |   ansible.builtin.template:
34 |     src: "{{ alertmanager_web_config_template }}"
35 |     dest: "{{ alertmanager_web_config_file }}"
36 |     owner: "{{ alertmanager_system_user }}"
37 |     group: "{{ alertmanager_system_group }}"
38 |     mode: u=rw,go=
39 |   notify: Restart alertmanager
40 | 
41 | - meta: flush_handlers
42 | 
43 | - name: Ensure alertmanager service state
44 |   systemd:
45 |     name: alertmanager
46 |     state: "{{ 'started' if alertmanager_started | bool else 'stopped' }}"
47 |     enabled: "{{ alertmanager_enabled | bool }}"
48 | 


--------------------------------------------------------------------------------
/ansible/roles/alertmanager/tasks/install.yml:
--------------------------------------------------------------------------------
 1 | - name: Create alertmanager system user
 2 |   ansible.builtin.user:
 3 |     name: "{{ alertmanager_system_user }}"
 4 |     system: true
 5 |     create_home: false
 6 | 
 7 | - name: Download alertmanager binary
 8 |   ansible.builtin.get_url:
 9 |     url: "https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version }}/alertmanager-{{ alertmanager_version }}.linux-amd64.tar.gz"
10 |     dest: "{{ alertmanager_download_dest }}"
11 |     owner: root
12 |     group: root
13 |     mode: u=rw,go=
14 |     checksum: "{{ alertmanager_download_checksum }}"
15 | 
16 | - name: Unpack alertmanager binary
17 |   ansible.builtin.unarchive:
18 |     src: "{{ alertmanager_download_dest }}"
19 |     include: "alertmanager-{{ alertmanager_version }}.linux-amd64/alertmanager"
20 |     dest: "{{ alertmanager_binary_dir }}"
21 |     owner: root
22 |     group: root
23 |     mode: u=rwx,go=rx
24 |     remote_src: true
25 |     extra_opts: ['--strip-components=1', '--show-stored-names']
26 | 


--------------------------------------------------------------------------------
/ansible/roles/alertmanager/templates/alertmanager-web.yml.j2:
--------------------------------------------------------------------------------
1 | {{ ansible_managed | comment }}
2 | 
3 | {{ alertmanager_web_config_default | to_nice_yaml }}
4 | {{ alertmanager_alertmanager_web_config_extra | to_nice_yaml if alertmanager_alertmanager_web_config_extra | length > 0 else '' }}
5 | 


--------------------------------------------------------------------------------
/ansible/roles/alertmanager/templates/alertmanager.service.j2:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | {{ ansible_managed | comment }}
 5 | [Unit]
 6 | Description=Prometheus Alertmanager
 7 | After=network-online.target
 8 | StartLimitInterval=0
 9 | StartLimitIntervalSec=0
10 | 
11 | [Service]
12 | Type=simple
13 | PIDFile=/run/alertmanager.pid
14 | User={{ alertmanager_system_user }}
15 | Group={{ alertmanager_system_group }}
16 | ExecReload=/bin/kill -HUP $MAINPID
17 | ExecStart={{ alertmanager_binary_dir }}/alertmanager \
18 |   --cluster.listen-address='' \
19 |   --config.file={{ alertmanager_config_file }} \
20 |   --storage.path={{ alertmanager_storage_path }} \
21 |   --data.retention={{ alertmanager_data_retention }} \
22 |   --data.maintenance-interval={{ alertmanager_data_maintenance_interval }} \
23 | {% for address in alertmanager_web_listen_addresses %}
24 |   --web.listen-address={{ address }} \
25 | {% endfor %}
26 |   --web.external-url={{ alertmanager_web_external_url }} \
27 |   --web.config.file={{ alertmanager_web_config_file }} \
28 | {% for flag, flag_value in alertmanager_config_flags.items() %}
29 |   --{{ flag }}={{ flag_value }} \
30 | {% endfor %}
31 | 
32 | SyslogIdentifier=alertmanager
33 | Restart=always
34 | RestartSec=5
35 | 
36 | CapabilityBoundingSet=CAP_SET_UID
37 | LockPersonality=true
38 | NoNewPrivileges=true
39 | MemoryDenyWriteExecute=true
40 | PrivateTmp=true
41 | ProtectHome=true
42 | ReadWriteDirectories={{ alertmanager_storage_path }}
43 | RemoveIPC=true
44 | RestrictSUIDSGID=true
45 | 
46 | PrivateUsers=true
47 | ProtectControlGroups=true
48 | ProtectKernelModules=true
49 | ProtectKernelTunables=yes
50 | ProtectSystem=strict
51 | 
52 | [Install]
53 | WantedBy=multi-user.target
54 | 


--------------------------------------------------------------------------------
/ansible/roles/alertmanager/templates/alertmanager.yml.j2:
--------------------------------------------------------------------------------
1 | {{ ansible_managed | comment }}
2 | 
3 | {{ alertmanager_config_default | to_nice_yaml }}
4 | {{ alertmanager_config_extra | to_nice_yaml if alertmanager_config_extra | length > 0 else '' }}
5 | 


--------------------------------------------------------------------------------
/ansible/roles/basic_users/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | basic_users_homedir_server: "{{ groups['control'] | first }}" # no way, generally, to find the nfs_server
 2 | basic_users_homedir_server_path: /exports/home
 3 | basic_users_homedir_client: "{{ groups['login'] | first }}"  
 4 | basic_users_userdefaults:
 5 |   state: present # need this here so don't have to add default() everywhere
 6 |   generate_ssh_key:  true
 7 |   ssh_key_comment: "{{ item.name }}"
 8 |   ssh_key_type: ed25519
 9 |   shell: "{{'/sbin/nologin' if 'control' in group_names else omit }}"
10 | basic_users_users: []
11 | basic_users_groups: []
12 | basic_users_override_sssd: false
13 | 


--------------------------------------------------------------------------------
/ansible/roles/basic_users/filter_plugins/filter_keys.py:
--------------------------------------------------------------------------------
 1 | """ Filter a dict to remove specified keys """
 2 | 
 3 | import copy
 4 | 
 5 | USER_MODULE_PARAMS = ('append authorization comment create_home createhome expires force generate_ssh_key group '
 6 |                       'groups hidden home local login_class move_home name user non_unique password password_expire_min '
 7 |                       'password_expire_max password_lock profile remove role seuser shell skeleton ssh_key_bits '
 8 |                       'ssh_key_comment ssh_key_file ssh_key_passphrase ssh_key_type state system uid update_password').split()
 9 | 
10 | class FilterModule(object):
11 | 
12 |     def filters(self):
13 |         return {
14 |             'filter_user_params': self.filter_user_params
15 |         }
16 | 
17 |     def filter_user_params(self, d):
18 |         ''' Return a copy of dict `d` containing only keys which are parameters for the user module'''
19 |         
20 |         user_dict = copy.deepcopy(d)
21 |         remove_keys = set(user_dict).difference(USER_MODULE_PARAMS)
22 |         for key in remove_keys:
23 |             del user_dict[key]
24 |         return user_dict
25 | 


--------------------------------------------------------------------------------
/ansible/roles/block_devices/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | block_devices_configurations: [{}]
 2 | block_devices_partition_state: present # 'present', 'absent' (as for parted) or 'skip'
 3 | block_devices_device: # Path to block device, e.g. '/dev/sda'. See community.general.parted:device and community.general.filesystem:dev
 4 | block_devices_number: # Partition number, e.g 1 for /dev/sda1. See community.general.parted:number
 5 | block_devices_fstype: # Filesystem type, e.g. e.g. 'ext4'. See community.general.filesystem:fstype
 6 | block_devices_resizefs: no # Grow filesystem into block device space (yes or no). See community.general.filesystem:resizefs
 7 | block_devices_filesystem_state: present # 'present', 'absent' (as for community.general.filesystem:state) or 'skip'
 8 | block_devices_path: # Path to mount point, e.g. '/mnt/files'
 9 | block_devices_mount_state: mounted # Mount state, see ansible.posix.mount:state
10 | block_devices_owner: # Name of owner for mounted directory (ansible.buildin.file:owner
11 | block_devices_group: # Name of group for mounted directory (ansible.buildin.file.group)
12 | # also see hostnames
13 | 


--------------------------------------------------------------------------------
/ansible/roles/block_devices/library/block_devices.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | # Copyright: (c) 2021, StackHPC
 4 | # Apache 2 License
 5 | 
 6 | DOCUMENTATION = r'''
 7 | ---
 8 | module: block_devices
 9 | 
10 | short_description: Return block device paths by serial number.
11 | 
12 | options: (none)
13 | 
14 | author:
15 |     - Steve Brasier (@sjpb)
16 | '''
17 | 
18 | RETURN = r'''
19 | devices:
20 |     description: dict with device serial numbers as keys and full paths (e.g. /dev/sdb) as values
21 |     type: dict
22 |     return: always
23 | '''
24 | 
25 | import json
26 | 
27 | from ansible.module_utils.basic import AnsibleModule
28 | 
29 | def run_module():
30 |     module_args = dict()
31 |     module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)
32 |     result = {"changed": False}
33 |     _, stdout, _ = module.run_command("lsblk --paths --json -O", check_rc=True)
34 |     
35 |     device_info = json.loads(stdout)['blockdevices']
36 |     result['devices'] = dict((item['serial'], item['name']) for item in device_info)
37 |     module.exit_json(**result)
38 | 
39 | def main():
40 |     run_module()
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     main()
45 | 


--------------------------------------------------------------------------------
/ansible/roles/cacerts/README.md:
--------------------------------------------------------------------------------
 1 | # cacerts
 2 | 
 3 | Configure CA certificates and trusts.
 4 | 
 5 | ## Role variables
 6 | 
 7 | - `cacerts_cert_dir`: Optional str. Path to directory containing certificates
 8 |   in PEM or DER format. Any files here will be added to the list of CAs trusted
 9 |   by the system.
10 | 
11 | Note: This role assumes the `ca-certificates` dnf package is installed, which
12 | is the case for GenericCloud-based images.
13 | 


--------------------------------------------------------------------------------
/ansible/roles/cacerts/defaults/main.yml:
--------------------------------------------------------------------------------
1 | #cacerts_dest_dir: /etc/pki/ca-trust/source/anchors/
2 | cacerts_cert_dir: "{{ appliances_environment_root }}/cacerts"
3 | cacerts_update: true
4 | 


--------------------------------------------------------------------------------
/ansible/roles/cacerts/tasks/configure.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Copy all certificates
 4 |   copy:
 5 |     src: "{{ item }}"
 6 |     dest: /etc/pki/ca-trust/source/anchors/
 7 |     owner: root
 8 |     group: root
 9 |     mode: 0644
10 |   with_fileglob:
11 |     - "{{ cacerts_cert_dir }}/*"
12 |   become: true
13 | 
14 | - name: Update trust store
15 |   command: update-ca-trust extract
16 |   become: true
17 | 


--------------------------------------------------------------------------------
/ansible/roles/cacerts/tasks/export.yml:
--------------------------------------------------------------------------------
 1 | - name: Copy cacerts from deploy host to /exports/cluster/cacerts/
 2 |   copy:
 3 |     src: "{{ item }}"
 4 |     dest: /exports/cluster/cacerts/
 5 |     owner: slurm
 6 |     group: root
 7 |     mode: 0644
 8 |   with_fileglob:
 9 |     - "{{ cacerts_cert_dir }}/*"
10 |   delegate_to: "{{ groups['control'] | first }}"
11 |   run_once: true
12 | 


--------------------------------------------------------------------------------
/ansible/roles/cacerts/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - import_tasks: configure.yml
2 | 


--------------------------------------------------------------------------------
/ansible/roles/cluster_infra/defaults/main.yml:
--------------------------------------------------------------------------------
1 | ansible_init_collections: []
2 | ansible_init_playbooks: []
3 | 


--------------------------------------------------------------------------------
/ansible/roles/cluster_infra/templates/providers.tf.j2:
--------------------------------------------------------------------------------
 1 | terraform {
 2 |   required_version = ">= 0.14"
 3 | 
 4 |   # We need the OpenStack provider
 5 |   required_providers {
 6 |     openstack = {
 7 |       source = "terraform-provider-openstack/openstack"
 8 |       version = "~>3.0.0"
 9 |     }
10 |   }
11 | }
12 | 


--------------------------------------------------------------------------------
/ansible/roles/compute_init/templates/hostvars.yml.j2:
--------------------------------------------------------------------------------
1 | {{ hostvars[inventory_hostname] | to_nice_json }}


--------------------------------------------------------------------------------
/ansible/roles/cuda/README.md:
--------------------------------------------------------------------------------
 1 | # cuda
 2 | 
 3 | Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled.
 4 | 
 5 | ## Prerequisites
 6 | 
 7 | Requires OFED to be installed to provide required kernel-* packages.
 8 | 
 9 | ## Role Variables
10 | 
11 | - `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture.
12 | - `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version.
13 | - `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds', 'cmake', 'cuda-toolkit-12-9']`.
14 | - `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA.
15 | - `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.
16 | 


--------------------------------------------------------------------------------
/ansible/roles/cuda/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
 2 | cuda_nvidia_driver_stream: '575-open'
 3 | cuda_package_version: '12.9.0-1'
 4 | cuda_version_short: '12.9'
 5 | cuda_packages:
 6 |   - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
 7 |   - nvidia-gds
 8 |   - cmake
 9 |   - cuda-toolkit-12-9
10 | cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
11 | cuda_samples_path: "/var/lib/{{ ansible_user }}/cuda_samples"
12 | cuda_samples_programs:
13 |   - deviceQuery
14 |   - bandwidthTest
15 | # cuda_devices: # discovered from deviceQuery run
16 | cuda_persistenced_state: started
17 | 


--------------------------------------------------------------------------------
/ansible/roles/cuda/tasks/runtime.yml:
--------------------------------------------------------------------------------
1 | - name: Ensure NVIDIA Persistence Daemon state
2 |   systemd:
3 |     name: nvidia-persistenced
4 |     enabled: true
5 |     state: "{{ cuda_persistenced_state }}"
6 | 


--------------------------------------------------------------------------------
/ansible/roles/cuda/tasks/samples.yml:
--------------------------------------------------------------------------------
 1 | - name: Ensure cuda_samples_path exists
 2 |   file:
 3 |     state: directory
 4 |     path: "{{ cuda_samples_path }}"
 5 |     owner: "{{ ansible_user }}"
 6 |     group: "{{ ansible_user }}"
 7 | 
 8 | - name: Download CUDA samples release
 9 |   unarchive:
10 |     remote_src: yes
11 |     src: "{{ cuda_samples_release_url }}"
12 |     dest: "{{ cuda_samples_path }}"
13 |     owner: "{{ ansible_user }}"
14 |     group: "{{ ansible_user }}"
15 |     creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}"
16 | 
17 | - name: Create CUDA samples build directory
18 |   file:
19 |     state: directory
20 |     path: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build"
21 | 
22 | - name: Build CUDA samples
23 |   shell:
24 |     # We need to source /etc/profile.d/sh.local to add CUDA to the PATH
25 |     cmd: . /etc/profile.d/sh.local && cmake .. && make -j {{ ansible_processor_vcpus }}
26 |     chdir: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build"
27 |     creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build/Samples/1_Utilities/deviceQuery/deviceQuery"
28 | 


--------------------------------------------------------------------------------
/ansible/roles/dnf_repos/tasks/disable_repos.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - name: Disable Pulp repos
 3 |   ansible.builtin.yum_repository:
 4 |     file: "{{ item.file }}"
 5 |     name: "{{ item.name }}"
 6 |     baseurl: "{{ item.base_url }}"
 7 |     description: "{{ item.name }}"
 8 |     enabled: false
 9 |   loop: "{{ dnf_repos_repolist }}"
10 | 
11 | - name: Disable EPEL repo
12 |   ansible.builtin.yum_repository:
13 |     name: epel
14 |     file: epel
15 |     description: "{{ dnf_repos_epel_description }}"
16 |     baseurl: "{{ dnf_repos_epel_baseurl }}"
17 |     gpgcheck: false
18 |     enabled: false
19 | 


--------------------------------------------------------------------------------
/ansible/roles/dnf_repos/tasks/set_repos.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Replace system repos with Pulp repos
 4 |   ansible.builtin.yum_repository:
 5 |     file: "{{ item.file }}"
 6 |     name: "{{ item.name }}"
 7 |     baseurl: "{{ item.base_url }}"
 8 |     description: "{{ item.name }}"
 9 |     username: "{{ dnf_repos_username }}"
10 |     password: "{{ dnf_repos_password }}"
11 |     gpgcheck: false
12 |   loop: "{{ dnf_repos_repolist }}"
13 | 
14 | - name: Install epel-release
15 |   # done so that roles installing epel via epel-release don't over-write our changes to the epel repo
16 |   ansible.builtin.dnf:
17 |     name: epel-release
18 | 
19 | - name: Use Pulp EPEL repo
20 |   ansible.builtin.yum_repository:
21 |     name: epel
22 |     file: epel
23 |     description: "{{ dnf_repos_epel_description }}"
24 |     gpgcheck: false
25 |     baseurl: "{{ dnf_repos_epel_baseurl }}"
26 |     username: "{{ dnf_repos_username }}"
27 |     password: "{{ dnf_repos_password }}"
28 | 


--------------------------------------------------------------------------------
/ansible/roles/doca/README.md:
--------------------------------------------------------------------------------
 1 | # doca
 2 | 
 3 | Install [NVIDIA DOCA](https://docs.nvidia.com/doca/sdk/index.html).
 4 | 
 5 | This role is not idempotent and is only intended to be run during an image build. It builds DOCA kernel modules to match the installed kernel and then installs these
 6 | plus the selected DOCA packages.
 7 | 
 8 | ## Role Variables
 9 | 
10 | - `doca_version`: Optional. String giving doca version.
11 | - `doca_profile`: Optional. Name of [profile](https://docs.nvidia.com/doca/sdk/nvidia+doca+profiles/index.html) defining subset of DOCA to install. Default is `doca-ofed`.
12 | - `doca_repo_url`: Optional. URL of DOCA repository. Default is appropriate upstream public repository for DOCA version, distro version and architecture.
13 | 


--------------------------------------------------------------------------------
/ansible/roles/doca/defaults/main.yml:
--------------------------------------------------------------------------------
1 | doca_version: '2.9.1' # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates
2 | doca_profile: doca-ofed
3 | doca_repo_url: "https://linux.mellanox.com/public/repo/doca/{{ doca_version }}/rhel{{ ansible_distribution_version }}/{{ ansible_architecture }}/"
4 | 


--------------------------------------------------------------------------------
/ansible/roles/doca/tasks/install-kernel-devel.yml:
--------------------------------------------------------------------------------
 1 | - name: Get installed kernels
 2 |   command: dnf list --installed kernel
 3 |   register: _ofed_dnf_kernels
 4 |   changed_when: false
 5 | 
 6 | - name: Determine running kernel
 7 |   command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64
 8 |   register: _ofed_loaded_kernel
 9 |   changed_when: false
10 | 
11 | - name: Check current kernel is newest installed
12 |   assert:
13 |     that: _ofed_kernel_current == _ofed_dnf_kernels_newest
14 |     fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?"
15 |   vars:
16 |     _ofed_kernel_current: >-
17 |       {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }}
18 |     _ofed_dnf_kernels_newest: >-
19 |       {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }}
20 |     # dnf line format e.g. "kernel.x86_64  4.18.0-513.18.1.el8_9   @baseos  "
21 | 
22 | - name: Install matching kernel-devel package
23 |   dnf:
24 |     name: "kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}"
25 | 


--------------------------------------------------------------------------------
/ansible/roles/doca/tasks/install.yml:
--------------------------------------------------------------------------------
 1 | - import_tasks: install-kernel-devel.yml
 2 | 
 3 | - name: Install DOCA repo
 4 |   ansible.builtin.yum_repository:
 5 |     name: doca
 6 |     file: doca
 7 |     description: DOCA Online Repo
 8 |     baseurl: "{{ doca_repo_url }}"
 9 |     enabled: true
10 |     gpgcheck: false
11 | 
12 | - name: Install doca-extra package
13 |   ansible.builtin.dnf:
14 |     name: doca-extra
15 | 
16 | - name: Build DOCA kernel modules
17 |   ansible.builtin.shell:
18 |     cmd: /opt/mellanox/doca/tools/doca-kernel-support
19 |   register: _doca_kernel_build
20 | 
21 | 
22 | - name: Find generated doca-kernel-repo
23 |   ansible.builtin.shell: 'find /tmp/DOCA.* -name doca-kernel-repo-*'
24 |   register: _doca_kernel_repo # e.g. /tmp/DOCA.WVMchs2QWo/doca-kernel-repo-24.10.1.1.4.0-1.kver.5.14.0.427.31.1.el9.4.x86.64.x86_64.rpm
25 |   changed_when: false
26 | 
27 | - name: Create dnf cache
28 |   ansible.builtin.command: dnf makecache
29 | 
30 | - name: Install DOCA repository package
31 |   ansible.builtin.dnf:
32 |     name: "{{ _doca_kernel_repo.stdout }}"
33 |     disable_gpg_check: true
34 | 
35 | - name: Install DOCA packages
36 |   ansible.builtin.dnf:
37 |     name: "{{ doca_profile }}"
38 | 
39 | - name: Cleanup DOCA build directories
40 |   ansible.builtin.file:
41 |     state: absent
42 |     path: "{{ (_doca_kernel_repo.stdout | split('/'))[:3] | join('/') }}" # leading / means 1st element of split list is ''
43 | 
44 | - name: Update initramfs
45 |   ansible.builtin.command:
46 |     cmd: dracut -f
47 |   register: _doca_dracut
48 |   failed_when: _doca_dracut.stderr != '' # appears rc is always 0
49 | 
50 | - name: Load the new driver
51 |   ansible.builtin.command: /etc/init.d/openibd restart
52 | 


--------------------------------------------------------------------------------
/ansible/roles/doca/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - include_tasks: install.yml
2 | 


--------------------------------------------------------------------------------
/ansible/roles/eessi/README.md:
--------------------------------------------------------------------------------
 1 | EESSI
 2 | =====
 3 | 
 4 | Configure the EESSI pilot respository for use on given hosts.
 5 | 
 6 | Requirements
 7 | ------------
 8 | 
 9 | None.
10 | 
11 | Role Variables
12 | --------------
13 | 
14 | - `cvmfs_quota_limit_mb`: Optional int. Maximum size of local package cache on each node in MB.
15 | - `cvmfs_config_overrides`: Optional dict. Set of key-value pairs for additional CernVM-FS settings see [official docs](https://cvmfs.readthedocs.io/en/stable/cpt-configure.html) for list of options. Each dict key should correspond to a valid config variable (e.g. `CVMFS_HTTP_PROXY`) and the corresponding dict value will be set as the variable value (e.g. `https://my-proxy.com`). These configuration parameters will be written to the `/etc/cvmfs/default.local` config file on each host in the form `KEY=VALUE`.
16 | 
17 | Dependencies
18 | ------------
19 | 
20 | None.
21 | 
22 | Example Playbook
23 | ----------------
24 | 
25 | ```yaml
26 | - name: Setup EESSI
27 |   hosts: eessi
28 |   tags: eessi
29 |   become: true
30 |   tasks:
31 |     - name: Install and configure EESSI
32 |       import_role:
33 |         name: eessi
34 | ```
35 | 


--------------------------------------------------------------------------------
/ansible/roles/eessi/defaults/main.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Default to 10GB
 3 | cvmfs_quota_limit_mb: 10000
 4 | 
 5 | cvmfs_config_default:
 6 |   CVMFS_CLIENT_PROFILE: single
 7 |   CVMFS_QUOTA_LIMIT: "{{ cvmfs_quota_limit_mb }}"
 8 | 
 9 | cvmfs_config_overrides: {}
10 | 
11 | cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}"
12 | 
13 | cvmfs_gpg_checksum: "sha256:4ac81adff957565277cfa6a4a330cdc2ce5a8fdd73b8760d1a5a32bef71c4bd6"
14 | 


--------------------------------------------------------------------------------
/ansible/roles/eessi/tasks/main.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - name: Download Cern GPG key
 3 |   ansible.builtin.get_url:
 4 |     url: http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM
 5 |     dest: ./cvmfs-key.gpg
 6 |     checksum: "{{ cvmfs_gpg_checksum }}"
 7 | 
 8 | - name: Import downloaded GPG key
 9 |   command: rpm --import cvmfs-key.gpg
10 | 
11 | - name: Add CVMFS repo
12 |   dnf:
13 |     name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm
14 |     disable_gpg_check: true
15 | 
16 | - name: Install CVMFS
17 |   dnf:
18 |     name: cvmfs
19 | 
20 | - name: Install EESSI CVMFS config
21 |   dnf:
22 |     name: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi-latest.noarch.rpm
23 |     # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok?
24 |     disable_gpg_check: true
25 | 
26 | # Alternative version using official repo - still no GPG key :(
27 | # - name: Add EESSI repo
28 | #   dnf:
29 | #     name: http://repo.eessi-infra.org/eessi/rhel/8/noarch/eessi-release-0-1.noarch.rpm
30 | 
31 | # - name: Install EESSI CVMFS config
32 | #   dnf:
33 | #     name: cvmfs-config-eessi
34 | 
35 | - name: Add base CVMFS config
36 |   community.general.ini_file:
37 |     dest: /etc/cvmfs/default.local
38 |     section: null
39 |     option: "{{ item.key }}"
40 |     value: "{{ item.value }}"
41 |     no_extra_spaces: true
42 |   loop: "{{ cvmfs_config | dict2items }}"
43 | 
44 | 
45 | # NOTE: Not clear how to make this idempotent
46 | - name: Ensure CVMFS config is setup
47 |   command:
48 |     cmd: "cvmfs_config setup"
49 | 


--------------------------------------------------------------------------------
/ansible/roles/etc_hosts/README.md:
--------------------------------------------------------------------------------
 1 | # etc_hosts
 2 | 
 3 | Hosts in the `etc_hosts` groups have `/etc/hosts` created with entries of the format `IP_address canonical_hostname [alias]`.
 4 | 
 5 | By default, an entry is created for each host in this group as follows:
 6 | - The value of `ansible_host` is used as the IP_address.
 7 | - If `node_fqdn` is defined then that is used as the canonical hostname and `inventory_hostname` as an alias. Otherwise `inventory_hostname` is used as the canonical hostname.
 8 | This may need overriding for multi-homed hosts or hosts with multiple aliases.
 9 | 
10 | # Variables
11 | 
12 | - `etc_hosts_template`: Template file to use. Default is the in-role template.
13 | - `etc_hosts_hostvars`: A list of variable names, used (in the order supplied) to create the entry for each host. Default is described above.
14 | - `etc_hosts_extra_hosts`: String (possibly multi-line) defining additional hosts to add to `/etc/hosts`. Default is empty string.
15 | 


--------------------------------------------------------------------------------
/ansible/roles/etc_hosts/defaults/main.yml:
--------------------------------------------------------------------------------
1 | etc_hosts_template: hosts.j2
2 | etc_hosts_hostvars: "{{ ['ansible_host'] + (['node_fqdn'] if node_fqdn is defined else []) + ['inventory_hostname'] }}"
3 | etc_hosts_extra_hosts: ''
4 | 


--------------------------------------------------------------------------------
/ansible/roles/etc_hosts/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - name: Template out /etc/hosts
2 |   template:
3 |     src: "{{ etc_hosts_template }}"
4 |     dest: /etc/hosts
5 |     owner: root
6 |     group: root
7 |     mode: 0644
8 |   become: yes
9 | 


--------------------------------------------------------------------------------
/ansible/roles/etc_hosts/templates/hosts.j2:
--------------------------------------------------------------------------------
 1 | 127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
 2 | ::1         localhost localhost.localdomain localhost6 localhost6.localdomain6
 3 | 
 4 | {% for inventory_hostname in groups['etc_hosts'] | sort -%}
 5 | {{ hostvars[inventory_hostname] | json_query('['  + ( etc_hosts_hostvars | join(', ') ) + ']' ) | join(' ')}}
 6 | {% endfor %}
 7 | {% if etc_hosts_extra_hosts != '' %}
 8 | {{ etc_hosts_extra_hosts }}
 9 | {% endif %}
10 | 


--------------------------------------------------------------------------------
/ansible/roles/fail2ban/README.md:
--------------------------------------------------------------------------------
 1 | fail2ban
 2 | =========
 3 | 
 4 | Setup fail2ban to protect SSH on a host.
 5 | 
 6 | Note that no email alerts are set up so logs (at `/var/log/fail2ban.log`) will have to be manually reviewed if required.
 7 | 
 8 | Requirements
 9 | ------------
10 | 
11 | - An EL8 system.
12 | - `firewalld` running.
13 | 
14 | Role Variables
15 | --------------
16 | None.
17 | 
18 | Dependencies
19 | ------------
20 | 
21 | None.
22 | 
23 | Example Playbook
24 | ----------------
25 | 
26 | ```yaml
27 | - hosts: fail2ban
28 |   gather_facts: false
29 |   become: yes
30 |   tasks:
31 |     - import_role:
32 |         name: firewalld
33 |     - import_role:
34 |         name: fail2ban
35 | ```
36 | 
37 | License
38 | -------
39 | 
40 | Apache v2
41 | 
42 | Author Information
43 | ------------------
44 | 
45 | stackhpc.com
46 | 


--------------------------------------------------------------------------------
/ansible/roles/fail2ban/handlers/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | 
3 | - name: Restart fail2ban
4 |   service:
5 |     name: fail2ban
6 |     state: restarted
7 |     enabled: true
8 | 


--------------------------------------------------------------------------------
/ansible/roles/fail2ban/meta/main.yml:
--------------------------------------------------------------------------------
 1 | galaxy_info:
 2 |   author: Steve Brasier
 3 |   company: stackhpc
 4 | 
 5 |   # If the issue tracker for your role is not on github, uncomment the
 6 |   # next line and provide a value
 7 |   # issue_tracker_url: http://example.com/issue/tracker
 8 | 
 9 |   # Choose a valid license ID from https://spdx.org - some suggested licenses:
10 |   # - BSD-3-Clause (default)
11 |   # - MIT
12 |   # - GPL-2.0-or-later
13 |   # - GPL-3.0-only
14 |   # - Apache-2.0
15 |   # - CC-BY-4.0
16 |   license: Apache-2.0
17 | 
18 |   min_ansible_version: 2.1
19 | 
20 |   # If this a Container Enabled role, provide the minimum Ansible Container version.
21 |   # min_ansible_container_version:
22 | 
23 |   #
24 |   # Provide a list of supported platforms, and for each platform a list of versions.
25 |   # If you don't wish to enumerate all versions for a particular platform, use 'all'.
26 |   # To view available platforms and versions (or releases), visit:
27 |   # https://galaxy.ansible.com/api/v1/platforms/
28 |   #
29 |   platforms:
30 |   - name: EL
31 |     versions:
32 |     - 8
33 | 
34 |   galaxy_tags: []
35 |     # List tags for your role here, one per line. A tag is a keyword that describes
36 |     # and categorizes the role. Users find roles by searching for tags. Be sure to
37 |     # remove the '[]' above, if you add tags to this list.
38 |     #
39 |     # NOTE: A tag is limited to a single word comprised of alphanumeric characters.
40 |     #       Maximum 20 tags per role.
41 | 
42 | dependencies: []
43 |   # List your role dependencies here, one per line. Be sure to remove the '[]' above,
44 |   # if you add dependencies to this list.
45 | 


--------------------------------------------------------------------------------
/ansible/roles/fail2ban/tasks/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - name: Install EPEL repo
 3 |   package:
 4 |     name: epel-release
 5 | 
 6 | - name: Install fail2ban packages
 7 |   package:
 8 |     name:
 9 |       - fail2ban-server
10 |       - fail2ban-firewalld
11 |     state: present
12 | 
13 | - name: Create config
14 |   template:
15 |     dest: /etc/fail2ban/jail.local
16 |     src: jail.local.j2
17 |   notify: Restart fail2ban
18 | 
19 | - name: flush handlers
20 |   meta: flush_handlers
21 | 
22 | - name: Ensure fail2ban running even if no config change
23 |   service:
24 |     name: fail2ban
25 |     state: started
26 |     enabled: true
27 | 


--------------------------------------------------------------------------------
/ansible/roles/fail2ban/templates/jail.local.j2:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | bantime = 3600
3 | action = %(action_)s
4 | 
5 | [sshd]
6 | enabled = true
7 | 


--------------------------------------------------------------------------------
/ansible/roles/filebeat/defaults/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | 
3 | #filebeat_config_path: undefined # REQUIRED. Path to filebeat.yml configuration file template
4 | filebeat_podman_user: "{{ ansible_user }}" # User that runs the filebeat container
5 | filebeat_version: 7.12.1 # latest usable with opensearch - see https://opensearch.org/docs/2.4/tools/index/#compatibility-matrix-for-beats
6 | filebeat_debug: false
7 | 


--------------------------------------------------------------------------------
/ansible/roles/filebeat/handlers/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Restart filebeat container
 4 |   systemd:
 5 |     name: filebeat.service
 6 |     state: restarted
 7 |     enabled: yes
 8 |     daemon_reload: yes
 9 |   become: true
10 | 


--------------------------------------------------------------------------------
/ansible/roles/filebeat/tasks/install.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - name: Create systemd unit file
 3 |   template:
 4 |     dest: /etc/systemd/system/filebeat.service
 5 |     src: filebeat.service.j2
 6 |   become: true
 7 |   register: _filebeat_unit
 8 | 
 9 | - name: Pull container image
10 |   containers.podman.podman_image:
11 |     name: "docker.elastic.co/beats/filebeat-oss"
12 |     tag: "{{ filebeat_version }}"
13 |   become_user: "{{ filebeat_podman_user }}"
14 | 
15 | - name: Reload filebeat unit file
16 |   command: systemctl daemon-reload
17 |   when: _filebeat_unit.changed
18 |   become: true
19 | 


--------------------------------------------------------------------------------
/ansible/roles/filebeat/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - import_tasks: install.yml
2 | - import_tasks: runtime.yml
3 | 


--------------------------------------------------------------------------------
/ansible/roles/filebeat/tasks/runtime.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Collect usernamespace facts
 4 |   user_namespace_facts:
 5 | 
 6 | - name: Set facts containing sub-ids
 7 |   set_fact:
 8 |     # filebeat user is 1000
 9 |     filebeat_host_user_id: "{{ ansible_facts.subuid[filebeat_podman_user]['start'] + 1000 - 1 }}"
10 |     filebeat_host_group_id: "{{ ansible_facts.subgid[filebeat_podman_user]['start'] + 1000 - 1 }}"
11 | 
12 | - name: Ensure parent directory exists
13 |   file:
14 |     state: directory
15 |     path: "/etc/filebeat"
16 |     owner: "{{ filebeat_host_user_id }}"
17 |     group: "{{ filebeat_host_group_id }}"
18 |     mode: 0770
19 |   become: true
20 | 
21 | - name: Template configuration files
22 |   template:
23 |       src: "{{ filebeat_config_path }}"
24 |       dest: /etc/filebeat/filebeat.yml
25 |       owner: "{{ filebeat_host_user_id }}"
26 |       group: "{{ filebeat_host_group_id }}"
27 |       mode: 0600
28 |   notify: Restart filebeat container
29 |   become: true
30 | 
31 | - name: Flush handlers
32 |   meta: flush_handlers
33 | 
34 | - name: Ensure filebeat service state
35 |   systemd:
36 |     name: filebeat.service
37 |     state: started
38 |     enabled: true
39 |   become: true
40 | 


--------------------------------------------------------------------------------
/ansible/roles/filebeat/tasks/validate.yml:
--------------------------------------------------------------------------------
1 | ---
2 | 
3 | - name: Assert that filebeat_config_path is defined
4 |   assert:
5 |     that: filebeat_config_path is defined


--------------------------------------------------------------------------------
/ansible/roles/filebeat/templates/filebeat.service.j2:
--------------------------------------------------------------------------------
 1 | # container-filebeat.service
 2 | # based off
 3 | #   podman generate systemd filebeat --restart-policy always --new --name
 4 | # with pid/cidfiles replaced with --sdnotify=conmon approach
 5 | 
 6 | [Unit]
 7 | Description=Podman container-filebeat.service
 8 | Documentation=man:podman-generate-systemd(1)
 9 | Wants=network.target
10 | After=network-online.target
11 | 
12 | [Service]
13 | Environment=PODMAN_SYSTEMD_UNIT=%n
14 | Restart=always
15 | ExecStart=/usr/bin/podman run \
16 |     --network=host \
17 |     --sdnotify=conmon \
18 |     --cgroups=no-conmon \
19 |     --replace \
20 |     --name filebeat \
21 |     --user root \
22 |     --restart=always \
23 |     --security-opt label=disable \
24 |     --volume /var/log/:/logs:ro \
25 |     --volume /etc/filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:ro \
26 |     --detach=True docker.elastic.co/beats/filebeat-oss:{{ filebeat_version }} \
27 |     -e -strict.perms=false -d "*"
28 | ExecStop=/usr/bin/podman stop --ignore filebeat -t 10
29 | ExecStopPost=/usr/bin/podman rm --ignore -f filebeat
30 | KillMode=none
31 | Type=notify
32 | NotifyAccess=all
33 | User={{ filebeat_podman_user }}
34 | Group={{ filebeat_podman_user }}
35 | TimeoutStartSec=180
36 | 
37 | [Install]
38 | WantedBy=multi-user.target default.target
39 | 


--------------------------------------------------------------------------------
/ansible/roles/firewalld/README.md:
--------------------------------------------------------------------------------
 1 | Role Name
 2 | =========
 3 | 
 4 | Install and configure the `firewalld` firewall.
 5 | 
 6 | Requirements
 7 | ------------
 8 | 
 9 | EL8 host
10 | 
11 | Role Variables
12 | --------------
13 | 
14 | - `firewalld_enabled`: Optional. Whether `firewalld` service is enabled (starts at boot). Default `yes`.
15 | - `firewalld_state`: Optional. State of `firewalld` service. Default `started`. Other values: `stopped`.
16 | - `firewalld_configs`: Optional. List of dicts giving parameters for [ansible.posix.firewalld module](https://docs.ansible.com/ansible/latest/collections/ansible/posix/firewalld_module.html). Default is an empty list.
17 | 
18 | Note that the default configuration for firewalld on Rocky Linux 8.5 is as follows:
19 | ```shell
20 | #  firewall-offline-cmd --list-all
21 | public
22 |   target: default
23 |   icmp-block-inversion: no
24 |   interfaces: 
25 |   sources: 
26 |   services: cockpit dhcpv6-client ssh
27 |   ports: 
28 |   protocols: 
29 |   forward: no
30 |   masquerade: no
31 |   forward-ports: 
32 |   source-ports: 
33 |   icmp-blocks: 
34 |   rich rules: 
35 | ```
36 | 
37 | Dependencies
38 | ------------
39 | 
40 | None.
41 | 
42 | Example Playbook
43 | ----------------
44 | 
45 | ```
46 | - hosts: firewalld
47 |   gather_facts: false
48 |   become: yes
49 |   tags: firewalld
50 |   tasks:
51 |     - import_role:
52 |         name: firewalld
53 | ```
54 | 
55 | License
56 | -------
57 | 
58 | BSD
59 | 
60 | Author Information
61 | ------------------
62 | 
63 | An optional section for the role authors to include contact information, or a website (HTML is not allowed).
64 | 


--------------------------------------------------------------------------------
/ansible/roles/firewalld/defaults/main.yml:
--------------------------------------------------------------------------------
1 | firewalld_enabled: yes
2 | firewalld_state: started
3 | firewalld_configs: []
4 | 


--------------------------------------------------------------------------------
/ansible/roles/firewalld/handlers/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - name: Restart filewalld
3 |   service:
4 |     name: firewalld
5 |     state: restarted
6 |   when: firewalld_state != 'stopped'
7 | 


--------------------------------------------------------------------------------
/ansible/roles/firewalld/tasks/install.yml:
--------------------------------------------------------------------------------
1 | - name: Install firewalld package
2 |   dnf:
3 |     name: firewalld
4 | 


--------------------------------------------------------------------------------
/ansible/roles/firewalld/tasks/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - import_tasks: install.yml
3 | - import_tasks: runtime.yml
4 | 


--------------------------------------------------------------------------------
/ansible/roles/firewalld/tasks/runtime.yml:
--------------------------------------------------------------------------------
 1 | - name: Apply filewalld configs
 2 |   ansible.posix.firewalld: "{{ item }}"
 3 |   notify: Restart filewalld
 4 |   loop: "{{ firewalld_configs }}"
 5 | 
 6 | - meta: flush_handlers
 7 | 
 8 | - name: Ensure filewalld state
 9 |   ansible.builtin.systemd:
10 |     name: firewalld
11 |     state: "{{ firewalld_state }}"
12 |     enabled: "{{ firewalld_enabled | default(true) }}"
13 | 


--------------------------------------------------------------------------------
/ansible/roles/firewalld/vars/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | # vars file for firewalld
3 | 


--------------------------------------------------------------------------------
/ansible/roles/freeipa/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | #freeipa_realm:
 2 | freeipa_domain: "{{ freeipa_realm | lower }}"
 3 | #freeipa_ds_password:
 4 | #freeipa_admin_password:
 5 | #freeipa_server_ip:
 6 | freeipa_setup_dns: "{{ groups['freeipa_server'] | length > 0 }}"
 7 | freeipa_client_ip: "{{ ansible_host }}" # when run on freeipa_client group!
 8 | # freeipa_host_password:
 9 | freeipa_user_defaults:
10 |   ipa_pass: "{{ freeipa_admin_password | quote }}"
11 |   ipa_user: admin
12 | freeipa_users: [] # see community.general.ipa_user
13 | 
14 | _freeipa_keytab_backup_path: "{{ hostvars[groups['control'].0].appliances_state_dir  }}/freeipa/{{ inventory_hostname }}/krb5.keytab"
15 | 


--------------------------------------------------------------------------------
/ansible/roles/freeipa/tasks/addhost.yml:
--------------------------------------------------------------------------------
 1 | - name: Get ipa host information
 2 |   # This uses DNS to find the ipa server, which works as this is running on the enrolled ipa server
 3 |   # It doesn't fail even if the host doesn't exist
 4 |   community.general.ipa_host:
 5 |     name: "{{ node_fqdn }}"
 6 |     ip_address: "{{ freeipa_client_ip }}"
 7 |     ipa_pass: "{{ vault_freeipa_admin_password }}"
 8 |     ipa_user: admin
 9 |     state: present
10 |     validate_certs: false
11 |   delegate_to: "{{ groups['freeipa_server'].0 }}"
12 |   register: _ipa_host_check
13 |   check_mode: yes
14 |   changed_when: false
15 | 
16 | - name: Add host to IPA
17 |   # Using random_password=true this unenroles an enroled host, hence the check above
18 |   community.general.ipa_host:
19 |     name: "{{ node_fqdn }}"
20 |     ip_address: "{{ freeipa_client_ip }}"
21 |     ipa_pass: "{{ vault_freeipa_admin_password }}"
22 |     ipa_user: admin
23 |     random_password: true
24 |     state: present
25 |     validate_certs: false
26 |     ipa_timeout: 30
27 |   delegate_to: "{{ groups['freeipa_server'].0 }}"
28 |   when: "'sshpubkeyfp' not in _ipa_host_check.host"
29 |   register: _ipa_host_add
30 | 
31 | - name: Set fact for ipa host password
32 |   set_fact:
33 |     freeipa_host_password: "{{ _ipa_host_add.host.randompassword }}"
34 |   when: _ipa_host_add.changed
35 | 


--------------------------------------------------------------------------------
/ansible/roles/freeipa/tasks/backup-keytabs.yml:
--------------------------------------------------------------------------------
 1 | - name: Retrieve keytabs to localhost
 2 |   fetch:
 3 |     src: "{{ _freeipa_keytab_backup_path }}"
 4 |     dest: "{{ appliances_environment_root }}/keytabs/{{ inventory_hostname }}/"
 5 |     flat: true
 6 |   delegate_to: "{{ groups['control'].0 }}"
 7 |   tags: retrieve
 8 | 
 9 | - name: Copy keytabs back to control node
10 |   copy:
11 |     src: "{{ appliances_environment_root }}/keytabs/{{ inventory_hostname }}/"
12 |     dest: "{{ _freeipa_keytab_backup_path | dirname }}"
13 |   delegate_to: "{{ groups['control'].0 }}"
14 |   tags: deploy
15 | 


--------------------------------------------------------------------------------
/ansible/roles/freeipa/tasks/client-install.yml:
--------------------------------------------------------------------------------
1 | 
2 | - name: Install FreeIPA client package
3 |   dnf:
4 |     name: ipa-client
5 | 


--------------------------------------------------------------------------------
/ansible/roles/freeipa/tasks/users.yml:
--------------------------------------------------------------------------------
 1 | - name: Add users to freeipa
 2 |   # This uses DNS to find the ipa server, which works as this is running on the enrolled ipa server
 3 |   community.general.ipa_user:
 4 |     displayname: "{{ item.displayname | default(omit) }}"
 5 |     gidnumber: "{{ item.gidnumber | default(omit) }}"
 6 |     givenname: "{{ item.givenname }}"
 7 |     #ipa_host
 8 |     ipa_pass: "{{ freeipa_admin_password | quote }}"
 9 |     #ipa_port
10 |     #ipa_prot
11 |     ipa_timeout: "{{ item.ipa_timeout | default(omit) }}"
12 |     #ipa_user
13 |     krbpasswordexpiration: "{{ item.krbpasswordexpiration | default(omit) }}"
14 |     loginshell: "{{ item.loginshell | default(omit) }}"
15 |     mail: "{{ item.mail | default(omit) }}"
16 |     password: "{{ item.password | default(omit) }}"
17 |     sn: "{{ item.sn }}"
18 |     sshpubkey: "{{ item.sshpubkey | default(omit) }}"
19 |     state: "{{ item.state | default(omit) }}"
20 |     telephonenumber: "{{ item.telephonenumber | default(omit) }}"
21 |     title: "{{ item.title | default(omit) }}"
22 |     uid: "{{ item.name | default(item.uid) }}"
23 |     uidnumber: "{{ item.uidnumber | default(omit) }}"
24 |     update_password: "{{ item.update_password | default(omit) }}"
25 |     userauthtype: "{{ item.userauthtype | default(omit) }}"
26 |     #validate_certs
27 |   loop: "{{ freeipa_users }}"
28 | 


--------------------------------------------------------------------------------
/ansible/roles/gateway/README.md:
--------------------------------------------------------------------------------
 1 | # gateway
 2 | 
 3 | Ensure a single default route via a specified address exists on boot.
 4 | 
 5 | **NB:** This role uses `linux-ansible-init` and is not run by the
 6 | `ansible/site.yml` playbook.
 7 | 
 8 | ## Role variables
 9 | 
10 | **NB:** This role has no Ansible variables. Setting the OpenTofu variable
11 | `gateway_ip` to an IPv4 address will modify default routes as necessary to give
12 | the instance a single default route via that address. The default route will
13 | use the interface which has a CIDR including the gateway address.
14 | 
15 | Note that:
16 | - If the correct default route already exists, no changes are made.
17 | - If a default route exists on a different interface, that route will be deleted.
18 | - If a default route exists on the same interface but using a different address,
19 |   an assert will be raised to fail the `ansible-init` service - see logs using
20 |   `journalctl -xue ansible-init`.
21 | 
22 | See [docs/networks.md](../../../docs/networks.md) for further discussion.
23 | 
24 | ## Requirements
25 | 
26 | The image must include both this role and the `linux-ansible-init` role. This
27 | is the case for StackHPC-built images. For custom images use one of the following
28 | configurations during Packer build:
29 | - Add `builder` into the `gateway` group in `environments/$ENV/inventory/groups`
30 | - Add `gateway` to the `inventory_groups` Packer variable
31 | 


--------------------------------------------------------------------------------
/ansible/roles/gateway/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - name: Add gateway playbook
2 |   copy:
3 |     src: gateway-init.yml
4 |     dest: /etc/ansible-init/playbooks/05-gateway-init.yml
5 |     owner: root
6 |     group: root
7 |     mode: 0644
8 | 


--------------------------------------------------------------------------------
/ansible/roles/grafana-dashboards/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | grafana_address: "0.0.0.0"
 4 | grafana_port: 3000
 5 | 
 6 | # External Grafana address. Variable maps to "root_url" in grafana server section
 7 | grafana_url: "http://{{ grafana_address }}:{{ grafana_port }}"
 8 | grafana_api_url: "{{ grafana_url }}"
 9 | 
10 | grafana_security:
11 |   admin_user: admin
12 |   admin_password: ""
13 | 
14 | grafana_data_dir: "/var/lib/grafana"
15 | grafana_dashboards_dir: "dashboards"
16 | 


--------------------------------------------------------------------------------
/ansible/roles/hpctests/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | hpctests_user: "{{ ansible_user }}"
 3 | hpctests_group: "{{ hpctests_user }}"
 4 | hpctests_rootdir: "/home/{{ hpctests_user }}/hpctests"
 5 | hpctests_pre_cmd: ''
 6 | hpctests_pingmatrix_modules: [gnu12 openmpi4]
 7 | hpctests_pingpong_modules: [gnu12 openmpi4 imb]
 8 | hpctests_pingpong_plot: yes
 9 | hpctests_hpl_modules: [gnu12 openmpi4 openblas]
10 | hpctests_outdir: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hpctests"
11 | hpctests_ucx_net_devices: all
12 | hpctests_hpl_version: "2.3"
13 | hpctests_hpl_NB: 192
14 | hpctests_hpl_mem_frac: 0.3
15 | hpctests_hpl_arch: linux64
16 | #hpctests_nodes:
17 | #hpctests_partition:
18 | 


--------------------------------------------------------------------------------
/ansible/roles/hpctests/handlers/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | # handlers file for hpctests
3 | 


--------------------------------------------------------------------------------
/ansible/roles/hpctests/tasks/main.yml:
--------------------------------------------------------------------------------
 1 | - name: setup
 2 |   block:
 3 |     - include_tasks: setup.yml
 4 |   become: true
 5 |   become_user: "{{ hpctests_user }}"
 6 |   tags: always
 7 | 
 8 | - name: pingpong
 9 |   block:
10 |     - include_tasks: pingpong.yml
11 |       when: hpctests_computes.stdout_lines | length > 1
12 |   become: true
13 |   become_user: "{{ hpctests_user }}"
14 |   tags: pingpong
15 | 
16 | - name: pingmatrix
17 |   block:
18 |     - include_tasks: pingmatrix.yml
19 |       when: hpctests_computes.stdout_lines | length > 1
20 |   become: true
21 |   become_user: "{{ hpctests_user }}"
22 |   tags: pingmatrix
23 | 
24 | - name: build HPL
25 |   block:
26 |     - include_tasks: build-hpl.yml
27 |   become: true
28 |   become_user: "{{ hpctests_user }}"
29 |   tags:
30 |     - hpl-solo
31 | 
32 | - name: run HPL on individual nodes
33 |   block:
34 |     - include_tasks: hpl-solo.yml
35 |   become: true
36 |   become_user: "{{ hpctests_user }}"
37 |   tags:
38 |     - hpl-solo
39 | 


--------------------------------------------------------------------------------
/ansible/roles/hpctests/tasks/setup.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Get partition information
 4 |   shell: "sinfo --format %P --noheader"
 5 |   register: _sinfo_partitions
 6 |   changed_when: false
 7 | 
 8 | - name: Select default partition if hpctests_partition not given
 9 |   set_fact:
10 |     hpctests_partition: "{{ (_sinfo_partitions.stdout_lines  | select('contains', '*') | first)[:-1] }}"
11 |   when: hpctests_partition is not defined
12 | 
13 | - name: Get info about compute nodes
14 |   shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --partition {{hpctests_partition}} --format %N"
15 |   register: hpctests_computes
16 |   changed_when: false
17 |   failed_when: hpctests_computes.rc != 0
18 | 
19 | - name: Check compute node selection valid
20 |   assert:
21 |     that: hpctests_computes.stdout_lines | length > 0
22 |     fail_msg: "No nodes selected - was variable `hpctests_nodes` set (correctly)?"
23 | 
24 | - name: Create test root directory
25 |   file:
26 |     path: "{{ hpctests_rootdir }}"
27 |     state: directory
28 |     owner: "{{ hpctests_user }}"
29 |     group: "{{ hpctests_group }}"
30 | 
31 | - name: Set fact for UCX_NET_DEVICES
32 |   set_fact:
33 |     hpctests_ucx_net_devices: "{{ hpctests_ucx_net_devices.get(hpctests_partition, 'all') }}"
34 |   when: hpctests_ucx_net_devices is mapping
35 | 


--------------------------------------------------------------------------------
/ansible/roles/hpctests/templates/HPL.dat.j2:
--------------------------------------------------------------------------------
 1 | HPLinpack benchmark input file
 2 | Innovative Computing Laboratory, University of Tennessee
 3 | HPL.out      output file name (if any)
 4 | 6            device out (6=stdout,7=stderr,file)
 5 | 1            # of problems sizes (N)
 6 | {{ hpctests_hpl_N}}           Ns
 7 | 1            # of NBs
 8 | {{ hpctests_hpl_NB }}          NBs
 9 | 0            PMAP process mapping (0=Row-,1=Column-major)
10 | 1            # of process grids (P x Q)
11 | {{ hpctests_hpl_P }}            Ps
12 | {{ hpctests_hpl_Q }}            Qs
13 | 16.0         threshold
14 | 1            # of panel fact
15 | 0            PFACTs (0=left, 1=Crout, 2=Right)
16 | 1            # of recursive stopping criterium
17 | 2            NBMINs (>= 1)
18 | 1            # of panels in recursion
19 | 2            NDIVs
20 | 1            # of recursive panel fact.
21 | 0            RFACTs (0=left, 1=Crout, 2=Right)
22 | 1            # of broadcast
23 | 0            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
24 | 1            # of lookahead depth
25 | 0            DEPTHs (>=0)
26 | 2            SWAP (0=bin-exch,1=long,2=mix)
27 | 64           swapping threshold
28 | 0            L1 in (0=transposed,1=no-transposed) form
29 | 0            U  in (0=transposed,1=no-transposed) form
30 | 1            Equilibration (0=no,1=yes)
31 | 8            memory alignment in double (> 0)


--------------------------------------------------------------------------------
/ansible/roles/hpctests/templates/hpl-build.sh.j2:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/bash
 2 | 
 3 | #SBATCH --nodes=1
 4 | #SBATCH --output=%x.%a.out
 5 | #SBATCH --error=%x.%a.out
 6 | #SBATCH --exclusive
 7 | #SBATCH --partition={{ hpctests_partition }}
 8 | {%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_computes.stdout_lines[0] }}{% endif %}
 9 | 
10 | echo HPL arch: {{ hpctests_hpl_arch }}
11 | {{ hpctests_pre_cmd }}
12 | module load {{ hpctests_hpl_modules | join(' ' ) }}
13 | make arch={{ hpctests_hpl_arch }} clean_arch_all
14 | make arch={{ hpctests_hpl_arch }}
15 | 


--------------------------------------------------------------------------------
/ansible/roles/hpctests/templates/hpl-solo.sh.j2:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | #SBATCH --ntasks={{ hpctests_hplsolo_ntasks }}
 4 | #SBATCH --output=%x.%a.out
 5 | #SBATCH --error=%x.%a.out
 6 | #SBATCH --exclusive
 7 | #SBATCH --array=0-{{ hpctests_computes.stdout_lines | length - 1 }}
 8 | #SBATCH --partition={{ hpctests_partition }}
 9 | {% if hpctests_hplsolo_excluded_nodes | length > 0 %}
10 | #SBATCH --exclude={{ hpctests_hplsolo_excluded_nodes | join(',') }}
11 | {% endif %}
12 | 
13 | export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }}
14 | echo SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST
15 | echo SLURM_JOB_ID: $SLURM_JOB_ID
16 | echo UCX_NET_DEVICES: $UCX_NET_DEVICES
17 | echo HPL arch: {{ hpctests_hpl_arch }}
18 | {{ hpctests_pre_cmd }}
19 | module load {{ hpctests_hpl_modules | join(' ' ) }}
20 | mpirun ./xhpl-{{ hpctests_hpl_arch }}
21 | 


--------------------------------------------------------------------------------
/ansible/roles/hpctests/templates/pingmatrix.sh.j2:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | #SBATCH --ntasks={{ hpctests_computes.stdout_lines | length }}
 4 | #SBATCH --ntasks-per-node=1
 5 | #SBATCH --output=%x.out
 6 | #SBATCH --error=%x.out
 7 | #SBATCH --exclusive
 8 | #SBATCH --partition={{ hpctests_partition }}
 9 | {%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_nodes }}{% endif %}
10 | 
11 | export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }}
12 | echo SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST
13 | echo SLURM_JOB_ID: $SLURM_JOB_ID
14 | echo UCX_NET_DEVICES: $UCX_NET_DEVICES
15 | {{ hpctests_pre_cmd }}
16 | module load {{ hpctests_pingmatrix_modules | join(' ' ) }}
17 | 
18 | mpicc -o nxnlatbw mpi_nxnlatbw.c
19 | 
20 | # mpirun flags force using UCX TCP transports, overriding higher
21 | # priority of OpenMPI btl/openib component, which is also using RDMA
22 | # https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 
23 | mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any nxnlatbw
24 | 


--------------------------------------------------------------------------------
/ansible/roles/hpctests/templates/pingpong.sh.j2:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | #SBATCH --ntasks=2
 4 | #SBATCH --ntasks-per-node=1
 5 | #SBATCH --output=%x.out
 6 | #SBATCH --error=%x.out
 7 | #SBATCH --exclusive
 8 | #SBATCH --partition={{ hpctests_partition }}
 9 | {%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_nodes }}{% endif %}
10 | 
11 | export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }}
12 | echo SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST
13 | echo SLURM_JOB_ID: $SLURM_JOB_ID
14 | echo UCX_NET_DEVICES: $UCX_NET_DEVICES
15 | {{ hpctests_pre_cmd }}
16 | module load {{ hpctests_pingpong_modules | join(' ' ) }}
17 | 
18 | #srun --mpi=pmi2 IMB-MPI1 pingpong # doesn't work in ohpc v2.1
19 | 
20 | # mpirun flags force using UCX TCP transports, overriding higher 
21 | # priority of OpenMPI btl/openib component, which is also using RDMA 
22 | # https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100
23 | mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any IMB-MPI1 pingpong
24 | 


--------------------------------------------------------------------------------
/ansible/roles/hpctests/tests/inventory:
--------------------------------------------------------------------------------
1 | localhost
2 | 
3 | 


--------------------------------------------------------------------------------
/ansible/roles/hpctests/tests/test.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - hosts: localhost
3 |   remote_user: root
4 |   roles:
5 |     - hpctests
6 | 


--------------------------------------------------------------------------------
/ansible/roles/hpctests/vars/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | hpctests_hpl_srcdir: "{{ hpctests_rootdir }}/hpl/hpl-{{ hpctests_hpl_version }}"
3 | 


--------------------------------------------------------------------------------
/ansible/roles/k3s/README.md:
--------------------------------------------------------------------------------
 1 | k3s
 2 | =====
 3 | 
 4 | Installs k3s agent and server services on nodes and an ansible-init playbook to activate them. The service that each node will activate on init is determined by OpenStack metadata. Also includes Helm install. Currently only supports a single k3s-server
 5 | (i.e one control node). Install based on the [official k3s ansible role](https://github.com/k3s-io/k3s-ansible).
 6 | 
 7 | 
 8 | Requirements
 9 | ------------
10 | 
11 | `azimuth_cloud.image_utils.linux_ansible_init` must have been run previously on targeted nodes during image build.
12 | 
13 | Role Variables
14 | --------------
15 | 
16 | - `k3s_version`: Optional str. K3s version to install, see [official releases](https://github.com/k3s-io/k3s/releases/).
17 | 


--------------------------------------------------------------------------------
/ansible/roles/k3s/defaults/main.yml:
--------------------------------------------------------------------------------
1 | # Warning: changes to these variables won't be reflected in the cluster/image if k3s is already installed
2 | k3s_version: "v1.31.0+k3s1"
3 | k3s_selinux_release: v1.6.latest.1
4 | k3s_selinux_rpm_version: 1.6-1
5 | k3s_helm_version: v3.11.0
6 | k3s_bootstrap_token: '' # matches common environment default
7 | k3s_bootstrap_token_expiry: 10m
8 | k3s_server_name: "{{ None }}" # ansible managed
9 | 


--------------------------------------------------------------------------------
/ansible/roles/k3s/tasks/agent-runtime.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Template k3s agent env file
 4 |   when: k3s_bootstrap_token != ''
 5 |   ansible.builtin.template:
 6 |     dest: /etc/systemd/system/k3s-agent.service.env
 7 |     src: k3s-agent.service.env.j2
 8 |     owner: root
 9 |     group: root
10 |     mode: 0640
11 |   register: _k3s_agent_token_result
12 | 
13 | - name: Ensure password directory exists
14 |   ansible.builtin.file: 
15 |     path: "/etc/rancher/node"
16 |     state: directory
17 |     owner: root
18 |     group: root
19 |     mode: 0640
20 | 
21 | - name: Write node password
22 |   ansible.builtin.copy:
23 |     dest: /etc/rancher/node/password
24 |     content: "{{ vault_k3s_node_password }}"
25 |     owner: root
26 |     group: root
27 |     mode: 0640 # normal k3s install is 644 but that doesn't feel right
28 | 
29 | - name: Start/restart k3s agent
30 |   when: _k3s_agent_token_result.changed
31 |   ansible.builtin.systemd:
32 |     name: k3s-agent
33 |     daemon_reload: true
34 |     state: restarted
35 |     enabled: true
36 | 


--------------------------------------------------------------------------------
/ansible/roles/k3s/tasks/server-runtime.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Template k3s env file
 4 |   ansible.builtin.template:
 5 |     dest: /etc/systemd/system/k3s.service.env
 6 |     src: k3s.service.env.j2
 7 |   register: _k3s_env_file_status
 8 | 
 9 | - name: Start k3s server
10 |   ansible.builtin.systemd:
11 |     name: k3s
12 |     daemon_reload: "{{ _k3s_env_file_status.changed }}"
13 |     state: started
14 |     enabled: true
15 | 
16 | # Possible race here as there is a delay between agents disconnecting and being registered as down, probably won't be hit in general use though
17 | - name: Check which k3s agents are connected
18 |   ansible.builtin.shell:
19 |     cmd: kubectl get nodes --no-headers | grep -w Ready
20 |   register: _k3s_connected_nodes
21 |   retries: 6 # task may fail if server is not ready yet
22 |   delay: 10
23 |   until: not _k3s_connected_nodes.failed
24 | 
25 | - when: _k3s_connected_nodes.stdout_lines | length != groups['k3s'] | length
26 |   block:
27 |   - name: Generate new bootstrap token if not all agents are connected
28 |     no_log: true
29 |     shell:
30 |       cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}"
31 |     register: _k3s_token_output
32 | 
33 |   - name: Set bootstrap token as fact
34 |     set_fact:
35 |       k3s_bootstrap_token: "{{ _k3s_token_output.stdout }}"
36 | 


--------------------------------------------------------------------------------
/ansible/roles/k3s/templates/k3s-agent.service.env.j2:
--------------------------------------------------------------------------------
1 | K3S_NODE_IP={{ ansible_host }}
2 | K3S_TOKEN={{ k3s_bootstrap_token }}
3 | K3S_URL=https://{{ k3s_server_name }}:6443
4 | 


--------------------------------------------------------------------------------
/ansible/roles/k3s/templates/k3s.service.env.j2:
--------------------------------------------------------------------------------
1 | K3S_NODE_IP={{ ansible_host }}
2 | 


--------------------------------------------------------------------------------
/ansible/roles/k9s/tasks/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 |   
 3 |   - name: Check if k9s is installed
 4 |     ansible.builtin.stat:
 5 |       path: "/usr/bin/k9s"
 6 |     register: _k9s_stat_result
 7 | 
 8 |   - name: Install k9s and clean up temporary files
 9 |     block:
10 |     - name: Create install directory
11 |       ansible.builtin.file:
12 |         path: /tmp/k9s
13 |         state: directory
14 |         owner: root
15 |         group: root
16 |         mode: "744"
17 |       when: not _k9s_stat_result.stat.exists
18 | 
19 |     - name: Download k9s
20 |       ansible.builtin.get_url:
21 |         url: https://github.com/derailed/k9s/releases/download/v0.32.5/k9s_Linux_amd64.tar.gz
22 |         dest: /tmp/k9s/k9s_Linux_amd64.tar.gz
23 |         owner: root
24 |         group: root
25 |         mode: "744"
26 | 
27 |     - name: Unpack k9s binary
28 |       ansible.builtin.unarchive:
29 |         src: /tmp/k9s/k9s_Linux_amd64.tar.gz
30 |         dest: /tmp/k9s
31 |         remote_src: yes
32 | 
33 |     - name: Add k9s to root path
34 |       ansible.builtin.copy:
35 |         src: /tmp/k9s/k9s
36 |         dest: /usr/bin/k9s
37 |         mode: u+rwx
38 |         remote_src: yes
39 | 
40 |     - name: Cleanup k9s install directory
41 |       ansible.builtin.file:
42 |         path: /tmp/k9s
43 |         state: absent
44 |     when: not _k9s_stat_result.stat.exists
45 | 


--------------------------------------------------------------------------------
/ansible/roles/lustre/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | lustre_repo: https://github.com/stackhpc/lustre-release.git
 2 | lustre_version: '2.15.6/lu-18085' # Fixes https://jira.whamcloud.com/browse/LU-18085
 3 | lustre_lnet_label: tcp
 4 | #lustre_mgs_nid:
 5 | lustre_mounts: []
 6 | lustre_mount_state: mounted
 7 | lustre_mount_options: 'defaults,_netdev,noauto,x-systemd.automount,x-systemd.requires=lnet.service,nosuid,nodev'
 8 | 
 9 | # below variables are for build and should not generally require changes
10 | lustre_git_repo: "git://git.whamcloud.com/fs/lustre-release.git"
11 | lustre_build_packages:
12 |   - "kernel-devel-{{ ansible_kernel }}"
13 |   - git
14 |   - gcc
15 |   - libtool
16 |   - python3
17 |   - python3-devel
18 |   - openmpi
19 |   - elfutils-libelf-devel
20 |   - libmount-devel
21 |   - libnl3-devel
22 |   - libyaml-devel
23 |   - rpm-build
24 |   - kernel-abi-stablelists
25 |   - libaio
26 |   - libaio-devel
27 | lustre_build_dir: /tmp/lustre-release
28 | lustre_configure_opts:
29 |   - --disable-server
30 |   - --with-linux=/usr/src/kernels/*
31 |   - --with-o2ib=/usr/src/ofa_kernel/default
32 |   - --disable-maintainer-mode
33 |   - --disable-gss-keyring
34 |   - --enable-mpitests=no
35 | lustre_rpm_globs: # NB: order is important here, as not installing from a repo
36 |   - "kmod-lustre-client-{{ lustre_version | split('.') | first }}*" # only take part of the version as -RC versions produce _RC rpms
37 |   - "lustre-client-{{ lustre_version | split('.') | first }}*"
38 | lustre_build_cleanup: true
39 | 


--------------------------------------------------------------------------------
/ansible/roles/lustre/tasks/configure.yml:
--------------------------------------------------------------------------------
 1 | - name: Gather Lustre interface info
 2 |   shell:
 3 |     cmd: |
 4 |       ip --json r get {{ _lustre_mgs_ip }}
 5 |   changed_when: false
 6 |   register: _lustre_ip_r_mgs
 7 |   vars:
 8 |     _lustre_mgs_ip: "{{ lustre_mgs_nid | split('@') | first }}"
 9 | 
10 | - name: Set facts for Lustre interface
11 |   set_fact:
12 |     _lustre_interface: "{{ _lustre_ip_r_mgs_info.dev }}"
13 |     _lustre_ip: "{{ _lustre_ip_r_mgs_info.prefsrc }}"
14 |   vars:
15 |     _lustre_ip_r_mgs_info: "{{ _lustre_ip_r_mgs.stdout | from_json | first }}"
16 | 
17 | - name: Write LNet configuration file
18 |   template:
19 |     src: lnet.conf.j2
20 |     dest:  /etc/lnet.conf # exists from package install, expected by lnet service
21 |     owner: root
22 |     group: root
23 |     mode: u=rw,go=r # from package install
24 |   register: _lnet_conf
25 | 
26 | - name: Ensure lnet service state
27 |   systemd:
28 |     name: lnet
29 |     state: "{{ 'restarted' if _lnet_conf.changed else 'started' }}"
30 | 
31 | - name: Ensure mount points exist
32 |   ansible.builtin.file:
33 |     path: "{{ item.mount_point }}"
34 |     state: directory
35 |   loop: "{{ lustre_mounts }}"
36 |   when: "(item.mount_state | default(lustre_mount_state)) != 'absent'"
37 | 
38 | - name: Mount lustre filesystem
39 |   ansible.posix.mount:
40 |     fstype: lustre
41 |     src: "{{ lustre_mgs_nid }}:/{{ item.fs_name }}"
42 |     path: "{{ item.mount_point }}"
43 |     state: "{{ (item.mount_state | default(lustre_mount_state)) }}"
44 |     opts: "{{ item.mount_options | default(lustre_mount_options) }}"
45 |   loop: "{{ lustre_mounts }}"
46 | 


--------------------------------------------------------------------------------
/ansible/roles/lustre/tasks/install.yml:
--------------------------------------------------------------------------------
 1 | - name: Install lustre build prerequisites
 2 |   ansible.builtin.dnf:
 3 |     name: "{{ lustre_build_packages }}"
 4 |   register: _lustre_dnf_build_packages
 5 |  
 6 | - name: Clone lustre git repo
 7 |   ansible.builtin.git:
 8 |     repo: "{{ lustre_repo }}"
 9 |     dest: "{{ lustre_build_dir }}"
10 |     version: "{{ lustre_version }}"
11 | 
12 | - name: Prepare for lustre configuration
13 |   ansible.builtin.command:
14 |     cmd: sh ./autogen.sh
15 |     chdir: "{{ lustre_build_dir }}"
16 | 
17 | - name: Configure lustre build
18 |   ansible.builtin.command:
19 |     cmd: "./configure {{ lustre_configure_opts | join(' ') }}"
20 |     chdir: "{{ lustre_build_dir }}"
21 | 
22 | - name: Build lustre
23 |   ansible.builtin.command:
24 |     cmd: make rpms
25 |     chdir: "{{ lustre_build_dir }}"
26 | 
27 | - name: Find rpms
28 |   ansible.builtin.find:
29 |     paths: "{{ lustre_build_dir }}"
30 |     patterns: "{{ lustre_rpm_globs }}"
31 |     use_regex: false
32 |   register: _lustre_find_rpms
33 | 
34 | - name: Check rpms found
35 |   assert:
36 |     that: _lustre_find_rpms.files | length
37 |     fail_msg: "No lustre repos found with lustre_rpm_globs = {{ lustre_rpm_globs }}"
38 | 
39 | - name: Install lustre rpms
40 |   ansible.builtin.dnf:
41 |     name: "{{ _lustre_find_rpms.files | map(attribute='path')}}"
42 |     disable_gpg_check: yes
43 | 
44 | - name: Delete lustre build dir
45 |   file:
46 |     path: "{{ lustre_build_dir }}"
47 |     state: absent
48 |   when: lustre_build_cleanup | bool
49 | 


--------------------------------------------------------------------------------
/ansible/roles/lustre/tasks/validate.yml:
--------------------------------------------------------------------------------
 1 | - name: Check kernel-devel package is installed
 2 |   command: "dnf list --installed kernel-devel-{{ ansible_kernel }}"
 3 |   changed_when: false
 4 |   # NB: we don't check here the kernel will remain the same after reboot etc, see ofed/install.yml
 5 | 
 6 | - name: Ensure SELinux in permissive mode
 7 |   assert:
 8 |     that: selinux_state in ['permissive', 'disabled']
 9 |     fail_msg: "SELinux must be permissive for Lustre not '{{ selinux_state }}'; see variable selinux_state"
10 | 
11 | - name: Ensure lustre_mgs_nid is defined
12 |   assert:
13 |     that: lustre_mgs_nid is defined
14 |     fail_msg: Variable lustre_mgs_nid must be defined
15 | 
16 | - name: Ensure lustre_mounts entries define filesystem name and mount point
17 |   assert:
18 |     that:
19 |       - item.fs_name is defined
20 |       - item.mount_point is defined
21 |     fail_msg: All lustre_mounts entries must specify fs_name and mount_point
22 |   loop: "{{ lustre_mounts }}"
23 | 


--------------------------------------------------------------------------------
/ansible/roles/lustre/templates/lnet.conf.j2:
--------------------------------------------------------------------------------
1 | net:
2 |     - net type: {{ lustre_lnet_label }}
3 |       local NI(s):
4 |         - nid: {{ _lustre_ip }}@{{ lustre_lnet_label }}
5 |           interfaces:
6 |               0: {{ _lustre_interface }}
7 | 


--------------------------------------------------------------------------------
/ansible/roles/mysql/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | # required:
 2 | # mysql_root_password: # TODO: make it possible to CHANGE root password
 3 | 
 4 | mysql_tag: 8.0.30
 5 | mysql_systemd_service_enabled: yes
 6 | #mysql_state: # default is started or restarted as required
 7 | mysql_podman_user: "{{ ansible_user }}"
 8 | mysql_datadir: /var/lib/mysql
 9 | mysql_mysqld_options: [] # list of str options to mysqld, see `run -it --rm mysql:tag --verbose --help`
10 | mysql_users: [] # list of dicts for community.mysql.mysql_user
11 | mysql_databases: [] # list of dicts for community.mysql.mysql_db
12 | 


--------------------------------------------------------------------------------
/ansible/roles/mysql/tasks/configure.yml:
--------------------------------------------------------------------------------
 1 | - name: Create environment file for mysql server root password
 2 |   # NB: This doesn't trigger a restart on changes as it will be ignored once mysql is initialised
 3 |   copy:
 4 |     dest: /etc/sysconfig/mysqld
 5 |     content: |
 6 |       MYSQL_INITIAL_ROOT_PASSWORD='{{ mysql_root_password }}'
 7 |     owner: root
 8 |     group: root
 9 |     mode: u=rw,go=
10 | 
11 | - name: Ensure mysql service state
12 |   systemd:
13 |     name: mysql
14 |     state: "{{ mysql_state | default('restarted' if _mysql_unitfile.changed else 'started') }}"
15 |     enabled: "{{ mysql_systemd_service_enabled }}"
16 |     daemon_reload: "{{ _mysql_unitfile.changed }}"
17 | 
18 | - block:
19 |   - name: Wait for mysql to initialise
20 |     # NB: It is not sufficent to wait_for the port
21 |     community.mysql.mysql_info:
22 |       login_user: root
23 |       login_password: "{{ mysql_root_password }}"
24 |     no_log: "{{ no_log | default(true) }}"
25 |     register: _mysql_info
26 |     until: "'version' in _mysql_info"
27 |     retries: 90
28 |     delay: 2
29 | 
30 |   - name: Ensure mysql databases created
31 |     community.mysql.mysql_db: "{{ item }}"
32 |     loop: "{{ mysql_databases}}"
33 | 
34 |   - name: Ensure mysql users present
35 |     community.mysql.mysql_user: "{{ item }}"
36 |     loop: "{{ mysql_users }}"
37 |   when: "mysql_state | default('unspecified') != 'stopped'"
38 | 


--------------------------------------------------------------------------------
/ansible/roles/mysql/tasks/install.yml:
--------------------------------------------------------------------------------
 1 | - name: Install pip
 2 |   dnf:
 3 |     name: python3-pip
 4 | 
 5 | - name: Install python mysql client
 6 |   pip:
 7 |     name:
 8 |       - pymysql
 9 |       - cryptography
10 |     state: present
11 | 
12 | - name: Create systemd mysql container unit file
13 |   template:
14 |     dest: /etc/systemd/system/mysql.service
15 |     src: mysql.service.j2
16 |   register: _mysql_unitfile
17 | 
18 | - name: Pull container image
19 |   containers.podman.podman_image:
20 |     name: docker.io/library/mysql
21 |     tag: "{{ mysql_tag }}"
22 |   become_user: "{{ mysql_podman_user }}"
23 | 


--------------------------------------------------------------------------------
/ansible/roles/mysql/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - import_tasks: install.yml
2 | - import_tasks: configure.yml
3 | 


--------------------------------------------------------------------------------
/ansible/roles/ofed/README.md:
--------------------------------------------------------------------------------
 1 | # ofed
 2 | 
 3 | This role installs Mellanox OFED:
 4 | - It checks that the running kernel is the latest installed one, and errors if not.
 5 | - Installation uses the `mlnxofedinstall` command, with support for the running kernel
 6 | and (by default) without firmware updates.
 7 | 
 8 | As OFED installation takes a long time generally this should only be used during image build,
 9 | for example by setting:
10 | 
11 | ```
12 | environments/groups/<environment>/groups:
13 | [ofed:children]
14 | builder
15 | ```
16 | 
17 | # Role variables
18 | 
19 | See `defaults/main.yml`
20 | 
21 | Note ansible facts are required, unless setting `ofed_distro_version` and `ofed_arch` specifically.
22 | 


--------------------------------------------------------------------------------
/ansible/roles/ofed/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | ofed_version: '23.10-3.2.2.0' # LTS
 2 | ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz
 3 | ofed_distro: rhel # NB: not expected to work on other distros due to installation differences
 4 | ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9'
 5 | ofed_distro_major_version: "{{ ansible_distribution_major_version }}" # e.g. '8'
 6 | ofed_arch: "{{ ansible_architecture }}"
 7 | ofed_tmp_dir: /tmp
 8 | ofed_update_firmware: false
 9 | ofed_build_packages: # may require additional packages depending on ofed_package_selection
10 |   - autoconf
11 |   - automake
12 |   - gcc
13 |   - gcc-gfortran
14 |   - kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}
15 |   - kernel-rpm-macros
16 |   - libtool
17 |   - lsof
18 |   - patch
19 |   - pciutils
20 |   - perl
21 |   - rpm-build
22 |   - tcl
23 |   - tk
24 | ofed_build_rl8_packages:
25 |   - gdb-headless
26 |   - python36
27 | ofed_package_selection: # list of package selection flags for mlnxofedinstall script
28 |   - hpc
29 |   - with-nfsrdma
30 | 


--------------------------------------------------------------------------------
/ansible/roles/ofed/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - include_tasks: install.yml
2 | 


--------------------------------------------------------------------------------
/ansible/roles/openondemand/files/missing_home_directory.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |   <title>Home Directory Not Found</title>
 5 |   <style>
 6 |     body {
 7 |       font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
 8 |       font-size: 20px;
 9 |       line-height: 1.4;
10 |       color: #333;
11 |       font-weight: 300;
12 |       padding: 15px;
13 |     }
14 |     h1 {
15 |       font-weight: 500;
16 |       font-size: 30px;
17 |     }
18 |     .btn-primary {
19 |       text-decoration: none;
20 |       font-weight: 400;
21 |       padding: 10px 16px;
22 |       border-radius: 6px;
23 |       color: #fff;
24 |       background-color: #337ab7;
25 |     }
26 |     .btn-secondary {
27 |       text-decoration: none;
28 |       font-weight: 400;
29 |       padding: 10px 16px;
30 |       border-radius: 6px;
31 |       color: #fff;
32 |       background-color: #338ab7;
33 |     }
34 |   </style>
35 | </head>
36 | <body>
37 |   <h1>Home directory not found</h1>
38 |   <p>
39 |   Your home directory appears to be missing. If this is the first time you have logged in with this account, you may
40 |   need to access our systems using SSH in order to trigger the creation of your home directory.
41 |   </p>
42 |   <ol>
43 |     <a class="btn-primary" target="_blank" href="/pun/sys/shell/ssh/default">Open Shell to create home directory</a>
44 |     </br>
45 |     </br>
46 |     <a class="btn-secondary" href="/nginx/stop?redir=/pun/sys/dashboard">Restart Web Server</a>
47 |   </ol>
48 | </body>
49 | </html>
50 | 


--------------------------------------------------------------------------------
/ansible/roles/openondemand/tasks/config_changes.yml:
--------------------------------------------------------------------------------
1 | - name: Add Apache directives for node_uri forwarding
2 |   blockinfile:
3 |     path: /opt/ood/ood-portal-generator/templates/ood-portal.conf.erb
4 |     block: "{{ openondemand_node_proxy_directives }}"
5 |     insertafter: '    Header edit  Set-Cookie "\^\(\[\^;\]\+\)" "\$1; Path=<%= @node_uri %>\/%{MATCH_HOST}e\/%{MATCH_PORT}e"'
6 |   when: openondemand_node_proxy_directives
7 | 


--------------------------------------------------------------------------------
/ansible/roles/openondemand/tasks/exporter.yml:
--------------------------------------------------------------------------------
 1 | - name: Install ondemand prometheus exporter
 2 |   yum:
 3 |     name: ondemand_exporter
 4 |   when: openondemand_exporter
 5 | 
 6 | - name: Start and enable ondemand prometheus exporter
 7 |   service:
 8 |     name: ondemand_exporter
 9 |     enabled: true
10 |     state: started
11 | 


--------------------------------------------------------------------------------
/ansible/roles/openondemand/tasks/jupyter_compute.yml:
--------------------------------------------------------------------------------
 1 | # Should be run on compute nodes you want to run jupyter notebook on
 2 | # See https://osc.github.io/ood-documentation/latest/app-development/tutorials-interactive-apps/add-jupyter/software-requirements.html
 3 | # - Will already have openssl and lmod
 4 | 
 5 | - name: Ensure python3.9 installed
 6 |   dnf:
 7 |     name: python39
 8 |   tags: install
 9 | 
10 | - name: Install jupyter venv
11 |   # Requires separate step so that the upgraded pip is used to install packages
12 |   pip:
13 |     name: pip
14 |     state: latest
15 |     virtualenv: /opt/jupyter-py39
16 |     virtualenv_command: python3.9 -m venv
17 |   tags: install
18 | 
19 | - name: Copy jupyter requirements file
20 |   copy:
21 |     src: jupyter_requirements.txt
22 |     dest: /opt/jupyter-py39/jupyter_requirements.txt
23 |   tags: install
24 | 
25 | - name: Install jupyter package in venv
26 |   pip:
27 |     virtualenv: /opt/jupyter-py39
28 |     virtualenv_command: python3.9 -m venv
29 |     requirements: /opt/jupyter-py39/jupyter_requirements.txt
30 |   tags: install
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/ansible/roles/openondemand/tasks/pam_auth.yml:
--------------------------------------------------------------------------------
 1 | # https://osc.github.io/ood-documentation/latest/authentication/pam.html
 2 | ---
 3 | - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build
 4 |   yum:
 5 |     name: mod_authnz_pam
 6 | 
 7 | - name: Enable Apache PAM module
 8 |   lineinfile:
 9 |     path: /etc/httpd/conf.modules.d/55-authnz_pam.conf
10 |     line: LoadModule authnz_pam_module modules/mod_authnz_pam.so
11 |     regexp: ^LoadModule authnz_pam_module modules/mod_authnz_pam.so
12 | 
13 | - name: Set PAM service # TODO: might need subsequent modification??
14 |   command:
15 |     cmd: cp /etc/pam.d/sshd /etc/pam.d/ood
16 |     creates: /etc/pam.d/ood
17 | 
18 | - name: Allow the Apache user to read /etc/shadow
19 |   file:
20 |     path: /etc/shadow
21 |     mode: 0640
22 |     group: apache
23 | 
24 | - name: Allow httpd access to PAM in SELinux
25 |   ansible.posix.seboolean:
26 |     name: httpd_mod_auth_pam
27 |     state: yes
28 |     persistent: yes
29 |   when: ansible_facts.selinux.status == 'enabled'
30 | 
31 | # TODO: do we need to restart OOD here??
32 | 


--------------------------------------------------------------------------------
/ansible/roles/openondemand/tasks/validate.yml:
--------------------------------------------------------------------------------
1 | - name: Check Open Ondemand servername is defined
2 |   assert:
3 |     that: openondemand_servername != ''
4 |     fail_msg: "Variable `openondemand_servername` must be set on openondemand and (by default) grafana hosts. See ansible/roles/openondemand/README.md"
5 | 


--------------------------------------------------------------------------------
/ansible/roles/openondemand/templates/dashboard_app_links.yml.j2:
--------------------------------------------------------------------------------
1 | name: "{{ item.name }}"
2 | category: "{{ item.category }}"
3 | description: "{{ item.description }}"
4 | icon: "{{ item.icon | default('fa://clock-o') }}"
5 | url: "{{ item.url }}"
6 | new_window: "{{ item.get('new_window', false) }}"
7 | 


--------------------------------------------------------------------------------
/ansible/roles/openondemand/templates/files_shortcuts.rb.j2:
--------------------------------------------------------------------------------
1 | # Template to add additional shortcuts to the Files dashboard app
2 | # See https://osc.github.io/ood-documentation/master/customization.html#add-shortcuts-to-files-menu
3 | 
4 | OodFilesApp.candidate_favorite_paths.tap do |paths|
5 | {% for path in openondemand_filesapp_paths %}
6 |   paths << Pathname.new("{{ path }}")
7 | {% endfor %}
8 | end
9 | 


--------------------------------------------------------------------------------
/ansible/roles/openondemand/templates/grid-mapfile.j2:
--------------------------------------------------------------------------------
1 | {% for user in openondemand_mapping_users %}
2 | {% if 'openondemand_username' in user %}
3 | "{{ user.openondemand_username }}" {{ user.name }}
4 | {% endif %}
5 | {% endfor %}
6 | 


--------------------------------------------------------------------------------
/ansible/roles/opensearch/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Used to set passwords
 3 | #opensearch_internal_users_path:
 4 | 
 5 | opensearch_podman_user: "{{ ansible_user }}"
 6 | opensearch_version: '2.9.0' # https://hub.docker.com/r/opensearchproject/opensearch/tags
 7 | opensearch_config_path: /usr/share/opensearch/config
 8 | opensearch_data_path: /usr/share/opensearch/data
 9 | opensearch_state: started # will be restarted if required
10 | opensearch_systemd_service_enabled: true
11 | opensearch_certs_duration: "{{ 365 * 10 }}" # days validity for self-signed certs
12 | opensearch_debug: false
13 | 


--------------------------------------------------------------------------------
/ansible/roles/opensearch/handlers/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | 
3 | - name: Restart opensearch service
4 |   systemd:
5 |     name: opensearch.service
6 |     state: "{{ 'restarted' if 'started' in opensearch_state else opensearch_state }}"
7 |     enabled: "{{ opensearch_systemd_service_enabled }}"
8 |   become: true
9 | 


--------------------------------------------------------------------------------
/ansible/roles/opensearch/tasks/archive_data.yml:
--------------------------------------------------------------------------------
 1 | # Remove data which was NOT indexed by Slurm Job ID
 2 | # It will be re-ingested by filebeat from the slurmdbd, with that index
 3 | 
 4 | - name: Ensure opensearch stopped
 5 |   systemd:
 6 |     name: opensearch
 7 |     state: stopped
 8 |   register: _opensearch_stop
 9 |   until: "_opensearch_stop.status.ActiveState in ['inactive', 'failed']"
10 |   retries: 15
11 |   delay: 5
12 | 
13 | - name: Archive existing data
14 |   community.general.archive:
15 |     path: "{{ opensearch_data_path }}"
16 |     dest: "{{ opensearch_data_path | dirname }}/data-{{ lookup('pipe', 'date --iso-8601=minutes') }}.tar.gz"
17 |     remove: true
18 | 


--------------------------------------------------------------------------------
/ansible/roles/opensearch/tasks/install.yml:
--------------------------------------------------------------------------------
 1 | # safe to use during build
 2 | 
 3 | - name: Increase maximum number of virtual memory maps
 4 |   # see https://opensearch.org/docs/2.0/opensearch/install/important-settings/
 5 |   ansible.posix.sysctl:
 6 |     name: vm.max_map_count
 7 |     value: '262144'
 8 |     state: present
 9 |     reload: yes
10 | 
11 | - name: Create systemd unit file
12 |   template:
13 |     dest: /etc/systemd/system/opensearch.service
14 |     src: opensearch.service.j2
15 |   register: _opensearch_unit
16 | 
17 | - name: Pull container image
18 |   containers.podman.podman_image:
19 |     name: docker.io/opensearchproject/opensearch
20 |     tag: "{{ opensearch_version }}"
21 |   become_user: "{{ opensearch_podman_user }}"
22 | 
23 | - name: Reload opensearch unit file
24 |   command: systemctl daemon-reload
25 |   when: _opensearch_unit.changed
26 | 


--------------------------------------------------------------------------------
/ansible/roles/opensearch/tasks/migrate-opendistro.yml:
--------------------------------------------------------------------------------
 1 | # Migrate data from existing containerised opendistro v1.12.0 to containerised opensearch 2.1.0.
 2 | #
 3 | # This relies on:
 4 | # - Both opendistro and opensearch using host directories for data. See `_default_opendistro_data_path` below
 5 | # - Pre-upgrade group `opendistro` and current group `opensearch` containing the same host.
 6 | #
 7 | # NB: If `opendistro_data_path` was set to something non-default it MUST be set again in the `opensearch` group_vars,
 8 | # as the `opendistro` group will not exist in the groups.
 9 | 
10 | # NB: This deliberately does not remove the opendistro data - this could be done manually if required.
11 | 
12 | - name: Stop opendistro
13 |   ansible.builtin.systemd:
14 |     name: opendistro.service
15 |     state: stopped
16 |     enabled: false
17 | 
18 | - name: Copy opendistro data directory
19 |   ansible.builtin.copy:
20 |     remote_src: true
21 |     src: "{{ opendistro_data_path | default(_default_opendistro_data_path) }}"
22 |     dest: "{{ opensearch_data_path | dirname }}/" # copying a directory, so need to specify the parent for destination
23 |     owner: "{{ opensearch_podman_user }}"
24 |     group: "{{ opensearch_podman_user }}"
25 |     mode: 0770
26 |   vars:
27 |     # from environments/common/inventory/group_vars/all/opendistro.yml:
28 |     _default_opendistro_data_path: "{{ appliances_state_dir | default('/usr/share') }}/elasticsearch/data"
29 | 


--------------------------------------------------------------------------------
/ansible/roles/passwords/tasks/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | 
3 | - name: Template passwords
4 |   template:
5 |     src: passwords.yml
6 |     dest: "{{ openhpc_passwords_output_path }}"
7 |   delegate_to: localhost
8 |   run_once: true
9 | 


--------------------------------------------------------------------------------
/ansible/roles/passwords/tasks/validate.yml:
--------------------------------------------------------------------------------
1 | - name: Assert secrets created
2 |   assert:
3 |     that: (hostvars[inventory_hostname].keys() | select('contains', 'vault_') | length) > 1 # 1 as may have vault_demo_user_password defined in dev
4 |     fail_msg: "No inventory variables 'vault_*' found: Has ansible/adhoc/generate-passwords.yml been run?"
5 | 


--------------------------------------------------------------------------------
/ansible/roles/passwords/templates/passwords.yml:
--------------------------------------------------------------------------------
1 | ---
2 | # {{ ansible_managed }}
3 | {{ slurm_appliance_secrets | to_nice_yaml }}


--------------------------------------------------------------------------------
/ansible/roles/persist_hostkeys/README.md:
--------------------------------------------------------------------------------
1 | # persist_hostkeys
2 | 
3 | Idempotently generates a persistent set of hostkeys and restores them after a rebuild/reimage.
4 | 
5 | Add hosts to the `persist_hostkeys` group to enable. All hosts in group will share the same set hostkeys.
6 | 


--------------------------------------------------------------------------------
/ansible/roles/persist_hostkeys/defaults/main.yml:
--------------------------------------------------------------------------------
1 | persist_hostkeys_state_server: "{{ groups['control'] | first }}"
2 | persist_hostkeys_state_dir: "{{ hostvars[persist_hostkeys_state_server]['appliances_state_dir'] }}/hostkeys"
3 | 


--------------------------------------------------------------------------------
/ansible/roles/persist_hostkeys/tasks/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Generate persistent hostkeys in state directory
 4 |   delegate_to: "{{ persist_hostkeys_state_server }}"
 5 |   block:
 6 |   - name: Ensure hostkeys directory exists on persistent storage
 7 |     file:
 8 |       path: "{{ persist_hostkeys_state_dir }}"
 9 |       state: directory
10 |       owner: root
11 |       group: root
12 |       mode: 0600
13 | 
14 |   - name: Check for existing hostkeys
15 |     find:
16 |       paths: "{{ persist_hostkeys_state_dir }}/"
17 |     register: _files_found
18 | 
19 |   - name: Generate hostkeys
20 |     when: _files_found.matched == 0
21 |     shell:
22 |     # ssh-keygen -A needs a directory with an /etc/ssh suffix to write hostkeys into
23 |       cmd: |
24 |         mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh
25 |         ssh-keygen -A -N '' -f {{ persist_hostkeys_state_dir }}
26 |         mv {{ persist_hostkeys_state_dir }}/etc/ssh/* {{ persist_hostkeys_state_dir }}
27 |         rm -rf {{ persist_hostkeys_state_dir }}/etc/ssh
28 |   
29 |   - name: Get created key names
30 |     find:
31 |       path: "{{ persist_hostkeys_state_dir }}/"
32 |     register: _find_ssh_keys
33 | 
34 |   - name: Create in-memory copies of keys
35 |     ansible.builtin.slurp:
36 |       src: "{{ item.path }}"
37 |     loop: "{{ _find_ssh_keys.files }}"
38 |     register: _slurp_keys
39 | 
40 | - name: Copy keys to hosts
41 |   no_log: true
42 |   copy:
43 |     content: "{{ item.content | b64decode }}"
44 |     dest: "/etc/ssh/{{ item.source | regex_search('[^/]+$') }}"
45 |   loop: "{{ _slurp_keys.results }}"
46 | 
47 | - meta: reset_connection
48 | 


--------------------------------------------------------------------------------
/ansible/roles/persist_openhpc_secrets/tasks/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Check if OpenHPC secrets exist in persistent storage
 4 |   stat:
 5 |     path: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact"
 6 |   register: openhpc_secrets_stat
 7 | 
 8 | - name: Ensure Ansible facts directories exist
 9 |   file:
10 |     path: "{{ item }}"
11 |     state: directory
12 |     owner: root
13 |     mode: 0600
14 |   loop:
15 |     - "{{ appliances_state_dir }}/ansible.facts.d"
16 |     - "/etc/ansible/facts.d"
17 |     
18 | - name: Write OpenHPC secrets
19 |   template:
20 |     src: openhpc_secrets.fact
21 |     dest: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact"
22 |     owner: root
23 |     mode: 0600
24 |   when: "not openhpc_secrets_stat.stat.exists"
25 | 
26 | - name: Symlink persistent facts to facts_path
27 |   file:
28 |     state: link
29 |     src: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact"
30 |     dest: /etc/ansible/facts.d/openhpc_secrets.fact
31 |     owner: root
32 |     
33 | - name: Read facts
34 |   ansible.builtin.setup:
35 |     filter: ansible_local
36 | 


--------------------------------------------------------------------------------
/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact:
--------------------------------------------------------------------------------
 1 | {
 2 |   "vault_azimuth_user_password": "{{ lookup('password', '/dev/null') }}",
 3 |   "vault_grafana_admin_password": "{{ lookup('password', '/dev/null') }}",
 4 |   "vault_elasticsearch_admin_password": "{{ lookup('password', '/dev/null') }}",
 5 |   "vault_elasticsearch_kibana_password": "{{ lookup('password', '/dev/null') }}",
 6 |   "vault_mysql_root_password": "{{ lookup('password', '/dev/null') }}",
 7 |   "vault_mysql_slurm_password": "{{ lookup('password', '/dev/null') }}",
 8 |   "vault_openhpc_mungekey": "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\s+', '') }}"
 9 | }
10 | 


--------------------------------------------------------------------------------
/ansible/roles/podman/defaults/main.yml:
--------------------------------------------------------------------------------
1 | podman_users:
2 |   - name: "{{ ansible_user }}"
3 | 


--------------------------------------------------------------------------------
/ansible/roles/podman/tasks/prereqs.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - name: Install OS packages
3 |   yum:
4 |     name:
5 |       - podman
6 |       - python3
7 |     state: installed
8 |   become: true


--------------------------------------------------------------------------------
/ansible/roles/proxy/README.md:
--------------------------------------------------------------------------------
 1 | # proxy
 2 | 
 3 | Define http/s proxy configuration.
 4 | 
 5 | ## Role variables
 6 | 
 7 | - `proxy_http_proxy`: Required. Address of http proxy. E.g. "http://10.1.0.28:3128" for a Squid proxy on default port.
 8 | - `proxy_https_proxy`: Optional. Address of https proxy. Default is `{{ proxy_http_proxy }}`.
 9 | - `proxy_no_proxy_extra`: Optional. List of additional addresses not to proxy. Will be combined with default list which includes `inventory_hostname` (for hostnames) and `ansible_host` (for host IPs) for all Ansible hosts.
10 | - `proxy_dnf`: Optional bool. Whether to configure yum/dnf proxying through `proxy_http_proxy`. Default `true`.
11 | - `proxy_systemd`: Optional bool. Whether to give processes started by systemd the above http, https and no_proxy configuration. **NB** Running services will need restarting if this is changed. Default `true`.
12 | 


--------------------------------------------------------------------------------
/ansible/roles/proxy/defaults/main.yml:
--------------------------------------------------------------------------------
1 | # proxy_http_proxy:
2 | proxy_https_proxy: "{{ proxy_http_proxy }}"
3 | proxy_no_proxy_defaults: "{{ ['localhost', '127.0.0.1'] + groups['all'] + hostvars.values() | map(attribute='ansible_host') }}"
4 | proxy_no_proxy_extras: []
5 | proxy_no_proxy: "{{ (proxy_no_proxy_defaults + proxy_no_proxy_extras) | unique | sort | join(',') }}"
6 | proxy_dnf: true
7 | proxy_systemd: true
8 | 


--------------------------------------------------------------------------------
/ansible/roles/pulp_site/.gitignore:
--------------------------------------------------------------------------------
1 | filter_plugins/__pycache__


--------------------------------------------------------------------------------
/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py:
--------------------------------------------------------------------------------
 1 | class FilterModule(object):
 2 |     def filters(self):
 3 |         return {
 4 |             'to_rpm_repos': self.to_rpm_repos,
 5 |             'to_rpm_pubs': self.to_rpm_pubs,
 6 |             'to_rpm_distros': self.to_rpm_distros
 7 |         }
 8 | 
 9 |     def to_rpm_repos(self, list, pulp_url):
10 |         repo_list = map(lambda x: {
11 |             'name': x['name'],
12 |             'url': pulp_url+'/'+x['subpath'],
13 |             'remote_username': x['remote_username'],
14 |             'remote_password': x['remote_password'],
15 |             'policy': x['policy'],
16 |             'state': x['state'] }, list)
17 |         return repo_list
18 |     
19 |     def to_rpm_pubs(self, list):
20 |         pub_list = map(lambda x: {
21 |             'repository': x['name'],
22 |             'state': x['state'] }, list)
23 |         return pub_list
24 |     
25 |     def to_rpm_distros(self, list):
26 |         distro_list = map(lambda x: {
27 |             'name': x['name'],
28 |             'repository': x['name'],
29 |             'base_path': x['subpath'],
30 |             'state': x['state'] }, list)
31 |         return distro_list


--------------------------------------------------------------------------------
/ansible/roles/pulp_site/tasks/install.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Install packages
 4 |   dnf:
 5 |     name:
 6 |     - podman
 7 | 
 8 | - name: Create install directories
 9 |   ansible.builtin.file:
10 |     state: directory
11 |     path: "{{ pulp_site_install_dir }}/{{ item }}"
12 |   loop:
13 |   - settings/certs
14 |   - pulp_storage
15 |   - pgsql
16 |   - containers
17 | 
18 | - name: Template settings file
19 |   ansible.builtin.template:
20 |     src: settings.py.j2
21 |     dest: "{{ pulp_site_install_dir }}/settings/settings.py"
22 | 
23 | - name: Install pulp podman container
24 |   containers.podman.podman_container:
25 |     name: pulp
26 |     publish:
27 |       - "{{ pulp_site_port }}:80"
28 |     volume:
29 |     - "{{ pulp_site_install_dir }}/settings:/etc/pulp{{ pulp_site_selinux_suffix }}"
30 |     - "{{ pulp_site_install_dir }}/pulp_storage:/var/lib/pulp{{ pulp_site_selinux_suffix }}"
31 |     - "{{ pulp_site_install_dir }}/pgsql:/var/lib/pgsql{{ pulp_site_selinux_suffix }}"
32 |     - "{{ pulp_site_install_dir }}/containers:/var/lib/containers{{ pulp_site_selinux_suffix }}"
33 |     device: /dev/fuse
34 |     image: docker.io/pulp/pulp:3.68.1
35 | 
36 | - name: Reset admin password once container has initialised
37 |   no_log: true
38 |   ansible.builtin.shell:
39 |     cmd: "podman exec pulp bash -c 'pulpcore-manager reset-admin-password -p {{ pulp_site_password }}'"
40 |   register: _admin_reset_output
41 |   until: 0 == _admin_reset_output.rc
42 |   retries: 6
43 |   delay: 30
44 | 


--------------------------------------------------------------------------------
/ansible/roles/pulp_site/templates/cli.toml.j2:
--------------------------------------------------------------------------------
 1 | [cli]
 2 | base_url = "{{ pulp_site_url }}"
 3 | username = "{{ pulp_site_username }}"
 4 | password = "{{ pulp_site_password }}"
 5 | api_root = "/pulp/"
 6 | domain = "default"
 7 | headers = []
 8 | cert = ""
 9 | key = ""
10 | verify_ssl = true
11 | format = "json"
12 | dry_run = false
13 | timeout = 0
14 | verbose = 0
15 | 


--------------------------------------------------------------------------------
/ansible/roles/pulp_site/templates/settings.py.j2:
--------------------------------------------------------------------------------
1 | CONTENT_ORIGIN='http://{{ ansible_fqdn }}:{{ pulp_site_port }}'
2 | TOKEN_AUTH_DISABLED=True
3 | 


--------------------------------------------------------------------------------
/ansible/roles/rebuild/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | rebuild_clouds_path: ~/.config/openstack/clouds.yaml
 4 | 
 5 | rebuild_job_partitions: rebuild
 6 | rebuild_job_name: "rebuild-{{ item }}" # item is nodename
 7 | rebuild_job_command: 'sleep 5'
 8 | rebuild_job_reboot: true
 9 | rebuild_job_options: ''
10 | rebuild_job_user: root
11 | rebuild_job_template: >-
12 |   sbatch
13 |   --nodelist={{ item }}
14 |   {{ '--reboot' if rebuild_job_reboot | bool else '' }}
15 |   --job-name={{ rebuild_job_name }}
16 |   --nodes=1
17 |   --exclusive
18 |   --partition={{ _rebuild_job_current_partition }}
19 |   --no-requeue
20 |   --output=/dev/null
21 |   --wrap="{{ rebuild_job_command }}"
22 |   {{ rebuild_job_options }}
23 | #rebuild_job_hostlist:


--------------------------------------------------------------------------------
/ansible/roles/rebuild/tasks/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Create /etc/openstack
 4 |   file:
 5 |     path: /etc/openstack
 6 |     state: directory
 7 |     owner: slurm
 8 |     group: root
 9 |     mode: u=rX,g=rwX
10 | 
11 | - name: Copy out clouds.yaml
12 |   copy:
13 |     src: "{{ rebuild_clouds_path }}"
14 |     dest: /etc/openstack/clouds.yaml
15 |     owner: slurm
16 |     group: root
17 |     mode: u=r,g=rw
18 | 
19 | - name: Setup slurm tools
20 |   include_role:
21 |     name: slurm_tools
22 | 


--------------------------------------------------------------------------------
/ansible/roles/rebuild/tasks/rebuild.yml:
--------------------------------------------------------------------------------
 1 | - name: Create rebuild jobs for partition
 2 |   include_tasks:
 3 |     file: rebuild_partition.yml
 4 |   args:
 5 |     apply:
 6 |       become: yes
 7 |       become_user: "{{ rebuild_job_user }}"
 8 |   loop: "{{ rebuild_job_partitions | split(',') }}"
 9 |   loop_control:
10 |     loop_var: _rebuild_job_current_partition
11 | 
12 | 


--------------------------------------------------------------------------------
/ansible/roles/rebuild/tasks/rebuild_partition.yml:
--------------------------------------------------------------------------------
 1 | - name: Get list of nodes in partition
 2 |   ansible.builtin.command:
 3 |     cmd: >-
 4 |       sinfo
 5 |       --Node
 6 |       --format=%N
 7 |       --noheader
 8 |       --partition={{ _rebuild_job_current_partition }}
 9 |   register: _sinfo_partition
10 |   when: rebuild_job_hostlist is not defined
11 | 
12 | - name: Expand rebuild_job_hostlist to host names
13 |   ansible.builtin.command:
14 |     cmd: "scontrol show hostnames {{ rebuild_job_hostlist }}"
15 |   register: _scontrol_hostnames
16 |   when: rebuild_job_hostlist is defined
17 | 
18 | - name: Submit rebuild jobs
19 |   ansible.builtin.command:
20 |     cmd: "{{ rebuild_job_template }}"
21 |   loop: "{{ _scontrol_hostnames.stdout_lines | default(_sinfo_partition.stdout_lines) }}"
22 | 


--------------------------------------------------------------------------------
/ansible/roles/resolv_conf/README.md:
--------------------------------------------------------------------------------
 1 | # resolv_conf
 2 | 
 3 | Template out `/etc/resolv.conf`.
 4 | 
 5 | ## Role variables
 6 | - `resolv_conf_nameservers`: List of up to 3 nameserver addresses.
 7 | 
 8 | Notes:
 9 | - `NetworkManager` (if used) will be prevented from rewriting this file on boot.
10 | - If `/etc/resolv.conf` includes `127.0.0.1` (e.g. due to a FreeIPA server installation), then `resolv_conf_nameservers` is ignored and this role does not change `/etc/resolv.conf`
11 | - For hosts in the `resolv_conf` group, the `/etc/resolv.conf` created with `resolv_conf_nameservers` will
12 |   NOT be deleted at the end of Packer image builds.
13 | 


--------------------------------------------------------------------------------
/ansible/roles/resolv_conf/defaults/main.yml:
--------------------------------------------------------------------------------
1 | resolv_conf_nameservers: []
2 | 


--------------------------------------------------------------------------------
/ansible/roles/resolv_conf/files/NetworkManager-dns-none.conf:
--------------------------------------------------------------------------------
1 | [main]
2 | dns=none
3 | 


--------------------------------------------------------------------------------
/ansible/roles/resolv_conf/tasks/main.yml:
--------------------------------------------------------------------------------
 1 | - name: Read nameservers from /etc/resolv.conf
 2 |   ansible.builtin.slurp:
 3 |     src: /etc/resolv.conf
 4 |   register: _slurp_resolv_conf
 5 | 
 6 | - name: Set nameservers in /etc/resolv.conf
 7 |   # Might need to set this for freeipa_server host, but freeipa server install
 8 |   # will then change it to point to 127.0.0.1.
 9 |   ansible.builtin.template:
10 |     src: resolv.conf.j2
11 |     dest: /etc/resolv.conf
12 |     owner: root
13 |     group: root
14 |     mode: u=rw,og=r
15 |   when: "'127.0.0.1' not in (_slurp_resolv_conf.content | b64decode)"
16 | 
17 | - name: Disable NetworkManager control of resolv.conf
18 |   ansible.builtin.copy:
19 |     src: NetworkManager-dns-none.conf
20 |     dest: /etc/NetworkManager/conf.d/90-dns-none.conf
21 |     owner: root
22 |     group: root
23 |     mode: u=rw,og=r
24 |   register: _copy_nm_config
25 | 
26 | - name: Reload NetworkManager
27 |   ansible.builtin.systemd:
28 |     name: NetworkManager
29 |     state: reloaded
30 |   when: _copy_nm_config.changed | default(false)
31 | 


--------------------------------------------------------------------------------
/ansible/roles/resolv_conf/templates/resolv.conf.j2:
--------------------------------------------------------------------------------
1 | # Created by slurm appliance ansible/roles/resolv_conf
2 | {% if cluster_domain_suffix is defined %}
3 | search {{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}
4 | {% endif %}
5 | 
6 | {% for ns in resolv_conf_nameservers[0:3] %}
7 | nameserver {{ ns }}
8 | {% endfor %}
9 | 


--------------------------------------------------------------------------------
/ansible/roles/slurm_exporter/README.md:
--------------------------------------------------------------------------------
 1 | slurm_exporter
 2 | ==============
 3 | 
 4 | Build, install and configure a Prometheus exporter for metrics about Slurm itself: https://github.com/vpenso/prometheus-slurm-exporter/
 5 | 
 6 | Requirements
 7 | ------------
 8 | 
 9 | Rocky Linux 8.5 host.
10 | 
11 | Role Variables
12 | --------------
13 | 
14 | See `defaults/main.yml`
15 | 
16 | Dependencies
17 | ------------
18 | 
19 | None.
20 | 
21 | Example Playbook
22 | ----------------
23 | 
24 |     - name: Deploy Slurm exporter
25 |       hosts: control
26 |       become: true
27 |       tags: slurm_exporter
28 |       tasks:
29 |         - import_role:
30 |             name: slurm_exporter
31 | 
32 | Prometheus scrape configuration for this might look like:
33 | 
34 | ```
35 | - job_name: "slurm_exporter"
36 |   scrape_interval: 30s
37 |   scrape_timeout: 30s
38 |   static_configs:
39 |     - targets:
40 |       - "{{ openhpc_slurm_control_host }}:9341"
41 | ```
42 | 
43 | License
44 | -------
45 | 
46 | Apache v2
47 | 
48 | Author Information
49 | ------------------
50 | 
51 | StackHPC Ltd.
52 | 


--------------------------------------------------------------------------------
/ansible/roles/slurm_exporter/defaults/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | # see https://github.com/stackhpc/prometheus-slurm-exporter/releases - version follows upstream, release is stackhpc build
3 | slurm_exporter_version: '0.21'
4 | slurm_exporter_release: '1'
5 | slurm_exporter_state: started
6 | 


--------------------------------------------------------------------------------
/ansible/roles/slurm_exporter/handlers/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - name: Restart slurm exporter
 3 |   become: true
 4 |   systemd:
 5 |     daemon_reload: true
 6 |     name: prometheus-slurm-exporter
 7 |     state: restarted
 8 |   when:
 9 |     - not ansible_check_mode
10 |     - slurm_exporter_state != 'stopped'
11 | 


--------------------------------------------------------------------------------
/ansible/roles/slurm_exporter/tasks/install.yml:
--------------------------------------------------------------------------------
 1 | - name: Install slurm_exporter package
 2 |   dnf:
 3 |     name: "https://github.com/stackhpc/prometheus-slurm-exporter/releases/download/{{ slurm_exporter_version }}/prometheus-slurm-exporter-{{ slurm_exporter_version }}-{{slurm_exporter_release}}.el8.x86_64.rpm"
 4 |     disable_gpg_check: yes
 5 |   notify: Restart slurm exporter
 6 | 
 7 | - meta: flush_handlers
 8 | 
 9 | - name: Ensure slurm exporter state
10 |   systemd:
11 |     name: prometheus-slurm-exporter
12 |     state: "{{ slurm_exporter_state }}"
13 |     enabled: true
14 |   when:
15 |     - not ansible_check_mode
16 | 


--------------------------------------------------------------------------------
/ansible/roles/slurm_exporter/tasks/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - import_tasks: install.yml
3 | 


--------------------------------------------------------------------------------
/ansible/roles/slurm_stats/README.md:
--------------------------------------------------------------------------------
 1 | stackhpc.slurm_openstack_tools.slurm-stats
 2 | ==========================================
 3 | 
 4 | Configures slurm-stats from https://github.com/stackhpc/slurm-openstack-tools.git which
 5 | transforms sacct output into a form that is more amenable for importing into elasticsearch/loki.
 6 | 
 7 | Requirements
 8 | ------------
 9 | 
10 | Role Variables
11 | --------------
12 | 
13 | See `defaults/main.yml`.
14 | 
15 | Dependencies
16 | ------------
17 | 
18 | Example Playbook
19 | ----------------
20 | 
21 |     - hosts: compute
22 |       tasks:
23 |         - import_role:
24 |             name: slurm_stats
25 | 
26 | 
27 | License
28 | -------
29 | 
30 | Apache-2.0
31 | 
32 | Author Information
33 | ------------------
34 | 


--------------------------------------------------------------------------------
/ansible/roles/slurm_stats/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | ####################
 3 | # log rotate options
 4 | ####################
 5 | 
 6 | # These options affect the contents of the log-rotate file.
 7 | # See: man logrotate
 8 | 
 9 | # Log files are rotated count times before being removed
10 | slurm_stats_log_rotate_content_rotate: 7
11 | 
12 | # How frequently are the log files rotated. Can be one of daily, monthly, ...
13 | slurm_stats_log_rotate_content_frequency: daily
14 | 


--------------------------------------------------------------------------------
/ansible/roles/slurm_stats/tasks/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: Setup slurm tools
 4 |   include_role:
 5 |     name: slurm_tools
 6 | 
 7 | - name: Create a directory to house the log files
 8 |   file:
 9 |     state: directory
10 |     path: /var/log/slurm-stats
11 |   become: true
12 | 
13 | - name: Create cron job
14 |   cron:
15 |     name: Generate slurm stats
16 |     minute: "*/5"
17 |     user: root
18 |     # NOTE: lasttimestamp is stored at /root/lasttimestamp
19 |     job: "TZ=UTC /opt/slurm-tools/bin/slurm-stats >> /var/log/slurm-stats/finished_jobs.json"
20 |     cron_file: slurm-stats
21 |   become: true
22 | 
23 | - name: Setup log rotate
24 |   copy:
25 |     content: |
26 |       # WARNING: This file is managed by ansible, do not modify.
27 |       /var/log/slurm-stats/finished_jobs.json {
28 |               {{ slurm_stats_log_rotate_content_frequency }}
29 |               rotate {{ slurm_stats_log_rotate_content_rotate }}
30 |               compress
31 |               delaycompress
32 |       }
33 |     dest: /etc/logrotate.d/slurm-stats
34 |   become: true
35 | 


--------------------------------------------------------------------------------
/ansible/roles/slurm_tools/README.md:
--------------------------------------------------------------------------------
 1 | slurm_tools
 2 | =========
 3 | 
 4 | Install python-based tools from https://github.com/stackhpc/slurm-openstack-tools.git into `/opt/slurm-tools/bin/`.
 5 | 
 6 | Role Variables
 7 | --------------
 8 | 
 9 | - `pytools_editable`: Optional bool. Whether to install the package using `pip`'s
10 |   editable mode (installing source to `/opt/slurm-tools/src`). Default `false`.
11 | - `pytools_gitref`: Optional. Git branch/tag/commit etc to install. Default `master`.
12 | - `pytools_user`: Optional user to install as. Default `root`.
13 | 


--------------------------------------------------------------------------------
/ansible/roles/slurm_tools/defaults/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | pytools_editable: false
3 | pytools_gitref: v2.0
4 | pytools_user: root
5 | 


--------------------------------------------------------------------------------
/ansible/roles/slurm_tools/tasks/main.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - name: install python3
 3 |   package:
 4 |     name: python3,git
 5 |   become: true
 6 | 
 7 | - name: Create virtualenv directory
 8 |   file:
 9 |     path: /opt/slurm-tools
10 |     owner: "{{ pytools_user }}"
11 |     group: "{{ pytools_user }}"
12 |     state: directory
13 |   become: true
14 | 
15 | - block:
16 |   - name: Upgrade pip
17 |     # This needs to a separate step so that we use the updated version
18 |     # to install the packages below.
19 |     pip:
20 |       name: pip
21 | 
22 |   - name: Create virtualenv
23 |     pip:
24 |       name: "git+https://github.com/stackhpc/slurm-openstack-tools.git@{{ pytools_gitref }}#egg=slurm_openstack_tools"
25 |       editable: "{{ pytools_editable }}"
26 | 
27 |   module_defaults:
28 |     ansible.builtin.pip:
29 |       virtualenv: /opt/slurm-tools
30 |       virtualenv_command: "{{ 'python3.9 -m venv' if ansible_distribution_major_version == '8' else 'python3 -m venv' }}"
31 |       state: latest
32 |   become: true
33 |   become_user: "{{ pytools_user }}"
34 | 


--------------------------------------------------------------------------------
/ansible/roles/squid/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | squid_conf_template: squid.conf.j2
 2 | squid_started: true
 3 | squid_enabled: true
 4 | 
 5 | squid_cache_mem: "{{ undef(hint='squid_cache_mem required, e.g. \"12 GB\"')  }}"
 6 | squid_cache_dir: /var/spool/squid
 7 | squid_cache_disk: "{{ undef(hint='squid_cache_disk (in MB) required, e.g. \"1024\"')  }}" # always in MB
 8 | squid_maximum_object_size_in_memory: '64 MB'
 9 | squid_maximum_object_size: '200 MB'
10 | squid_http_port: 3128
11 | squid_acls: acl anywhere src all # rely on openstack security groups
12 | squid_http_access: |
13 |   # Deny requests to certain unsafe ports
14 |   http_access deny !Safe_ports
15 |   # Deny CONNECT to other than secure SSL ports
16 |   http_access deny CONNECT !SSL_ports
17 |   # Only allow cachemgr access from localhost
18 |   http_access allow localhost manager
19 |   http_access deny manager
20 |   # Rules allowing http access
21 |   http_access allow anywhere
22 |   http_access allow localhost
23 |   # Finally deny all other access to this proxy
24 |   http_access deny all
25 | 


--------------------------------------------------------------------------------
/ansible/roles/squid/handlers/main.yml:
--------------------------------------------------------------------------------
1 | - name: Restart squid
2 |   service:
3 |     name: squid
4 |     state: restarted
5 |   when: squid_started | bool
6 | 


--------------------------------------------------------------------------------
/ansible/roles/squid/tasks/configure.yml:
--------------------------------------------------------------------------------
 1 | - name: Ensure squid cache directory exists
 2 |   file:
 3 |     path: "{{ squid_cache_dir }}"
 4 |     # based on what dnf package creates:
 5 |     owner: squid
 6 |     group: squid
 7 |     mode: u=rwx,g=rw,o=
 8 | 
 9 | - name: Template squid configuration
10 |   template:
11 |     src: "{{ squid_conf_template }}"
12 |     dest: /etc/squid/squid.conf
13 |     owner: squid
14 |     group: squid
15 |     mode: ug=rwX,go=
16 |   notify: Restart squid
17 | 
18 | - meta: flush_handlers
19 | 
20 | - name: Ensure squid service state
21 |   systemd:
22 |     name: squid
23 |     state: "{{ 'started' if squid_started | bool else 'stopped' }}"
24 |     enabled: "{{ true if squid_enabled else false }}"
25 | 


--------------------------------------------------------------------------------
/ansible/roles/squid/tasks/install.yml:
--------------------------------------------------------------------------------
1 | - name: Install squid package
2 |   dnf:
3 |     name: squid
4 | 


--------------------------------------------------------------------------------
/ansible/roles/squid/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - import_tasks: install.yml
2 | - import_tasks: configure.yml
3 | 


--------------------------------------------------------------------------------
/ansible/roles/sshd/README.md:
--------------------------------------------------------------------------------
 1 | # sshd
 2 | 
 3 | Configure sshd.
 4 | 
 5 | ## Role variables
 6 | 
 7 | - `sshd_password_authentication`: Optional bool. Whether to enable password login. Default `false`.
 8 | - `sshd_disable_forwarding`: Optional bool. Whether to disable all forwarding features (X11, ssh-agent, TCP and StreamLocal). Default `true`.
 9 | - `sshd_conf_src`: Optional string. Path to sshd configuration template. Default is in-role template.
10 | - `sshd_conf_dest`: Optional string. Path to destination for sshd configuration file. Default is `/etc/ssh/sshd_config.d/10-ansible.conf` which overrides `50-{cloud-init,redhat}` files, if present.
11 | 


--------------------------------------------------------------------------------
/ansible/roles/sshd/defaults/main.yml:
--------------------------------------------------------------------------------
1 | sshd_password_authentication: false
2 | sshd_disable_forwarding: true
3 | sshd_conf_src: sshd.conf.j2
4 | sshd_conf_dest: /etc/ssh/sshd_config.d/10-ansible.conf
5 | 


--------------------------------------------------------------------------------
/ansible/roles/sshd/handlers/main.yml:
--------------------------------------------------------------------------------
1 | - name: Restart sshd
2 |   systemd:
3 |     name: sshd
4 |     state: restarted
5 | 


--------------------------------------------------------------------------------
/ansible/roles/sshd/tasks/configure.yml:
--------------------------------------------------------------------------------
 1 | - name: Grab facts to determine distribution
 2 |   setup:
 3 | 
 4 | - name: Ensure drop in directory exists
 5 |   file:
 6 |     path: /etc/ssh/sshd_config.d/
 7 |     state: directory
 8 |     owner: root
 9 |     group: root
10 |     mode: 700
11 |   become: true
12 | 
13 | - name: Ensure drop in configuration is included
14 |   blockinfile:
15 |     dest: /etc/ssh/sshd_config
16 |     content: |
17 |       # To modify the system-wide sshd configuration, create <custom>.conf
18 |       # files under /etc/ssh/sshd_config.d/ which will be automatically
19 |       # included below.
20 |       Include /etc/ssh/sshd_config.d/*.conf
21 |     state: present
22 |     insertafter: "# default value."
23 |     validate: sshd -t -f %s
24 |   notify:
25 |     - Restart sshd
26 |   become: true
27 |   when: ansible_facts.distribution_major_version == '8'
28 | 
29 | - name: Template sshd configuration
30 |   # NB: If parameters are defined multiple times the first value wins;
31 |   # The default /etc/ssh/sshd_config has
32 |   #   Include /etc/ssh/sshd_config.d/*.conf
33 |   # early on, which is generally held to be the correct approach, so adding
34 |   # values to the end of that file won't work
35 |   template:
36 |     src: "{{ sshd_conf_src }}"
37 |     dest: "{{ sshd_conf_dest }}"
38 |     owner: root
39 |     group: root
40 |     mode: u=rw,go=
41 |     validate: sshd -t -f %s
42 |   notify:
43 |     - Restart sshd
44 | 


--------------------------------------------------------------------------------
/ansible/roles/sshd/tasks/export.yml:
--------------------------------------------------------------------------------
 1 | # Exclusively used for compute-init
 2 | - name: Inject host specific config template
 3 |   template:
 4 |     src: "{{ sshd_conf_src }}"
 5 |     dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/sshd.conf"
 6 |     owner: root
 7 |     group: root
 8 |     mode: u=rw,go=
 9 |   delegate_to: "{{ groups['control'] | first }}"
10 | 


--------------------------------------------------------------------------------
/ansible/roles/sshd/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - import_tasks: configure.yml
2 | 


--------------------------------------------------------------------------------
/ansible/roles/sshd/templates/sshd.conf.j2:
--------------------------------------------------------------------------------
1 | # {{ ansible_managed }}
2 | PasswordAuthentication {{ 'yes' if sshd_password_authentication | bool else 'no' }}
3 | DisableForwarding {{ 'yes' if sshd_disable_forwarding | bool else 'no' }}
4 | 


--------------------------------------------------------------------------------
/ansible/roles/sssd/README.md:
--------------------------------------------------------------------------------
 1 | # sssd
 2 | 
 3 | Install and configure [sssd](https://sssd.io/docs/introduction.html).
 4 | 
 5 | 
 6 | ## Role variables
 7 | 
 8 | The only required configuration is to create a [sssd.conf](https://www.mankier.com/5/sssd.conf) template at the location specified by `sssd_conf_src`.
 9 | 
10 | - `sssd_packages`: Optional list. Packages to install.
11 | - `sssd_install_ldap`: Optional bool. Whether to install packages enabling SSSD to authenticate against LDAP. Default `false`.
12 | - `sssd_ldap_packages`: Optional list. Packages to install when using `sssd_install_ldap`.
13 | - `sssd_enable_mkhomedir`: Optional bool. Whether to enable creation of home directories on login. Default `false`.
14 | - `sssd_mkhomedir_packages`: Optional list. Packages to install when using `sssd_enable_mkhomedir`.
15 | - `sssd_conf_src`: Optional string. Path to `sssd.conf` template. Default (which must be created) is `{{ appliances_environment_root }}/files/sssd.conf.j2`.
16 | - `sssd_conf_dest`: Optional string. Path to destination for `sssd.conf`. Default `/etc/sssd/sssd.conf`.
17 | - `sssd_started`: Optional bool. Whether `sssd` service should be started.
18 | - `sssd_enabled`: Optional bool. Whether `sssd` service should be enabled.
19 | 


--------------------------------------------------------------------------------
/ansible/roles/sssd/defaults/main.yml:
--------------------------------------------------------------------------------
 1 | sssd_packages:
 2 |   - sssd-common
 3 | sssd_install_ldap: false
 4 | sssd_ldap_packages:
 5 |   - sssd-ldap
 6 | sssd_enable_mkhomedir: false
 7 | sssd_mkhomedir_packages:
 8 |   - oddjob-mkhomedir
 9 | sssd_conf_src: "{{ appliances_environment_root }}/files/sssd.conf.j2"
10 | sssd_conf_dest: /etc/sssd/sssd.conf
11 | sssd_started: true
12 | sssd_enabled: true
13 | 


--------------------------------------------------------------------------------
/ansible/roles/sssd/handlers/main.yml:
--------------------------------------------------------------------------------
1 | - name: Restart sssd
2 |   systemd:
3 |     name: sssd
4 |     state: restarted
5 |   when: sssd_started | bool
6 | 


--------------------------------------------------------------------------------
/ansible/roles/sssd/tasks/configure.yml:
--------------------------------------------------------------------------------
 1 | - name: Manage sssd.conf configuration
 2 |   template:
 3 |     src: "{{ sssd_conf_src }}"
 4 |     dest: "{{ sssd_conf_dest }}"
 5 |     owner: root
 6 |     group: root
 7 |     mode: u=rw,go=
 8 |   notify: "Restart sssd"
 9 | 
10 | - meta: flush_handlers
11 | 
12 | - name: Ensure sssd service state
13 |   systemd:
14 |     name: sssd
15 |     state: "{{ 'started' if sssd_started | bool else 'stopped' }}"
16 |     enabled: "{{ sssd_enabled | bool }}"
17 | 
18 | - name: Get current authselect configuration
19 |   command: authselect current --raw
20 |   changed_when: false
21 |   failed_when:
22 |     - _authselect_current.rc != 0
23 |     - "'No existing configuration detected' not in _authselect_current.stdout"
24 |   register: _authselect_current # stdout: sssd with-mkhomedir
25 | 
26 | - name: Configure nsswitch and PAM for SSSD
27 |   command: "authselect select sssd --force{% if sssd_enable_mkhomedir | bool %} with-mkhomedir{% endif %}"
28 |   when: "'sssd' not in _authselect_current.stdout"
29 | 
30 | - name: "Ensure oddjob is started"
31 |   service:
32 |     name: oddjobd
33 |     state: 'started'
34 |     enabled: true
35 |   when: sssd_enable_mkhomedir | bool


--------------------------------------------------------------------------------
/ansible/roles/sssd/tasks/export.yml:
--------------------------------------------------------------------------------
1 | # Exclusively used for compute-init
2 | - name: Inject host specific config template
3 |   template:
4 |     src: "{{ sssd_conf_src }}"
5 |     dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/sssd.conf"
6 |     owner: root
7 |     group: root
8 |     mode: u=rw,go=
9 |   delegate_to: "{{ groups['control'] | first }}"


--------------------------------------------------------------------------------
/ansible/roles/sssd/tasks/install.yml:
--------------------------------------------------------------------------------
 1 | - name: Ensure sssd packages are installed
 2 |   dnf:
 3 |     name: "{{ sssd_packages + sssd_ldap_packages if (sssd_install_ldap | bool) else [] }}"
 4 | 
 5 | - name: Control if sssd should start on boot
 6 |   # Needs to be done here to prevent starting after image build, is enabled by default
 7 |   systemd:
 8 |     name: sssd
 9 |     enabled: "{{ sssd_enabled | bool }}"
10 | 
11 | - name: Ensure mkhomedir packages are installed if required
12 |   dnf:
13 |     name: "{{ sssd_mkhomedir_packages }}"
14 | 


--------------------------------------------------------------------------------
/ansible/roles/sssd/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - import_tasks: install.yml
2 | - import_tasks: configure.yml
3 | 


--------------------------------------------------------------------------------
/ansible/roles/systemd/README.md:
--------------------------------------------------------------------------------
 1 | # systemd
 2 | 
 3 | Create drop-in files for systemd services.
 4 | 
 5 | # Role Variables
 6 | - `systemd_dropins`: Required. A mapping where keys = systemd service name, values are a dict as follows:
 7 |     - `group`: Required str. Inventory group this drop-in applies to.
 8 |     - `comment`: Optional str. Comment describing reason for drop-in.
 9 |     - `content`: Required str. Content of drop-in file.
10 | # systemd
11 | 
12 | Create drop-in files for systemd services.
13 | 
14 | # Role Variables
15 | - `systemd_dropins`: Required. A mapping where keys = systemd service name, values are a dict as follows:
16 |     - `group`: Required str. Inventory group this drop-in applies to.
17 |     - `comment`: Optional str. Comment describing reason for drop-in.
18 |     - `content`: Required str. Content of drop-in file.
19 | - `systemd_restart`: Optional bool. Whether to reload unit definitions and restart services. Default `false`.
20 | 


--------------------------------------------------------------------------------
/ansible/roles/systemd/defaults/main.yml:
--------------------------------------------------------------------------------
1 | #systemd_dropins:
2 | #   <unit_name>:
3 | #     group: <required>
4 | #     comment: <optional>
5 | #     content: <required>
6 | 
7 | systemd_restart: false
8 | 


--------------------------------------------------------------------------------
/ansible/roles/systemd/tasks/main.yml:
--------------------------------------------------------------------------------
 1 | # NB: As `systemd_TODO:` is defined in group_vars/all, all tasks here are conditional on group.
 2 | - name: Make directory for unit dropins
 3 |   file:
 4 |     path: "/etc/systemd/system/{{ item.key }}.service.d/"
 5 |     state: directory
 6 |     owner: root
 7 |     group: root
 8 |     mode: 0644
 9 |   loop: "{{ systemd_dropins | dict2items }}"
10 |   when: "item.value.group in group_names"
11 | 
12 | - name: Add dropins for unit files
13 |   ansible.builtin.copy:
14 |     content: |
15 |       # {{ item.value.comment | default('slurm appliance generated') }}
16 |       {{ item.value.content }}
17 |     dest: "/etc/systemd/system/{{ item.key }}.service.d/slurm_app.conf"
18 |     owner: root
19 |     group: root
20 |     mode: 0644
21 |   loop: "{{ systemd_dropins | dict2items }}"
22 |   register: _systemd_dropins
23 |   when: "item.value.group in group_names"
24 | 
25 | - name: Reload unit definitions
26 |   ansible.builtin.shell:
27 |     cmd: systemctl daemon-reload
28 |   when:
29 |     - _systemd_dropins.changed
30 |     - systemd_restart | default(false) | bool
31 | 
32 | - name: Reload units
33 |   ansible.builtin.systemd:
34 |     name: "{{ item.key }}"
35 |     state: restarted
36 |   loop: "{{ systemd_dropins | dict2items }}"
37 |   when:
38 |     - _systemd_dropins.changed
39 |     - "item.value.group in group_names"
40 |     - systemd_restart | default(false) | bool
41 | 


--------------------------------------------------------------------------------
/ansible/roles/tuned/README.md:
--------------------------------------------------------------------------------
 1 | tuned
 2 | =========
 3 | 
 4 | This role configures the TuneD tool for system tuning, ensuring optimal performance based on the profile settings defined.
 5 | 
 6 | Role Variables
 7 | --------------
 8 | 
 9 | See the [TuneD documentation](https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9/html/monitoring_and_managing_system_status_and_performance/getting-started-with-tuned_monitoring-and-managing-system-status-and-performance) for profile details.
10 | 
11 | 
12 | - `tuned_profile_baremetal`: Optional str. Name of default profile for non-virtualised hosts. Default `hpc-compute`.
13 | - `tuned_profile_vm`: Optional str. Name of default profile for virtualised hosts. Default `virtual-guest`.
14 | - `tuned_profile`: Optional str. Name of profile to apply to host. Defaults to `tuned_profile_baremetal` or `tuned_profile_vm` as appropriate.
15 | 


--------------------------------------------------------------------------------
/ansible/roles/tuned/defaults/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | # defaults file for tuned
3 | tuned_profile_baremetal: hpc-compute
4 | tuned_profile_vm: virtual-guest
5 | tuned_profile: "{{ tuned_profile_baremetal if ansible_virtualization_role != 'guest' else tuned_profile_vm }}"
6 | tuned_enabled: true
7 | tuned_started: true 
8 | 


--------------------------------------------------------------------------------
/ansible/roles/tuned/tasks/configure.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - name: Enable and start TuneD
 3 |   ansible.builtin.systemd:
 4 |     name: tuned
 5 |     enabled: "{{ tuned_enabled | bool }}"
 6 |     state: "{{ 'started' if tuned_started | bool else 'stopped' }}"
 7 | 
 8 | - name: Check TuneD profile
 9 |   ansible.builtin.command:
10 |     cmd: tuned-adm active
11 |   when: tuned_started
12 |   register: _tuned_profile_current
13 |   changed_when: false
14 | 
15 | - name: Set tuned-adm profile
16 |   ansible.builtin.command:
17 |     cmd: "tuned-adm profile {{ tuned_profile }}"
18 |   when: 
19 |     - tuned_started | bool
20 |     - tuned_profile not in _tuned_profile_current.stdout
21 | 


--------------------------------------------------------------------------------
/ansible/roles/tuned/tasks/install.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - name: Install tuneD
3 |   ansible.builtin.dnf:
4 |     name: tuned
5 |     state: present


--------------------------------------------------------------------------------
/ansible/roles/tuned/tasks/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - import_tasks: install.yml
3 | - import_tasks: configure.yml


--------------------------------------------------------------------------------
/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | #####
 4 | # Small script that can be used to attach to the infra container of a pod
 5 | #
 6 | # Useful in a systemd service that starts a pod in order to track the execution
 7 | #
 8 | # Accepts a single argument which is the name of the pod whose infra container we should attach to
 9 | #####
10 | 
11 | set -e
12 | 
13 | echo "[INFO] Finding infra container for pod '$1'"
14 | INFRA_CONTAINER_ID="$(podman pod inspect --format '{{.InfraContainerID}}' "$1")"
15 | 
16 | echo "[INFO] Attaching to infra container '${INFRA_CONTAINER_ID}'"
17 | exec podman container attach --no-stdin ${INFRA_CONTAINER_ID}
18 | 


--------------------------------------------------------------------------------
/ansible/roles/zenith_proxy/templates/client.service.j2:
--------------------------------------------------------------------------------
 1 | [Unit]
 2 | Description=Podman {{ zenith_proxy_client_service_name }}.service
 3 | Wants=network.target
 4 | After=network-online.target
 5 | BindsTo={{ zenith_proxy_service_name }}.service
 6 | PartOf={{ zenith_proxy_service_name }}.service
 7 | After={{ zenith_proxy_service_name }}.service
 8 | {% if zenith_proxy_mitm_enabled %}
 9 | Wants={{ zenith_proxy_mitm_service_name }}.service
10 | After={{ zenith_proxy_mitm_service_name }}.service
11 | {% endif %}
12 | 
13 | [Service]
14 | Environment=PODMAN_SYSTEMD_UNIT=%n
15 | Type=simple
16 | Restart=always
17 | RestartSec=5
18 | User={{ zenith_proxy_podman_user }}
19 | Group={{ zenith_proxy_podman_user }}
20 | ExecStart=/usr/bin/podman run \
21 |   --cgroups=no-conmon \
22 |   --replace \
23 |   --restart=no \
24 |   --pod {{ zenith_proxy_pod_name }} \
25 |   --name {{ zenith_proxy_client_container_name }} \
26 |   --security-opt label=disable \
27 |   --volume /etc/zenith/{{ zenith_proxy_service_name }}:/etc/zenith:ro \
28 |   --volume {{ appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh:/home/zenith/.ssh \
29 |   {{ zenith_proxy_client_image }}
30 | ExecStop=/usr/bin/podman stop --ignore -t 10 {{ zenith_proxy_client_container_name }}
31 | ExecStopPost=/usr/bin/podman rm --ignore -f {{ zenith_proxy_client_container_name }}
32 | 
33 | [Install]
34 | WantedBy=multi-user.target default.target
35 | 


--------------------------------------------------------------------------------
/ansible/roles/zenith_proxy/templates/pod.service.j2:
--------------------------------------------------------------------------------
 1 | [Unit]
 2 | Description=Podman {{ zenith_proxy_service_name }}.service
 3 | Wants=network.target
 4 | After=network-online.target
 5 | 
 6 | [Service]
 7 | Environment=PODMAN_SYSTEMD_UNIT=%n
 8 | Type=simple
 9 | Restart=always
10 | User={{ zenith_proxy_podman_user }}
11 | Group={{ zenith_proxy_podman_user }}
12 | ExecStartPre=/usr/bin/podman pod create --replace --name {{ zenith_proxy_pod_name }} --network=slirp4netns
13 | ExecStartPre=/usr/bin/podman pod start {{ zenith_proxy_pod_name }}
14 | ExecStart=/usr/bin/podman-pod-infra-attach.sh {{ zenith_proxy_pod_name }}
15 | ExecStop=/usr/bin/podman pod stop --ignore -t 10 {{ zenith_proxy_pod_name }}
16 | ExecStopPost=/usr/bin/podman pod rm --ignore -f {{ zenith_proxy_pod_name }}
17 | 
18 | [Install]
19 | WantedBy=multi-user.target default.target
20 | 


--------------------------------------------------------------------------------
/ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2:
--------------------------------------------------------------------------------
 1 | ssh_identity_path: /home/zenith/.ssh/id_zenith
 2 | 
 3 | # Init options
 4 | registrar_url: {{ zenith_registrar_url }}
 5 | token: {{ zenith_proxy_client_token }}
 6 | verify_ssl: {{ 'yes' if zenith_registrar_verify_ssl else 'no' }}
 7 | 
 8 | # Connect options
 9 | server_address: {{ zenith_sshd_host }}
10 | server_port: {{ zenith_sshd_port }}
11 | {% if zenith_proxy_mitm_enabled %}
12 | backend_protocol: http
13 | forward_to_host: 127.0.0.1
14 | forward_to_port: {{ zenith_proxy_mitm_listen_port }}
15 | {% else %}
16 | backend_protocol: {{ zenith_proxy_upstream_scheme }}
17 | forward_to_host: {{ zenith_proxy_upstream_host }}
18 | forward_to_port: {{ zenith_proxy_upstream_port }}
19 | {% endif %}
20 | {% if zenith_proxy_upstream_read_timeout %}
21 | read_timeout: {{ zenith_proxy_upstream_read_timeout }}
22 | {% endif %}
23 | skip_auth: {{ 'yes' if zenith_proxy_client_auth_skip else 'no' }}
24 | {% if zenith_proxy_client_auth_params %}
25 | auth_params:
26 |   {{ zenith_proxy_client_auth_params | to_nice_yaml | indent(2) }}
27 | {% endif %}
28 | 


--------------------------------------------------------------------------------
/dev/image-share.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Share images from one project to another
 3 | #
 4 | # usage:
 5 | #   share-images SOURCE_PROJECT DEST_PROJECT IMAGE_NAME
 6 | #
 7 | # NB: This requires a clouds.yaml file which uses project names as cloud keys
 8 | 
 9 | set -euo pipefail
10 | 
11 | SOURCE=$1
12 | DEST=$2
13 | IMAGE_NAME=$3
14 | 
15 | export OS_CLOUD=$SOURCE
16 | SOURCE_PROJECT=$(openstack project show -c id -f value $SOURCE)
17 | export OS_CLOUD=$DEST
18 | DEST_PROJECT=$(openstack project show -c id -f value $DEST)
19 | export OS_CLOUD=$SOURCE
20 | IMAGE=$(openstack image show -c id -f value $IMAGE_NAME)
21 | 
22 | echo "Sharing $IMAGE_NAME ($IMAGE) from $SOURCE ($SOURCE_PROJECT) ..."
23 | openstack image set --shared $IMAGE
24 | echo "Adding destination project $DEST ($DEST_PROJECT) ..."
25 | openstack image add project $IMAGE $DEST_PROJECT
26 | 
27 | export OS_CLOUD=$DEST
28 | echo "Accepting share ..."
29 | openstack image set --accept $IMAGE
30 | echo "Done"
31 | 


--------------------------------------------------------------------------------
/dev/output_manifest.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Set github workflow output parameters defining image IDs from a packer manifest.
 3 | # Usage:
 4 | #   ./packer/read_manifest.py packer/packer-manifest.json
 5 | 
 6 | # E.g. assuming the default packer builds this will produce something like:
 7 | #   ::set-output name=NEW_COMPUTE_IMAGE_ID::9aabd73d-e550-4116-a90c-700478b722ce
 8 | #   ::set-output name=NEW_LOGIN_IMAGE_ID::87b41d58-d7e3-4c38-be05-453c3287ecab
 9 | #   ::set-output name=NEW_CONTROL_IMAGE_ID::7f812168-73fe-4a60-b9e9-9109a405390d
10 | # which can be used in subsequent workflow steps: [1]
11 | #
12 | # [1]: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-setting-a-value
13 | 
14 | import sys, json
15 | output = {}
16 | with open(sys.argv[1]) as f:
17 |     data = json.load(f)
18 | for build in data['builds']:
19 |     node_type = build['custom_data']['source']
20 |     image_id = build['artifact_id']
21 |     output[node_type] = image_id # NB: this deliberately gets the LAST build for a node type
22 | for node_type, image_id in output.items():
23 |     print('::set-output name=NEW_%s_IMAGE_ID::%s' % (node_type.upper(), image_id))
24 | 


--------------------------------------------------------------------------------
/dev/setup-env.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -euo pipefail
 4 | 
 5 | PYTHON_VERSION=${PYTHON_VERSION:-}
 6 | 
 7 | if [[ "$PYTHON_VERSION" == "" ]]; then
 8 |     if [[ -f /etc/os-release ]]; then
 9 |         . /etc/os-release
10 |         OS=$ID
11 |         OS_VERSION=$VERSION_ID
12 |     else
13 |         exit 1
14 |     fi
15 | 
16 |     MAJOR_VERSION=$(echo $OS_VERSION | cut -d. -f1)
17 | 
18 |     if [[ "$OS" == "ubuntu" && "$MAJOR_VERSION" == "22" ]]; then
19 |         PYTHON_VERSION="/usr/bin/python3.10"
20 |     elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "8" ]]; then
21 |         # python3.9+ doesn't have selinux bindings
22 |         PYTHON_VERSION="/usr/bin/python3.8" # use `sudo yum install python38` on Rocky Linux 8 to install this
23 |     elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "9" ]]; then
24 |         PYTHON_VERSION="/usr/bin/python3.9"
25 |     else
26 |         echo "Unsupported OS version: $OS $MAJOR_VERSION"
27 |         exit 1
28 |     fi
29 | fi
30 | 
31 | if [[ ! -d "venv" ]]; then
32 |     $PYTHON_VERSION -m venv venv
33 | fi
34 | 
35 | . venv/bin/activate
36 | pip install -U pip
37 | pip install -r requirements.txt
38 | ansible --version
39 | # Install or update ansible dependencies ...
40 | ansible-galaxy role install -fr requirements.yml -p ansible/roles
41 | ansible-galaxy collection install -fr requirements.yml -p ansible/collections
42 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # StackHPC Slurm Appliance Documentation
 2 | 
 3 | ### Operator docs
 4 | 
 5 | [Image build](image-build.md)
 6 | 
 7 | [CI](ci.md)
 8 | 
 9 | [Monitoring and logging](monitoring-and-logging.md)
10 | 
11 | [Operations guide](operations.md)
12 | 
13 | [Production deployment](production.md)
14 | 
15 | [Upgrades](upgrades.md)
16 | 
17 | [Sequence diagrams](sequence.md)
18 | 
19 | ### Configuration docs
20 | 
21 | [Alerting](alerting.md)
22 | 
23 | [Chrony](chrony.md)
24 | 
25 | [Environments](environments.md)
26 | 
27 | [K3s](k3s.README.md)
28 | 
29 | [Networking](networks.md)
30 | 
31 | [Open OnDemand](openondemand.md)
32 | 
33 | [Persistent state](persistent-state.md)
34 | 
35 | #### Experimental fetaures
36 | 
37 | [Compute init](experimental/compute-init.md)
38 | 
39 | [Pulp](experimental/pulp.md)
40 | 
41 | [Slurm controlled rebuild](experimental/slurm-controlled-rebuild.md)
42 | 
43 | ### Contributor docs
44 | 
45 | [Adding functionality](adding-functionality.md)
46 | 


--------------------------------------------------------------------------------
/docs/adding-functionality.md:
--------------------------------------------------------------------------------
 1 | # Adding new functionality
 2 | 
 3 | Please contact us for specific advice, but this generally involves:
 4 | - Adding a role.
 5 | - Adding a play calling that role into an existing playbook in `ansible/`, or adding a new playbook there and updating `site.yml`.
 6 | - Adding a new (empty) group named after the role into `environments/common/inventory/groups` and a non-empty example group into `environments/common/layouts/everything`.
 7 | - Adding new default group vars into `environments/common/inventory/group_vars/all/<rolename>/`.
 8 | - Updating the default Packer build variables in `environments/common/inventory/group_vars/builder/defaults.yml`.
 9 | - Updating READMEs.
10 | 


--------------------------------------------------------------------------------
/docs/chrony.md:
--------------------------------------------------------------------------------
 1 | # Chrony configuration
 2 | 
 3 | Use variables from the [mrlesmithjr.chrony](https://github.com/mrlesmithjr/ansible-chrony) role.
 4 | 
 5 | For example in: `environments/<environment>/inventory/group_vars/all/chrony`:
 6 | 
 7 | ```
 8 | ---
 9 | chrony_ntp_servers:
10 |   - server: ntp-0.example.org
11 |     options:
12 |       - option: iburst
13 |       - option: minpoll
14 |         val: 8
15 |   - server: ntp-1.example.org
16 |     options:
17 |       - option: iburst
18 |       - option: minpoll
19 |         val: 8
20 | 
21 | ```
22 | 


--------------------------------------------------------------------------------
/docs/ci.md:
--------------------------------------------------------------------------------
1 | # CI/CD automation
2 | 
3 | The `.github` directory contains a set of sample workflows which can be used by downstream site-specific configuration repositories to simplify ongoing maintainence tasks. These include:
4 | 
5 | - An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repo for new releases and proposes a pull request to the downstream site-specific repo when a new release is published.
6 | 
7 | - An [image upload](.github/workflows/upload-s3-image.yml.sample) workflow which takes an image name, downloads it from StackHPC's public S3 bucket if available, and uploads it to the target OpenStack cloud.
8 | 
9 | 


--------------------------------------------------------------------------------
/docs/experimental/compute-init.md:
--------------------------------------------------------------------------------
 1 | # compute-init
 2 | 
 3 | See the role README.md
 4 | 
 5 | # Changes to image / tofu state
 6 | 
 7 | When a compute group has the `ignore_image_changes` parameter set to true,
 8 | changes to the `image_id` parameter (which defaults to `cluster_image_id`) are
 9 | ignored by OpenTofu.
10 | 
11 | Regardless of whether `ignore_image_changes` is set, OpenTofu templates out the
12 | `image_id` into the Ansible inventory for each compute node. The `compute_init`
13 | role templates out hostvars to the control node, which means the "target" image
14 | ID is then available on the control node. Subsequent work will use this to
15 | rebuild the node via slurm.
16 | 
17 | # CI workflow
18 | 
19 | The compute node rebuild is tested in CI after the tests for rebuilding the
20 | login and control nodes. The process follows
21 | 
22 | 1. Compute nodes are reimaged:
23 | 
24 |          ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
25 | 
26 | 2. Ansible-init runs against newly reimaged compute nodes
27 | 
28 | 3. Run sinfo and check nodes have expected slurm state
29 | 
30 |          ansible-playbook -v ansible/ci/check_slurm.yml


--------------------------------------------------------------------------------
/docs/k3s.README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 | A K3s cluster is deployed with the Slurm cluster. Both an agent and server instance of K3s is installed during image build and the correct service (determined by OpenStack metadata) will be 
3 | enabled during boot. Nodes with the `k3s_server` metadata field defined will be configured as K3s agents (this field gives them the address of the server). The Slurm control node is currently configured as a server while all other nodes are configured as agents. Using multiple K3s servers isn't supported. Currently only the root user on the control node has 
4 | access to the Kubernetes API. The `k3s` role installs Helm for package management. K9s is also installed in the image and can be used by the root user.
5 | 
6 | # Idempotency
7 | K3s is intended to only be installed during image build as it is configured by the appliance on first boot with `azimuth_cloud.image_utils.linux_ansible_init`. Therefore, the `k3s` role isn't
8 | idempotent and changes to variables will not be reflected in the image when running `site.yml`.
9 | 


--------------------------------------------------------------------------------
/docs/screenshots/grafana/dashboard-node-exporter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/docs/screenshots/grafana/dashboard-node-exporter.png


--------------------------------------------------------------------------------
/docs/screenshots/grafana/dashboard-openhpc-slurm-jobs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/docs/screenshots/grafana/dashboard-openhpc-slurm-jobs.png


--------------------------------------------------------------------------------
/docs/screenshots/grafana/dashboard-openhpc-slurm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/docs/screenshots/grafana/dashboard-openhpc-slurm.png


--------------------------------------------------------------------------------
/docs/screenshots/grafana/grafana-slurm-jobs-linking-to-node-exporter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/docs/screenshots/grafana/grafana-slurm-jobs-linking-to-node-exporter.png


--------------------------------------------------------------------------------
/docs/site/README.md:
--------------------------------------------------------------------------------
1 | # Site-specific Documentation
2 | 
3 | This document is a placeholder for any site-specific documentation, e.g. environment descriptions.
4 | 
5 | #TODO: list things which should commonly be specified here.
6 | 
7 | 


--------------------------------------------------------------------------------
/environments/.caas/README.md:
--------------------------------------------------------------------------------
 1 | # Caas cluster
 2 | 
 3 | Environment for default Azimuth Slurm. This is not intended to be manually deployed.
 4 | 
 5 | Non-standard things for this environment:
 6 | - There is no activate script.
 7 | - `ansible.cgf` is provided in the repo root, as expected by the caas operator.
 8 | - `ANSIBLE_INVENTORY` is set in the cluster type template, using a path relative to the 
 9 |   runner project directory:
10 | 
11 |         azimuth_caas_stackhpc_slurm_appliance_template:
12 |         ...
13 |         envVars:
14 |             ANSIBLE_INVENTORY: environments/common/inventory,environments/.caas/inventory
15 | 
16 |     Ansible then defines `ansible_inventory_sources` which contains absolute paths, and 
17 |     that is used to derive the `appliances_environment_root` and 
18 |     `appliances_repository_root`.
19 | 


--------------------------------------------------------------------------------
/environments/.caas/ansible.cfg:
--------------------------------------------------------------------------------
 1 | [defaults]
 2 | any_errors_fatal = True
 3 | stdout_callback = debug
 4 | stderr_callback = debug
 5 | gathering = smart
 6 | forks = 30
 7 | host_key_checking = False
 8 | inventory = ../common/inventory,inventory
 9 | collections_path = ../../ansible/collections
10 | roles_path = ../../ansible/roles
11 | filter_plugins = ../../ansible/filter_plugins
12 | 
13 | [ssh_connection]
14 | ssh_args = -o ControlMaster=auto ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
15 | pipelining = True
16 | 
17 | [inventory]
18 | # Fail when any inventory source cannot be parsed.
19 | any_unparsed_is_failed = True
20 | 


--------------------------------------------------------------------------------
/environments/.caas/assets/ood-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/.caas/assets/ood-icon.png


--------------------------------------------------------------------------------
/environments/.caas/hooks/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/.caas/hooks/.gitkeep


--------------------------------------------------------------------------------
/environments/.caas/inventory/group_vars/all/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/.caas/inventory/group_vars/all/.gitkeep


--------------------------------------------------------------------------------
/environments/.caas/inventory/group_vars/all/basic_users.yml:
--------------------------------------------------------------------------------
 1 | basic_users_users:
 2 |   - name: azimuth
 3 |     # Hash the password with a salt that is different for each host
 4 |     password: "{{ vault_azimuth_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}"
 5 |     uid: 1005
 6 |     public_key: "{{ cluster_user_ssh_public_key }}"
 7 |     shell: /bin/bash
 8 |     append: true
 9 |     groups:
10 |       - adm
11 |       - systemd-journal
12 |     sudo: azimuth ALL=(ALL) NOPASSWD:ALL
13 | 
14 | # the path *on the control node* for the home directories depends on the filesystem:
15 | basic_users_homedir_server_path: "{{ '/home' if cluster_home_manila_share | bool else '/exports/home' }}"
16 | 


--------------------------------------------------------------------------------
/environments/.caas/inventory/group_vars/all/grafana.yml:
--------------------------------------------------------------------------------
1 | grafana_auth_anonymous: "{{ groups['openondemand'] | count > 0 }}"
2 | 


--------------------------------------------------------------------------------
/environments/.caas/inventory/group_vars/all/hpctests.yml:
--------------------------------------------------------------------------------
 1 | # Skip plotting pingpong as matplotlib not in runner environment
 2 | hpctests_pingpong_plot: false
 3 | 
 4 | # In Azimuth, the Ansible controller is an ephemeral pod, so all that matters is that
 5 | # this is a location that is writable by the container user
 6 | hpctests_outdir: "{{ playbook_dir }}/.tmp/hpctests"
 7 | 
 8 | # hpctests run by default in Azimuth but not trying to stress-test the nodes
 9 | # just check compiler, mpi etc works
10 | hpctests_hpl_mem_frac: 0.05 # 5% node memory
11 | 
12 | # use basic_user-defined user:
13 | hpctests_user: azimuth
14 | 


--------------------------------------------------------------------------------
/environments/.caas/inventory/group_vars/all/manila.yml:
--------------------------------------------------------------------------------
 1 | caas_manila_home:
 2 |   share_name: "{{ cluster_name }}-home"
 3 |   mount_path: /home
 4 |   mount_user: root
 5 |   mount_group: root
 6 |   mount_mode: u=rwX,go=rX
 7 | 
 8 | cluster_project_manila_share_name: azimuth-project-share
 9 | caas_manila_project:
10 |   share_name: "{{ cluster_project_manila_share_name | default('azimuth-project-share') }}"
11 |   share_user: "{{ cluster_project_manila_share_user | default(omit) }}"
12 |   mount_path: /project
13 |   mount_user: root
14 |   mount_group: root
15 |   mount_mode: ugo=rwX
16 | 
17 | os_manila_mount_shares: "{{ ([caas_manila_home] if cluster_home_manila_share | bool else []) + ([caas_manila_project] if cluster_project_manila_share | bool else []) }}"
18 | 


--------------------------------------------------------------------------------
/environments/.caas/inventory/group_vars/all/nfs.yml:
--------------------------------------------------------------------------------
 1 | nfs_server: "{{ nfs_server_default }}"
 2 | 
 3 | caas_nfs_home:
 4 |   - comment: Export /exports/home from Slurm control node as /home
 5 |     nfs_enable:
 6 |         server:  "{{ inventory_hostname in groups['control'] }}"
 7 |         clients: "{{ inventory_hostname in groups['cluster'] }}"
 8 |     nfs_export: "/exports/home" # assumes skeleton TF is being used
 9 |     nfs_client_mnt_point: "/home"
10 | 
11 | nfs_configurations: "{{ caas_nfs_home if not cluster_home_manila_share | bool else [] }}"
12 | 


--------------------------------------------------------------------------------
/environments/.caas/inventory/group_vars/all/openhpc.yml:
--------------------------------------------------------------------------------
1 | openhpc_cluster_name: "{{ cluster_name }}"
2 | 
3 | # Provision a single "standard" compute nodegroup using the supplied
4 | # node count and flavor
5 | openhpc_nodegroups: "{{ hostvars[groups['openstack'][0]]['openhpc_nodegroups'] }}"
6 | 


--------------------------------------------------------------------------------
/environments/.caas/inventory/group_vars/all/openondemand.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | openondemand_auth: basic_pam
 3 | openondemand_jupyter_partition: "{{ openhpc_partitions[0]['name'] }}"
 4 | openondemand_desktop_partition: "{{ openhpc_partitions[0]['name'] }}"
 5 | 
 6 | httpd_listen_addr_port:
 7 |   - 80
 8 |   - 443
 9 | 
10 | 


--------------------------------------------------------------------------------
/environments/.caas/inventory/group_vars/all/prometheus.yml:
--------------------------------------------------------------------------------
1 | ---
2 | 
3 | # We reserve 10GB of the state volume for cluster state, the rest is for metrics
4 | prometheus_storage_retention_size: "{{ state_volume_size - 10 }}GB"
5 | 


--------------------------------------------------------------------------------
/environments/.caas/inventory/group_vars/all/zenith.yml:
--------------------------------------------------------------------------------
1 | zenith_proxy_podman_user: podman
2 | 


--------------------------------------------------------------------------------
/environments/.caas/inventory/group_vars/openstack.yml:
--------------------------------------------------------------------------------
 1 | # The default Terraform state key for backends that support it
 2 | terraform_state_key: "cluster/{{ cluster_id }}/tfstate"
 3 | 
 4 | # Set up the terraform backend
 5 | terraform_backend_type: "{{ 'consul' if 'CONSUL_HTTP_ADDR' in ansible_env else 'local' }}"
 6 | terraform_backend_config_defaults:
 7 |   consul:
 8 |     path: "{{ terraform_state_key }}"
 9 |     gzip: "true"
10 |   local: {}
11 | terraform_backend_config: "{{ terraform_backend_config_defaults[terraform_backend_type] }}"
12 | 
13 | terraform_binary_directory: "{{ appliances_environment_root }}/bin"
14 | terraform_project_path: "{{ playbook_dir }}/terraform"
15 | 
16 | terraform_state: "{{ cluster_state | default('present') }}"
17 | cluster_ssh_user: rocky
18 | 
19 | # Provision a single "standard" compute nodegroup using the supplied
20 | # node count and flavor
21 | openhpc_nodegroups:
22 |   - name: "standard"
23 |     count: "{{ compute_count }}"
24 |     flavor: "{{ compute_flavor }}"
25 |     default: "YES"
26 | 


--------------------------------------------------------------------------------
/environments/.caas/inventory/hosts:
--------------------------------------------------------------------------------
1 | [openstack]
2 | localhost ansible_connection=local ansible_python_interpreter=/usr/bin/python3
3 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/.gitignore:
--------------------------------------------------------------------------------
1 | partitions.yml
2 | secrets.yml
3 | hosts
4 | terraform.tfvars
5 | .terraform.lock.hcl
6 | logs/
7 | hpctests/
8 | inventory/group_vars/all/test_user.yml
9 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/ARCUS.pkrvars.hcl:
--------------------------------------------------------------------------------
1 | flavor = "vm.ska.cpu.general.small"
2 | networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60)
3 | ssh_keypair_name = "slurm-app-ci"
4 | ssh_private_key_file = "~/.ssh/id_rsa"
5 | security_groups = ["default", "SSH"]
6 | floating_ip_network = "CUDN-Internet" # Use FIP to avoid docker ratelimits on portal-internal outbound IP
7 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl:
--------------------------------------------------------------------------------
 1 | flavor = "ec1.large"
 2 | volume_type = "unencrypted"
 3 | networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci
 4 | ssh_keypair_name = "slurm-app-ci"
 5 | ssh_private_key_file = "~/.ssh/id_rsa"
 6 | security_groups = ["default", "SSH"]
 7 | # see environments/.stackhpc/inventory/group_vars/all/bastion.yml:
 8 | ssh_bastion_username = "slurm-app-ci"
 9 | ssh_bastion_host = "195.114.30.222"
10 | ssh_bastion_private_key_file = "~/.ssh/id_rsa"
11 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/SMS.pkrvars.hcl:
--------------------------------------------------------------------------------
1 | flavor = "general.v1.small"
2 | networks = ["e2b9e59f-43da-4e1c-b558-dc9da4c0d738"] # stackhpc-ipv4-geneve
3 | ssh_keypair_name = "slurm-app-ci"
4 | ssh_private_key_file = "~/.ssh/id_rsa"
5 | # see environments/.stackhpc/inventory/group_vars/all/bastion.yml:
6 | ssh_bastion_username = "slurm-app-ci"
7 | ssh_bastion_host = "185.45.78.150"
8 | ssh_bastion_private_key_file = "~/.ssh/id_rsa"
9 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/activate:
--------------------------------------------------------------------------------
 1 | export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}}))
 2 | echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT"
 3 | 
 4 | export PS1="$(basename $APPLIANCES_ENVIRONMENT_ROOT)/ ${PS1}"
 5 | 
 6 | export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..")
 7 | echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT"
 8 | 
 9 | export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT")
10 | echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root"
11 | 
12 | export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT")
13 | echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root"
14 | 
15 | export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT")
16 | echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root"
17 | 
18 | if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then
19 |    export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg
20 | fi
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/ansible.cfg:
--------------------------------------------------------------------------------
 1 | [defaults]
 2 | any_errors_fatal = True
 3 | stdout_callback = debug
 4 | stderr_callback = debug
 5 | callbacks_enabled = ansible.posix.profile_tasks
 6 | gathering = smart
 7 | forks = 30
 8 | host_key_checking = False
 9 | inventory = ../common/inventory,inventory
10 | collections_path = ../../ansible/collections
11 | roles_path = ../../ansible/roles
12 | filter_plugins = ../../ansible/filter_plugins
13 | library = ../../ansible/library
14 | 
15 | [ssh_connection]
16 | ssh_args = -o ServerAliveInterval=10 -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
17 | pipelining = True
18 | 
19 | [inventory]
20 | # Fail when any inventory source cannot be parsed.
21 | any_unparsed_is_failed = True
22 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/cacerts/myCA.pem:
--------------------------------------------------------------------------------
 1 | -----BEGIN CERTIFICATE-----
 2 | MIIDgzCCAmugAwIBAgIUd5qnvmXczLvacv3Mu2hzwJlmimMwDQYJKoZIhvcNAQEL
 3 | BQAwUTELMAkGA1UEBhMCWFgxFTATBgNVBAcMDERlZmF1bHQgQ2l0eTEcMBoGA1UE
 4 | CgwTRGVmYXVsdCBDb21wYW55IEx0ZDENMAsGA1UEAwwEdGVzdDAeFw0yNTAyMTIx
 5 | NjIxNTlaFw0zMDAyMTExNjIxNTlaMFExCzAJBgNVBAYTAlhYMRUwEwYDVQQHDAxE
 6 | ZWZhdWx0IENpdHkxHDAaBgNVBAoME0RlZmF1bHQgQ29tcGFueSBMdGQxDTALBgNV
 7 | BAMMBHRlc3QwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDExC9wqRyG
 8 | vQ5FYGb48iDfq8er4WvWO94F/q746mCHvVJn7GTu3AMavIXCYqH9WnXY0lzey7xU
 9 | /40/F/xihQfGYFrY+8ssYrT8Z+H3fSuwmq6XqsHcCupBQHKTTjZWaVMODxF4Eq5F
10 | Vyk4/AJpoOFLrzjUA9Sw74HKBH+r3N74x+3fFzElFGfjtFXPlgnYi9T9dXEEoNc7
11 | Udulcr6MrL+l6ITr0Grti4FP0qOari9a4XqC7G2Jtga1PF/GaMlyrmQphnhpS7ph
12 | n1dr6hYWmHZ1r1vcNBxBl71CoOVoLwk9v2x0jOsbYpzAp5CJEl/6whwo/Pn2JzIV
13 | xbCuVg9znbHpAgMBAAGjUzBRMB0GA1UdDgQWBBSEbb8xKKL1NwsRfzeZ7Shyq9xq
14 | QTAfBgNVHSMEGDAWgBSEbb8xKKL1NwsRfzeZ7Shyq9xqQTAPBgNVHRMBAf8EBTAD
15 | AQH/MA0GCSqGSIb3DQEBCwUAA4IBAQB2z7YMpZKAPY19EWaTV80Gwks56hBClcfR
16 | 6Y6d/7+ltML5pRHCFB2fF850Rj5vmnflSwrSWDcDbRktEfha3OIhHWtY8TzF7Zkx
17 | dIMyN8JaqjmJ488WGhcuqQDIK5sREg/JfECVeBId5mF390TKszlM9FNQL1NOC0D+
18 | I/+BeWHYAu4dGWQR6xbC6SYUMbhTQrQSgJFckq5i2fQPcNK8Xlnzc+oxjJuqgsfB
19 | P1oLnrb2OVHEpjuxdK1UYds3z/6ilKwZQvx6uuv0baSbTsQT9TXKpbAZCynOQnGS
20 | 3rzTeOTapwsj1yVlAuo7koxbjFFaz6b1nGC5Ap/rGeVdIT7ZVKF/
21 | -----END CERTIFICATE-----
22 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/hooks/post-bootstrap.yml:
--------------------------------------------------------------------------------
 1 | - hosts: podman:!builder
 2 |   become: yes
 3 |   gather_facts: false
 4 |   tags: podman
 5 |   tasks:
 6 |     - name: Configure container image registry to avoid docker.io ratelimits
 7 |       copy:
 8 |         dest: /etc/containers/registries.conf.d/003-arcus-mirror.conf
 9 |         content: |
10 |           [[registry]]
11 |           location="docker.io/library/"
12 |           prefix="docker.io/library/"
13 |           
14 |           [[registry.mirror]]
15 |           location = "{{ podman_registry_address }}"
16 |           insecure = true
17 |       when: "ci_cloud == 'ARCUS'"
18 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/hooks/pre.yml:
--------------------------------------------------------------------------------
 1 | - hosts: control:!builder
 2 |   become: yes
 3 |   gather_facts: false
 4 |   tasks:
 5 |     - name: Output OS version
 6 |       command: cat /etc/redhat-release
 7 |       changed_when: false
 8 | 
 9 |     - name: Write CI-generated inventory and secrets for debugging
10 |       ansible.builtin.copy:
11 |         dest: /etc/ci-config/
12 |         src: "{{ item }}"
13 |         directory_mode: 0400
14 |         mode: 0400
15 |         owner: root
16 |         group: root
17 |       no_log: "{{ no_log | default(true) }}"
18 |       loop:
19 |         - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts.yml"
20 |         - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml"
21 |         - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml"
22 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/everything:
--------------------------------------------------------------------------------
1 | ../../../environments/common/layouts/everything


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/extra_groups:
--------------------------------------------------------------------------------
 1 | [basic_users:children]
 2 | cluster
 3 | 
 4 | [etc_hosts:children]
 5 | cluster
 6 | 
 7 | # -- Example of enabling FreeIPA with an in-appliance (dev-only) server
 8 | # NB: The etc_hosts and basic_users group definitions above should be commented out
 9 | # The freeipa_* hosts will pick up configuration from environments/.stackhpc/inventory/group_vars/all/freeipa.yml
10 | 
11 | # [freeipa_server:children]
12 | # control
13 | # 
14 | # [freeipa_client:children]
15 | # login
16 | # compute
17 | # 
18 | # [resolv_conf:children]
19 | # freeipa_client
20 | # --- end of FreeIPA example ---
21 | 
22 | [manila:children]
23 | # Allows demo; also installs manila client in fat image
24 | login
25 | compute
26 | 
27 | [chrony:children]
28 | cluster
29 | 
30 | [tuned:children]
31 | # Install tuned into fat image
32 | builder
33 | 
34 | [squid:children]
35 | # Install squid into fat image
36 | builder
37 | 
38 | [sssd:children]
39 | # Install sssd into fat image
40 | builder
41 | 
42 | [rebuild:children]
43 | control
44 | 
45 | [cacerts:children]
46 | cluster
47 | 
48 | [compute_init:children]
49 | compute
50 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/group_vars/all/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/.stackhpc/inventory/group_vars/all/.gitkeep


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/group_vars/all/basic_users.yml:
--------------------------------------------------------------------------------
1 | test_demo_user_password: "{{ lookup('env', 'DEMO_USER_PASSWORD') | default(vault_demo_user_password, true) }}" # CI uses env, debug can set vault_demo_user_password
2 | 
3 | basic_users_users:
4 |   - name: demo_user # can't use rocky as $HOME isn't shared!
5 |     password: "{{ test_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent
6 |     uid: 1005
7 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/group_vars/all/bastion.yml:
--------------------------------------------------------------------------------
 1 | ci_cloud: "{{ lookup('env', 'CI_CLOUD') }}"
 2 | bastion_config:
 3 |   ARCUS:
 4 |     user: slurm-app-ci
 5 |     ip: 128.232.222.183
 6 |   LEAFCLOUD:
 7 |     user: slurm-app-ci
 8 |     ip: 195.114.30.222
 9 |   SMS:
10 |     user: slurm-app-ci
11 |     ip: 185.45.78.150
12 | # NB: The bastion_{user,ip} variables are used directly in the CI workflow too
13 | bastion_user: "{{ bastion_config[ci_cloud].user }}"
14 | bastion_ip: "{{ bastion_config[ci_cloud].ip }}"
15 | ansible_ssh_common_args: '-o ProxyCommand="ssh {{ bastion_user }}@{{ bastion_ip }} -W %h:%p"'
16 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/group_vars/all/freeipa.yml:
--------------------------------------------------------------------------------
 1 | # This file provides examples of using freeipa role variables. These are NOT functional in CI as freeipa_{server,client} groups are not defined.
 2 | 
 3 | # NB: Users defined this way have expired passwords
 4 | freeipa_users:
 5 |   - name: demo_user # can't use rocky as $HOME isn't shared!
 6 |     password: "{{ test_demo_user_password }}"
 7 |     givenname: test
 8 |     sn: test
 9 | 
10 | # freeipa_client hosts must use a FreeIPA server for name resolution - requires hosts to be in group `resolv_conf`.
11 | resolv_conf_nameservers:
12 |   - "{{ hostvars[groups['freeipa_server'].0].ansible_host }}"
13 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/group_vars/all/grafana.yml:
--------------------------------------------------------------------------------
1 | grafana_auth_anonymous: true
2 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/group_vars/all/hpctests.yml:
--------------------------------------------------------------------------------
1 | hpctests_user: demo_user
2 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/group_vars/all/manila.yml:
--------------------------------------------------------------------------------
1 | os_manila_mount_shares_arcus:
2 |   - share_name: slurm-v2-home
3 |     mount_path: /project
4 |   - share_name: slurm-scratch
5 |     mount_path: /scratch
6 | 
7 | os_manila_mount_shares: "{{ os_manila_mount_shares_arcus if ci_cloud == 'ARCUS' else [] }}"
8 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/group_vars/all/openhpc.yml:
--------------------------------------------------------------------------------
1 | openhpc_config_extra:
2 |   SlurmctldDebug: debug
3 |   SlurmdDebug: debug
4 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/group_vars/all/openondemand.yml:
--------------------------------------------------------------------------------
1 | openondemand_auth: basic_pam
2 | openondemand_jupyter_partition: standard
3 | openondemand_desktop_partition: standard
4 | #openondemand_dashboard_support_url: 
5 | #openondemand_dashboard_docs_url:
6 | #openondemand_filesapp_paths:
7 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/group_vars/all/podman.yml:
--------------------------------------------------------------------------------
1 | arcus_podman_registry_address: 192.168.3.95:5000
2 | podman_registry_address: "{{ arcus_podman_registry_address if ci_cloud == 'ARCUS' else '' }}"
3 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/inventory/group_vars/builder.yml:
--------------------------------------------------------------------------------
 1 | #update_enable: false # Can uncomment for speed debugging non-update related build issues
 2 | sssd_install_ldap: true # include sssd-ldap package in fatimage
 3 | # update_enable: false # Can uncomment for speed debugging non-update related build issues
 4 | 
 5 | # Uncomment below to use CI pulp servers
 6 | 
 7 | # pulp_server_config:
 8 | #   LEAFCLOUD:
 9 | #     url: http://192.168.10.157:8080
10 | #     password: lookup('env','LEAFCLOUD_PULP_PASSWORD')
11 | 
12 | # appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}"
13 | # pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}"
14 | 
15 | # Alternatively, configure to use ark directly:
16 | dnf_repos_username: slurm-app-ci
17 | dnf_repos_password: "{{ lookup('env','ARK_PASSWORD') }}"
18 | 
19 | # Can be set regardless of approach above:
20 | pulp_site_upstream_username: slurm-app-ci
21 | pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
22 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/tofu/ARCUS.tfvars:
--------------------------------------------------------------------------------
1 | cluster_net = "portal-internal"
2 | cluster_subnet = "portal-internal"
3 | control_node_flavor = "vm.ska.cpu.general.eighth"
4 | other_node_flavor = "vm.ska.cpu.general.small"
5 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars:
--------------------------------------------------------------------------------
 1 | cluster_networks = [
 2 |     {
 3 |         network = "stackhpc-dev"
 4 |         subnet = "stackhpc-dev"
 5 |     }
 6 | ]
 7 | control_node_flavor = "ec1.medium" # small ran out of memory, medium gets down to ~100Mi mem free on deployment
 8 | other_node_flavor = "en1.xsmall"
 9 | state_volume_type = "unencrypted"
10 | home_volume_type = "unencrypted"
11 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/tofu/LEAFCLOUD.tfvars:
--------------------------------------------------------------------------------
 1 | cluster_networks = [
 2 |     {
 3 |         network = "slurmapp-ci"
 4 |         subnet = "slurmapp-ci"
 5 |     }
 6 | ]
 7 | control_node_flavor = "ec1.medium" # small ran out of memory, medium gets down to ~100Mi mem free on deployment
 8 | other_node_flavor = "en1.xsmall"
 9 | state_volume_type = "unencrypted"
10 | home_volume_type = "unencrypted"
11 | 


--------------------------------------------------------------------------------
/environments/.stackhpc/tofu/SMS.tfvars:
--------------------------------------------------------------------------------
1 | cluster_networks = [
2 |     {
3 |         network = "stackhpc-ipv4-geneve"
4 |         subnet = "stackhpc-ipv4-geneve-subnet"
5 |     }
6 | ]
7 | control_node_flavor = "general.v1.small"
8 | other_node_flavor = "general.v1.small"


--------------------------------------------------------------------------------
/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json:
--------------------------------------------------------------------------------
1 | {
2 |     "cluster_image": {
3 |         "RL8": "openhpc-RL8-250514-1502-5a923b2c",
4 |         "RL9": "openhpc-RL9-250514-1502-5a923b2c"
5 |     }
6 | }
7 | 


--------------------------------------------------------------------------------
/environments/common/.gitignore:
--------------------------------------------------------------------------------
1 | inventory/hosts
2 | 


--------------------------------------------------------------------------------
/environments/common/README.md:
--------------------------------------------------------------------------------
 1 | # Common configuration
 2 | 
 3 | This contains an inventory that defines variables which are common between the
 4 | `production` and `development` environments. It is not intended to be used in
 5 | a standalone fashion to deploy infrastructure, but is instead
 6 | referenced in `ansible.cfg` from the `production` and `development` configurations.
 7 | 
 8 | The pattern we use is that all resources referenced in the inventory
 9 | are located in the environment directory containing the inventory that
10 | references them. For example, the file referenced in `inventory/group_vars/prometheus/defaults.yml`
11 | using the variable `prometheus_alert_rules_files` references a file in the
12 | `files` directory relative to this one.
13 | 


--------------------------------------------------------------------------------
/environments/common/files/grafana/grafana.repo.j2:
--------------------------------------------------------------------------------
 1 | {{ ansible_managed | comment }}
 2 | [grafana]
 3 | baseurl = {{ appliances_pulp_url }}/pulp/content/{{ appliances_pulp_repos.grafana[ansible_distribution_major_version] | appliances_repo_to_subpath }}
 4 | enabled = 0
 5 | name = grafana
 6 | async = 1
 7 | gpgcheck = 0
 8 | {% if 'dnf_repos' in group_names and dnf_repos_password is defined %}
 9 | password = {{ dnf_repos_password }}
10 | username = {{ dnf_repos_username }}
11 | {% endif %}
12 | 


--------------------------------------------------------------------------------
/environments/common/files/opensearch/internal_users.yml.j2:
--------------------------------------------------------------------------------
 1 | ---
 2 | # See https://opensearch.org/docs/latest/security-plugin/configuration/yaml#internal_usersyml
 3 | 
 4 | _meta:
 5 |   type: "internalusers"
 6 |   config_version: 2
 7 | 
 8 | admin:
 9 |   hash: "{{ opensearch_admin_password_hash }}"
10 |   reserved: true
11 |   backend_roles:
12 |   - "admin"
13 |   description: "Admin user"
14 | 


--------------------------------------------------------------------------------
/environments/common/files/prometheus/rules/precompute.rules:
--------------------------------------------------------------------------------
 1 | # Required for openhpc dashboard
 2 | 
 3 | groups:
 4 | - name: opehnpc
 5 |   interval: 60s
 6 |   rules:
 7 |   - record: node_cpu_system_seconds:record
 8 |     expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="system",job="node"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node"}[60s])))
 9 |   - record: node_cpu_user_seconds:record
10 |     expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="user",job="node"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node"}[60s])))
11 |   - record: node_cpu_iowait_seconds:record
12 |     expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="iowait",job="node"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node"}[60s])))
13 |   - record: node_cpu_other_seconds:record
14 |     expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode!="idle",mode!="user",mode!="system",mode!="iowait",job="node"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node"}[60s])))
15 |   - record: node_cpu_scaling_frequency_hertz_avg:record
16 |     expr: avg by (instance) (node_cpu_scaling_frequency_hertz)
17 |   - record: node_cpu_scaling_frequency_hertz_min:record
18 |     expr: min by (instance) (node_cpu_scaling_frequency_hertz)
19 |   - record: node_cpu_scaling_frequency_hertz_max:record
20 |     expr: max by (instance) (node_cpu_scaling_frequency_hertz)
21 | 


--------------------------------------------------------------------------------
/environments/common/files/prometheus/rules/slurm.rules:
--------------------------------------------------------------------------------
 1 | 
 2 | groups:
 3 | - name: Slurm
 4 |   rules:
 5 |   - alert: SlurmNodeDown
 6 |     annotations:
 7 |       description: '{{ $value }} Slurm nodes are in down status'
 8 |       summary: 'At least one Slurm node is down.'
 9 |     expr: "slurm_nodes_down > 0\n"
10 |     labels:
11 |       severity: critical
12 |   - alert: SlurmNodeFail
13 |     annotations:
14 |       description: '{{ $value }} Slurm nodes are in fail status'
15 |       summary: 'At least one Slurm node is failed.'
16 |     expr: "slurm_nodes_fail > 0\n"
17 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/alertmanager.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | alertmanager_port: '9093' # defined here as required for prometheus
 3 | 
 4 | alertmanager_slack_receiver_name: slack-receiver
 5 | alertmanager_slack_receiver_send_resolved: true
 6 | alertmanager_slack_receiver: # defined here as needs prometheus address
 7 |   name: "{{ alertmanager_slack_receiver_name }}"
 8 |   slack_configs:
 9 |     - channel: "{{ alertmanager_slack_integration.channel | default('none') }}"
10 |       api_url: https://slack.com/api/chat.postMessage
11 |       http_config:
12 |         authorization:
13 |           credentials: "{{ alertmanager_slack_integration.app_creds | default('none') }}"
14 |       text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }}  .CommonAnnotations.description {{ '}}' }}"
15 |       title_link: "{{ prometheus_web_external_url }}/alerts?receiver={{ alertmanager_slack_receiver_name }}"
16 |       send_resolved: "{{ alertmanager_slack_receiver_send_resolved }}"
17 | 
18 | alertmanager_web_external_url: "http://{{ hostvars[groups['alertmanager'].0].ansible_host }}:{{ alertmanager_port}}/"
19 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/ansible_init.yml:
--------------------------------------------------------------------------------
 1 | ansible_init_wait: 300 # seconds
 2 | 
 3 | ansible_init_pip_packages:
 4 |   # role defaults:
 5 |   - ansible
 6 |   - jmespath
 7 |   - requests
 8 |   # custom:
 9 |   - netaddr # required for gateway role
10 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/basic_users.yml:
--------------------------------------------------------------------------------
1 | ---
2 | 
3 | # See ansible/roles/basic_users/README.md for variable definitions.
4 | 
5 | basic_users_users: []
6 | 
7 | # The following are defined for the purpose of compute-init
8 | basic_users_homedir_server: "{{ groups['control'] | first }}"
9 | basic_users_homedir_client: "{{ groups['login'] | first }}"


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/filebeat.yml:
--------------------------------------------------------------------------------
1 | ---
2 | 
3 | # Path to filebeat.yml configuration file template
4 | filebeat_config_path: "{{ appliances_repository_root }}/environments/common/files/filebeat/filebeat.yml"
5 | 
6 | # User that runs the filebeat container
7 | filebeat_podman_user: podman


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/firewalld.yml:
--------------------------------------------------------------------------------
 1 | # See ansible/roles/firewalld/README.md
 2 | # for variable definitions.
 3 | 
 4 | firewalld_configs_default:
 5 |   # A list of dicts defining firewalld rules.
 6 |   # Using the "everything" template firewalld is deployed on the login node to enable fail2ban.
 7 |   # However by default we rely on openstack security groups so make firewalld permissive.
 8 |   # Each dict contains:
 9 |   #   name: An arbitrary name or description
10 |   #   group: An ansible group name - this rule is applied if the fail2ban node is in this group
11 |   #   rule: A dict of parameters passed to the `ansible.posix.firewalld` module.
12 |   # FaiBy default we rely on openstack security groups so 
13 |   - name: Make firewalld permissive
14 |     group: openhpc
15 |     rule:
16 |       zone: public
17 |       state: enabled
18 |       target: ACCEPT
19 |       permanent: yes
20 | 
21 | firewalld_configs_extra: [] # list of dicts with parameters as for firewalld_configs_default
22 | 
23 | firewalld_configs: "{{ (firewalld_configs_default + firewalld_configs_extra) | selectattr('group', 'in',  group_names) | map(attribute='rule') }}"
24 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/freeipa_server.yml:
--------------------------------------------------------------------------------
1 | # See ansible/roles/freeipa/README.md
2 | # These vars are only used when freeipa_server is enabled. They are not required when enabling only freeipa_client
3 | freeipa_realm: "{{ openhpc_cluster_name | upper }}.{{ cluster_domain_suffix | upper }}"
4 | freeipa_ds_password: "{{ vault_freeipa_ds_password }}"
5 | freeipa_admin_password: "{{ vault_freeipa_admin_password }}"
6 | # the below doesn't use ansible_default_ipv4.address as that requires facts, and allows for templating when group freeipa_server is empty
7 | freeipa_server_ip: "{{ hostvars[groups['freeipa_server'].0].ansible_host if groups['freeipa_server'] else false }}"
8 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/hpctests.yml:
--------------------------------------------------------------------------------
1 | ---
2 | # See: ansible/roles/hpctests/README.md
3 | # for variable definitions.
4 | 
5 | # hpctests_user:
6 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/k3s.yml:
--------------------------------------------------------------------------------
1 | k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first].k3s_bootstrap_token | default('') }}"
2 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/manila.yml:
--------------------------------------------------------------------------------
 1 | # Default configuration for manila file shares, see
 2 | # https://github.com/stackhpc/ansible-role-os-manila-mount
 3 | # for all variable definitions, and override in your environment.
 4 | 
 5 | os_manila_mount_shares: []
 6 |   # - share_name:
 7 |   #   share_user:
 8 |   #   mount_path:
 9 |   #   mount_user:
10 |   #   mount_group:
11 |   #   mount_mode:
12 | 
13 | # os_manila_mount_ceph_version: nautilus # role default for RockyLinux 8
14 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/mysql.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | # See ansible/roles/mysql for variable definitions.
 4 | 
 5 | mysql_host: "{{ hostvars[groups['mysql'] | first].api_address }}"
 6 | 
 7 | # The user which runs the mysql container
 8 | mysql_podman_user: podman
 9 | 
10 | # Slurm recommends larger than default values: https://slurm.schedmd.com/accounting.html
11 | mysql_mysqld_options:
12 |   - innodb-buffer-pool-size=1024M
13 |   - innodb-lock-wait-timeout=900
14 | 
15 | mysql_root_password: "{{ vault_mysql_root_password }}"
16 | mysql_datadir: "{{ appliances_state_dir | default('/var/lib') }}/mysql"
17 | 
18 | mysql_databases:
19 |   - name: slurm_acct_db
20 |     config_file: ''
21 |     login_user: root
22 |     login_password: "{{ mysql_root_password }}"
23 |     login_host: "{{ mysql_host }}"
24 | 
25 | mysql_users:
26 |   - name: slurm
27 |     host: "%"
28 |     password: "{{ vault_mysql_slurm_password }}"
29 |     priv: "slurm_acct_db.*:ALL"
30 |     login_user: root
31 |     login_password: "{{ mysql_root_password }}"
32 |     login_host: "{{ mysql_host }}"
33 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/opensearch.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # See: https://opensearch.org/docs/latest/security-plugin/configuration/index/
 3 | 
 4 | # Path to template that specifies opensearch users
 5 | opensearch_internal_users_path: "{{ appliances_repository_root }}/environments/common/files/opensearch/internal_users.yml.j2"
 6 | 
 7 | # define an idempotent bcrypt hash for the above (requires a 128bit salt in base64 encoding):
 8 | opensearch_admin_password_salt: "{{ (2 | pow(128) | int) | random(seed=inventory_hostname) | b64encode }}"
 9 | opensearch_admin_password_hash: "{{ vault_elasticsearch_admin_password | password_hash('bcrypt', opensearch_admin_password_salt[0:22]) }}"
10 | 
11 | # user running the opensearch container
12 | opensearch_podman_user: podman
13 | 
14 | # Path to host directories
15 | opensearch_config_path: "{{ appliances_state_dir | default('/usr/share') }}/opensearch/config"
16 | opensearch_data_path: "{{ appliances_state_dir | default('/usr/share') }}/opensearch/data"
17 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/os-manila-mount.yml:
--------------------------------------------------------------------------------
1 | # Empty repo lists from stackhpc.ansible-role-os-manila-mount role defaults, as these repofiles are
2 | # now generated by dnf_repos to allow injecting Ark creds:
3 | os_manila_mount_ceph_rpm_repos: []
4 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/podman.yml:
--------------------------------------------------------------------------------
1 | podman_users: "{{ [appliances_local_users_podman] }}" # user to use for podman
2 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/proxy.yml:
--------------------------------------------------------------------------------
1 | # default proxy address to first squid api address port 3128 if squid group non-empty, else empty string to avoid breaking hostvars
2 | proxy_http_proxy: "{{ 'http://' + hostvars[groups['squid'].0].api_address + ':' + (squid_http_port | string) if groups['squid'] else '' }}"
3 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/pulp.yml:
--------------------------------------------------------------------------------
 1 | pulp_site_port: 8080
 2 | 
 3 | # If using Ark directly (no local Pulp server), override the following with Ark creds
 4 | 
 5 | # dnf_repos_username:
 6 | # dnf_repos_password:
 7 | 
 8 | # If instead using local Pulp server, override below with Ark creds
 9 | 
10 | # pulp_site_upstream_username:
11 | # pulp_site_upstream_password:
12 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/selinux.yml:
--------------------------------------------------------------------------------
1 | ---
2 | 
3 | selinux_state: disabled
4 | selinux_policy: targeted
5 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/slurm_exporter.yml:
--------------------------------------------------------------------------------
1 | slurm_exporter_port: 9341 # as defined by [1] and implemented in [2]
2 | #[1]: https://github.com/prometheus/prometheus/wiki/Default-port-allocations
3 | #[2]: https://github.com/stackhpc/prometheus-slurm-exporter/blob/master/lib/systemd/prometheus-slurm-exporter.service
4 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/squid.yml:
--------------------------------------------------------------------------------
1 | squid_http_port: 3128 # defined here for proxy role
2 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/sshd.yaml:
--------------------------------------------------------------------------------
1 | sshd_password_authentication: "{{ sssd_install_ldap | default(false) | bool }}"
2 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/systemd.yml:
--------------------------------------------------------------------------------
 1 | _systemd_requiresmount_statedir: |
 2 |   {% if appliances_state_dir is defined %}
 3 |   [Unit]
 4 |   RequiresMountsFor={{ appliances_state_dir | default('') }}
 5 |   {% endif %}
 6 | 
 7 | systemd_dropins:
 8 |   # NB: mysql does not need _systemd_requiresmount_statedir as role handles state dir correctly
 9 |   opensearch:
10 |     group: opensearch
11 |     content: "{{ _systemd_requiresmount_statedir }}"
12 |   grafana-server:
13 |     group: grafana
14 |     content: "{{ _systemd_requiresmount_statedir }}"
15 |   slurmdbd:
16 |     group: openhpc
17 |     content: "{{ _systemd_requiresmount_statedir }}"
18 |   slurmctld:
19 |     group: openhpc
20 |     content: "{{ _systemd_requiresmount_statedir }}"
21 |   prometheus:
22 |     group: prometheus
23 |     content: "{{ _systemd_requiresmount_statedir }}"
24 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/all/update.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | update_enable: false
 4 | # These variables define the packages updates and are passed to ansible's yum module parameters with the same names: https://docs.ansible.com/ansible/latest/collections/ansible/builtin/yum_module.html
 5 | update_name: '*'
 6 | update_state: latest
 7 | update_exclude:
 8 |   - grafana
 9 |   - apptainer # see https://github.com/stackhpc/ansible-slurm-appliance/pull/245
10 | update_disablerepo: omit
11 | # Log changes during update here on localhost:
12 | update_log_path:  "{{ appliances_environment_root }}/logs/{{ inventory_hostname }}-updates.log"
13 | 


--------------------------------------------------------------------------------
/environments/common/inventory/group_vars/builder/defaults.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # NOTE: Might be better of as extra vars or in a builder specific inventory as
 3 | # as dependent on alphabetical ordering of groups, so if these variables are
 4 | # defined elsewhere the group that is ordered lower will determine the values.
 5 | update_enable: true
 6 | openhpc_slurm_service_started: false
 7 | nfs_client_mnt_state: present
 8 | block_devices_partition_state: skip
 9 | block_devices_filesystem_state: skip
10 | block_devices_mount_state: present
11 | basic_users_manage_homedir: false
12 | grafana_state: stopped # as it tries to listen on the "real" grafana node
13 | block_devices_configurations: [] # as volumes will not be attached to Packer build VMs
14 | mysql_state: stopped # as it tries to connect to real mysql node
15 | opensearch_state: stopped # avoid writing config+certs+db into image
16 | cuda_persistenced_state: stopped # probably don't have GPU in Packer build VMs
17 | firewalld_enabled: false # dnf install of firewalld enables it
18 | firewalld_state: stopped
19 | squid_started: false
20 | squid_enabled: false
21 | squid_cache_disk: 0 # just needs to be defined
22 | squid_cache_mem: 0
23 | tuned_started: false
24 | tuned_enabled: false
25 | sssd_started: false
26 | sssd_enabled: false
27 | appliances_mode: build
28 | 


--------------------------------------------------------------------------------
/environments/common/layouts/README.md:
--------------------------------------------------------------------------------
1 | # Layouts
2 | 
3 | This folder contains some predefined group mappings. You can copy them into
4 | an environment folder if you wish to modify them or just reference them directly
5 | in ansible.cfg as another inventory file. If you are referencing them in the
6 | inventory file, it is advisable to put them just after the common environment.


--------------------------------------------------------------------------------
/environments/common/layouts/minimal:
--------------------------------------------------------------------------------
1 | [nfs:children]
2 | cluster
3 | 
4 | [openhpc:children]
5 | cluster
6 | 
7 | [mysql:children]
8 | control
9 | 


--------------------------------------------------------------------------------
/environments/skeleton/cookiecutter.json:
--------------------------------------------------------------------------------
1 | {
2 |     "environment": "foo",
3 |     "description" : "Describe the environment here"
4 | }
5 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/README.md:
--------------------------------------------------------------------------------
1 | # {{ cookiecutter.environment | title }} cluster
2 | 
3 | {{ cookiecutter.description }}
4 | 
5 | See the main README.md in the repo root for an overview and general install instructions.  Any environment-specific instructions should be added here.


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/activate:
--------------------------------------------------------------------------------
 1 | export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}}))
 2 | echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT"
 3 | 
 4 | export PS1="$(basename $APPLIANCES_ENVIRONMENT_ROOT)/ ${PS1}"
 5 | 
 6 | export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..")
 7 | echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT"
 8 | 
 9 | export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT")
10 | echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root"
11 | 
12 | export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT")
13 | echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root"
14 | 
15 | export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT")
16 | echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root"
17 | 
18 | if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then
19 |    export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg
20 | fi
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg:
--------------------------------------------------------------------------------
 1 | [defaults]
 2 | any_errors_fatal = True
 3 | stdout_callback = debug
 4 | stderr_callback = debug
 5 | gathering = smart
 6 | forks = 30
 7 | host_key_checking = False
 8 | inventory = ../common/inventory,inventory
 9 | collections_path = ../../ansible/collections
10 | roles_path = ../../ansible/roles
11 | filter_plugins = ../../ansible/filter_plugins
12 | 
13 | [ssh_connection]
14 | ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
15 | pipelining = True
16 | 
17 | [inventory]
18 | # Fail when any inventory source cannot be parsed.
19 | any_unparsed_is_failed = True
20 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/hooks/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/skeleton/{{cookiecutter.environment}}/hooks/.gitkeep


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/.gitkeep


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/alertmanager.yml:
--------------------------------------------------------------------------------
1 | # Uncomment below and add Slack bot app creds in the adjacent file
2 | # vault_alertmanager.yml for Slack integration:
3 | #
4 | # alertmanager_slack_integration:
5 | #   channel: '#alerts'
6 | #   app_creds: "{% raw %}{{ vault_alertmanager_slack_integration_app_creds }}{% endraw %}"
7 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml:
--------------------------------------------------------------------------------
1 | basic_users_users:
2 |   - name: demo_user
3 |     password: "{% raw %}{{ vault_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}{% endraw %}" # idempotent
4 |     uid: 1005
5 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml:
--------------------------------------------------------------------------------
1 | grafana_auth_anonymous: true


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml:
--------------------------------------------------------------------------------
1 | hpctests_user: demo_user
2 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml:
--------------------------------------------------------------------------------
1 | # Add a bot token here THEN VAULT-ENCRYPT this file!
2 | 
3 | #vault_alertmanager_slack_integration_app_creds: '<bot_token>'
4 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/inventory/groups:
--------------------------------------------------------------------------------
1 | ../../../common/layouts/everything


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """ opentofu external data program to list baremetal nodes
 3 | 
 4 |     Example usage:
 5 | 
 6 |         data "external" "example" {
 7 |             program = [this_file]
 8 |         }
 9 | 
10 |     The external data resource's result attribute then contains a mapping of
11 |     Ironic node names to their UUIDs.
12 | 
13 |     An empty list is returned if:
14 |     - There are no baremetal nodes
15 |     - The listing fails for any reason, e.g.
16 |         - there is no baremetal service
17 |         - admin credentials are required and are not provided
18 | """
19 | 
20 | import openstack
21 | import json
22 | 
23 | nodes = []
24 | proxy = None
25 | output = {}
26 | conn = openstack.connection.from_config()
27 | try:
28 |     proxy = getattr(conn, 'baremetal', None)
29 | except Exception:
30 |     pass
31 | if proxy is not None:
32 |     nodes = proxy.nodes()
33 | for node in nodes:
34 |     output[node.name] = node.id
35 | print(json.dumps(output))
36 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf:
--------------------------------------------------------------------------------
1 | data "external" "baremetal_nodes" {
2 |   # returns an empty map if cannot list baremetal nodes
3 |   program = ["${path.module}/baremetal-node-list.py"]
4 |   query = {}
5 | }
6 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf:
--------------------------------------------------------------------------------
 1 | resource "local_file" "hosts" {
 2 |   content  = templatefile("${path.module}/inventory.tpl",
 3 |                           {
 4 |                             "cluster_name": var.cluster_name,
 5 |                             "cluster_domain_suffix": var.cluster_domain_suffix,
 6 |                             "control": openstack_compute_instance_v2.control
 7 |                             "login_groups": module.login
 8 |                             "compute_groups": module.compute
 9 |                             "state_dir": var.state_dir
10 |                             "cluster_home_volume": var.home_volume_provisioning != "none"
11 |                           },
12 |                           )
13 |   filename = "../inventory/hosts.yml"
14 | }
15 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/tofu/main.tf:
--------------------------------------------------------------------------------
 1 | terraform {
 2 |   required_version = ">= 1.7" # templatestring() function
 3 |   required_providers {
 4 |     openstack = {
 5 |       source = "terraform-provider-openstack/openstack"
 6 |       version = "~>3.0.0"
 7 |     }
 8 |   }
 9 | }
10 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/tofu/network.tf:
--------------------------------------------------------------------------------
 1 | 
 2 | data "openstack_networking_network_v2" "cluster_net" {
 3 | 
 4 |   for_each = {for net in var.cluster_networks: net.network => net}
 5 | 
 6 |   name = each.value.network
 7 | }
 8 | 
 9 | data "openstack_networking_subnet_v2" "cluster_subnet" {
10 | 
11 |   for_each = {for net in var.cluster_networks: net.network => net}
12 | 
13 |   name = each.value.subnet
14 | }
15 | 
16 | data "openstack_identity_auth_scope_v3" "scope" {
17 |   # This is an arbitrary name which is only used as a unique identifier so an
18 |   # actual token isn't used as the ID.
19 |   name = "scope"
20 | }
21 | 
22 | data "openstack_networking_secgroup_v2" "login" {
23 |   for_each = toset(var.login_security_groups)
24 | 
25 |   name = each.key
26 |   tenant_id = data.openstack_identity_auth_scope_v3.scope.project_id
27 | }
28 | 
29 | data "openstack_networking_secgroup_v2" "nonlogin" {
30 |   for_each = toset(var.nonlogin_security_groups)
31 | 
32 |   name = each.key
33 |   tenant_id = data.openstack_identity_auth_scope_v3.scope.project_id
34 | }
35 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/main.tf:
--------------------------------------------------------------------------------
 1 | terraform {
 2 |   required_version = ">= 0.14"
 3 |   required_providers {
 4 |     openstack = {
 5 |       source = "terraform-provider-openstack/openstack"
 6 |       version = "~>3.0.0"
 7 |     }
 8 |   }
 9 | }
10 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/network.tf:
--------------------------------------------------------------------------------
 1 | 
 2 | data "openstack_networking_network_v2" "network" {
 3 | 
 4 |   for_each = {for net in var.networks: net.network => net}
 5 | 
 6 |   name = each.value.network
 7 | }
 8 | 
 9 | data "openstack_networking_subnet_v2" "subnet" {
10 | 
11 |   for_each = {for net in var.networks: net.network => net}
12 | 
13 |   name = each.value.subnet
14 | }
15 | 


--------------------------------------------------------------------------------
/environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """ opentofu external data program to load inventory string variables from
 3 |     a (possibly vault-encrypted) secrets file.
 4 | 
 5 |     Example usage:
 6 | 
 7 |         data "external" "example" {
 8 |             program = [this_file]
 9 | 
10 |             query = {
11 |                 path = "${path.module}/../inventory/group_vars/all/secrets.yml"
12 |             }
13 |         }
14 | 
15 |     The external data resource's result attribute then contains a mapping of
16 |     variable names to values.
17 | 
18 |     NB: Only keys/values where values are strings are returned, in line with
19 |     the external program protocol.
20 | 
21 |     NB: This approach is better than e.g. templating inventory vars as the
22 |     inventory doesn't need to be valid, which is helpful when opentofu will
23 |     template out hosts/groups.
24 | """
25 | 
26 | import sys, json, subprocess, yaml
27 | input = sys.stdin.read()
28 | secrets_path = json.loads(input)['path']
29 | 
30 | with open(secrets_path) as f:
31 |     header = f.readline()
32 |     if header.startswith('$ANSIBLE_VAULT'):
33 |         cmd = ['ansible-vault', 'view', secrets_path]
34 |         ansible = subprocess.run(cmd, capture_output=True, text=True)
35 |         contents = ansible.stdout
36 |     else:
37 |         contents = f.read()
38 | 
39 | data = yaml.safe_load(contents)
40 | 
41 | output = {}
42 | for k, v in data.items():
43 |     if isinstance(v, str):
44 |         output[k] = v
45 | print(json.dumps(output))
46 | 


--------------------------------------------------------------------------------
/packer/.gitignore:
--------------------------------------------------------------------------------
1 | packer_cache
2 | roles
3 | output_*
4 | *.gz
5 | 


--------------------------------------------------------------------------------
/packer/ansible-inventory.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # NOTE: This allows to make use of the ANSIBLE_CONFIG environment variable
4 | 
5 | ansible-inventory --list --export
6 | 


--------------------------------------------------------------------------------
/packer/openhpc_extravars.yml:
--------------------------------------------------------------------------------
1 | workaround_ansible_issue_61497: yes # extravars files can't be empty
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | ansible==6.7.0 # cloudalchemy.prometheus uses ansible.builtin.include, removed in ansible-core==2.16 => ansible==9
 2 | openstacksdk
 3 | python-openstackclient==6.6.1 # v7.0.0 has a bug re. rebuild
 4 | python-manilaclient
 5 | python-ironicclient
 6 | jmespath
 7 | passlib[bcrypt]==1.7.4
 8 | cookiecutter
 9 | selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3
10 | netaddr
11 | matplotlib
12 | pulp-cli==0.23.2
13 | beautifulsoup4==4.13.3
14 | 


--------------------------------------------------------------------------------