├── .github ├── CODEOWNERS ├── bin │ ├── create-merge-branch.sh │ └── get-s3-image.sh └── workflows │ ├── extra.yml │ ├── fatimage.yml │ ├── nightly-cleanup.yml │ ├── nightlybuild.yml │ ├── release-image.yml │ ├── s3-image-sync.yml │ ├── stackhpc.yml │ ├── trivyscan.yml │ ├── upgrade-check.yml.sample │ └── upload-release-image.yml.sample ├── .gitignore ├── README.md ├── ansible.cfg ├── ansible ├── .gitignore ├── adhoc │ ├── backup-keytabs.yml │ ├── cudatests.yml │ ├── deploy-pulp.yml │ ├── generate-passwords.yml │ ├── hpctests.yml │ ├── rebuild-via-slurm.yml │ ├── rebuild.yml │ ├── restart-slurm.yml │ ├── sync-pulp.yml │ └── update-packages.yml ├── bootstrap.yml ├── ci │ ├── check_eessi.yml │ ├── check_grafana.yml │ ├── check_sacct_hpctests.yml │ ├── check_slurm.yml │ ├── delete_images.yml │ ├── get_image_ids.yml │ ├── library │ │ └── grafana_elasticsearch_query.py │ ├── output_vars.yml │ ├── retrieve_inventory.yml │ └── update_timestamps.yml ├── cleanup.yml ├── disable-repos.yml ├── extras.yml ├── fatimage.yml ├── filesystems.yml ├── filter_plugins │ └── utils.py ├── iam.yml ├── library │ ├── latest_timestamps.py │ └── user_namespace_facts.py ├── monitoring.yml ├── noop.yml ├── portal.yml ├── roles │ ├── alertmanager │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── configure.yml │ │ │ └── install.yml │ │ └── templates │ │ │ ├── alertmanager-web.yml.j2 │ │ │ ├── alertmanager.service.j2 │ │ │ └── alertmanager.yml.j2 │ ├── basic_users │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── filter_plugins │ │ │ └── filter_keys.py │ │ ├── library │ │ │ └── terminate_user_sessions.py │ │ └── tasks │ │ │ └── main.yml │ ├── block_devices │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── library │ │ │ └── block_devices.py │ │ └── tasks │ │ │ └── main.yml │ ├── cacerts │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── configure.yml │ │ │ ├── export.yml │ │ │ └── main.yml │ ├── cluster_infra │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ ├── outputs.tf.j2 │ │ │ ├── providers.tf.j2 │ │ │ └── resources.tf.j2 │ ├── compute_init │ │ ├── README.md │ │ ├── files │ │ │ └── compute-init.yml │ │ ├── tasks │ │ │ ├── export.yml │ │ │ └── install.yml │ │ └── templates │ │ │ └── hostvars.yml.j2 │ ├── cuda │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── install.yml │ │ │ ├── runtime.yml │ │ │ └── samples.yml │ ├── dnf_repos │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── disable_repos.yml │ │ │ └── set_repos.yml │ ├── doca │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── install-kernel-devel.yml │ │ │ ├── install.yml │ │ │ └── main.yml │ ├── eessi │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yaml │ │ └── tasks │ │ │ └── main.yaml │ ├── etc_hosts │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ └── hosts.j2 │ ├── fail2ban │ │ ├── README.md │ │ ├── handlers │ │ │ └── main.yml │ │ ├── meta │ │ │ └── main.yml │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ └── jail.local.j2 │ ├── filebeat │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── install.yml │ │ │ ├── main.yml │ │ │ ├── runtime.yml │ │ │ └── validate.yml │ │ └── templates │ │ │ └── filebeat.service.j2 │ ├── firewalld │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ ├── 
meta │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── install.yml │ │ │ ├── main.yml │ │ │ └── runtime.yml │ │ └── vars │ │ │ └── main.yml │ ├── freeipa │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── addhost.yml │ │ │ ├── backup-keytabs.yml │ │ │ ├── client-install.yml │ │ │ ├── enrol.yml │ │ │ ├── server.yml │ │ │ ├── users.yml │ │ │ └── validate.yml │ ├── gateway │ │ ├── README.md │ │ ├── files │ │ │ └── gateway-init.yml │ │ └── tasks │ │ │ └── main.yml │ ├── grafana-dashboards │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ ├── openhpc-slurm.json │ │ │ └── slurm-jobs.json │ │ └── tasks │ │ │ └── main.yml │ ├── hpctests │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ ├── mpi_nxnlatbw.c │ │ │ └── plot_imb_pingpong.py │ │ ├── handlers │ │ │ └── main.yml │ │ ├── library │ │ │ ├── hpl_pq.py │ │ │ ├── plot_nxnlatbw.py │ │ │ ├── read_imb_pingpong.py │ │ │ └── slurm_node_info.py │ │ ├── meta │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── build-hpl.yml │ │ │ ├── hpl-solo.yml │ │ │ ├── main.yml │ │ │ ├── pingmatrix.yml │ │ │ ├── pingpong.yml │ │ │ └── setup.yml │ │ ├── templates │ │ │ ├── HPL.dat.j2 │ │ │ ├── hpl-build.sh.j2 │ │ │ ├── hpl-solo.sh.j2 │ │ │ ├── pingmatrix.sh.j2 │ │ │ └── pingpong.sh.j2 │ │ ├── tests │ │ │ ├── inventory │ │ │ └── test.yml │ │ └── vars │ │ │ └── main.yml │ ├── k3s │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── agent-runtime.yml │ │ │ ├── install.yml │ │ │ └── server-runtime.yml │ │ └── templates │ │ │ ├── k3s-agent.service.env.j2 │ │ │ └── k3s.service.env.j2 │ ├── k9s │ │ └── tasks │ │ │ └── main.yml │ ├── lustre │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── configure.yml │ │ │ ├── install.yml │ │ │ └── validate.yml │ │ └── templates │ │ │ └── lnet.conf.j2 │ ├── mysql │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── configure.yml │ │ │ ├── install.yml │ │ │ └── main.yml │ │ └── templates │ │ │ └── mysql.service.j2 │ ├── ofed │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── install.yml │ │ │ └── main.yml │ ├── openondemand │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ ├── jupyter_requirements.txt │ │ │ └── missing_home_directory.html │ │ ├── tasks │ │ │ ├── config_changes.yml │ │ │ ├── exporter.yml │ │ │ ├── jupyter_compute.yml │ │ │ ├── main.yml │ │ │ ├── pam_auth.yml │ │ │ ├── validate.yml │ │ │ └── vnc_compute.yml │ │ └── templates │ │ │ ├── dashboard_app_links.yml.j2 │ │ │ ├── files_shortcuts.rb.j2 │ │ │ └── grid-mapfile.j2 │ ├── opensearch │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── archive_data.yml │ │ │ ├── certs.yml │ │ │ ├── install.yml │ │ │ ├── migrate-opendistro.yml │ │ │ └── runtime.yml │ │ └── templates │ │ │ ├── opensearch.service.j2 │ │ │ └── opensearch.yml.j2 │ ├── passwords │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── main.yml │ │ │ └── validate.yml │ │ └── templates │ │ │ └── passwords.yml │ ├── persist_hostkeys │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ ├── persist_openhpc_secrets │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ └── openhpc_secrets.fact │ ├── podman │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── config.yml │ │ │ └── prereqs.yml │ ├── proxy │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ ├── pulp_site │ │ ├── .gitignore │ │ ├── defaults │ │ │ └── main.yml │ │ 
├── filter_plugins │ │ │ └── pulp-list-filters.py │ │ ├── tasks │ │ │ ├── install.yml │ │ │ └── sync.yml │ │ └── templates │ │ │ ├── cli.toml.j2 │ │ │ └── settings.py.j2 │ ├── rebuild │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── main.yml │ │ │ ├── rebuild.yml │ │ │ └── rebuild_partition.yml │ ├── resolv_conf │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ └── NetworkManager-dns-none.conf │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ └── resolv.conf.j2 │ ├── slurm_exporter │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── install.yml │ │ │ └── main.yml │ ├── slurm_stats │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ ├── slurm_tools │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ ├── squid │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── configure.yml │ │ │ ├── install.yml │ │ │ └── main.yml │ │ └── templates │ │ │ └── squid.conf.j2 │ ├── sshd │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ ├── configure.yml │ │ │ ├── export.yml │ │ │ └── main.yml │ │ └── templates │ │ │ └── sshd.conf.j2 │ ├── sssd │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── configure.yml │ │ │ ├── export.yml │ │ │ ├── install.yml │ │ │ └── main.yml │ ├── systemd │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ ├── tuned │ │ ├── README.md │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ ├── configure.yml │ │ │ ├── install.yml │ │ │ └── main.yml │ └── zenith_proxy │ │ ├── defaults │ │ └── main.yml │ │ ├── files │ │ └── podman-pod-infra-attach.sh │ │ ├── tasks │ │ └── main.yml │ │ └── templates │ │ ├── client.service.j2 │ │ ├── mitm.service.j2 │ │ ├── pod.service.j2 │ │ └── zenith-client.yaml.j2 ├── site.yml ├── slurm.yml └── validate.yml ├── dev ├── ansible-ssh ├── delete-cluster.py ├── extract_logs.py ├── image-share.sh ├── output_manifest.py └── setup-env.sh ├── docs ├── README.md ├── adding-functionality.md ├── alerting.md ├── chrony.md ├── ci.md ├── environments.md ├── experimental │ ├── compute-init.md │ ├── pulp.md │ └── slurm-controlled-rebuild.md ├── image-build.md ├── k3s.README.md ├── monitoring-and-logging.md ├── networks.md ├── openondemand.md ├── operations.md ├── persistent-state.md ├── production.md ├── screenshots │ └── grafana │ │ ├── dashboard-node-exporter.png │ │ ├── dashboard-openhpc-slurm-jobs.png │ │ ├── dashboard-openhpc-slurm.png │ │ └── grafana-slurm-jobs-linking-to-node-exporter.png ├── sequence.md ├── site │ └── README.md └── upgrades.md ├── environments ├── .caas │ ├── README.md │ ├── ansible.cfg │ ├── assets │ │ └── ood-icon.png │ ├── hooks │ │ ├── .gitkeep │ │ ├── post.yml │ │ └── pre.yml │ ├── inventory │ │ ├── group_vars │ │ │ ├── all │ │ │ │ ├── .gitkeep │ │ │ │ ├── basic_users.yml │ │ │ │ ├── cluster.yml │ │ │ │ ├── grafana.yml │ │ │ │ ├── hpctests.yml │ │ │ │ ├── manila.yml │ │ │ │ ├── nfs.yml │ │ │ │ ├── openhpc.yml │ │ │ │ ├── openondemand.yml │ │ │ │ ├── prometheus.yml │ │ │ │ └── zenith.yml │ │ │ └── openstack.yml │ │ ├── groups │ │ └── hosts │ └── ui-meta │ │ ├── slurm-infra-fast-volume-type.yml │ │ ├── slurm-infra-manila-home.yml │ │ └── slurm-infra.yml ├── .stackhpc │ ├── .gitignore │ ├── ARCUS.pkrvars.hcl │ ├── 
LEAFCLOUD.pkrvars.hcl │ ├── SMS.pkrvars.hcl │ ├── activate │ ├── ansible.cfg │ ├── bastion_fingerprints │ ├── cacerts │ │ └── myCA.pem │ ├── hooks │ │ ├── post-bootstrap.yml │ │ └── pre.yml │ ├── inventory │ │ ├── everything │ │ ├── extra_groups │ │ └── group_vars │ │ │ ├── all │ │ │ ├── .gitkeep │ │ │ ├── basic_users.yml │ │ │ ├── bastion.yml │ │ │ ├── freeipa.yml │ │ │ ├── grafana.yml │ │ │ ├── hpctests.yml │ │ │ ├── manila.yml │ │ │ ├── openhpc.yml │ │ │ ├── openondemand.yml │ │ │ └── podman.yml │ │ │ └── builder.yml │ └── tofu │ │ ├── ARCUS.tfvars │ │ ├── LEAFCLOUD-dev.tfvars │ │ ├── LEAFCLOUD.tfvars │ │ ├── SMS.tfvars │ │ ├── cluster_image.auto.tfvars.json │ │ └── main.tf ├── README.md ├── common │ ├── .gitignore │ ├── README.md │ ├── files │ │ ├── filebeat │ │ │ └── filebeat.yml │ │ ├── grafana │ │ │ └── grafana.repo.j2 │ │ ├── opensearch │ │ │ └── internal_users.yml.j2 │ │ └── prometheus │ │ │ └── rules │ │ │ ├── node-exporter.rules │ │ │ ├── precompute.rules │ │ │ └── slurm.rules │ ├── inventory │ │ ├── group_vars │ │ │ ├── all │ │ │ │ ├── alertmanager.yml │ │ │ │ ├── ansible_init.yml │ │ │ │ ├── basic_users.yml │ │ │ │ ├── defaults.yml │ │ │ │ ├── filebeat.yml │ │ │ │ ├── firewalld.yml │ │ │ │ ├── freeipa_server.yml │ │ │ │ ├── grafana.yml │ │ │ │ ├── hpctests.yml │ │ │ │ ├── k3s.yml │ │ │ │ ├── manila.yml │ │ │ │ ├── mysql.yml │ │ │ │ ├── nfs.yml │ │ │ │ ├── openhpc.yml │ │ │ │ ├── openondemand.yml │ │ │ │ ├── opensearch.yml │ │ │ │ ├── os-manila-mount.yml │ │ │ │ ├── podman.yml │ │ │ │ ├── prometheus.yml │ │ │ │ ├── proxy.yml │ │ │ │ ├── pulp.yml │ │ │ │ ├── selinux.yml │ │ │ │ ├── slurm_exporter.yml │ │ │ │ ├── squid.yml │ │ │ │ ├── sshd.yaml │ │ │ │ ├── systemd.yml │ │ │ │ ├── timestamps.yml │ │ │ │ └── update.yml │ │ │ └── builder │ │ │ │ └── defaults.yml │ │ └── groups │ └── layouts │ │ ├── README.md │ │ ├── everything │ │ └── minimal └── skeleton │ ├── cookiecutter.json │ └── {{cookiecutter.environment}} │ ├── README.md │ ├── activate │ ├── ansible.cfg │ ├── hooks │ └── .gitkeep │ ├── inventory │ ├── group_vars │ │ └── all │ │ │ ├── .gitkeep │ │ │ ├── alertmanager.yml │ │ │ ├── basic_users.yml │ │ │ ├── grafana.yml │ │ │ ├── hpctests.yml │ │ │ └── vault_alertmanager.yml │ └── groups │ └── tofu │ ├── baremetal-node-list.py │ ├── compute.tf │ ├── control.tf │ ├── data.tf │ ├── inventory.tf │ ├── inventory.tpl │ ├── login.tf │ ├── main.tf │ ├── network.tf │ ├── node_group │ ├── main.tf │ ├── network.tf │ ├── nodes.tf │ └── variables.tf │ ├── read-inventory-secrets.py │ ├── variables.tf │ └── volumes.tf ├── packer ├── .gitignore ├── ansible-inventory.sh ├── openhpc_extravars.yml └── openstack.pkr.hcl ├── requirements.txt └── requirements.yml /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @stackhpc/batch 2 | -------------------------------------------------------------------------------- /.github/bin/get-s3-image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ##### 4 | # This script looks for an image in OpenStack and if not found, downloads from 5 | # S3 bucket, and then uploads to OpenStack 6 | ##### 7 | 8 | set -ex 9 | 10 | image_name=$1 11 | bucket_name=$2 12 | echo "Checking if image $image_name exists in OpenStack" 13 | image_exists=$(openstack image list --name "$image_name" -f value -c Name) 14 | 15 | if [ -n "$image_exists" ]; then 16 | echo "Image $image_name already exists in OpenStack." 
17 | else 18 | echo "Image $image_name not found in OpenStack. Getting it from S3." 19 | 20 | wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/$bucket_name/$image_name --progress=dot:giga 21 | 22 | echo "Uploading image $image_name to OpenStack..." 23 | openstack image create --file $image_name --disk-format qcow2 $image_name --progress 24 | 25 | echo "Image $image_name has been uploaded to OpenStack." 26 | fi -------------------------------------------------------------------------------- /.github/workflows/release-image.yml: -------------------------------------------------------------------------------- 1 | name: Release images 2 | on: 3 | workflow_dispatch: 4 | release: 5 | types: 6 | - published # should work for both pre-releases and releases 7 | env: 8 | IMAGE_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json 9 | jobs: 10 | ci-image-release: 11 | name: ci-image-release 12 | runs-on: ubuntu-22.04 13 | concurrency: ${{ github.workflow }}-${{ github.ref }} 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | build: 18 | - RL8 19 | - RL9 20 | steps: 21 | - uses: actions/checkout@v2 22 | 23 | - name: Write s3cmd configuration 24 | run: echo "${{ secrets.ARCUS_S3_CFG }}" > ~/.s3cfg 25 | 26 | - name: Install s3cmd 27 | run: | 28 | sudo apt-get update 29 | sudo apt-get --yes install s3cmd 30 | 31 | - name: Retrieve image name 32 | run: | 33 | TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") 34 | echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" 35 | 36 | - name: Copy image from pre-release to release bucket 37 | run: s3cmd cp s3://openhpc-images-prerelease/${{ env.TARGET_IMAGE }} s3://openhpc-images 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | terraform.tfstate* 2 | .terraform 3 | config-drive.iso 4 | venv 5 | *.pyc 6 | packer/openhpc2 7 | .vscode 8 | -------------------------------------------------------------------------------- /ansible.cfg: -------------------------------------------------------------------------------- 1 | # Only used for Azimuth running the caas environment 2 | [defaults] 3 | any_errors_fatal = True 4 | gathering = smart 5 | forks = 30 6 | host_key_checking = False 7 | remote_tmp = /tmp 8 | collections_path = ansible/collections 9 | roles_path = ansible/roles 10 | filter_plugins = ansible/filter_plugins 11 | callbacks_enabled = ansible.posix.profile_tasks 12 | 13 | [ssh_connection] 14 | ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null 15 | pipelining = True 16 | # This is important because we are using one of the hosts in the play as a jump host 17 | # This ensures that if the proxy connection is interrupted, rendering the other hosts 18 | # unreachable, the connection is retried instead of failing the entire play 19 | retries = 10 20 | -------------------------------------------------------------------------------- /ansible/adhoc/backup-keytabs.yml: -------------------------------------------------------------------------------- 1 | # Use ONE of the following tags on this playbook: 2 | # - retrieve: copies keytabs out of the state volume to the environment 3 | # - deploy: copies keytabs from the environment to the state volume 4 | 5 | - hosts: freeipa_client 6 | become: yes 7 | gather_facts: no 8 | tasks: 9 | - import_role: 10 | 
name: freeipa 11 | tasks_from: backup-keytabs.yml 12 | -------------------------------------------------------------------------------- /ansible/adhoc/cudatests.yml: -------------------------------------------------------------------------------- 1 | - hosts: cuda 2 | become: yes 3 | gather_facts: yes 4 | tags: cuda_samples 5 | tasks: 6 | - import_role: 7 | name: cuda 8 | tasks_from: samples.yml 9 | -------------------------------------------------------------------------------- /ansible/adhoc/deploy-pulp.yml: -------------------------------------------------------------------------------- 1 | # Usage: ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server=" 2 | 3 | - name: Add temporary pulp server host 4 | hosts: localhost 5 | tasks: 6 | - ansible.builtin.add_host: 7 | name: "{{ pulp_server }}" 8 | group: "_pulp_host" 9 | 10 | - name: Install pulp on server and add to config 11 | become: yes 12 | hosts: _pulp_host 13 | tasks: 14 | - name: Install pulp 15 | ansible.builtin.include_role: 16 | name: pulp_site 17 | tasks_from: install.yml 18 | public: true 19 | 20 | - name: Print Pulp endpoint 21 | become: no 22 | debug: 23 | msg: | 24 | Server configured, override 'appliances_pulp_url' with 25 | appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}" 26 | in your environments 27 | -------------------------------------------------------------------------------- /ansible/adhoc/generate-passwords.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Generate passwords.yml 4 | hosts: localhost 5 | gather_facts: false 6 | tasks: 7 | - name: Include password generation role 8 | include_role: 9 | name: passwords -------------------------------------------------------------------------------- /ansible/adhoc/hpctests.yml: -------------------------------------------------------------------------------- 1 | # An MPI-based test suite for Slurm appliance clusters. Safe to use on in-production clusters. 2 | # See ansible/roles/hpctests/README.md for details and options. 3 | # Relies on installed packages in appliance defaults - see openhpc variables. 4 | 5 | --- 6 | 7 | - hosts: hpctests[0] # TODO: might want to make which node is used selectable? 8 | become: false 9 | gather_facts: false 10 | tasks: 11 | - import_role: 12 | name: hpctests 13 | -------------------------------------------------------------------------------- /ansible/adhoc/rebuild-via-slurm.yml: -------------------------------------------------------------------------------- 1 | # Rebuild compute nodes via slurm. 2 | # Nodes will be rebuilt if `image_id` in inventory is different to the 3 | # currently-provisioned image. Otherwise they are rebooted. 4 | 5 | # Example: 6 | # ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml 7 | 8 | # See docs/slurm-controlled-rebuild.md. 9 | 10 | - hosts: login 11 | run_once: true 12 | gather_facts: no 13 | tasks: 14 | - name: Run slurm-controlled rebuild 15 | import_role: 16 | name: rebuild 17 | tasks_from: rebuild.yml 18 | -------------------------------------------------------------------------------- /ansible/adhoc/rebuild.yml: -------------------------------------------------------------------------------- 1 | # Rebuild hosts with a specified image from OpenStack. 2 | # 3 | # Use ansible's -v output to see output. 4 | # Use --limit to control which hosts to rebuild (either specific hosts or the _ groups defining partitions). 
5 | # Optionally, supply `-e rebuild_image=` to define a specific image, otherwise the current image is reused. 6 | # 7 | # NOTE: If a hostvar `instance_id` is defined this is used to select hosts. Otherwise the hostname is used and this must be unique, which may not be the case e.g. if using identically-named staging and production hosts. 8 | # 9 | # Example: 10 | # ansible-playbook -v --limit ohpc_compute ansible/adhoc/rebuild.yml -e rebuild_image=openhpc_v2.3 11 | 12 | - hosts: cluster 13 | become: no 14 | gather_facts: no 15 | tasks: 16 | - command: "openstack server rebuild {{ instance_id | default(inventory_hostname) }}{% if rebuild_image is defined %} --image {{ rebuild_image }}{% endif %}" 17 | delegate_to: localhost 18 | - wait_for_connection: 19 | delay: 60 20 | timeout: 600 21 | 22 | -------------------------------------------------------------------------------- /ansible/adhoc/restart-slurm.yml: -------------------------------------------------------------------------------- 1 | # Restart all slurm daemons e.g. after changing configuration. Note that: 2 | # - `scontrol reconfigure` will handle most reconfiguration - see https://slurm.schedmd.com/scontrol.html#OPT_reconfigure 3 | # for which options need a restart 4 | # - Adding or removing nodes by changing the `openhpc_` configuration and rerunning ansible/site.yml will automatically 5 | # restart daemons as required. 6 | 7 | - hosts: compute,login 8 | become: yes 9 | gather_facts: no 10 | tasks: 11 | - service: 12 | name: slurmd 13 | state: stopped 14 | 15 | - hosts: control 16 | become: yes 17 | gather_facts: no 18 | tasks: 19 | - service: 20 | name: slurmctld 21 | state: restarted 22 | 23 | - hosts: compute,login 24 | become: yes 25 | gather_facts: no 26 | tasks: 27 | - service: 28 | name: slurmd 29 | state: started 30 | -------------------------------------------------------------------------------- /ansible/adhoc/sync-pulp.yml: -------------------------------------------------------------------------------- 1 | - hosts: localhost 2 | tasks: 3 | - ansible.builtin.include_role: 4 | name: pulp_site 5 | tasks_from: sync.yml 6 | vars: 7 | pulp_site_target_arch: "x86_64" 8 | pulp_site_target_distribution: "rocky" 9 | pulp_site_target_distribution_version: "9.5" 10 | pulp_site_target_distribution_version_major: "9" 11 | -------------------------------------------------------------------------------- /ansible/adhoc/update-packages.yml: -------------------------------------------------------------------------------- 1 | - hosts: update 2 | become: yes 3 | gather_facts: false 4 | tasks: 5 | - name: Update selected packages 6 | yum: 7 | name: "{{ update_name }}" 8 | state: "{{ update_state }}" 9 | exclude: "{{ update_exclude }}" 10 | disablerepo: "{{ update_disablerepo }}" 11 | register: updates 12 | - name: Log updated packages 13 | copy: 14 | content: "{{ updates.results | join('\n') }}" 15 | dest: "{{ update_log_path }}" 16 | delegate_to: localhost 17 | - debug: 18 | msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details" 19 | -------------------------------------------------------------------------------- /ansible/ci/check_grafana.yml: -------------------------------------------------------------------------------- 1 | # Checks Slurm jobs from hpctests are shown in Grafana. 2 | # Can't actually check the dashboard programatically so this queries the datasource used by the dashboard instead. 
3 | 4 | - hosts: control # so proxying etc is irrelevant 5 | gather_facts: no 6 | become: no 7 | tasks: 8 | - name: Wait for slurm-stats file to exist (run by cron) 9 | ansible.builtin.wait_for: 10 | path: /var/log/slurm-stats/finished_jobs.json 11 | timeout: 315 # slurm stats cron job runs every 5 mins 12 | 13 | - name: Query grafana for expected hpctests jobs 14 | grafana_elasticsearch_query: 15 | grafana_url: http://{{ grafana_api_address }}:{{ grafana_port }} 16 | grafana_username: grafana 17 | grafana_password: "{{ vault_grafana_admin_password }}" 18 | datasource: slurmstats 19 | index_pattern: filebeat-* 20 | register: _slurm_stats_jobs 21 | until: _expected_jobs | difference(_found_jobs) == [] 22 | retries: 60 23 | delay: 5 24 | vars: 25 | _found_jobs: "{{ _slurm_stats_jobs.docs | map(attribute='JobName', default='(json error in slurmstats data)') }}" 26 | _expected_jobs: ['pingpong.sh'] 27 | -------------------------------------------------------------------------------- /ansible/ci/check_sacct_hpctests.yml: -------------------------------------------------------------------------------- 1 | - hosts: control 2 | gather_facts: false 3 | become: true 4 | vars: 5 | sacct_stdout_expected: |- # based on CI running hpctests as the first job 6 | JobID,JobName,State 7 | 1,pingpong.sh,COMPLETED 8 | tasks: 9 | - name: Get info for ended jobs 10 | shell: 11 | cmd: sacct --format=jobid,jobname,state --allocations --parsable2 --delimiter=, --starttime=now-1days --endtime=now 12 | # by default start/end time is midnight/now which is not robust 13 | changed_when: false 14 | register: sacct 15 | - name: Check info for ended jobs 16 | assert: 17 | that: sacct_stdout_expected in sacct.stdout 18 | fail_msg: | 19 | Expected: 20 | --{{ sacct_stdout_expected }}-- 21 | Got: 22 | --{{ sacct.stdout }}-- 23 | success_msg: sacct shows hpctests jobs as first jobs in list 24 | -------------------------------------------------------------------------------- /ansible/ci/check_slurm.yml: -------------------------------------------------------------------------------- 1 | - hosts: login:!builder # won't have a slurm control daemon when in build 2 | become: no 3 | gather_facts: false 4 | tasks: 5 | - name: Run sinfo 6 | shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name 7 | register: sinfo 8 | changed_when: false 9 | until: sinfo.stdout_lines == expected_sinfo 10 | retries: 200 11 | delay: 5 12 | vars: 13 | expected_sinfo: 14 | - " extra up 60-00:00:00 0 n/a" # empty partition 15 | - "{{ openhpc_cluster_name }}-compute-[0-1] standard* up 60-00:00:00 2 idle" 16 | -------------------------------------------------------------------------------- /ansible/ci/delete_images.yml: -------------------------------------------------------------------------------- 1 | - hosts: login:!builder 2 | become: no 3 | gather_facts: no 4 | tasks: 5 | - import_tasks: get_image_ids.yml 6 | 7 | - name: Delete images 8 | shell: 9 | cmd: | 10 | openstack image delete {{ item.artifact_id }} 11 | delegate_to: localhost 12 | loop: "{{ manifest['builds'] }}" 13 | -------------------------------------------------------------------------------- /ansible/ci/get_image_ids.yml: -------------------------------------------------------------------------------- 1 | - name: Read packer build manifest 2 | set_fact: 3 | manifest: "{{ lookup('file', manifest_path) | from_json }}" 4 | vars: 5 | manifest_path: "{{ lookup('env', 
'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" 6 | delegate_to: localhost 7 | 8 | - name: Get latest image builds 9 | set_fact: 10 | login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" 11 | compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" 12 | control_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'control'}) | last }}" 13 | -------------------------------------------------------------------------------- /ansible/ci/output_vars.yml: -------------------------------------------------------------------------------- 1 | # Output specific hostvars to a file in a form which can be sourced by bash 2 | # NB: obviously the keys and values for the hostvars need to be suitable bash variables 3 | - hosts: "{{ output_vars_hosts }}" 4 | gather_facts: no 5 | tasks: 6 | - copy: 7 | dest: "{{ output_vars_path }}" 8 | content: | 9 | {% for item in output_vars_items.split(',') %} 10 | export {{output_vars_prefix | default('') }}{{ item }}={{ lookup('vars', item) }} 11 | {% endfor %} 12 | delegate_to: localhost 13 | -------------------------------------------------------------------------------- /ansible/ci/retrieve_inventory.yml: -------------------------------------------------------------------------------- 1 | # Retrieve inventory from a deployed CI arcus environment by reversing arcus/inventory/hooks/pre.yml 2 | # Usage example: 3 | # ansible-playbook ansible/ci/retrieve_inventory.yml -e cluster_prefix=ci4005969475 4 | # 5 | - hosts: localhost 6 | become: no 7 | gather_facts: no 8 | vars: 9 | cluster_prefix: "{{ undef(hint='cluster_prefix must be defined') }}" # e.g. ci4005969475 10 | ci_vars_file: "{{ appliances_environment_root + '/tofu/' + lookup('env', 'CI_CLOUD') }}.tfvars" 11 | cluster_network: "{{ lookup('ansible.builtin.ini', 'cluster_net', file=ci_vars_file, type='properties') | trim('\"') }}" 12 | tasks: 13 | - name: Get control host IP 14 | set_fact: 15 | control_ip: "{{ (lookup('pipe', 'openstack server show -f json ' + cluster_prefix + '-control') | from_json)['addresses'][cluster_network][0] }}" 16 | - name: Add host into in-memory inventory 17 | add_host: 18 | name: cluster_control 19 | groups: control 20 | ansible_host: "{{ control_ip }}" 21 | 22 | - hosts: control 23 | become: yes 24 | gather_facts: no 25 | tasks: 26 | - ansible.builtin.fetch: 27 | src: "/etc/ci-config/{{ item | basename }}" 28 | dest: "{{ item }}" 29 | flat: true 30 | loop: 31 | - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts" 32 | - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml" 33 | - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml" 34 | -------------------------------------------------------------------------------- /ansible/ci/update_timestamps.yml: -------------------------------------------------------------------------------- 1 | - hosts: localhost 2 | tasks: 3 | - name: Get latest timestamps from sources 4 | latest_timestamps: 5 | repos_dict: "{{ appliances_pulp_repos }}" 6 | content_url: "https://ark.stackhpc.com/pulp/content" 7 | register: _result 8 | 9 | - name: Overwrite repo timestamps with latest 10 | ansible.builtin.copy: 11 | dest: "{{ appliances_repository_root }}/environments/common/inventory/group_vars/all/timestamps.yml" 12 | content: "{{ repo_template | to_nice_yaml(indent=2) }}" 13 | backup: true 14 | vars: 15 | repo_template: 16 | 
appliances_pulp_repos: "{{ _result.timestamps }}" 17 | -------------------------------------------------------------------------------- /ansible/disable-repos.yml: -------------------------------------------------------------------------------- 1 | - hosts: dnf_repos 2 | become: yes 3 | tasks: 4 | - name: Disable pulp repos 5 | ansible.builtin.include_role: 6 | name: dnf_repos 7 | tasks_from: disable_repos.yml 8 | when: not dnf_repos_enabled | default(false) | bool 9 | -------------------------------------------------------------------------------- /ansible/filesystems.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Setup block devices 4 | hosts: block_devices 5 | become: yes 6 | tags: block_devices 7 | tasks: 8 | - include_role: 9 | name: block_devices 10 | 11 | - name: Setup NFS 12 | hosts: nfs 13 | become: true 14 | tags: 15 | - nfs 16 | tasks: 17 | - include_role: 18 | name: stackhpc.nfs 19 | 20 | - name: Setup Manila share mounts 21 | hosts: manila 22 | become: true 23 | tags: manila 24 | tasks: 25 | - include_role: 26 | name: stackhpc.os-manila-mount 27 | 28 | - name: Setup Lustre clients 29 | hosts: lustre 30 | become: true 31 | tags: lustre 32 | tasks: 33 | - include_role: 34 | name: lustre 35 | # NB install is ONLY run in builder 36 | tasks_from: configure.yml 37 | -------------------------------------------------------------------------------- /ansible/iam.yml: -------------------------------------------------------------------------------- 1 | - hosts: freeipa_client 2 | tags: 3 | - freeipa 4 | - freeipa_server # as this is only relevant if using freeipa_server 5 | - freeipa_host 6 | gather_facts: no 7 | become: yes 8 | tasks: 9 | - name: Ensure FreeIPA client hosts are added to the FreeIPA server 10 | import_role: 11 | name: freeipa 12 | tasks_from: addhost.yml 13 | when: groups['freeipa_server'] | length > 0 14 | 15 | - hosts: freeipa_client 16 | tags: 17 | - freeipa 18 | - freeipa_client 19 | gather_facts: yes 20 | become: yes 21 | tasks: 22 | - name: Install FreeIPA client 23 | import_role: 24 | name: freeipa 25 | tasks_from: client-install.yml 26 | - name: Enrol FreeIPA client 27 | import_role: 28 | name: freeipa 29 | tasks_from: enrol.yml 30 | 31 | - hosts: freeipa_server 32 | tags: 33 | - freeipa 34 | - freeipa_server 35 | - users 36 | gather_facts: yes 37 | become: yes 38 | tasks: 39 | - name: Add FreeIPA users 40 | import_role: 41 | name: freeipa 42 | tasks_from: users.yml 43 | 44 | - hosts: sssd 45 | become: yes 46 | gather_facts: no 47 | tags: sssd 48 | tasks: 49 | - name: Configure sssd 50 | import_role: 51 | name: sssd 52 | -------------------------------------------------------------------------------- /ansible/noop.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # This file exists so that we can conditionally import a playbook. 
The path 4 | # must exist, but we can use a when conditional so that it is not actually 5 | # run 6 | 7 | - hosts: localhost 8 | gather_facts: false 9 | tasks: [] 10 | -------------------------------------------------------------------------------- /ansible/portal.yml: -------------------------------------------------------------------------------- 1 | - hosts: openondemand 2 | tags: 3 | - openondemand 4 | - openondemand_server 5 | become: yes 6 | gather_facts: yes # TODO 7 | tasks: 8 | - import_role: 9 | name: openondemand 10 | tasks_from: main.yml 11 | 12 | - hosts: openondemand_desktop 13 | tags: 14 | - openondemand 15 | - openondemand_desktop 16 | become: yes 17 | gather_facts: yes 18 | tasks: 19 | - import_role: 20 | name: openondemand 21 | tasks_from: vnc_compute.yml 22 | 23 | - hosts: openondemand_jupyter 24 | tags: 25 | - openondemand 26 | - openondemand_jupyter 27 | become: yes 28 | gather_facts: yes 29 | tasks: 30 | - import_role: 31 | name: openondemand 32 | tasks_from: jupyter_compute.yml 33 | -------------------------------------------------------------------------------- /ansible/roles/alertmanager/handlers/main.yml: -------------------------------------------------------------------------------- 1 | - name: Restart alertmanager 2 | systemd: 3 | name: alertmanager 4 | state: restarted 5 | daemon_reload: "{{ _alertmanager_service.changed | default(false) }}" 6 | when: alertmanager_started | bool 7 | -------------------------------------------------------------------------------- /ansible/roles/alertmanager/tasks/configure.yml: -------------------------------------------------------------------------------- 1 | - name: Create alertmanager directories 2 | ansible.builtin.file: 3 | path: "{{ item }}" 4 | state: directory 5 | owner: "{{ alertmanager_system_user }}" 6 | group: "{{ alertmanager_system_group }}" 7 | mode: u=rwX,go=rX 8 | loop: 9 | - "{{ alertmanager_config_file | dirname }}" 10 | - "{{ alertmanager_web_config_file | dirname }}" 11 | - "{{ alertmanager_storage_path }}" 12 | 13 | - name: Create alertmanager service file with immutable options 14 | template: 15 | src: alertmanager.service.j2 16 | dest: /usr/lib/systemd/system/alertmanager.service 17 | owner: root 18 | group: root 19 | mode: u=rw,go=r 20 | register: _alertmanager_service 21 | notify: Restart alertmanager 22 | 23 | - name: Template alertmanager config 24 | ansible.builtin.template: 25 | src: "{{ alertmanager_config_template }}" 26 | dest: "{{ alertmanager_config_file }}" 27 | owner: "{{ alertmanager_system_user }}" 28 | group: "{{ alertmanager_system_group }}" 29 | mode: u=rw,go= 30 | notify: Restart alertmanager 31 | 32 | - name: Template alertmanager web config 33 | ansible.builtin.template: 34 | src: "{{ alertmanager_web_config_template }}" 35 | dest: "{{ alertmanager_web_config_file }}" 36 | owner: "{{ alertmanager_system_user }}" 37 | group: "{{ alertmanager_system_group }}" 38 | mode: u=rw,go= 39 | notify: Restart alertmanager 40 | 41 | - meta: flush_handlers 42 | 43 | - name: Ensure alertmanager service state 44 | systemd: 45 | name: alertmanager 46 | state: "{{ 'started' if alertmanager_started | bool else 'stopped' }}" 47 | enabled: "{{ alertmanager_enabled | bool }}" 48 | -------------------------------------------------------------------------------- /ansible/roles/alertmanager/tasks/install.yml: -------------------------------------------------------------------------------- 1 | - name: Create alertmanager system user 2 | ansible.builtin.user: 3 | name: "{{ alertmanager_system_user }}" 4 | 
system: true 5 | create_home: false 6 | 7 | - name: Download alertmanager binary 8 | ansible.builtin.get_url: 9 | url: "https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version }}/alertmanager-{{ alertmanager_version }}.linux-amd64.tar.gz" 10 | dest: "{{ alertmanager_download_dest }}" 11 | owner: root 12 | group: root 13 | mode: u=rw,go= 14 | checksum: "{{ alertmanager_download_checksum }}" 15 | 16 | - name: Unpack alertmanager binary 17 | ansible.builtin.unarchive: 18 | src: "{{ alertmanager_download_dest }}" 19 | include: "alertmanager-{{ alertmanager_version }}.linux-amd64/alertmanager" 20 | dest: "{{ alertmanager_binary_dir }}" 21 | owner: root 22 | group: root 23 | mode: u=rwx,go=rx 24 | remote_src: true 25 | extra_opts: ['--strip-components=1', '--show-stored-names'] 26 | -------------------------------------------------------------------------------- /ansible/roles/alertmanager/templates/alertmanager-web.yml.j2: -------------------------------------------------------------------------------- 1 | {{ ansible_managed | comment }} 2 | 3 | {{ alertmanager_web_config_default | to_nice_yaml }} 4 | {{ alertmanager_alertmanager_web_config_extra | to_nice_yaml if alertmanager_alertmanager_web_config_extra | length > 0 else '' }} 5 | -------------------------------------------------------------------------------- /ansible/roles/alertmanager/templates/alertmanager.service.j2: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {{ ansible_managed | comment }} 5 | [Unit] 6 | Description=Prometheus Alertmanager 7 | After=network-online.target 8 | StartLimitInterval=0 9 | StartLimitIntervalSec=0 10 | 11 | [Service] 12 | Type=simple 13 | PIDFile=/run/alertmanager.pid 14 | User={{ alertmanager_system_user }} 15 | Group={{ alertmanager_system_group }} 16 | ExecReload=/bin/kill -HUP $MAINPID 17 | ExecStart={{ alertmanager_binary_dir }}/alertmanager \ 18 | --cluster.listen-address='' \ 19 | --config.file={{ alertmanager_config_file }} \ 20 | --storage.path={{ alertmanager_storage_path }} \ 21 | --data.retention={{ alertmanager_data_retention }} \ 22 | --data.maintenance-interval={{ alertmanager_data_maintenance_interval }} \ 23 | {% for address in alertmanager_web_listen_addresses %} 24 | --web.listen-address={{ address }} \ 25 | {% endfor %} 26 | --web.external-url={{ alertmanager_web_external_url }} \ 27 | --web.config.file={{ alertmanager_web_config_file }} \ 28 | {% for flag, flag_value in alertmanager_config_flags.items() %} 29 | --{{ flag }}={{ flag_value }} \ 30 | {% endfor %} 31 | 32 | SyslogIdentifier=alertmanager 33 | Restart=always 34 | RestartSec=5 35 | 36 | CapabilityBoundingSet=CAP_SET_UID 37 | LockPersonality=true 38 | NoNewPrivileges=true 39 | MemoryDenyWriteExecute=true 40 | PrivateTmp=true 41 | ProtectHome=true 42 | ReadWriteDirectories={{ alertmanager_storage_path }} 43 | RemoveIPC=true 44 | RestrictSUIDSGID=true 45 | 46 | PrivateUsers=true 47 | ProtectControlGroups=true 48 | ProtectKernelModules=true 49 | ProtectKernelTunables=yes 50 | ProtectSystem=strict 51 | 52 | [Install] 53 | WantedBy=multi-user.target 54 | -------------------------------------------------------------------------------- /ansible/roles/alertmanager/templates/alertmanager.yml.j2: -------------------------------------------------------------------------------- 1 | {{ ansible_managed | comment }} 2 | 3 | {{ alertmanager_config_default | to_nice_yaml }} 4 | {{ alertmanager_config_extra | to_nice_yaml if alertmanager_config_extra | length > 0 
else '' }} 5 | -------------------------------------------------------------------------------- /ansible/roles/basic_users/defaults/main.yml: -------------------------------------------------------------------------------- 1 | basic_users_homedir_server: "{{ groups['control'] | first }}" # no way, generally, to find the nfs_server 2 | basic_users_homedir_server_path: /exports/home 3 | basic_users_homedir_client: "{{ groups['login'] | first }}" 4 | basic_users_userdefaults: 5 | state: present # need this here so don't have to add default() everywhere 6 | generate_ssh_key: true 7 | ssh_key_comment: "{{ item.name }}" 8 | ssh_key_type: ed25519 9 | shell: "{{'/sbin/nologin' if 'control' in group_names else omit }}" 10 | basic_users_users: [] 11 | basic_users_groups: [] 12 | basic_users_override_sssd: false 13 | -------------------------------------------------------------------------------- /ansible/roles/basic_users/filter_plugins/filter_keys.py: -------------------------------------------------------------------------------- 1 | """ Filter a dict to remove specified keys """ 2 | 3 | import copy 4 | 5 | USER_MODULE_PARAMS = ('append authorization comment create_home createhome expires force generate_ssh_key group ' 6 | 'groups hidden home local login_class move_home name user non_unique password password_expire_min ' 7 | 'password_expire_max password_lock profile remove role seuser shell skeleton ssh_key_bits ' 8 | 'ssh_key_comment ssh_key_file ssh_key_passphrase ssh_key_type state system uid update_password').split() 9 | 10 | class FilterModule(object): 11 | 12 | def filters(self): 13 | return { 14 | 'filter_user_params': self.filter_user_params 15 | } 16 | 17 | def filter_user_params(self, d): 18 | ''' Return a copy of dict `d` containing only keys which are parameters for the user module''' 19 | 20 | user_dict = copy.deepcopy(d) 21 | remove_keys = set(user_dict).difference(USER_MODULE_PARAMS) 22 | for key in remove_keys: 23 | del user_dict[key] 24 | return user_dict 25 | -------------------------------------------------------------------------------- /ansible/roles/block_devices/defaults/main.yml: -------------------------------------------------------------------------------- 1 | block_devices_configurations: [{}] 2 | block_devices_partition_state: present # 'present', 'absent' (as for parted) or 'skip' 3 | block_devices_device: # Path to block device, e.g. '/dev/sda'. See community.general.parted:device and community.general.filesystem:dev 4 | block_devices_number: # Partition number, e.g 1 for /dev/sda1. See community.general.parted:number 5 | block_devices_fstype: # Filesystem type, e.g. e.g. 'ext4'. See community.general.filesystem:fstype 6 | block_devices_resizefs: no # Grow filesystem into block device space (yes or no). See community.general.filesystem:resizefs 7 | block_devices_filesystem_state: present # 'present', 'absent' (as for community.general.filesystem:state) or 'skip' 8 | block_devices_path: # Path to mount point, e.g. 
'/mnt/files' 9 | block_devices_mount_state: mounted # Mount state, see ansible.posix.mount:state 10 | block_devices_owner: # Name of owner for mounted directory (ansible.buildin.file:owner 11 | block_devices_group: # Name of group for mounted directory (ansible.buildin.file.group) 12 | # also see hostnames 13 | -------------------------------------------------------------------------------- /ansible/roles/block_devices/library/block_devices.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright: (c) 2021, StackHPC 4 | # Apache 2 License 5 | 6 | DOCUMENTATION = r''' 7 | --- 8 | module: block_devices 9 | 10 | short_description: Return block device paths by serial number. 11 | 12 | options: (none) 13 | 14 | author: 15 | - Steve Brasier (@sjpb) 16 | ''' 17 | 18 | RETURN = r''' 19 | devices: 20 | description: dict with device serial numbers as keys and full paths (e.g. /dev/sdb) as values 21 | type: dict 22 | return: always 23 | ''' 24 | 25 | import json 26 | 27 | from ansible.module_utils.basic import AnsibleModule 28 | 29 | def run_module(): 30 | module_args = dict() 31 | module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) 32 | result = {"changed": False} 33 | _, stdout, _ = module.run_command("lsblk --paths --json -O", check_rc=True) 34 | 35 | device_info = json.loads(stdout)['blockdevices'] 36 | result['devices'] = dict((item['serial'], item['name']) for item in device_info) 37 | module.exit_json(**result) 38 | 39 | def main(): 40 | run_module() 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /ansible/roles/cacerts/README.md: -------------------------------------------------------------------------------- 1 | # cacerts 2 | 3 | Configure CA certificates and trusts. 4 | 5 | ## Role variables 6 | 7 | - `cacerts_cert_dir`: Optional str. Path to directory containing certificates 8 | in PEM or DER format. Any files here will be added to the list of CAs trusted 9 | by the system. 10 | 11 | Note: This role assumes the `ca-certificates` dnf package is installed, which 12 | is the case for GenericCloud-based images. 
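As a minimal, untested sketch of applying this role (the `cacerts` group name and the play layout are assumptions for illustration, not taken from this repository's playbooks):

```yaml
# hypothetical playbook snippet - assumes target hosts are in a 'cacerts' inventory group
- hosts: cacerts
  become: true
  tasks:
    - name: Install site CA certificates into the system trust store
      import_role:
        name: cacerts
```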
13 | -------------------------------------------------------------------------------- /ansible/roles/cacerts/defaults/main.yml: -------------------------------------------------------------------------------- 1 | #cacerts_dest_dir: /etc/pki/ca-trust/source/anchors/ 2 | cacerts_cert_dir: "{{ appliances_environment_root }}/cacerts" 3 | cacerts_update: true 4 | -------------------------------------------------------------------------------- /ansible/roles/cacerts/tasks/configure.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Copy all certificates 4 | copy: 5 | src: "{{ item }}" 6 | dest: /etc/pki/ca-trust/source/anchors/ 7 | owner: root 8 | group: root 9 | mode: 0644 10 | with_fileglob: 11 | - "{{ cacerts_cert_dir }}/*" 12 | become: true 13 | 14 | - name: Update trust store 15 | command: update-ca-trust extract 16 | become: true 17 | -------------------------------------------------------------------------------- /ansible/roles/cacerts/tasks/export.yml: -------------------------------------------------------------------------------- 1 | - name: Copy cacerts from deploy host to /exports/cluster/cacerts/ 2 | copy: 3 | src: "{{ item }}" 4 | dest: /exports/cluster/cacerts/ 5 | owner: slurm 6 | group: root 7 | mode: 0644 8 | with_fileglob: 9 | - "{{ cacerts_cert_dir }}/*" 10 | delegate_to: "{{ groups['control'] | first }}" 11 | run_once: true 12 | -------------------------------------------------------------------------------- /ansible/roles/cacerts/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - import_tasks: configure.yml 2 | -------------------------------------------------------------------------------- /ansible/roles/cluster_infra/defaults/main.yml: -------------------------------------------------------------------------------- 1 | ansible_init_collections: [] 2 | ansible_init_playbooks: [] 3 | -------------------------------------------------------------------------------- /ansible/roles/cluster_infra/templates/providers.tf.j2: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.14" 3 | 4 | # We need the OpenStack provider 5 | required_providers { 6 | openstack = { 7 | source = "terraform-provider-openstack/openstack" 8 | version = "~>3.0.0" 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /ansible/roles/compute_init/templates/hostvars.yml.j2: -------------------------------------------------------------------------------- 1 | {{ hostvars[inventory_hostname] | to_nice_json }} -------------------------------------------------------------------------------- /ansible/roles/cuda/README.md: -------------------------------------------------------------------------------- 1 | # cuda 2 | 3 | Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. 4 | 5 | ## Prerequisites 6 | 7 | Requires OFED to be installed to provide required kernel-* packages. 8 | 9 | ## Role Variables 10 | 11 | - `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture. 12 | - `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. 
Changing this once the drivers are installed does not change the version. 13 | - `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds', 'cmake', 'cuda-toolkit-12-9']`. 14 | - `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA. 15 | - `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`. 16 | -------------------------------------------------------------------------------- /ansible/roles/cuda/defaults/main.yml: -------------------------------------------------------------------------------- 1 | cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" 2 | cuda_nvidia_driver_stream: '575-open' 3 | cuda_package_version: '12.9.0-1' 4 | cuda_version_short: '12.9' 5 | cuda_packages: 6 | - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" 7 | - nvidia-gds 8 | - cmake 9 | - cuda-toolkit-12-9 10 | cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz" 11 | cuda_samples_path: "/var/lib/{{ ansible_user }}/cuda_samples" 12 | cuda_samples_programs: 13 | - deviceQuery 14 | - bandwidthTest 15 | # cuda_devices: # discovered from deviceQuery run 16 | cuda_persistenced_state: started 17 | -------------------------------------------------------------------------------- /ansible/roles/cuda/tasks/runtime.yml: -------------------------------------------------------------------------------- 1 | - name: Ensure NVIDIA Persistence Daemon state 2 | systemd: 3 | name: nvidia-persistenced 4 | enabled: true 5 | state: "{{ cuda_persistenced_state }}" 6 | -------------------------------------------------------------------------------- /ansible/roles/cuda/tasks/samples.yml: -------------------------------------------------------------------------------- 1 | - name: Ensure cuda_samples_path exists 2 | file: 3 | state: directory 4 | path: "{{ cuda_samples_path }}" 5 | owner: "{{ ansible_user }}" 6 | group: "{{ ansible_user }}" 7 | 8 | - name: Download CUDA samples release 9 | unarchive: 10 | remote_src: yes 11 | src: "{{ cuda_samples_release_url }}" 12 | dest: "{{ cuda_samples_path }}" 13 | owner: "{{ ansible_user }}" 14 | group: "{{ ansible_user }}" 15 | creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}" 16 | 17 | - name: Create CUDA samples build directory 18 | file: 19 | state: directory 20 | path: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build" 21 | 22 | - name: Build CUDA samples 23 | shell: 24 | # We need to source /etc/profile.d/sh.local to add CUDA to the PATH 25 | cmd: . /etc/profile.d/sh.local && cmake .. 
&& make -j {{ ansible_processor_vcpus }} 26 | chdir: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build" 27 | creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build/Samples/1_Utilities/deviceQuery/deviceQuery" 28 | -------------------------------------------------------------------------------- /ansible/roles/dnf_repos/tasks/disable_repos.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Disable Pulp repos 3 | ansible.builtin.yum_repository: 4 | file: "{{ item.file }}" 5 | name: "{{ item.name }}" 6 | baseurl: "{{ item.base_url }}" 7 | description: "{{ item.name }}" 8 | enabled: false 9 | loop: "{{ dnf_repos_repolist }}" 10 | 11 | - name: Disable EPEL repo 12 | ansible.builtin.yum_repository: 13 | name: epel 14 | file: epel 15 | description: "{{ dnf_repos_epel_description }}" 16 | baseurl: "{{ dnf_repos_epel_baseurl }}" 17 | gpgcheck: false 18 | enabled: false 19 | -------------------------------------------------------------------------------- /ansible/roles/dnf_repos/tasks/set_repos.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Replace system repos with Pulp repos 4 | ansible.builtin.yum_repository: 5 | file: "{{ item.file }}" 6 | name: "{{ item.name }}" 7 | baseurl: "{{ item.base_url }}" 8 | description: "{{ item.name }}" 9 | username: "{{ dnf_repos_username }}" 10 | password: "{{ dnf_repos_password }}" 11 | gpgcheck: false 12 | loop: "{{ dnf_repos_repolist }}" 13 | 14 | - name: Install epel-release 15 | # done so that roles installing epel via epel-release don't over-write our changes to the epel repo 16 | ansible.builtin.dnf: 17 | name: epel-release 18 | 19 | - name: Use Pulp EPEL repo 20 | ansible.builtin.yum_repository: 21 | name: epel 22 | file: epel 23 | description: "{{ dnf_repos_epel_description }}" 24 | gpgcheck: false 25 | baseurl: "{{ dnf_repos_epel_baseurl }}" 26 | username: "{{ dnf_repos_username }}" 27 | password: "{{ dnf_repos_password }}" 28 | -------------------------------------------------------------------------------- /ansible/roles/doca/README.md: -------------------------------------------------------------------------------- 1 | # doca 2 | 3 | Install [NVIDIA DOCA](https://docs.nvidia.com/doca/sdk/index.html). 4 | 5 | This role is not idempotent and is only intended to be run during an image build. It builds DOCA kernel modules to match the installed kernel and then installs these 6 | plus the selected DOCA packages. 7 | 8 | ## Role Variables 9 | 10 | - `doca_version`: Optional. String giving doca version. 11 | - `doca_profile`: Optional. Name of [profile](https://docs.nvidia.com/doca/sdk/nvidia+doca+profiles/index.html) defining subset of DOCA to install. Default is `doca-ofed`. 12 | - `doca_repo_url`: Optional. URL of DOCA repository. Default is appropriate upstream public repository for DOCA version, distro version and architecture. 
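As an illustrative sketch only (the target group and play structure below are assumptions based on this role being intended for image builds, not copied from this repository's playbooks), the role might be applied as follows. Facts are gathered because the default `doca_repo_url` references `ansible_distribution_version` and `ansible_architecture`:

```yaml
# hypothetical build-time play - target group is an assumption
- hosts: builder
  become: true
  gather_facts: true
  tasks:
    - name: Install NVIDIA DOCA
      import_role:
        name: doca
```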
13 | -------------------------------------------------------------------------------- /ansible/roles/doca/defaults/main.yml: -------------------------------------------------------------------------------- 1 | doca_version: '2.9.1' # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates 2 | doca_profile: doca-ofed 3 | doca_repo_url: "https://linux.mellanox.com/public/repo/doca/{{ doca_version }}/rhel{{ ansible_distribution_version }}/{{ ansible_architecture }}/" 4 | -------------------------------------------------------------------------------- /ansible/roles/doca/tasks/install-kernel-devel.yml: -------------------------------------------------------------------------------- 1 | - name: Get installed kernels 2 | command: dnf list --installed kernel 3 | register: _ofed_dnf_kernels 4 | changed_when: false 5 | 6 | - name: Determine running kernel 7 | command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64 8 | register: _ofed_loaded_kernel 9 | changed_when: false 10 | 11 | - name: Check current kernel is newest installed 12 | assert: 13 | that: _ofed_kernel_current == _ofed_dnf_kernels_newest 14 | fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" 15 | vars: 16 | _ofed_kernel_current: >- 17 | {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} 18 | _ofed_dnf_kernels_newest: >- 19 | {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} 20 | # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " 21 | 22 | - name: Install matching kernel-devel package 23 | dnf: 24 | name: "kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}" 25 | -------------------------------------------------------------------------------- /ansible/roles/doca/tasks/install.yml: -------------------------------------------------------------------------------- 1 | - import_tasks: install-kernel-devel.yml 2 | 3 | - name: Install DOCA repo 4 | ansible.builtin.yum_repository: 5 | name: doca 6 | file: doca 7 | description: DOCA Online Repo 8 | baseurl: "{{ doca_repo_url }}" 9 | enabled: true 10 | gpgcheck: false 11 | 12 | - name: Install doca-extra package 13 | ansible.builtin.dnf: 14 | name: doca-extra 15 | 16 | - name: Build DOCA kernel modules 17 | ansible.builtin.shell: 18 | cmd: /opt/mellanox/doca/tools/doca-kernel-support 19 | register: _doca_kernel_build 20 | 21 | 22 | - name: Find generated doca-kernel-repo 23 | ansible.builtin.shell: 'find /tmp/DOCA.* -name doca-kernel-repo-*' 24 | register: _doca_kernel_repo # e.g. 
/tmp/DOCA.WVMchs2QWo/doca-kernel-repo-24.10.1.1.4.0-1.kver.5.14.0.427.31.1.el9.4.x86.64.x86_64.rpm 25 | changed_when: false 26 | 27 | - name: Create dnf cache 28 | ansible.builtin.command: dnf makecache 29 | 30 | - name: Install DOCA repository package 31 | ansible.builtin.dnf: 32 | name: "{{ _doca_kernel_repo.stdout }}" 33 | disable_gpg_check: true 34 | 35 | - name: Install DOCA packages 36 | ansible.builtin.dnf: 37 | name: "{{ doca_profile }}" 38 | 39 | - name: Cleanup DOCA build directories 40 | ansible.builtin.file: 41 | state: absent 42 | path: "{{ (_doca_kernel_repo.stdout | split('/'))[:3] | join('/') }}" # leading / means 1st element of split list is '' 43 | 44 | - name: Update initramfs 45 | ansible.builtin.command: 46 | cmd: dracut -f 47 | register: _doca_dracut 48 | failed_when: _doca_dracut.stderr != '' # appears rc is always 0 49 | 50 | - name: Load the new driver 51 | ansible.builtin.command: /etc/init.d/openibd restart 52 | -------------------------------------------------------------------------------- /ansible/roles/doca/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: install.yml 2 | -------------------------------------------------------------------------------- /ansible/roles/eessi/README.md: -------------------------------------------------------------------------------- 1 | EESSI 2 | ===== 3 | 4 | Configure the EESSI pilot repository for use on given hosts. 5 | 6 | Requirements 7 | ------------ 8 | 9 | None. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | - `cvmfs_quota_limit_mb`: Optional int. Maximum size of the local package cache on each node, in MB. 15 | - `cvmfs_config_overrides`: Optional dict. Set of key-value pairs for additional CernVM-FS settings; see the [official docs](https://cvmfs.readthedocs.io/en/stable/cpt-configure.html) for a list of options. Each dict key should correspond to a valid config variable (e.g. `CVMFS_HTTP_PROXY`) and the corresponding dict value will be set as the variable value (e.g. `https://my-proxy.com`). These configuration parameters will be written to the `/etc/cvmfs/default.local` config file on each host in the form `KEY=VALUE` (an illustrative override is shown below). 16 | 17 | Dependencies 18 | ------------ 19 | 20 | None.
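As an illustrative sketch only (the proxy address and cache size below are assumptions, not defaults of this role), the variables above could be set in group variables for the hosts running this role:

```yaml
cvmfs_quota_limit_mb: 20000   # allow a ~20 GB local CernVM-FS cache
cvmfs_config_overrides:
  CVMFS_HTTP_PROXY: "http://squid.example.org:3128"   # written to /etc/cvmfs/default.local as KEY=VALUE
```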
21 | 22 | Example Playbook 23 | ---------------- 24 | 25 | ```yaml 26 | - name: Setup EESSI 27 | hosts: eessi 28 | tags: eessi 29 | become: true 30 | tasks: 31 | - name: Install and configure EESSI 32 | import_role: 33 | name: eessi 34 | ``` 35 | -------------------------------------------------------------------------------- /ansible/roles/eessi/defaults/main.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Default to 10GB 3 | cvmfs_quota_limit_mb: 10000 4 | 5 | cvmfs_config_default: 6 | CVMFS_CLIENT_PROFILE: single 7 | CVMFS_QUOTA_LIMIT: "{{ cvmfs_quota_limit_mb }}" 8 | 9 | cvmfs_config_overrides: {} 10 | 11 | cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" 12 | 13 | cvmfs_gpg_checksum: "sha256:4ac81adff957565277cfa6a4a330cdc2ce5a8fdd73b8760d1a5a32bef71c4bd6" 14 | -------------------------------------------------------------------------------- /ansible/roles/eessi/tasks/main.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Download Cern GPG key 3 | ansible.builtin.get_url: 4 | url: http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM 5 | dest: ./cvmfs-key.gpg 6 | checksum: "{{ cvmfs_gpg_checksum }}" 7 | 8 | - name: Import downloaded GPG key 9 | command: rpm --import cvmfs-key.gpg 10 | 11 | - name: Add CVMFS repo 12 | dnf: 13 | name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm 14 | disable_gpg_check: true 15 | 16 | - name: Install CVMFS 17 | dnf: 18 | name: cvmfs 19 | 20 | - name: Install EESSI CVMFS config 21 | dnf: 22 | name: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi-latest.noarch.rpm 23 | # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok? 24 | disable_gpg_check: true 25 | 26 | # Alternative version using official repo - still no GPG key :( 27 | # - name: Add EESSI repo 28 | # dnf: 29 | # name: http://repo.eessi-infra.org/eessi/rhel/8/noarch/eessi-release-0-1.noarch.rpm 30 | 31 | # - name: Install EESSI CVMFS config 32 | # dnf: 33 | # name: cvmfs-config-eessi 34 | 35 | - name: Add base CVMFS config 36 | community.general.ini_file: 37 | dest: /etc/cvmfs/default.local 38 | section: null 39 | option: "{{ item.key }}" 40 | value: "{{ item.value }}" 41 | no_extra_spaces: true 42 | loop: "{{ cvmfs_config | dict2items }}" 43 | 44 | 45 | # NOTE: Not clear how to make this idempotent 46 | - name: Ensure CVMFS config is setup 47 | command: 48 | cmd: "cvmfs_config setup" 49 | -------------------------------------------------------------------------------- /ansible/roles/etc_hosts/README.md: -------------------------------------------------------------------------------- 1 | # etc_hosts 2 | 3 | Hosts in the `etc_hosts` groups have `/etc/hosts` created with entries of the format `IP_address canonical_hostname [alias]`. 4 | 5 | By default, an entry is created for each host in this group as follows: 6 | - The value of `ansible_host` is used as the IP_address. 7 | - If `node_fqdn` is defined then that is used as the canonical hostname and `inventory_hostname` as an alias. Otherwise `inventory_hostname` is used as the canonical hostname. 8 | This may need overriding for multi-homed hosts or hosts with multiple aliases. 9 | 10 | # Variables 11 | 12 | - `etc_hosts_template`: Template file to use. Default is the in-role template. 
13 | - `etc_hosts_hostvars`: A list of variable names, used (in the order supplied) to create the entry for each host. Default is described above. 14 | - `etc_hosts_extra_hosts`: String (possibly multi-line) defining additional hosts to add to `/etc/hosts`. Default is empty string. 15 | -------------------------------------------------------------------------------- /ansible/roles/etc_hosts/defaults/main.yml: -------------------------------------------------------------------------------- 1 | etc_hosts_template: hosts.j2 2 | etc_hosts_hostvars: "{{ ['ansible_host'] + (['node_fqdn'] if node_fqdn is defined else []) + ['inventory_hostname'] }}" 3 | etc_hosts_extra_hosts: '' 4 | -------------------------------------------------------------------------------- /ansible/roles/etc_hosts/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Template out /etc/hosts 2 | template: 3 | src: "{{ etc_hosts_template }}" 4 | dest: /etc/hosts 5 | owner: root 6 | group: root 7 | mode: 0644 8 | become: yes 9 | -------------------------------------------------------------------------------- /ansible/roles/etc_hosts/templates/hosts.j2: -------------------------------------------------------------------------------- 1 | 127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4 2 | ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6 3 | 4 | {% for inventory_hostname in groups['etc_hosts'] | sort -%} 5 | {{ hostvars[inventory_hostname] | json_query('[' + ( etc_hosts_hostvars | join(', ') ) + ']' ) | join(' ')}} 6 | {% endfor %} 7 | {% if etc_hosts_extra_hosts != '' %} 8 | {{ etc_hosts_extra_hosts }} 9 | {% endif %} 10 | -------------------------------------------------------------------------------- /ansible/roles/fail2ban/README.md: -------------------------------------------------------------------------------- 1 | fail2ban 2 | ========= 3 | 4 | Setup fail2ban to protect SSH on a host. 5 | 6 | Note that no email alerts are set up so logs (at `/var/log/fail2ban.log`) will have to be manually reviewed if required. 7 | 8 | Requirements 9 | ------------ 10 | 11 | - An EL8 system. 12 | - `firewalld` running. 13 | 14 | Role Variables 15 | -------------- 16 | None. 17 | 18 | Dependencies 19 | ------------ 20 | 21 | None. 
22 | 23 | Example Playbook 24 | ---------------- 25 | 26 | ```yaml 27 | - hosts: fail2ban 28 | gather_facts: false 29 | become: yes 30 | tasks: 31 | - import_role: 32 | name: firewalld 33 | - import_role: 34 | name: fail2ban 35 | ``` 36 | 37 | License 38 | ------- 39 | 40 | Apache v2 41 | 42 | Author Information 43 | ------------------ 44 | 45 | stackhpc.com 46 | -------------------------------------------------------------------------------- /ansible/roles/fail2ban/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Restart fail2ban 4 | service: 5 | name: fail2ban 6 | state: restarted 7 | enabled: true 8 | -------------------------------------------------------------------------------- /ansible/roles/fail2ban/meta/main.yml: -------------------------------------------------------------------------------- 1 | galaxy_info: 2 | author: Steve Brasier 3 | company: stackhpc 4 | 5 | # If the issue tracker for your role is not on github, uncomment the 6 | # next line and provide a value 7 | # issue_tracker_url: http://example.com/issue/tracker 8 | 9 | # Choose a valid license ID from https://spdx.org - some suggested licenses: 10 | # - BSD-3-Clause (default) 11 | # - MIT 12 | # - GPL-2.0-or-later 13 | # - GPL-3.0-only 14 | # - Apache-2.0 15 | # - CC-BY-4.0 16 | license: Apache-2.0 17 | 18 | min_ansible_version: 2.1 19 | 20 | # If this a Container Enabled role, provide the minimum Ansible Container version. 21 | # min_ansible_container_version: 22 | 23 | # 24 | # Provide a list of supported platforms, and for each platform a list of versions. 25 | # If you don't wish to enumerate all versions for a particular platform, use 'all'. 26 | # To view available platforms and versions (or releases), visit: 27 | # https://galaxy.ansible.com/api/v1/platforms/ 28 | # 29 | platforms: 30 | - name: EL 31 | versions: 32 | - 8 33 | 34 | galaxy_tags: [] 35 | # List tags for your role here, one per line. A tag is a keyword that describes 36 | # and categorizes the role. Users find roles by searching for tags. Be sure to 37 | # remove the '[]' above, if you add tags to this list. 38 | # 39 | # NOTE: A tag is limited to a single word comprised of alphanumeric characters. 40 | # Maximum 20 tags per role. 41 | 42 | dependencies: [] 43 | # List your role dependencies here, one per line. Be sure to remove the '[]' above, 44 | # if you add dependencies to this list. 
45 | -------------------------------------------------------------------------------- /ansible/roles/fail2ban/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install EPEL repo 3 | package: 4 | name: epel-release 5 | 6 | - name: Install fail2ban packages 7 | package: 8 | name: 9 | - fail2ban-server 10 | - fail2ban-firewalld 11 | state: present 12 | 13 | - name: Create config 14 | template: 15 | dest: /etc/fail2ban/jail.local 16 | src: jail.local.j2 17 | notify: Restart fail2ban 18 | 19 | - name: flush handlers 20 | meta: flush_handlers 21 | 22 | - name: Ensure fail2ban running even if no config change 23 | service: 24 | name: fail2ban 25 | state: started 26 | enabled: true 27 | -------------------------------------------------------------------------------- /ansible/roles/fail2ban/templates/jail.local.j2: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | bantime = 3600 3 | action = %(action_)s 4 | 5 | [sshd] 6 | enabled = true 7 | -------------------------------------------------------------------------------- /ansible/roles/filebeat/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | #filebeat_config_path: undefined # REQUIRED. Path to filebeat.yml configuration file template 4 | filebeat_podman_user: "{{ ansible_user }}" # User that runs the filebeat container 5 | filebeat_version: 7.12.1 # latest usable with opensearch - see https://opensearch.org/docs/2.4/tools/index/#compatibility-matrix-for-beats 6 | filebeat_debug: false 7 | -------------------------------------------------------------------------------- /ansible/roles/filebeat/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Restart filebeat container 4 | systemd: 5 | name: filebeat.service 6 | state: restarted 7 | enabled: yes 8 | daemon_reload: yes 9 | become: true 10 | -------------------------------------------------------------------------------- /ansible/roles/filebeat/tasks/install.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Create systemd unit file 3 | template: 4 | dest: /etc/systemd/system/filebeat.service 5 | src: filebeat.service.j2 6 | become: true 7 | register: _filebeat_unit 8 | 9 | - name: Pull container image 10 | containers.podman.podman_image: 11 | name: "docker.elastic.co/beats/filebeat-oss" 12 | tag: "{{ filebeat_version }}" 13 | become_user: "{{ filebeat_podman_user }}" 14 | 15 | - name: Reload filebeat unit file 16 | command: systemctl daemon-reload 17 | when: _filebeat_unit.changed 18 | become: true 19 | -------------------------------------------------------------------------------- /ansible/roles/filebeat/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - import_tasks: install.yml 2 | - import_tasks: runtime.yml 3 | -------------------------------------------------------------------------------- /ansible/roles/filebeat/tasks/runtime.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Collect usernamespace facts 4 | user_namespace_facts: 5 | 6 | - name: Set facts containing sub-ids 7 | set_fact: 8 | # filebeat user is 1000 9 | filebeat_host_user_id: "{{ ansible_facts.subuid[filebeat_podman_user]['start'] + 1000 - 1 }}" 10 | filebeat_host_group_id: "{{ 
ansible_facts.subgid[filebeat_podman_user]['start'] + 1000 - 1 }}" 11 | 12 | - name: Ensure parent directory exists 13 | file: 14 | state: directory 15 | path: "/etc/filebeat" 16 | owner: "{{ filebeat_host_user_id }}" 17 | group: "{{ filebeat_host_group_id }}" 18 | mode: 0770 19 | become: true 20 | 21 | - name: Template configuration files 22 | template: 23 | src: "{{ filebeat_config_path }}" 24 | dest: /etc/filebeat/filebeat.yml 25 | owner: "{{ filebeat_host_user_id }}" 26 | group: "{{ filebeat_host_group_id }}" 27 | mode: 0600 28 | notify: Restart filebeat container 29 | become: true 30 | 31 | - name: Flush handlers 32 | meta: flush_handlers 33 | 34 | - name: Ensure filebeat service state 35 | systemd: 36 | name: filebeat.service 37 | state: started 38 | enabled: true 39 | become: true 40 | -------------------------------------------------------------------------------- /ansible/roles/filebeat/tasks/validate.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Assert that filebeat_config_path is defined 4 | assert: 5 | that: filebeat_config_path is defined -------------------------------------------------------------------------------- /ansible/roles/filebeat/templates/filebeat.service.j2: -------------------------------------------------------------------------------- 1 | # container-filebeat.service 2 | # based off 3 | # podman generate systemd filebeat --restart-policy always --new --name 4 | # with pid/cidfiles replaced with --sdnotify=conmon approach 5 | 6 | [Unit] 7 | Description=Podman container-filebeat.service 8 | Documentation=man:podman-generate-systemd(1) 9 | Wants=network.target 10 | After=network-online.target 11 | 12 | [Service] 13 | Environment=PODMAN_SYSTEMD_UNIT=%n 14 | Restart=always 15 | ExecStart=/usr/bin/podman run \ 16 | --network=host \ 17 | --sdnotify=conmon \ 18 | --cgroups=no-conmon \ 19 | --replace \ 20 | --name filebeat \ 21 | --user root \ 22 | --restart=always \ 23 | --security-opt label=disable \ 24 | --volume /var/log/:/logs:ro \ 25 | --volume /etc/filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:ro \ 26 | --detach=True docker.elastic.co/beats/filebeat-oss:{{ filebeat_version }} \ 27 | -e -strict.perms=false -d "*" 28 | ExecStop=/usr/bin/podman stop --ignore filebeat -t 10 29 | ExecStopPost=/usr/bin/podman rm --ignore -f filebeat 30 | KillMode=none 31 | Type=notify 32 | NotifyAccess=all 33 | User={{ filebeat_podman_user }} 34 | Group={{ filebeat_podman_user }} 35 | TimeoutStartSec=180 36 | 37 | [Install] 38 | WantedBy=multi-user.target default.target 39 | -------------------------------------------------------------------------------- /ansible/roles/firewalld/README.md: -------------------------------------------------------------------------------- 1 | Role Name 2 | ========= 3 | 4 | Install and configure the `firewalld` firewall. 5 | 6 | Requirements 7 | ------------ 8 | 9 | EL8 host 10 | 11 | Role Variables 12 | -------------- 13 | 14 | - `firewalld_enabled`: Optional. Whether `firewalld` service is enabled (starts at boot). Default `yes`. 15 | - `firewalld_state`: Optional. State of `firewalld` service. Default `started`. Other values: `stopped`. 16 | - `firewalld_configs`: Optional. List of dicts giving parameters for [ansible.posix.firewalld module](https://docs.ansible.com/ansible/latest/collections/ansible/posix/firewalld_module.html). Default is an empty list. 
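Each entry in `firewalld_configs` is passed directly to `ansible.posix.firewalld`, so any parameters that module accepts can be used. An illustrative sketch (the service and port below are examples, not defaults of this role):

```yaml
firewalld_configs:
  - service: https      # permit inbound HTTPS in the default zone
    state: enabled
    permanent: true
    immediate: true
  - port: 9090/tcp      # open an arbitrary extra TCP port
    state: enabled
    permanent: true
    immediate: true
```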
17 | 18 | Note that the default configuration for firewalld on Rocky Linux 8.5 is as follows: 19 | ```shell 20 | # firewall-offline-cmd --list-all 21 | public 22 | target: default 23 | icmp-block-inversion: no 24 | interfaces: 25 | sources: 26 | services: cockpit dhcpv6-client ssh 27 | ports: 28 | protocols: 29 | forward: no 30 | masquerade: no 31 | forward-ports: 32 | source-ports: 33 | icmp-blocks: 34 | rich rules: 35 | ``` 36 | 37 | Dependencies 38 | ------------ 39 | 40 | None. 41 | 42 | Example Playbook 43 | ---------------- 44 | 45 | ``` 46 | - hosts: firewalld 47 | gather_facts: false 48 | become: yes 49 | tags: firewalld 50 | tasks: 51 | - import_role: 52 | name: firewalld 53 | ``` 54 | 55 | License 56 | ------- 57 | 58 | BSD 59 | 60 | Author Information 61 | ------------------ 62 | 63 | An optional section for the role authors to include contact information, or a website (HTML is not allowed). 64 | -------------------------------------------------------------------------------- /ansible/roles/firewalld/defaults/main.yml: -------------------------------------------------------------------------------- 1 | firewalld_enabled: yes 2 | firewalld_state: started 3 | firewalld_configs: [] 4 | -------------------------------------------------------------------------------- /ansible/roles/firewalld/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Restart firewalld 3 | service: 4 | name: firewalld 5 | state: restarted 6 | when: firewalld_state != 'stopped' 7 | -------------------------------------------------------------------------------- /ansible/roles/firewalld/tasks/install.yml: -------------------------------------------------------------------------------- 1 | - name: Install firewalld package 2 | dnf: 3 | name: firewalld 4 | -------------------------------------------------------------------------------- /ansible/roles/firewalld/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - import_tasks: install.yml 3 | - import_tasks: runtime.yml 4 | -------------------------------------------------------------------------------- /ansible/roles/firewalld/tasks/runtime.yml: -------------------------------------------------------------------------------- 1 | - name: Apply firewalld configs 2 | ansible.posix.firewalld: "{{ item }}" 3 | notify: Restart firewalld 4 | loop: "{{ firewalld_configs }}" 5 | 6 | - meta: flush_handlers 7 | 8 | - name: Ensure firewalld state 9 | ansible.builtin.systemd: 10 | name: firewalld 11 | state: "{{ firewalld_state }}" 12 | enabled: "{{ firewalld_enabled | default(true) }}" 13 | -------------------------------------------------------------------------------- /ansible/roles/firewalld/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # vars file for firewalld 3 | -------------------------------------------------------------------------------- /ansible/roles/freeipa/defaults/main.yml: -------------------------------------------------------------------------------- 1 | #freeipa_realm: 2 | freeipa_domain: "{{ freeipa_realm | lower }}" 3 | #freeipa_ds_password: 4 | #freeipa_admin_password: 5 | #freeipa_server_ip: 6 | freeipa_setup_dns: "{{ groups['freeipa_server'] | length > 0 }}" 7 | freeipa_client_ip: "{{ ansible_host }}" # when run on freeipa_client group!
8 | # freeipa_host_password: 9 | freeipa_user_defaults: 10 | ipa_pass: "{{ freeipa_admin_password | quote }}" 11 | ipa_user: admin 12 | freeipa_users: [] # see community.general.ipa_user 13 | 14 | _freeipa_keytab_backup_path: "{{ hostvars[groups['control'].0].appliances_state_dir }}/freeipa/{{ inventory_hostname }}/krb5.keytab" 15 | -------------------------------------------------------------------------------- /ansible/roles/freeipa/tasks/addhost.yml: -------------------------------------------------------------------------------- 1 | - name: Get ipa host information 2 | # This uses DNS to find the ipa server, which works as this is running on the enrolled ipa server 3 | # It doesn't fail even if the host doesn't exist 4 | community.general.ipa_host: 5 | name: "{{ node_fqdn }}" 6 | ip_address: "{{ freeipa_client_ip }}" 7 | ipa_pass: "{{ vault_freeipa_admin_password }}" 8 | ipa_user: admin 9 | state: present 10 | validate_certs: false 11 | delegate_to: "{{ groups['freeipa_server'].0 }}" 12 | register: _ipa_host_check 13 | check_mode: yes 14 | changed_when: false 15 | 16 | - name: Add host to IPA 17 | # Using random_password=true this unenroles an enroled host, hence the check above 18 | community.general.ipa_host: 19 | name: "{{ node_fqdn }}" 20 | ip_address: "{{ freeipa_client_ip }}" 21 | ipa_pass: "{{ vault_freeipa_admin_password }}" 22 | ipa_user: admin 23 | random_password: true 24 | state: present 25 | validate_certs: false 26 | ipa_timeout: 30 27 | delegate_to: "{{ groups['freeipa_server'].0 }}" 28 | when: "'sshpubkeyfp' not in _ipa_host_check.host" 29 | register: _ipa_host_add 30 | 31 | - name: Set fact for ipa host password 32 | set_fact: 33 | freeipa_host_password: "{{ _ipa_host_add.host.randompassword }}" 34 | when: _ipa_host_add.changed 35 | -------------------------------------------------------------------------------- /ansible/roles/freeipa/tasks/backup-keytabs.yml: -------------------------------------------------------------------------------- 1 | - name: Retrieve keytabs to localhost 2 | fetch: 3 | src: "{{ _freeipa_keytab_backup_path }}" 4 | dest: "{{ appliances_environment_root }}/keytabs/{{ inventory_hostname }}/" 5 | flat: true 6 | delegate_to: "{{ groups['control'].0 }}" 7 | tags: retrieve 8 | 9 | - name: Copy keytabs back to control node 10 | copy: 11 | src: "{{ appliances_environment_root }}/keytabs/{{ inventory_hostname }}/" 12 | dest: "{{ _freeipa_keytab_backup_path | dirname }}" 13 | delegate_to: "{{ groups['control'].0 }}" 14 | tags: deploy 15 | -------------------------------------------------------------------------------- /ansible/roles/freeipa/tasks/client-install.yml: -------------------------------------------------------------------------------- 1 | 2 | - name: Install FreeIPA client package 3 | dnf: 4 | name: ipa-client 5 | -------------------------------------------------------------------------------- /ansible/roles/freeipa/tasks/users.yml: -------------------------------------------------------------------------------- 1 | - name: Add users to freeipa 2 | # This uses DNS to find the ipa server, which works as this is running on the enrolled ipa server 3 | community.general.ipa_user: 4 | displayname: "{{ item.displayname | default(omit) }}" 5 | gidnumber: "{{ item.gidnumber | default(omit) }}" 6 | givenname: "{{ item.givenname }}" 7 | #ipa_host 8 | ipa_pass: "{{ freeipa_admin_password | quote }}" 9 | #ipa_port 10 | #ipa_prot 11 | ipa_timeout: "{{ item.ipa_timeout | default(omit) }}" 12 | #ipa_user 13 | krbpasswordexpiration: "{{ 
item.krbpasswordexpiration | default(omit) }}" 14 | loginshell: "{{ item.loginshell | default(omit) }}" 15 | mail: "{{ item.mail | default(omit) }}" 16 | password: "{{ item.password | default(omit) }}" 17 | sn: "{{ item.sn }}" 18 | sshpubkey: "{{ item.sshpubkey | default(omit) }}" 19 | state: "{{ item.state | default(omit) }}" 20 | telephonenumber: "{{ item.telephonenumber | default(omit) }}" 21 | title: "{{ item.title | default(omit) }}" 22 | uid: "{{ item.name | default(item.uid) }}" 23 | uidnumber: "{{ item.uidnumber | default(omit) }}" 24 | update_password: "{{ item.update_password | default(omit) }}" 25 | userauthtype: "{{ item.userauthtype | default(omit) }}" 26 | #validate_certs 27 | loop: "{{ freeipa_users }}" 28 | -------------------------------------------------------------------------------- /ansible/roles/gateway/README.md: -------------------------------------------------------------------------------- 1 | # gateway 2 | 3 | Ensure a single default route via a specified address exists on boot. 4 | 5 | **NB:** This role uses `linux-ansible-init` and is not run by the 6 | `ansible/site.yml` playbook. 7 | 8 | ## Role variables 9 | 10 | **NB:** This role has no Ansible variables. Setting the OpenTofu variable 11 | `gateway_ip` to an IPv4 address will modify default routes as necessary to give 12 | the instance a single default route via that address. The default route will 13 | use the interface which has a CIDR including the gateway address. 14 | 15 | Note that: 16 | - If the correct default route already exists, no changes are made. 17 | - If a default route exists on a different interface, that route will be deleted. 18 | - If a default route exists on the same interface but using a different address, 19 | an assert will be raised to fail the `ansible-init` service - see logs using 20 | `journalctl -xue ansible-init`. 21 | 22 | See [docs/networks.md](../../../docs/networks.md) for further discussion. 23 | 24 | ## Requirements 25 | 26 | The image must include both this role and the `linux-ansible-init` role. This 27 | is the case for StackHPC-built images. For custom images use one of the following 28 | configurations during Packer build: 29 | - Add `builder` into the `gateway` group in `environments/$ENV/inventory/groups` 30 | - Add `gateway` to the `inventory_groups` Packer variable 31 | -------------------------------------------------------------------------------- /ansible/roles/gateway/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Add gateway playbook 2 | copy: 3 | src: gateway-init.yml 4 | dest: /etc/ansible-init/playbooks/05-gateway-init.yml 5 | owner: root 6 | group: root 7 | mode: 0644 8 | -------------------------------------------------------------------------------- /ansible/roles/grafana-dashboards/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | grafana_address: "0.0.0.0" 4 | grafana_port: 3000 5 | 6 | # External Grafana address. 
Variable maps to "root_url" in grafana server section 7 | grafana_url: "http://{{ grafana_address }}:{{ grafana_port }}" 8 | grafana_api_url: "{{ grafana_url }}" 9 | 10 | grafana_security: 11 | admin_user: admin 12 | admin_password: "" 13 | 14 | grafana_data_dir: "/var/lib/grafana" 15 | grafana_dashboards_dir: "dashboards" 16 | -------------------------------------------------------------------------------- /ansible/roles/hpctests/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | hpctests_user: "{{ ansible_user }}" 3 | hpctests_group: "{{ hpctests_user }}" 4 | hpctests_rootdir: "/home/{{ hpctests_user }}/hpctests" 5 | hpctests_pre_cmd: '' 6 | hpctests_pingmatrix_modules: [gnu12 openmpi4] 7 | hpctests_pingpong_modules: [gnu12 openmpi4 imb] 8 | hpctests_pingpong_plot: yes 9 | hpctests_hpl_modules: [gnu12 openmpi4 openblas] 10 | hpctests_outdir: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hpctests" 11 | hpctests_ucx_net_devices: all 12 | hpctests_hpl_version: "2.3" 13 | hpctests_hpl_NB: 192 14 | hpctests_hpl_mem_frac: 0.3 15 | hpctests_hpl_arch: linux64 16 | #hpctests_nodes: 17 | #hpctests_partition: 18 | -------------------------------------------------------------------------------- /ansible/roles/hpctests/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # handlers file for hpctests 3 | -------------------------------------------------------------------------------- /ansible/roles/hpctests/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: setup 2 | block: 3 | - include_tasks: setup.yml 4 | become: true 5 | become_user: "{{ hpctests_user }}" 6 | tags: always 7 | 8 | - name: pingpong 9 | block: 10 | - include_tasks: pingpong.yml 11 | when: hpctests_computes.stdout_lines | length > 1 12 | become: true 13 | become_user: "{{ hpctests_user }}" 14 | tags: pingpong 15 | 16 | - name: pingmatrix 17 | block: 18 | - include_tasks: pingmatrix.yml 19 | when: hpctests_computes.stdout_lines | length > 1 20 | become: true 21 | become_user: "{{ hpctests_user }}" 22 | tags: pingmatrix 23 | 24 | - name: build HPL 25 | block: 26 | - include_tasks: build-hpl.yml 27 | become: true 28 | become_user: "{{ hpctests_user }}" 29 | tags: 30 | - hpl-solo 31 | 32 | - name: run HPL on individual nodes 33 | block: 34 | - include_tasks: hpl-solo.yml 35 | become: true 36 | become_user: "{{ hpctests_user }}" 37 | tags: 38 | - hpl-solo 39 | -------------------------------------------------------------------------------- /ansible/roles/hpctests/tasks/setup.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Get partition information 4 | shell: "sinfo --format %P --noheader" 5 | register: _sinfo_partitions 6 | changed_when: false 7 | 8 | - name: Select default partition if hpctests_partition not given 9 | set_fact: 10 | hpctests_partition: "{{ (_sinfo_partitions.stdout_lines | select('contains', '*') | first)[:-1] }}" 11 | when: hpctests_partition is not defined 12 | 13 | - name: Get info about compute nodes 14 | shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --partition {{hpctests_partition}} --format %N" 15 | register: hpctests_computes 16 | changed_when: false 17 | failed_when: hpctests_computes.rc != 0 18 | 19 | - name: Check compute node selection valid 20 | assert: 21 | that: hpctests_computes.stdout_lines | length > 0 22 | 
fail_msg: "No nodes selected - was variable `hpctests_nodes` set (correctly)?" 23 | 24 | - name: Create test root directory 25 | file: 26 | path: "{{ hpctests_rootdir }}" 27 | state: directory 28 | owner: "{{ hpctests_user }}" 29 | group: "{{ hpctests_group }}" 30 | 31 | - name: Set fact for UCX_NET_DEVICES 32 | set_fact: 33 | hpctests_ucx_net_devices: "{{ hpctests_ucx_net_devices.get(hpctests_partition, 'all') }}" 34 | when: hpctests_ucx_net_devices is mapping 35 | -------------------------------------------------------------------------------- /ansible/roles/hpctests/templates/HPL.dat.j2: -------------------------------------------------------------------------------- 1 | HPLinpack benchmark input file 2 | Innovative Computing Laboratory, University of Tennessee 3 | HPL.out output file name (if any) 4 | 6 device out (6=stdout,7=stderr,file) 5 | 1 # of problems sizes (N) 6 | {{ hpctests_hpl_N}} Ns 7 | 1 # of NBs 8 | {{ hpctests_hpl_NB }} NBs 9 | 0 PMAP process mapping (0=Row-,1=Column-major) 10 | 1 # of process grids (P x Q) 11 | {{ hpctests_hpl_P }} Ps 12 | {{ hpctests_hpl_Q }} Qs 13 | 16.0 threshold 14 | 1 # of panel fact 15 | 0 PFACTs (0=left, 1=Crout, 2=Right) 16 | 1 # of recursive stopping criterium 17 | 2 NBMINs (>= 1) 18 | 1 # of panels in recursion 19 | 2 NDIVs 20 | 1 # of recursive panel fact. 21 | 0 RFACTs (0=left, 1=Crout, 2=Right) 22 | 1 # of broadcast 23 | 0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM) 24 | 1 # of lookahead depth 25 | 0 DEPTHs (>=0) 26 | 2 SWAP (0=bin-exch,1=long,2=mix) 27 | 64 swapping threshold 28 | 0 L1 in (0=transposed,1=no-transposed) form 29 | 0 U in (0=transposed,1=no-transposed) form 30 | 1 Equilibration (0=no,1=yes) 31 | 8 memory alignment in double (> 0) -------------------------------------------------------------------------------- /ansible/roles/hpctests/templates/hpl-build.sh.j2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #SBATCH --nodes=1 4 | #SBATCH --output=%x.%a.out 5 | #SBATCH --error=%x.%a.out 6 | #SBATCH --exclusive 7 | #SBATCH --partition={{ hpctests_partition }} 8 | {%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_computes.stdout_lines[0] }}{% endif %} 9 | 10 | echo HPL arch: {{ hpctests_hpl_arch }} 11 | {{ hpctests_pre_cmd }} 12 | module load {{ hpctests_hpl_modules | join(' ' ) }} 13 | make arch={{ hpctests_hpl_arch }} clean_arch_all 14 | make arch={{ hpctests_hpl_arch }} 15 | -------------------------------------------------------------------------------- /ansible/roles/hpctests/templates/hpl-solo.sh.j2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #SBATCH --ntasks={{ hpctests_hplsolo_ntasks }} 4 | #SBATCH --output=%x.%a.out 5 | #SBATCH --error=%x.%a.out 6 | #SBATCH --exclusive 7 | #SBATCH --array=0-{{ hpctests_computes.stdout_lines | length - 1 }} 8 | #SBATCH --partition={{ hpctests_partition }} 9 | {% if hpctests_hplsolo_excluded_nodes | length > 0 %} 10 | #SBATCH --exclude={{ hpctests_hplsolo_excluded_nodes | join(',') }} 11 | {% endif %} 12 | 13 | export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }} 14 | echo SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST 15 | echo SLURM_JOB_ID: $SLURM_JOB_ID 16 | echo UCX_NET_DEVICES: $UCX_NET_DEVICES 17 | echo HPL arch: {{ hpctests_hpl_arch }} 18 | {{ hpctests_pre_cmd }} 19 | module load {{ hpctests_hpl_modules | join(' ' ) }} 20 | mpirun ./xhpl-{{ hpctests_hpl_arch }} 21 | -------------------------------------------------------------------------------- 
/ansible/roles/hpctests/templates/pingmatrix.sh.j2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #SBATCH --ntasks={{ hpctests_computes.stdout_lines | length }} 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --output=%x.out 6 | #SBATCH --error=%x.out 7 | #SBATCH --exclusive 8 | #SBATCH --partition={{ hpctests_partition }} 9 | {%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_nodes }}{% endif %} 10 | 11 | export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }} 12 | echo SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST 13 | echo SLURM_JOB_ID: $SLURM_JOB_ID 14 | echo UCX_NET_DEVICES: $UCX_NET_DEVICES 15 | {{ hpctests_pre_cmd }} 16 | module load {{ hpctests_pingmatrix_modules | join(' ' ) }} 17 | 18 | mpicc -o nxnlatbw mpi_nxnlatbw.c 19 | 20 | # mpirun flags force using UCX TCP transports, overriding higher 21 | # priority of OpenMPI btl/openib component, which is also using RDMA 22 | # https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 23 | mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any nxnlatbw 24 | -------------------------------------------------------------------------------- /ansible/roles/hpctests/templates/pingpong.sh.j2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #SBATCH --ntasks=2 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --output=%x.out 6 | #SBATCH --error=%x.out 7 | #SBATCH --exclusive 8 | #SBATCH --partition={{ hpctests_partition }} 9 | {%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_nodes }}{% endif %} 10 | 11 | export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }} 12 | echo SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST 13 | echo SLURM_JOB_ID: $SLURM_JOB_ID 14 | echo UCX_NET_DEVICES: $UCX_NET_DEVICES 15 | {{ hpctests_pre_cmd }} 16 | module load {{ hpctests_pingpong_modules | join(' ' ) }} 17 | 18 | #srun --mpi=pmi2 IMB-MPI1 pingpong # doesn't work in ohpc v2.1 19 | 20 | # mpirun flags force using UCX TCP transports, overriding higher 21 | # priority of OpenMPI btl/openib component, which is also using RDMA 22 | # https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 23 | mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any IMB-MPI1 pingpong 24 | -------------------------------------------------------------------------------- /ansible/roles/hpctests/tests/inventory: -------------------------------------------------------------------------------- 1 | localhost 2 | 3 | -------------------------------------------------------------------------------- /ansible/roles/hpctests/tests/test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: localhost 3 | remote_user: root 4 | roles: 5 | - hpctests 6 | -------------------------------------------------------------------------------- /ansible/roles/hpctests/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | hpctests_hpl_srcdir: "{{ hpctests_rootdir }}/hpl/hpl-{{ hpctests_hpl_version }}" 3 | -------------------------------------------------------------------------------- /ansible/roles/k3s/README.md: -------------------------------------------------------------------------------- 1 | k3s 2 | ===== 3 | 4 | Installs k3s agent and server services on nodes and an ansible-init playbook to activate them. The service that each node will activate on init is determined by OpenStack metadata. Also includes Helm install. 
Currently only supports a single k3s-server 5 | (i.e one control node). Install based on the [official k3s ansible role](https://github.com/k3s-io/k3s-ansible). 6 | 7 | 8 | Requirements 9 | ------------ 10 | 11 | `azimuth_cloud.image_utils.linux_ansible_init` must have been run previously on targeted nodes during image build. 12 | 13 | Role Variables 14 | -------------- 15 | 16 | - `k3s_version`: Optional str. K3s version to install, see [official releases](https://github.com/k3s-io/k3s/releases/). 17 | -------------------------------------------------------------------------------- /ansible/roles/k3s/defaults/main.yml: -------------------------------------------------------------------------------- 1 | # Warning: changes to these variables won't be reflected in the cluster/image if k3s is already installed 2 | k3s_version: "v1.31.0+k3s1" 3 | k3s_selinux_release: v1.6.latest.1 4 | k3s_selinux_rpm_version: 1.6-1 5 | k3s_helm_version: v3.11.0 6 | k3s_bootstrap_token: '' # matches common environment default 7 | k3s_bootstrap_token_expiry: 10m 8 | k3s_server_name: "{{ None }}" # ansible managed 9 | -------------------------------------------------------------------------------- /ansible/roles/k3s/tasks/agent-runtime.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Template k3s agent env file 4 | when: k3s_bootstrap_token != '' 5 | ansible.builtin.template: 6 | dest: /etc/systemd/system/k3s-agent.service.env 7 | src: k3s-agent.service.env.j2 8 | owner: root 9 | group: root 10 | mode: 0640 11 | register: _k3s_agent_token_result 12 | 13 | - name: Ensure password directory exists 14 | ansible.builtin.file: 15 | path: "/etc/rancher/node" 16 | state: directory 17 | owner: root 18 | group: root 19 | mode: 0640 20 | 21 | - name: Write node password 22 | ansible.builtin.copy: 23 | dest: /etc/rancher/node/password 24 | content: "{{ vault_k3s_node_password }}" 25 | owner: root 26 | group: root 27 | mode: 0640 # normal k3s install is 644 but that doesn't feel right 28 | 29 | - name: Start/restart k3s agent 30 | when: _k3s_agent_token_result.changed 31 | ansible.builtin.systemd: 32 | name: k3s-agent 33 | daemon_reload: true 34 | state: restarted 35 | enabled: true 36 | -------------------------------------------------------------------------------- /ansible/roles/k3s/tasks/server-runtime.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Template k3s env file 4 | ansible.builtin.template: 5 | dest: /etc/systemd/system/k3s.service.env 6 | src: k3s.service.env.j2 7 | register: _k3s_env_file_status 8 | 9 | - name: Start k3s server 10 | ansible.builtin.systemd: 11 | name: k3s 12 | daemon_reload: "{{ _k3s_env_file_status.changed }}" 13 | state: started 14 | enabled: true 15 | 16 | # Possible race here as there is a delay between agents disconnecting and being registered as down, probably won't be hit in general use though 17 | - name: Check which k3s agents are connected 18 | ansible.builtin.shell: 19 | cmd: kubectl get nodes --no-headers | grep -w Ready 20 | register: _k3s_connected_nodes 21 | retries: 6 # task may fail if server is not ready yet 22 | delay: 10 23 | until: not _k3s_connected_nodes.failed 24 | 25 | - when: _k3s_connected_nodes.stdout_lines | length != groups['k3s'] | length 26 | block: 27 | - name: Generate new bootstrap token if not all agents are connected 28 | no_log: true 29 | shell: 30 | cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}" 31 | register: 
_k3s_token_output 32 | 33 | - name: Set bootstrap token as fact 34 | set_fact: 35 | k3s_bootstrap_token: "{{ _k3s_token_output.stdout }}" 36 | -------------------------------------------------------------------------------- /ansible/roles/k3s/templates/k3s-agent.service.env.j2: -------------------------------------------------------------------------------- 1 | K3S_NODE_IP={{ ansible_host }} 2 | K3S_TOKEN={{ k3s_bootstrap_token }} 3 | K3S_URL=https://{{ k3s_server_name }}:6443 4 | -------------------------------------------------------------------------------- /ansible/roles/k3s/templates/k3s.service.env.j2: -------------------------------------------------------------------------------- 1 | K3S_NODE_IP={{ ansible_host }} 2 | -------------------------------------------------------------------------------- /ansible/roles/k9s/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Check if k9s is installed 4 | ansible.builtin.stat: 5 | path: "/usr/bin/k9s" 6 | register: _k9s_stat_result 7 | 8 | - name: Install k9s and clean up temporary files 9 | block: 10 | - name: Create install directory 11 | ansible.builtin.file: 12 | path: /tmp/k9s 13 | state: directory 14 | owner: root 15 | group: root 16 | mode: "744" 17 | when: not _k9s_stat_result.stat.exists 18 | 19 | - name: Download k9s 20 | ansible.builtin.get_url: 21 | url: https://github.com/derailed/k9s/releases/download/v0.32.5/k9s_Linux_amd64.tar.gz 22 | dest: /tmp/k9s/k9s_Linux_amd64.tar.gz 23 | owner: root 24 | group: root 25 | mode: "744" 26 | 27 | - name: Unpack k9s binary 28 | ansible.builtin.unarchive: 29 | src: /tmp/k9s/k9s_Linux_amd64.tar.gz 30 | dest: /tmp/k9s 31 | remote_src: yes 32 | 33 | - name: Add k9s to root path 34 | ansible.builtin.copy: 35 | src: /tmp/k9s/k9s 36 | dest: /usr/bin/k9s 37 | mode: u+rwx 38 | remote_src: yes 39 | 40 | - name: Cleanup k9s install directory 41 | ansible.builtin.file: 42 | path: /tmp/k9s 43 | state: absent 44 | when: not _k9s_stat_result.stat.exists 45 | -------------------------------------------------------------------------------- /ansible/roles/lustre/defaults/main.yml: -------------------------------------------------------------------------------- 1 | lustre_repo: https://github.com/stackhpc/lustre-release.git 2 | lustre_version: '2.15.6/lu-18085' # Fixes https://jira.whamcloud.com/browse/LU-18085 3 | lustre_lnet_label: tcp 4 | #lustre_mgs_nid: 5 | lustre_mounts: [] 6 | lustre_mount_state: mounted 7 | lustre_mount_options: 'defaults,_netdev,noauto,x-systemd.automount,x-systemd.requires=lnet.service,nosuid,nodev' 8 | 9 | # below variables are for build and should not generally require changes 10 | lustre_git_repo: "git://git.whamcloud.com/fs/lustre-release.git" 11 | lustre_build_packages: 12 | - "kernel-devel-{{ ansible_kernel }}" 13 | - git 14 | - gcc 15 | - libtool 16 | - python3 17 | - python3-devel 18 | - openmpi 19 | - elfutils-libelf-devel 20 | - libmount-devel 21 | - libnl3-devel 22 | - libyaml-devel 23 | - rpm-build 24 | - kernel-abi-stablelists 25 | - libaio 26 | - libaio-devel 27 | lustre_build_dir: /tmp/lustre-release 28 | lustre_configure_opts: 29 | - --disable-server 30 | - --with-linux=/usr/src/kernels/* 31 | - --with-o2ib=/usr/src/ofa_kernel/default 32 | - --disable-maintainer-mode 33 | - --disable-gss-keyring 34 | - --enable-mpitests=no 35 | lustre_rpm_globs: # NB: order is important here, as not installing from a repo 36 | - "kmod-lustre-client-{{ lustre_version | split('.') | first }}*" # only take part of the 
version as -RC versions produce _RC rpms 37 | - "lustre-client-{{ lustre_version | split('.') | first }}*" 38 | lustre_build_cleanup: true 39 | -------------------------------------------------------------------------------- /ansible/roles/lustre/tasks/configure.yml: -------------------------------------------------------------------------------- 1 | - name: Gather Lustre interface info 2 | shell: 3 | cmd: | 4 | ip --json r get {{ _lustre_mgs_ip }} 5 | changed_when: false 6 | register: _lustre_ip_r_mgs 7 | vars: 8 | _lustre_mgs_ip: "{{ lustre_mgs_nid | split('@') | first }}" 9 | 10 | - name: Set facts for Lustre interface 11 | set_fact: 12 | _lustre_interface: "{{ _lustre_ip_r_mgs_info.dev }}" 13 | _lustre_ip: "{{ _lustre_ip_r_mgs_info.prefsrc }}" 14 | vars: 15 | _lustre_ip_r_mgs_info: "{{ _lustre_ip_r_mgs.stdout | from_json | first }}" 16 | 17 | - name: Write LNet configuration file 18 | template: 19 | src: lnet.conf.j2 20 | dest: /etc/lnet.conf # exists from package install, expected by lnet service 21 | owner: root 22 | group: root 23 | mode: u=rw,go=r # from package install 24 | register: _lnet_conf 25 | 26 | - name: Ensure lnet service state 27 | systemd: 28 | name: lnet 29 | state: "{{ 'restarted' if _lnet_conf.changed else 'started' }}" 30 | 31 | - name: Ensure mount points exist 32 | ansible.builtin.file: 33 | path: "{{ item.mount_point }}" 34 | state: directory 35 | loop: "{{ lustre_mounts }}" 36 | when: "(item.mount_state | default(lustre_mount_state)) != 'absent'" 37 | 38 | - name: Mount lustre filesystem 39 | ansible.posix.mount: 40 | fstype: lustre 41 | src: "{{ lustre_mgs_nid }}:/{{ item.fs_name }}" 42 | path: "{{ item.mount_point }}" 43 | state: "{{ (item.mount_state | default(lustre_mount_state)) }}" 44 | opts: "{{ item.mount_options | default(lustre_mount_options) }}" 45 | loop: "{{ lustre_mounts }}" 46 | -------------------------------------------------------------------------------- /ansible/roles/lustre/tasks/install.yml: -------------------------------------------------------------------------------- 1 | - name: Install lustre build prerequisites 2 | ansible.builtin.dnf: 3 | name: "{{ lustre_build_packages }}" 4 | register: _lustre_dnf_build_packages 5 | 6 | - name: Clone lustre git repo 7 | ansible.builtin.git: 8 | repo: "{{ lustre_repo }}" 9 | dest: "{{ lustre_build_dir }}" 10 | version: "{{ lustre_version }}" 11 | 12 | - name: Prepare for lustre configuration 13 | ansible.builtin.command: 14 | cmd: sh ./autogen.sh 15 | chdir: "{{ lustre_build_dir }}" 16 | 17 | - name: Configure lustre build 18 | ansible.builtin.command: 19 | cmd: "./configure {{ lustre_configure_opts | join(' ') }}" 20 | chdir: "{{ lustre_build_dir }}" 21 | 22 | - name: Build lustre 23 | ansible.builtin.command: 24 | cmd: make rpms 25 | chdir: "{{ lustre_build_dir }}" 26 | 27 | - name: Find rpms 28 | ansible.builtin.find: 29 | paths: "{{ lustre_build_dir }}" 30 | patterns: "{{ lustre_rpm_globs }}" 31 | use_regex: false 32 | register: _lustre_find_rpms 33 | 34 | - name: Check rpms found 35 | assert: 36 | that: _lustre_find_rpms.files | length 37 | fail_msg: "No lustre repos found with lustre_rpm_globs = {{ lustre_rpm_globs }}" 38 | 39 | - name: Install lustre rpms 40 | ansible.builtin.dnf: 41 | name: "{{ _lustre_find_rpms.files | map(attribute='path')}}" 42 | disable_gpg_check: yes 43 | 44 | - name: Delete lustre build dir 45 | file: 46 | path: "{{ lustre_build_dir }}" 47 | state: absent 48 | when: lustre_build_cleanup | bool 49 | 
-------------------------------------------------------------------------------- /ansible/roles/lustre/tasks/validate.yml: -------------------------------------------------------------------------------- 1 | - name: Check kernel-devel package is installed 2 | command: "dnf list --installed kernel-devel-{{ ansible_kernel }}" 3 | changed_when: false 4 | # NB: we don't check here the kernel will remain the same after reboot etc, see ofed/install.yml 5 | 6 | - name: Ensure SELinux in permissive mode 7 | assert: 8 | that: selinux_state in ['permissive', 'disabled'] 9 | fail_msg: "SELinux must be permissive for Lustre not '{{ selinux_state }}'; see variable selinux_state" 10 | 11 | - name: Ensure lustre_mgs_nid is defined 12 | assert: 13 | that: lustre_mgs_nid is defined 14 | fail_msg: Variable lustre_mgs_nid must be defined 15 | 16 | - name: Ensure lustre_mounts entries define filesystem name and mount point 17 | assert: 18 | that: 19 | - item.fs_name is defined 20 | - item.mount_point is defined 21 | fail_msg: All lustre_mounts entries must specify fs_name and mount_point 22 | loop: "{{ lustre_mounts }}" 23 | -------------------------------------------------------------------------------- /ansible/roles/lustre/templates/lnet.conf.j2: -------------------------------------------------------------------------------- 1 | net: 2 | - net type: {{ lustre_lnet_label }} 3 | local NI(s): 4 | - nid: {{ _lustre_ip }}@{{ lustre_lnet_label }} 5 | interfaces: 6 | 0: {{ _lustre_interface }} 7 | -------------------------------------------------------------------------------- /ansible/roles/mysql/defaults/main.yml: -------------------------------------------------------------------------------- 1 | # required: 2 | # mysql_root_password: # TODO: make it possible to CHANGE root password 3 | 4 | mysql_tag: 8.0.30 5 | mysql_systemd_service_enabled: yes 6 | #mysql_state: # default is started or restarted as required 7 | mysql_podman_user: "{{ ansible_user }}" 8 | mysql_datadir: /var/lib/mysql 9 | mysql_mysqld_options: [] # list of str options to mysqld, see `run -it --rm mysql:tag --verbose --help` 10 | mysql_users: [] # list of dicts for community.mysql.mysql_user 11 | mysql_databases: [] # list of dicts for community.mysql.mysql_db 12 | -------------------------------------------------------------------------------- /ansible/roles/mysql/tasks/configure.yml: -------------------------------------------------------------------------------- 1 | - name: Create environment file for mysql server root password 2 | # NB: This doesn't trigger a restart on changes as it will be ignored once mysql is initialised 3 | copy: 4 | dest: /etc/sysconfig/mysqld 5 | content: | 6 | MYSQL_INITIAL_ROOT_PASSWORD='{{ mysql_root_password }}' 7 | owner: root 8 | group: root 9 | mode: u=rw,go= 10 | 11 | - name: Ensure mysql service state 12 | systemd: 13 | name: mysql 14 | state: "{{ mysql_state | default('restarted' if _mysql_unitfile.changed else 'started') }}" 15 | enabled: "{{ mysql_systemd_service_enabled }}" 16 | daemon_reload: "{{ _mysql_unitfile.changed }}" 17 | 18 | - block: 19 | - name: Wait for mysql to initialise 20 | # NB: It is not sufficent to wait_for the port 21 | community.mysql.mysql_info: 22 | login_user: root 23 | login_password: "{{ mysql_root_password }}" 24 | no_log: "{{ no_log | default(true) }}" 25 | register: _mysql_info 26 | until: "'version' in _mysql_info" 27 | retries: 90 28 | delay: 2 29 | 30 | - name: Ensure mysql databases created 31 | community.mysql.mysql_db: "{{ item }}" 32 | loop: "{{ mysql_databases}}" 
33 | 34 | - name: Ensure mysql users present 35 | community.mysql.mysql_user: "{{ item }}" 36 | loop: "{{ mysql_users }}" 37 | when: "mysql_state | default('unspecified') != 'stopped'" 38 | -------------------------------------------------------------------------------- /ansible/roles/mysql/tasks/install.yml: -------------------------------------------------------------------------------- 1 | - name: Install pip 2 | dnf: 3 | name: python3-pip 4 | 5 | - name: Install python mysql client 6 | pip: 7 | name: 8 | - pymysql 9 | - cryptography 10 | state: present 11 | 12 | - name: Create systemd mysql container unit file 13 | template: 14 | dest: /etc/systemd/system/mysql.service 15 | src: mysql.service.j2 16 | register: _mysql_unitfile 17 | 18 | - name: Pull container image 19 | containers.podman.podman_image: 20 | name: docker.io/library/mysql 21 | tag: "{{ mysql_tag }}" 22 | become_user: "{{ mysql_podman_user }}" 23 | -------------------------------------------------------------------------------- /ansible/roles/mysql/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - import_tasks: install.yml 2 | - import_tasks: configure.yml 3 | -------------------------------------------------------------------------------- /ansible/roles/ofed/README.md: -------------------------------------------------------------------------------- 1 | # ofed 2 | 3 | This role installs Mellanox OFED: 4 | - It checks that the running kernel is the latest installed one, and errors if not. 5 | - Installation uses the `mlnxofedinstall` command, with support for the running kernel 6 | and (by default) without firmware updates. 7 | 8 | As OFED installation takes a long time generally this should only be used during image build, 9 | for example by setting: 10 | 11 | ``` 12 | environments/groups//groups: 13 | [ofed:children] 14 | builder 15 | ``` 16 | 17 | # Role variables 18 | 19 | See `defaults/main.yml` 20 | 21 | Note ansible facts are required, unless setting `ofed_distro_version` and `ofed_arch` specifically. 22 | -------------------------------------------------------------------------------- /ansible/roles/ofed/defaults/main.yml: -------------------------------------------------------------------------------- 1 | ofed_version: '23.10-3.2.2.0' # LTS 2 | ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz 3 | ofed_distro: rhel # NB: not expected to work on other distros due to installation differences 4 | ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9' 5 | ofed_distro_major_version: "{{ ansible_distribution_major_version }}" # e.g. 
'8' 6 | ofed_arch: "{{ ansible_architecture }}" 7 | ofed_tmp_dir: /tmp 8 | ofed_update_firmware: false 9 | ofed_build_packages: # may require additional packages depending on ofed_package_selection 10 | - autoconf 11 | - automake 12 | - gcc 13 | - gcc-gfortran 14 | - kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }} 15 | - kernel-rpm-macros 16 | - libtool 17 | - lsof 18 | - patch 19 | - pciutils 20 | - perl 21 | - rpm-build 22 | - tcl 23 | - tk 24 | ofed_build_rl8_packages: 25 | - gdb-headless 26 | - python36 27 | ofed_package_selection: # list of package selection flags for mlnxofedinstall script 28 | - hpc 29 | - with-nfsrdma 30 | -------------------------------------------------------------------------------- /ansible/roles/ofed/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - include_tasks: install.yml 2 | -------------------------------------------------------------------------------- /ansible/roles/openondemand/files/missing_home_directory.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Home Directory Not Found 5 | 35 | 36 | 37 |

[HTML markup lost in extraction. The page body displays the heading "Home directory not found", the message "Your home directory appears to be missing. If this is the first time you have logged in with this account, you may need to access our systems using SSH in order to trigger the creation of your home directory.", and two buttons: "Open Shell to create home directory" and "Restart Web Server".]
48 | 49 | 50 | -------------------------------------------------------------------------------- /ansible/roles/openondemand/tasks/config_changes.yml: -------------------------------------------------------------------------------- 1 | - name: Add Apache directives for node_uri forwarding 2 | blockinfile: 3 | path: /opt/ood/ood-portal-generator/templates/ood-portal.conf.erb 4 | block: "{{ openondemand_node_proxy_directives }}" 5 | insertafter: ' Header edit Set-Cookie "\^\(\[\^;\]\+\)" "\$1; Path=<%= @node_uri %>\/%{MATCH_HOST}e\/%{MATCH_PORT}e"' 6 | when: openondemand_node_proxy_directives 7 | -------------------------------------------------------------------------------- /ansible/roles/openondemand/tasks/exporter.yml: -------------------------------------------------------------------------------- 1 | - name: Install ondemand prometheus exporter 2 | yum: 3 | name: ondemand_exporter 4 | when: openondemand_exporter 5 | 6 | - name: Start and enable ondemand prometheus exporter 7 | service: 8 | name: ondemand_exporter 9 | enabled: true 10 | state: started 11 | -------------------------------------------------------------------------------- /ansible/roles/openondemand/tasks/jupyter_compute.yml: -------------------------------------------------------------------------------- 1 | # Should be run on compute nodes you want to run jupyter notebook on 2 | # See https://osc.github.io/ood-documentation/latest/app-development/tutorials-interactive-apps/add-jupyter/software-requirements.html 3 | # - Will already have openssl and lmod 4 | 5 | - name: Ensure python3.9 installed 6 | dnf: 7 | name: python39 8 | tags: install 9 | 10 | - name: Install jupyter venv 11 | # Requires separate step so that the upgraded pip is used to install packages 12 | pip: 13 | name: pip 14 | state: latest 15 | virtualenv: /opt/jupyter-py39 16 | virtualenv_command: python3.9 -m venv 17 | tags: install 18 | 19 | - name: Copy jupyter requirements file 20 | copy: 21 | src: jupyter_requirements.txt 22 | dest: /opt/jupyter-py39/jupyter_requirements.txt 23 | tags: install 24 | 25 | - name: Install jupyter package in venv 26 | pip: 27 | virtualenv: /opt/jupyter-py39 28 | virtualenv_command: python3.9 -m venv 29 | requirements: /opt/jupyter-py39/jupyter_requirements.txt 30 | tags: install 31 | 32 | 33 | -------------------------------------------------------------------------------- /ansible/roles/openondemand/tasks/pam_auth.yml: -------------------------------------------------------------------------------- 1 | # https://osc.github.io/ood-documentation/latest/authentication/pam.html 2 | --- 3 | - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build 4 | yum: 5 | name: mod_authnz_pam 6 | 7 | - name: Enable Apache PAM module 8 | lineinfile: 9 | path: /etc/httpd/conf.modules.d/55-authnz_pam.conf 10 | line: LoadModule authnz_pam_module modules/mod_authnz_pam.so 11 | regexp: ^LoadModule authnz_pam_module modules/mod_authnz_pam.so 12 | 13 | - name: Set PAM service # TODO: might need subsequent modification?? 
14 | command: 15 | cmd: cp /etc/pam.d/sshd /etc/pam.d/ood 16 | creates: /etc/pam.d/ood 17 | 18 | - name: Allow the Apache user to read /etc/shadow 19 | file: 20 | path: /etc/shadow 21 | mode: 0640 22 | group: apache 23 | 24 | - name: Allow httpd access to PAM in SELinux 25 | ansible.posix.seboolean: 26 | name: httpd_mod_auth_pam 27 | state: yes 28 | persistent: yes 29 | when: ansible_facts.selinux.status == 'enabled' 30 | 31 | # TODO: do we need to restart OOD here?? 32 | -------------------------------------------------------------------------------- /ansible/roles/openondemand/tasks/validate.yml: -------------------------------------------------------------------------------- 1 | - name: Check Open Ondemand servername is defined 2 | assert: 3 | that: openondemand_servername != '' 4 | fail_msg: "Variable `openondemand_servername` must be set on openondemand and (by default) grafana hosts. See ansible/roles/openondemand/README.md" 5 | -------------------------------------------------------------------------------- /ansible/roles/openondemand/templates/dashboard_app_links.yml.j2: -------------------------------------------------------------------------------- 1 | name: "{{ item.name }}" 2 | category: "{{ item.category }}" 3 | description: "{{ item.description }}" 4 | icon: "{{ item.icon | default('fa://clock-o') }}" 5 | url: "{{ item.url }}" 6 | new_window: "{{ item.get('new_window', false) }}" 7 | -------------------------------------------------------------------------------- /ansible/roles/openondemand/templates/files_shortcuts.rb.j2: -------------------------------------------------------------------------------- 1 | # Template to add additional shortcuts to the Files dashboard app 2 | # See https://osc.github.io/ood-documentation/master/customization.html#add-shortcuts-to-files-menu 3 | 4 | OodFilesApp.candidate_favorite_paths.tap do |paths| 5 | {% for path in openondemand_filesapp_paths %} 6 | paths << Pathname.new("{{ path }}") 7 | {% endfor %} 8 | end 9 | -------------------------------------------------------------------------------- /ansible/roles/openondemand/templates/grid-mapfile.j2: -------------------------------------------------------------------------------- 1 | {% for user in openondemand_mapping_users %} 2 | {% if 'openondemand_username' in user %} 3 | "{{ user.openondemand_username }}" {{ user.name }} 4 | {% endif %} 5 | {% endfor %} 6 | -------------------------------------------------------------------------------- /ansible/roles/opensearch/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Used to set passwords 3 | #opensearch_internal_users_path: 4 | 5 | opensearch_podman_user: "{{ ansible_user }}" 6 | opensearch_version: '2.9.0' # https://hub.docker.com/r/opensearchproject/opensearch/tags 7 | opensearch_config_path: /usr/share/opensearch/config 8 | opensearch_data_path: /usr/share/opensearch/data 9 | opensearch_state: started # will be restarted if required 10 | opensearch_systemd_service_enabled: true 11 | opensearch_certs_duration: "{{ 365 * 10 }}" # days validity for self-signed certs 12 | opensearch_debug: false 13 | -------------------------------------------------------------------------------- /ansible/roles/opensearch/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Restart opensearch service 4 | systemd: 5 | name: opensearch.service 6 | state: "{{ 'restarted' if 'started' in opensearch_state else opensearch_state }}" 7 | 
enabled: "{{ opensearch_systemd_service_enabled }}" 8 | become: true 9 | -------------------------------------------------------------------------------- /ansible/roles/opensearch/tasks/archive_data.yml: -------------------------------------------------------------------------------- 1 | # Remove data which was NOT indexed by Slurm Job ID 2 | # It will be re-ingested by filebeat from the slurmdbd, with that index 3 | 4 | - name: Ensure opensearch stopped 5 | systemd: 6 | name: opensearch 7 | state: stopped 8 | register: _opensearch_stop 9 | until: "_opensearch_stop.status.ActiveState in ['inactive', 'failed']" 10 | retries: 15 11 | delay: 5 12 | 13 | - name: Archive existing data 14 | community.general.archive: 15 | path: "{{ opensearch_data_path }}" 16 | dest: "{{ opensearch_data_path | dirname }}/data-{{ lookup('pipe', 'date --iso-8601=minutes') }}.tar.gz" 17 | remove: true 18 | -------------------------------------------------------------------------------- /ansible/roles/opensearch/tasks/install.yml: -------------------------------------------------------------------------------- 1 | # safe to use during build 2 | 3 | - name: Increase maximum number of virtual memory maps 4 | # see https://opensearch.org/docs/2.0/opensearch/install/important-settings/ 5 | ansible.posix.sysctl: 6 | name: vm.max_map_count 7 | value: '262144' 8 | state: present 9 | reload: yes 10 | 11 | - name: Create systemd unit file 12 | template: 13 | dest: /etc/systemd/system/opensearch.service 14 | src: opensearch.service.j2 15 | register: _opensearch_unit 16 | 17 | - name: Pull container image 18 | containers.podman.podman_image: 19 | name: docker.io/opensearchproject/opensearch 20 | tag: "{{ opensearch_version }}" 21 | become_user: "{{ opensearch_podman_user }}" 22 | 23 | - name: Reload opensearch unit file 24 | command: systemctl daemon-reload 25 | when: _opensearch_unit.changed 26 | -------------------------------------------------------------------------------- /ansible/roles/opensearch/tasks/migrate-opendistro.yml: -------------------------------------------------------------------------------- 1 | # Migrate data from existing containerised opendistro v1.12.0 to containerised opensearch 2.1.0. 2 | # 3 | # This relies on: 4 | # - Both opendistro and opensearch using host directories for data. See `_default_opendistro_data_path` below 5 | # - Pre-upgrade group `opendistro` and current group `opensearch` containing the same host. 6 | # 7 | # NB: If `opendistro_data_path` was set to something non-default it MUST be set again in the `opensearch` group_vars, 8 | # as the `opendistro` group will not exist in the groups. 9 | 10 | # NB: This deliberately does not remove the opendistro data - this could be done manually if required. 
11 | 12 | - name: Stop opendistro 13 | ansible.builtin.systemd: 14 | name: opendistro.service 15 | state: stopped 16 | enabled: false 17 | 18 | - name: Copy opendistro data directory 19 | ansible.builtin.copy: 20 | remote_src: true 21 | src: "{{ opendistro_data_path | default(_default_opendistro_data_path) }}" 22 | dest: "{{ opensearch_data_path | dirname }}/" # copying a directory, so need to specify the parent for destination 23 | owner: "{{ opensearch_podman_user }}" 24 | group: "{{ opensearch_podman_user }}" 25 | mode: 0770 26 | vars: 27 | # from environments/common/inventory/group_vars/all/opendistro.yml: 28 | _default_opendistro_data_path: "{{ appliances_state_dir | default('/usr/share') }}/elasticsearch/data" 29 | -------------------------------------------------------------------------------- /ansible/roles/passwords/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Template passwords 4 | template: 5 | src: passwords.yml 6 | dest: "{{ openhpc_passwords_output_path }}" 7 | delegate_to: localhost 8 | run_once: true 9 | -------------------------------------------------------------------------------- /ansible/roles/passwords/tasks/validate.yml: -------------------------------------------------------------------------------- 1 | - name: Assert secrets created 2 | assert: 3 | that: (hostvars[inventory_hostname].keys() | select('contains', 'vault_') | length) > 1 # 1 as may have vault_demo_user_password defined in dev 4 | fail_msg: "No inventory variables 'vault_*' found: Has ansible/adhoc/generate-passwords.yml been run?" 5 | -------------------------------------------------------------------------------- /ansible/roles/passwords/templates/passwords.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # {{ ansible_managed }} 3 | {{ slurm_appliance_secrets | to_nice_yaml }} -------------------------------------------------------------------------------- /ansible/roles/persist_hostkeys/README.md: -------------------------------------------------------------------------------- 1 | # persist_hostkeys 2 | 3 | Idempotently generates a persistent set of hostkeys and restores them after a rebuild/reimage. 4 | 5 | Add hosts to the `persist_hostkeys` group to enable. All hosts in group will share the same set hostkeys. 
6 | -------------------------------------------------------------------------------- /ansible/roles/persist_hostkeys/defaults/main.yml: -------------------------------------------------------------------------------- 1 | persist_hostkeys_state_server: "{{ groups['control'] | first }}" 2 | persist_hostkeys_state_dir: "{{ hostvars[persist_hostkeys_state_server]['appliances_state_dir'] }}/hostkeys" 3 | -------------------------------------------------------------------------------- /ansible/roles/persist_hostkeys/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Generate persistent hostkeys in state directory 4 | delegate_to: "{{ persist_hostkeys_state_server }}" 5 | block: 6 | - name: Ensure hostkeys directory exists on persistent storage 7 | file: 8 | path: "{{ persist_hostkeys_state_dir }}" 9 | state: directory 10 | owner: root 11 | group: root 12 | mode: 0600 13 | 14 | - name: Check for existing hostkeys 15 | find: 16 | paths: "{{ persist_hostkeys_state_dir }}/" 17 | register: _files_found 18 | 19 | - name: Generate hostkeys 20 | when: _files_found.matched == 0 21 | shell: 22 | # ssh-keygen -A needs a directory with an /etc/ssh suffix to write hostkeys into 23 | cmd: | 24 | mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh 25 | ssh-keygen -A -N '' -f {{ persist_hostkeys_state_dir }} 26 | mv {{ persist_hostkeys_state_dir }}/etc/ssh/* {{ persist_hostkeys_state_dir }} 27 | rm -rf {{ persist_hostkeys_state_dir }}/etc/ssh 28 | 29 | - name: Get created key names 30 | find: 31 | path: "{{ persist_hostkeys_state_dir }}/" 32 | register: _find_ssh_keys 33 | 34 | - name: Create in-memory copies of keys 35 | ansible.builtin.slurp: 36 | src: "{{ item.path }}" 37 | loop: "{{ _find_ssh_keys.files }}" 38 | register: _slurp_keys 39 | 40 | - name: Copy keys to hosts 41 | no_log: true 42 | copy: 43 | content: "{{ item.content | b64decode }}" 44 | dest: "/etc/ssh/{{ item.source | regex_search('[^/]+$') }}" 45 | loop: "{{ _slurp_keys.results }}" 46 | 47 | - meta: reset_connection 48 | -------------------------------------------------------------------------------- /ansible/roles/persist_openhpc_secrets/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Check if OpenHPC secrets exist in persistent storage 4 | stat: 5 | path: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" 6 | register: openhpc_secrets_stat 7 | 8 | - name: Ensure Ansible facts directories exist 9 | file: 10 | path: "{{ item }}" 11 | state: directory 12 | owner: root 13 | mode: 0600 14 | loop: 15 | - "{{ appliances_state_dir }}/ansible.facts.d" 16 | - "/etc/ansible/facts.d" 17 | 18 | - name: Write OpenHPC secrets 19 | template: 20 | src: openhpc_secrets.fact 21 | dest: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" 22 | owner: root 23 | mode: 0600 24 | when: "not openhpc_secrets_stat.stat.exists" 25 | 26 | - name: Symlink persistent facts to facts_path 27 | file: 28 | state: link 29 | src: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" 30 | dest: /etc/ansible/facts.d/openhpc_secrets.fact 31 | owner: root 32 | 33 | - name: Read facts 34 | ansible.builtin.setup: 35 | filter: ansible_local 36 | -------------------------------------------------------------------------------- /ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact: -------------------------------------------------------------------------------- 1 | { 2 | 
"vault_azimuth_user_password": "{{ lookup('password', '/dev/null') }}", 3 | "vault_grafana_admin_password": "{{ lookup('password', '/dev/null') }}", 4 | "vault_elasticsearch_admin_password": "{{ lookup('password', '/dev/null') }}", 5 | "vault_elasticsearch_kibana_password": "{{ lookup('password', '/dev/null') }}", 6 | "vault_mysql_root_password": "{{ lookup('password', '/dev/null') }}", 7 | "vault_mysql_slurm_password": "{{ lookup('password', '/dev/null') }}", 8 | "vault_openhpc_mungekey": "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\s+', '') }}" 9 | } 10 | -------------------------------------------------------------------------------- /ansible/roles/podman/defaults/main.yml: -------------------------------------------------------------------------------- 1 | podman_users: 2 | - name: "{{ ansible_user }}" 3 | -------------------------------------------------------------------------------- /ansible/roles/podman/tasks/prereqs.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install OS packages 3 | yum: 4 | name: 5 | - podman 6 | - python3 7 | state: installed 8 | become: true -------------------------------------------------------------------------------- /ansible/roles/proxy/README.md: -------------------------------------------------------------------------------- 1 | # proxy 2 | 3 | Define http/s proxy configuration. 4 | 5 | ## Role variables 6 | 7 | - `proxy_http_proxy`: Required. Address of http proxy. E.g. "http://10.1.0.28:3128" for a Squid proxy on default port. 8 | - `proxy_https_proxy`: Optional. Address of https proxy. Default is `{{ proxy_http_proxy }}`. 9 | - `proxy_no_proxy_extra`: Optional. List of additional addresses not to proxy. Will be combined with default list which includes `inventory_hostname` (for hostnames) and `ansible_host` (for host IPs) for all Ansible hosts. 10 | - `proxy_dnf`: Optional bool. Whether to configure yum/dnf proxying through `proxy_http_proxy`. Default `true`. 11 | - `proxy_systemd`: Optional bool. Whether to give processes started by systemd the above http, https and no_proxy configuration. **NB** Running services will need restarting if this is changed. Default `true`. 
12 | -------------------------------------------------------------------------------- /ansible/roles/proxy/defaults/main.yml: -------------------------------------------------------------------------------- 1 | # proxy_http_proxy: 2 | proxy_https_proxy: "{{ proxy_http_proxy }}" 3 | proxy_no_proxy_defaults: "{{ ['localhost', '127.0.0.1'] + groups['all'] + hostvars.values() | map(attribute='ansible_host') }}" 4 | proxy_no_proxy_extras: [] 5 | proxy_no_proxy: "{{ (proxy_no_proxy_defaults + proxy_no_proxy_extras) | unique | sort | join(',') }}" 6 | proxy_dnf: true 7 | proxy_systemd: true 8 | -------------------------------------------------------------------------------- /ansible/roles/pulp_site/.gitignore: -------------------------------------------------------------------------------- 1 | filter_plugins/__pycache__ -------------------------------------------------------------------------------- /ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py: -------------------------------------------------------------------------------- 1 | class FilterModule(object): 2 | def filters(self): 3 | return { 4 | 'to_rpm_repos': self.to_rpm_repos, 5 | 'to_rpm_pubs': self.to_rpm_pubs, 6 | 'to_rpm_distros': self.to_rpm_distros 7 | } 8 | 9 | def to_rpm_repos(self, list, pulp_url): 10 | repo_list = map(lambda x: { 11 | 'name': x['name'], 12 | 'url': pulp_url+'/'+x['subpath'], 13 | 'remote_username': x['remote_username'], 14 | 'remote_password': x['remote_password'], 15 | 'policy': x['policy'], 16 | 'state': x['state'] }, list) 17 | return repo_list 18 | 19 | def to_rpm_pubs(self, list): 20 | pub_list = map(lambda x: { 21 | 'repository': x['name'], 22 | 'state': x['state'] }, list) 23 | return pub_list 24 | 25 | def to_rpm_distros(self, list): 26 | distro_list = map(lambda x: { 27 | 'name': x['name'], 28 | 'repository': x['name'], 29 | 'base_path': x['subpath'], 30 | 'state': x['state'] }, list) 31 | return distro_list -------------------------------------------------------------------------------- /ansible/roles/pulp_site/tasks/install.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Install packages 4 | dnf: 5 | name: 6 | - podman 7 | 8 | - name: Create install directories 9 | ansible.builtin.file: 10 | state: directory 11 | path: "{{ pulp_site_install_dir }}/{{ item }}" 12 | loop: 13 | - settings/certs 14 | - pulp_storage 15 | - pgsql 16 | - containers 17 | 18 | - name: Template settings file 19 | ansible.builtin.template: 20 | src: settings.py.j2 21 | dest: "{{ pulp_site_install_dir }}/settings/settings.py" 22 | 23 | - name: Install pulp podman container 24 | containers.podman.podman_container: 25 | name: pulp 26 | publish: 27 | - "{{ pulp_site_port }}:80" 28 | volume: 29 | - "{{ pulp_site_install_dir }}/settings:/etc/pulp{{ pulp_site_selinux_suffix }}" 30 | - "{{ pulp_site_install_dir }}/pulp_storage:/var/lib/pulp{{ pulp_site_selinux_suffix }}" 31 | - "{{ pulp_site_install_dir }}/pgsql:/var/lib/pgsql{{ pulp_site_selinux_suffix }}" 32 | - "{{ pulp_site_install_dir }}/containers:/var/lib/containers{{ pulp_site_selinux_suffix }}" 33 | device: /dev/fuse 34 | image: docker.io/pulp/pulp:3.68.1 35 | 36 | - name: Reset admin password once container has initialised 37 | no_log: true 38 | ansible.builtin.shell: 39 | cmd: "podman exec pulp bash -c 'pulpcore-manager reset-admin-password -p {{ pulp_site_password }}'" 40 | register: _admin_reset_output 41 | until: 0 == _admin_reset_output.rc 42 | retries: 6 43 | delay: 30 44 | 
-------------------------------------------------------------------------------- /ansible/roles/pulp_site/templates/cli.toml.j2: -------------------------------------------------------------------------------- 1 | [cli] 2 | base_url = "{{ pulp_site_url }}" 3 | username = "{{ pulp_site_username }}" 4 | password = "{{ pulp_site_password }}" 5 | api_root = "/pulp/" 6 | domain = "default" 7 | headers = [] 8 | cert = "" 9 | key = "" 10 | verify_ssl = true 11 | format = "json" 12 | dry_run = false 13 | timeout = 0 14 | verbose = 0 15 | -------------------------------------------------------------------------------- /ansible/roles/pulp_site/templates/settings.py.j2: -------------------------------------------------------------------------------- 1 | CONTENT_ORIGIN='http://{{ ansible_fqdn }}:{{ pulp_site_port }}' 2 | TOKEN_AUTH_DISABLED=True 3 | -------------------------------------------------------------------------------- /ansible/roles/rebuild/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | rebuild_clouds_path: ~/.config/openstack/clouds.yaml 4 | 5 | rebuild_job_partitions: rebuild 6 | rebuild_job_name: "rebuild-{{ item }}" # item is nodename 7 | rebuild_job_command: 'sleep 5' 8 | rebuild_job_reboot: true 9 | rebuild_job_options: '' 10 | rebuild_job_user: root 11 | rebuild_job_template: >- 12 | sbatch 13 | --nodelist={{ item }} 14 | {{ '--reboot' if rebuild_job_reboot | bool else '' }} 15 | --job-name={{ rebuild_job_name }} 16 | --nodes=1 17 | --exclusive 18 | --partition={{ _rebuild_job_current_partition }} 19 | --no-requeue 20 | --output=/dev/null 21 | --wrap="{{ rebuild_job_command }}" 22 | {{ rebuild_job_options }} 23 | #rebuild_job_hostlist: -------------------------------------------------------------------------------- /ansible/roles/rebuild/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Create /etc/openstack 4 | file: 5 | path: /etc/openstack 6 | state: directory 7 | owner: slurm 8 | group: root 9 | mode: u=rX,g=rwX 10 | 11 | - name: Copy out clouds.yaml 12 | copy: 13 | src: "{{ rebuild_clouds_path }}" 14 | dest: /etc/openstack/clouds.yaml 15 | owner: slurm 16 | group: root 17 | mode: u=r,g=rw 18 | 19 | - name: Setup slurm tools 20 | include_role: 21 | name: slurm_tools 22 | -------------------------------------------------------------------------------- /ansible/roles/rebuild/tasks/rebuild.yml: -------------------------------------------------------------------------------- 1 | - name: Create rebuild jobs for partition 2 | include_tasks: 3 | file: rebuild_partition.yml 4 | args: 5 | apply: 6 | become: yes 7 | become_user: "{{ rebuild_job_user }}" 8 | loop: "{{ rebuild_job_partitions | split(',') }}" 9 | loop_control: 10 | loop_var: _rebuild_job_current_partition 11 | 12 | -------------------------------------------------------------------------------- /ansible/roles/rebuild/tasks/rebuild_partition.yml: -------------------------------------------------------------------------------- 1 | - name: Get list of nodes in partition 2 | ansible.builtin.command: 3 | cmd: >- 4 | sinfo 5 | --Node 6 | --format=%N 7 | --noheader 8 | --partition={{ _rebuild_job_current_partition }} 9 | register: _sinfo_partition 10 | when: rebuild_job_hostlist is not defined 11 | 12 | - name: Expand rebuild_job_hostlist to host names 13 | ansible.builtin.command: 14 | cmd: "scontrol show hostnames {{ rebuild_job_hostlist }}" 15 | register: _scontrol_hostnames 16 | when: 
rebuild_job_hostlist is defined 17 | 18 | - name: Submit rebuild jobs 19 | ansible.builtin.command: 20 | cmd: "{{ rebuild_job_template }}" 21 | loop: "{{ _scontrol_hostnames.stdout_lines | default(_sinfo_partition.stdout_lines) }}" 22 | -------------------------------------------------------------------------------- /ansible/roles/resolv_conf/README.md: -------------------------------------------------------------------------------- 1 | # resolv_conf 2 | 3 | Template out `/etc/resolv.conf`. 4 | 5 | ## Role variables 6 | - `resolv_conf_nameservers`: List of up to 3 nameserver addresses. 7 | 8 | Notes: 9 | - `NetworkManager` (if used) will be prevented from rewriting this file on boot. 10 | - If `/etc/resolv.conf` includes `127.0.0.1` (e.g. due to a FreeIPA server installation), then `resolv_conf_nameservers` is ignored and this role does not change `/etc/resolv.conf` 11 | - For hosts in the `resolv_conf` group, the `/etc/resolv.conf` created with `resolv_conf_nameservers` will 12 | NOT be deleted at the end of Packer image builds. 13 | -------------------------------------------------------------------------------- /ansible/roles/resolv_conf/defaults/main.yml: -------------------------------------------------------------------------------- 1 | resolv_conf_nameservers: [] 2 | -------------------------------------------------------------------------------- /ansible/roles/resolv_conf/files/NetworkManager-dns-none.conf: -------------------------------------------------------------------------------- 1 | [main] 2 | dns=none 3 | -------------------------------------------------------------------------------- /ansible/roles/resolv_conf/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Read nameservers from /etc/resolv.conf 2 | ansible.builtin.slurp: 3 | src: /etc/resolv.conf 4 | register: _slurp_resolv_conf 5 | 6 | - name: Set nameservers in /etc/resolv.conf 7 | # Might need to set this for freeipa_server host, but freeipa server install 8 | # will then change it to point to 127.0.0.1. 
9 | ansible.builtin.template: 10 | src: resolv.conf.j2 11 | dest: /etc/resolv.conf 12 | owner: root 13 | group: root 14 | mode: u=rw,og=r 15 | when: "'127.0.0.1' not in (_slurp_resolv_conf.content | b64decode)" 16 | 17 | - name: Disable NetworkManager control of resolv.conf 18 | ansible.builtin.copy: 19 | src: NetworkManager-dns-none.conf 20 | dest: /etc/NetworkManager/conf.d/90-dns-none.conf 21 | owner: root 22 | group: root 23 | mode: u=rw,og=r 24 | register: _copy_nm_config 25 | 26 | - name: Reload NetworkManager 27 | ansible.builtin.systemd: 28 | name: NetworkManager 29 | state: reloaded 30 | when: _copy_nm_config.changed | default(false) 31 | -------------------------------------------------------------------------------- /ansible/roles/resolv_conf/templates/resolv.conf.j2: -------------------------------------------------------------------------------- 1 | # Created by slurm appliance ansible/roles/resolv_conf 2 | {% if cluster_domain_suffix is defined %} 3 | search {{ openhpc_cluster_name }}.{{ cluster_domain_suffix }} 4 | {% endif %} 5 | 6 | {% for ns in resolv_conf_nameservers[0:3] %} 7 | nameserver {{ ns }} 8 | {% endfor %} 9 | -------------------------------------------------------------------------------- /ansible/roles/slurm_exporter/README.md: -------------------------------------------------------------------------------- 1 | slurm_exporter 2 | ============== 3 | 4 | Build, install and configure a Prometheus exporter for metrics about Slurm itself: https://github.com/vpenso/prometheus-slurm-exporter/ 5 | 6 | Requirements 7 | ------------ 8 | 9 | Rocky Linux 8.5 host. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | See `defaults/main.yml` 15 | 16 | Dependencies 17 | ------------ 18 | 19 | None. 20 | 21 | Example Playbook 22 | ---------------- 23 | 24 | - name: Deploy Slurm exporter 25 | hosts: control 26 | become: true 27 | tags: slurm_exporter 28 | tasks: 29 | - import_role: 30 | name: slurm_exporter 31 | 32 | Prometheus scrape configuration for this might look like: 33 | 34 | ``` 35 | - job_name: "slurm_exporter" 36 | scrape_interval: 30s 37 | scrape_timeout: 30s 38 | static_configs: 39 | - targets: 40 | - "{{ openhpc_slurm_control_host }}:9341" 41 | ``` 42 | 43 | License 44 | ------- 45 | 46 | Apache v2 47 | 48 | Author Information 49 | ------------------ 50 | 51 | StackHPC Ltd. 
52 | -------------------------------------------------------------------------------- /ansible/roles/slurm_exporter/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # see https://github.com/stackhpc/prometheus-slurm-exporter/releases - version follows upstream, release is stackhpc build 3 | slurm_exporter_version: '0.21' 4 | slurm_exporter_release: '1' 5 | slurm_exporter_state: started 6 | -------------------------------------------------------------------------------- /ansible/roles/slurm_exporter/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Restart slurm exporter 3 | become: true 4 | systemd: 5 | daemon_reload: true 6 | name: prometheus-slurm-exporter 7 | state: restarted 8 | when: 9 | - not ansible_check_mode 10 | - slurm_exporter_state != 'stopped' 11 | -------------------------------------------------------------------------------- /ansible/roles/slurm_exporter/tasks/install.yml: -------------------------------------------------------------------------------- 1 | - name: Install slurm_exporter package 2 | dnf: 3 | name: "https://github.com/stackhpc/prometheus-slurm-exporter/releases/download/{{ slurm_exporter_version }}/prometheus-slurm-exporter-{{ slurm_exporter_version }}-{{slurm_exporter_release}}.el8.x86_64.rpm" 4 | disable_gpg_check: yes 5 | notify: Restart slurm exporter 6 | 7 | - meta: flush_handlers 8 | 9 | - name: Ensure slurm exporter state 10 | systemd: 11 | name: prometheus-slurm-exporter 12 | state: "{{ slurm_exporter_state }}" 13 | enabled: true 14 | when: 15 | - not ansible_check_mode 16 | -------------------------------------------------------------------------------- /ansible/roles/slurm_exporter/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - import_tasks: install.yml 3 | -------------------------------------------------------------------------------- /ansible/roles/slurm_stats/README.md: -------------------------------------------------------------------------------- 1 | stackhpc.slurm_openstack_tools.slurm-stats 2 | ========================================== 3 | 4 | Configures slurm-stats from https://github.com/stackhpc/slurm-openstack-tools.git which 5 | transforms sacct output into a form that is more amenable for importing into elasticsearch/loki. 6 | 7 | Requirements 8 | ------------ 9 | 10 | Role Variables 11 | -------------- 12 | 13 | See `defaults/main.yml`. 14 | 15 | Dependencies 16 | ------------ 17 | 18 | Example Playbook 19 | ---------------- 20 | 21 | - hosts: compute 22 | tasks: 23 | - import_role: 24 | name: slurm_stats 25 | 26 | 27 | License 28 | ------- 29 | 30 | Apache-2.0 31 | 32 | Author Information 33 | ------------------ 34 | -------------------------------------------------------------------------------- /ansible/roles/slurm_stats/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #################### 3 | # log rotate options 4 | #################### 5 | 6 | # These options affect the contents of the log-rotate file. 7 | # See: man logrotate 8 | 9 | # Log files are rotated count times before being removed 10 | slurm_stats_log_rotate_content_rotate: 7 11 | 12 | # How frequently are the log files rotated. Can be one of daily, monthly, ... 
13 | slurm_stats_log_rotate_content_frequency: daily 14 | -------------------------------------------------------------------------------- /ansible/roles/slurm_stats/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Setup slurm tools 4 | include_role: 5 | name: slurm_tools 6 | 7 | - name: Create a directory to house the log files 8 | file: 9 | state: directory 10 | path: /var/log/slurm-stats 11 | become: true 12 | 13 | - name: Create cron job 14 | cron: 15 | name: Generate slurm stats 16 | minute: "*/5" 17 | user: root 18 | # NOTE: lasttimestamp is stored at /root/lasttimestamp 19 | job: "TZ=UTC /opt/slurm-tools/bin/slurm-stats >> /var/log/slurm-stats/finished_jobs.json" 20 | cron_file: slurm-stats 21 | become: true 22 | 23 | - name: Setup log rotate 24 | copy: 25 | content: | 26 | # WARNING: This file is managed by ansible, do not modify. 27 | /var/log/slurm-stats/finished_jobs.json { 28 | {{ slurm_stats_log_rotate_content_frequency }} 29 | rotate {{ slurm_stats_log_rotate_content_rotate }} 30 | compress 31 | delaycompress 32 | } 33 | dest: /etc/logrotate.d/slurm-stats 34 | become: true 35 | -------------------------------------------------------------------------------- /ansible/roles/slurm_tools/README.md: -------------------------------------------------------------------------------- 1 | slurm_tools 2 | ========= 3 | 4 | Install python-based tools from https://github.com/stackhpc/slurm-openstack-tools.git into `/opt/slurm-tools/bin/`. 5 | 6 | Role Variables 7 | -------------- 8 | 9 | - `pytools_editable`: Optional bool. Whether to install the package using `pip`'s 10 | editable mode (installing source to `/opt/slurm-tools/src`). Default `false`. 11 | - `pytools_gitref`: Optional. Git branch/tag/commit etc to install. Default `master`. 12 | - `pytools_user`: Optional user to install as. Default `root`. 13 | -------------------------------------------------------------------------------- /ansible/roles/slurm_tools/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | pytools_editable: false 3 | pytools_gitref: v2.0 4 | pytools_user: root 5 | -------------------------------------------------------------------------------- /ansible/roles/slurm_tools/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install python3 3 | package: 4 | name: python3,git 5 | become: true 6 | 7 | - name: Create virtualenv directory 8 | file: 9 | path: /opt/slurm-tools 10 | owner: "{{ pytools_user }}" 11 | group: "{{ pytools_user }}" 12 | state: directory 13 | become: true 14 | 15 | - block: 16 | - name: Upgrade pip 17 | # This needs to a separate step so that we use the updated version 18 | # to install the packages below. 
19 | pip: 20 | name: pip 21 | 22 | - name: Create virtualenv 23 | pip: 24 | name: "git+https://github.com/stackhpc/slurm-openstack-tools.git@{{ pytools_gitref }}#egg=slurm_openstack_tools" 25 | editable: "{{ pytools_editable }}" 26 | 27 | module_defaults: 28 | ansible.builtin.pip: 29 | virtualenv: /opt/slurm-tools 30 | virtualenv_command: "{{ 'python3.9 -m venv' if ansible_distribution_major_version == '8' else 'python3 -m venv' }}" 31 | state: latest 32 | become: true 33 | become_user: "{{ pytools_user }}" 34 | -------------------------------------------------------------------------------- /ansible/roles/squid/defaults/main.yml: -------------------------------------------------------------------------------- 1 | squid_conf_template: squid.conf.j2 2 | squid_started: true 3 | squid_enabled: true 4 | 5 | squid_cache_mem: "{{ undef(hint='squid_cache_mem required, e.g. \"12 GB\"') }}" 6 | squid_cache_dir: /var/spool/squid 7 | squid_cache_disk: "{{ undef(hint='squid_cache_disk (in MB) required, e.g. \"1024\"') }}" # always in MB 8 | squid_maximum_object_size_in_memory: '64 MB' 9 | squid_maximum_object_size: '200 MB' 10 | squid_http_port: 3128 11 | squid_acls: acl anywhere src all # rely on openstack security groups 12 | squid_http_access: | 13 | # Deny requests to certain unsafe ports 14 | http_access deny !Safe_ports 15 | # Deny CONNECT to other than secure SSL ports 16 | http_access deny CONNECT !SSL_ports 17 | # Only allow cachemgr access from localhost 18 | http_access allow localhost manager 19 | http_access deny manager 20 | # Rules allowing http access 21 | http_access allow anywhere 22 | http_access allow localhost 23 | # Finally deny all other access to this proxy 24 | http_access deny all 25 | -------------------------------------------------------------------------------- /ansible/roles/squid/handlers/main.yml: -------------------------------------------------------------------------------- 1 | - name: Restart squid 2 | service: 3 | name: squid 4 | state: restarted 5 | when: squid_started | bool 6 | -------------------------------------------------------------------------------- /ansible/roles/squid/tasks/configure.yml: -------------------------------------------------------------------------------- 1 | - name: Ensure squid cache directory exists 2 | file: 3 | path: "{{ squid_cache_dir }}" 4 | # based on what dnf package creates: 5 | owner: squid 6 | group: squid 7 | mode: u=rwx,g=rw,o= 8 | 9 | - name: Template squid configuration 10 | template: 11 | src: "{{ squid_conf_template }}" 12 | dest: /etc/squid/squid.conf 13 | owner: squid 14 | group: squid 15 | mode: ug=rwX,go= 16 | notify: Restart squid 17 | 18 | - meta: flush_handlers 19 | 20 | - name: Ensure squid service state 21 | systemd: 22 | name: squid 23 | state: "{{ 'started' if squid_started | bool else 'stopped' }}" 24 | enabled: "{{ true if squid_enabled else false }}" 25 | -------------------------------------------------------------------------------- /ansible/roles/squid/tasks/install.yml: -------------------------------------------------------------------------------- 1 | - name: Install squid package 2 | dnf: 3 | name: squid 4 | -------------------------------------------------------------------------------- /ansible/roles/squid/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - import_tasks: install.yml 2 | - import_tasks: configure.yml 3 | -------------------------------------------------------------------------------- /ansible/roles/sshd/README.md: 
-------------------------------------------------------------------------------- 1 | # sshd 2 | 3 | Configure sshd. 4 | 5 | ## Role variables 6 | 7 | - `sshd_password_authentication`: Optional bool. Whether to enable password login. Default `false`. 8 | - `sshd_disable_forwarding`: Optional bool. Whether to disable all forwarding features (X11, ssh-agent, TCP and StreamLocal). Default `true`. 9 | - `sshd_conf_src`: Optional string. Path to sshd configuration template. Default is in-role template. 10 | - `sshd_conf_dest`: Optional string. Path to destination for sshd configuration file. Default is `/etc/ssh/sshd_config.d/10-ansible.conf` which overrides `50-{cloud-init,redhat}` files, if present. 11 | -------------------------------------------------------------------------------- /ansible/roles/sshd/defaults/main.yml: -------------------------------------------------------------------------------- 1 | sshd_password_authentication: false 2 | sshd_disable_forwarding: true 3 | sshd_conf_src: sshd.conf.j2 4 | sshd_conf_dest: /etc/ssh/sshd_config.d/10-ansible.conf 5 | -------------------------------------------------------------------------------- /ansible/roles/sshd/handlers/main.yml: -------------------------------------------------------------------------------- 1 | - name: Restart sshd 2 | systemd: 3 | name: sshd 4 | state: restarted 5 | -------------------------------------------------------------------------------- /ansible/roles/sshd/tasks/configure.yml: -------------------------------------------------------------------------------- 1 | - name: Grab facts to determine distribution 2 | setup: 3 | 4 | - name: Ensure drop in directory exists 5 | file: 6 | path: /etc/ssh/sshd_config.d/ 7 | state: directory 8 | owner: root 9 | group: root 10 | mode: 700 11 | become: true 12 | 13 | - name: Ensure drop in configuration is included 14 | blockinfile: 15 | dest: /etc/ssh/sshd_config 16 | content: | 17 | # To modify the system-wide sshd configuration, create .conf 18 | # files under /etc/ssh/sshd_config.d/ which will be automatically 19 | # included below. 20 | Include /etc/ssh/sshd_config.d/*.conf 21 | state: present 22 | insertafter: "# default value." 
23 | validate: sshd -t -f %s 24 | notify: 25 | - Restart sshd 26 | become: true 27 | when: ansible_facts.distribution_major_version == '8' 28 | 29 | - name: Template sshd configuration 30 | # NB: If parameters are defined multiple times the first value wins; 31 | # The default /etc/ssh/sshd_config has 32 | # Include /etc/ssh/sshd_config.d/*.conf 33 | # early on, which is generally held to be the correct approach, so adding 34 | # values to the end of that file won't work 35 | template: 36 | src: "{{ sshd_conf_src }}" 37 | dest: "{{ sshd_conf_dest }}" 38 | owner: root 39 | group: root 40 | mode: u=rw,go= 41 | validate: sshd -t -f %s 42 | notify: 43 | - Restart sshd 44 | -------------------------------------------------------------------------------- /ansible/roles/sshd/tasks/export.yml: -------------------------------------------------------------------------------- 1 | # Exclusively used for compute-init 2 | - name: Inject host specific config template 3 | template: 4 | src: "{{ sshd_conf_src }}" 5 | dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/sshd.conf" 6 | owner: root 7 | group: root 8 | mode: u=rw,go= 9 | delegate_to: "{{ groups['control'] | first }}" 10 | -------------------------------------------------------------------------------- /ansible/roles/sshd/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - import_tasks: configure.yml 2 | -------------------------------------------------------------------------------- /ansible/roles/sshd/templates/sshd.conf.j2: -------------------------------------------------------------------------------- 1 | # {{ ansible_managed }} 2 | PasswordAuthentication {{ 'yes' if sshd_password_authentication | bool else 'no' }} 3 | DisableForwarding {{ 'yes' if sshd_disable_forwarding | bool else 'no' }} 4 | -------------------------------------------------------------------------------- /ansible/roles/sssd/README.md: -------------------------------------------------------------------------------- 1 | # sssd 2 | 3 | Install and configure [sssd](https://sssd.io/docs/introduction.html). 4 | 5 | 6 | ## Role variables 7 | 8 | The only required configuration is to create a [sssd.conf](https://www.mankier.com/5/sssd.conf) template at the location specified by `sssd_conf_src`. 9 | 10 | - `sssd_packages`: Optional list. Packages to install. 11 | - `sssd_install_ldap`: Optional bool. Whether to install packages enabling SSSD to authenticate against LDAP. Default `false`. 12 | - `sssd_ldap_packages`: Optional list. Packages to install when using `sssd_install_ldap`. 13 | - `sssd_enable_mkhomedir`: Optional bool. Whether to enable creation of home directories on login. Default `false`. 14 | - `sssd_mkhomedir_packages`: Optional list. Packages to install when using `sssd_enable_mkhomedir`. 15 | - `sssd_conf_src`: Optional string. Path to `sssd.conf` template. Default (which must be created) is `{{ appliances_environment_root }}/files/sssd.conf.j2`. 16 | - `sssd_conf_dest`: Optional string. Path to destination for `sssd.conf`. Default `/etc/sssd/sssd.conf`. 17 | - `sssd_started`: Optional bool. Whether `sssd` service should be started. 18 | - `sssd_enabled`: Optional bool. Whether `sssd` service should be enabled. 
19 | -------------------------------------------------------------------------------- /ansible/roles/sssd/defaults/main.yml: -------------------------------------------------------------------------------- 1 | sssd_packages: 2 | - sssd-common 3 | sssd_install_ldap: false 4 | sssd_ldap_packages: 5 | - sssd-ldap 6 | sssd_enable_mkhomedir: false 7 | sssd_mkhomedir_packages: 8 | - oddjob-mkhomedir 9 | sssd_conf_src: "{{ appliances_environment_root }}/files/sssd.conf.j2" 10 | sssd_conf_dest: /etc/sssd/sssd.conf 11 | sssd_started: true 12 | sssd_enabled: true 13 | -------------------------------------------------------------------------------- /ansible/roles/sssd/handlers/main.yml: -------------------------------------------------------------------------------- 1 | - name: Restart sssd 2 | systemd: 3 | name: sssd 4 | state: restarted 5 | when: sssd_started | bool 6 | -------------------------------------------------------------------------------- /ansible/roles/sssd/tasks/configure.yml: -------------------------------------------------------------------------------- 1 | - name: Manage sssd.conf configuration 2 | template: 3 | src: "{{ sssd_conf_src }}" 4 | dest: "{{ sssd_conf_dest }}" 5 | owner: root 6 | group: root 7 | mode: u=rw,go= 8 | notify: "Restart sssd" 9 | 10 | - meta: flush_handlers 11 | 12 | - name: Ensure sssd service state 13 | systemd: 14 | name: sssd 15 | state: "{{ 'started' if sssd_started | bool else 'stopped' }}" 16 | enabled: "{{ sssd_enabled | bool }}" 17 | 18 | - name: Get current authselect configuration 19 | command: authselect current --raw 20 | changed_when: false 21 | failed_when: 22 | - _authselect_current.rc != 0 23 | - "'No existing configuration detected' not in _authselect_current.stdout" 24 | register: _authselect_current # stdout: sssd with-mkhomedir 25 | 26 | - name: Configure nsswitch and PAM for SSSD 27 | command: "authselect select sssd --force{% if sssd_enable_mkhomedir | bool %} with-mkhomedir{% endif %}" 28 | when: "'sssd' not in _authselect_current.stdout" 29 | 30 | - name: "Ensure oddjob is started" 31 | service: 32 | name: oddjobd 33 | state: 'started' 34 | enabled: true 35 | when: sssd_enable_mkhomedir | bool -------------------------------------------------------------------------------- /ansible/roles/sssd/tasks/export.yml: -------------------------------------------------------------------------------- 1 | # Exclusively used for compute-init 2 | - name: Inject host specific config template 3 | template: 4 | src: "{{ sssd_conf_src }}" 5 | dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/sssd.conf" 6 | owner: root 7 | group: root 8 | mode: u=rw,go= 9 | delegate_to: "{{ groups['control'] | first }}" -------------------------------------------------------------------------------- /ansible/roles/sssd/tasks/install.yml: -------------------------------------------------------------------------------- 1 | - name: Ensure sssd packages are installed 2 | dnf: 3 | name: "{{ sssd_packages + sssd_ldap_packages if (sssd_install_ldap | bool) else [] }}" 4 | 5 | - name: Control if sssd should start on boot 6 | # Needs to be done here to prevent starting after image build, is enabled by default 7 | systemd: 8 | name: sssd 9 | enabled: "{{ sssd_enabled | bool }}" 10 | 11 | - name: Ensure mkhomedir packages are installed if required 12 | dnf: 13 | name: "{{ sssd_mkhomedir_packages }}" 14 | -------------------------------------------------------------------------------- /ansible/roles/sssd/tasks/main.yml: 
-------------------------------------------------------------------------------- 1 | - import_tasks: install.yml 2 | - import_tasks: configure.yml 3 | -------------------------------------------------------------------------------- /ansible/roles/systemd/README.md: -------------------------------------------------------------------------------- 1 | # systemd 2 | 3 | Create drop-in files for systemd services. 4 | 5 | # Role Variables 6 | - `systemd_dropins`: Required. A mapping where keys = systemd service name, values are a dict as follows: 7 | - `group`: Required str. Inventory group this drop-in applies to. 8 | - `comment`: Optional str. Comment describing reason for drop-in. 9 | - `content`: Required str. Content of drop-in file. 10 | - `systemd_restart`: Optional bool. Whether to reload unit definitions and restart services. Default `false`. 11 | -------------------------------------------------------------------------------- /ansible/roles/systemd/defaults/main.yml: -------------------------------------------------------------------------------- 1 | #systemd_dropins: 2 | # : 3 | # group: 4 | # comment: 5 | # content: 6 | 7 | systemd_restart: false 8 | -------------------------------------------------------------------------------- /ansible/roles/systemd/tasks/main.yml: -------------------------------------------------------------------------------- 1 | # NB: As `systemd_dropins` is defined in group_vars/all, all tasks here are conditional on group. 2 | - name: Make directory for unit dropins 3 | file: 4 | path: "/etc/systemd/system/{{ item.key }}.service.d/" 5 | state: directory 6 | owner: root 7 | group: root 8 | mode: 0644 9 | loop: "{{ systemd_dropins | dict2items }}" 10 | when: "item.value.group in group_names" 11 | 12 | - name: Add dropins for unit files 13 | ansible.builtin.copy: 14 | content: | 15 | # {{ item.value.comment | default('slurm appliance generated') }} 16 | {{ item.value.content }} 17 | dest: "/etc/systemd/system/{{ item.key }}.service.d/slurm_app.conf" 18 | owner: root 19 | group: root 20 | mode: 0644 21 | loop: "{{ systemd_dropins | dict2items }}" 22 | register: _systemd_dropins 23 | when: "item.value.group in group_names" 24 | 25 | - name: Reload unit definitions 26 | ansible.builtin.shell: 27 | cmd: systemctl daemon-reload 28 | when: 29 | - _systemd_dropins.changed 30 | - systemd_restart | default(false) | bool 31 | 32 | - name: Reload units 33 | ansible.builtin.systemd: 34 | name: "{{ item.key }}" 35 | state: restarted 36 | loop: "{{ systemd_dropins | dict2items }}" 37 | when: 38 | - _systemd_dropins.changed 39 | - "item.value.group in group_names" 40 | - systemd_restart | default(false) | bool 41 | -------------------------------------------------------------------------------- /ansible/roles/tuned/README.md: -------------------------------------------------------------------------------- 1 | tuned 2 | ========= 3 | 4 | This role configures the TuneD tool for system tuning, ensuring optimal performance based on the profile settings defined.
5 | 6 | Role Variables 7 | -------------- 8 | 9 | See the [TuneD documentation](https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9/html/monitoring_and_managing_system_status_and_performance/getting-started-with-tuned_monitoring-and-managing-system-status-and-performance) for profile details. 10 | 11 | 12 | - `tuned_profile_baremetal`: Optional str. Name of default profile for non-virtualised hosts. Default `hpc-compute`. 13 | - `tuned_profile_vm`: Optional str. Name of default profile for virtualised hosts. Default `virtual-guest`. 14 | - `tuned_profile`: Optional str. Name of profile to apply to host. Defaults to `tuned_profile_baremetal` or `tuned_profile_vm` as appropriate. 15 | -------------------------------------------------------------------------------- /ansible/roles/tuned/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for tuned 3 | tuned_profile_baremetal: hpc-compute 4 | tuned_profile_vm: virtual-guest 5 | tuned_profile: "{{ tuned_profile_baremetal if ansible_virtualization_role != 'guest' else tuned_profile_vm }}" 6 | tuned_enabled: true 7 | tuned_started: true 8 | -------------------------------------------------------------------------------- /ansible/roles/tuned/tasks/configure.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Enable and start TuneD 3 | ansible.builtin.systemd: 4 | name: tuned 5 | enabled: "{{ tuned_enabled | bool }}" 6 | state: "{{ 'started' if tuned_started | bool else 'stopped' }}" 7 | 8 | - name: Check TuneD profile 9 | ansible.builtin.command: 10 | cmd: tuned-adm active 11 | when: tuned_started 12 | register: _tuned_profile_current 13 | changed_when: false 14 | 15 | - name: Set tuned-adm profile 16 | ansible.builtin.command: 17 | cmd: "tuned-adm profile {{ tuned_profile }}" 18 | when: 19 | - tuned_started | bool 20 | - tuned_profile not in _tuned_profile_current.stdout 21 | -------------------------------------------------------------------------------- /ansible/roles/tuned/tasks/install.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install tuneD 3 | ansible.builtin.dnf: 4 | name: tuned 5 | state: present -------------------------------------------------------------------------------- /ansible/roles/tuned/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - import_tasks: install.yml 3 | - import_tasks: configure.yml -------------------------------------------------------------------------------- /ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ##### 4 | # Small script that can be used to attach to the infra container of a pod 5 | # 6 | # Useful in a systemd service that starts a pod in order to track the execution 7 | # 8 | # Accepts a single argument which is the name of the pod whose infra container we should attach to 9 | ##### 10 | 11 | set -e 12 | 13 | echo "[INFO] Finding infra container for pod '$1'" 14 | INFRA_CONTAINER_ID="$(podman pod inspect --format '{{.InfraContainerID}}' "$1")" 15 | 16 | echo "[INFO] Attaching to infra container '${INFRA_CONTAINER_ID}'" 17 | exec podman container attach --no-stdin ${INFRA_CONTAINER_ID} 18 | -------------------------------------------------------------------------------- 
/ansible/roles/zenith_proxy/templates/client.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Podman {{ zenith_proxy_client_service_name }}.service 3 | Wants=network.target 4 | After=network-online.target 5 | BindsTo={{ zenith_proxy_service_name }}.service 6 | PartOf={{ zenith_proxy_service_name }}.service 7 | After={{ zenith_proxy_service_name }}.service 8 | {% if zenith_proxy_mitm_enabled %} 9 | Wants={{ zenith_proxy_mitm_service_name }}.service 10 | After={{ zenith_proxy_mitm_service_name }}.service 11 | {% endif %} 12 | 13 | [Service] 14 | Environment=PODMAN_SYSTEMD_UNIT=%n 15 | Type=simple 16 | Restart=always 17 | RestartSec=5 18 | User={{ zenith_proxy_podman_user }} 19 | Group={{ zenith_proxy_podman_user }} 20 | ExecStart=/usr/bin/podman run \ 21 | --cgroups=no-conmon \ 22 | --replace \ 23 | --restart=no \ 24 | --pod {{ zenith_proxy_pod_name }} \ 25 | --name {{ zenith_proxy_client_container_name }} \ 26 | --security-opt label=disable \ 27 | --volume /etc/zenith/{{ zenith_proxy_service_name }}:/etc/zenith:ro \ 28 | --volume {{ appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh:/home/zenith/.ssh \ 29 | {{ zenith_proxy_client_image }} 30 | ExecStop=/usr/bin/podman stop --ignore -t 10 {{ zenith_proxy_client_container_name }} 31 | ExecStopPost=/usr/bin/podman rm --ignore -f {{ zenith_proxy_client_container_name }} 32 | 33 | [Install] 34 | WantedBy=multi-user.target default.target 35 | -------------------------------------------------------------------------------- /ansible/roles/zenith_proxy/templates/pod.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Podman {{ zenith_proxy_service_name }}.service 3 | Wants=network.target 4 | After=network-online.target 5 | 6 | [Service] 7 | Environment=PODMAN_SYSTEMD_UNIT=%n 8 | Type=simple 9 | Restart=always 10 | User={{ zenith_proxy_podman_user }} 11 | Group={{ zenith_proxy_podman_user }} 12 | ExecStartPre=/usr/bin/podman pod create --replace --name {{ zenith_proxy_pod_name }} --network=slirp4netns 13 | ExecStartPre=/usr/bin/podman pod start {{ zenith_proxy_pod_name }} 14 | ExecStart=/usr/bin/podman-pod-infra-attach.sh {{ zenith_proxy_pod_name }} 15 | ExecStop=/usr/bin/podman pod stop --ignore -t 10 {{ zenith_proxy_pod_name }} 16 | ExecStopPost=/usr/bin/podman pod rm --ignore -f {{ zenith_proxy_pod_name }} 17 | 18 | [Install] 19 | WantedBy=multi-user.target default.target 20 | -------------------------------------------------------------------------------- /ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2: -------------------------------------------------------------------------------- 1 | ssh_identity_path: /home/zenith/.ssh/id_zenith 2 | 3 | # Init options 4 | registrar_url: {{ zenith_registrar_url }} 5 | token: {{ zenith_proxy_client_token }} 6 | verify_ssl: {{ 'yes' if zenith_registrar_verify_ssl else 'no' }} 7 | 8 | # Connect options 9 | server_address: {{ zenith_sshd_host }} 10 | server_port: {{ zenith_sshd_port }} 11 | {% if zenith_proxy_mitm_enabled %} 12 | backend_protocol: http 13 | forward_to_host: 127.0.0.1 14 | forward_to_port: {{ zenith_proxy_mitm_listen_port }} 15 | {% else %} 16 | backend_protocol: {{ zenith_proxy_upstream_scheme }} 17 | forward_to_host: {{ zenith_proxy_upstream_host }} 18 | forward_to_port: {{ zenith_proxy_upstream_port }} 19 | {% endif %} 20 | {% if zenith_proxy_upstream_read_timeout %} 21 | read_timeout: {{ zenith_proxy_upstream_read_timeout }} 22 | {% 
endif %} 23 | skip_auth: {{ 'yes' if zenith_proxy_client_auth_skip else 'no' }} 24 | {% if zenith_proxy_client_auth_params %} 25 | auth_params: 26 | {{ zenith_proxy_client_auth_params | to_nice_yaml | indent(2) }} 27 | {% endif %} 28 | -------------------------------------------------------------------------------- /dev/image-share.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Share images from one project to another 3 | # 4 | # usage: 5 | # share-images SOURCE_PROJECT DEST_PROJECT IMAGE_NAME 6 | # 7 | # NB: This requires a clouds.yaml file which uses project names as cloud keys 8 | 9 | set -euo pipefail 10 | 11 | SOURCE=$1 12 | DEST=$2 13 | IMAGE_NAME=$3 14 | 15 | export OS_CLOUD=$SOURCE 16 | SOURCE_PROJECT=$(openstack project show -c id -f value $SOURCE) 17 | export OS_CLOUD=$DEST 18 | DEST_PROJECT=$(openstack project show -c id -f value $DEST) 19 | export OS_CLOUD=$SOURCE 20 | IMAGE=$(openstack image show -c id -f value $IMAGE_NAME) 21 | 22 | echo "Sharing $IMAGE_NAME ($IMAGE) from $SOURCE ($SOURCE_PROJECT) ..." 23 | openstack image set --shared $IMAGE 24 | echo "Adding destination project $DEST ($DEST_PROJECT) ..." 25 | openstack image add project $IMAGE $DEST_PROJECT 26 | 27 | export OS_CLOUD=$DEST 28 | echo "Accepting share ..." 29 | openstack image set --accept $IMAGE 30 | echo "Done" 31 | -------------------------------------------------------------------------------- /dev/output_manifest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Set github workflow output parameters defining image IDs from a packer manifest. 3 | # Usage: 4 | # ./packer/read_manifest.py packer/packer-manifest.json 5 | 6 | # E.g. assuming the default packer builds this will produce something like: 7 | # ::set-output name=NEW_COMPUTE_IMAGE_ID::9aabd73d-e550-4116-a90c-700478b722ce 8 | # ::set-output name=NEW_LOGIN_IMAGE_ID::87b41d58-d7e3-4c38-be05-453c3287ecab 9 | # ::set-output name=NEW_CONTROL_IMAGE_ID::7f812168-73fe-4a60-b9e9-9109a405390d 10 | # which can be used in subsequent workflow steps: [1] 11 | # 12 | # [1]: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-setting-a-value 13 | 14 | import sys, json 15 | output = {} 16 | with open(sys.argv[1]) as f: 17 | data = json.load(f) 18 | for build in data['builds']: 19 | node_type = build['custom_data']['source'] 20 | image_id = build['artifact_id'] 21 | output[node_type] = image_id # NB: this deliberately gets the LAST build for a node type 22 | for node_type, image_id in output.items(): 23 | print('::set-output name=NEW_%s_IMAGE_ID::%s' % (node_type.upper(), image_id)) 24 | -------------------------------------------------------------------------------- /dev/setup-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | PYTHON_VERSION=${PYTHON_VERSION:-} 6 | 7 | if [[ "$PYTHON_VERSION" == "" ]]; then 8 | if [[ -f /etc/os-release ]]; then 9 | . /etc/os-release 10 | OS=$ID 11 | OS_VERSION=$VERSION_ID 12 | else 13 | exit 1 14 | fi 15 | 16 | MAJOR_VERSION=$(echo $OS_VERSION | cut -d. 
-f1) 17 | 18 | if [[ "$OS" == "ubuntu" && "$MAJOR_VERSION" == "22" ]]; then 19 | PYTHON_VERSION="/usr/bin/python3.10" 20 | elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "8" ]]; then 21 | # python3.9+ doesn't have selinux bindings 22 | PYTHON_VERSION="/usr/bin/python3.8" # use `sudo yum install python38` on Rocky Linux 8 to install this 23 | elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "9" ]]; then 24 | PYTHON_VERSION="/usr/bin/python3.9" 25 | else 26 | echo "Unsupported OS version: $OS $MAJOR_VERSION" 27 | exit 1 28 | fi 29 | fi 30 | 31 | if [[ ! -d "venv" ]]; then 32 | $PYTHON_VERSION -m venv venv 33 | fi 34 | 35 | . venv/bin/activate 36 | pip install -U pip 37 | pip install -r requirements.txt 38 | ansible --version 39 | # Install or update ansible dependencies ... 40 | ansible-galaxy role install -fr requirements.yml -p ansible/roles 41 | ansible-galaxy collection install -fr requirements.yml -p ansible/collections 42 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # StackHPC Slurm Appliance Documentation 2 | 3 | ### Operator docs 4 | 5 | [Image build](image-build.md) 6 | 7 | [CI](ci.md) 8 | 9 | [Monitoring and logging](monitoring-and-logging.md) 10 | 11 | [Operations guide](operations.md) 12 | 13 | [Production deployment](production.md) 14 | 15 | [Upgrades](upgrades.md) 16 | 17 | [Sequence diagrams](sequence.md) 18 | 19 | ### Configuration docs 20 | 21 | [Alerting](alerting.md) 22 | 23 | [Chrony](chrony.md) 24 | 25 | [Environments](environments.md) 26 | 27 | [K3s](k3s.README.md) 28 | 29 | [Networking](networks.md) 30 | 31 | [Open OnDemand](openondemand.md) 32 | 33 | [Persistent state](persistent-state.md) 34 | 35 | #### Experimental features 36 | 37 | [Compute init](experimental/compute-init.md) 38 | 39 | [Pulp](experimental/pulp.md) 40 | 41 | [Slurm controlled rebuild](experimental/slurm-controlled-rebuild.md) 42 | 43 | ### Contributor docs 44 | 45 | [Adding functionality](adding-functionality.md) 46 | -------------------------------------------------------------------------------- /docs/adding-functionality.md: -------------------------------------------------------------------------------- 1 | # Adding new functionality 2 | 3 | Please contact us for specific advice, but this generally involves: 4 | - Adding a role. 5 | - Adding a play calling that role into an existing playbook in `ansible/`, or adding a new playbook there and updating `site.yml`. 6 | - Adding a new (empty) group named after the role into `environments/common/inventory/groups` and a non-empty example group into `environments/common/layouts/everything`. 7 | - Adding new default group vars into `environments/common/inventory/group_vars/all/<role_name>/`. 8 | - Updating the default Packer build variables in `environments/common/inventory/group_vars/builder/defaults.yml`. 9 | - Updating READMEs. 10 | -------------------------------------------------------------------------------- /docs/chrony.md: -------------------------------------------------------------------------------- 1 | # Chrony configuration 2 | 3 | Use variables from the [mrlesmithjr.chrony](https://github.com/mrlesmithjr/ansible-chrony) role.
4 | 5 | For example in: `environments/<environment>/inventory/group_vars/all/chrony`: 6 | 7 | ``` 8 | --- 9 | chrony_ntp_servers: 10 | - server: ntp-0.example.org 11 | options: 12 | - option: iburst 13 | - option: minpoll 14 | val: 8 15 | - server: ntp-1.example.org 16 | options: 17 | - option: iburst 18 | - option: minpoll 19 | val: 8 20 | 21 | ``` 22 | -------------------------------------------------------------------------------- /docs/ci.md: -------------------------------------------------------------------------------- 1 | # CI/CD automation 2 | 3 | The `.github` directory contains a set of sample workflows which can be used by downstream site-specific configuration repositories to simplify ongoing maintenance tasks. These include: 4 | 5 | - An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repo for new releases and proposes a pull request to the downstream site-specific repo when a new release is published. 6 | 7 | - An [image upload](.github/workflows/upload-s3-image.yml.sample) workflow which takes an image name, downloads it from StackHPC's public S3 bucket if available, and uploads it to the target OpenStack cloud. 8 | 9 | -------------------------------------------------------------------------------- /docs/experimental/compute-init.md: -------------------------------------------------------------------------------- 1 | # compute-init 2 | 3 | See the role README.md 4 | 5 | # Changes to image / tofu state 6 | 7 | When a compute group has the `ignore_image_changes` parameter set to true, 8 | changes to the `image_id` parameter (which defaults to `cluster_image_id`) are 9 | ignored by OpenTofu. 10 | 11 | Regardless of whether `ignore_image_changes` is set, OpenTofu templates out the 12 | `image_id` into the Ansible inventory for each compute node. The `compute_init` 13 | role templates out hostvars to the control node, which means the "target" image 14 | ID is then available on the control node. Subsequent work will use this to 15 | rebuild the node via slurm. 16 | 17 | # CI workflow 18 | 19 | The compute node rebuild is tested in CI after the tests for rebuilding the 20 | login and control nodes. The process is as follows: 21 | 22 | 1. Compute nodes are reimaged: 23 | 24 | ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml 25 | 26 | 2. Ansible-init runs against newly reimaged compute nodes 27 | 28 | 3. Run sinfo and check nodes have expected slurm state 29 | 30 | ansible-playbook -v ansible/ci/check_slurm.yml -------------------------------------------------------------------------------- /docs/k3s.README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | A K3s cluster is deployed with the Slurm cluster. Both agent and server instances of K3s are installed during image build and the correct service (determined by OpenStack metadata) will be 3 | enabled during boot. Nodes with the `k3s_server` metadata field defined will be configured as K3s agents (this field gives them the address of the server). The Slurm control node is currently configured as a server while all other nodes are configured as agents. Using multiple K3s servers isn't supported. Currently only the root user on the control node has 4 | access to the Kubernetes API. The `k3s` role installs Helm for package management. K9s is also installed in the image and can be used by the root user.
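
A minimal sketch of sanity-checking the K3s cluster from the Slurm control node, assuming root access and that the usual K3s `kubectl` symlink is on the PATH (the commands, not the appliance, are the assumption here):

```
# Run with root privileges on the Slurm control node (only root currently has API access)
sudo kubectl get nodes -o wide   # the control node acts as the server; other nodes join as agents
sudo helm list --all-namespaces  # Helm is installed by the k3s role for package management
sudo k9s                         # interactive cluster TUI, also installed in the image
```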
5 | 6 | # Idempotency 7 | K3s is intended to only be installed during image build as it is configured by the appliance on first boot with `azimuth_cloud.image_utils.linux_ansible_init`. Therefore, the `k3s` role isn't 8 | idempotent and changes to variables will not be reflected in the image when running `site.yml`. 9 | -------------------------------------------------------------------------------- /docs/screenshots/grafana/dashboard-node-exporter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/docs/screenshots/grafana/dashboard-node-exporter.png -------------------------------------------------------------------------------- /docs/screenshots/grafana/dashboard-openhpc-slurm-jobs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/docs/screenshots/grafana/dashboard-openhpc-slurm-jobs.png -------------------------------------------------------------------------------- /docs/screenshots/grafana/dashboard-openhpc-slurm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/docs/screenshots/grafana/dashboard-openhpc-slurm.png -------------------------------------------------------------------------------- /docs/screenshots/grafana/grafana-slurm-jobs-linking-to-node-exporter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/docs/screenshots/grafana/grafana-slurm-jobs-linking-to-node-exporter.png -------------------------------------------------------------------------------- /docs/site/README.md: -------------------------------------------------------------------------------- 1 | # Site-specific Documentation 2 | 3 | This document is a placeholder for any site-specific documentation, e.g. environment descriptions. 4 | 5 | #TODO: list things which should commonly be specified here. 6 | 7 | -------------------------------------------------------------------------------- /environments/.caas/README.md: -------------------------------------------------------------------------------- 1 | # Caas cluster 2 | 3 | Environment for default Azimuth Slurm. This is not intended to be manually deployed. 4 | 5 | Non-standard things for this environment: 6 | - There is no activate script. 7 | - `ansible.cfg` is provided in the repo root, as expected by the caas operator. 8 | - `ANSIBLE_INVENTORY` is set in the cluster type template, using a path relative to the 9 | runner project directory: 10 | 11 | azimuth_caas_stackhpc_slurm_appliance_template: 12 | ... 13 | envVars: 14 | ANSIBLE_INVENTORY: environments/common/inventory,environments/.caas/inventory 15 | 16 | Ansible then defines `ansible_inventory_sources` which contains absolute paths, and 17 | that is used to derive the `appliances_environment_root` and 18 | `appliances_repository_root`.
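
For orientation only, the `envVars` setting above is roughly equivalent to the following manual invocation from the repo root, assuming `ansible/site.yml` as the top-level playbook; this environment is still not intended to be deployed by hand:

```
# Illustrative equivalent of the caas operator's ANSIBLE_INVENTORY setting
export ANSIBLE_INVENTORY=environments/common/inventory,environments/.caas/inventory
ansible-playbook ansible/site.yml
```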
19 | -------------------------------------------------------------------------------- /environments/.caas/ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | any_errors_fatal = True 3 | stdout_callback = debug 4 | stderr_callback = debug 5 | gathering = smart 6 | forks = 30 7 | host_key_checking = False 8 | inventory = ../common/inventory,inventory 9 | collections_path = ../../ansible/collections 10 | roles_path = ../../ansible/roles 11 | filter_plugins = ../../ansible/filter_plugins 12 | 13 | [ssh_connection] 14 | ssh_args = -o ControlMaster=auto ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null 15 | pipelining = True 16 | 17 | [inventory] 18 | # Fail when any inventory source cannot be parsed. 19 | any_unparsed_is_failed = True 20 | -------------------------------------------------------------------------------- /environments/.caas/assets/ood-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/.caas/assets/ood-icon.png -------------------------------------------------------------------------------- /environments/.caas/hooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/.caas/hooks/.gitkeep -------------------------------------------------------------------------------- /environments/.caas/inventory/group_vars/all/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/.caas/inventory/group_vars/all/.gitkeep -------------------------------------------------------------------------------- /environments/.caas/inventory/group_vars/all/basic_users.yml: -------------------------------------------------------------------------------- 1 | basic_users_users: 2 | - name: azimuth 3 | # Hash the password with a salt that is different for each host 4 | password: "{{ vault_azimuth_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" 5 | uid: 1005 6 | public_key: "{{ cluster_user_ssh_public_key }}" 7 | shell: /bin/bash 8 | append: true 9 | groups: 10 | - adm 11 | - systemd-journal 12 | sudo: azimuth ALL=(ALL) NOPASSWD:ALL 13 | 14 | # the path *on the control node* for the home directories depends on the filesystem: 15 | basic_users_homedir_server_path: "{{ '/home' if cluster_home_manila_share | bool else '/exports/home' }}" 16 | -------------------------------------------------------------------------------- /environments/.caas/inventory/group_vars/all/grafana.yml: -------------------------------------------------------------------------------- 1 | grafana_auth_anonymous: "{{ groups['openondemand'] | count > 0 }}" 2 | -------------------------------------------------------------------------------- /environments/.caas/inventory/group_vars/all/hpctests.yml: -------------------------------------------------------------------------------- 1 | # Skip plotting pingpong as matplotlib not in runner environment 2 | hpctests_pingpong_plot: false 3 | 4 | # In Azimuth, the Ansible controller is an ephemeral pod, so all that matters is that 5 | # this is a location that is 
writable by the container user 6 | hpctests_outdir: "{{ playbook_dir }}/.tmp/hpctests" 7 | 8 | # hpctests run by default in Azimuth but not trying to stress-test the nodes 9 | # just check compiler, mpi etc works 10 | hpctests_hpl_mem_frac: 0.05 # 5% node memory 11 | 12 | # use basic_user-defined user: 13 | hpctests_user: azimuth 14 | -------------------------------------------------------------------------------- /environments/.caas/inventory/group_vars/all/manila.yml: -------------------------------------------------------------------------------- 1 | caas_manila_home: 2 | share_name: "{{ cluster_name }}-home" 3 | mount_path: /home 4 | mount_user: root 5 | mount_group: root 6 | mount_mode: u=rwX,go=rX 7 | 8 | cluster_project_manila_share_name: azimuth-project-share 9 | caas_manila_project: 10 | share_name: "{{ cluster_project_manila_share_name | default('azimuth-project-share') }}" 11 | share_user: "{{ cluster_project_manila_share_user | default(omit) }}" 12 | mount_path: /project 13 | mount_user: root 14 | mount_group: root 15 | mount_mode: ugo=rwX 16 | 17 | os_manila_mount_shares: "{{ ([caas_manila_home] if cluster_home_manila_share | bool else []) + ([caas_manila_project] if cluster_project_manila_share | bool else []) }}" 18 | -------------------------------------------------------------------------------- /environments/.caas/inventory/group_vars/all/nfs.yml: -------------------------------------------------------------------------------- 1 | nfs_server: "{{ nfs_server_default }}" 2 | 3 | caas_nfs_home: 4 | - comment: Export /exports/home from Slurm control node as /home 5 | nfs_enable: 6 | server: "{{ inventory_hostname in groups['control'] }}" 7 | clients: "{{ inventory_hostname in groups['cluster'] }}" 8 | nfs_export: "/exports/home" # assumes skeleton TF is being used 9 | nfs_client_mnt_point: "/home" 10 | 11 | nfs_configurations: "{{ caas_nfs_home if not cluster_home_manila_share | bool else [] }}" 12 | -------------------------------------------------------------------------------- /environments/.caas/inventory/group_vars/all/openhpc.yml: -------------------------------------------------------------------------------- 1 | openhpc_cluster_name: "{{ cluster_name }}" 2 | 3 | # Provision a single "standard" compute nodegroup using the supplied 4 | # node count and flavor 5 | openhpc_nodegroups: "{{ hostvars[groups['openstack'][0]]['openhpc_nodegroups'] }}" 6 | -------------------------------------------------------------------------------- /environments/.caas/inventory/group_vars/all/openondemand.yml: -------------------------------------------------------------------------------- 1 | --- 2 | openondemand_auth: basic_pam 3 | openondemand_jupyter_partition: "{{ openhpc_partitions[0]['name'] }}" 4 | openondemand_desktop_partition: "{{ openhpc_partitions[0]['name'] }}" 5 | 6 | httpd_listen_addr_port: 7 | - 80 8 | - 443 9 | 10 | -------------------------------------------------------------------------------- /environments/.caas/inventory/group_vars/all/prometheus.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # We reserve 10GB of the state volume for cluster state, the rest is for metrics 4 | prometheus_storage_retention_size: "{{ state_volume_size - 10 }}GB" 5 | -------------------------------------------------------------------------------- /environments/.caas/inventory/group_vars/all/zenith.yml: -------------------------------------------------------------------------------- 1 | zenith_proxy_podman_user: podman 2 | 
-------------------------------------------------------------------------------- /environments/.caas/inventory/group_vars/openstack.yml: -------------------------------------------------------------------------------- 1 | # The default Terraform state key for backends that support it 2 | terraform_state_key: "cluster/{{ cluster_id }}/tfstate" 3 | 4 | # Set up the terraform backend 5 | terraform_backend_type: "{{ 'consul' if 'CONSUL_HTTP_ADDR' in ansible_env else 'local' }}" 6 | terraform_backend_config_defaults: 7 | consul: 8 | path: "{{ terraform_state_key }}" 9 | gzip: "true" 10 | local: {} 11 | terraform_backend_config: "{{ terraform_backend_config_defaults[terraform_backend_type] }}" 12 | 13 | terraform_binary_directory: "{{ appliances_environment_root }}/bin" 14 | terraform_project_path: "{{ playbook_dir }}/terraform" 15 | 16 | terraform_state: "{{ cluster_state | default('present') }}" 17 | cluster_ssh_user: rocky 18 | 19 | # Provision a single "standard" compute nodegroup using the supplied 20 | # node count and flavor 21 | openhpc_nodegroups: 22 | - name: "standard" 23 | count: "{{ compute_count }}" 24 | flavor: "{{ compute_flavor }}" 25 | default: "YES" 26 | -------------------------------------------------------------------------------- /environments/.caas/inventory/hosts: -------------------------------------------------------------------------------- 1 | [openstack] 2 | localhost ansible_connection=local ansible_python_interpreter=/usr/bin/python3 3 | -------------------------------------------------------------------------------- /environments/.stackhpc/.gitignore: -------------------------------------------------------------------------------- 1 | partitions.yml 2 | secrets.yml 3 | hosts 4 | terraform.tfvars 5 | .terraform.lock.hcl 6 | logs/ 7 | hpctests/ 8 | inventory/group_vars/all/test_user.yml 9 | -------------------------------------------------------------------------------- /environments/.stackhpc/ARCUS.pkrvars.hcl: -------------------------------------------------------------------------------- 1 | flavor = "vm.ska.cpu.general.small" 2 | networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60) 3 | ssh_keypair_name = "slurm-app-ci" 4 | ssh_private_key_file = "~/.ssh/id_rsa" 5 | security_groups = ["default", "SSH"] 6 | floating_ip_network = "CUDN-Internet" # Use FIP to avoid docker ratelimits on portal-internal outbound IP 7 | -------------------------------------------------------------------------------- /environments/.stackhpc/LEAFCLOUD.pkrvars.hcl: -------------------------------------------------------------------------------- 1 | flavor = "ec1.large" 2 | volume_type = "unencrypted" 3 | networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci 4 | ssh_keypair_name = "slurm-app-ci" 5 | ssh_private_key_file = "~/.ssh/id_rsa" 6 | security_groups = ["default", "SSH"] 7 | # see environments/.stackhpc/inventory/group_vars/all/bastion.yml: 8 | ssh_bastion_username = "slurm-app-ci" 9 | ssh_bastion_host = "195.114.30.222" 10 | ssh_bastion_private_key_file = "~/.ssh/id_rsa" 11 | -------------------------------------------------------------------------------- /environments/.stackhpc/SMS.pkrvars.hcl: -------------------------------------------------------------------------------- 1 | flavor = "general.v1.small" 2 | networks = ["e2b9e59f-43da-4e1c-b558-dc9da4c0d738"] # stackhpc-ipv4-geneve 3 | ssh_keypair_name = "slurm-app-ci" 4 | ssh_private_key_file = "~/.ssh/id_rsa" 5 | # see 
environments/.stackhpc/inventory/group_vars/all/bastion.yml: 6 | ssh_bastion_username = "slurm-app-ci" 7 | ssh_bastion_host = "185.45.78.150" 8 | ssh_bastion_private_key_file = "~/.ssh/id_rsa" 9 | -------------------------------------------------------------------------------- /environments/.stackhpc/activate: -------------------------------------------------------------------------------- 1 | export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) 2 | echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" 3 | 4 | export PS1="$(basename $APPLIANCES_ENVIRONMENT_ROOT)/ ${PS1}" 5 | 6 | export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") 7 | echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" 8 | 9 | export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") 10 | echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" 11 | 12 | export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") 13 | echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" 14 | 15 | export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") 16 | echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" 17 | 18 | if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then 19 | export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg 20 | fi 21 | 22 | 23 | -------------------------------------------------------------------------------- /environments/.stackhpc/ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | any_errors_fatal = True 3 | stdout_callback = debug 4 | stderr_callback = debug 5 | callbacks_enabled = ansible.posix.profile_tasks 6 | gathering = smart 7 | forks = 30 8 | host_key_checking = False 9 | inventory = ../common/inventory,inventory 10 | collections_path = ../../ansible/collections 11 | roles_path = ../../ansible/roles 12 | filter_plugins = ../../ansible/filter_plugins 13 | library = ../../ansible/library 14 | 15 | [ssh_connection] 16 | ssh_args = -o ServerAliveInterval=10 -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null 17 | pipelining = True 18 | 19 | [inventory] 20 | # Fail when any inventory source cannot be parsed. 
21 | any_unparsed_is_failed = True 22 | -------------------------------------------------------------------------------- /environments/.stackhpc/cacerts/myCA.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDgzCCAmugAwIBAgIUd5qnvmXczLvacv3Mu2hzwJlmimMwDQYJKoZIhvcNAQEL 3 | BQAwUTELMAkGA1UEBhMCWFgxFTATBgNVBAcMDERlZmF1bHQgQ2l0eTEcMBoGA1UE 4 | CgwTRGVmYXVsdCBDb21wYW55IEx0ZDENMAsGA1UEAwwEdGVzdDAeFw0yNTAyMTIx 5 | NjIxNTlaFw0zMDAyMTExNjIxNTlaMFExCzAJBgNVBAYTAlhYMRUwEwYDVQQHDAxE 6 | ZWZhdWx0IENpdHkxHDAaBgNVBAoME0RlZmF1bHQgQ29tcGFueSBMdGQxDTALBgNV 7 | BAMMBHRlc3QwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDExC9wqRyG 8 | vQ5FYGb48iDfq8er4WvWO94F/q746mCHvVJn7GTu3AMavIXCYqH9WnXY0lzey7xU 9 | /40/F/xihQfGYFrY+8ssYrT8Z+H3fSuwmq6XqsHcCupBQHKTTjZWaVMODxF4Eq5F 10 | Vyk4/AJpoOFLrzjUA9Sw74HKBH+r3N74x+3fFzElFGfjtFXPlgnYi9T9dXEEoNc7 11 | Udulcr6MrL+l6ITr0Grti4FP0qOari9a4XqC7G2Jtga1PF/GaMlyrmQphnhpS7ph 12 | n1dr6hYWmHZ1r1vcNBxBl71CoOVoLwk9v2x0jOsbYpzAp5CJEl/6whwo/Pn2JzIV 13 | xbCuVg9znbHpAgMBAAGjUzBRMB0GA1UdDgQWBBSEbb8xKKL1NwsRfzeZ7Shyq9xq 14 | QTAfBgNVHSMEGDAWgBSEbb8xKKL1NwsRfzeZ7Shyq9xqQTAPBgNVHRMBAf8EBTAD 15 | AQH/MA0GCSqGSIb3DQEBCwUAA4IBAQB2z7YMpZKAPY19EWaTV80Gwks56hBClcfR 16 | 6Y6d/7+ltML5pRHCFB2fF850Rj5vmnflSwrSWDcDbRktEfha3OIhHWtY8TzF7Zkx 17 | dIMyN8JaqjmJ488WGhcuqQDIK5sREg/JfECVeBId5mF390TKszlM9FNQL1NOC0D+ 18 | I/+BeWHYAu4dGWQR6xbC6SYUMbhTQrQSgJFckq5i2fQPcNK8Xlnzc+oxjJuqgsfB 19 | P1oLnrb2OVHEpjuxdK1UYds3z/6ilKwZQvx6uuv0baSbTsQT9TXKpbAZCynOQnGS 20 | 3rzTeOTapwsj1yVlAuo7koxbjFFaz6b1nGC5Ap/rGeVdIT7ZVKF/ 21 | -----END CERTIFICATE----- 22 | -------------------------------------------------------------------------------- /environments/.stackhpc/hooks/post-bootstrap.yml: -------------------------------------------------------------------------------- 1 | - hosts: podman:!builder 2 | become: yes 3 | gather_facts: false 4 | tags: podman 5 | tasks: 6 | - name: Configure container image registry to avoid docker.io ratelimits 7 | copy: 8 | dest: /etc/containers/registries.conf.d/003-arcus-mirror.conf 9 | content: | 10 | [[registry]] 11 | location="docker.io/library/" 12 | prefix="docker.io/library/" 13 | 14 | [[registry.mirror]] 15 | location = "{{ podman_registry_address }}" 16 | insecure = true 17 | when: "ci_cloud == 'ARCUS'" 18 | -------------------------------------------------------------------------------- /environments/.stackhpc/hooks/pre.yml: -------------------------------------------------------------------------------- 1 | - hosts: control:!builder 2 | become: yes 3 | gather_facts: false 4 | tasks: 5 | - name: Output OS version 6 | command: cat /etc/redhat-release 7 | changed_when: false 8 | 9 | - name: Write CI-generated inventory and secrets for debugging 10 | ansible.builtin.copy: 11 | dest: /etc/ci-config/ 12 | src: "{{ item }}" 13 | directory_mode: 0400 14 | mode: 0400 15 | owner: root 16 | group: root 17 | no_log: "{{ no_log | default(true) }}" 18 | loop: 19 | - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts.yml" 20 | - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml" 21 | - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml" 22 | -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/everything: -------------------------------------------------------------------------------- 1 | ../../../environments/common/layouts/everything 
-------------------------------------------------------------------------------- /environments/.stackhpc/inventory/extra_groups: -------------------------------------------------------------------------------- 1 | [basic_users:children] 2 | cluster 3 | 4 | [etc_hosts:children] 5 | cluster 6 | 7 | # -- Example of enabling FreeIPA with an in-appliance (dev-only) server 8 | # NB: The etc_hosts and basic_users group definitions above should be commented out 9 | # The freeipa_* hosts will pick up configuration from environments/.stackhpc/inventory/group_vars/all/freeipa.yml 10 | 11 | # [freeipa_server:children] 12 | # control 13 | # 14 | # [freeipa_client:children] 15 | # login 16 | # compute 17 | # 18 | # [resolv_conf:children] 19 | # freeipa_client 20 | # --- end of FreeIPA example --- 21 | 22 | [manila:children] 23 | # Allows demo; also installs manila client in fat image 24 | login 25 | compute 26 | 27 | [chrony:children] 28 | cluster 29 | 30 | [tuned:children] 31 | # Install tuned into fat image 32 | builder 33 | 34 | [squid:children] 35 | # Install squid into fat image 36 | builder 37 | 38 | [sssd:children] 39 | # Install sssd into fat image 40 | builder 41 | 42 | [rebuild:children] 43 | control 44 | 45 | [cacerts:children] 46 | cluster 47 | 48 | [compute_init:children] 49 | compute 50 | -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/group_vars/all/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/.stackhpc/inventory/group_vars/all/.gitkeep -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/group_vars/all/basic_users.yml: -------------------------------------------------------------------------------- 1 | test_demo_user_password: "{{ lookup('env', 'DEMO_USER_PASSWORD') | default(vault_demo_user_password, true) }}" # CI uses env, debug can set vault_demo_user_password 2 | 3 | basic_users_users: 4 | - name: demo_user # can't use rocky as $HOME isn't shared! 5 | password: "{{ test_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent 6 | uid: 1005 7 | -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/group_vars/all/bastion.yml: -------------------------------------------------------------------------------- 1 | ci_cloud: "{{ lookup('env', 'CI_CLOUD') }}" 2 | bastion_config: 3 | ARCUS: 4 | user: slurm-app-ci 5 | ip: 128.232.222.183 6 | LEAFCLOUD: 7 | user: slurm-app-ci 8 | ip: 195.114.30.222 9 | SMS: 10 | user: slurm-app-ci 11 | ip: 185.45.78.150 12 | # NB: The bastion_{user,ip} variables are used directly in the CI workflow too 13 | bastion_user: "{{ bastion_config[ci_cloud].user }}" 14 | bastion_ip: "{{ bastion_config[ci_cloud].ip }}" 15 | ansible_ssh_common_args: '-o ProxyCommand="ssh {{ bastion_user }}@{{ bastion_ip }} -W %h:%p"' 16 | -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/group_vars/all/freeipa.yml: -------------------------------------------------------------------------------- 1 | # This file provides examples of using freeipa role variables. These are NOT functional in CI as freeipa_{server,client} groups are not defined. 
2 | 3 | # NB: Users defined this way have expired passwords 4 | freeipa_users: 5 | - name: demo_user # can't use rocky as $HOME isn't shared! 6 | password: "{{ test_demo_user_password }}" 7 | givenname: test 8 | sn: test 9 | 10 | # freeipa_client hosts must use a FreeIPA server for name resolution - requires hosts to be in group `resolv_conf`. 11 | resolv_conf_nameservers: 12 | - "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" 13 | -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/group_vars/all/grafana.yml: -------------------------------------------------------------------------------- 1 | grafana_auth_anonymous: true 2 | -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/group_vars/all/hpctests.yml: -------------------------------------------------------------------------------- 1 | hpctests_user: demo_user 2 | -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/group_vars/all/manila.yml: -------------------------------------------------------------------------------- 1 | os_manila_mount_shares_arcus: 2 | - share_name: slurm-v2-home 3 | mount_path: /project 4 | - share_name: slurm-scratch 5 | mount_path: /scratch 6 | 7 | os_manila_mount_shares: "{{ os_manila_mount_shares_arcus if ci_cloud == 'ARCUS' else [] }}" 8 | -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/group_vars/all/openhpc.yml: -------------------------------------------------------------------------------- 1 | openhpc_config_extra: 2 | SlurmctldDebug: debug 3 | SlurmdDebug: debug 4 | -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/group_vars/all/openondemand.yml: -------------------------------------------------------------------------------- 1 | openondemand_auth: basic_pam 2 | openondemand_jupyter_partition: standard 3 | openondemand_desktop_partition: standard 4 | #openondemand_dashboard_support_url: 5 | #openondemand_dashboard_docs_url: 6 | #openondemand_filesapp_paths: 7 | -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/group_vars/all/podman.yml: -------------------------------------------------------------------------------- 1 | arcus_podman_registry_address: 192.168.3.95:5000 2 | podman_registry_address: "{{ arcus_podman_registry_address if ci_cloud == 'ARCUS' else '' }}" 3 | -------------------------------------------------------------------------------- /environments/.stackhpc/inventory/group_vars/builder.yml: -------------------------------------------------------------------------------- 1 | #update_enable: false # Can uncomment for speed debugging non-update related build issues 2 | sssd_install_ldap: true # include sssd-ldap package in fatimage 3 | # update_enable: false # Can uncomment for speed debugging non-update related build issues 4 | 5 | # Uncomment below to use CI pulp servers 6 | 7 | # pulp_server_config: 8 | # LEAFCLOUD: 9 | # url: http://192.168.10.157:8080 10 | # password: lookup('env','LEAFCLOUD_PULP_PASSWORD') 11 | 12 | # appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}" 13 | # pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}" 14 | 15 | # Alternatively, configure to use ark directly: 16 | dnf_repos_username: slurm-app-ci 17 | 
dnf_repos_password: "{{ lookup('env','ARK_PASSWORD') }}" 18 | 19 | # Can be set regardless of approach above: 20 | pulp_site_upstream_username: slurm-app-ci 21 | pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" 22 | -------------------------------------------------------------------------------- /environments/.stackhpc/tofu/ARCUS.tfvars: -------------------------------------------------------------------------------- 1 | cluster_net = "portal-internal" 2 | cluster_subnet = "portal-internal" 3 | control_node_flavor = "vm.ska.cpu.general.eighth" 4 | other_node_flavor = "vm.ska.cpu.general.small" 5 | -------------------------------------------------------------------------------- /environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars: -------------------------------------------------------------------------------- 1 | cluster_networks = [ 2 | { 3 | network = "stackhpc-dev" 4 | subnet = "stackhpc-dev" 5 | } 6 | ] 7 | control_node_flavor = "ec1.medium" # small ran out of memory, medium gets down to ~100Mi mem free on deployment 8 | other_node_flavor = "en1.xsmall" 9 | state_volume_type = "unencrypted" 10 | home_volume_type = "unencrypted" 11 | -------------------------------------------------------------------------------- /environments/.stackhpc/tofu/LEAFCLOUD.tfvars: -------------------------------------------------------------------------------- 1 | cluster_networks = [ 2 | { 3 | network = "slurmapp-ci" 4 | subnet = "slurmapp-ci" 5 | } 6 | ] 7 | control_node_flavor = "ec1.medium" # small ran out of memory, medium gets down to ~100Mi mem free on deployment 8 | other_node_flavor = "en1.xsmall" 9 | state_volume_type = "unencrypted" 10 | home_volume_type = "unencrypted" 11 | -------------------------------------------------------------------------------- /environments/.stackhpc/tofu/SMS.tfvars: -------------------------------------------------------------------------------- 1 | cluster_networks = [ 2 | { 3 | network = "stackhpc-ipv4-geneve" 4 | subnet = "stackhpc-ipv4-geneve-subnet" 5 | } 6 | ] 7 | control_node_flavor = "general.v1.small" 8 | other_node_flavor = "general.v1.small" -------------------------------------------------------------------------------- /environments/.stackhpc/tofu/cluster_image.auto.tfvars.json: -------------------------------------------------------------------------------- 1 | { 2 | "cluster_image": { 3 | "RL8": "openhpc-RL8-250514-1502-5a923b2c", 4 | "RL9": "openhpc-RL9-250514-1502-5a923b2c" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /environments/common/.gitignore: -------------------------------------------------------------------------------- 1 | inventory/hosts 2 | -------------------------------------------------------------------------------- /environments/common/README.md: -------------------------------------------------------------------------------- 1 | # Common configuration 2 | 3 | This contains an inventory that defines variables which are common between the 4 | `production` and `development` environments. It is not intended to be used in 5 | a standalone fashion to deploy infrastructure, but is instead 6 | referenced in `ansible.cfg` from the `production` and `development` configurations. 7 | 8 | The pattern we use is that all resources referenced in the inventory 9 | are located in the environment directory containing the inventory that 10 | references them. 
For example, the file referenced in `inventory/group_vars/prometheus/defaults.yml` 11 | using the variable `prometheus_alert_rules_files` references a file in the 12 | `files` directory relative to this one. 13 | -------------------------------------------------------------------------------- /environments/common/files/grafana/grafana.repo.j2: -------------------------------------------------------------------------------- 1 | {{ ansible_managed | comment }} 2 | [grafana] 3 | baseurl = {{ appliances_pulp_url }}/pulp/content/{{ appliances_pulp_repos.grafana[ansible_distribution_major_version] | appliances_repo_to_subpath }} 4 | enabled = 0 5 | name = grafana 6 | async = 1 7 | gpgcheck = 0 8 | {% if 'dnf_repos' in group_names and dnf_repos_password is defined %} 9 | password = {{ dnf_repos_password }} 10 | username = {{ dnf_repos_username }} 11 | {% endif %} 12 | -------------------------------------------------------------------------------- /environments/common/files/opensearch/internal_users.yml.j2: -------------------------------------------------------------------------------- 1 | --- 2 | # See https://opensearch.org/docs/latest/security-plugin/configuration/yaml#internal_usersyml 3 | 4 | _meta: 5 | type: "internalusers" 6 | config_version: 2 7 | 8 | admin: 9 | hash: "{{ opensearch_admin_password_hash }}" 10 | reserved: true 11 | backend_roles: 12 | - "admin" 13 | description: "Admin user" 14 | -------------------------------------------------------------------------------- /environments/common/files/prometheus/rules/precompute.rules: -------------------------------------------------------------------------------- 1 | # Required for openhpc dashboard 2 | 3 | groups: 4 | - name: opehnpc 5 | interval: 60s 6 | rules: 7 | - record: node_cpu_system_seconds:record 8 | expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="system",job="node"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node"}[60s]))) 9 | - record: node_cpu_user_seconds:record 10 | expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="user",job="node"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node"}[60s]))) 11 | - record: node_cpu_iowait_seconds:record 12 | expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="iowait",job="node"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node"}[60s]))) 13 | - record: node_cpu_other_seconds:record 14 | expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode!="idle",mode!="user",mode!="system",mode!="iowait",job="node"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node"}[60s]))) 15 | - record: node_cpu_scaling_frequency_hertz_avg:record 16 | expr: avg by (instance) (node_cpu_scaling_frequency_hertz) 17 | - record: node_cpu_scaling_frequency_hertz_min:record 18 | expr: min by (instance) (node_cpu_scaling_frequency_hertz) 19 | - record: node_cpu_scaling_frequency_hertz_max:record 20 | expr: max by (instance) (node_cpu_scaling_frequency_hertz) 21 | -------------------------------------------------------------------------------- /environments/common/files/prometheus/rules/slurm.rules: -------------------------------------------------------------------------------- 1 | 2 | groups: 3 | - name: Slurm 4 | rules: 5 | - alert: SlurmNodeDown 6 | annotations: 7 | description: '{{ $value }} Slurm nodes are in down status' 8 | summary: 'At least one Slurm node is down.' 
9 | expr: "slurm_nodes_down > 0\n" 10 | labels: 11 | severity: critical 12 | - alert: SlurmNodeFail 13 | annotations: 14 | description: '{{ $value }} Slurm nodes are in fail status' 15 | summary: 'At least one Slurm node is failed.' 16 | expr: "slurm_nodes_fail > 0\n" 17 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/alertmanager.yml: -------------------------------------------------------------------------------- 1 | 2 | alertmanager_port: '9093' # defined here as required for prometheus 3 | 4 | alertmanager_slack_receiver_name: slack-receiver 5 | alertmanager_slack_receiver_send_resolved: true 6 | alertmanager_slack_receiver: # defined here as needs prometheus address 7 | name: "{{ alertmanager_slack_receiver_name }}" 8 | slack_configs: 9 | - channel: "{{ alertmanager_slack_integration.channel | default('none') }}" 10 | api_url: https://slack.com/api/chat.postMessage 11 | http_config: 12 | authorization: 13 | credentials: "{{ alertmanager_slack_integration.app_creds | default('none') }}" 14 | text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}" 15 | title_link: "{{ prometheus_web_external_url }}/alerts?receiver={{ alertmanager_slack_receiver_name }}" 16 | send_resolved: "{{ alertmanager_slack_receiver_send_resolved }}" 17 | 18 | alertmanager_web_external_url: "http://{{ hostvars[groups['alertmanager'].0].ansible_host }}:{{ alertmanager_port}}/" 19 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/ansible_init.yml: -------------------------------------------------------------------------------- 1 | ansible_init_wait: 300 # seconds 2 | 3 | ansible_init_pip_packages: 4 | # role defaults: 5 | - ansible 6 | - jmespath 7 | - requests 8 | # custom: 9 | - netaddr # required for gateway role 10 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/basic_users.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # See ansible/roles/basic_users/README.md for variable definitions. 4 | 5 | basic_users_users: [] 6 | 7 | # The following are defined for the purpose of compute-init 8 | basic_users_homedir_server: "{{ groups['control'] | first }}" 9 | basic_users_homedir_client: "{{ groups['login'] | first }}" -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/filebeat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # Path to filebeat.yml configuration file template 4 | filebeat_config_path: "{{ appliances_repository_root }}/environments/common/files/filebeat/filebeat.yml" 5 | 6 | # User that runs the filebeat container 7 | filebeat_podman_user: podman -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/firewalld.yml: -------------------------------------------------------------------------------- 1 | # See ansible/roles/firewalld/README.md 2 | # for variable definitions. 3 | 4 | firewalld_configs_default: 5 | # A list of dicts defining firewalld rules. 6 | # Using the "everything" template firewalld is deployed on the login node to enable fail2ban. 7 | # However by default we rely on openstack security groups so make firewalld permissive. 
8 | # Each dict contains: 9 | # name: An arbitrary name or description 10 | # group: An ansible group name - this rule is applied if the fail2ban node is in this group 11 | # rule: A dict of parameters passed to the `ansible.posix.firewalld` module. 12 | # By default we rely on openstack security groups, so: 13 | - name: Make firewalld permissive 14 | group: openhpc 15 | rule: 16 | zone: public 17 | state: enabled 18 | target: ACCEPT 19 | permanent: yes 20 | 21 | firewalld_configs_extra: [] # list of dicts with parameters as for firewalld_configs_default 22 | 23 | firewalld_configs: "{{ (firewalld_configs_default + firewalld_configs_extra) | selectattr('group', 'in', group_names) | map(attribute='rule') }}" 24 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/freeipa_server.yml: -------------------------------------------------------------------------------- 1 | # See ansible/roles/freeipa/README.md 2 | # These vars are only used when freeipa_server is enabled. They are not required when enabling only freeipa_client 3 | freeipa_realm: "{{ openhpc_cluster_name | upper }}.{{ cluster_domain_suffix | upper }}" 4 | freeipa_ds_password: "{{ vault_freeipa_ds_password }}" 5 | freeipa_admin_password: "{{ vault_freeipa_admin_password }}" 6 | # the below doesn't use ansible_default_ipv4.address as that requires facts, and allows for templating when group freeipa_server is empty 7 | freeipa_server_ip: "{{ hostvars[groups['freeipa_server'].0].ansible_host if groups['freeipa_server'] else false }}" 8 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/hpctests.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # See: ansible/roles/hpctests/README.md 3 | # for variable definitions. 4 | 5 | # hpctests_user: 6 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/k3s.yml: -------------------------------------------------------------------------------- 1 | k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first].k3s_bootstrap_token | default('') }}" 2 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/manila.yml: -------------------------------------------------------------------------------- 1 | # Default configuration for manila file shares, see 2 | # https://github.com/stackhpc/ansible-role-os-manila-mount 3 | # for all variable definitions, and override in your environment. 4 | 5 | os_manila_mount_shares: [] 6 | # - share_name: 7 | # share_user: 8 | # mount_path: 9 | # mount_user: 10 | # mount_group: 11 | # mount_mode: 12 | 13 | # os_manila_mount_ceph_version: nautilus # role default for RockyLinux 8 14 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/mysql.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # See ansible/roles/mysql for variable definitions.
4 | 5 | mysql_host: "{{ hostvars[groups['mysql'] | first].api_address }}" 6 | 7 | # The user which runs the mysql container 8 | mysql_podman_user: podman 9 | 10 | # Slurm recommends larger than default values: https://slurm.schedmd.com/accounting.html 11 | mysql_mysqld_options: 12 | - innodb-buffer-pool-size=1024M 13 | - innodb-lock-wait-timeout=900 14 | 15 | mysql_root_password: "{{ vault_mysql_root_password }}" 16 | mysql_datadir: "{{ appliances_state_dir | default('/var/lib') }}/mysql" 17 | 18 | mysql_databases: 19 | - name: slurm_acct_db 20 | config_file: '' 21 | login_user: root 22 | login_password: "{{ mysql_root_password }}" 23 | login_host: "{{ mysql_host }}" 24 | 25 | mysql_users: 26 | - name: slurm 27 | host: "%" 28 | password: "{{ vault_mysql_slurm_password }}" 29 | priv: "slurm_acct_db.*:ALL" 30 | login_user: root 31 | login_password: "{{ mysql_root_password }}" 32 | login_host: "{{ mysql_host }}" 33 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/opensearch.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # See: https://opensearch.org/docs/latest/security-plugin/configuration/index/ 3 | 4 | # Path to template that specifies opensearch users 5 | opensearch_internal_users_path: "{{ appliances_repository_root }}/environments/common/files/opensearch/internal_users.yml.j2" 6 | 7 | # define an idempotent bcrypt hash for the above (requires a 128bit salt in base64 encoding): 8 | opensearch_admin_password_salt: "{{ (2 | pow(128) | int) | random(seed=inventory_hostname) | b64encode }}" 9 | opensearch_admin_password_hash: "{{ vault_elasticsearch_admin_password | password_hash('bcrypt', opensearch_admin_password_salt[0:22]) }}" 10 | 11 | # user running the opensearch container 12 | opensearch_podman_user: podman 13 | 14 | # Path to host directories 15 | opensearch_config_path: "{{ appliances_state_dir | default('/usr/share') }}/opensearch/config" 16 | opensearch_data_path: "{{ appliances_state_dir | default('/usr/share') }}/opensearch/data" 17 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/os-manila-mount.yml: -------------------------------------------------------------------------------- 1 | # Empty repo lists from stackhpc.ansible-role-os-manila-mount role defaults, as these repofiles are 2 | # now generated by dnf_repos to allow injecting Ark creds: 3 | os_manila_mount_ceph_rpm_repos: [] 4 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/podman.yml: -------------------------------------------------------------------------------- 1 | podman_users: "{{ [appliances_local_users_podman] }}" # user to use for podman 2 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/proxy.yml: -------------------------------------------------------------------------------- 1 | # default proxy address to first squid api address port 3128 if squid group non-empty, else empty string to avoid breaking hostvars 2 | proxy_http_proxy: "{{ 'http://' + hostvars[groups['squid'].0].api_address + ':' + (squid_http_port | string) if groups['squid'] else '' }}" 3 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/pulp.yml: 
-------------------------------------------------------------------------------- 1 | pulp_site_port: 8080 2 | 3 | # If using Ark directly (no local Pulp server), override the following with Ark creds 4 | 5 | # dnf_repos_username: 6 | # dnf_repos_password: 7 | 8 | # If instead using local Pulp server, override below with Ark creds 9 | 10 | # pulp_site_upstream_username: 11 | # pulp_site_upstream_password: 12 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/selinux.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | selinux_state: disabled 4 | selinux_policy: targeted 5 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/slurm_exporter.yml: -------------------------------------------------------------------------------- 1 | slurm_exporter_port: 9341 # as defined by [1] and implemented in [2] 2 | #[1]: https://github.com/prometheus/prometheus/wiki/Default-port-allocations 3 | #[2]: https://github.com/stackhpc/prometheus-slurm-exporter/blob/master/lib/systemd/prometheus-slurm-exporter.service 4 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/squid.yml: -------------------------------------------------------------------------------- 1 | squid_http_port: 3128 # defined here for proxy role 2 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/sshd.yaml: -------------------------------------------------------------------------------- 1 | sshd_password_authentication: "{{ sssd_install_ldap | default(false) | bool }}" 2 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/systemd.yml: -------------------------------------------------------------------------------- 1 | _systemd_requiresmount_statedir: | 2 | {% if appliances_state_dir is defined %} 3 | [Unit] 4 | RequiresMountsFor={{ appliances_state_dir | default('') }} 5 | {% endif %} 6 | 7 | systemd_dropins: 8 | # NB: mysql does not need _systemd_requiresmount_statedir as role handles state dir correctly 9 | opensearch: 10 | group: opensearch 11 | content: "{{ _systemd_requiresmount_statedir }}" 12 | grafana-server: 13 | group: grafana 14 | content: "{{ _systemd_requiresmount_statedir }}" 15 | slurmdbd: 16 | group: openhpc 17 | content: "{{ _systemd_requiresmount_statedir }}" 18 | slurmctld: 19 | group: openhpc 20 | content: "{{ _systemd_requiresmount_statedir }}" 21 | prometheus: 22 | group: prometheus 23 | content: "{{ _systemd_requiresmount_statedir }}" 24 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/all/update.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | update_enable: false 4 | # These variables define the packages updates and are passed to ansible's yum module parameters with the same names: https://docs.ansible.com/ansible/latest/collections/ansible/builtin/yum_module.html 5 | update_name: '*' 6 | update_state: latest 7 | update_exclude: 8 | - grafana 9 | - apptainer # see https://github.com/stackhpc/ansible-slurm-appliance/pull/245 10 | update_disablerepo: omit 11 | # Log changes during update here on localhost: 12 | update_log_path: "{{ appliances_environment_root 
}}/logs/{{ inventory_hostname }}-updates.log" 13 | -------------------------------------------------------------------------------- /environments/common/inventory/group_vars/builder/defaults.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # NOTE: Might be better off as extra vars or in a builder-specific inventory, as 3 | # dependent on alphabetical ordering of groups, so if these variables are 4 | # defined elsewhere the group that is ordered lower will determine the values. 5 | update_enable: true 6 | openhpc_slurm_service_started: false 7 | nfs_client_mnt_state: present 8 | block_devices_partition_state: skip 9 | block_devices_filesystem_state: skip 10 | block_devices_mount_state: present 11 | basic_users_manage_homedir: false 12 | grafana_state: stopped # as it tries to listen on the "real" grafana node 13 | block_devices_configurations: [] # as volumes will not be attached to Packer build VMs 14 | mysql_state: stopped # as it tries to connect to real mysql node 15 | opensearch_state: stopped # avoid writing config+certs+db into image 16 | cuda_persistenced_state: stopped # probably don't have GPU in Packer build VMs 17 | firewalld_enabled: false # dnf install of firewalld enables it 18 | firewalld_state: stopped 19 | squid_started: false 20 | squid_enabled: false 21 | squid_cache_disk: 0 # just needs to be defined 22 | squid_cache_mem: 0 23 | tuned_started: false 24 | tuned_enabled: false 25 | sssd_started: false 26 | sssd_enabled: false 27 | appliances_mode: build 28 | -------------------------------------------------------------------------------- /environments/common/layouts/README.md: -------------------------------------------------------------------------------- 1 | # Layouts 2 | 3 | This folder contains some predefined group mappings. You can copy them into 4 | an environment folder if you wish to modify them or just reference them directly 5 | in ansible.cfg as another inventory file. If you are referencing them in the 6 | inventory file, it is advisable to put them just after the common environment. -------------------------------------------------------------------------------- /environments/common/layouts/minimal: -------------------------------------------------------------------------------- 1 | [nfs:children] 2 | cluster 3 | 4 | [openhpc:children] 5 | cluster 6 | 7 | [mysql:children] 8 | control 9 | -------------------------------------------------------------------------------- /environments/skeleton/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": "foo", 3 | "description" : "Describe the environment here" 4 | } 5 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/README.md: -------------------------------------------------------------------------------- 1 | # {{ cookiecutter.environment | title }} cluster 2 | 3 | {{ cookiecutter.description }} 4 | 5 | See the main README.md in the repo root for an overview and general install instructions. Any environment-specific instructions should be added here.
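
For reference, environments like this one are generated from `environments/skeleton`; a minimal sketch, assuming `cookiecutter` is available (for example installed into the venv created by `dev/setup-env.sh`) - check the repo root README.md for the supported workflow:

```
# Illustrative only; prompts for "environment" and "description" as defined in cookiecutter.json
cd environments
cookiecutter skeleton
```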
-------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/activate: -------------------------------------------------------------------------------- 1 | export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) 2 | echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" 3 | 4 | export PS1="$(basename $APPLIANCES_ENVIRONMENT_ROOT)/ ${PS1}" 5 | 6 | export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") 7 | echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" 8 | 9 | export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") 10 | echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" 11 | 12 | export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") 13 | echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" 14 | 15 | export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") 16 | echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" 17 | 18 | if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then 19 | export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg 20 | fi 21 | 22 | 23 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | any_errors_fatal = True 3 | stdout_callback = debug 4 | stderr_callback = debug 5 | gathering = smart 6 | forks = 30 7 | host_key_checking = False 8 | inventory = ../common/inventory,inventory 9 | collections_path = ../../ansible/collections 10 | roles_path = ../../ansible/roles 11 | filter_plugins = ../../ansible/filter_plugins 12 | 13 | [ssh_connection] 14 | ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null 15 | pipelining = True 16 | 17 | [inventory] 18 | # Fail when any inventory source cannot be parsed. 
19 | any_unparsed_is_failed = True 20 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/hooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/skeleton/{{cookiecutter.environment}}/hooks/.gitkeep -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stackhpc/ansible-slurm-appliance/8e4d80cee213c3adf28301d5259b8a6ddf0f3be8/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/.gitkeep -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/alertmanager.yml: -------------------------------------------------------------------------------- 1 | # Uncomment below and add Slack bot app creds in the adjacent file 2 | # vault_alertmanager.yml for Slack integration: 3 | # 4 | # alertmanager_slack_integration: 5 | # channel: '#alerts' 6 | # app_creds: "{% raw %}{{ vault_alertmanager_slack_integration_app_creds }}{% endraw %}" 7 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml: -------------------------------------------------------------------------------- 1 | basic_users_users: 2 | - name: demo_user 3 | password: "{% raw %}{{ vault_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}{% endraw %}" # idempotent 4 | uid: 1005 5 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml: -------------------------------------------------------------------------------- 1 | grafana_auth_anonymous: true -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml: -------------------------------------------------------------------------------- 1 | hpctests_user: demo_user 2 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml: -------------------------------------------------------------------------------- 1 | # Add a bot token here THEN VAULT-ENCRYPT this file! 
2 | 3 | #vault_alertmanager_slack_integration_app_creds: '' 4 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/inventory/groups: -------------------------------------------------------------------------------- 1 | ../../../common/layouts/everything -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ opentofu external data program to list baremetal nodes 3 | 4 | Example usage: 5 | 6 | data "external" "example" { 7 | program = [this_file] 8 | } 9 | 10 | The external data resource's result attribute then contains a mapping of 11 | Ironic node names to their UUIDs. 12 | 13 | An empty map is returned if: 14 | - There are no baremetal nodes 15 | - The listing fails for any reason, e.g. 16 | - there is no baremetal service 17 | - admin credentials are required and are not provided 18 | """ 19 | 20 | import openstack 21 | import json 22 | 23 | output = {} 24 | try: 25 | # both the proxy lookup and the listing itself may fail, e.g. if there is 26 | # no baremetal service or the credentials are insufficient 27 | conn = openstack.connection.from_config() 28 | proxy = getattr(conn, 'baremetal', None) 29 | if proxy is not None: 30 | for node in proxy.nodes(): 31 | output[node.name] = node.id 32 | except Exception: 33 | output = {} 34 | print(json.dumps(output)) 35 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf: -------------------------------------------------------------------------------- 1 | data "external" "baremetal_nodes" { 2 | # returns an empty map if it cannot list baremetal nodes 3 | program = ["${path.module}/baremetal-node-list.py"] 4 | query = {} 5 | } 6 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf: -------------------------------------------------------------------------------- 1 | resource "local_file" "hosts" { 2 | content = templatefile("${path.module}/inventory.tpl", 3 | { 4 | "cluster_name": var.cluster_name, 5 | "cluster_domain_suffix": var.cluster_domain_suffix, 6 | "control": openstack_compute_instance_v2.control 7 | "login_groups": module.login 8 | "compute_groups": module.compute 9 | "state_dir": var.state_dir 10 | "cluster_home_volume": var.home_volume_provisioning != "none" 11 | }, 12 | ) 13 | filename = "../inventory/hosts.yml" 14 | } 15 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/tofu/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.7" # templatestring() function 3 | required_providers { 4 | openstack = { 5 | source = "terraform-provider-openstack/openstack" 6 | version = "~>3.0.0" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/tofu/network.tf: -------------------------------------------------------------------------------- 1 | 2 | data "openstack_networking_network_v2" "cluster_net" { 3 | 4 | for_each = {for net in var.cluster_networks: net.network => net} 5 | 6 | name = each.value.network 7 | } 8 | 9 | data "openstack_networking_subnet_v2" "cluster_subnet" { 10 | 11 | for_each =
{for net in var.cluster_networks: net.network => net} 12 | 13 | name = each.value.subnet 14 | } 15 | 16 | data "openstack_identity_auth_scope_v3" "scope" { 17 | # This is an arbitrary name which is only used as a unique identifier so an 18 | # actual token isn't used as the ID. 19 | name = "scope" 20 | } 21 | 22 | data "openstack_networking_secgroup_v2" "login" { 23 | for_each = toset(var.login_security_groups) 24 | 25 | name = each.key 26 | tenant_id = data.openstack_identity_auth_scope_v3.scope.project_id 27 | } 28 | 29 | data "openstack_networking_secgroup_v2" "nonlogin" { 30 | for_each = toset(var.nonlogin_security_groups) 31 | 32 | name = each.key 33 | tenant_id = data.openstack_identity_auth_scope_v3.scope.project_id 34 | } 35 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.14" 3 | required_providers { 4 | openstack = { 5 | source = "terraform-provider-openstack/openstack" 6 | version = "~>3.0.0" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/network.tf: -------------------------------------------------------------------------------- 1 | 2 | data "openstack_networking_network_v2" "network" { 3 | 4 | for_each = {for net in var.networks: net.network => net} 5 | 6 | name = each.value.network 7 | } 8 | 9 | data "openstack_networking_subnet_v2" "subnet" { 10 | 11 | for_each = {for net in var.networks: net.network => net} 12 | 13 | name = each.value.subnet 14 | } 15 | -------------------------------------------------------------------------------- /environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ opentofu external data program to load inventory string variables from 3 | a (possibly vault-encrypted) secrets file. 4 | 5 | Example usage: 6 | 7 | data "external" "example" { 8 | program = [this_file] 9 | 10 | query = { 11 | path = "${path.module}/../inventory/group_vars/all/secrets.yml" 12 | } 13 | } 14 | 15 | The external data resource's result attribute then contains a mapping of 16 | variable names to values. 17 | 18 | NB: Only keys/values where values are strings are returned, in line with 19 | the external program protocol. 20 | 21 | NB: This approach is better than e.g. templating inventory vars as the 22 | inventory doesn't need to be valid, which is helpful when opentofu will 23 | template out hosts/groups. 
24 | """ 25 | 26 | import sys, json, subprocess, yaml 27 | input = sys.stdin.read() 28 | secrets_path = json.loads(input)['path'] 29 | 30 | with open(secrets_path) as f: 31 | header = f.readline() 32 | if header.startswith('$ANSIBLE_VAULT'): 33 | cmd = ['ansible-vault', 'view', secrets_path] 34 | ansible = subprocess.run(cmd, capture_output=True, text=True) 35 | contents = ansible.stdout 36 | else: 37 | contents = f.read() 38 | 39 | data = yaml.safe_load(contents) 40 | 41 | output = {} 42 | for k, v in data.items(): 43 | if isinstance(v, str): 44 | output[k] = v 45 | print(json.dumps(output)) 46 | -------------------------------------------------------------------------------- /packer/.gitignore: -------------------------------------------------------------------------------- 1 | packer_cache 2 | roles 3 | output_* 4 | *.gz 5 | -------------------------------------------------------------------------------- /packer/ansible-inventory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # NOTE: This allows to make use of the ANSIBLE_CONFIG environment variable 4 | 5 | ansible-inventory --list --export 6 | -------------------------------------------------------------------------------- /packer/openhpc_extravars.yml: -------------------------------------------------------------------------------- 1 | workaround_ansible_issue_61497: yes # extravars files can't be empty 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ansible==6.7.0 # cloudalchemy.prometheus uses ansible.builtin.include, removed in ansible-core==2.16 => ansible==9 2 | openstacksdk 3 | python-openstackclient==6.6.1 # v7.0.0 has a bug re. rebuild 4 | python-manilaclient 5 | python-ironicclient 6 | jmespath 7 | passlib[bcrypt]==1.7.4 8 | cookiecutter 9 | selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3 10 | netaddr 11 | matplotlib 12 | pulp-cli==0.23.2 13 | beautifulsoup4==4.13.3 14 | --------------------------------------------------------------------------------