├── empty-resource ├── tests ├── unit │ ├── requirements.txt │ ├── test_os_platform.py │ ├── test_alert_rules │ │ ├── test_general.yaml │ │ ├── test_ipmi_dcmi.yaml │ │ ├── test_mega_raid.yaml │ │ ├── test_perccli.yaml │ │ ├── test_ssacli.yaml │ │ ├── test_ipmi_sel.yaml │ │ └── test_redfish.yaml │ ├── test_apt_helpers.py │ ├── test_checksum.py │ ├── test_literals.py │ ├── test_ssdlc.py │ └── test_hardware.py ├── manual │ ├── etc │ │ ├── 4_deploy_grafana_agent │ │ │ ├── outputs.tf │ │ │ ├── variables.tf │ │ │ ├── main.tf │ │ │ └── terragrunt.hcl │ │ ├── 2_add_k8s_cloud │ │ │ ├── variables.tf │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ └── terragrunt.hcl │ │ ├── 1_add_machine │ │ │ ├── variables.tf │ │ │ ├── outputs.tf │ │ │ ├── terragrunt.hcl │ │ │ └── main.tf │ │ └── 3_deploy_cos │ │ │ ├── variables.tf │ │ │ ├── outputs.tf │ │ │ ├── terragrunt.hcl │ │ │ └── main.tf │ ├── scripts │ │ ├── get-local-ip.sh │ │ ├── get-kubeconfig.sh │ │ ├── get-preferred-ip.sh │ │ ├── wait-for-model-destroyed.sh │ │ ├── wait-for-model.sh │ │ ├── wait-for-application.sh │ │ ├── cleanup.sh │ │ ├── wait-for-microk8s.sh │ │ └── bootstrap.sh │ ├── jobs │ │ ├── eevee │ │ │ ├── job.tpl.yaml │ │ │ └── README.md │ │ ├── kongfu │ │ │ ├── job.tpl.yaml │ │ │ └── README.md │ │ ├── pianta │ │ │ ├── job.tpl.yaml │ │ │ └── README.md │ │ ├── torchtusk │ │ │ ├── job.tpl.yaml │ │ │ └── README.md │ │ ├── submit.sh │ │ └── README.md │ └── README.md ├── functional │ ├── requirements.txt │ ├── bundle.yaml.j2 │ ├── README.md │ └── conftest.py └── integration │ ├── requirements.txt │ ├── offers-overlay.yaml │ ├── export_mock_metrics.py │ ├── mock_data.py │ ├── utils.py │ ├── conftest.py │ └── test_cos_integration.py ├── requirements-dev.txt ├── requirements.txt ├── src ├── storelib_conf.template ├── prometheus_alert_rules │ ├── general.yaml │ ├── mega_raid.yaml │ ├── ipmi_dcmi.yaml │ ├── perccli.yaml │ ├── ssacli.yaml │ ├── ipmi_sel.yaml │ ├── lsi_sas.yaml │ ├── ipmi_sensors.yaml │ ├── dcgm.yaml │ └── 
smart.yaml ├── apt_helpers.py ├── os_platform.py ├── literals.py ├── ssdlc.py ├── config.py ├── prometheus_alert_rules_dynamic │ └── redfish.yaml ├── keys.py ├── hardware.py └── gpu_metrics │ └── dcgm_metrics.csv ├── templates ├── hardware-exporter.service.j2 └── hardware-exporter-config.yaml.j2 ├── .github ├── CODEOWNERS ├── workflows │ ├── test_prometheus_rules.yaml │ ├── release.yaml │ ├── promote.yaml │ └── cos_integration.yaml ├── .jira_sync_config.yaml └── ISSUE_TEMPLATE │ └── bug_report.yaml ├── SECURITY.md ├── DEVELOPMENT.md ├── .gitignore ├── charmcraft.yaml ├── tox.ini ├── README.md ├── config.yaml ├── metadata.yaml ├── CONTRIBUTING.md └── pyproject.toml /empty-resource: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/requirements.txt: -------------------------------------------------------------------------------- 1 | -r ../../requirements-dev.txt 2 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | coverage 3 | flake8 4 | parameterized 5 | -------------------------------------------------------------------------------- /tests/manual/etc/4_deploy_grafana_agent/outputs.tf: -------------------------------------------------------------------------------- 1 | output "model_name" { 2 | value = var.machine_model 3 | } 4 | -------------------------------------------------------------------------------- /tests/manual/scripts/get-local-ip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ip -4 -j a sho dev br0 | jq -r .[].addr_info[0].local 4 | -------------------------------------------------------------------------------- /tests/functional/requirements.txt: 
-------------------------------------------------------------------------------- 1 | async-lru 2 | pytest 3 | pytest-operator 4 | protobuf 5 | tenacity 6 | pydantic < 2 7 | -------------------------------------------------------------------------------- /tests/manual/scripts/get-kubeconfig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p ~/.kube 4 | sudo microk8s config > ~/.kube/config 5 | -------------------------------------------------------------------------------- /tests/manual/etc/2_add_k8s_cloud/variables.tf: -------------------------------------------------------------------------------- 1 | variable "kube_config" { 2 | description = "The file path to read the kube_config from" 3 | } 4 | -------------------------------------------------------------------------------- /tests/manual/scripts/get-preferred-ip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "$(ip -4 -j route get 2.2.2.2 | jq -r '.[] | .prefsrc')-$(ip -4 -j route get 2.2.2.2 | jq -r '.[] | .prefsrc')" 4 | -------------------------------------------------------------------------------- /tests/manual/jobs/eevee/job.tpl.yaml: -------------------------------------------------------------------------------- 1 | # job.yaml 2 | job_queue: eevee 3 | provision_data: 4 | distro: 5 | reserve_data: 6 | ssh_keys: 7 | - 8 | timeout: 21600 9 | -------------------------------------------------------------------------------- /tests/manual/jobs/kongfu/job.tpl.yaml: -------------------------------------------------------------------------------- 1 | # job.yaml 2 | job_queue: kongfu 3 | provision_data: 4 | distro: 5 | reserve_data: 6 | ssh_keys: 7 | - 8 | timeout: 21600 9 | -------------------------------------------------------------------------------- /tests/manual/jobs/pianta/job.tpl.yaml: -------------------------------------------------------------------------------- 1 | # 
job.yaml 2 | job_queue: pianta 3 | provision_data: 4 | distro: 5 | reserve_data: 6 | ssh_keys: 7 | - 8 | timeout: 21600 9 | -------------------------------------------------------------------------------- /tests/manual/jobs/torchtusk/job.tpl.yaml: -------------------------------------------------------------------------------- 1 | # job.yaml 2 | job_queue: torchtusk 3 | provision_data: 4 | distro: 5 | reserve_data: 6 | ssh_keys: 7 | - 8 | timeout: 21600 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cosl 2 | distro 3 | ops >= 2.2.0 4 | jinja2 5 | redfish # requests is included in this 6 | pydantic < 2 7 | git+https://github.com/canonical/prometheus-hardware-exporter.git@v1.1.0#egg=prometheus-hardware-exporter 8 | -------------------------------------------------------------------------------- /tests/manual/etc/2_add_k8s_cloud/outputs.tf: -------------------------------------------------------------------------------- 1 | output "k8s_cloud_name" { 2 | value = juju_kubernetes_cloud.k8s_cloud.name 3 | } 4 | 5 | output "k8s_cloud_credential" { 6 | value = juju_kubernetes_cloud.k8s_cloud.credential 7 | } 8 | -------------------------------------------------------------------------------- /src/storelib_conf.template: -------------------------------------------------------------------------------- 1 | # Debug Level: 2 | # 0 - No Debug 3 | # 1 - Level 1 4 | # 2 - Level 2 5 | DEBUGLEVEL=0 6 | DISABLELOG=1 7 | # Write option on startup 8 | # 0 - Append to existing debug file 9 | # 1 - create new file 10 | OVERWRITE=0 11 | # Directory where debug file will be created 12 | DEBUGDIR=$debug_dir 13 | -------------------------------------------------------------------------------- /tests/integration/requirements.txt: -------------------------------------------------------------------------------- 1 | jinja2 2 | juju~=3.5.0 # must be 
compatible with the juju CLI version installed by CI - see .github/workflows/cos_integration.yaml 3 | pytest 4 | pytest-operator 5 | prometheus-client 6 | pyinstaller # required to bundle export_mock_metrics script to send it to hw-oberver unit 7 | tenacity 8 | -------------------------------------------------------------------------------- /tests/manual/etc/1_add_machine/variables.tf: -------------------------------------------------------------------------------- 1 | variable "ssh_address" { 2 | description = "SSH address of the machine" 3 | } 4 | 5 | variable "public_key_file" { 6 | description = "The file path to read the public key from" 7 | } 8 | 9 | variable "private_key_file" { 10 | description = "The file path to read the private key from" 11 | } 12 | -------------------------------------------------------------------------------- /templates/hardware-exporter.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=HTTP service for prometheus hardware exporter. 
3 | 4 | [Service] 5 | User=root 6 | Environment=PYTHONPATH={{ CHARMDIR }}/venv 7 | ExecStart=/usr/bin/python3 -m prometheus_hardware_exporter -c {{ CONFIG_FILE }} 8 | Restart=on-failure 9 | 10 | [Install] 11 | WantedBy=multi-user.target 12 | 13 | -------------------------------------------------------------------------------- /tests/manual/etc/2_add_k8s_cloud/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | juju = { 4 | version = "~> 0.17.0" 5 | source = "juju/juju" 6 | } 7 | } 8 | } 9 | 10 | provider "juju" {} 11 | 12 | resource "juju_kubernetes_cloud" "k8s_cloud" { 13 | name = "k8s" 14 | kubernetes_config = file(var.kube_config) 15 | } 16 | -------------------------------------------------------------------------------- /tests/manual/scripts/wait-for-model-destroyed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL="$1" 4 | 5 | if [ -z "$MODEL" ]; then 6 | echo "Wait for the model to be destroyed." 7 | echo "" 8 | echo "Usage: $0 " 9 | exit 1 10 | fi 11 | 12 | while juju show-model $MODEL > /dev/null ; do 13 | echo "$MODEL still exists.." 
14 | sleep 5 15 | done; 16 | -------------------------------------------------------------------------------- /tests/manual/etc/1_add_machine/outputs.tf: -------------------------------------------------------------------------------- 1 | output "machine_model" { 2 | value = juju_model.hw-obs.name 3 | } 4 | output "ubuntu_name" { 5 | value = juju_application.ubuntu.name 6 | } 7 | 8 | output "hardware_observer_name" { 9 | value = juju_application.hardware-observer.name 10 | } 11 | 12 | output "machine_base" { 13 | value = juju_machine.machine.base 14 | } 15 | -------------------------------------------------------------------------------- /tests/manual/scripts/wait-for-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL="$1" 4 | 5 | if [ -z "$MODEL" ]; then 6 | echo "Wait for all applications in the model reaches active and idle." 7 | echo "" 8 | echo "Usage: $0 " 9 | exit 1 10 | fi 11 | 12 | juju wait-for model $MODEL --timeout=20m0s --query='forEach(applications, app => app.status == "active")' 13 | -------------------------------------------------------------------------------- /tests/manual/scripts/wait-for-application.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL="$1" 4 | APPLICATION="$2" 5 | 6 | if [ -z "$APPLICATION" ]; then 7 | echo "Wait for an juju application to reach active and idle." 
8 | echo "" 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | juju switch $MODEL 14 | 15 | juju wait-for application $APPLICATION --query='status=="active" || status=="idle"' 16 | -------------------------------------------------------------------------------- /tests/manual/scripts/cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | 5 | # Clean up all resources idempotently 6 | 7 | juju destroy-model hw-obs --no-prompt --force --no-wait || true 8 | juju destroy-model cos --no-prompt --force --no-wait --destroy-storage || true 9 | juju remove-cloud k8s --client --controller "$(juju controllers --format json | jq -r '."current-controller"')" || true 10 | sudo /sbin/remove-juju-services || true 11 | -------------------------------------------------------------------------------- /tests/manual/etc/3_deploy_cos/variables.tf: -------------------------------------------------------------------------------- 1 | variable "machine_model" { 2 | description = "The machine model name" 3 | } 4 | 5 | variable "metallb_ip_ranges" { 6 | description = "The public IP addresses to services running in the Kubernetes cluster" 7 | } 8 | 9 | variable "k8s_cloud_name" { 10 | description = "The name of the k8s cloud" 11 | } 12 | 13 | variable "k8s_cloud_credential" { 14 | description = "The credential for the k8s cloud" 15 | } 16 | -------------------------------------------------------------------------------- /tests/manual/etc/3_deploy_cos/outputs.tf: -------------------------------------------------------------------------------- 1 | output "model_name" { 2 | value = local.cos_model_name 3 | } 4 | 5 | output "receive-remote-write-offer-url" { 6 | value = module.cos-lite-terraform.prometheus-receive-remote-write-offer-url 7 | } 8 | 9 | output "grafana-dashboard-offer-url" { 10 | value = module.cos-lite-terraform.grafana-dashboard-offer-url 11 | } 12 | 13 | output "loki-logging-offer-url" { 14 | value = 
module.cos-lite-terraform.loki-logging-offer-url 15 | } 16 | -------------------------------------------------------------------------------- /tests/manual/jobs/torchtusk/README.md: -------------------------------------------------------------------------------- 1 | # Testable Exporters 2 | 3 | - [x] Prometheus Hardware Exporter 4 | - [x] ipmi_dcmi 5 | - [x] ipmi_sel 6 | - [x] ipmi_sensor 7 | - [x] redfish 8 | - [ ] hpe_ssa (ssacli) 9 | - [ ] lsi_sas_2 (sas2ircu) 10 | - [ ] lsi_sas_3 (sas3ircu) 11 | - [ ] mega_raid (storcli) 12 | - [ ] poweredge_raid (perccli) 13 | - [x] DCGM Exporter (require NVIDIA) 14 | - [x] dcgm 15 | - [x] Smartctl Exporter (require S.M.A.R.T disks) 16 | - [x] smartctl 17 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/general.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: HardwareObserver 3 | rules: 4 | - alert: CollectorFailed 5 | expr: '{__name__=~"(.*)_collector_failed"} == 1' 6 | for: 30m 7 | labels: 8 | severity: error 9 | annotations: 10 | summary: Collector failed. (instance {{ $labels.instance }}) 11 | description: | 12 | A collector failed to fetch the metrics. Please reach out to hardware-observer maintainers. 
13 | LABELS = {{ $labels }} 14 | -------------------------------------------------------------------------------- /templates/hardware-exporter-config.yaml.j2: -------------------------------------------------------------------------------- 1 | port: {{ PORT }} 2 | level: {{ LEVEL }} 3 | collect_timeout: {{ COLLECT_TIMEOUT }} 4 | {% if COLLECTORS | length > 0 %} 5 | enable_collectors: 6 | {% for collector in COLLECTORS %} 7 | - {{collector}} 8 | {% endfor %} 9 | {% endif %} 10 | 11 | {% if REDFISH_ENABLE %} 12 | redfish_host: "{{ REDFISH_HOST }}" 13 | redfish_username: "{{ REDFISH_USERNAME }}" 14 | redfish_password: "{{ REDFISH_PASSWORD }}" 15 | redfish_client_timeout: "{{ REDFISH_CLIENT_TIMEOUT }}" 16 | {% endif %} 17 | -------------------------------------------------------------------------------- /tests/integration/offers-overlay.yaml: -------------------------------------------------------------------------------- 1 | applications: 2 | alertmanager: 3 | offers: 4 | alertmanager-karma-dashboard: 5 | endpoints: 6 | - karma-dashboard 7 | grafana: 8 | offers: 9 | grafana-dashboards: 10 | endpoints: 11 | - grafana-dashboard 12 | loki: 13 | offers: 14 | loki-logging: 15 | endpoints: 16 | - logging 17 | prometheus: 18 | offers: 19 | prometheus-receive-remote-write: 20 | endpoints: 21 | - receive-remote-write 22 | -------------------------------------------------------------------------------- /tests/manual/etc/2_add_k8s_cloud/terragrunt.hcl: -------------------------------------------------------------------------------- 1 | dependency "add_machine" { 2 | config_path = "../1_add_machine" 3 | skip_outputs = true 4 | } 5 | 6 | terraform { 7 | before_hook "create-dot-kube-dir" { 8 | commands = ["plan"] 9 | execute = ["mkdir", "-p", format("%s/.kube", get_env("HOME"))] 10 | } 11 | 12 | before_hook "touch-kubeconfig" { 13 | commands = ["plan"] 14 | execute = ["touch", format("%s/.kube/config", get_env("HOME"))] 15 | } 16 | } 17 | 18 | inputs = { 19 | kube_config = 
format("%s/.kube/config", get_env("HOME")) 20 | } 21 | -------------------------------------------------------------------------------- /tests/manual/jobs/submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | JOB="$1" 6 | SERIES="$2" 7 | SSH_IMPORT_ID="$3" 8 | 9 | if [ ! -d "$JOB" ] || [ -z "$SERIES" ] || [ -z "$SSH_IMPORT_ID" ]; then 10 | echo "Usage: $0 " 11 | exit 1 12 | fi 13 | 14 | # testflinger cannot access /tmp file because it does not have necessary permission 15 | TEMPFILE="./.tmp-job.yaml" 16 | touch $TEMPFILE 17 | 18 | sed -e "s//$SERIES/g" -e "s//$SSH_IMPORT_ID/g" "$JOB/job.tpl.yaml" | tee "$TEMPFILE" 19 | 20 | testflinger submit $TEMPFILE 21 | 22 | rm -f $TEMPFILE 23 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This file is centrally managed as a template file in https://github.com/canonical/solutions-engineering-automation 2 | # To update the file: 3 | # - Edit it in the canonical/solutions-engineering-automation repository. 4 | # - Open a PR with the changes. 5 | # - When the PR merges, the soleng-terraform bot will open a PR to the target repositories with the changes. 6 | # 7 | # These owners will be the default owners for everything in the repo. Unless a 8 | # later match takes precedence, @canonical/soleng-reviewers will be requested for 9 | # review when someone opens a pull request. 
10 | * @canonical/soleng-reviewers 11 | -------------------------------------------------------------------------------- /tests/manual/scripts/wait-for-microk8s.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | APPLICATION="$1" 5 | 6 | TIMEOUT=300 7 | DELAY=5 8 | START=$(date +%s) 9 | 10 | while true; do 11 | if sudo microk8s kubectl create clusterrole test --verb=get --resource=pods --request-timeout=5s >/dev/null 2>&1; then 12 | echo "✅ Kubernetes API is ready" 13 | sudo microk8s kubectl delete clusterrole test --ignore-not-found >/dev/null 2>&1 14 | break 15 | fi 16 | 17 | NOW=$(date +%s) 18 | if (( NOW - START > TIMEOUT )); then 19 | echo "❌ Timed out waiting for Kubernetes API" 20 | exit 1 21 | fi 22 | 23 | echo "⏳ Waiting for Kubernetes API..." 24 | sleep "$DELAY" 25 | done 26 | -------------------------------------------------------------------------------- /tests/functional/bundle.yaml.j2: -------------------------------------------------------------------------------- 1 | # Test basic deployment: 2 | # ubuntu:juju-info <-> grafana-agent:juju-info 3 | # ubuntu:juju-info <-> hardware-observer:general-info 4 | # grafana-agent:cos-agent <-> hardware-observer:cos-agent 5 | 6 | default-base: {{ base }} 7 | 8 | machines: 9 | "0": 10 | 11 | applications: 12 | ubuntu: 13 | charm: ubuntu 14 | num_units: 1 15 | to: 16 | - "0" 17 | grafana-agent: 18 | charm: grafana-agent 19 | channel: 1/stable 20 | hardware-observer: 21 | charm: {{ charm }} 22 | options: 23 | redfish-disable: {{ redfish_disable }} 24 | 25 | relations: 26 | - - grafana-agent:juju-info 27 | - ubuntu:juju-info 28 | - - hardware-observer:general-info 29 | - ubuntu:juju-info 30 | -------------------------------------------------------------------------------- /tests/unit/test_os_platform.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import 
pytest 4 | 5 | from os_platform import OSPlatform, UbuntuSeries, get_os_platform 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "release,series", 10 | [("22.04", UbuntuSeries.JAMMY), ("20.04", UbuntuSeries.FOCAL), ("NR", None)], 11 | ) 12 | @pytest.mark.parametrize("machine", ["AMD64", "x86_86", "arm64", "riscv64"]) 13 | def test_os_platform_series(release, series, machine): 14 | """Get platform from a patched machine.""" 15 | with patch("distro.info", return_value={"version": release}): 16 | with patch("platform.machine", return_value=machine): 17 | result = get_os_platform() 18 | assert result == OSPlatform(release=release, machine=machine) 19 | assert result.series == series 20 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | # Security policy 8 | 9 | 10 | ## Reporting a vulnerability 11 | To report a security issue, file a [Private Security Report](https://github.com/canonical/hardware-observer-operator/security/advisories/new) 12 | with a description of the issue, the steps you took to create the issue, affected versions, and, 13 | if known, mitigations for the issue. 14 | 15 | The [Ubuntu Security disclosure and embargo policy](https://ubuntu.com/security/disclosure-policy) 16 | contains more information about what you can expect when you contact us and what we expect from you. 
17 | -------------------------------------------------------------------------------- /.github/workflows/test_prometheus_rules.yaml: -------------------------------------------------------------------------------- 1 | name: Test prometheus rules 2 | 3 | on: 4 | workflow_call: 5 | workflow_dispatch: 6 | pull_request: 7 | types: [opened, synchronize, reopened] 8 | branches: [main] 9 | paths-ignore: 10 | - "**.md" 11 | - "**.rst" 12 | 13 | concurrency: 14 | group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} 15 | cancel-in-progress: true 16 | 17 | jobs: 18 | promtool: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Checkout repo 22 | uses: actions/checkout@v3 23 | 24 | # prometheus snap includes promtool 25 | - name: Install prometheus snap 26 | run: sudo snap install prometheus 27 | 28 | - name: Check validity of prometheus alert rules 29 | run: | 30 | promtool check rules src/prometheus_alert_rules/*.yaml 31 | 32 | - name: Run unit tests for prometheus alert rules 33 | run: | 34 | promtool test rules tests/unit/test_alert_rules/*.yaml 35 | -------------------------------------------------------------------------------- /tests/manual/etc/4_deploy_grafana_agent/variables.tf: -------------------------------------------------------------------------------- 1 | variable "machine_model" { 2 | description = "The machine model name" 3 | } 4 | 5 | variable "grafana_agent_base" { 6 | description = "The base for grafana agent" 7 | } 8 | 9 | variable "ubuntu_name" { 10 | description = "The name of ubuntu charm" 11 | default = "ubuntu" 12 | } 13 | 14 | variable "hardware_observer_name" { 15 | description = "The name of hardware observer charm" 16 | default = "hardware observer" 17 | } 18 | 19 | variable "receive-remote-write-offer-url" { 20 | description = "Offer URL from prometheus-k8s:receive-remote-write application" 21 | type = string 22 | default = null 23 | } 24 | 25 | variable "grafana-dashboard-offer-url" { 26 | description = "Offer URL from 
grafana-k8s:grafana-dashboard application" 27 | type = string 28 | default = null 29 | } 30 | 31 | variable "loki-logging-offer-url" { 32 | description = "Offer URL from loki-k8s:logging application" 33 | type = string 34 | default = null 35 | } 36 | -------------------------------------------------------------------------------- /tests/manual/jobs/eevee/README.md: -------------------------------------------------------------------------------- 1 | # Testable Exporters 2 | 3 | - [x] Prometheus Hardware Exporter 4 | - [x] ipmi_dcmi 5 | - [x] ipmi_sel 6 | - [x] ipmi_sensor 7 | - [x] redfish 8 | - [x] hpe_ssa (ssacli) 9 | - [ ] lsi_sas_2 (sas2ircu) 10 | - [ ] lsi_sas_3 (sas3ircu) 11 | - [ ] mega_raid (storcli) 12 | - [ ] poweredge_raid (perccli) 13 | - [ ] DCGM Exporter (require NVIDIA) 14 | - [ ] dcgm 15 | - [x] Smartctl Exporter (require S.M.A.R.T disks) 16 | - [x] smartctl 17 | 18 | ## Running the tests 19 | 20 | You can run the functional tests for real hardware by following these steps: 21 | 22 | ```shell 23 | # Adding relation will be tested as part of the test case, so we need to remove it before running the tests 24 | juju remove-relation -m hw-obs hardware-observer grafana-agent 25 | 26 | # We don't have redfish credential for this machine 27 | juju config -m hw-obs hardware-observer redfish-disable=true 28 | 29 | # Running the tests 30 | tox -e func -- -v --realhw --model hw-obs --no-deploy --collectors ipmi_dcmi ipmi_sel ipmi_sensor hpe_ssa --keep-models 31 | ``` 32 | -------------------------------------------------------------------------------- /DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | ## Setup environment 4 | 5 | To start working on this charm, you'll need a working [development setup](https://juju.is/docs/sdk/dev-setup). 
6 | 7 | You can create an environment for development with `tox`: 8 | 9 | ```shell 10 | tox devenv -e integration 11 | source venv/bin/activate 12 | ``` 13 | 14 | ## Testing 15 | 16 | This project uses `tox` for managing test environments. There are some pre-configured environments 17 | that can be used for linting and formatting code when you're preparing contributions to the charm: 18 | 19 | ```shell 20 | tox run -e format # update your code according to linting rules 21 | tox run -e lint # code style 22 | tox run -e unit # unit tests 23 | tox run -e integration # integration tests 24 | tox # runs 'format', 'lint', and 'unit' environments 25 | ``` 26 | 27 | ## Build the charm 28 | 29 | Build the charm in this git repository using: 30 | 31 | ```shell 32 | charmcraft pack 33 | ``` 34 | 35 | 10 | [![Charmhub Badge](https://charmhub.io/hardware-observer/badge.svg)](https://charmhub.io/hardware-observer) 11 | [![Release Edge](https://github.com/canonical/hardware-observer-operator/actions/workflows/release.yaml/badge.svg)](https://github.com/canonical/hardware-observer-operator/actions/workflows/release.yaml) 12 | 13 | # Hardware Observer Operator 14 | 15 | ## Overview 16 | [Charmhub Page](https://charmhub.io/hardware-observer) 17 | 18 | Hardware-observer is a [subordinate machine charm](https://juju.is/docs/sdk/charm-taxonomy#heading--subordinate-charms) that provides monitoring and alerting of hardware resources on bare-metal infrastructure. This charm leverages the following exporters to provide detailed metrics: 19 | 20 | - **Prometheus Hardware Exporter**: For collecting metrics from BMCs and RAID controllers. 21 | - **Smartctl Exporter**: For collecting SMART metrics from storage devices. 
22 | 23 | ### Prometheus Hardware Exporter 24 | Hardware-observer collects and exports Prometheus metrics from BMCs (using the IPMI and newer Redfish protocols) and various SAS and RAID controllers through the use of the [prometheus-hardware-exporter](https://github.com/canonical/prometheus-hardware-exporter) project. It additionally configures Prometheus alert rules that are fired when the status of any metric is suboptimal. 25 | 26 | Appropriate collectors and alert rules are installed based on the availability of one or more of the RAID/SAS controllers mentioned below: 27 | 28 | - Broadcom MegaRAID controller 29 | - Dell PowerEdge RAID Controller 30 | - LSI SAS-2 controller 31 | - LSI SAS-3 controller 32 | - HPE Smart Array controller 33 | 34 | ### Smartctl Exporter 35 | The Smartctl Exporter integrates with the Hardware-observer to provide monitoring of storage device health via SMART data. Metrics are collected and exported to Prometheus using the [smartctl-exporter](https://github.com/prometheus-community/smartctl_exporter). 36 | 37 | This charm is ideal for monitoring hardware resources when used in conjunction with the [Canonical Observability Stack](https://charmhub.io/topics/canonical-observability-stack). 38 | 39 | ## Uploading Resources 40 | 41 | In order to manage third-party hardware resources, vendor-specific CLI tools need to be uploaded via `juju attach-resource`. 42 | 43 | In the [Resources page](https://charmhub.io/hardware-observer/resources) on Charmhub, the name of the resource along with the download URL can be found. 44 | 45 | 46 | ## Other Links 47 | 48 | 49 | 50 | - [Contributing](CONTRIBUTING.md) 51 | 52 | - See the [Juju SDK documentation](https://juju.is/docs/sdk) for more information about developing and improving charms. 53 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Canonical Ltd. 
2 | # See LICENSE file for licensing details. 3 | # 4 | 5 | options: 6 | hardware-exporter-port: 7 | type: int 8 | default: 10200 9 | description: | 10 | Start the prometheus hardware exporter at "hardware-exporter-port". By default, 11 | it will start at port 10200. 12 | smartctl-exporter-port: 13 | type: int 14 | default: 10201 15 | description: | 16 | Start the prometheus smartctl exporter at "smartctl-exporter-port". By default, 17 | it will start at port 10201. 18 | dcgm-snap-channel: 19 | type: string 20 | default: "auto" 21 | description: | 22 | Valid inputs are auto, v3/stable, v3/candidate, v3/edge, v4/stable, v4/candidate, v4/edge. 23 | 24 | When set to auto, the charm automatically checks the installed NVIDIA driver version and 25 | selects the most appropriate channel. For example, with driver 580, it will install from 26 | v4-cuda13/stable. The charm won't block if the driver is not installed or loaded. 27 | 28 | The v3 channels are compatible with CUDA 10, 11, and 12; you simply choose the desired 29 | release risk (stable, candidate, or edge). 30 | 31 | The v4 channels provide more flexibility. Even if you select v4/edge, the charm still 32 | detects the driver version and installs the matching CUDA build. For example, 33 | with driver 570, it will install from v4-cuda12/edge. 34 | smartctl-exporter-snap-channel: 35 | type: string 36 | default: "latest/stable" 37 | description: | 38 | Channel to install the Smartctl exporter snap if the hardware has smart disk. By default, it will install 39 | from latest/stable. 40 | exporter-log-level: 41 | type: string 42 | default: "INFO" 43 | description: | 44 | Start the prometheus exporter with log level "exporter-log-level". By 45 | default, it will set to INFO. Allowed values are "DEBUG", "INFO", 46 | "WARNING", "ERROR", "CRITICAL". Values other than those will result in 47 | failure of the exporter. 
48 | collect-timeout: 49 | type: int 50 | default: 10 51 | description: | 52 | Timeout for collectors' shell commands in seconds. Changing this will also change 53 | the scrape_timeout config option for prometheus for the cos-agent relation with 54 | grafana-agent. 55 | This value is also used for the redfish client's timeout parameter. 56 | The value of this timeout should not be greater than prometheus scrape_interval (which 57 | is 60 seconds by default), as it greater would cause the scrape_timeout to be 58 | greater than scrape_interval. 59 | redfish-username: 60 | type: string 61 | default: "" 62 | description: | 63 | BMC username to be used by the redfish collector. 64 | redfish-password: 65 | type: string 66 | default: "" 67 | description: | 68 | BMC password to be used by the redfish collector. 69 | redfish-disable: 70 | type: boolean 71 | default: true 72 | description: | 73 | By default the Redfish collector is disabled. Set this option to false to enable it. 74 | -------------------------------------------------------------------------------- /tests/unit/test_literals.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | from pydantic import ValidationError 8 | 9 | from literals import HWObserverConfig 10 | 11 | 12 | @pytest.fixture(autouse=True) 13 | def mock_driver_to_cuda(): 14 | with patch("literals.get_cuda_version_from_driver") as mock: 15 | yield mock 16 | 17 | 18 | @pytest.fixture(autouse=True) 19 | def mock_driver_version(): 20 | with patch("literals.get_nvidia_driver_version") as mock: 21 | yield mock 22 | 23 | 24 | @pytest.mark.parametrize("dcgm_config", ["auto"]) 25 | def test_accepts_auto(dcgm_config): 26 | """Test that 'auto' passes validation without errors.""" 27 | cfg = HWObserverConfig(dcgm_snap_channel=dcgm_config) 28 | assert cfg.dcgm_snap_channel == dcgm_config 29 | 30 | 31 | @pytest.mark.parametrize( 32 | "dcgm_config", ["v3/stable", "v3/edge", "v3/candidate", "v4/stable", "v4/edge", "v4/candidate"] 33 | ) 34 | def test_valid_channels(mock_driver_to_cuda, dcgm_config): 35 | """Test valid v3 and v4 channels for supported CUDA versions.""" 36 | mock_driver_to_cuda.return_value = 12 37 | 38 | cfg = HWObserverConfig(dcgm_snap_channel=dcgm_config) 39 | assert cfg.dcgm_snap_channel == dcgm_config 40 | 41 | 42 | @pytest.mark.parametrize("dcgm_config", ["invalid/stable", "foo/edge", "123/candidate"]) 43 | def test_invalid_track(mock_driver_to_cuda, dcgm_config): 44 | """Invalid tracks should raise ValueError.""" 45 | mock_driver_to_cuda.return_value = 12 46 | with pytest.raises(ValidationError) as e: 47 | HWObserverConfig(dcgm_snap_channel=dcgm_config) 48 | assert "Invalid track" in str(e.value) 49 | 50 | 51 | @pytest.mark.parametrize("dcgm_config", ["v3/unknown", "v4/beta", "v3/dev"]) 52 | def test_invalid_risk(mock_driver_to_cuda, dcgm_config): 53 | """Invalid risk should raise ValueError.""" 54 | mock_driver_to_cuda.return_value = 12 55 | with pytest.raises(ValidationError) as e: 56 | HWObserverConfig(dcgm_snap_channel=dcgm_config) 57 | assert "Invalid channel risk" in str(e.value) 58 | 59 | 60 | def 
test_missing_risk(mock_driver_to_cuda): 61 | """Values without the risk should raise ValueError.""" 62 | mock_driver_to_cuda.return_value = 12 63 | with pytest.raises(ValidationError) as e: 64 | HWObserverConfig(dcgm_snap_channel="v3") 65 | assert "Channel must be in the form" in str(e.value) 66 | 67 | 68 | def test_incompatible_v3_with_cuda13(mock_driver_to_cuda): 69 | """v3 should fail if CUDA version is 13 (driver 580+).""" 70 | mock_driver_to_cuda.return_value = 13 71 | with pytest.raises(ValidationError) as e: 72 | HWObserverConfig(dcgm_snap_channel="v3/stable") 73 | assert "not compatible" in str(e.value) 74 | 75 | 76 | def test_incompatible_v4_with_cuda10(mock_driver_to_cuda): 77 | """v4 should fail if CUDA version is 10 (old driver).""" 78 | mock_driver_to_cuda.return_value = 10 79 | with pytest.raises(ValidationError) as e: 80 | HWObserverConfig(dcgm_snap_channel="v4/stable") 81 | assert "not compatible" in str(e.value) 82 | -------------------------------------------------------------------------------- /.github/workflows/cos_integration.yaml: -------------------------------------------------------------------------------- 1 | # This workflow runs a set of integration tests, 2 | # using hardware-observer-operator *from charmhub* (not locally built). 3 | # It is designed to be run periodically to catch potential issues 4 | # from recent changes to either hardware-observer-operator or COS. 
5 | name: COS Integration tests 6 | 7 | on: 8 | workflow_call: 9 | workflow_dispatch: 10 | pull_request: 11 | types: [opened, synchronize, reopened] 12 | branches: [main] 13 | paths: 14 | - ".github/workflows/cos_integration.yaml" 15 | 16 | jobs: 17 | integration: 18 | runs-on: ubuntu-22.04 19 | timeout-minutes: 120 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v3 23 | 24 | - name: Get IP address of the host 25 | run: | 26 | # Finding preferred source ip address by trying to reach destination 2.2.2.2 27 | # This ip address will be used while enabling metallb 28 | echo "IPADDR=$(ip -4 -j route get 2.2.2.2 | jq -r '.[] | .prefsrc')" >> $GITHUB_ENV 29 | 30 | - name: Setup lxd controller 31 | uses: charmed-kubernetes/actions-operator@main 32 | with: 33 | # The juju version can be any stable version, as long as it is the same as libjuju version used. 34 | # If you update it here, update it also in tests/integration/requirements.txt and 'Setup k8s controller' step below 35 | juju-channel: 3.5/stable 36 | provider: lxd 37 | 38 | - name: Save lxd controller name 39 | id: lxd-controller 40 | # The `CONTROLLER_NAME` envvar is set by the actions-operator action 41 | run: echo "name=$CONTROLLER_NAME" >> $GITHUB_OUTPUT 42 | 43 | - name: Setup k8s controller 44 | uses: charmed-kubernetes/actions-operator@main 45 | with: 46 | # The juju version can be any stable version, as long as it is the same as libjuju version used. 
47 | # If you update it here, update it also in tests/integration/requirements.txt and 'Setup lxd controller' step above 48 | juju-channel: 3.5/stable 49 | provider: microk8s 50 | channel: 1.28-strict/stable 51 | microk8s-addons: "hostpath-storage dns metallb:${{ env.IPADDR }}-${{ env.IPADDR }}" 52 | 53 | - name: Save k8s controller name 54 | id: k8s-controller 55 | # The `CONTROLLER_NAME` envvar is set by the actions-operator action 56 | run: echo "name=$CONTROLLER_NAME" >> $GITHUB_OUTPUT 57 | 58 | - name: Fix microk8s permissions 59 | run: chmod -R ugo+rwX ~/.kube 60 | 61 | - name: Run integration tests 62 | run: tox -e integration 63 | env: 64 | K8S_CONTROLLER: ${{ steps.k8s-controller.outputs.name }} 65 | LXD_CONTROLLER: ${{ steps.lxd-controller.outputs.name }} 66 | 67 | - name: Dump debug log 68 | if: failure() 69 | run: | 70 | for ctl in $(juju controllers --format json | jq -r '.controllers | keys[]'); do 71 | for mdl in $(juju models --format json | jq -r '.models[].name' | grep -v "admin/controller"); do 72 | juju debug-log -m $ctl:$mdl --replay --ms --no-tail 73 | done 74 | done || true 75 | shell: bash 76 | -------------------------------------------------------------------------------- /tests/unit/test_alert_rules/test_mega_raid.yaml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - ../../../src/prometheus_alert_rules/mega_raid.yaml 3 | 4 | evaluation_interval: 1m 5 | 6 | tests: 7 | 8 | - interval: 1m 9 | input_series: 10 | - series: 'storcli_command_success{instance="ubuntu-0"}' 11 | values: '0x15' # error 12 | 13 | alert_rule_test: 14 | - eval_time: 0m 15 | alertname: StorcliCommandFailed 16 | exp_alerts: 17 | - exp_labels: 18 | severity: critical 19 | instance: ubuntu-0 20 | exp_annotations: 21 | summary: Failed to run storcli. (instance ubuntu-0) 22 | description: | 23 | Failed to get MegaRAID controller information using storcli. 
24 | VALUE = 0 25 | LABELS = map[__name__:storcli_command_success instance:ubuntu-0] 26 | 27 | 28 | - interval: 1m 29 | input_series: 30 | - series: 'storcli_command_success{instance="ubuntu-1"}' 31 | values: '1x15' 32 | - series: 'megaraid_controllers{instance="ubuntu-1"}' 33 | values: '0x15' # error 34 | 35 | alert_rule_test: 36 | - eval_time: 0m 37 | alertname: MegaRAIDControllerNotFound 38 | exp_alerts: 39 | - exp_labels: 40 | severity: warning 41 | instance: ubuntu-1 42 | exp_annotations: 43 | summary: MegaRAID controller not found. (instance ubuntu-1) 44 | description: | 45 | Cannot found MegaRAID controller on this host machine. 46 | NUMBER_OF_CONTROLLERS = 0 47 | LABELS = map[__name__:megaraid_controllers instance:ubuntu-1] 48 | 49 | 50 | - interval: 1m 51 | input_series: 52 | - series: 'storcli_command_success{instance="ubuntu-2"}' 53 | values: '1x15' 54 | - series: 'megaraid_controllers{instance="ubuntu-2", hostname="ubuntu-2"}' 55 | values: '1x15' 56 | - series: 'megaraid_virtual_drive_info{instance="ubuntu-2", controller_id="0", drive_group="0", virtual_drive_group="239", state="Dgrd", name="NVMe-RAID-1" }' 57 | values: '1x15' # error 58 | - series: 'megaraid_virtual_drive_info{instance="ubuntu-2", controller_id="0", drive_group="0", virtual_drive_group="240", state="Optl", name="NVMe-RAID-2" }' 59 | values: '0x15' # okay 60 | 61 | alert_rule_test: 62 | - eval_time: 0m 63 | alertname: MegaRAIDVirtualDriveNotOptimal 64 | exp_alerts: 65 | - exp_labels: 66 | severity: warning 67 | controller_id: 0 68 | drive_group: 0 69 | virtual_drive_group: 239 70 | name: NVMe-RAID-1 71 | instance: ubuntu-2 72 | state: Dgrd 73 | exp_annotations: 74 | summary: MegaRAID virtual drives are not in optimal state. (instance ubuntu-2) 75 | description: | 76 | MegaRAID virtual drives are not in optimal state. Please check the if the virtual drives are working as expected. 
77 | STATE = Dgrd 78 | LABELS = map[__name__:megaraid_virtual_drive_info controller_id:0 drive_group:0 instance:ubuntu-2 name:NVMe-RAID-1 state:Dgrd virtual_drive_group:239] 79 | -------------------------------------------------------------------------------- /metadata.yaml: -------------------------------------------------------------------------------- 1 | # This file populates the Overview on Charmhub. 2 | 3 | # The charm package name, no spaces (required) 4 | # See https://juju.is/docs/sdk/naming#heading--naming-charms for guidance. 5 | name: hardware-observer 6 | 7 | # The following metadata are human-readable and will be published prominently on Charmhub. 8 | 9 | # (Recommended) 10 | display-name: Hardware Observer 11 | 12 | summary: Subordinate charm for monitoring hardware resources. 13 | 14 | description: Subordinate charm for monitoring hardware resources. 15 | 16 | website: https://github.com/canonical/hardware-observer-operator 17 | 18 | docs: https://discourse.charmhub.io/t/hardware-observer-docs-index/11112 19 | 20 | issues: https://github.com/canonical/hardware-observer-operator/issues 21 | 22 | subordinate: true 23 | 24 | resources: 25 | storcli-deb: 26 | type: file 27 | description: | 28 | (Optional) StorCLI deb file published by Broadcom for their RAID devices. 29 | Download v7.27 from: https://docs.broadcom.com/docs/1232743397. 30 | The download will start automatically upon accepting the license agreement. 31 | Unzip the downloaded file and attach the relevant deb package. 32 | E.g.: 33 | On AMD64 hosts, use ./Unified_storcli_all_os/Ubuntu/storcli_007.2705.0000.0000_all.deb 34 | On ARM64 hosts, use ./Unified_storcli_all_os/ARM/Linux/storcli64_007.2705.0000.0000_arm64.deb 35 | filename: storcli.deb 36 | 37 | perccli-deb: 38 | type: file 39 | description: | 40 | (Optional) PERCCLI deb file published by Dell for their RAID devices. 41 | Download v7.23 from https://www.dell.com/support/home/en-us/drivers/driversdetails?driverid=tdghn. 
42 | Scroll down to "Available Formats" and download the PERCCLI_XXX_Linux.tar.gz file. 43 | Extract the downloaded file and attach the relevant deb package. 44 | E.g.: ./PERCCLI_7.2313.0_A14_Linux/perccli_007.2313.0000.0000_all.deb 45 | Note: perccli is only available for the AMD64 architecture. 46 | filename: perccli.deb 47 | 48 | sas2ircu-bin: 49 | type: file 50 | description: | 51 | (Optional) SAS2IRCU binary file published by Broadcom. 52 | Download vP20 from https://docs.broadcom.com/docs/12351735. 53 | The download will start automatically upon accepting the license agreement. 54 | Unzip the downloaded file and attach the relevant binary. 55 | E.g.: ./SAS2IRCU_P20/sas2ircu_linux_x86_rel/sas2ircu 56 | Note: sas2ircu is only available for the AMD64 architecture. 57 | filename: sas2ircu 58 | 59 | sas3ircu-bin: 60 | type: file 61 | description: | 62 | (Optional) SAS3IRCU binary file published by Broadcom. 63 | Download vP16 from https://docs.broadcom.com/docs/SAS3IRCU_P16.zip. 64 | The download will start automatically upon accepting the license agreement. 65 | Unzip the downloaded file and attach the relevant binary. 66 | E.g.: 67 | On AMD64 hosts, use ./SAS3IRCU_P16/sas3ircu_linux_x64_rel/sas3ircu. 68 | On ARM64 hosts, use ./SAS3IRCU_P16/sas3ircu_linux_arm_rel/sas3ircu. 69 | filename: sas3ircu 70 | 71 | provides: 72 | cos-agent: 73 | interface: cos_agent 74 | limit: 1 75 | 76 | requires: 77 | general-info: 78 | interface: juju-info 79 | scope: container 80 | -------------------------------------------------------------------------------- /tests/manual/jobs/README.md: -------------------------------------------------------------------------------- 1 | # List of Testflinger Jobs for Hardware Observer Manual Tests 2 | 3 | This directory contains a list of job queues on [testflinger][testflinger] that can be used for testing hardware 4 | observer manually. 
Each job queue is defined in a directory with a `README.md` that indicates the testable items on that 5 | machine. 6 | 7 | > [!Note] 8 | > You can only submit jobs defined in this directory! 9 | 10 | The `./submit.sh` script is a simple wrapper for `testflinger submit` that allows users to submit the jobs with customized 11 | [`distro`][job-schema] and [`ssh_keys`][sshkeys] (only supports one ssh key). 12 | 13 | ## Quick Start 14 | 15 | You can allocate a physical machine using the `./submit.sh` script. For example, to allocate a machine from job queue 16 | [`torchtusk`](./torchtusk), and use ubuntu:24.04 (noble) as the OS image, and import ssh key using launchpad ID 17 | `lp:myusername-1234`. Run 18 | 19 | ```shell 20 | $ ./submit.sh torchtusk noble lp:myusername-1234 21 | # job.yaml 22 | job_queue: torchtusk 23 | provision_data: 24 | distro: noble 25 | reserve_data: 26 | ssh_keys: 27 | - lp:myusername-1234 28 | timeout: 21600 29 | Job submitted successfully! 30 | job_id: 25a3b103-26dd-421c-817d-2950f968d327 31 | ``` 32 | 33 | Then, wait for the machine to become available 34 | 35 | ```shell 36 | $ testflinger poll 25a3b103-26dd-421c-817d-2950f968d327 37 | 38 | *************************************************** 39 | * Starting testflinger reserve phase on torchtusk * 40 | *************************************************** 41 | 42 | ... 43 | 44 | Number of key(s) added: 3 45 | 46 | Now try logging into the machine, with: "ssh -o 'StrictHostKeyChecking=no' -o 'UserKnownHostsFile=/dev/null' 'ubuntu@xxx.xxx.xxx.xxx'" 47 | and check to make sure that only the key(s) you wanted were added.
48 | 49 | *** TESTFLINGER SYSTEM RESERVED *** 50 | You can now connect to ubuntu@xxx.xxx.xxx.xxx 51 | Current time: [2025-03-17T05:40:47.103464] 52 | Reservation expires at: [2025-03-17T11:40:47.103513] 53 | Reservation will automatically timeout in 21600 seconds 54 | To end the reservation sooner use: testflinger-cli cancel 25a3b103-26dd-421c-817d-2950f968d327 55 | ``` 56 | 57 | Finally, you can login to the machine using the command provided 58 | 59 | ```shell 60 | ssh -o 'StrictHostKeyChecking=no' -o 'UserKnownHostsFile=/dev/null' 'ubuntu@xxx.xxx.xxx.xxx' # IP address is redacted 61 | ``` 62 | 63 | ## Contributing 64 | 65 | Please add more job queues to this directory to increase test coverage for Hardware Observer. An example contribution of 66 | job queue can be something like the following: 67 | 68 | ```text 69 | torchtusk/ 70 | ├── job.tpl.yaml 71 | └── README.md 72 | ``` 73 | 74 | where the **name of the directory** is the `job_queue`; the file **job.tpl.yaml** is the [job definition][job-schema]; 75 | and `README.md` contains the testable items on that machine. Alternatively, you can simply copy an existing job, and 76 | update the job with different information. 77 | 78 | 79 | [testflinger]: https://certification.canonical.com/docs/ops/tel-labs-docs/how-to/use_machines_through_testflinger/ 80 | [job-schema]: https://canonical-testflinger.readthedocs-hosted.com/en/latest/reference/job-schema.html 81 | [sshkeys]: https://canonical-testflinger.readthedocs-hosted.com/en/latest/reference/test-phases.html#reserve 82 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributor Guide 2 | 3 | Thank you for your interest in helping us improve this project! We're open to 4 | community contributions, suggestions, fixes, and feedback. This documentation 5 | will assist you in navigating through our processes.
6 | 7 | Make sure to review this guide thoroughly before beginning your contribution. It 8 | provides all the necessary details to increase the likelihood of your contribution 9 | being accepted. 10 | 11 | This project is hosted and managed on [GitHub](https://github.com). If you're new to GitHub 12 | and not familiar with how it works, their 13 | [quickstart documentation](https://docs.github.com/en/get-started/quickstart) 14 | provides an excellent introduction to all the tools and processes you'll need 15 | to know. 16 | 17 | ## Prerequisites 18 | 19 | Before you can begin, you will need to: 20 | 21 | * Read and agree to abide by our 22 | [Code of Conduct](https://ubuntu.com/community/code-of-conduct). 23 | 24 | * Sign the Canonical 25 | [contributor license agreement](https://ubuntu.com/legal/contributors). This 26 | grants us your permission to use your contributions in the project. 27 | 28 | * Create (or have) a GitHub account. 29 | 30 | * If you're working in a local environment, it's important to create a signing 31 | key, typically using GPG or SSH, and register it in your GitHub account to 32 | verify the origin of your code changes. For instructions on setting this up, 33 | please refer to 34 | [Managing commit signature verification](https://docs.github.com/en/authentication/managing-commit-signature-verification). 35 | 36 | ## Contributing Code 37 | 38 | ### Workflow 39 | 40 | 1. **Choose/Create an Issue**: Before starting work on an enhancement, create an issue that explains your use case. This helps track progress and keeps the discussion organized. The issue will be tracked on the GitHub issue page. 41 | 42 | 2. **Fork the Repository**: Create a fork of the repository to make your changes. 43 | 44 | 3. **Create a New Branch**: Make sure to create a new branch for your contribution. 45 | 46 | 4. **Commit your changes**: Commit messages should be well-structured and provide a meaningful explanation of the changes made 47 | 48 | 5. 
**Submit a Pull Request**: Submit a pull request to merge your changes into the main branch. Reference the issue by adding issue link or `Fixes: #xxx` (replace `xxx` with the issue number) to automatically link the issue to your PR. 49 | 50 | 6. **Review Process**: A team member will review your pull request. They may suggest changes or leave comments, so keep an eye on the PR status and be ready to make updates if needed. 51 | 52 | 7. **Documentation**: Any documentation changes should be included as part of your PR or as a separate PR linked to your original PR. 53 | 54 | 55 | ### Hard Requirements 56 | 57 | - **Testing and Code Coverage**: Changes must be accompanied by appropriate unit tests and meet the project's code coverage requirements. Functional and integration tests should be added when applicable to ensure the stability of the codebase. 58 | 59 | - **Sign Your Commits**: Be sure to [sign your commits](https://docs.github.com/en/authentication/managing-commit-signature-verification/signing-commits), refer to the [Prerequisites](#prerequisites) section. 60 | 61 | ## Code of Conduct 62 | 63 | This project follows the Ubuntu Code of Conduct. You can read it in full [here](https://ubuntu.com/community/code-of-conduct). 64 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # This file is centrally managed as a template file in https://github.com/canonical/solutions-engineering-automation 2 | # To update the file: 3 | # - Edit it in the canonical/solutions-engineering-automation repository. 4 | # - Open a PR with the changes. 5 | # - When the PR merges, the soleng-terraform bot will open a PR to the target repositories with the changes. 
6 | 7 | [tool.setuptools_scm] 8 | 9 | [tool.flake8] 10 | max-line-length = 99 11 | max-doc-length = 99 12 | max-complexity = 10 13 | exclude = [ 14 | ".git", 15 | "__pycache__", 16 | ".tox", 17 | ".build", 18 | "build", 19 | "dist", 20 | ".eggs", 21 | "*.egg_info", 22 | "venv", 23 | ".venv", 24 | "report", 25 | "docs", 26 | "lib", 27 | "mod", 28 | "hooks/charmhelpers", 29 | "tests/charmhelpers", 30 | ] 31 | select = ["E", "W", "F", "C", "N", "R", "D", "H"] 32 | # Ignore W503, E501 because using black creates errors with this 33 | # Ignore D107 Missing docstring in __init__ 34 | # Ignore D415 Docstring first line punctuation (doesn't make sense for properties) 35 | # Ignore N818 Exceptions end with "Error" (not all exceptions are errors) 36 | # D100, D101, D102, D103: Ignore missing docstrings in tests 37 | ignore = ["C901", "W503", "E501", "D107", "D415", "N818", "D100", "D101", "D102", "D103", "W504"] 38 | per-file-ignores = ["tests/*:D100,D101,D102,D103,D104"] 39 | # Check for properly formatted copyright header in each file 40 | copyright-check = "True" 41 | copyright-author = "Canonical Ltd." 
42 | copyright-regexp = "Copyright\\s\\d{4}([-,]\\d{4})*\\s+%(author)s" 43 | 44 | [tool.black] 45 | line-length = 99 46 | exclude = ''' 47 | /( 48 | | .eggs 49 | | .git 50 | | .tox 51 | | .venv 52 | | .build 53 | | build 54 | | lib 55 | | report 56 | | docs 57 | | mod 58 | | hooks/charmhelpers 59 | | tests/charmhelpers 60 | )/ 61 | ''' 62 | 63 | [tool.isort] 64 | profile = "black" 65 | line_length = 99 66 | skip_glob = [".eggs", ".git", ".tox", ".venv", ".build", "build", "lib", "report", "mod/*", "hooks/charmhelpers", "tests/charmhelpers"] 67 | 68 | [tool.pylint] 69 | max-line-length = 99 70 | disable = ["E1102"] 71 | ignore = ['.eggs', '.git', '.tox', '.venv', '.build', 'lib', 'report', 'tests', 'docs', "mod", "hooks/charmhelpers", "tests/charmhelpers"] 72 | 73 | [tool.mypy] 74 | warn_unused_ignores = true 75 | warn_unused_configs = true 76 | warn_unreachable = true 77 | disallow_untyped_defs = true 78 | ignore_missing_imports = true 79 | no_namespace_packages = true 80 | exclude = ['.eggs', '.git', '.tox', '.venv', '.build', 'lib', 'report', 'tests', 'docs', "mod", "hooks/charmhelpers", "tests/charmhelpers"] 81 | 82 | [tool.codespell] 83 | skip = ".eggs,.tox,.git,.venv,venv,build,.build,lib,report,docs,poetry.lock,htmlcov,mod,hooks/charmhelpers,tests/charmhelpers" 84 | quiet-level = 3 85 | check-filenames = true 86 | ignore-words-list = "assertIn" 87 | 88 | ## Ignore unsupported imports 89 | [[tool.mypy.overrides]] 90 | module = ["charmhelpers.*", "setuptools"] 91 | ignore_missing_imports = true 92 | 93 | [tool.coverage.run] 94 | relative_files = true 95 | source = ["."] 96 | omit = ["tests/**", "docs/**", "lib/**", "snap/**", "build/**", "setup.py", "mod/**", "hooks/charmhelpers/**", "tests/charmhelpers/**"] 97 | 98 | [tool.coverage.report] 99 | fail_under = 100 100 | show_missing = true 101 | 102 | [tool.coverage.html] 103 | directory = "tests/report/html" 104 | 105 | [tool.coverage.xml] 106 | output = "tests/report/coverage.xml" 107 | 
-------------------------------------------------------------------------------- /tests/unit/test_alert_rules/test_perccli.yaml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - ../../../src/prometheus_alert_rules/perccli.yaml 3 | 4 | evaluation_interval: 1m 5 | 6 | tests: 7 | 8 | - interval: 1m 9 | input_series: 10 | - series: 'perccli_command_success{instance="ubuntu-0"}' 11 | values: '0x15' # error: PerccliCommandFailed 12 | alert_rule_test: 13 | - eval_time: 0m 14 | alertname: PerccliCommandFailed 15 | exp_alerts: 16 | - exp_labels: 17 | severity: critical 18 | instance: ubuntu-0 19 | exp_annotations: 20 | summary: Failed to run perccli or controller not available. (instance ubuntu-0) 21 | description: | 22 | Failed to get PowerEdgeRAID controller information using perccli. 23 | INSTANCE = ubuntu-0 24 | SUCCESS = 0 25 | LABELS = map[__name__:perccli_command_success instance:ubuntu-0] 26 | 27 | - interval: 1m 28 | input_series: 29 | - series: 'perccli_command_success{instance="ubuntu-1"}' 30 | values: '1x15' 31 | - series: 'poweredgeraid_controllers{instance="ubuntu-1"}' 32 | values: '0x15' # error: PowerEdgeRAIDControllerNotFound 33 | 34 | alert_rule_test: 35 | - eval_time: 0m 36 | alertname: PowerEdgeRAIDControllerNotFound 37 | exp_alerts: 38 | - exp_labels: 39 | severity: warning 40 | instance: ubuntu-1 41 | exp_annotations: 42 | summary: PowerEdge RAID controller not found. (instance ubuntu-1) 43 | description: | 44 | Cannot find PowerEdge RAID controller on this host machine. 
45 | INSTANCE = ubuntu-1 46 | LABELS = map[__name__:poweredgeraid_controllers instance:ubuntu-1] 47 | 48 | - interval: 1m 49 | input_series: 50 | - series: 'perccli_command_ctrl_success{instance="ubuntu-1", controller_id="0"}' 51 | values: '0x15' 52 | 53 | alert_rule_test: 54 | - eval_time: 0m 55 | alertname: PowerEdgeRAIDControllerSuccess 56 | exp_alerts: 57 | - exp_labels: 58 | severity: critical 59 | instance: ubuntu-1 60 | controller_id: 0 61 | exp_annotations: 62 | summary: PowerEdge RAID controller command not successful. (instance ubuntu-1) 63 | description: | 64 | Failed to get PowerEdge RAID controller information on controller 0. 65 | INSTANCE = ubuntu-1 66 | CONTROLLER_ID = 0 67 | LABELS = map[__name__:perccli_command_ctrl_success controller_id:0 instance:ubuntu-1] 68 | 69 | - interval: 1m 70 | input_series: 71 | - series: 'poweredgeraid_virtual_info{instance="ubuntu-1", controller_id="0", device_group="1", virtual_drive_id="2", state="Dgrd", cache_policy="NRWTD"}' 72 | values: '1x15' 73 | 74 | alert_rule_test: 75 | - eval_time: 0m 76 | alertname: PowerEdgeRAIDVirtualDriveNotOptimal 77 | exp_alerts: 78 | - exp_labels: 79 | severity: warning 80 | instance: ubuntu-1 81 | controller_id: 0 82 | device_group: 1 83 | virtual_drive_id: 2 84 | state: Dgrd 85 | cache_policy: NRWTD 86 | exp_annotations: 87 | summary: PowerEdge RAID virtual drives are not in optimal state. (instance ubuntu-1) 88 | description: | 89 | PowerEdge RAID virtual drives are not in optimal state. Please check the if the virtual drives are working as expected. 
90 | STATE = Dgrd 91 | LABELS = map[__name__:poweredgeraid_virtual_info cache_policy:NRWTD controller_id:0 device_group:1 instance:ubuntu-1 state:Dgrd virtual_drive_id:2] 92 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules_dynamic/redfish.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: Redfish 3 | rules: 4 | - alert: RedfishCallFailed 5 | expr: redfish_call_success == 0 6 | for: 5m 7 | labels: 8 | severity: warning 9 | annotations: 10 | summary: Call to the Redfish API failed. (instance {{ $labels.instance }}) 11 | description: | 12 | Failure in calling the Redfish API. 13 | VALUE = {{ $value }} 14 | LABELS = {{ $labels }} 15 | 16 | - alert: RedfishServiceUnavailable 17 | expr: redfish_service_available == 0 18 | for: 5m 19 | labels: 20 | severity: warning 21 | annotations: 22 | summary: No redfish services available. (instance {{ $labels.instance }}) 23 | description: | 24 | No redfish services available. 25 | VALUE = {{ $value }} 26 | LABELS = {{ $labels }} 27 | 28 | - alert: RedfishSensorHealthNotOk 29 | expr: redfish_sensor_info{health!~"OK|N/A"} 30 | for: 5m 31 | labels: 32 | severity: critical 33 | annotations: 34 | summary: Redfish sensor health not Ok. (instance {{ $labels.instance }}) 35 | description: | 36 | Redfish sensor health not Ok. 37 | SENSOR_READING = {{ $labels.reading }} 38 | LABELS = {{ $labels }} 39 | 40 | - alert: RedfishProcessorHealthNotOk 41 | expr: redfish_processor_info{health!~"OK|NA"} 42 | for: 5m 43 | labels: 44 | severity: critical 45 | annotations: 46 | summary: Redfish processor health not OK. (instance {{ $labels.instance }}) 47 | description: | 48 | Redfish processor health not OK. 
49 | LABELS = {{ $labels }} 50 | 51 | - alert: RedfishStorageControllerHealthNotOk 52 | expr: redfish_storage_controller_info{health!~"OK|NA"} 53 | for: 5m 54 | labels: 55 | severity: critical 56 | annotations: 57 | summary: Redfish storage controller health not OK. (instance {{ $labels.instance }}) 58 | description: | 59 | Redfish storage controller health not OK. 60 | LABELS = {{ $labels }} 61 | 62 | - alert: RedfishChassisHealthNotOk 63 | expr: redfish_chassis_info{health!~"OK|NA"} 64 | for: 5m 65 | labels: 66 | severity: critical 67 | annotations: 68 | summary: Redfish chassis health not OK. (instance {{ $labels.instance }}) 69 | description: | 70 | Redfish chassis health not OK. 71 | LABELS = {{ $labels }} 72 | 73 | - alert: RedfishStorageDriveHealthNotOk 74 | expr: redfish_storage_drive_info{health!~"OK|NA", state="Enabled"} 75 | for: 5m 76 | labels: 77 | severity: critical 78 | annotations: 79 | summary: Redfish storage drive health not OK. (instance {{ $labels.instance }}) 80 | description: | 81 | Redfish storage drive health not OK. 82 | LABELS = {{ $labels }} 83 | 84 | - alert: RedfishMemoryDimmHealthNotOk 85 | expr: redfish_memory_dimm_info{health!~"OK|NA"} 86 | for: 5m 87 | labels: 88 | severity: critical 89 | annotations: 90 | summary: Redfish memory dimm health not OK. (instance {{ $labels.instance }}) 91 | description: | 92 | Redfish memory dimm health not OK. 93 | LABELS = {{ $labels }} 94 | 95 | - alert: RedfishSmartStorageHealthNotOk 96 | expr: redfish_smart_storage_health == 0 97 | for: 5m 98 | labels: 99 | severity: critical 100 | annotations: 101 | summary: Redfish smart storage health not OK. (instance {{ $labels.instance }}) 102 | description: | 103 | Redfish smart storage health not OK. 
104 | VALUE = {{ $value }} 105 | LABELS = {{ $labels }} 106 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/ipmi_sensors.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: IpmiSensors 3 | rules: 4 | - alert: IPMIMonitoringCommandSuccessMetricsMissing 5 | expr: absent_over_time(ipmimonitoring_command_success[5m]) 6 | labels: 7 | severity: critical 8 | annotations: 9 | summary: IPMI monitoring command success metrics missing. (instance {{ $labels.instance }}) 10 | description: | 11 | The ipmimonitoring_command_success metric has been missing for over 5 minutes. 12 | This may indicate IPMI monitoring command timeouts, or that IPMI tools/services are not installed or supported on this hardware. 13 | LABELS = {{ $labels }} 14 | 15 | - alert: IPMIMonitoringCommandFailed 16 | expr: ipmimonitoring_command_success == 0 17 | for: 5m 18 | labels: 19 | severity: critical 20 | annotations: 21 | summary: Failed to run ipmimonitoring. (instance {{ $labels.instance }}) 22 | description: | 23 | Failed to get ipmi sensor data using ipmimonitoring. 24 | VALUE = {{ $value }} 25 | LABELS = {{ $labels }} 26 | 27 | - alert: IPMITemperatureStateNotOk 28 | expr: ipmi_temperature_celsius{state=~"Warning|Critical"} 29 | for: 5m 30 | labels: 31 | severity: "{{ toLower $labels.state }}" 32 | annotations: 33 | summary: Temperature in {{ toLower $labels.state }} state. (instance {{ $labels.instance }}) 34 | description: | 35 | Temperature, recorded by ipmi sensor, in {{ toLower $labels.state }} state. 36 | TEMPERATURE_CELSIUS = {{ $value }} 37 | LABELS = {{ $labels }} 38 | 39 | - alert: IPMIPowerStateNotOk 40 | expr: ipmi_power_watts{state=~"Warning|Critical"} 41 | for: 5m 42 | labels: 43 | severity: "{{ toLower $labels.state }}" 44 | annotations: 45 | summary: Power in {{ toLower $labels.state }} state. 
(instance {{ $labels.instance }}) 46 | description: | 47 | Power, recorded by ipmi sensor, in {{ toLower $labels.state }} state. 48 | POWER_WATTS = {{ $value }} 49 | LABELS = {{ $labels }} 50 | 51 | - alert: IPMIVoltageStateNotOk 52 | expr: ipmi_voltage_volts{state=~"Warning|Critical"} 53 | for: 5m 54 | labels: 55 | severity: "{{ toLower $labels.state }}" 56 | annotations: 57 | summary: Voltage in {{ toLower $labels.state }} state. (instance {{ $labels.instance }}) 58 | description: | 59 | Voltage, recorded by ipmi sensor, in {{ toLower $labels.state }} state. 60 | VOLTAGE_VOLTS = {{ $value }} 61 | LABELS = {{ $labels }} 62 | 63 | - alert: IPMICurrentStateNotOk 64 | expr: ipmi_current_amperes{state=~"Warning|Critical"} 65 | for: 5m 66 | labels: 67 | severity: "{{ toLower $labels.state }}" 68 | annotations: 69 | summary: Current in {{ toLower $labels.state }} state. (instance {{ $labels.instance }}) 70 | description: | 71 | Current, recorded by ipmi sensor, in {{ toLower $labels.state }} state. 72 | CURRENT_AMPERES = {{ $value }} 73 | LABELS = {{ $labels }} 74 | 75 | - alert: IPMIFanSpeedStateNotOk 76 | expr: ipmi_fan_speed_rpm{state=~"Warning|Critical"} 77 | for: 5m 78 | labels: 79 | severity: "{{ toLower $labels.state }}" 80 | annotations: 81 | summary: Fan speed in {{ toLower $labels.state }} state. (instance {{ $labels.instance }}) 82 | description: | 83 | Fan speed, recorded by ipmi sensor, in {{ toLower $labels.state }} state. 
84 | FAN_SPEED_RPM = {{ $value }} 85 | LABELS = {{ $labels }} 86 | 87 | # Entity Presence sensors are ignored since the state doesn't correspond to a real alert 88 | # Slot Connector sensors are ignored since they raise a high number of false positive alerts 89 | - alert: IPMISensorStateNotOk 90 | expr: ipmi_generic_sensor_value{state=~"Warning|Critical", type!~"Entity\\sPresence|Slot/Connector"} 91 | for: 5m 92 | labels: 93 | severity: "{{ toLower $labels.state }}" 94 | annotations: 95 | summary: IPMI sensor value in {{ toLower $labels.state }} state. (instance {{ $labels.instance }}) 96 | description: | 97 | A sensor value, recorded by ipmi sensor, in {{ toLower $labels.state }} state. Entity Presence and Slot Connector sensors are ignored. 98 | VALUE = {{ $value }} 99 | LABELS = {{ $labels }} 100 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/dcgm.yaml: -------------------------------------------------------------------------------- 1 | # The alerts use DCGM_FI_DEV_CLOCK_THROTTLE_REASONS metric to detect throttling events on NVIDIA GPUs, 2 | # which is a bitmask of throttle reasons found here: https://docs.nvidia.com/datacenter/dcgm/2.1/dcgm-api/group__dcgmFieldConstants.html. 3 | # The 8 least significant bits are used for the alerts, with each bit representing a different throttle reason. 4 | 5 | groups: 6 | - name: NVIDIA DCGM Throttling Alerts 7 | rules: 8 | - alert: GPUPowerBrakeThrottle 9 | # isolate the least significant 8 bits with % 256 10 | # check whether bit 7 (starts from bit 0) has been set with the >= bool 128 comparison 11 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 256 >= 128 12 | for: 5m 13 | labels: 14 | severity: warning 15 | annotations: 16 | summary: GPU Hardware Power Brake Slowdown throttling detected. 
(instance {{ $labels.Hostname }}) 17 | description: | 18 | HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }} 19 | This is an indicator of: 20 | - External Power Brake Assertion being triggered (e.g. by the system power supply) 21 | LABELS = {{ $labels }} 22 | - alert: GPUThermalHWThrottle 23 | # isolate the least significant 7 bits with % 128 24 | # check whether bit 6 (starts from bit 0) has been set with the >= bool 64 comparison 25 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 128 >= 64 26 | for: 5m 27 | labels: 28 | severity: warning 29 | annotations: 30 | summary: GPU Hardware Thermal throttling detected. (instance {{ $labels.Hostname }}) 31 | description: | 32 | HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }} 33 | This is an indicator of: 34 | - Temperature being too high 35 | LABELS = {{ $labels }} 36 | - alert: GPUThermalSWThrottle 37 | # isolate the least significant 6 bits with % 64 38 | # check whether bit 5 (starts from bit 0) has been set with the >= bool 32 comparison 39 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 64 >= 32 40 | for: 5m 41 | labels: 42 | severity: warning 43 | annotations: 44 | summary: GPU Software Thermal throttling detected. (instance {{ $labels.Hostname }}) 45 | description: | 46 | SW Thermal Slowdown is engaged on NVIDIA GPU: {{ $labels.gpu }} 47 | This is an indicator of: 48 | - Current GPU temperature above the GPU Max Operating Temperature 49 | - Current memory temperature above the Memory Max Operating Temperature 50 | LABELS = {{ $labels }} 51 | - alert: GPUSyncBoostThrottle 52 | # isolate the least significant 5 bits with % 32 53 | # check whether bit 4 (starts from bit 0) has been set with the >= bool 16 comparison 54 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 32 >= 16 55 | for: 5m 56 | labels: 57 | severity: warning 58 | annotations: 59 | summary: GPU Sync Boost throttling detected. 
(instance {{ $labels.Hostname }}) 60 | description: | 61 | This NVIDIA GPU: {{ $labels.gpu }} has been added to a Sync boost group with nvidia-smi or DCGM in order to maximize performance per watt. 62 | All GPUs in the sync boost group will boost to the minimum possible clocks across the entire group. 63 | Look at the throttle reasons for other GPUs in the system to see why those GPUs are holding this one at lower clocks. 64 | LABELS = {{ $labels }} 65 | - alert: GPUSlowdownThrottle 66 | # isolate the least significant 4 bits with % 16 67 | # check whether bit 3 (starts from bit 0) has been set with the >= bool 8 comparison 68 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 16 >= 8 69 | for: 5m 70 | labels: 71 | severity: warning 72 | annotations: 73 | summary: GPU Hardware Slowdown throttling detected. (instance {{ $labels.Hostname }}) 74 | description: | 75 | HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }} 76 | This is an indicator of: 77 | - Temperature being too high 78 | - External Power Brake Assertion is triggered (e.g. by the system power supply) 79 | - Power draw is too high and Fast Trigger protection is reducing the clocks 80 | - May be also reported during PState or clock change 81 | LABELS = {{ $labels }} 82 | - alert: GPUPowerThrottle 83 | # isolate the least significant 3 bits with % 8 84 | # check whether bit 2 (starts from bit 0) has been set with the >= bool 4 comparison 85 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 8 >= 4 86 | for: 5m 87 | labels: 88 | severity: warning 89 | annotations: 90 | summary: GPU Software Power throttling detected. 
(instance {{ $labels.Hostname }}) 91 | description: | 92 | SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: {{ $labels.gpu }} 93 | LABELS = {{ $labels }} 94 | -------------------------------------------------------------------------------- /tests/unit/test_alert_rules/test_ssacli.yaml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - ../../../src/prometheus_alert_rules/ssacli.yaml 3 | 4 | evaluation_interval: 1m 5 | 6 | tests: 7 | 8 | - interval: 1m 9 | input_series: 10 | - series: 'ssacli_command_success{instance="ubuntu-0"}' 11 | values: '0x15' 12 | 13 | alert_rule_test: 14 | - eval_time: 0m 15 | alertname: SsaCLICommandFailed 16 | exp_alerts: 17 | - exp_labels: 18 | severity: critical 19 | instance: ubuntu-0 20 | exp_annotations: 21 | summary: Failed to run ssacli. (instance ubuntu-0) 22 | description: | 23 | Failed to get storage array information using ssacli. 24 | VALUE = 0 25 | LABELS = map[__name__:ssacli_command_success instance:ubuntu-0] 26 | 27 | 28 | - interval: 1m 29 | input_series: 30 | - series: 'ssacli_command_success{instance="ubuntu-1"}' 31 | values: '1x15' 32 | - series: 'ssacli_controllers{instance="ubuntu-1"}' 33 | values: '0x15' 34 | 35 | alert_rule_test: 36 | - eval_time: 0m 37 | alertname: SsaCLIControllerNotFound 38 | exp_alerts: 39 | - exp_labels: 40 | severity: warning 41 | instance: ubuntu-1 42 | exp_annotations: 43 | summary: ssacli controller not found. (instance ubuntu-1) 44 | description: | 45 | Cannot find ssacli controller on this host machine. 
46 | NUMBER_OF_CONTROLLERS = 0 47 | LABELS = map[__name__:ssacli_controllers instance:ubuntu-1] 48 | 49 | 50 | - interval: 1m 51 | input_series: 52 | - series: 'ssacli_command_success{instance="ubuntu-2"}' 53 | values: '1x15' 54 | - series: 'ssacli_controller_info{instance="ubuntu-2", part="Cache Status", status="DOWN"}' 55 | values: '1x15' 56 | 57 | alert_rule_test: 58 | - eval_time: 0m 59 | alertname: SsaCLIControllerNotOK 60 | exp_alerts: 61 | - exp_labels: 62 | severity: critical 63 | instance: ubuntu-2 64 | part: Cache Status 65 | status: DOWN 66 | exp_annotations: 67 | summary: ssacli controller status not Ok. (instance ubuntu-2) 68 | description: | 69 | SSACLI controller status not OK. 70 | STATUS = DOWN 71 | LABELS = map[__name__:ssacli_controller_info instance:ubuntu-2 part:Cache Status status:DOWN] 72 | 73 | 74 | - interval: 1m 75 | input_series: 76 | - series: 'ssacli_controller_info{instance="ubuntu-2", part="Cache Status", status="NOT CONFIGURED"}' 77 | values: '1x15' 78 | - series: 'ssacli_controller_info{instance="ubuntu-3", part="Cache Status", status="OK"}' 79 | values: '1x15' 80 | alert_rule_test: 81 | - eval_time: 0m 82 | alertname: SsaCLIControllerNotOK 83 | # Expect no alerts when status is NOT CONFIGURED or OK 84 | exp_alerts: [] 85 | 86 | 87 | - interval: 1m 88 | input_series: 89 | - series: 'ssacli_command_success{instance="ubuntu-3"}' 90 | values: '1x15' 91 | - series: 'ssacli_logical_drive_info{instance="ubuntu-3", slot="2", status="DOWN"}' 92 | values: '1x15' 93 | - series: 'ssacli_logical_drive_info{instance="ubuntu-11", slot="2", status="OK"}' 94 | values: '1x15' 95 | 96 | alert_rule_test: 97 | - eval_time: 0m 98 | alertname: SsaCLILogicalDriveNotOK 99 | exp_alerts: 100 | - exp_labels: 101 | severity: critical 102 | instance: ubuntu-3 103 | slot: "2" 104 | status: DOWN 105 | exp_annotations: 106 | summary: ssacli logical drive status not Ok. (instance ubuntu-3) 107 | description: | 108 | SSACLI logical drive status not OK. 
109 | STATUS = DOWN 110 | LABELS = map[__name__:ssacli_logical_drive_info instance:ubuntu-3 slot:2 status:DOWN] 111 | 112 | 113 | - interval: 1m 114 | input_series: 115 | - series: 'ssacli_command_success{instance="ubuntu-4"}' 116 | values: '1x15' 117 | - series: 'ssacli_physical_drive_info{instance="ubuntu-4", slot="2", status="CORRUPT"}' 118 | values: '1x15' 119 | - series: 'ssacli_physical_drive_info{instance="ubuntu-12", slot="2", status="OK"}' 120 | values: '1x15' 121 | 122 | alert_rule_test: 123 | - eval_time: 0m 124 | alertname: SsaCLIPhysicalDriveNotOK 125 | exp_alerts: 126 | - exp_labels: 127 | severity: critical 128 | instance: ubuntu-4 129 | slot: "2" 130 | status: CORRUPT 131 | exp_annotations: 132 | summary: ssacli physical drive status not Ok. (instance ubuntu-4) 133 | description: | 134 | SSACLI physical drive status not OK. 135 | STATUS = CORRUPT 136 | LABELS = map[__name__:ssacli_physical_drive_info instance:ubuntu-4 slot:2 status:CORRUPT] 137 | -------------------------------------------------------------------------------- /src/keys.py: -------------------------------------------------------------------------------- 1 | """Static parameters for keys.""" 2 | 3 | HPPUBLICKEY1024 = """ 4 | -----BEGIN PGP PUBLIC KEY BLOCK----- 5 | Version: GnuPG v1.4.0 (MingW32) 6 | 7 | mQGiBEIxWpoRBADb06sJgnD7MJnm2Ny1nmTFLDSZ8vkubP+pmfn9N9TE26oit+KI 8 | OnVTRVbSPl3F15wTjSBGR453MEfnzp1NrMk1GIa/m1nKAmgQ4t1714C4jQab0to+ 9 | gP51XhPhtAGt7BggorQw2RXa4KdTCh8ByOIaDKRYcESmMazSZ+Pscy2XRwCgm771 10 | 21RCM0RcG2dmHZZgKH8fTscD/RiY3CHI2jJl9WosIYXbZpOySzrLn0lRCRdNdpew 11 | Y5m1f3lhqoSvJk7pXjs4U+3XlOlUhgWl5HiXuWSVyPu2ilfGdfgpJslawI85fBQg 12 | Ul5kcrjLHHsApeG8oGStFJE2JAc+0D+whmGmJbjWKwuZJmgpm9INplA4h1BYJbx+ 13 | 6A3MBACFiMTttDPpJ+5eWr1VSZwxCZNqvPWmjpL5Nh9F8xzE7q+ad2CFKSebvRrv 14 | Jf7Y2m+wY9bmo5nJ3wHYEX3Aatt+QVF10G6wTdIz/Ohm/Pc4Li4NhzYOv7FKxVam 15 | 97UN0O8Rsl4GhE2eE8H+Q3QYFvknAWoTj3Rq3/A5FA6FsRFhxbQwSGV3bGV0dC1Q 16 | YWNrYXJkIENvbXBhbnkgKEhQIENvZGVzaWduaW5nIFNlcnZpY2UpiGQEExECACQF 
17 | AkIxWpoCGwMFCRLMAwAGCwkIBwMCAxUCAwMWAgECHgECF4AACgkQUnvFOiaJuIc1 18 | 2wCgj2UotUgSegPHmcKdApY+4WFaz/QAnjI58l5bDD8eElBCErHVoq9uPMczuQIN 19 | BEIxWqUQCADnBXqoU8QeZPEy38oI0GrN2q7nvS+4UBQeIRVy8x+cOqDRDcE8PHej 20 | 7NtxP698U0WFGK47GszjiV4WTnvexuJk0B5AMEBHana8fVj7uRUcmyYZqOZd7EXn 21 | Q3Ivi8itfkTICkhZi7bmGsSF0iJ0eAI5n2bCqJykNQvJ6a3dWJKP8EgaBCZj+TGL 22 | WWJHDZsrn8g4BeaNS/MbmsCLAk8N6bWMGzAKfgxUraMCwuZ9fVyHFavHdeChUtna 23 | qnF4uw0hHLaGWmTJjziXVvVC1a8+inTxPZkVpAvD0A+/LNlkP7TtAdaVOJqv3+a3 24 | ybMQL851bRTFyt+H0XGHhzhhtuu9+DyfAAMFCADRWGxIfniVG7O4wtwLD3sWzR/W 25 | LmFlJYu4s9rSDgn3NDjigQzZoVtbuv3Z9IZxBMoYa50MuybuVDp55z/wmxvYoW2G 26 | 25kOFDKx/UmkKkUBLdokb5V1p9j5SJorGBSfsNAHflhmBhyuMP4CDISbBUSN7oO1 27 | Oj41jNxpqhy+8ayygSVcTNwMe909J/HdC//xFANLDhjKPf3ZAulWNhOvjTlpF46B 28 | yt1l8ZNinIeE7CFL7H+LlMl2Ml6wsOkrxsSauBis6nER4sYVqrMdzpUU2Sr2hj6Q 29 | sJ+9TS+IURcnxL/M851KCwLhwZKdphQjT3mXXsoCx/l3rI6cxpwYgjiKiZhOiE8E 30 | GBECAA8FAkIxWqUCGwwFCRLMAwAACgkQUnvFOiaJuIenewCdHcEvMxBYprqRjKUw 31 | 04EypyFtZTgAn0wds0nbpd2+VZ5WHbVRfU4y5Y5Y 32 | =+cX+ 33 | -----END PGP PUBLIC KEY BLOCK----- 34 | """ 35 | 36 | HPPUBLICKEY2048 = """ 37 | -----BEGIN PGP PUBLIC KEY BLOCK----- 38 | Version: GnuPG v1.4.10 (MingW32) 39 | 40 | mQENBFC+QboBCAC1bodHD7AmR00SkDMB4u9MXy+Z5vv8wbmGRaKDBYScpAknOljX 41 | d5tBADffAetd1hgLnrLKN8vHdIsYkmUyeEeEsnIUKtwvbx/f6PoZZPOIIIRh1d2W 42 | Mjw9qXIE+tgr2gWlq0Gi5BZzaKse1+khRQ2rewJBppblSGWgcmCMIq8OwAsrdbtr 43 | z7+37c/g/Y2VfAahc23YZW9LQ5MiaI4nS4JMZbWPYtBdF78B/D2t5FvmvDG0Cgjk 44 | Qi1U9IVjiFKixuoi6nRsvBLFYL/cI+vo4iyUC5x7qmKd8gN7A030gS67VrleNRki 45 | q0vaF6J46XpIl4o58t23FSAKKRbTwavYzdMpABEBAAG0NEhld2xldHQtUGFja2Fy 46 | ZCBDb21wYW55IFJTQSAoSFAgQ29kZXNpZ25pbmcgU2VydmljZSmJAT4EEwECACgF 47 | AlC+QboCGwMFCRLMAwAGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheAAAoJELBwaApc 48 | 4tR2x7sH/A3D4XxEEyrX6Z3HeWSSA80+n+r5QwfXm5unxsWEL3JyNg6sojlrJY4K 49 | 8k4ih4nkY4iblChTCSQwnqKXqkL5U+RIr+AJoPx+55M98u4eRTVYMHZD7/jFq85z 50 | ZFGUkFkars9E2aRzWhqbz0LINb9OUeX0tT5qQseHflO2PaJykxNPC14WhsBKC2lg 51 | 
dZWnGhO5QJFp69AnSp4k+Uo/1LMk87YEJIL1NDR0lrlKgRvFfFyTpRBt+Qb1Bb7g 52 | rjN0171g8t5GaPWamN3Oua/v4aZg15f3xydRF8y9TsYjiNz+2TzRjKv7AkpZaJST 53 | 06CqMjCgiZ6UFFGN0/oqLnwxdP3Mmh4= 54 | =aphN 55 | -----END PGP PUBLIC KEY BLOCK----- 56 | """ 57 | 58 | HPPUBLICKEY2048_KEY1 = """ 59 | -----BEGIN PGP PUBLIC KEY BLOCK----- 60 | Version: GnuPG v1.4.12 (MingW32) 61 | 62 | mQENBFRtGAgBCADlSku65P14hVdx9E/W0n6MwuB3WGqmsyKNoa3HezFdMjWERldI 63 | NNUdi8O28cZ6j2+Hi9L1HeQIQ9+7FHpR3JyQePBJtRX8WSEusfRtML98opDhJxKm 64 | 8Jyxb7aTvCwdNHz3yxADINkMtOj5oRm7VCr8XHkG7YU27ELs8B+BXWvjO21oSosi 65 | FurnhT+H3hQsYXfYA55aa21q0qX+L5dFJSNdzZVo7m9ybioVv2R5+PfBvdaSxCnm 66 | OpcGXFaKAsqVHeTW0pd3sdkin1rkbhOBaU5lFBt2ZiMtKpKHpT8TZnqHpFHFbgi8 67 | j2ARJj4IDct2OGILddUIZSFyue6WE2hpV5c/ABEBAAG0OEhld2xldHQtUGFja2Fy 68 | ZCBDb21wYW55IFJTQSAoSFAgQ29kZXNpZ25pbmcgU2VydmljZSkgLSAxiQE+BBMB 69 | AgAoBQJUbRgIAhsDBQkSzAMABgsJCAcDAgYVCAIJCgsEFgIDAQIeAQIXgAAKCRD6 70 | 3Y1ksSdeo6BJCADOfIPPLPpIOnFK9jH4t8lLUd+RyMc+alA3uTDPUJa/ZHa6DHfh 71 | 42iaPYVEV8OG0tnbMlHmwvsZ5c1/MRMw1UbxCvD88P2qM4SUrUjQUlSCms2GLGvF 72 | ftFXBiOJQ7/yBc9o+yoSvwPrrTxSCk4+Sqm0IfVXVzChDM9dM9YPY2Vzjd+LUaYC 73 | 3X+eSuggUDO0TmJLJd7tZdF9fVXq3lr63BZ5PY98MTCuOoeSMDa9FIUQf6vn6UUJ 74 | MDSRZ9OzhpNJOKR+ShVRwDK6My8gtVIW1EAW2w3VQWI2UNF07aLeO8UG6nTNWA23 75 | +OuZkUdgQovjcq01caSefgOkmiQOx6d74CAk 76 | =X+eo 77 | -----END PGP PUBLIC KEY BLOCK----- 78 | """ 79 | 80 | HPEPUBLICKEY2048_KEY1 = """ 81 | -----BEGIN PGP PUBLIC KEY BLOCK----- 82 | Version: GnuPG v1.4.12 (GNU/Linux) 83 | 84 | mQENBFZp0LkBCACXajRw3b4x7G7dulNYj0hUID4BtVFq/MjEb6PHckTxGxZDoQRX 85 | RK54tiTFA9wq3b4P3yEFnOjbjRoI0d7Ls67FADugFO+cDCtsV9yuDlaYP/U/h2nX 86 | N0R4AdYbsVd5yr6xr+GAy66Hmx5jFH3kbC+zJpOcI0tU9hcyU7gjbxu6KQ1ypI2Q 87 | VRKf8sRBJXgmkOlbYx35ZUMFcmVxrLJXvUuxmAVXgT9f5M3Z3rsGt/ab+/+1TFSb 88 | RsaqHsIPE0QH8ikqW4IeDQAo1T99pCdf7FWr45KFFTo7O4AZdLMWVgqeFHaSoZxJ 89 | 307VIINsWiwQoPp0tfU5NOOOwB1Sv3x9QgFtABEBAAG0P0hld2xldHQgUGFja2Fy 90 | ZCBFbnRlcnByaXNlIENvbXBhbnkgUlNBLTIwNDgtMjUgPHNpZ25ocEBocGUuY29t 91 | 
PokBPQQTAQIAJwUCVmnQuQIbLwUJEswDAAYLCQgHAwIGFQgCCQoLAxYCAQIeAQIX 92 | gAAKCRDCCK3eJsK3l9G+B/0ekblsBeN+xHIJ28pvo2aGb2KtWBwbT1ugI+aIS17K 93 | UQyHZJUQH+ZeRLvosuoiQEdcGIqmOxi2hVhSCQAOV1LAonY16ACveA5DFAEBz1+a 94 | WQyx6sOLLEAVX1VqGlBXxh3XLEUWOhlAf1gZPNtHsmURTUy2h1Lv/Yoj8KLyuK2n 95 | DmrLOS3Ro+RqWocaJfvAgXKgt6Fq/ChDUHOnar7lGswzMsbE/yzLJ7He4y89ImK+ 96 | 2ktR5HhDuxqgCe9CWH6Q/1WGhUa0hZ3nbluq7maa+kPe2g7JcRzPH/nJuDCAOZ7U 97 | 6mHE8j0kMQMYjgaYEx2wc02aQRmPyxhbDLjSbtjomXRr 98 | =voON 99 | -----END PGP PUBLIC KEY BLOCK----- 100 | """ 101 | 102 | HP_KEYS = [ 103 | HPEPUBLICKEY2048_KEY1, 104 | HPPUBLICKEY2048_KEY1, 105 | HPPUBLICKEY2048, 106 | HPPUBLICKEY1024, 107 | ] 108 | -------------------------------------------------------------------------------- /tests/functional/README.md: -------------------------------------------------------------------------------- 1 | # Functional Tests for the Hardware Observer Charm 2 | There are 2 main types of functional tests for the Hardware Observer charm - those which depend on 3 | real hardware to be present and those that can run without it. 4 | 5 | Here, "real hardware" refers to machines that are not VMs or containers and have access to real 6 | hardware resources like RAID cards and BMC management tools. 7 | 8 | Note: the built charm must be present in the root of the project's directory for the tests to run. 9 | 10 | ## Hardware Independent Tests 11 | These are the tests for hardware observer that do not require any real hardware. 12 | 13 | Hardware independent tests are run on every PR / weekly scheduled test run. 14 | 15 | These include: 16 | * Testing whether juju config changes produce the required results 17 | 18 | Running these tests is as simple as executing the `tox -e func -- -v` 19 | 20 | ## Hardware Dependent Tests 21 | These are the tests that depend on real hardware to be executed. This is performed manually when 22 | required, for example - validating the charm's full functionality before a new release. 
23 | 24 | Hardware dependent tests are present in the `TestCharmWithHW` class in the `test_charm.py` module. 25 | The pytest marker `realhw` has been added to this class (which would include all the tests in this 26 | class). 27 | 28 | These tests will only be executed if the `--realhw` option for pytest is provided. Additionally, 29 | the `--collectors` option with space separated values can be provided, if specific hardware is 30 | present. Check the `conftest.py` for options. Otherwise, all these tests are skipped (this is done 31 | by checking for the presence of the `realhw` marker mentioned earlier.) 32 | 33 | Note: The operator must set up a test model with the machine added beforehand. The machine must be 34 | an actual host, containers or VMs won't work. 35 | Note: depending on the test, certain prerequisites are needed, e.g. having set up an nvidia driver. 36 | Check the tests' docstrings for details. 37 | 38 | Some of these tests include: 39 | * Check if all collectors are detected in the exporter config file 40 | * Test if metrics are available at the expected endpoint 41 | * Test if metrics specific to the collectors being tested are available 42 | * Test if smarctl-exporter snap is installed and running 43 | * Test if the dcgm snap is installed 44 | 45 | and more. 46 | 47 | In order to run these tests, several prerequisites may need to be completed. 48 | 1. Setup test environment 49 | 1. Build the charm 50 | 1. Add environment variables for Redfish credentials (if testing redfish). 51 | 1. Setup required resource files (if testing hardware raid). 52 | 1. Install the NVIDIA gpu driver and add the `--nvidia` flag (if testing NVIDIA gpu observability). 53 | 1. Find supported collectors 54 | 55 | ### 1. Setup test environment 56 | 57 | You can refer to dev-environment.md here, up to the "Add physical machine" section included. 
58 | The end result should be a test model with a manually provisioned machine listed: 59 | 60 | ``` 61 | $ juju status 62 | Model Controller Cloud/Region Version SLA Timestamp 63 | test lxd-controller localhost/localhost 3.6.1 unsupported 01:39:10Z 64 | 65 | Machine State Address Inst id Base AZ Message 66 | 0 started 10.239.17.1 manual:10.239.17.1 ubuntu@22.04 Manually provisioned machine 67 | ``` 68 | 69 | ### 2. Build the charm 70 | 71 | Just run `charmcraft pack` from the project directory. 72 | 73 | ### 3. Add environment variables for Redfish credentials 74 | As part of the redfish collector specific tests, redfish credentials need to be provided for 75 | authentication. 76 | 77 | Therefore, the test expects these environment variables to be set: 78 | * `REDFISH_USERNAME` 79 | * `REDFISH_PASSWORD` 80 | 81 | ### 4. Setup required resource files 82 | Create a new `resources` directory in the root of the project. 83 | Check which collectors are supported on the machine and verify if they need to be manually 84 | downloaded (refer https://charmhub.io/hardware-observer/resources/). Download the required 85 | resource files from their respective third-party websites and add the extracted `.deb` file or 86 | binary to this directory. 87 | 88 | Note: The tests expect these resources to be named exactly in the manner provided below: 89 | * storcli.deb 90 | * perccli.deb 91 | * sas2ircu 92 | * sas3ircu 93 | 94 | ### 4. Find supported collectors 95 | Note down all the collectors supported by the machine as they need to be provided to pytest as part 96 | of its CLI arguments. 97 | 98 | This is done by passing the required collectors in a space-separated manner via `--collector` 99 | option to the tox target. 100 | 101 | The supported collectors can be found by checking the output of the `lshw` command (for RAID cards) 102 | or checking availability of Redfish and IPMI on the BMC. 
103 | 104 | --- 105 | 106 | ### Running the tests 107 | 108 | After ensuring the prerequisite steps are complete, the final command to run the tests would look 109 | something like this: 110 | 111 | ``` 112 | tox -e func -- -v --realhw --model test --collectors ipmi_dcmi ipmi_sel ipmi_sensor redfish mega_raid --nvidia --keep-models 113 | ``` 114 | 115 | This would pass the required collectors to tox which then sends it to the pytest command and starts 116 | the hardware dependent tests. 117 | 118 | ### Troubleshooting 119 | 120 | Create a `pytest.ini` file with the following contents to follow the live pytest logs 121 | 122 | ``` 123 | [pytest] 124 | log_cli = True 125 | log_cli_level = INFO 126 | ``` 127 | 128 | Add this line if you'd like to pass some more pytest options without messing with the make command. 129 | ``` 130 | addopts = -vv -k 'ipmi_sensor' 131 | ``` 132 | -------------------------------------------------------------------------------- /src/hardware.py: -------------------------------------------------------------------------------- 1 | """Hardware support config and command helper.""" 2 | 3 | import json 4 | import logging 5 | import re 6 | import subprocess 7 | import typing as t 8 | from pathlib import Path 9 | from typing import Optional 10 | 11 | from charms.operator_libs_linux.v0 import apt 12 | 13 | from config import HWTool 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | # File path that contains the NVIDIA driver that is loaded and its version 18 | NVIDIA_DRIVER_PATH = Path("/proc/driver/nvidia/version") 19 | 20 | 21 | LSHW_SUPPORTED_STORAGES = { 22 | HWTool.SAS2IRCU: [ 23 | # Broadcom 24 | "SAS2004", 25 | "SAS2008", 26 | "SAS2108", 27 | "SAS2208", 28 | "SAS2304", 29 | "SAS2308", 30 | ], 31 | HWTool.SAS3IRCU: [ 32 | # Broadcom 33 | "SAS3004", 34 | "SAS3008", 35 | ], 36 | HWTool.SSACLI: [ 37 | "Smart Array Gen8 Controllers", 38 | "Smart Array Gen9 Controllers", 39 | ], 40 | } 41 | 42 | HWINFO_SUPPORTED_STORAGES = { 43 | 
def lshw(class_filter: t.Optional[str] = None) -> t.Any:
    """Return lshw output as a dict (or list of dicts).

    Args:
        class_filter: optional hardware class passed to ``lshw -c``.

    Returns:
        Parsed JSON output of lshw. Without a class filter, some Ubuntu
        series wrap the result in a single-element list; that wrapper is
        unwrapped so callers always get the root node.

    Raises:
        subprocess.CalledProcessError: if the lshw command fails; lshw is
            expected to always work, so the error is logged and re-raised.
    """
    cmd = "lshw -json"
    if class_filter:
        cmd = cmd + " -c " + class_filter
    try:
        output = subprocess.check_output(cmd.split(), text=True)
        json_output = json.loads(output)
        # lshw has different output on different ubuntu series
        # if class_filter is not provided.
        if not class_filter and isinstance(json_output, list):
            json_output = json_output[0]
        return json_output
    except subprocess.CalledProcessError as err:
        logger.error(err)
        # Raise error because the cmd should always work.
        raise err


def get_bmc_address() -> t.Optional[str]:
    """Get BMC IP address by ipmitool.

    Returns:
        The BMC "IP Address" reported by ``ipmitool lan print``, or None
        when IPMI is not available on this machine.
    """
    apt.add_package("ipmitool", update_cache=False)
    cmd = "ipmitool lan print"
    try:
        output = subprocess.check_output(cmd.split(), text=True)
        for line in output.splitlines():
            values = line.split(":")
            if values[0].strip() == "IP Address":
                return values[1].strip()
    except subprocess.CalledProcessError:
        logger.debug("IPMI is not available")
    return None


def hwinfo(*args: str) -> t.Dict[str, str]:
    """Run hwinfo command and return output as dictionary.

    Args:
        args: Probe for a particular hardware class (each item becomes a
            ``--<class>`` flag to hwinfo).

    Returns:
        hw_info: hardware information dictionary, keyed by the first line
        of each hwinfo record.
    """
    apt.add_package("hwinfo", update_cache=False)
    hw_classes = list(args)
    for idx, hw_item in enumerate(args):
        hw_classes[idx] = "--" + hw_item
    hw_info_cmd = ["hwinfo"] + hw_classes

    output = subprocess.check_output(hw_info_cmd, text=True)
    # hwinfo may prepend a debug section; strip it before parsing.
    if "start debug info" in output.splitlines()[0]:
        output = output.split("=========== end debug info ============")[1]

    hardware: t.Dict[str, str] = {}
    for item in output.split("\n\n"):
        key = item.splitlines()[0].strip()
        hardware[key] = item
    return hardware


def is_nvidia_driver_loaded() -> bool:
    """Determine if an NVIDIA driver has been loaded."""
    return NVIDIA_DRIVER_PATH.exists()


def get_nvidia_driver_version() -> int:
    """Get the NVIDIA driver major version installed on the system.

    Returns:
        The driver's major version number, e.g. 535.

    Raises:
        FileNotFoundError: if no NVIDIA driver version file exists.
        ValueError: if the version file exists but no version string can
            be parsed from it.
    """
    try:
        nvidia_driver_version = NVIDIA_DRIVER_PATH.read_text()
    except FileNotFoundError as e:
        msg = "NVIDIA driver version file not found."
        logger.error(msg)
        raise FileNotFoundError(msg) from e

    match = re.search(r"NVRM version:.*?(\d+\.\d+(?:\.\d+)*)", nvidia_driver_version)
    if match:
        return int(match.group(1).split(".")[0])

    # Fix: this path previously fell through and implicitly returned None
    # (despite the declared `-> int`), which made get_cuda_version_from_driver()
    # crash on a `None >= 580` comparison. Fail explicitly instead.
    msg = f"Cannot parse NVIDIA driver version from {NVIDIA_DRIVER_PATH}."
    logger.error(msg)
    raise ValueError(msg)


def get_cuda_version_from_driver() -> int:
    """Map the installed NVIDIA driver version to CUDA version.

    Returns:
        The CUDA major version (10-13) supported by the installed driver.
    """
    driver_version = get_nvidia_driver_version()

    if driver_version >= 580:
        return 13
    elif driver_version >= 525:
        return 12
    elif driver_version >= 450:
        logger.warning(
            "The installed NVIDIA driver version '%s' might not be supported in next DCGM "
            "releases. Consider updating the NVIDIA driver.",
            driver_version,
        )
        return 11
    else:
        logger.warning(
            "The installed NVIDIA driver version '%s' is quite old and might not be supported "
            "by recent DCGM versions. Consider updating the NVIDIA driver.",
            driver_version,
        )
        return 10


def dcgm_v3_compatible(cuda_version: int, track: str, channel: Optional[str] = None) -> bool:
    """Check if the installed DCGM snap is v3 compatible.

    Args:
        cuda_version: CUDA major version supported by the installed driver.
        track: requested DCGM snap track ("v3", "v4" or "auto").
        channel: installed snap channel, if any; must mention "v3" to match.
    """
    valid_channel = "v3" in channel if channel is not None else True
    return valid_channel and cuda_version < 13 and track in {"v3", "auto"}


def dcgm_v4_compatible(cuda_version: int, track: str, channel: Optional[str] = None) -> bool:
    """Check if the installed DCGM snap is v4 compatible.

    Args:
        cuda_version: CUDA major version supported by the installed driver.
        track: requested DCGM snap track ("v3", "v4" or "auto").
        channel: installed snap channel, if any; must mention
            "v4-cuda<version>" to match.
    """
    # v4 supports CUDA 11 through 13 (exclusive of 10).
    valid_channel = f"v4-cuda{cuda_version}" in channel if channel is not None else True
    return valid_channel and cuda_version > 10 and cuda_version <= 13 and track in {"v4", "auto"}
state="inactive"} 31 | values: '1x15' 32 | 33 | alert_rule_test: 34 | - eval_time: 10m 35 | alertname: IPMISELCommandFailed 36 | exp_alerts: 37 | - exp_labels: 38 | severity: critical 39 | instance: ubuntu-0 40 | exp_annotations: 41 | summary: Failed to run ipmi-sel. (instance ubuntu-0) 42 | description: | 43 | Failed to get system event logs using ipmi-sel. 44 | VALUE = 0 45 | LABELS = map[__name__:ipmi_sel_command_success instance:ubuntu-0] 46 | 47 | - eval_time: 7m 48 | alertname: IPMISELStateWarning 49 | exp_alerts: 50 | - exp_labels: 51 | severity: warning 52 | instance: ubuntu-warning 53 | event_id: 2 54 | exp_annotations: 55 | summary: IPMI system event log in warning state. (instance ubuntu-warning) 56 | description: | 57 | IPMI SEL entry in warning state. 58 | LABELS = map[__name__:ipmi_sel_state_warning instance:ubuntu-warning] 59 | EVENT_ID = 2 60 | 61 | - eval_time: 13m 62 | alertname: IPMISELStateWarning 63 | exp_alerts: 64 | - exp_labels: 65 | severity: warning 66 | instance: ubuntu-warning 67 | event_id: 3 68 | exp_annotations: 69 | summary: IPMI system event log in warning state. (instance ubuntu-warning) 70 | description: | 71 | IPMI SEL entry in warning state. 72 | LABELS = map[__name__:ipmi_sel_state_warning instance:ubuntu-warning] 73 | EVENT_ID = 3 74 | 75 | - eval_time: 7m 76 | alertname: IPMISELStateCritical 77 | exp_alerts: 78 | - exp_labels: 79 | severity: critical 80 | instance: ubuntu-critical 81 | event_id: 2 82 | exp_annotations: 83 | summary: IPMI system event log in critical state. (instance ubuntu-critical) 84 | description: | 85 | IPMI SEL entry in critical state. 86 | LABELS = map[__name__:ipmi_sel_state_critical instance:ubuntu-critical] 87 | EVENT_ID = 2 88 | 89 | - eval_time: 13m 90 | alertname: IPMISELStateCritical 91 | exp_alerts: 92 | - exp_labels: 93 | severity: critical 94 | instance: ubuntu-critical 95 | event_id: 3 96 | exp_annotations: 97 | summary: IPMI system event log in critical state. 
(instance ubuntu-critical) 98 | description: | 99 | IPMI SEL entry in critical state. 100 | LABELS = map[__name__:ipmi_sel_state_critical instance:ubuntu-critical] 101 | EVENT_ID = 3 102 | 103 | - eval_time: 10m 104 | alertname: IPMISELDStateWarning 105 | exp_alerts: 106 | - exp_labels: 107 | severity: warning 108 | name: ipmiseld.service 109 | instance: ubuntu-3 110 | state: failed 111 | exp_annotations: 112 | summary: IPMISELD service is not active. (instance ubuntu-3) 113 | description: | 114 | The ipmiseld service is not active, indicating a potential problem. 115 | VALUE = 1 116 | LABELS = map[__name__:node_systemd_unit_state instance:ubuntu-3 name:ipmiseld.service state:failed] 117 | 118 | - exp_labels: 119 | severity: warning 120 | name: ipmiseld.service 121 | instance: ubuntu-4 122 | state: inactive 123 | exp_annotations: 124 | summary: IPMISELD service is not active. (instance ubuntu-4) 125 | description: | 126 | The ipmiseld service is not active, indicating a potential problem. 127 | VALUE = 1 128 | LABELS = map[__name__:node_systemd_unit_state instance:ubuntu-4 name:ipmiseld.service state:inactive] 129 | 130 | - interval: 1m 131 | input_series: 132 | - series: some_other_metric{instance="ubuntu-0"} 133 | values: '1x10' 134 | 135 | alert_rule_test: 136 | - eval_time: 6m 137 | alertname: IPMISELCommandSuccessMetricsMissing 138 | exp_alerts: 139 | - exp_labels: 140 | severity: critical 141 | exp_annotations: 142 | summary: IPMI SEL command success metrics missing. (instance ) 143 | description: | 144 | The ipmi_sel_command_success metric has been missing for over 5 minutes. 145 | This may indicate IPMI SEL command timeouts, or that IPMI tools/services are not installed or supported on this hardware. 
146 | LABELS = map[] 147 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/smart.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: SMART 3 | rules: 4 | 5 | - alert: SmartNVMeDriveReliabilityDegraded 6 | # isolate the least significant three bits with % 8 7 | # check whether bit 2 (starts from bit 0) has been set with the >= 4 comparison 8 | # refer: https://en.wikipedia.org/wiki/Self-Monitoring,_Analysis_and_Reporting_Technology#Known_NVMe_S.M.A.R.T._attributes 9 | expr: smartctl_device_critical_warning % 8 >= 4 10 | for: 15m 11 | labels: 12 | severity: critical 13 | annotations: 14 | summary: SMART alert for critical warning attribute on an NVMe controller due to degradation in drive reliability. (instance {{ $labels.instance }}) 15 | description: | 16 | Drive reliability is degraded. Bit 2 of critical warning SMART attribute is set. 17 | VALUE = {{ $value }} 18 | LABELS = {{ $labels }} 19 | 20 | - alert: SmartNVMeDriveinReadOnlyMode 21 | # isolate the least significant four bits with % 16 22 | # check whether bit 3 (starts from bit 0) has been set with the >= 8 comparison 23 | # refer: https://en.wikipedia.org/wiki/Self-Monitoring,_Analysis_and_Reporting_Technology#Known_NVMe_S.M.A.R.T._attributes 24 | expr: smartctl_device_critical_warning % 16 >= 8 25 | for: 15m 26 | labels: 27 | severity: critical 28 | annotations: 29 | summary: SMART alert for critical warning attribute on an NVMe controller due to drive being in read-only mode. (instance {{ $labels.instance }}) 30 | description: | 31 | Drive is in read-only mode. Bit 3 of critical warning SMART attribute is set. 32 | VALUE = {{ $value }} 33 | LABELS = {{ $labels }} 34 | 35 | - alert: SmartHealthStatusFail 36 | expr: smartctl_device_smart_status == 0 37 | for: 2m 38 | labels: 39 | severity: critical 40 | annotations: 41 | summary: SMART health status failed for device. 
(instance {{ $labels.instance }}) 42 | description: | 43 | SMART health status failed for device. This means either that the device has already failed, or that it is predicting its own failure within the next 24 hours. 44 | VALUE = {{ $value }} 45 | LABELS = {{ $labels }} 46 | 47 | - alert: SmartExitStatusDiskFail 48 | # isolate the least significant four bits with % 16 49 | # check whether bit 3 (starts from bit 0) has been set with the >= 8 comparison 50 | # refer: https://www.smartmontools.org/browser/trunk/smartmontools/smartctl.8.in#EXIT_STATUS 51 | expr: smartctl_device_smartctl_exit_status % 16 >= 8 52 | for: 2m 53 | labels: 54 | severity: critical 55 | annotations: 56 | summary: smartctl exit status returned "DISK FAILING". (instance {{ $labels.instance }}) 57 | description: | 58 | smartctl exit status returned "DISK FAILING". Bit 3 of smartctl exit status is set. 59 | VALUE = {{ $value }} 60 | LABELS = {{ $labels }} 61 | 62 | - alert: SmartExitStatusPrefailBelowThreshold 63 | # isolate the least significant four bits with % 32 64 | # check whether bit 4 (starts from bit 0) has been set with the >= 16 comparison 65 | # refer: https://www.smartmontools.org/browser/trunk/smartmontools/smartctl.8.in#EXIT_STATUS 66 | expr: smartctl_device_smartctl_exit_status % 32 >= 16 67 | for: 2m 68 | labels: 69 | severity: warning 70 | annotations: 71 | summary: smartctl exit status reports pre-fail attribute for device is below threshold. (instance {{ $labels.instance }}) 72 | description: | 73 | smartctl exit status pre-fail attribute is below threshold. Bit 4 of smartctl exit status is set. 74 | VALUE = {{ $value }} 75 | LABELS = {{ $labels }} 76 | 77 | - alert: SmartNVMeWearoutIndicator 78 | expr: smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"} 79 | for: 15m 80 | labels: 81 | severity: critical 82 | annotations: 83 | summary: SMART alert for available spare space below threshold for NVMe device. 
(instance {{ $labels.instance }}) 84 | description: | 85 | Available spare space below threshold for NVMe device. 86 | VALUE = {{ $value }} 87 | LABELS = {{ $labels }} 88 | 89 | - alert: SmartAttributeWarning 90 | # based on https://www.backblaze.com/blog/what-smart-stats-indicate-hard-drive-failures/ 91 | expr: smartctl_device_attribute{attribute_id=~"5|187|188|197|198", attribute_value_type="raw"} > 0 92 | for: 2m 93 | labels: 94 | severity: warning 95 | annotations: 96 | summary: SMART device attribute correlating with drive failure has its raw value greater than zero. (instance {{ $labels.instance }}) 97 | description: | 98 | SMART raw value for attribute "{{ $labels.attribute_name }}" with id "{{ $labels.attribute_id }}" 99 | on device "{{ $labels.device }}" is greater than 0. 100 | VALUE = {{ $value }} 101 | LABELS = {{ $labels }} 102 | 103 | - alert: SmartNVMeDriveLifetimeWarning 104 | expr: smartctl_device_percentage_used{device=~"nvme.*"} >= 80 105 | for: 15m 106 | labels: 107 | severity: warning 108 | annotations: 109 | summary: NVMe drive is approaching its estimated lifetime (instance {{ $labels.instance }}) 110 | description: | 111 | The NVMe drive has reached 80% of its estimated lifetime. 112 | Note: A value of 100 does not indicate failure. For more details, visit https://charmhub.io/hardware-observer/docs/metrics-and-alerts-smart 113 | VALUE = {{ $value }} 114 | LABELS = {{ $labels }} 115 | 116 | - alert: SmartNVMeDriveLifetimeCritical 117 | expr: smartctl_device_percentage_used{device=~"nvme.*"} >= 90 118 | for: 15m 119 | labels: 120 | severity: critical 121 | annotations: 122 | summary: NVMe drive is close to reaching its estimated lifetime (instance {{ $labels.instance }}) 123 | description: | 124 | The NVMe drive has reached 90% of its estimated lifetime. 125 | Note: A value of 100 does not indicate failure. 
For more details, visit https://charmhub.io/hardware-observer/docs/metrics-and-alerts-smart 126 | VALUE = {{ $value }} 127 | LABELS = {{ $labels }} 128 | -------------------------------------------------------------------------------- /tests/functional/conftest.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import logging 3 | import os 4 | import platform 5 | from pathlib import Path 6 | 7 | import pytest 8 | from pytest_operator.plugin import OpsTest 9 | from utils import RESOURCES_DIR, Resource 10 | 11 | from config import HARDWARE_EXPORTER_COLLECTOR_MAPPING, TPR_RESOURCES, HWTool 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def pytest_addoption(parser): 17 | parser.addoption( 18 | "--base", 19 | type=str.lower, 20 | default="ubuntu@22.04", 21 | choices=["ubuntu@20.04", "ubuntu@22.04", "ubuntu@24.04"], 22 | help="Set base for the applications.", 23 | ) 24 | 25 | parser.addoption( 26 | "--realhw", 27 | action="store_true", 28 | help="Enable real hardware testing.", 29 | ) 30 | 31 | parser.addoption( 32 | "--nvidia", 33 | action="store_true", 34 | help="Enable NVIDIA GPU support for testing with real hardware.", 35 | ) 36 | 37 | parser.addoption( 38 | "--collectors", 39 | nargs="+", 40 | type=str.lower, 41 | default="", 42 | choices=[ 43 | "ipmi_dcmi", 44 | "ipmi_sel", 45 | "ipmi_sensor", 46 | "redfish", 47 | "mega_raid", 48 | "poweredge_raid", 49 | "lsi_sas_2", 50 | "lsi_sas_3", 51 | "hpe_ssa", 52 | ], 53 | help="Provide space-separated list of collectors for testing with real hardware.", 54 | ) 55 | 56 | 57 | def get_this_script_dir() -> Path: 58 | filename = inspect.getframeinfo(inspect.currentframe()).filename # type: ignore[arg-type] 59 | path = os.path.dirname(os.path.abspath(filename)) 60 | return Path(path) 61 | 62 | 63 | @pytest.fixture(scope="module") 64 | def bundle(ops_test: OpsTest, request, charm_path, base, provided_collectors): 65 | """Configure the bundle depending on cli 
arguments.""" 66 | bundle_template_path = get_this_script_dir() / "bundle.yaml.j2" 67 | log.info("Rendering bundle %s", bundle_template_path) 68 | bundle = ops_test.render_bundle( 69 | bundle_template_path, 70 | charm=charm_path, 71 | base=base, 72 | redfish_disable=("redfish" not in provided_collectors), 73 | resources={ 74 | "storcli-deb": "empty-resource", 75 | "perccli-deb": "empty-resource", 76 | "sas2ircu-bin": "empty-resource", 77 | "sas3ircu-bin": "empty-resource", 78 | }, 79 | ) 80 | 81 | return bundle 82 | 83 | 84 | @pytest.fixture(scope="module") 85 | def base(request): 86 | return request.config.getoption("--base") 87 | 88 | 89 | @pytest.fixture(scope="module") 90 | def nvidia_present(request): 91 | return request.config.getoption("--nvidia") 92 | 93 | 94 | @pytest.fixture(scope="module") 95 | def realhw(request): 96 | return request.config.getoption("--realhw") 97 | 98 | 99 | @pytest.fixture(scope="module") 100 | def architecture(): 101 | machine = platform.machine() 102 | if machine == "aarch64": 103 | return "arm64" 104 | return "amd64" 105 | 106 | 107 | @pytest.fixture(scope="module") 108 | def provided_collectors(request): 109 | return set(request.config.getoption("collectors")) 110 | 111 | 112 | def pytest_configure(config): 113 | config.addinivalue_line("markers", "realhw: mark test as requiring real hardware to run.") 114 | 115 | 116 | def pytest_collection_modifyitems(config, items): 117 | if not config.getoption("--realhw"): 118 | # skip hw dependent tests in TestCharmWithHW marked with "realhw" 119 | skip_hw_dependent = pytest.mark.skip( 120 | reason="Hardware dependent test. Provide collectors with the --collectors option." 
121 | ) 122 | for item in items: 123 | if "realhw" in item.keywords: 124 | item.add_marker(skip_hw_dependent) 125 | 126 | 127 | @pytest.fixture() 128 | def app(ops_test): 129 | return ops_test.model.applications["hardware-observer"] 130 | 131 | 132 | @pytest.fixture() 133 | def unit(app): 134 | return app.units[0] 135 | 136 | 137 | @pytest.fixture() 138 | def resources() -> list[Resource]: 139 | """Return list of Resource objects.""" 140 | return [ 141 | Resource( 142 | resource_name=TPR_RESOURCES.get(HWTool.STORCLI), 143 | file_name="storcli.deb", 144 | collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.STORCLI).replace( 145 | "collector.", "" 146 | ), 147 | bin_name=HWTool.STORCLI.value, 148 | ), 149 | Resource( 150 | resource_name=TPR_RESOURCES.get(HWTool.PERCCLI), 151 | file_name="perccli.deb", 152 | collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.PERCCLI).replace( 153 | "collector.", "" 154 | ), 155 | bin_name=HWTool.PERCCLI.value, 156 | ), 157 | Resource( 158 | resource_name=TPR_RESOURCES.get(HWTool.SAS2IRCU), 159 | file_name="sas2ircu", 160 | collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS2IRCU).replace( 161 | "collector.", "" 162 | ), 163 | bin_name=HWTool.SAS2IRCU.value, 164 | ), 165 | Resource( 166 | resource_name=TPR_RESOURCES.get(HWTool.SAS3IRCU), 167 | file_name="sas3ircu", 168 | collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS3IRCU).replace( 169 | "collector.", "" 170 | ), 171 | bin_name=HWTool.SAS3IRCU.value, 172 | ), 173 | ] 174 | 175 | 176 | @pytest.fixture() 177 | def required_resources(resources: list[Resource], provided_collectors: set) -> list[Resource]: 178 | """Return list of required resources to be attached as per hardware availability. 179 | 180 | Required resources will be empty if no collectors are provided. 
181 | """ 182 | required_resources = [] 183 | 184 | for resource in resources: 185 | if resource.collector_name in provided_collectors: 186 | resource.file_path = f"{RESOURCES_DIR}/{resource.file_name}" 187 | required_resources.append(resource) 188 | 189 | return required_resources 190 | 191 | 192 | @pytest.fixture(scope="module") 193 | def charm_path(base: str, architecture: str) -> Path: 194 | """Fixture to determine the charm path based on the base and architecture.""" 195 | glob_path = f"hardware-observer_*{base}-{architecture}*.charm" 196 | paths = list(Path(".").glob(glob_path)) 197 | 198 | if not paths: 199 | raise FileNotFoundError(f"The path for the charm for {base}-{architecture} is not found.") 200 | 201 | if len(paths) > 1: 202 | raise FileNotFoundError( 203 | f"Multiple charms found for {base}-{architecture}. Please provide only one." 204 | ) 205 | 206 | # The bundle will need the full path to the charm 207 | path = paths[0].absolute() 208 | log.info(f"Using charm path: {path}") 209 | return path 210 | -------------------------------------------------------------------------------- /tests/unit/test_ssdlc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | 4 | """Unit tests for SSDLC logging functionality.""" 5 | 6 | import unittest 7 | from datetime import datetime, timezone 8 | from unittest import mock 9 | 10 | from parameterized import parameterized 11 | 12 | from ssdlc import EXPORTER_NAME_TO_SERVICE, Service, SSDLCSysEvent, log_ssdlc_system_event 13 | 14 | 15 | class TestSSDLCLogging(unittest.TestCase): 16 | """Test SSDLC logging functions.""" 17 | 18 | @mock.patch("ssdlc.logger") 19 | @mock.patch("ssdlc.datetime") 20 | def test_log_ssdlc_system_event_with_exporter_name(self, mock_datetime, mock_logger): 21 | """Test logging with exporter_name string.""" 22 | # Setup mock datetime 23 | mock_now = mock.MagicMock() 24 | mock_now.isoformat.return_value = "2025-01-01T12:00:00+00:00" 25 | mock_datetime.now.return_value.astimezone.return_value = mock_now 26 | 27 | # Call the function with exporter name 28 | log_ssdlc_system_event(SSDLCSysEvent.STARTUP, "hardware-exporter") 29 | 30 | # Verify logger was called correctly 31 | mock_logger.warning.assert_called_once() 32 | logged_data = mock_logger.warning.call_args[0][0] 33 | 34 | self.assertEqual(logged_data["datetime"], "2025-01-01T12:00:00+00:00") 35 | self.assertEqual(logged_data["appid"], "service.hardware-exporter") 36 | self.assertEqual(logged_data["event"], "sys_startup:hardware-exporter") 37 | self.assertEqual(logged_data["level"], "WARN") 38 | self.assertIn("hardware observer start service", logged_data["description"]) 39 | 40 | @mock.patch("ssdlc.logger") 41 | @mock.patch("ssdlc.datetime") 42 | def test_log_ssdlc_system_event_with_different_exporter(self, mock_datetime, mock_logger): 43 | """Test logging with different exporter name.""" 44 | # Setup mock datetime 45 | mock_now = mock.MagicMock() 46 | mock_now.isoformat.return_value = "2025-01-01T12:00:00+00:00" 47 | mock_datetime.now.return_value.astimezone.return_value = mock_now 48 | 49 | # Call the function with dcgm exporter name 50 | log_ssdlc_system_event(SSDLCSysEvent.SHUTDOWN, "dcgm") 51 | 52 | # 
Verify logger was called correctly 53 | mock_logger.warning.assert_called_once() 54 | logged_data = mock_logger.warning.call_args[0][0] 55 | 56 | self.assertEqual(logged_data["datetime"], "2025-01-01T12:00:00+00:00") 57 | self.assertEqual(logged_data["appid"], "service.dcgm") 58 | self.assertEqual(logged_data["event"], "sys_shutdown:dcgm") 59 | self.assertEqual(logged_data["level"], "WARN") 60 | self.assertIn("hardware observer shutdown service", logged_data["description"]) 61 | 62 | @mock.patch("ssdlc.logger") 63 | def test_log_ssdlc_system_event_with_unknown_service(self, mock_logger): 64 | """Test logging with unknown service name.""" 65 | # Call the function with unknown service 66 | log_ssdlc_system_event(SSDLCSysEvent.STARTUP, "unknown-service") 67 | 68 | # Verify warning was logged with format string and args 69 | mock_logger.warning.assert_called_once_with( 70 | "Unknown service name: %s, skipping SSDLC logging", "unknown-service" 71 | ) 72 | 73 | @parameterized.expand( 74 | [ 75 | (SSDLCSysEvent.STARTUP, "hardware-exporter", ""), 76 | (SSDLCSysEvent.SHUTDOWN, "dcgm", ""), 77 | (SSDLCSysEvent.RESTART, "smartctl-exporter", ""), 78 | ( 79 | SSDLCSysEvent.CRASH, 80 | "hardware-exporter", 81 | "Connection timeout", 82 | ), 83 | ] 84 | ) 85 | @mock.patch("ssdlc.logger") 86 | @mock.patch("ssdlc.datetime") 87 | def test_log_ssdlc_system_event_all_events( 88 | self, event, service_name, msg, mock_datetime, mock_logger 89 | ): 90 | """Test logging all event types.""" 91 | # Setup mock datetime 92 | mock_now = mock.MagicMock() 93 | mock_now.isoformat.return_value = "2025-01-01T12:00:00+00:00" 94 | mock_datetime.now.return_value.astimezone.return_value = mock_now 95 | 96 | # Call the function 97 | log_ssdlc_system_event(event, service_name, msg) 98 | 99 | # Verify logger was called 100 | mock_logger.warning.assert_called_once() 101 | logged_data = mock_logger.warning.call_args[0][0] 102 | 103 | self.assertEqual(logged_data["datetime"], "2025-01-01T12:00:00+00:00") 104 
| self.assertEqual(logged_data["appid"], f"service.{service_name}") 105 | self.assertEqual(logged_data["event"], f"{event.value}:{service_name}") 106 | self.assertEqual(logged_data["level"], "WARN") 107 | self.assertIsInstance(logged_data["description"], str) 108 | if msg: 109 | self.assertIn(msg, logged_data["description"]) 110 | 111 | def test_exporter_name_to_service_mapping(self): 112 | """Test that all exporters are mapped correctly.""" 113 | self.assertEqual( 114 | EXPORTER_NAME_TO_SERVICE["hardware-exporter"], 115 | Service.HARDWARE_EXPORTER, 116 | ) 117 | self.assertEqual( 118 | EXPORTER_NAME_TO_SERVICE["dcgm"], 119 | Service.DCGM_EXPORTER, 120 | ) 121 | self.assertEqual( 122 | EXPORTER_NAME_TO_SERVICE["smartctl-exporter"], 123 | Service.SMARTCTL_EXPORTER, 124 | ) 125 | 126 | @mock.patch("ssdlc.logger") 127 | @mock.patch("ssdlc.datetime") 128 | def test_log_ssdlc_system_event_with_additional_message(self, mock_datetime, mock_logger): 129 | """Test logging with additional message.""" 130 | # Setup mock datetime 131 | mock_now = mock.MagicMock() 132 | mock_now.isoformat.return_value = "2025-01-01T12:00:00+00:00" 133 | mock_datetime.now.return_value.astimezone.return_value = mock_now 134 | 135 | # Call with additional message 136 | additional_msg = "Service failed due to network error" 137 | log_ssdlc_system_event(SSDLCSysEvent.CRASH, "hardware-exporter", additional_msg) 138 | 139 | # Verify the additional message is included 140 | logged_data = mock_logger.warning.call_args[0][0] 141 | self.assertIn(additional_msg, logged_data["description"]) 142 | 143 | @mock.patch("ssdlc.logger") 144 | @mock.patch("ssdlc.datetime") 145 | def test_log_ssdlc_system_event_datetime_format(self, mock_datetime, mock_logger): 146 | """Test that datetime is in ISO 8601 format with timezone.""" 147 | # Use a real datetime to test formatting 148 | test_time = datetime(2025, 1, 15, 14, 30, 45, tzinfo=timezone.utc) 149 | mock_datetime.now.return_value.astimezone.return_value = 
test_time 150 | 151 | log_ssdlc_system_event(SSDLCSysEvent.STARTUP, "hardware-exporter") 152 | 153 | logged_data = mock_logger.warning.call_args[0][0] 154 | # Verify ISO 8601 format with timezone 155 | self.assertRegex( 156 | logged_data["datetime"], 157 | r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2}", 158 | ) 159 | 160 | 161 | if __name__ == "__main__": 162 | unittest.main() 163 | -------------------------------------------------------------------------------- /tests/unit/test_alert_rules/test_redfish.yaml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - ../../../src/prometheus_alert_rules_dynamic/redfish.yaml 3 | 4 | evaluation_interval: 1m 5 | 6 | tests: 7 | - interval: 1m 8 | input_series: 9 | - series: redfish_call_success{instance="ubuntu-0"} 10 | values: "0x15" 11 | 12 | alert_rule_test: 13 | - eval_time: 10m 14 | alertname: RedfishCallFailed 15 | exp_alerts: 16 | - exp_labels: 17 | severity: warning 18 | instance: ubuntu-0 19 | exp_annotations: 20 | summary: Call to the Redfish API failed. (instance ubuntu-0) 21 | description: | 22 | Failure in calling the Redfish API. 23 | VALUE = 0 24 | LABELS = map[__name__:redfish_call_success instance:ubuntu-0] 25 | 26 | - interval: 1m 27 | input_series: 28 | - series: redfish_service_available{instance="ubuntu-1"} 29 | values: "0x15" 30 | 31 | alert_rule_test: 32 | - eval_time: 10m 33 | alertname: RedfishServiceUnavailable 34 | exp_alerts: 35 | - exp_labels: 36 | severity: warning 37 | instance: ubuntu-1 38 | exp_annotations: 39 | summary: No redfish services available. (instance ubuntu-1) 40 | description: | 41 | No redfish services available. 
42 | VALUE = 0 43 | LABELS = map[__name__:redfish_service_available instance:ubuntu-1] 44 | 45 | - interval: 1m 46 | input_series: 47 | - series: redfish_sensor_info{instance="ubuntu-2", health="Unhealthy", reading="82%"} 48 | values: "1x15" 49 | 50 | alert_rule_test: 51 | - eval_time: 10m 52 | alertname: RedfishSensorHealthNotOk 53 | exp_alerts: 54 | - exp_labels: 55 | severity: critical 56 | instance: ubuntu-2 57 | health: Unhealthy 58 | reading: 82% 59 | exp_annotations: 60 | summary: Redfish sensor health not Ok. (instance ubuntu-2) 61 | description: | 62 | Redfish sensor health not Ok. 63 | SENSOR_READING = 82% 64 | LABELS = map[__name__:redfish_sensor_info health:Unhealthy instance:ubuntu-2 reading:82%] 65 | 66 | - interval: 1m 67 | input_series: 68 | - series: redfish_processor_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", processor_id="p1", model="processor-model-1"} 69 | values: "1x15" 70 | 71 | alert_rule_test: 72 | - eval_time: 10m 73 | alertname: RedfishProcessorHealthNotOk 74 | exp_alerts: 75 | - exp_labels: 76 | severity: critical 77 | instance: ubuntu-1 78 | health: Unhealthy 79 | system_id: s1 80 | processor_id: p1 81 | model: processor-model-1 82 | exp_annotations: 83 | summary: Redfish processor health not OK. (instance ubuntu-1) 84 | description: | 85 | Redfish processor health not OK. 
86 | LABELS = map[__name__:redfish_processor_info health:Unhealthy instance:ubuntu-1 model:processor-model-1 processor_id:p1 system_id:s1] 87 | 88 | - interval: 1m 89 | input_series: 90 | - series: redfish_storage_controller_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", storage_id="stor1", controller_id="ctrl1"} 91 | values: "1x15" 92 | 93 | alert_rule_test: 94 | - eval_time: 10m 95 | alertname: RedfishStorageControllerHealthNotOk 96 | exp_alerts: 97 | - exp_labels: 98 | severity: critical 99 | instance: ubuntu-1 100 | health: Unhealthy 101 | system_id: s1 102 | storage_id: stor1 103 | controller_id: ctrl1 104 | exp_annotations: 105 | summary: Redfish storage controller health not OK. (instance ubuntu-1) 106 | description: | 107 | Redfish storage controller health not OK. 108 | LABELS = map[__name__:redfish_storage_controller_info controller_id:ctrl1 health:Unhealthy instance:ubuntu-1 storage_id:stor1 system_id:s1] 109 | 110 | - interval: 1m 111 | input_series: 112 | - series: redfish_chassis_info{instance="ubuntu-1", health="Unhealthy", chassis_id="ch1", model="chassis-model1"} 113 | values: "1x15" 114 | 115 | alert_rule_test: 116 | - eval_time: 10m 117 | alertname: RedfishChassisHealthNotOk 118 | exp_alerts: 119 | - exp_labels: 120 | severity: critical 121 | instance: ubuntu-1 122 | health: Unhealthy 123 | chassis_id: ch1 124 | model: chassis-model1 125 | exp_annotations: 126 | summary: Redfish chassis health not OK. (instance ubuntu-1) 127 | description: | 128 | Redfish chassis health not OK. 
129 | LABELS = map[__name__:redfish_chassis_info chassis_id:ch1 health:Unhealthy instance:ubuntu-1 model:chassis-model1] 130 | 131 | - interval: 1m 132 | input_series: 133 | - series: redfish_storage_drive_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", state="Enabled", storage_id="stor1", drive_id="dr1"} 134 | values: "1x15" 135 | 136 | alert_rule_test: 137 | - eval_time: 10m 138 | alertname: RedfishStorageDriveHealthNotOk 139 | exp_alerts: 140 | - exp_labels: 141 | severity: critical 142 | instance: ubuntu-1 143 | health: Unhealthy 144 | system_id: s1 145 | storage_id: stor1 146 | drive_id: dr1 147 | state: Enabled 148 | exp_annotations: 149 | summary: Redfish storage drive health not OK. (instance ubuntu-1) 150 | description: | 151 | Redfish storage drive health not OK. 152 | LABELS = map[__name__:redfish_storage_drive_info drive_id:dr1 health:Unhealthy instance:ubuntu-1 state:Enabled storage_id:stor1 system_id:s1] 153 | 154 | 155 | - interval: 1m 156 | input_series: 157 | - series: redfish_memory_dimm_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", memory_id="mem1"} 158 | values: "1x15" 159 | 160 | alert_rule_test: 161 | - eval_time: 10m 162 | alertname: RedfishMemoryDimmHealthNotOk 163 | exp_alerts: 164 | - exp_labels: 165 | severity: critical 166 | instance: ubuntu-1 167 | health: Unhealthy 168 | system_id: s1 169 | memory_id: mem1 170 | exp_annotations: 171 | summary: Redfish memory dimm health not OK. (instance ubuntu-1) 172 | description: | 173 | Redfish memory dimm health not OK. 
174 | LABELS = map[__name__:redfish_memory_dimm_info health:Unhealthy instance:ubuntu-1 memory_id:mem1 system_id:s1] 175 | 176 | - interval: 1m 177 | input_series: 178 | - series: redfish_smart_storage_health{instance="ubuntu-1"} 179 | values: "0x15" 180 | 181 | alert_rule_test: 182 | - eval_time: 10m 183 | alertname: RedfishSmartStorageHealthNotOk 184 | exp_alerts: 185 | - exp_labels: 186 | severity: critical 187 | instance: ubuntu-1 188 | exp_annotations: 189 | summary: Redfish smart storage health not OK. (instance ubuntu-1) 190 | description: | 191 | Redfish smart storage health not OK. 192 | VALUE = 0 193 | LABELS = map[__name__:redfish_smart_storage_health instance:ubuntu-1] 194 | -------------------------------------------------------------------------------- /tests/integration/test_cos_integration.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2024 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | import asyncio 6 | import json 7 | import logging 8 | import subprocess 9 | from pathlib import Path 10 | 11 | import pytest 12 | from mock_data import EXPECTED_ALERTS 13 | from pytest_operator.plugin import OpsTest 14 | from tenacity import AsyncRetrying, RetryError, stop_after_attempt, wait_fixed 15 | from utils import Alert 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | @pytest.mark.abort_on_fail 21 | @pytest.mark.skip_if_deployed 22 | async def test_setup_and_deploy(base, channel, lxd_ctl, k8s_ctl, lxd_model, k8s_model): 23 | """Setup models and then deploy Hardware Observer and COS.""" 24 | await _deploy_cos(channel, k8s_ctl, k8s_model) 25 | 26 | await _deploy_hardware_observer(base, channel, lxd_model) 27 | 28 | await _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_model, lxd_model) 29 | 30 | # This verifies that the cross-controller relation with COS is successful 31 | assert lxd_model.applications["grafana-agent"].status == "active" 32 | 33 | 34 | async def test_alerts(ops_test: OpsTest, lxd_model, k8s_model): 35 | """Verify that the required alerts are fired.""" 36 | await _disable_hardware_exporter(ops_test, lxd_model) 37 | await _export_mock_metrics(lxd_model) 38 | 39 | # Run juju action to get the ip address that traefik is configured to serve on 40 | returncode, stdout, stderr = await ops_test.run( 41 | "juju", 42 | "run", 43 | "--format", 44 | "json", 45 | "traefik/0", 46 | "show-proxied-endpoints", 47 | ) 48 | json_data = json.loads(stdout) 49 | proxied_endpoints = json.loads(json_data["traefik/0"]["results"]["proxied-endpoints"]) 50 | prometheus_url = proxied_endpoints["prometheus/0"]["url"] 51 | prometheus_alerts_endpoint = f"{prometheus_url}/api/v1/alerts" 52 | 53 | cmd = ["curl", prometheus_alerts_endpoint] 54 | 55 | # Sometimes alerts take some time to show after the metrics are exposed on the host. 
# Additionally, some alerts have a longer duration, like 5m, and they take some time to 57 | # transition to `firing` state. 58 | # So retrying for up to 15 minutes.
"--onefile", 117 | str(Path(__file__).parent.resolve() / "export_mock_metrics.py"), 118 | ] 119 | try: 120 | subprocess.run(bundle_cmd) 121 | except subprocess.CalledProcessError: 122 | logger.error("Failed to bundle export_mock_metrics") 123 | raise 124 | 125 | # scp the executable to hardware-observer unit 126 | await hardware_observer_unit.scp_to("./dist/export_mock_metrics", "/home/ubuntu") 127 | 128 | # Run the executable in the background without waiting. 129 | run_export_mock_metrics_cmd = "/home/ubuntu/export_mock_metrics" 130 | await hardware_observer_unit.run(run_export_mock_metrics_cmd) 131 | 132 | 133 | async def _deploy_cos(channel, ctl, model): 134 | """Deploy COS on the existing k8s cloud.""" 135 | # Deploying via CLI because of https://github.com/juju/python-libjuju/issues/1032. 136 | cmd = [ 137 | "juju", 138 | "deploy", 139 | "cos-lite", 140 | "--channel", 141 | channel, 142 | "--trust", 143 | "-m", 144 | f"{ctl.controller_name}:{model.name}", 145 | "--overlay", 146 | str(Path(__file__).parent.resolve() / "offers-overlay.yaml"), 147 | ] 148 | subprocess.run(cmd, check=True) 149 | 150 | 151 | async def _deploy_hardware_observer(base, channel, model): 152 | """Deploy Hardware Observer and Grafana Agent on the existing lxd cloud.""" 153 | await asyncio.gather( 154 | # Principal Ubuntu 155 | model.deploy("ubuntu", num_units=1, base=base, channel=channel), 156 | # Hardware Observer 157 | model.deploy("hardware-observer", base=base, num_units=0, channel=channel), 158 | # Grafana Agent 159 | model.deploy("grafana-agent", num_units=0, base=base, channel=channel), 160 | ) 161 | 162 | await model.add_relation("ubuntu:juju-info", "hardware-observer:general-info") 163 | await model.add_relation("hardware-observer:cos-agent", "grafana-agent:cos-agent") 164 | await model.add_relation("ubuntu:juju-info", "grafana-agent:juju-info") 165 | 166 | await model.block_until(lambda: model.applications["hardware-observer"].status == "active") 167 | 168 | 169 | async def 
_add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_model, lxd_model): 170 | """Add relations between Grafana Agent and COS.""" 171 | cos_saas_names = ["prometheus-receive-remote-write", "loki-logging", "grafana-dashboards"] 172 | for saas in cos_saas_names: 173 | # Using juju cli since Model.consume() from libjuju causes error. 174 | # https://github.com/juju/python-libjuju/issues/1031 175 | cmd = [ 176 | "juju", 177 | "consume", 178 | "--model", 179 | f"{lxd_ctl.controller_name}:{k8s_model.name}", 180 | f"{k8s_ctl.controller_name}:admin/{k8s_model.name}.{saas}", 181 | ] 182 | subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 183 | await lxd_model.add_relation("grafana-agent", saas), 184 | 185 | # `idle_period` needs to be greater than the scrape interval to make sure metrics ingested. 186 | await asyncio.gather( 187 | # First, we wait for the critical phase to pass with raise_on_error=False. 188 | # (In CI, using github runners, we often see unreproducible hook failures.) 189 | lxd_model.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False), 190 | k8s_model.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False), 191 | ) 192 | 193 | await asyncio.gather( 194 | # Then we wait for "active", without raise_on_error=False, so the test fails sooner in case 195 | # there is a persistent error status. 196 | lxd_model.wait_for_idle(status="active", timeout=7200, idle_period=180), 197 | k8s_model.wait_for_idle(status="active", timeout=7200, idle_period=180), 198 | ) 199 | -------------------------------------------------------------------------------- /src/gpu_metrics/dcgm_metrics.csv: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # [ WARNING ] 3 | # Configuration file maintained by Juju. Local changes may be overwritten. 
4 | ############################################################################### 5 | 6 | # Selected metrics for dcgm-exporter 7 | # Default metric list https://github.com/NVIDIA/dcgm-exporter/blob/main/etc/default-counters.csv 8 | 9 | # Format 10 | # If line starts with a '#' it is considered a comment 11 | # Boolean values decode to - 1 = enabled 0 = disabled 12 | # DCGM FIELD, Prometheus metric type, help message 13 | 14 | 15 | 16 | 17 | # DEFAULT METRICS 18 | # Clocks 19 | DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). 20 | DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). 21 | 22 | # Temperature 23 | DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). 24 | DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). 25 | 26 | # Power 27 | DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). 28 | DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). 29 | 30 | # PCIE 31 | DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. 32 | DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. 33 | DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. 34 | 35 | # Utilization (the sample period varies depending on the product) 36 | DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). 37 | DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). 38 | DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). 39 | DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %). 40 | 41 | # Errors and violations 42 | DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. 43 | 44 | # Memory usage 45 | DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). 46 | DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). 
47 | 48 | # NVLink 49 | DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes 50 | 51 | # VGPU License status 52 | DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status 53 | 54 | # Remapped rows 55 | DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors 56 | DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors 57 | DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed 58 | 59 | # Static configuration information and features 60 | DCGM_FI_DRIVER_VERSION, label, Driver Version 61 | 62 | 63 | 64 | 65 | # CUSTOM METRICS 66 | # Clocks 67 | DCGM_FI_DEV_VIDEO_CLOCK, gauge, Video encoder/decoder clock (in MHz). 68 | 69 | # Temperature 70 | DCGM_FI_DEV_FAN_SPEED, gauge, Fan speed (in 0-100%) 71 | 72 | # Power 73 | DCGM_FI_DEV_POWER_USAGE_INSTANT, gauge, Current instantaneous power usage (in W). 74 | 75 | # Errors and violations 76 | DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, counter, Throttling reasons bitmask 77 | DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). 78 | DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). 79 | DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). 80 | DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). 81 | DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). 82 | DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). 83 | 84 | # Memory usage 85 | DCGM_FI_DEV_FB_RESERVED, gauge, Frame buffer memory reserved (in MB). 
86 | DCGM_FI_DEV_FB_USED_PERCENT, gauge, Frame buffer percentage used (in 0-100%) - Used/(Total - Reserved) 87 | 88 | # ECC 89 | DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. 90 | DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. 91 | DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. 92 | DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. 93 | 94 | # Retired pages 95 | DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. 96 | DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. 97 | DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. 98 | 99 | # NVLink 100 | DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. 101 | DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. 102 | DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. 103 | DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. 104 | 105 | # VGPU 106 | DCGM_FI_DEV_VGPU_UTILIZATIONS, gauge, vGPUs utilization 107 | 108 | # Bar 109 | DCGM_FI_DEV_BAR1_USED, gauge, Used BAR1 (in MB) 110 | DCGM_FI_DEV_BAR1_FREE, gauge, Free BAR1 (in MB) 111 | 112 | # DCP metrics 113 | DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. 114 | DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. 115 | DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM. 116 | DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. 117 | DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. 
DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active.
DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active.
DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active.
# NOTE: the two PCIe fields below are also listed as counters in the default
# PCIE section above with a different metric type — verify dcgm-exporter
# tolerates the duplicate field IDs, or drop one of the pairs.
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.

# Features and modes
DCGM_FI_DEV_COMPUTE_MODE, gauge, Compute mode
DCGM_FI_DEV_PERSISTENCE_MODE, gauge, Persistence mode (1 or 0)
DCGM_FI_DEV_CC_MODE, gauge, ConfidentialCompute/AmpereProtectedMemory status (1 or 0)
DCGM_FI_DEV_ECC_CURRENT, gauge, Current ECC mode
DCGM_FI_DEV_VIRTUAL_MODE, gauge, Virtualization mode
DCGM_FI_DEV_AUTOBOOST, gauge, Auto-boost enabled
DCGM_FI_DEV_BAR1_TOTAL, gauge, Total BAR1 (in MB)
DCGM_FI_DEV_MAX_SM_CLOCK, gauge, Maximum supported SM clock
DCGM_FI_DEV_MAX_MEM_CLOCK, gauge, Maximum supported Memory clock
DCGM_FI_DEV_GPU_MAX_OP_TEMP, gauge, Maximum operating temperature
DCGM_FI_DEV_SLOWDOWN_TEMP, gauge, Slowdown temperature
DCGM_FI_DEV_SHUTDOWN_TEMP, gauge, Shutdown temperature
DCGM_FI_DEV_POWER_MGMT_LIMIT, gauge, Current Power limit
DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN, gauge, Minimum Power limit
DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, gauge, Maximum Power limit
DCGM_FI_DEV_ENFORCED_POWER_LIMIT, gauge, Effective Power limit that the driver enforces after taking into account all limiters
DCGM_FI_DEV_FB_TOTAL, gauge, Total Frame buffer (in MB)
DCGM_FI_DEV_COUNT, gauge, Number of devices on the node

# Static configuration information and features
DCGM_FI_NVML_VERSION, label, NVML Version
DCGM_FI_DEV_BRAND, label, Device Brand
DCGM_FI_DEV_SERIAL, label, Device Serial Number 148 | DCGM_FI_DEV_NAME, label, Device Name 149 | DCGM_FI_DEV_MINOR_NUMBER, label, Device node minor (/dev/nvidia#) 150 | DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY, label, Cuda compute capability for the device (The major version is the upper 32 bits and the minor version is the lower 32 bits) 151 | DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version 152 | DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version 153 | DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version 154 | DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version 155 | DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device 156 | -------------------------------------------------------------------------------- /tests/unit/test_hardware.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import unittest 3 | from unittest import mock 4 | 5 | import pytest 6 | 7 | from hardware import ( 8 | get_bmc_address, 9 | get_cuda_version_from_driver, 10 | get_nvidia_driver_version, 11 | hwinfo, 12 | is_nvidia_driver_loaded, 13 | lshw, 14 | ) 15 | 16 | 17 | class TestHwinfo: 18 | @pytest.mark.parametrize( 19 | "hw_classes,expect_cmd,hwinfo_output,expect", 20 | [ 21 | ( 22 | [], 23 | ["hwinfo"], 24 | ( 25 | "" 26 | "============ start debug info ============" 27 | "random-string" 28 | "random-string" 29 | "random-string" 30 | "random-string" 31 | "=========== end debug info ============" 32 | "10: key-a\n" 33 | " [Created at pci.386]\n" 34 | " Unique ID: unique-id-a\n" 35 | " Parent ID: parent-id-a\n" 36 | "\n" 37 | "11: key-b\n" 38 | " [Created at pci.386]\n" 39 | " Unique ID: unique-id-b\n" 40 | " Parent ID: parent-id-b\n" 41 | ), 42 | { 43 | "10: key-a": ( 44 | "10: key-a\n" 45 | " [Created at pci.386]\n" 46 | " Unique ID: unique-id-a\n" 47 | " Parent ID: parent-id-a" 48 | ), 49 | "11: key-b": ( 50 | "11: key-b\n" 51 | " [Created at pci.386]\n" 52 | " Unique ID: 
unique-id-b\n" 53 | " Parent ID: parent-id-b\n" 54 | ), 55 | }, 56 | ), 57 | ( 58 | ["storage"], 59 | ["hwinfo", "--storage"], 60 | ( 61 | "" 62 | "10: key-a\n" 63 | " [Created at pci.386]\n" 64 | " Unique ID: unique-id-a\n" 65 | " Parent ID: parent-id-a\n" 66 | "\n" 67 | "11: key-b\n" 68 | " [Created at pci.386]\n" 69 | " Unique ID: unique-id-b\n" 70 | " Parent ID: parent-id-b\n" 71 | ), 72 | { 73 | "10: key-a": ( 74 | "10: key-a\n" 75 | " [Created at pci.386]\n" 76 | " Unique ID: unique-id-a\n" 77 | " Parent ID: parent-id-a" 78 | ), 79 | "11: key-b": ( 80 | "11: key-b\n" 81 | " [Created at pci.386]\n" 82 | " Unique ID: unique-id-b\n" 83 | " Parent ID: parent-id-b\n" 84 | ), 85 | }, 86 | ), 87 | ], 88 | ) 89 | @mock.patch("hardware.apt") 90 | @mock.patch("hardware.subprocess.check_output") 91 | def test_hwinfo_output( 92 | self, mock_subprocess, mock_apt, hw_classes, expect_cmd, hwinfo_output, expect 93 | ): 94 | mock_subprocess.return_value = hwinfo_output 95 | output = hwinfo(*hw_classes) 96 | mock_subprocess.assert_called_with(expect_cmd, text=True) 97 | assert output == expect 98 | 99 | 100 | class TestLshw(unittest.TestCase): 101 | @mock.patch("hardware.apt") 102 | @mock.patch("hardware.subprocess.check_output") 103 | def test_lshw_output(self, mock_subprocess, mock_apt): 104 | mock_subprocess.return_value = """[{"expected_output": 1}]""" 105 | for class_filter in [None, "storage"]: 106 | output = lshw(class_filter) 107 | if class_filter is not None: 108 | mock_subprocess.assert_called_with( 109 | f"lshw -json -c {class_filter}".split(), 110 | text=True, 111 | ) 112 | self.assertEqual(output, [{"expected_output": 1}]) 113 | else: 114 | mock_subprocess.assert_called_with( 115 | "lshw -json".split(), 116 | text=True, 117 | ) 118 | self.assertEqual(output, {"expected_output": 1}) 119 | 120 | @mock.patch("hardware.subprocess.check_output") 121 | def test_lshw_dict_output(self, mock_subprocess): 122 | mock_subprocess.return_value = """{"expected_output": 1}""" 123 
| output = lshw() 124 | mock_subprocess.assert_called_with( 125 | "lshw -json".split(), 126 | text=True, 127 | ) 128 | self.assertEqual(output, {"expected_output": 1}) 129 | 130 | @mock.patch( 131 | "hardware.subprocess.check_output", 132 | side_effect=subprocess.CalledProcessError(-1, "cmd"), 133 | return_value="[{}]", 134 | ) 135 | def test_lshw_error_handling(self, mock_subprocess): 136 | with self.assertRaises(subprocess.CalledProcessError): 137 | lshw() 138 | 139 | 140 | class TestGetBMCAddress(unittest.TestCase): 141 | @mock.patch("hardware.apt") 142 | @mock.patch("hardware.subprocess.check_output") 143 | def test_get_bmc_address(self, mock_check_output, mock_apt): 144 | mock_check_output.return_value = """ 145 | Set in Progress : Set Complete 146 | Auth Type Support : NONE MD5 PASSWORD 147 | Auth Type Enable : Callback : MD5 PASSWORD 148 | : User : MD5 PASSWORD 149 | : Operator : MD5 PASSWORD 150 | : Admin : MD5 PASSWORD 151 | : OEM : 152 | IP Address Source : Static Address 153 | IP Address : 10.244.120.100 154 | Subnet Mask : 255.255.252.0 155 | MAC Address : 5a:ba:3c:3b:b4:59 156 | SNMP Community String : 157 | BMC ARP Control : ARP Responses Enabled, Gratuitous ARP Disabled 158 | Default Gateway IP : 10.240.128.1 159 | 802.1q VLAN ID : Disabled 160 | 802.1q VLAN Priority : 0 161 | RMCP+ Cipher Suites : 0,1,2,3 162 | Cipher Suite Priv Max : XXXaXXXXXXXXXXX 163 | : X=Cipher Suite Unused 164 | : c=CALLBACK 165 | : u=USER 166 | : o=OPERATOR 167 | : a=ADMIN 168 | : O=OEM 169 | Bad Password Threshold : Not Available 170 | """.strip() 171 | 172 | output = get_bmc_address() 173 | self.assertEqual(output, "10.244.120.100") 174 | 175 | @mock.patch("hardware.apt") 176 | @mock.patch( 177 | "hardware.subprocess.check_output", 178 | side_effect=subprocess.CalledProcessError(-1, "cmd"), 179 | ) 180 | def test_get_bmc_address_error_handling(self, mock_subprocess, mock_apt): 181 | output = get_bmc_address() 182 | self.assertEqual(output, None) 183 | 184 | 185 | 
@pytest.mark.parametrize("path_exists,expected", [(True, True), (False, False)]) 186 | @mock.patch("hardware.Path.exists") 187 | def test_is_nvidia_driver_loaded(mock_path, path_exists, expected): 188 | mock_path.return_value = path_exists 189 | assert is_nvidia_driver_loaded() == expected 190 | 191 | 192 | @mock.patch("hardware.NVIDIA_DRIVER_PATH") 193 | def test_get_nvidia_driver_version(mock_driver_path): 194 | mock_driver_path.read_text.return_value = ( 195 | "NVRM version: NVIDIA UNIX x86_64 Kernel Module 570.172.08" 196 | ) 197 | result = get_nvidia_driver_version() 198 | assert result == 570 199 | 200 | 201 | @mock.patch("hardware.NVIDIA_DRIVER_PATH") 202 | def test_get_nvidia_driver_version_file_not_found(mock_driver_path): 203 | mock_driver_path.read_text.side_effect = FileNotFoundError 204 | 205 | with pytest.raises(FileNotFoundError): 206 | get_nvidia_driver_version() 207 | 208 | 209 | @pytest.mark.parametrize( 210 | "driver_version, expected", 211 | [ 212 | (590, 13), 213 | (580, 13), 214 | (570, 12), 215 | (525, 12), 216 | (500, 11), 217 | (450, 11), 218 | (400, 10), 219 | (390, 10), 220 | ], 221 | ) 222 | @mock.patch("hardware.get_nvidia_driver_version") 223 | def test_get_cuda_version_from_driver(mock_nvidia_driver, driver_version, expected): 224 | mock_nvidia_driver.return_value = driver_version 225 | assert get_cuda_version_from_driver() == expected 226 | --------------------------------------------------------------------------------