├── empty-resource ├── tests ├── unit │ ├── requirements.txt │ ├── test_os_platform.py │ ├── test_alert_rules │ │ ├── test_general.yaml │ │ ├── test_ipmi_dcmi.yaml │ │ ├── test_mega_raid.yaml │ │ ├── test_perccli.yaml │ │ ├── test_ssacli.yaml │ │ ├── test_ipmi_sel.yaml │ │ └── test_redfish.yaml │ ├── test_apt_helpers.py │ ├── test_checksum.py │ ├── test_literals.py │ ├── test_ssdlc.py │ └── test_hardware.py ├── manual │ ├── etc │ │ ├── 4_deploy_grafana_agent │ │ │ ├── outputs.tf │ │ │ ├── variables.tf │ │ │ ├── main.tf │ │ │ └── terragrunt.hcl │ │ ├── 2_add_k8s_cloud │ │ │ ├── variables.tf │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ └── terragrunt.hcl │ │ ├── 1_add_machine │ │ │ ├── variables.tf │ │ │ ├── outputs.tf │ │ │ ├── terragrunt.hcl │ │ │ └── main.tf │ │ └── 3_deploy_cos │ │ │ ├── variables.tf │ │ │ ├── outputs.tf │ │ │ ├── terragrunt.hcl │ │ │ └── main.tf │ ├── scripts │ │ ├── get-local-ip.sh │ │ ├── get-kubeconfig.sh │ │ ├── get-preferred-ip.sh │ │ ├── wait-for-model-destroyed.sh │ │ ├── wait-for-model.sh │ │ ├── wait-for-application.sh │ │ ├── cleanup.sh │ │ ├── wait-for-microk8s.sh │ │ └── bootstrap.sh │ ├── jobs │ │ ├── eevee │ │ │ ├── job.tpl.yaml │ │ │ └── README.md │ │ ├── kongfu │ │ │ ├── job.tpl.yaml │ │ │ └── README.md │ │ ├── pianta │ │ │ ├── job.tpl.yaml │ │ │ └── README.md │ │ ├── torchtusk │ │ │ ├── job.tpl.yaml │ │ │ └── README.md │ │ ├── submit.sh │ │ └── README.md │ └── README.md ├── functional │ ├── requirements.txt │ ├── bundle.yaml.j2 │ ├── README.md │ └── conftest.py └── integration │ ├── requirements.txt │ ├── offers-overlay.yaml │ ├── export_mock_metrics.py │ ├── mock_data.py │ ├── utils.py │ ├── conftest.py │ └── test_cos_integration.py ├── requirements-dev.txt ├── requirements.txt ├── src ├── storelib_conf.template ├── prometheus_alert_rules │ ├── general.yaml │ ├── mega_raid.yaml │ ├── ipmi_dcmi.yaml │ ├── perccli.yaml │ ├── ssacli.yaml │ ├── ipmi_sel.yaml │ ├── lsi_sas.yaml │ ├── ipmi_sensors.yaml │ ├── dcgm.yaml │ └── 
smart.yaml ├── apt_helpers.py ├── os_platform.py ├── literals.py ├── ssdlc.py ├── config.py ├── prometheus_alert_rules_dynamic │ └── redfish.yaml ├── keys.py ├── hardware.py └── gpu_metrics │ └── dcgm_metrics.csv ├── templates ├── hardware-exporter.service.j2 └── hardware-exporter-config.yaml.j2 ├── .github ├── CODEOWNERS ├── workflows │ ├── test_prometheus_rules.yaml │ ├── release.yaml │ ├── promote.yaml │ └── cos_integration.yaml ├── .jira_sync_config.yaml └── ISSUE_TEMPLATE │ └── bug_report.yaml ├── SECURITY.md ├── DEVELOPMENT.md ├── .gitignore ├── charmcraft.yaml ├── tox.ini ├── README.md ├── config.yaml ├── metadata.yaml ├── CONTRIBUTING.md └── pyproject.toml /empty-resource: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/requirements.txt: -------------------------------------------------------------------------------- 1 | -r ../../requirements-dev.txt 2 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | coverage 3 | flake8 4 | parameterized 5 | -------------------------------------------------------------------------------- /tests/manual/etc/4_deploy_grafana_agent/outputs.tf: -------------------------------------------------------------------------------- 1 | output "model_name" { 2 | value = var.machine_model 3 | } 4 | -------------------------------------------------------------------------------- /tests/manual/scripts/get-local-ip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ip -4 -j a sho dev br0 | jq -r .[].addr_info[0].local 4 | -------------------------------------------------------------------------------- /tests/functional/requirements.txt: 
-------------------------------------------------------------------------------- 1 | async-lru 2 | pytest 3 | pytest-operator 4 | protobuf 5 | tenacity 6 | pydantic < 2 7 | -------------------------------------------------------------------------------- /tests/manual/scripts/get-kubeconfig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p ~/.kube 4 | sudo microk8s config > ~/.kube/config 5 | -------------------------------------------------------------------------------- /tests/manual/etc/2_add_k8s_cloud/variables.tf: -------------------------------------------------------------------------------- 1 | variable "kube_config" { 2 | description = "The file path to read the kube_config from" 3 | } 4 | -------------------------------------------------------------------------------- /tests/manual/scripts/get-preferred-ip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "$(ip -4 -j route get 2.2.2.2 | jq -r '.[] | .prefsrc')-$(ip -4 -j route get 2.2.2.2 | jq -r '.[] | .prefsrc')" 4 | -------------------------------------------------------------------------------- /tests/manual/jobs/eevee/job.tpl.yaml: -------------------------------------------------------------------------------- 1 | # job.yaml 2 | job_queue: eevee 3 | provision_data: 4 | distro: 5 | reserve_data: 6 | ssh_keys: 7 | - 8 | timeout: 21600 9 | -------------------------------------------------------------------------------- /tests/manual/jobs/kongfu/job.tpl.yaml: -------------------------------------------------------------------------------- 1 | # job.yaml 2 | job_queue: kongfu 3 | provision_data: 4 | distro: 5 | reserve_data: 6 | ssh_keys: 7 | - 8 | timeout: 21600 9 | -------------------------------------------------------------------------------- /tests/manual/jobs/pianta/job.tpl.yaml: -------------------------------------------------------------------------------- 1 | # 
job.yaml 2 | job_queue: pianta 3 | provision_data: 4 | distro: 5 | reserve_data: 6 | ssh_keys: 7 | - 8 | timeout: 21600 9 | -------------------------------------------------------------------------------- /tests/manual/jobs/torchtusk/job.tpl.yaml: -------------------------------------------------------------------------------- 1 | # job.yaml 2 | job_queue: torchtusk 3 | provision_data: 4 | distro: 5 | reserve_data: 6 | ssh_keys: 7 | - 8 | timeout: 21600 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cosl 2 | distro 3 | ops >= 2.2.0 4 | jinja2 5 | redfish # requests is included in this 6 | pydantic < 2 7 | git+https://github.com/canonical/prometheus-hardware-exporter.git@v1.1.0#egg=prometheus-hardware-exporter 8 | -------------------------------------------------------------------------------- /tests/manual/etc/2_add_k8s_cloud/outputs.tf: -------------------------------------------------------------------------------- 1 | output "k8s_cloud_name" { 2 | value = juju_kubernetes_cloud.k8s_cloud.name 3 | } 4 | 5 | output "k8s_cloud_credential" { 6 | value = juju_kubernetes_cloud.k8s_cloud.credential 7 | } 8 | -------------------------------------------------------------------------------- /src/storelib_conf.template: -------------------------------------------------------------------------------- 1 | # Debug Level: 2 | # 0 - No Debug 3 | # 1 - Level 1 4 | # 2 - Level 2 5 | DEBUGLEVEL=0 6 | DISABLELOG=1 7 | # Write option on startup 8 | # 0 - Append to existing debug file 9 | # 1 - create new file 10 | OVERWRITE=0 11 | # Directory where debug file will be created 12 | DEBUGDIR=$debug_dir 13 | -------------------------------------------------------------------------------- /tests/integration/requirements.txt: -------------------------------------------------------------------------------- 1 | jinja2 2 | juju~=3.5.0 # must be 
compatible with the juju CLI version installed by CI - see .github/workflows/cos_integration.yaml 3 | pytest 4 | pytest-operator 5 | prometheus-client 6 | pyinstaller # required to bundle export_mock_metrics script to send it to hw-oberver unit 7 | tenacity 8 | -------------------------------------------------------------------------------- /tests/manual/etc/1_add_machine/variables.tf: -------------------------------------------------------------------------------- 1 | variable "ssh_address" { 2 | description = "SSH address of the machine" 3 | } 4 | 5 | variable "public_key_file" { 6 | description = "The file path to read the public key from" 7 | } 8 | 9 | variable "private_key_file" { 10 | description = "The file path to read the private key from" 11 | } 12 | -------------------------------------------------------------------------------- /templates/hardware-exporter.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=HTTP service for prometheus hardware exporter. 
3 | 4 | [Service] 5 | User=root 6 | Environment=PYTHONPATH={{ CHARMDIR }}/venv 7 | ExecStart=/usr/bin/python3 -m prometheus_hardware_exporter -c {{ CONFIG_FILE }} 8 | Restart=on-failure 9 | 10 | [Install] 11 | WantedBy=multi-user.target 12 | 13 | -------------------------------------------------------------------------------- /tests/manual/etc/2_add_k8s_cloud/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | juju = { 4 | version = "~> 0.17.0" 5 | source = "juju/juju" 6 | } 7 | } 8 | } 9 | 10 | provider "juju" {} 11 | 12 | resource "juju_kubernetes_cloud" "k8s_cloud" { 13 | name = "k8s" 14 | kubernetes_config = file(var.kube_config) 15 | } 16 | -------------------------------------------------------------------------------- /tests/manual/scripts/wait-for-model-destroyed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL="$1" 4 | 5 | if [ -z "$MODEL" ]; then 6 | echo "Wait for the model to be destroyed." 7 | echo "" 8 | echo "Usage: $0 " 9 | exit 1 10 | fi 11 | 12 | while juju show-model $MODEL > /dev/null ; do 13 | echo "$MODEL still exists.." 
14 | sleep 5 15 | done; 16 | -------------------------------------------------------------------------------- /tests/manual/etc/1_add_machine/outputs.tf: -------------------------------------------------------------------------------- 1 | output "machine_model" { 2 | value = juju_model.hw-obs.name 3 | } 4 | output "ubuntu_name" { 5 | value = juju_application.ubuntu.name 6 | } 7 | 8 | output "hardware_observer_name" { 9 | value = juju_application.hardware-observer.name 10 | } 11 | 12 | output "machine_base" { 13 | value = juju_machine.machine.base 14 | } 15 | -------------------------------------------------------------------------------- /tests/manual/scripts/wait-for-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL="$1" 4 | 5 | if [ -z "$MODEL" ]; then 6 | echo "Wait for all applications in the model reaches active and idle." 7 | echo "" 8 | echo "Usage: $0 " 9 | exit 1 10 | fi 11 | 12 | juju wait-for model $MODEL --timeout=20m0s --query='forEach(applications, app => app.status == "active")' 13 | -------------------------------------------------------------------------------- /tests/manual/scripts/wait-for-application.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL="$1" 4 | APPLICATION="$2" 5 | 6 | if [ -z "$APPLICATION" ]; then 7 | echo "Wait for an juju application to reach active and idle." 
8 | echo "" 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | juju switch $MODEL 14 | 15 | juju wait-for application $APPLICATION --query='status=="active" || status=="idle"' 16 | -------------------------------------------------------------------------------- /tests/manual/scripts/cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | 5 | # Clean up all resources idempotently 6 | 7 | juju destroy-model hw-obs --no-prompt --force --no-wait || true 8 | juju destroy-model cos --no-prompt --force --no-wait --destroy-storage || true 9 | juju remove-cloud k8s --client --controller "$(juju controllers --format json | jq -r '."current-controller"')" || true 10 | sudo /sbin/remove-juju-services || true 11 | -------------------------------------------------------------------------------- /tests/manual/etc/3_deploy_cos/variables.tf: -------------------------------------------------------------------------------- 1 | variable "machine_model" { 2 | description = "The machine model name" 3 | } 4 | 5 | variable "metallb_ip_ranges" { 6 | description = "The public IP addresses to services running in the Kubernetes cluster" 7 | } 8 | 9 | variable "k8s_cloud_name" { 10 | description = "The name of the k8s cloud" 11 | } 12 | 13 | variable "k8s_cloud_credential" { 14 | description = "The credential for the k8s cloud" 15 | } 16 | -------------------------------------------------------------------------------- /tests/manual/etc/3_deploy_cos/outputs.tf: -------------------------------------------------------------------------------- 1 | output "model_name" { 2 | value = local.cos_model_name 3 | } 4 | 5 | output "receive-remote-write-offer-url" { 6 | value = module.cos-lite-terraform.prometheus-receive-remote-write-offer-url 7 | } 8 | 9 | output "grafana-dashboard-offer-url" { 10 | value = module.cos-lite-terraform.grafana-dashboard-offer-url 11 | } 12 | 13 | output "loki-logging-offer-url" { 14 | value = 
module.cos-lite-terraform.loki-logging-offer-url 15 | } 16 | -------------------------------------------------------------------------------- /tests/manual/jobs/torchtusk/README.md: -------------------------------------------------------------------------------- 1 | # Testable Exporters 2 | 3 | - [x] Prometheus Hardware Exporter 4 | - [x] ipmi_dcmi 5 | - [x] ipmi_sel 6 | - [x] ipmi_sensor 7 | - [x] redfish 8 | - [ ] hpe_ssa (ssacli) 9 | - [ ] lsi_sas_2 (sas2ircu) 10 | - [ ] lsi_sas_3 (sas3ircu) 11 | - [ ] mega_raid (storcli) 12 | - [ ] poweredge_raid (perccli) 13 | - [x] DCGM Exporter (require NVIDIA) 14 | - [x] dcgm 15 | - [x] Smartctl Exporter (require S.M.A.R.T disks) 16 | - [x] smartctl 17 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/general.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: HardwareObserver 3 | rules: 4 | - alert: CollectorFailed 5 | expr: '{__name__=~"(.*)_collector_failed"} == 1' 6 | for: 30m 7 | labels: 8 | severity: error 9 | annotations: 10 | summary: Collector failed. (instance {{ $labels.instance }}) 11 | description: | 12 | A collector failed to fetch the metrics. Please reach out to hardware-observer maintainers. 
13 | LABELS = {{ $labels }} 14 | -------------------------------------------------------------------------------- /templates/hardware-exporter-config.yaml.j2: -------------------------------------------------------------------------------- 1 | port: {{ PORT }} 2 | level: {{ LEVEL }} 3 | collect_timeout: {{ COLLECT_TIMEOUT }} 4 | {% if COLLECTORS | length > 0 %} 5 | enable_collectors: 6 | {% for collector in COLLECTORS %} 7 | - {{collector}} 8 | {% endfor %} 9 | {% endif %} 10 | 11 | {% if REDFISH_ENABLE %} 12 | redfish_host: "{{ REDFISH_HOST }}" 13 | redfish_username: "{{ REDFISH_USERNAME }}" 14 | redfish_password: "{{ REDFISH_PASSWORD }}" 15 | redfish_client_timeout: "{{ REDFISH_CLIENT_TIMEOUT }}" 16 | {% endif %} 17 | -------------------------------------------------------------------------------- /tests/integration/offers-overlay.yaml: -------------------------------------------------------------------------------- 1 | applications: 2 | alertmanager: 3 | offers: 4 | alertmanager-karma-dashboard: 5 | endpoints: 6 | - karma-dashboard 7 | grafana: 8 | offers: 9 | grafana-dashboards: 10 | endpoints: 11 | - grafana-dashboard 12 | loki: 13 | offers: 14 | loki-logging: 15 | endpoints: 16 | - logging 17 | prometheus: 18 | offers: 19 | prometheus-receive-remote-write: 20 | endpoints: 21 | - receive-remote-write 22 | -------------------------------------------------------------------------------- /tests/manual/etc/2_add_k8s_cloud/terragrunt.hcl: -------------------------------------------------------------------------------- 1 | dependency "add_machine" { 2 | config_path = "../1_add_machine" 3 | skip_outputs = true 4 | } 5 | 6 | terraform { 7 | before_hook "create-dot-kube-dir" { 8 | commands = ["plan"] 9 | execute = ["mkdir", "-p", format("%s/.kube", get_env("HOME"))] 10 | } 11 | 12 | before_hook "touch-kubeconfig" { 13 | commands = ["plan"] 14 | execute = ["touch", format("%s/.kube/config", get_env("HOME"))] 15 | } 16 | } 17 | 18 | inputs = { 19 | kube_config = 
format("%s/.kube/config", get_env("HOME")) 20 | } 21 | -------------------------------------------------------------------------------- /tests/manual/jobs/submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | JOB="$1" 6 | SERIES="$2" 7 | SSH_IMPORT_ID="$3" 8 | 9 | if [ ! -d "$JOB" ] || [ -z "$SERIES" ] || [ -z "$SSH_IMPORT_ID" ]; then 10 | echo "Usage: $0 " 11 | exit 1 12 | fi 13 | 14 | # testflinger cannot access /tmp file because it does not have necessary permission 15 | TEMPFILE="./.tmp-job.yaml" 16 | touch $TEMPFILE 17 | 18 | sed -e "s//$SERIES/g" -e "s//$SSH_IMPORT_ID/g" "$JOB/job.tpl.yaml" | tee "$TEMPFILE" 19 | 20 | testflinger submit $TEMPFILE 21 | 22 | rm -f $TEMPFILE 23 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This file is centrally managed as a template file in https://github.com/canonical/solutions-engineering-automation 2 | # To update the file: 3 | # - Edit it in the canonical/solutions-engineering-automation repository. 4 | # - Open a PR with the changes. 5 | # - When the PR merges, the soleng-terraform bot will open a PR to the target repositories with the changes. 6 | # 7 | # These owners will be the default owners for everything in the repo. Unless a 8 | # later match takes precedence, @canonical/soleng-reviewers will be requested for 9 | # review when someone opens a pull request. 
10 | * @canonical/soleng-reviewers 11 | -------------------------------------------------------------------------------- /tests/manual/scripts/wait-for-microk8s.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | APPLICATION="$1" 5 | 6 | TIMEOUT=300 7 | DELAY=5 8 | START=$(date +%s) 9 | 10 | while true; do 11 | if sudo microk8s kubectl create clusterrole test --verb=get --resource=pods --request-timeout=5s >/dev/null 2>&1; then 12 | echo "✅ Kubernetes API is ready" 13 | sudo microk8s kubectl delete clusterrole test --ignore-not-found >/dev/null 2>&1 14 | break 15 | fi 16 | 17 | NOW=$(date +%s) 18 | if (( NOW - START > TIMEOUT )); then 19 | echo "❌ Timed out waiting for Kubernetes API" 20 | exit 1 21 | fi 22 | 23 | echo "⏳ Waiting for Kubernetes API..." 24 | sleep "$DELAY" 25 | done 26 | -------------------------------------------------------------------------------- /tests/functional/bundle.yaml.j2: -------------------------------------------------------------------------------- 1 | # Test basic deployment: 2 | # ubuntu:juju-info <-> grafana-agent:juju-info 3 | # ubuntu:juju-info <-> hardware-observer:general-info 4 | # grafana-agent:cos-agent <-> hardware-observer:cos-agent 5 | 6 | default-base: {{ base }} 7 | 8 | machines: 9 | "0": 10 | 11 | applications: 12 | ubuntu: 13 | charm: ubuntu 14 | num_units: 1 15 | to: 16 | - "0" 17 | grafana-agent: 18 | charm: grafana-agent 19 | channel: 1/stable 20 | hardware-observer: 21 | charm: {{ charm }} 22 | options: 23 | redfish-disable: {{ redfish_disable }} 24 | 25 | relations: 26 | - - grafana-agent:juju-info 27 | - ubuntu:juju-info 28 | - - hardware-observer:general-info 29 | - ubuntu:juju-info 30 | -------------------------------------------------------------------------------- /tests/unit/test_os_platform.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import 
pytest 4 | 5 | from os_platform import OSPlatform, UbuntuSeries, get_os_platform 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "release,series", 10 | [("22.04", UbuntuSeries.JAMMY), ("20.04", UbuntuSeries.FOCAL), ("NR", None)], 11 | ) 12 | @pytest.mark.parametrize("machine", ["AMD64", "x86_86", "arm64", "riscv64"]) 13 | def test_os_platform_series(release, series, machine): 14 | """Get platform from a patched machine.""" 15 | with patch("distro.info", return_value={"version": release}): 16 | with patch("platform.machine", return_value=machine): 17 | result = get_os_platform() 18 | assert result == OSPlatform(release=release, machine=machine) 19 | assert result.series == series 20 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | # Security policy 8 | 9 | 10 | ## Reporting a vulnerability 11 | To report a security issue, file a [Private Security Report](https://github.com/canonical/hardware-observer-operator/security/advisories/new) 12 | with a description of the issue, the steps you took to create the issue, affected versions, and, 13 | if known, mitigations for the issue. 14 | 15 | The [Ubuntu Security disclosure and embargo policy](https://ubuntu.com/security/disclosure-policy) 16 | contains more information about what you can expect when you contact us and what we expect from you. 
17 | -------------------------------------------------------------------------------- /.github/workflows/test_prometheus_rules.yaml: -------------------------------------------------------------------------------- 1 | name: Test prometheus rules 2 | 3 | on: 4 | workflow_call: 5 | workflow_dispatch: 6 | pull_request: 7 | types: [opened, synchronize, reopened] 8 | branches: [main] 9 | paths-ignore: 10 | - "**.md" 11 | - "**.rst" 12 | 13 | concurrency: 14 | group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} 15 | cancel-in-progress: true 16 | 17 | jobs: 18 | promtool: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Checkout repo 22 | uses: actions/checkout@v3 23 | 24 | # prometheus snap includes promtool 25 | - name: Install prometheus snap 26 | run: sudo snap install prometheus 27 | 28 | - name: Check validity of prometheus alert rules 29 | run: | 30 | promtool check rules src/prometheus_alert_rules/*.yaml 31 | 32 | - name: Run unit tests for prometheus alert rules 33 | run: | 34 | promtool test rules tests/unit/test_alert_rules/*.yaml 35 | -------------------------------------------------------------------------------- /tests/manual/etc/4_deploy_grafana_agent/variables.tf: -------------------------------------------------------------------------------- 1 | variable "machine_model" { 2 | description = "The machine model name" 3 | } 4 | 5 | variable "grafana_agent_base" { 6 | description = "The base for grafana agent" 7 | } 8 | 9 | variable "ubuntu_name" { 10 | description = "The name of ubuntu charm" 11 | default = "ubuntu" 12 | } 13 | 14 | variable "hardware_observer_name" { 15 | description = "The name of hardware observer charm" 16 | default = "hardware observer" 17 | } 18 | 19 | variable "receive-remote-write-offer-url" { 20 | description = "Offer URL from prometheus-k8s:receive-remote-write application" 21 | type = string 22 | default = null 23 | } 24 | 25 | variable "grafana-dashboard-offer-url" { 26 | description = "Offer URL from 
grafana-k8s:grafana-dashboard application" 27 | type = string 28 | default = null 29 | } 30 | 31 | variable "loki-logging-offer-url" { 32 | description = "Offer URL from loki-k8s:logging application" 33 | type = string 34 | default = null 35 | } 36 | -------------------------------------------------------------------------------- /tests/manual/jobs/eevee/README.md: -------------------------------------------------------------------------------- 1 | # Testable Exporters 2 | 3 | - [x] Prometheus Hardware Exporter 4 | - [x] ipmi_dcmi 5 | - [x] ipmi_sel 6 | - [x] ipmi_sensor 7 | - [x] redfish 8 | - [x] hpe_ssa (ssacli) 9 | - [ ] lsi_sas_2 (sas2ircu) 10 | - [ ] lsi_sas_3 (sas3ircu) 11 | - [ ] mega_raid (storcli) 12 | - [ ] poweredge_raid (perccli) 13 | - [ ] DCGM Exporter (require NVIDIA) 14 | - [ ] dcgm 15 | - [x] Smartctl Exporter (require S.M.A.R.T disks) 16 | - [x] smartctl 17 | 18 | ## Running the tests 19 | 20 | You can run the functional tests for real hardware by following these steps: 21 | 22 | ```shell 23 | # Adding relation will be tested as part of the test case, so we need to remove it before running the tests 24 | juju remove-relation -m hw-obs hardware-observer grafana-agent 25 | 26 | # We don't have redfish credential for this machine 27 | juju config -m hw-obs hardware-observer redfish-disable=true 28 | 29 | # Running the tests 30 | tox -e func -- -v --realhw --model hw-obs --no-deploy --collectors ipmi_dcmi ipmi_sel ipmi_sensor hpe_ssa --keep-models 31 | ``` 32 | -------------------------------------------------------------------------------- /DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | ## Setup environment 4 | 5 | To start working on this charm, you'll need a working [development setup](https://juju.is/docs/sdk/dev-setup). 
6 | 7 | You can create an environment for development with `tox`: 8 | 9 | ```shell 10 | tox devenv -e integration 11 | source venv/bin/activate 12 | ``` 13 | 14 | ## Testing 15 | 16 | This project uses `tox` for managing test environments. There are some pre-configured environments 17 | that can be used for linting and formatting code when you're preparing contributions to the charm: 18 | 19 | ```shell 20 | tox run -e format # update your code according to linting rules 21 | tox run -e lint # code style 22 | tox run -e unit # unit tests 23 | tox run -e integration # integration tests 24 | tox # runs 'format', 'lint', and 'unit' environments 25 | ``` 26 | 27 | ## Build the charm 28 | 29 | Build the charm in this git repository using: 30 | 31 | ```shell 32 | charmcraft pack 33 | ``` 34 | 35 | 10 | [![Charmhub Badge](https://charmhub.io/hardware-observer/badge.svg)](https://charmhub.io/hardware-observer) 11 | [![Release Edge](https://github.com/canonical/hardware-observer-operator/actions/workflows/release.yaml/badge.svg)](https://github.com/canonical/hardware-observer-operator/actions/workflows/release.yaml) 12 | 13 | # Hardware Observer Operator 14 | 15 | ## Overview 16 | [Charmhub Page](https://charmhub.io/hardware-observer) 17 | 18 | Hardware-observer is a [subordinate machine charm](https://juju.is/docs/sdk/charm-taxonomy#heading--subordinate-charms) that provides monitoring and alerting of hardware resources on bare-metal infrastructure. This charm leverages the following exporters to provide detailed metrics: 19 | 20 | - **Prometheus Hardware Exporter**: For collecting metrics from BMCs and RAID controllers. 21 | - **Smartctl Exporter**: For collecting SMART metrics from storage devices. 
22 | 23 | ### Prometheus Hardware Exporter 24 | Hardware-observer collects and exports Prometheus metrics from BMCs (using the IPMI and newer Redfish protocols) and various SAS and RAID controllers through the use of the [prometheus-hardware-exporter](https://github.com/canonical/prometheus-hardware-exporter) project. It additionally configures Prometheus alert rules that are fired when the status of any metric is suboptimal. 25 | 26 | Appropriate collectors and alert rules are installed based on the availability of one or more of the RAID/SAS controllers mentioned below: 27 | 28 | - Broadcom MegaRAID controller 29 | - Dell PowerEdge RAID Controller 30 | - LSI SAS-2 controller 31 | - LSI SAS-3 controller 32 | - HPE Smart Array controller 33 | 34 | ### Smartctl Exporter 35 | The Smartctl Exporter integrates with the Hardware-observer to provide monitoring of storage device health via SMART data. Metrics are collected and exported to Prometheus using the [smartctl-exporter](https://github.com/prometheus-community/smartctl_exporter). 36 | 37 | This charm is ideal for monitoring hardware resources when used in conjunction with the [Canonical Observability Stack](https://charmhub.io/topics/canonical-observability-stack). 38 | 39 | ## Uploading Resources 40 | 41 | In order to manage third-party hardware resources, vendor-specific CLI tools need to be uploaded via `juju attach-resource`. 42 | 43 | In the [Resources page](https://charmhub.io/hardware-observer/resources) on Charmhub, the name of the resource along with the download URL can be found. 44 | 45 | 46 | ## Other Links 47 | 48 | 49 | 50 | - [Contributing](CONTRIBUTING.md) 51 | 52 | - See the [Juju SDK documentation](https://juju.is/docs/sdk) for more information about developing and improving charms. 53 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Canonical Ltd. 
2 | # See LICENSE file for licensing details. 3 | # 4 | 5 | options: 6 | hardware-exporter-port: 7 | type: int 8 | default: 10200 9 | description: | 10 | Start the prometheus hardware exporter at "hardware-exporter-port". By default, 11 | it will start at port 10200. 12 | smartctl-exporter-port: 13 | type: int 14 | default: 10201 15 | description: | 16 | Start the prometheus smartctl exporter at "smartctl-exporter-port". By default, 17 | it will start at port 10201. 18 | dcgm-snap-channel: 19 | type: string 20 | default: "auto" 21 | description: | 22 | Valid inputs are auto, v3/stable, v3/candidate, v3/edge, v4/stable, v4/candidate, v4/edge. 23 | 24 | When set to auto, the charm automatically checks the installed NVIDIA driver version and 25 | selects the most appropriate channel. For example, with driver 580, it will install from 26 | v4-cuda13/stable. The charm won't block if the driver is not installed or loaded. 27 | 28 | The v3 channels are compatible with CUDA 10, 11, and 12; you simply choose the desired 29 | release risk (stable, candidate, or edge). 30 | 31 | The v4 channels provide more flexibility. Even if you select v4/edge, the charm still 32 | detects the driver version and installs the matching CUDA build. For example, 33 | with driver 570, it will install from v4-cuda12/edge. 34 | smartctl-exporter-snap-channel: 35 | type: string 36 | default: "latest/stable" 37 | description: | 38 | Channel to install the Smartctl exporter snap if the hardware has smart disk. By default, it will install 39 | from latest/stable. 40 | exporter-log-level: 41 | type: string 42 | default: "INFO" 43 | description: | 44 | Start the prometheus exporter with log level "exporter-log-level". By 45 | default, it will set to INFO. Allowed values are "DEBUG", "INFO", 46 | "WARNING", "ERROR", "CRITICAL". Values other than those will result in 47 | failure of the exporter. 
48 | collect-timeout: 49 | type: int 50 | default: 10 51 | description: | 52 | Timeout for collectors' shell commands in seconds. Changing this will also change 53 | the scrape_timeout config option for prometheus for the cos-agent relation with 54 | grafana-agent. 55 | This value is also used for the redfish client's timeout parameter. 56 | The value of this timeout should not be greater than prometheus scrape_interval (which 57 | is 60 seconds by default), as it greater would cause the scrape_timeout to be 58 | greater than scrape_interval. 59 | redfish-username: 60 | type: string 61 | default: "" 62 | description: | 63 | BMC username to be used by the redfish collector. 64 | redfish-password: 65 | type: string 66 | default: "" 67 | description: | 68 | BMC password to be used by the redfish collector. 69 | redfish-disable: 70 | type: boolean 71 | default: true 72 | description: | 73 | By default the Redfish collector is disabled. Set this option to false to enable it. 74 | -------------------------------------------------------------------------------- /tests/unit/test_literals.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | from pydantic import ValidationError 8 | 9 | from literals import HWObserverConfig 10 | 11 | 12 | @pytest.fixture(autouse=True) 13 | def mock_driver_to_cuda(): 14 | with patch("literals.get_cuda_version_from_driver") as mock: 15 | yield mock 16 | 17 | 18 | @pytest.fixture(autouse=True) 19 | def mock_driver_version(): 20 | with patch("literals.get_nvidia_driver_version") as mock: 21 | yield mock 22 | 23 | 24 | @pytest.mark.parametrize("dcgm_config", ["auto"]) 25 | def test_accepts_auto(dcgm_config): 26 | """Test that 'auto' passes validation without errors.""" 27 | cfg = HWObserverConfig(dcgm_snap_channel=dcgm_config) 28 | assert cfg.dcgm_snap_channel == dcgm_config 29 | 30 | 31 | @pytest.mark.parametrize( 32 | "dcgm_config", ["v3/stable", "v3/edge", "v3/candidate", "v4/stable", "v4/edge", "v4/candidate"] 33 | ) 34 | def test_valid_channels(mock_driver_to_cuda, dcgm_config): 35 | """Test valid v3 and v4 channels for supported CUDA versions.""" 36 | mock_driver_to_cuda.return_value = 12 37 | 38 | cfg = HWObserverConfig(dcgm_snap_channel=dcgm_config) 39 | assert cfg.dcgm_snap_channel == dcgm_config 40 | 41 | 42 | @pytest.mark.parametrize("dcgm_config", ["invalid/stable", "foo/edge", "123/candidate"]) 43 | def test_invalid_track(mock_driver_to_cuda, dcgm_config): 44 | """Invalid tracks should raise ValueError.""" 45 | mock_driver_to_cuda.return_value = 12 46 | with pytest.raises(ValidationError) as e: 47 | HWObserverConfig(dcgm_snap_channel=dcgm_config) 48 | assert "Invalid track" in str(e.value) 49 | 50 | 51 | @pytest.mark.parametrize("dcgm_config", ["v3/unknown", "v4/beta", "v3/dev"]) 52 | def test_invalid_risk(mock_driver_to_cuda, dcgm_config): 53 | """Invalid risk should raise ValueError.""" 54 | mock_driver_to_cuda.return_value = 12 55 | with pytest.raises(ValidationError) as e: 56 | HWObserverConfig(dcgm_snap_channel=dcgm_config) 57 | assert "Invalid channel risk" in str(e.value) 58 | 59 | 60 | def 
test_missing_risk(mock_driver_to_cuda): 61 | """Values without the risk should raise ValueError.""" 62 | mock_driver_to_cuda.return_value = 12 63 | with pytest.raises(ValidationError) as e: 64 | HWObserverConfig(dcgm_snap_channel="v3") 65 | assert "Channel must be in the form" in str(e.value) 66 | 67 | 68 | def test_incompatible_v3_with_cuda13(mock_driver_to_cuda): 69 | """v3 should fail if CUDA version is 13 (driver 580+).""" 70 | mock_driver_to_cuda.return_value = 13 71 | with pytest.raises(ValidationError) as e: 72 | HWObserverConfig(dcgm_snap_channel="v3/stable") 73 | assert "not compatible" in str(e.value) 74 | 75 | 76 | def test_incompatible_v4_with_cuda10(mock_driver_to_cuda): 77 | """v4 should fail if CUDA version is 10 (old driver).""" 78 | mock_driver_to_cuda.return_value = 10 79 | with pytest.raises(ValidationError) as e: 80 | HWObserverConfig(dcgm_snap_channel="v4/stable") 81 | assert "not compatible" in str(e.value) 82 | -------------------------------------------------------------------------------- /.github/workflows/cos_integration.yaml: -------------------------------------------------------------------------------- 1 | # This workflow runs a set of integration tests, 2 | # using hardware-observer-operator *from charmhub* (not locally built). 3 | # It is designed to be run periodically to catch potential issues 4 | # from recent changes to either hardware-observer-operator or COS. 
5 | name: COS Integration tests 6 | 7 | on: 8 | workflow_call: 9 | workflow_dispatch: 10 | pull_request: 11 | types: [opened, synchronize, reopened] 12 | branches: [main] 13 | paths: 14 | - ".github/workflows/cos_integration.yaml" 15 | 16 | jobs: 17 | integration: 18 | runs-on: ubuntu-22.04 19 | timeout-minutes: 120 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v3 23 | 24 | - name: Get IP address of the host 25 | run: | 26 | # Finding preferred source ip address by trying to reach destination 2.2.2.2 27 | # This ip address will be used while enabling metallb 28 | echo "IPADDR=$(ip -4 -j route get 2.2.2.2 | jq -r '.[] | .prefsrc')" >> $GITHUB_ENV 29 | 30 | - name: Setup lxd controller 31 | uses: charmed-kubernetes/actions-operator@main 32 | with: 33 | # The juju version can be any stable version, as long as it is the same as libjuju version used. 34 | # If you update it here, update it also in tests/integration/requirements.txt and 'Setup k8s controller' step below 35 | juju-channel: 3.5/stable 36 | provider: lxd 37 | 38 | - name: Save lxd controller name 39 | id: lxd-controller 40 | # The `CONTROLLER_NAME` envvar is set by the actions-operator action 41 | run: echo "name=$CONTROLLER_NAME" >> $GITHUB_OUTPUT 42 | 43 | - name: Setup k8s controller 44 | uses: charmed-kubernetes/actions-operator@main 45 | with: 46 | # The juju version can be any stable version, as long as it is the same as libjuju version used. 
47 | # If you update it here, update it also in tests/integration/requirements.txt and 'Setup lxd controller' step above 48 | juju-channel: 3.5/stable 49 | provider: microk8s 50 | channel: 1.28-strict/stable 51 | microk8s-addons: "hostpath-storage dns metallb:${{ env.IPADDR }}-${{ env.IPADDR }}" 52 | 53 | - name: Save k8s controller name 54 | id: k8s-controller 55 | # The `CONTROLLER_NAME` envvar is set by the actions-operator action 56 | run: echo "name=$CONTROLLER_NAME" >> $GITHUB_OUTPUT 57 | 58 | - name: Fix microk8s permissions 59 | run: chmod -R ugo+rwX ~/.kube 60 | 61 | - name: Run integration tests 62 | run: tox -e integration 63 | env: 64 | K8S_CONTROLLER: ${{ steps.k8s-controller.outputs.name }} 65 | LXD_CONTROLLER: ${{ steps.lxd-controller.outputs.name }} 66 | 67 | - name: Dump debug log 68 | if: failure() 69 | run: | 70 | for ctl in $(juju controllers --format json | jq -r '.controllers | keys[]'); do 71 | for mdl in $(juju models --format json | jq -r '.models[].name' | grep -v "admin/controller"); do 72 | juju debug-log -m $ctl:$mdl --replay --ms --no-tail 73 | done 74 | done || true 75 | shell: bash 76 | -------------------------------------------------------------------------------- /tests/unit/test_alert_rules/test_mega_raid.yaml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - ../../../src/prometheus_alert_rules/mega_raid.yaml 3 | 4 | evaluation_interval: 1m 5 | 6 | tests: 7 | 8 | - interval: 1m 9 | input_series: 10 | - series: 'storcli_command_success{instance="ubuntu-0"}' 11 | values: '0x15' # error 12 | 13 | alert_rule_test: 14 | - eval_time: 0m 15 | alertname: StorcliCommandFailed 16 | exp_alerts: 17 | - exp_labels: 18 | severity: critical 19 | instance: ubuntu-0 20 | exp_annotations: 21 | summary: Failed to run storcli. (instance ubuntu-0) 22 | description: | 23 | Failed to get MegaRAID controller information using storcli. 
24 | VALUE = 0 25 | LABELS = map[__name__:storcli_command_success instance:ubuntu-0] 26 | 27 | 28 | - interval: 1m 29 | input_series: 30 | - series: 'storcli_command_success{instance="ubuntu-1"}' 31 | values: '1x15' 32 | - series: 'megaraid_controllers{instance="ubuntu-1"}' 33 | values: '0x15' # error 34 | 35 | alert_rule_test: 36 | - eval_time: 0m 37 | alertname: MegaRAIDControllerNotFound 38 | exp_alerts: 39 | - exp_labels: 40 | severity: warning 41 | instance: ubuntu-1 42 | exp_annotations: 43 | summary: MegaRAID controller not found. (instance ubuntu-1) 44 | description: | 45 | Cannot found MegaRAID controller on this host machine. 46 | NUMBER_OF_CONTROLLERS = 0 47 | LABELS = map[__name__:megaraid_controllers instance:ubuntu-1] 48 | 49 | 50 | - interval: 1m 51 | input_series: 52 | - series: 'storcli_command_success{instance="ubuntu-2"}' 53 | values: '1x15' 54 | - series: 'megaraid_controllers{instance="ubuntu-2", hostname="ubuntu-2"}' 55 | values: '1x15' 56 | - series: 'megaraid_virtual_drive_info{instance="ubuntu-2", controller_id="0", drive_group="0", virtual_drive_group="239", state="Dgrd", name="NVMe-RAID-1" }' 57 | values: '1x15' # error 58 | - series: 'megaraid_virtual_drive_info{instance="ubuntu-2", controller_id="0", drive_group="0", virtual_drive_group="240", state="Optl", name="NVMe-RAID-2" }' 59 | values: '0x15' # okay 60 | 61 | alert_rule_test: 62 | - eval_time: 0m 63 | alertname: MegaRAIDVirtualDriveNotOptimal 64 | exp_alerts: 65 | - exp_labels: 66 | severity: warning 67 | controller_id: 0 68 | drive_group: 0 69 | virtual_drive_group: 239 70 | name: NVMe-RAID-1 71 | instance: ubuntu-2 72 | state: Dgrd 73 | exp_annotations: 74 | summary: MegaRAID virtual drives are not in optimal state. (instance ubuntu-2) 75 | description: | 76 | MegaRAID virtual drives are not in optimal state. Please check the if the virtual drives are working as expected. 
77 | STATE = Dgrd 78 | LABELS = map[__name__:megaraid_virtual_drive_info controller_id:0 drive_group:0 instance:ubuntu-2 name:NVMe-RAID-1 state:Dgrd virtual_drive_group:239] 79 | -------------------------------------------------------------------------------- /metadata.yaml: -------------------------------------------------------------------------------- 1 | # This file populates the Overview on Charmhub. 2 | 3 | # The charm package name, no spaces (required) 4 | # See https://juju.is/docs/sdk/naming#heading--naming-charms for guidance. 5 | name: hardware-observer 6 | 7 | # The following metadata are human-readable and will be published prominently on Charmhub. 8 | 9 | # (Recommended) 10 | display-name: Hardware Observer 11 | 12 | summary: Subordinate charm for monitoring hardware resources. 13 | 14 | description: Subordinate charm for monitoring hardware resources. 15 | 16 | website: https://github.com/canonical/hardware-observer-operator 17 | 18 | docs: https://discourse.charmhub.io/t/hardware-observer-docs-index/11112 19 | 20 | issues: https://github.com/canonical/hardware-observer-operator/issues 21 | 22 | subordinate: true 23 | 24 | resources: 25 | storcli-deb: 26 | type: file 27 | description: | 28 | (Optional) StorCLI deb file published by Broadcom for their RAID devices. 29 | Download v7.27 from: https://docs.broadcom.com/docs/1232743397. 30 | The download will start automatically upon accepting the license agreement. 31 | Unzip the downloaded file and attach the relevant deb package. 32 | E.g.: 33 | On AMD64 hosts, use ./Unified_storcli_all_os/Ubuntu/storcli_007.2705.0000.0000_all.deb 34 | On ARM64 hosts, use ./Unified_storcli_all_os/ARM/Linux/storcli64_007.2705.0000.0000_arm64.deb 35 | filename: storcli.deb 36 | 37 | perccli-deb: 38 | type: file 39 | description: | 40 | (Optional) PERCCLI deb file published by Dell for their RAID devices. 41 | Download v7.23 from https://www.dell.com/support/home/en-us/drivers/driversdetails?driverid=tdghn. 
42 | Scroll down to "Available Formats" and download the PERCCLI_XXX_Linux.tar.gz file. 43 | Extract the downloaded file and attach the relevant deb package. 44 | E.g.: ./PERCCLI_7.2313.0_A14_Linux/perccli_007.2313.0000.0000_all.deb 45 | Note: perccli is only available for the AMD64 architecture. 46 | filename: perccli.deb 47 | 48 | sas2ircu-bin: 49 | type: file 50 | description: | 51 | (Optional) SAS2IRCU binary file published by Broadcom. 52 | Download vP20 from https://docs.broadcom.com/docs/12351735. 53 | The download will start automatically upon accepting the license agreement. 54 | Unzip the downloaded file and attach the relevant binary. 55 | E.g.: ./SAS2IRCU_P20/sas2ircu_linux_x86_rel/sas2ircu 56 | Note: sas2ircu is only available for the AMD64 architecture. 57 | filename: sas2ircu 58 | 59 | sas3ircu-bin: 60 | type: file 61 | description: | 62 | (Optional) SAS3IRCU binary file published by Broadcom. 63 | Download vP16 from https://docs.broadcom.com/docs/SAS3IRCU_P16.zip. 64 | The download will start automatically upon accepting the license agreement. 65 | Unzip the downloaded file and attach the relevant binary. 66 | E.g.: 67 | On AMD64 hosts, use ./SAS3IRCU_P16/sas3ircu_linux_x64_rel/sas3ircu. 68 | On ARM64 hosts, use ./SAS3IRCU_P16/sas3ircu_linux_arm_rel/sas3ircu. 69 | filename: sas3ircu 70 | 71 | provides: 72 | cos-agent: 73 | interface: cos_agent 74 | limit: 1 75 | 76 | requires: 77 | general-info: 78 | interface: juju-info 79 | scope: container 80 | -------------------------------------------------------------------------------- /tests/manual/jobs/README.md: -------------------------------------------------------------------------------- 1 | # List of Testflinger Jobs for Hardware Observer Manual Tests 2 | 3 | This directory contains a list of job queues on [testflinger][testflinger] that can be used for testing hardware 4 | observer manually. 
Each job queue is defined in a directory with a `README.md` that indicates the testable items on that 5 | machine. 6 | 7 | > [!Note] 8 | > You can only submit jobs defined in this directory! 9 | 10 | The `./submit.sh` script is a simple wrapper for `testflinger submit` that allows users to submit the jobs with customized 11 | [`distro`][job-schema] and [`ssh_keys`][sshkeys] (only supports one ssh key). 12 | 13 | ## Quick Start 14 | 15 | You can allocate a physical machine using the `./submit.sh` script. For example, to allocate a machine from job queue 16 | [`torchtusk`](./torchtusk), and use ubuntu:24.04 (noble) as the OS image, and import ssh key using launchpad ID 17 | `lp:myusername-1234`. Run 18 | 19 | ```shell 20 | $ ./submit.sh torchtusk noble lp:myusername-1234 21 | # job.yaml 22 | job_queue: torchtusk 23 | provision_data: 24 | distro: noble 25 | reserve_data: 26 | ssh_keys: 27 | - lp:myusername-1234 28 | timeout: 21600 29 | Job submitted successfully! 30 | job_id: 25a3b103-26dd-421c-817d-2950f968d327 31 | ``` 32 | 33 | Then, wait for the machine to become available 34 | 35 | ```shell 36 | $ testflinger poll 25a3b103-26dd-421c-817d-2950f968d327 37 | 38 | *************************************************** 39 | * Starting testflinger reserve phase on torchtusk * 40 | *************************************************** 41 | 42 | ... 43 | 44 | Number of key(s) added: 3 45 | 46 | Now try logging into the machine, with: "ssh -o 'StrictHostKeyChecking=no' -o 'UserKnownHostsFile=/dev/null' 'ubuntu@xxx.xxx.xxx.xxx'" 47 | and check to make sure that only the key(s) you wanted were added.
48 | 49 | *** TESTFLINGER SYSTEM RESERVED *** 50 | You can now connect to ubuntu@xxx.xxx.xxx.xxx 51 | Current time: [2025-03-17T05:40:47.103464] 52 | Reservation expires at: [2025-03-17T11:40:47.103513] 53 | Reservation will automatically timeout in 21600 seconds 54 | To end the reservation sooner use: testflinger-cli cancel 25a3b103-26dd-421c-817d-2950f968d327 55 | ``` 56 | 57 | Finally, you can login to the machine using the command provided 58 | 59 | ```shell 60 | ssh -o 'StrictHostKeyChecking=no' -o 'UserKnownHostsFile=/dev/null' 'ubuntu@xxx.xxx.xxx.xxx' # IP address is redacted 61 | ``` 62 | 63 | ## Contributing 64 | 65 | Please add more job queues to this directory to increase test coverage for Hardware Observer. An example contribution of 66 | job queue can be something like the following: 67 | 68 | ```text 69 | torchtusk/ 70 | ├── job.tpl.yaml 71 | └── README.md 72 | ``` 73 | 74 | where the **name of the directory** is the `job_queue`; the file **job.tpl.yaml** is the [job definition][job-schema]; 75 | and `README.md` contains the testable items on that machine. Alternatively, you can simply copy an existing job, and 76 | update the job with different information. 77 | 78 | 79 | [testflinger]: https://certification.canonical.com/docs/ops/tel-labs-docs/how-to/use_machines_through_testflinger/ 80 | [job-schema]: https://canonical-testflinger.readthedocs-hosted.com/en/latest/reference/job-schema.html 81 | [sshkeys]: https://canonical-testflinger.readthedocs-hosted.com/en/latest/reference/test-phases.html#reserve 82 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributor Guide 2 | 3 | Thank you for your interest in helping us improve this project! We're open to 4 | community contributions, suggestions, fixes, and feedback. This documentation 5 | will assist you in navigating through our processes.
6 | 7 | Make sure to review this guide thoroughly before beginning your contribution. It 8 | provides all the necessary details to increase the likelihood of your contribution 9 | being accepted. 10 | 11 | This project is hosted and managed on [GitHub](https://github.com). If you're new to GitHub 12 | and not familiar with how it works, their 13 | [quickstart documentation](https://docs.github.com/en/get-started/quickstart) 14 | provides an excellent introduction to all the tools and processes you'll need 15 | to know. 16 | 17 | ## Prerequisites 18 | 19 | Before you can begin, you will need to: 20 | 21 | * Read and agree to abide by our 22 | [Code of Conduct](https://ubuntu.com/community/code-of-conduct). 23 | 24 | * Sign the Canonical 25 | [contributor license agreement](https://ubuntu.com/legal/contributors). This 26 | grants us your permission to use your contributions in the project. 27 | 28 | * Create (or have) a GitHub account. 29 | 30 | * If you're working in a local environment, it's important to create a signing 31 | key, typically using GPG or SSH, and register it in your GitHub account to 32 | verify the origin of your code changes. For instructions on setting this up, 33 | please refer to 34 | [Managing commit signature verification](https://docs.github.com/en/authentication/managing-commit-signature-verification). 35 | 36 | ## Contributing Code 37 | 38 | ### Workflow 39 | 40 | 1. **Choose/Create an Issue**: Before starting work on an enhancement, create an issue that explains your use case. This helps track progress and keeps the discussion organized. The issue will be tracked on the GitHub issue page. 41 | 42 | 2. **Fork the Repository**: Create a fork of the repository to make your changes. 43 | 44 | 3. **Create a New Branch**: Make sure to create a new branch for your contribution. 45 | 46 | 4. **Commit your changes**: Commit messages should be well-structured and provide a meaningful explanation of the changes made 47 | 48 | 5. 
**Submit a Pull Request**: Submit a pull request to merge your changes into the main branch. Reference the issue by adding issue link or `Fixes: #xxx` (replace `xxx` with the issue number) to automatically link the issue to your PR. 49 | 50 | 6. **Review Process**: A team member will review your pull request. They may suggest changes or leave comments, so keep an eye on the PR status and be ready to make updates if needed. 51 | 52 | 7. **Documentation**: Any documentation changes should be included as part of your PR or as a separate PR linked to your original PR. 53 | 54 | 55 | ### Hard Requirements 56 | 57 | - **Testing and Code Coverage**: Changes must be accompanied by appropriate unit tests and meet the project's code coverage requirements. Functional and integration tests should be added when applicable to ensure the stability of the codebase. 58 | 59 | - **Sign Your Commits**: Be sure to [sign your commits](https://docs.github.com/en/authentication/managing-commit-signature-verification/signing-commits), refer to the [Prerequisites](#prerequisites) section. 60 | 61 | ## Code of Conduct 62 | 63 | This project follows the Ubuntu Code of Conduct. You can read it in full [here](https://ubuntu.com/community/code-of-conduct). 64 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # This file is centrally managed as a template file in https://github.com/canonical/solutions-engineering-automation 2 | # To update the file: 3 | # - Edit it in the canonical/solutions-engineering-automation repository. 4 | # - Open a PR with the changes. 5 | # - When the PR merges, the soleng-terraform bot will open a PR to the target repositories with the changes. 
6 | 7 | [tool.setuptools_scm] 8 | 9 | [tool.flake8] 10 | max-line-length = 99 11 | max-doc-length = 99 12 | max-complexity = 10 13 | exclude = [ 14 | ".git", 15 | "__pycache__", 16 | ".tox", 17 | ".build", 18 | "build", 19 | "dist", 20 | ".eggs", 21 | "*.egg_info", 22 | "venv", 23 | ".venv", 24 | "report", 25 | "docs", 26 | "lib", 27 | "mod", 28 | "hooks/charmhelpers", 29 | "tests/charmhelpers", 30 | ] 31 | select = ["E", "W", "F", "C", "N", "R", "D", "H"] 32 | # Ignore W503, E501 because using black creates errors with this 33 | # Ignore D107 Missing docstring in __init__ 34 | # Ignore D415 Docstring first line punctuation (doesn't make sense for properties) 35 | # Ignore N818 Exceptions end with "Error" (not all exceptions are errors) 36 | # D100, D101, D102, D103: Ignore missing docstrings in tests 37 | ignore = ["C901", "W503", "E501", "D107", "D415", "N818", "D100", "D101", "D102", "D103", "W504"] 38 | per-file-ignores = ["tests/*:D100,D101,D102,D103,D104"] 39 | # Check for properly formatted copyright header in each file 40 | copyright-check = "True" 41 | copyright-author = "Canonical Ltd." 
42 | copyright-regexp = "Copyright\\s\\d{4}([-,]\\d{4})*\\s+%(author)s" 43 | 44 | [tool.black] 45 | line-length = 99 46 | exclude = ''' 47 | /( 48 | | .eggs 49 | | .git 50 | | .tox 51 | | .venv 52 | | .build 53 | | build 54 | | lib 55 | | report 56 | | docs 57 | | mod 58 | | hooks/charmhelpers 59 | | tests/charmhelpers 60 | )/ 61 | ''' 62 | 63 | [tool.isort] 64 | profile = "black" 65 | line_length = 99 66 | skip_glob = [".eggs", ".git", ".tox", ".venv", ".build", "build", "lib", "report", "mod/*", "hooks/charmhelpers", "tests/charmhelpers"] 67 | 68 | [tool.pylint] 69 | max-line-length = 99 70 | disable = ["E1102"] 71 | ignore = ['.eggs', '.git', '.tox', '.venv', '.build', 'lib', 'report', 'tests', 'docs', "mod", "hooks/charmhelpers", "tests/charmhelpers"] 72 | 73 | [tool.mypy] 74 | warn_unused_ignores = true 75 | warn_unused_configs = true 76 | warn_unreachable = true 77 | disallow_untyped_defs = true 78 | ignore_missing_imports = true 79 | no_namespace_packages = true 80 | exclude = ['.eggs', '.git', '.tox', '.venv', '.build', 'lib', 'report', 'tests', 'docs', "mod", "hooks/charmhelpers", "tests/charmhelpers"] 81 | 82 | [tool.codespell] 83 | skip = ".eggs,.tox,.git,.venv,venv,build,.build,lib,report,docs,poetry.lock,htmlcov,mod,hooks/charmhelpers,tests/charmhelpers" 84 | quiet-level = 3 85 | check-filenames = true 86 | ignore-words-list = "assertIn" 87 | 88 | ## Ignore unsupported imports 89 | [[tool.mypy.overrides]] 90 | module = ["charmhelpers.*", "setuptools"] 91 | ignore_missing_imports = true 92 | 93 | [tool.coverage.run] 94 | relative_files = true 95 | source = ["."] 96 | omit = ["tests/**", "docs/**", "lib/**", "snap/**", "build/**", "setup.py", "mod/**", "hooks/charmhelpers/**", "tests/charmhelpers/**"] 97 | 98 | [tool.coverage.report] 99 | fail_under = 100 100 | show_missing = true 101 | 102 | [tool.coverage.html] 103 | directory = "tests/report/html" 104 | 105 | [tool.coverage.xml] 106 | output = "tests/report/coverage.xml" 107 | 
-------------------------------------------------------------------------------- /tests/unit/test_alert_rules/test_perccli.yaml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - ../../../src/prometheus_alert_rules/perccli.yaml 3 | 4 | evaluation_interval: 1m 5 | 6 | tests: 7 | 8 | - interval: 1m 9 | input_series: 10 | - series: 'perccli_command_success{instance="ubuntu-0"}' 11 | values: '0x15' # error: PerccliCommandFailed 12 | alert_rule_test: 13 | - eval_time: 0m 14 | alertname: PerccliCommandFailed 15 | exp_alerts: 16 | - exp_labels: 17 | severity: critical 18 | instance: ubuntu-0 19 | exp_annotations: 20 | summary: Failed to run perccli or controller not available. (instance ubuntu-0) 21 | description: | 22 | Failed to get PowerEdgeRAID controller information using perccli. 23 | INSTANCE = ubuntu-0 24 | SUCCESS = 0 25 | LABELS = map[__name__:perccli_command_success instance:ubuntu-0] 26 | 27 | - interval: 1m 28 | input_series: 29 | - series: 'perccli_command_success{instance="ubuntu-1"}' 30 | values: '1x15' 31 | - series: 'poweredgeraid_controllers{instance="ubuntu-1"}' 32 | values: '0x15' # error: PowerEdgeRAIDControllerNotFound 33 | 34 | alert_rule_test: 35 | - eval_time: 0m 36 | alertname: PowerEdgeRAIDControllerNotFound 37 | exp_alerts: 38 | - exp_labels: 39 | severity: warning 40 | instance: ubuntu-1 41 | exp_annotations: 42 | summary: PowerEdge RAID controller not found. (instance ubuntu-1) 43 | description: | 44 | Cannot find PowerEdge RAID controller on this host machine. 
45 | INSTANCE = ubuntu-1 46 | LABELS = map[__name__:poweredgeraid_controllers instance:ubuntu-1] 47 | 48 | - interval: 1m 49 | input_series: 50 | - series: 'perccli_command_ctrl_success{instance="ubuntu-1", controller_id="0"}' 51 | values: '0x15' 52 | 53 | alert_rule_test: 54 | - eval_time: 0m 55 | alertname: PowerEdgeRAIDControllerSuccess 56 | exp_alerts: 57 | - exp_labels: 58 | severity: critical 59 | instance: ubuntu-1 60 | controller_id: 0 61 | exp_annotations: 62 | summary: PowerEdge RAID controller command not successful. (instance ubuntu-1) 63 | description: | 64 | Failed to get PowerEdge RAID controller information on controller 0. 65 | INSTANCE = ubuntu-1 66 | CONTROLLER_ID = 0 67 | LABELS = map[__name__:perccli_command_ctrl_success controller_id:0 instance:ubuntu-1] 68 | 69 | - interval: 1m 70 | input_series: 71 | - series: 'poweredgeraid_virtual_info{instance="ubuntu-1", controller_id="0", device_group="1", virtual_drive_id="2", state="Dgrd", cache_policy="NRWTD"}' 72 | values: '1x15' 73 | 74 | alert_rule_test: 75 | - eval_time: 0m 76 | alertname: PowerEdgeRAIDVirtualDriveNotOptimal 77 | exp_alerts: 78 | - exp_labels: 79 | severity: warning 80 | instance: ubuntu-1 81 | controller_id: 0 82 | device_group: 1 83 | virtual_drive_id: 2 84 | state: Dgrd 85 | cache_policy: NRWTD 86 | exp_annotations: 87 | summary: PowerEdge RAID virtual drives are not in optimal state. (instance ubuntu-1) 88 | description: | 89 | PowerEdge RAID virtual drives are not in optimal state. Please check the if the virtual drives are working as expected. 
90 | STATE = Dgrd 91 | LABELS = map[__name__:poweredgeraid_virtual_info cache_policy:NRWTD controller_id:0 device_group:1 instance:ubuntu-1 state:Dgrd virtual_drive_id:2] 92 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules_dynamic/redfish.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: Redfish 3 | rules: 4 | - alert: RedfishCallFailed 5 | expr: redfish_call_success == 0 6 | for: 5m 7 | labels: 8 | severity: warning 9 | annotations: 10 | summary: Call to the Redfish API failed. (instance {{ $labels.instance }}) 11 | description: | 12 | Failure in calling the Redfish API. 13 | VALUE = {{ $value }} 14 | LABELS = {{ $labels }} 15 | 16 | - alert: RedfishServiceUnavailable 17 | expr: redfish_service_available == 0 18 | for: 5m 19 | labels: 20 | severity: warning 21 | annotations: 22 | summary: No redfish services available. (instance {{ $labels.instance }}) 23 | description: | 24 | No redfish services available. 25 | VALUE = {{ $value }} 26 | LABELS = {{ $labels }} 27 | 28 | - alert: RedfishSensorHealthNotOk 29 | expr: redfish_sensor_info{health!~"OK|N/A"} 30 | for: 5m 31 | labels: 32 | severity: critical 33 | annotations: 34 | summary: Redfish sensor health not Ok. (instance {{ $labels.instance }}) 35 | description: | 36 | Redfish sensor health not Ok. 37 | SENSOR_READING = {{ $labels.reading }} 38 | LABELS = {{ $labels }} 39 | 40 | - alert: RedfishProcessorHealthNotOk 41 | expr: redfish_processor_info{health!~"OK|NA"} 42 | for: 5m 43 | labels: 44 | severity: critical 45 | annotations: 46 | summary: Redfish processor health not OK. (instance {{ $labels.instance }}) 47 | description: | 48 | Redfish processor health not OK. 
49 | LABELS = {{ $labels }} 50 | 51 | - alert: RedfishStorageControllerHealthNotOk 52 | expr: redfish_storage_controller_info{health!~"OK|NA"} 53 | for: 5m 54 | labels: 55 | severity: critical 56 | annotations: 57 | summary: Redfish storage controller health not OK. (instance {{ $labels.instance }}) 58 | description: | 59 | Redfish storage controller health not OK. 60 | LABELS = {{ $labels }} 61 | 62 | - alert: RedfishChassisHealthNotOk 63 | expr: redfish_chassis_info{health!~"OK|NA"} 64 | for: 5m 65 | labels: 66 | severity: critical 67 | annotations: 68 | summary: Redfish chassis health not OK. (instance {{ $labels.instance }}) 69 | description: | 70 | Redfish chassis health not OK. 71 | LABELS = {{ $labels }} 72 | 73 | - alert: RedfishStorageDriveHealthNotOk 74 | expr: redfish_storage_drive_info{health!~"OK|NA", state="Enabled"} 75 | for: 5m 76 | labels: 77 | severity: critical 78 | annotations: 79 | summary: Redfish storage drive health not OK. (instance {{ $labels.instance }}) 80 | description: | 81 | Redfish storage drive health not OK. 82 | LABELS = {{ $labels }} 83 | 84 | - alert: RedfishMemoryDimmHealthNotOk 85 | expr: redfish_memory_dimm_info{health!~"OK|NA"} 86 | for: 5m 87 | labels: 88 | severity: critical 89 | annotations: 90 | summary: Redfish memory dimm health not OK. (instance {{ $labels.instance }}) 91 | description: | 92 | Redfish memory dimm health not OK. 93 | LABELS = {{ $labels }} 94 | 95 | - alert: RedfishSmartStorageHealthNotOk 96 | expr: redfish_smart_storage_health == 0 97 | for: 5m 98 | labels: 99 | severity: critical 100 | annotations: 101 | summary: Redfish smart storage health not OK. (instance {{ $labels.instance }}) 102 | description: | 103 | Redfish smart storage health not OK. 
104 | VALUE = {{ $value }} 105 | LABELS = {{ $labels }} 106 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/ipmi_sensors.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: IpmiSensors 3 | rules: 4 | - alert: IPMIMonitoringCommandSuccessMetricsMissing 5 | expr: absent_over_time(ipmimonitoring_command_success[5m]) 6 | labels: 7 | severity: critical 8 | annotations: 9 | summary: IPMI monitoring command success metrics missing. (instance {{ $labels.instance }}) 10 | description: | 11 | The ipmimonitoring_command_success metric has been missing for over 5 minutes. 12 | This may indicate IPMI monitoring command timeouts, or that IPMI tools/services are not installed or supported on this hardware. 13 | LABELS = {{ $labels }} 14 | 15 | - alert: IPMIMonitoringCommandFailed 16 | expr: ipmimonitoring_command_success == 0 17 | for: 5m 18 | labels: 19 | severity: critical 20 | annotations: 21 | summary: Failed to run ipmimonitoring. (instance {{ $labels.instance }}) 22 | description: | 23 | Failed to get ipmi sensor data using ipmimonitoring. 24 | VALUE = {{ $value }} 25 | LABELS = {{ $labels }} 26 | 27 | - alert: IPMITemperatureStateNotOk 28 | expr: ipmi_temperature_celsius{state=~"Warning|Critical"} 29 | for: 5m 30 | labels: 31 | severity: "{{ toLower $labels.state }}" 32 | annotations: 33 | summary: Temperature in {{ toLower $labels.state }} state. (instance {{ $labels.instance }}) 34 | description: | 35 | Temperature, recorded by ipmi sensor, in {{ toLower $labels.state }} state. 36 | TEMPERATURE_CELSIUS = {{ $value }} 37 | LABELS = {{ $labels }} 38 | 39 | - alert: IPMIPowerStateNotOk 40 | expr: ipmi_power_watts{state=~"Warning|Critical"} 41 | for: 5m 42 | labels: 43 | severity: "{{ toLower $labels.state }}" 44 | annotations: 45 | summary: Power in {{ toLower $labels.state }} state. 
(instance {{ $labels.instance }}) 46 | description: | 47 | Power, recorded by ipmi sensor, in {{ toLower $labels.state }} state. 48 | POWER_WATTS = {{ $value }} 49 | LABELS = {{ $labels }} 50 | 51 | - alert: IPMIVoltageStateNotOk 52 | expr: ipmi_voltage_volts{state=~"Warning|Critical"} 53 | for: 5m 54 | labels: 55 | severity: "{{ toLower $labels.state }}" 56 | annotations: 57 | summary: Voltage in {{ toLower $labels.state }} state. (instance {{ $labels.instance }}) 58 | description: | 59 | Voltage, recorded by ipmi sensor, in {{ toLower $labels.state }} state. 60 | VOLTAGE_VOLTS = {{ $value }} 61 | LABELS = {{ $labels }} 62 | 63 | - alert: IPMICurrentStateNotOk 64 | expr: ipmi_current_amperes{state=~"Warning|Critical"} 65 | for: 5m 66 | labels: 67 | severity: "{{ toLower $labels.state }}" 68 | annotations: 69 | summary: Current in {{ toLower $labels.state }} state. (instance {{ $labels.instance }}) 70 | description: | 71 | Current, recorded by ipmi sensor, in {{ toLower $labels.state }} state. 72 | CURRENT_AMPERES = {{ $value }} 73 | LABELS = {{ $labels }} 74 | 75 | - alert: IPMIFanSpeedStateNotOk 76 | expr: ipmi_fan_speed_rpm{state=~"Warning|Critical"} 77 | for: 5m 78 | labels: 79 | severity: "{{ toLower $labels.state }}" 80 | annotations: 81 | summary: Fan speed in {{ toLower $labels.state }} state. (instance {{ $labels.instance }}) 82 | description: | 83 | Fan speed, recorded by ipmi sensor, in {{ toLower $labels.state }} state. 
84 | FAN_SPEED_RPM = {{ $value }} 85 | LABELS = {{ $labels }} 86 | 87 | # Entity Presence sensors are ignored since the state doesn't correspond to a real alert 88 | # Slot Connector sensors are ignored since they raise a high number of false positive alerts 89 | - alert: IPMISensorStateNotOk 90 | expr: ipmi_generic_sensor_value{state=~"Warning|Critical", type!~"Entity\\sPresence|Slot/Connector"} 91 | for: 5m 92 | labels: 93 | severity: "{{ toLower $labels.state }}" 94 | annotations: 95 | summary: IPMI sensor value in {{ toLower $labels.state }} state. (instance {{ $labels.instance }}) 96 | description: | 97 | A sensor value, recorded by ipmi sensor, in {{ toLower $labels.state }} state. Entity Presence and Slot Connector sensors are ignored. 98 | VALUE = {{ $value }} 99 | LABELS = {{ $labels }} 100 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/dcgm.yaml: -------------------------------------------------------------------------------- 1 | # The alerts use DCGM_FI_DEV_CLOCK_THROTTLE_REASONS metric to detect throttling events on NVIDIA GPUs, 2 | # which is a bitmask of throttle reasons found here: https://docs.nvidia.com/datacenter/dcgm/2.1/dcgm-api/group__dcgmFieldConstants.html. 3 | # The 8 least significant bits are used for the alerts, with each bit representing a different throttle reason. 4 | 5 | groups: 6 | - name: NVIDIA DCGM Throttling Alerts 7 | rules: 8 | - alert: GPUPowerBrakeThrottle 9 | # isolate the least significant 8 bits with % 256 10 | # check whether bit 7 (starts from bit 0) has been set with the >= bool 128 comparison 11 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 256 >= 128 12 | for: 5m 13 | labels: 14 | severity: warning 15 | annotations: 16 | summary: GPU Hardware Power Brake Slowdown throttling detected. 
(instance {{ $labels.Hostname }}) 17 | description: | 18 | HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }} 19 | This is an indicator of: 20 | - External Power Brake Assertion being triggered (e.g. by the system power supply) 21 | LABELS = {{ $labels }} 22 | - alert: GPUThermalHWThrottle 23 | # isolate the least significant 7 bits with % 128 24 | # check whether bit 6 (starts from bit 0) has been set with the >= bool 64 comparison 25 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 128 >= 64 26 | for: 5m 27 | labels: 28 | severity: warning 29 | annotations: 30 | summary: GPU Hardware Thermal throttling detected. (instance {{ $labels.Hostname }}) 31 | description: | 32 | HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }} 33 | This is an indicator of: 34 | - Temperature being too high 35 | LABELS = {{ $labels }} 36 | - alert: GPUThermalSWThrottle 37 | # isolate the least significant 6 bits with % 64 38 | # check whether bit 5 (starts from bit 0) has been set with the >= bool 32 comparison 39 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 64 >= 32 40 | for: 5m 41 | labels: 42 | severity: warning 43 | annotations: 44 | summary: GPU Software Thermal throttling detected. (instance {{ $labels.Hostname }}) 45 | description: | 46 | SW Thermal Slowdown is engaged on NVIDIA GPU: {{ $labels.gpu }} 47 | This is an indicator of: 48 | - Current GPU temperature above the GPU Max Operating Temperature 49 | - Current memory temperature above the Memory Max Operating Temperature 50 | LABELS = {{ $labels }} 51 | - alert: GPUSyncBoostThrottle 52 | # isolate the least significant 5 bits with % 32 53 | # check whether bit 4 (starts from bit 0) has been set with the >= bool 16 comparison 54 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 32 >= 16 55 | for: 5m 56 | labels: 57 | severity: warning 58 | annotations: 59 | summary: GPU Sync Boost throttling detected. 
(instance {{ $labels.Hostname }}) 60 | description: | 61 | This NVIDIA GPU: {{ $labels.gpu }} has been added to a Sync boost group with nvidia-smi or DCGM in order to maximize performance per watt. 62 | All GPUs in the sync boost group will boost to the minimum possible clocks across the entire group. 63 | Look at the throttle reasons for other GPUs in the system to see why those GPUs are holding this one at lower clocks. 64 | LABELS = {{ $labels }} 65 | - alert: GPUSlowdownThrottle 66 | # isolate the least significant 4 bits with % 16 67 | # check whether bit 3 (starts from bit 0) has been set with the >= bool 8 comparison 68 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 16 >= 8 69 | for: 5m 70 | labels: 71 | severity: warning 72 | annotations: 73 | summary: GPU Hardware Slowdown throttling detected. (instance {{ $labels.Hostname }}) 74 | description: | 75 | HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }} 76 | This is an indicator of: 77 | - Temperature being too high 78 | - External Power Brake Assertion is triggered (e.g. by the system power supply) 79 | - Power draw is too high and Fast Trigger protection is reducing the clocks 80 | - May be also reported during PState or clock change 81 | LABELS = {{ $labels }} 82 | - alert: GPUPowerThrottle 83 | # isolate the least significant 3 bits with % 8 84 | # check whether bit 2 (starts from bit 0) has been set with the >= bool 4 comparison 85 | expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 8 >= 4 86 | for: 5m 87 | labels: 88 | severity: warning 89 | annotations: 90 | summary: GPU Software Power throttling detected. 
(instance {{ $labels.Hostname }}) 91 | description: | 92 | SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: {{ $labels.gpu }} 93 | LABELS = {{ $labels }} 94 | -------------------------------------------------------------------------------- /tests/unit/test_alert_rules/test_ssacli.yaml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - ../../../src/prometheus_alert_rules/ssacli.yaml 3 | 4 | evaluation_interval: 1m 5 | 6 | tests: 7 | 8 | - interval: 1m 9 | input_series: 10 | - series: 'ssacli_command_success{instance="ubuntu-0"}' 11 | values: '0x15' 12 | 13 | alert_rule_test: 14 | - eval_time: 0m 15 | alertname: SsaCLICommandFailed 16 | exp_alerts: 17 | - exp_labels: 18 | severity: critical 19 | instance: ubuntu-0 20 | exp_annotations: 21 | summary: Failed to run ssacli. (instance ubuntu-0) 22 | description: | 23 | Failed to get storage array information using ssacli. 24 | VALUE = 0 25 | LABELS = map[__name__:ssacli_command_success instance:ubuntu-0] 26 | 27 | 28 | - interval: 1m 29 | input_series: 30 | - series: 'ssacli_command_success{instance="ubuntu-1"}' 31 | values: '1x15' 32 | - series: 'ssacli_controllers{instance="ubuntu-1"}' 33 | values: '0x15' 34 | 35 | alert_rule_test: 36 | - eval_time: 0m 37 | alertname: SsaCLIControllerNotFound 38 | exp_alerts: 39 | - exp_labels: 40 | severity: warning 41 | instance: ubuntu-1 42 | exp_annotations: 43 | summary: ssacli controller not found. (instance ubuntu-1) 44 | description: | 45 | Cannot find ssacli controller on this host machine. 
46 | NUMBER_OF_CONTROLLERS = 0 47 | LABELS = map[__name__:ssacli_controllers instance:ubuntu-1] 48 | 49 | 50 | - interval: 1m 51 | input_series: 52 | - series: 'ssacli_command_success{instance="ubuntu-2"}' 53 | values: '1x15' 54 | - series: 'ssacli_controller_info{instance="ubuntu-2", part="Cache Status", status="DOWN"}' 55 | values: '1x15' 56 | 57 | alert_rule_test: 58 | - eval_time: 0m 59 | alertname: SsaCLIControllerNotOK 60 | exp_alerts: 61 | - exp_labels: 62 | severity: critical 63 | instance: ubuntu-2 64 | part: Cache Status 65 | status: DOWN 66 | exp_annotations: 67 | summary: ssacli controller status not Ok. (instance ubuntu-2) 68 | description: | 69 | SSACLI controller status not OK. 70 | STATUS = DOWN 71 | LABELS = map[__name__:ssacli_controller_info instance:ubuntu-2 part:Cache Status status:DOWN] 72 | 73 | 74 | - interval: 1m 75 | input_series: 76 | - series: 'ssacli_controller_info{instance="ubuntu-2", part="Cache Status", status="NOT CONFIGURED"}' 77 | values: '1x15' 78 | - series: 'ssacli_controller_info{instance="ubuntu-3", part="Cache Status", status="OK"}' 79 | values: '1x15' 80 | alert_rule_test: 81 | - eval_time: 0m 82 | alertname: SsaCLIControllerNotOK 83 | # Expect no alerts when status is NOT CONFIGURED or OK 84 | exp_alerts: [] 85 | 86 | 87 | - interval: 1m 88 | input_series: 89 | - series: 'ssacli_command_success{instance="ubuntu-3"}' 90 | values: '1x15' 91 | - series: 'ssacli_logical_drive_info{instance="ubuntu-3", slot="2", status="DOWN"}' 92 | values: '1x15' 93 | - series: 'ssacli_logical_drive_info{instance="ubuntu-11", slot="2", status="OK"}' 94 | values: '1x15' 95 | 96 | alert_rule_test: 97 | - eval_time: 0m 98 | alertname: SsaCLILogicalDriveNotOK 99 | exp_alerts: 100 | - exp_labels: 101 | severity: critical 102 | instance: ubuntu-3 103 | slot: "2" 104 | status: DOWN 105 | exp_annotations: 106 | summary: ssacli logical drive status not Ok. (instance ubuntu-3) 107 | description: | 108 | SSACLI logical drive status not OK. 
109 | STATUS = DOWN 110 | LABELS = map[__name__:ssacli_logical_drive_info instance:ubuntu-3 slot:2 status:DOWN] 111 | 112 | 113 | - interval: 1m 114 | input_series: 115 | - series: 'ssacli_command_success{instance="ubuntu-4"}' 116 | values: '1x15' 117 | - series: 'ssacli_physical_drive_info{instance="ubuntu-4", slot="2", status="CORRUPT"}' 118 | values: '1x15' 119 | - series: 'ssacli_physical_drive_info{instance="ubuntu-12", slot="2", status="OK"}' 120 | values: '1x15' 121 | 122 | alert_rule_test: 123 | - eval_time: 0m 124 | alertname: SsaCLIPhysicalDriveNotOK 125 | exp_alerts: 126 | - exp_labels: 127 | severity: critical 128 | instance: ubuntu-4 129 | slot: "2" 130 | status: CORRUPT 131 | exp_annotations: 132 | summary: ssacli physical drive status not Ok. (instance ubuntu-4) 133 | description: | 134 | SSACLI physical drive status not OK. 135 | STATUS = CORRUPT 136 | LABELS = map[__name__:ssacli_physical_drive_info instance:ubuntu-4 slot:2 status:CORRUPT] 137 | -------------------------------------------------------------------------------- /src/keys.py: -------------------------------------------------------------------------------- 1 | """Static parameters for keys.""" 2 | 3 | HPPUBLICKEY1024 = """ 4 | -----BEGIN PGP PUBLIC KEY BLOCK----- 5 | Version: GnuPG v1.4.0 (MingW32) 6 | 7 | mQGiBEIxWpoRBADb06sJgnD7MJnm2Ny1nmTFLDSZ8vkubP+pmfn9N9TE26oit+KI 8 | OnVTRVbSPl3F15wTjSBGR453MEfnzp1NrMk1GIa/m1nKAmgQ4t1714C4jQab0to+ 9 | gP51XhPhtAGt7BggorQw2RXa4KdTCh8ByOIaDKRYcESmMazSZ+Pscy2XRwCgm771 10 | 21RCM0RcG2dmHZZgKH8fTscD/RiY3CHI2jJl9WosIYXbZpOySzrLn0lRCRdNdpew 11 | Y5m1f3lhqoSvJk7pXjs4U+3XlOlUhgWl5HiXuWSVyPu2ilfGdfgpJslawI85fBQg 12 | Ul5kcrjLHHsApeG8oGStFJE2JAc+0D+whmGmJbjWKwuZJmgpm9INplA4h1BYJbx+ 13 | 6A3MBACFiMTttDPpJ+5eWr1VSZwxCZNqvPWmjpL5Nh9F8xzE7q+ad2CFKSebvRrv 14 | Jf7Y2m+wY9bmo5nJ3wHYEX3Aatt+QVF10G6wTdIz/Ohm/Pc4Li4NhzYOv7FKxVam 15 | 97UN0O8Rsl4GhE2eE8H+Q3QYFvknAWoTj3Rq3/A5FA6FsRFhxbQwSGV3bGV0dC1Q 16 | YWNrYXJkIENvbXBhbnkgKEhQIENvZGVzaWduaW5nIFNlcnZpY2UpiGQEExECACQF 
17 | AkIxWpoCGwMFCRLMAwAGCwkIBwMCAxUCAwMWAgECHgECF4AACgkQUnvFOiaJuIc1 18 | 2wCgj2UotUgSegPHmcKdApY+4WFaz/QAnjI58l5bDD8eElBCErHVoq9uPMczuQIN 19 | BEIxWqUQCADnBXqoU8QeZPEy38oI0GrN2q7nvS+4UBQeIRVy8x+cOqDRDcE8PHej 20 | 7NtxP698U0WFGK47GszjiV4WTnvexuJk0B5AMEBHana8fVj7uRUcmyYZqOZd7EXn 21 | Q3Ivi8itfkTICkhZi7bmGsSF0iJ0eAI5n2bCqJykNQvJ6a3dWJKP8EgaBCZj+TGL 22 | WWJHDZsrn8g4BeaNS/MbmsCLAk8N6bWMGzAKfgxUraMCwuZ9fVyHFavHdeChUtna 23 | qnF4uw0hHLaGWmTJjziXVvVC1a8+inTxPZkVpAvD0A+/LNlkP7TtAdaVOJqv3+a3 24 | ybMQL851bRTFyt+H0XGHhzhhtuu9+DyfAAMFCADRWGxIfniVG7O4wtwLD3sWzR/W 25 | LmFlJYu4s9rSDgn3NDjigQzZoVtbuv3Z9IZxBMoYa50MuybuVDp55z/wmxvYoW2G 26 | 25kOFDKx/UmkKkUBLdokb5V1p9j5SJorGBSfsNAHflhmBhyuMP4CDISbBUSN7oO1 27 | Oj41jNxpqhy+8ayygSVcTNwMe909J/HdC//xFANLDhjKPf3ZAulWNhOvjTlpF46B 28 | yt1l8ZNinIeE7CFL7H+LlMl2Ml6wsOkrxsSauBis6nER4sYVqrMdzpUU2Sr2hj6Q 29 | sJ+9TS+IURcnxL/M851KCwLhwZKdphQjT3mXXsoCx/l3rI6cxpwYgjiKiZhOiE8E 30 | GBECAA8FAkIxWqUCGwwFCRLMAwAACgkQUnvFOiaJuIenewCdHcEvMxBYprqRjKUw 31 | 04EypyFtZTgAn0wds0nbpd2+VZ5WHbVRfU4y5Y5Y 32 | =+cX+ 33 | -----END PGP PUBLIC KEY BLOCK----- 34 | """ 35 | 36 | HPPUBLICKEY2048 = """ 37 | -----BEGIN PGP PUBLIC KEY BLOCK----- 38 | Version: GnuPG v1.4.10 (MingW32) 39 | 40 | mQENBFC+QboBCAC1bodHD7AmR00SkDMB4u9MXy+Z5vv8wbmGRaKDBYScpAknOljX 41 | d5tBADffAetd1hgLnrLKN8vHdIsYkmUyeEeEsnIUKtwvbx/f6PoZZPOIIIRh1d2W 42 | Mjw9qXIE+tgr2gWlq0Gi5BZzaKse1+khRQ2rewJBppblSGWgcmCMIq8OwAsrdbtr 43 | z7+37c/g/Y2VfAahc23YZW9LQ5MiaI4nS4JMZbWPYtBdF78B/D2t5FvmvDG0Cgjk 44 | Qi1U9IVjiFKixuoi6nRsvBLFYL/cI+vo4iyUC5x7qmKd8gN7A030gS67VrleNRki 45 | q0vaF6J46XpIl4o58t23FSAKKRbTwavYzdMpABEBAAG0NEhld2xldHQtUGFja2Fy 46 | ZCBDb21wYW55IFJTQSAoSFAgQ29kZXNpZ25pbmcgU2VydmljZSmJAT4EEwECACgF 47 | AlC+QboCGwMFCRLMAwAGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheAAAoJELBwaApc 48 | 4tR2x7sH/A3D4XxEEyrX6Z3HeWSSA80+n+r5QwfXm5unxsWEL3JyNg6sojlrJY4K 49 | 8k4ih4nkY4iblChTCSQwnqKXqkL5U+RIr+AJoPx+55M98u4eRTVYMHZD7/jFq85z 50 | ZFGUkFkars9E2aRzWhqbz0LINb9OUeX0tT5qQseHflO2PaJykxNPC14WhsBKC2lg 51 | 
dZWnGhO5QJFp69AnSp4k+Uo/1LMk87YEJIL1NDR0lrlKgRvFfFyTpRBt+Qb1Bb7g 52 | rjN0171g8t5GaPWamN3Oua/v4aZg15f3xydRF8y9TsYjiNz+2TzRjKv7AkpZaJST 53 | 06CqMjCgiZ6UFFGN0/oqLnwxdP3Mmh4= 54 | =aphN 55 | -----END PGP PUBLIC KEY BLOCK----- 56 | """ 57 | 58 | HPPUBLICKEY2048_KEY1 = """ 59 | -----BEGIN PGP PUBLIC KEY BLOCK----- 60 | Version: GnuPG v1.4.12 (MingW32) 61 | 62 | mQENBFRtGAgBCADlSku65P14hVdx9E/W0n6MwuB3WGqmsyKNoa3HezFdMjWERldI 63 | NNUdi8O28cZ6j2+Hi9L1HeQIQ9+7FHpR3JyQePBJtRX8WSEusfRtML98opDhJxKm 64 | 8Jyxb7aTvCwdNHz3yxADINkMtOj5oRm7VCr8XHkG7YU27ELs8B+BXWvjO21oSosi 65 | FurnhT+H3hQsYXfYA55aa21q0qX+L5dFJSNdzZVo7m9ybioVv2R5+PfBvdaSxCnm 66 | OpcGXFaKAsqVHeTW0pd3sdkin1rkbhOBaU5lFBt2ZiMtKpKHpT8TZnqHpFHFbgi8 67 | j2ARJj4IDct2OGILddUIZSFyue6WE2hpV5c/ABEBAAG0OEhld2xldHQtUGFja2Fy 68 | ZCBDb21wYW55IFJTQSAoSFAgQ29kZXNpZ25pbmcgU2VydmljZSkgLSAxiQE+BBMB 69 | AgAoBQJUbRgIAhsDBQkSzAMABgsJCAcDAgYVCAIJCgsEFgIDAQIeAQIXgAAKCRD6 70 | 3Y1ksSdeo6BJCADOfIPPLPpIOnFK9jH4t8lLUd+RyMc+alA3uTDPUJa/ZHa6DHfh 71 | 42iaPYVEV8OG0tnbMlHmwvsZ5c1/MRMw1UbxCvD88P2qM4SUrUjQUlSCms2GLGvF 72 | ftFXBiOJQ7/yBc9o+yoSvwPrrTxSCk4+Sqm0IfVXVzChDM9dM9YPY2Vzjd+LUaYC 73 | 3X+eSuggUDO0TmJLJd7tZdF9fVXq3lr63BZ5PY98MTCuOoeSMDa9FIUQf6vn6UUJ 74 | MDSRZ9OzhpNJOKR+ShVRwDK6My8gtVIW1EAW2w3VQWI2UNF07aLeO8UG6nTNWA23 75 | +OuZkUdgQovjcq01caSefgOkmiQOx6d74CAk 76 | =X+eo 77 | -----END PGP PUBLIC KEY BLOCK----- 78 | """ 79 | 80 | HPEPUBLICKEY2048_KEY1 = """ 81 | -----BEGIN PGP PUBLIC KEY BLOCK----- 82 | Version: GnuPG v1.4.12 (GNU/Linux) 83 | 84 | mQENBFZp0LkBCACXajRw3b4x7G7dulNYj0hUID4BtVFq/MjEb6PHckTxGxZDoQRX 85 | RK54tiTFA9wq3b4P3yEFnOjbjRoI0d7Ls67FADugFO+cDCtsV9yuDlaYP/U/h2nX 86 | N0R4AdYbsVd5yr6xr+GAy66Hmx5jFH3kbC+zJpOcI0tU9hcyU7gjbxu6KQ1ypI2Q 87 | VRKf8sRBJXgmkOlbYx35ZUMFcmVxrLJXvUuxmAVXgT9f5M3Z3rsGt/ab+/+1TFSb 88 | RsaqHsIPE0QH8ikqW4IeDQAo1T99pCdf7FWr45KFFTo7O4AZdLMWVgqeFHaSoZxJ 89 | 307VIINsWiwQoPp0tfU5NOOOwB1Sv3x9QgFtABEBAAG0P0hld2xldHQgUGFja2Fy 90 | ZCBFbnRlcnByaXNlIENvbXBhbnkgUlNBLTIwNDgtMjUgPHNpZ25ocEBocGUuY29t 91 | 
PokBPQQTAQIAJwUCVmnQuQIbLwUJEswDAAYLCQgHAwIGFQgCCQoLAxYCAQIeAQIX 92 | gAAKCRDCCK3eJsK3l9G+B/0ekblsBeN+xHIJ28pvo2aGb2KtWBwbT1ugI+aIS17K 93 | UQyHZJUQH+ZeRLvosuoiQEdcGIqmOxi2hVhSCQAOV1LAonY16ACveA5DFAEBz1+a 94 | WQyx6sOLLEAVX1VqGlBXxh3XLEUWOhlAf1gZPNtHsmURTUy2h1Lv/Yoj8KLyuK2n 95 | DmrLOS3Ro+RqWocaJfvAgXKgt6Fq/ChDUHOnar7lGswzMsbE/yzLJ7He4y89ImK+ 96 | 2ktR5HhDuxqgCe9CWH6Q/1WGhUa0hZ3nbluq7maa+kPe2g7JcRzPH/nJuDCAOZ7U 97 | 6mHE8j0kMQMYjgaYEx2wc02aQRmPyxhbDLjSbtjomXRr 98 | =voON 99 | -----END PGP PUBLIC KEY BLOCK----- 100 | """ 101 | 102 | HP_KEYS = [ 103 | HPEPUBLICKEY2048_KEY1, 104 | HPPUBLICKEY2048_KEY1, 105 | HPPUBLICKEY2048, 106 | HPPUBLICKEY1024, 107 | ] 108 | -------------------------------------------------------------------------------- /tests/functional/README.md: -------------------------------------------------------------------------------- 1 | # Functional Tests for the Hardware Observer Charm 2 | There are 2 main types of functional tests for the Hardware Observer charm - those which depend on 3 | real hardware to be present and those that can run without it. 4 | 5 | Here, "real hardware" refers to machines that are not VMs or containers and have access to real 6 | hardware resources like RAID cards and BMC management tools. 7 | 8 | Note: the built charm must be present in the root of the project's directory for the tests to run. 9 | 10 | ## Hardware Independent Tests 11 | These are the tests for hardware observer that do not require any real hardware. 12 | 13 | Hardware independent tests are run on every PR / weekly scheduled test run. 14 | 15 | These include: 16 | * Testing whether juju config changes produce the required results 17 | 18 | Running these tests is as simple as executing the `tox -e func -- -v` 19 | 20 | ## Hardware Dependent Tests 21 | These are the tests that depend on real hardware to be executed. This is performed manually when 22 | required, for example - validating the charm's full functionality before a new release. 
23 | 24 | Hardware dependent tests are present in the `TestCharmWithHW` class in the `test_charm.py` module. 25 | The pytest marker `realhw` has been added to this class (which would include all the tests in this 26 | class). 27 | 28 | These tests will only be executed if the `--realhw` option for pytest is provided. Additionally, 29 | the `--collectors` option with space separated values can be provided, if specific hardware is 30 | present. Check the `conftest.py` for options. Otherwise, all these tests are skipped (this is done 31 | by checking for the presence of the `realhw` marker mentioned earlier.) 32 | 33 | Note: The operator must set up a test model with the machine added beforehand. The machine must be 34 | an actual host, containers or VMs won't work. 35 | Note: depending on the test, certain prerequisites are needed, e.g. having set up an nvidia driver. 36 | Check the tests' docstrings for details. 37 | 38 | Some of these tests include: 39 | * Check if all collectors are detected in the exporter config file 40 | * Test if metrics are available at the expected endpoint 41 | * Test if metrics specific to the collectors being tested are available 42 | * Test if smarctl-exporter snap is installed and running 43 | * Test if the dcgm snap is installed 44 | 45 | and more. 46 | 47 | In order to run these tests, several prerequisites may need to be completed. 48 | 1. Setup test environment 49 | 1. Build the charm 50 | 1. Add environment variables for Redfish credentials (if testing redfish). 51 | 1. Setup required resource files (if testing hardware raid). 52 | 1. Install the NVIDIA gpu driver and add the `--nvidia` flag (if testing NVIDIA gpu observability). 53 | 1. Find supported collectors 54 | 55 | ### 1. Setup test environment 56 | 57 | You can refer to dev-environment.md here, up to the "Add physical machine" section included. 
58 | The end result should be a test model with a manually provisioned machine listed: 59 | 60 | ``` 61 | $ juju status 62 | Model Controller Cloud/Region Version SLA Timestamp 63 | test lxd-controller localhost/localhost 3.6.1 unsupported 01:39:10Z 64 | 65 | Machine State Address Inst id Base AZ Message 66 | 0 started 10.239.17.1 manual:10.239.17.1 ubuntu@22.04 Manually provisioned machine 67 | ``` 68 | 69 | ### 2. Build the charm 70 | 71 | Just run `charmcraft pack` from the project directory. 72 | 73 | ### 3. Add environment variables for Redfish credentials 74 | As part of the redfish collector specific tests, redfish credentials need to be provided for 75 | authentication. 76 | 77 | Therefore, the test expects these environment variables to be set: 78 | * `REDFISH_USERNAME` 79 | * `REDFISH_PASSWORD` 80 | 81 | ### 4. Setup required resource files 82 | Create a new `resources` directory in the root of the project. 83 | Check which collectors are supported on the machine and verify if they need to be manually 84 | downloaded (refer https://charmhub.io/hardware-observer/resources/). Download the required 85 | resource files from their respective third-party websites and add the extracted `.deb` file or 86 | binary to this directory. 87 | 88 | Note: The tests expect these resources to be named exactly in the manner provided below: 89 | * storcli.deb 90 | * perccli.deb 91 | * sas2ircu 92 | * sas3ircu 93 | 94 | ### 4. Find supported collectors 95 | Note down all the collectors supported by the machine as they need to be provided to pytest as part 96 | of its CLI arguments. 97 | 98 | This is done by passing the required collectors in a space-separated manner via `--collector` 99 | option to the tox target. 100 | 101 | The supported collectors can be found by checking the output of the `lshw` command (for RAID cards) 102 | or checking availability of Redfish and IPMI on the BMC. 
103 | 104 | --- 105 | 106 | ### Running the tests 107 | 108 | After ensuring the prerequisite steps are complete, the final command to run the tests would look 109 | something like this: 110 | 111 | ``` 112 | tox -e func -- -v --realhw --model test --collectors ipmi_dcmi ipmi_sel ipmi_sensor redfish mega_raid --nvidia --keep-models 113 | ``` 114 | 115 | This would pass the required collectors to tox which then sends it to the pytest command and starts 116 | the hardware dependent tests. 117 | 118 | ### Troubleshooting 119 | 120 | Create a `pytest.ini` file with the following contents to follow the live pytest logs 121 | 122 | ``` 123 | [pytest] 124 | log_cli = True 125 | log_cli_level = INFO 126 | ``` 127 | 128 | Add this line if you'd like to pass some more pytest options without messing with the make command. 129 | ``` 130 | addopts = -vv -k 'ipmi_sensor' 131 | ``` 132 | -------------------------------------------------------------------------------- /src/hardware.py: -------------------------------------------------------------------------------- 1 | """Hardware support config and command helper.""" 2 | 3 | import json 4 | import logging 5 | import re 6 | import subprocess 7 | import typing as t 8 | from pathlib import Path 9 | from typing import Optional 10 | 11 | from charms.operator_libs_linux.v0 import apt 12 | 13 | from config import HWTool 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | # File path that contains the NVIDIA driver that is loaded and its version 18 | NVIDIA_DRIVER_PATH = Path("/proc/driver/nvidia/version") 19 | 20 | 21 | LSHW_SUPPORTED_STORAGES = { 22 | HWTool.SAS2IRCU: [ 23 | # Broadcom 24 | "SAS2004", 25 | "SAS2008", 26 | "SAS2108", 27 | "SAS2208", 28 | "SAS2304", 29 | "SAS2308", 30 | ], 31 | HWTool.SAS3IRCU: [ 32 | # Broadcom 33 | "SAS3004", 34 | "SAS3008", 35 | ], 36 | HWTool.SSACLI: [ 37 | "Smart Array Gen8 Controllers", 38 | "Smart Array Gen9 Controllers", 39 | ], 40 | } 41 | 42 | HWINFO_SUPPORTED_STORAGES = { 43 | 
def lshw(class_filter: t.Optional[str] = None) -> t.Any:
    """Return lshw output as a dict (or list of dicts).

    Args:
        class_filter: optional hardware class passed to ``lshw -c``.

    Returns:
        Parsed JSON output of lshw. Without a class filter, some Ubuntu
        series wrap the result in a single-element list; that wrapper is
        unwrapped so callers always get the root node.

    Raises:
        subprocess.CalledProcessError: if the lshw command fails; lshw is
            expected to always work, so the error is logged and re-raised.
    """
    cmd = "lshw -json"
    if class_filter:
        cmd = cmd + " -c " + class_filter
    try:
        output = subprocess.check_output(cmd.split(), text=True)
        json_output = json.loads(output)
        # lshw has different output on different ubuntu series
        # if class_filter is not provided.
        if not class_filter and isinstance(json_output, list):
            json_output = json_output[0]
        return json_output
    except subprocess.CalledProcessError as err:
        logger.error(err)
        # Raise error because the cmd should always work.
        raise err


def get_bmc_address() -> t.Optional[str]:
    """Get BMC IP address by ipmitool.

    Returns:
        The BMC "IP Address" reported by ``ipmitool lan print``, or None
        when IPMI is not available on this machine.
    """
    apt.add_package("ipmitool", update_cache=False)
    cmd = "ipmitool lan print"
    try:
        output = subprocess.check_output(cmd.split(), text=True)
        for line in output.splitlines():
            values = line.split(":")
            if values[0].strip() == "IP Address":
                return values[1].strip()
    except subprocess.CalledProcessError:
        logger.debug("IPMI is not available")
    return None


def hwinfo(*args: str) -> t.Dict[str, str]:
    """Run hwinfo command and return output as dictionary.

    Args:
        args: Probe for a particular hardware class (each item becomes a
            ``--<class>`` flag to hwinfo).

    Returns:
        hw_info: hardware information dictionary, keyed by the first line
        of each hwinfo record.
    """
    apt.add_package("hwinfo", update_cache=False)
    hw_classes = list(args)
    for idx, hw_item in enumerate(args):
        hw_classes[idx] = "--" + hw_item
    hw_info_cmd = ["hwinfo"] + hw_classes

    output = subprocess.check_output(hw_info_cmd, text=True)
    # hwinfo may prepend a debug section; strip it before parsing.
    if "start debug info" in output.splitlines()[0]:
        output = output.split("=========== end debug info ============")[1]

    hardware: t.Dict[str, str] = {}
    for item in output.split("\n\n"):
        key = item.splitlines()[0].strip()
        hardware[key] = item
    return hardware


def is_nvidia_driver_loaded() -> bool:
    """Determine if an NVIDIA driver has been loaded."""
    return NVIDIA_DRIVER_PATH.exists()


def get_nvidia_driver_version() -> int:
    """Get the NVIDIA driver major version installed on the system.

    Returns:
        The driver's major version number, e.g. 535.

    Raises:
        FileNotFoundError: if no NVIDIA driver version file exists.
        ValueError: if the version file exists but no version string can
            be parsed from it.
    """
    try:
        nvidia_driver_version = NVIDIA_DRIVER_PATH.read_text()
    except FileNotFoundError as e:
        msg = "NVIDIA driver version file not found."
        logger.error(msg)
        raise FileNotFoundError(msg) from e

    match = re.search(r"NVRM version:.*?(\d+\.\d+(?:\.\d+)*)", nvidia_driver_version)
    if match:
        return int(match.group(1).split(".")[0])

    # Fix: this path previously fell through and implicitly returned None
    # (despite the declared `-> int`), which made get_cuda_version_from_driver()
    # crash on a `None >= 580` comparison. Fail explicitly instead.
    msg = f"Cannot parse NVIDIA driver version from {NVIDIA_DRIVER_PATH}."
    logger.error(msg)
    raise ValueError(msg)


def get_cuda_version_from_driver() -> int:
    """Map the installed NVIDIA driver version to CUDA version.

    Returns:
        The CUDA major version (10-13) supported by the installed driver.
    """
    driver_version = get_nvidia_driver_version()

    if driver_version >= 580:
        return 13
    elif driver_version >= 525:
        return 12
    elif driver_version >= 450:
        logger.warning(
            "The installed NVIDIA driver version '%s' might not be supported in next DCGM "
            "releases. Consider updating the NVIDIA driver.",
            driver_version,
        )
        return 11
    else:
        logger.warning(
            "The installed NVIDIA driver version '%s' is quite old and might not be supported "
            "by recent DCGM versions. Consider updating the NVIDIA driver.",
            driver_version,
        )
        return 10


def dcgm_v3_compatible(cuda_version: int, track: str, channel: Optional[str] = None) -> bool:
    """Check if the installed DCGM snap is v3 compatible.

    Args:
        cuda_version: CUDA major version supported by the installed driver.
        track: requested DCGM snap track ("v3", "v4" or "auto").
        channel: installed snap channel, if any; must mention "v3" to match.
    """
    valid_channel = "v3" in channel if channel is not None else True
    return valid_channel and cuda_version < 13 and track in {"v3", "auto"}


def dcgm_v4_compatible(cuda_version: int, track: str, channel: Optional[str] = None) -> bool:
    """Check if the installed DCGM snap is v4 compatible.

    Args:
        cuda_version: CUDA major version supported by the installed driver.
        track: requested DCGM snap track ("v3", "v4" or "auto").
        channel: installed snap channel, if any; must mention
            "v4-cuda<version>" to match.
    """
    # v4 supports CUDA 11 through 13 (exclusive of 10).
    valid_channel = f"v4-cuda{cuda_version}" in channel if channel is not None else True
    return valid_channel and cuda_version > 10 and cuda_version <= 13 and track in {"v4", "auto"}
state="inactive"} 31 | values: '1x15' 32 | 33 | alert_rule_test: 34 | - eval_time: 10m 35 | alertname: IPMISELCommandFailed 36 | exp_alerts: 37 | - exp_labels: 38 | severity: critical 39 | instance: ubuntu-0 40 | exp_annotations: 41 | summary: Failed to run ipmi-sel. (instance ubuntu-0) 42 | description: | 43 | Failed to get system event logs using ipmi-sel. 44 | VALUE = 0 45 | LABELS = map[__name__:ipmi_sel_command_success instance:ubuntu-0] 46 | 47 | - eval_time: 7m 48 | alertname: IPMISELStateWarning 49 | exp_alerts: 50 | - exp_labels: 51 | severity: warning 52 | instance: ubuntu-warning 53 | event_id: 2 54 | exp_annotations: 55 | summary: IPMI system event log in warning state. (instance ubuntu-warning) 56 | description: | 57 | IPMI SEL entry in warning state. 58 | LABELS = map[__name__:ipmi_sel_state_warning instance:ubuntu-warning] 59 | EVENT_ID = 2 60 | 61 | - eval_time: 13m 62 | alertname: IPMISELStateWarning 63 | exp_alerts: 64 | - exp_labels: 65 | severity: warning 66 | instance: ubuntu-warning 67 | event_id: 3 68 | exp_annotations: 69 | summary: IPMI system event log in warning state. (instance ubuntu-warning) 70 | description: | 71 | IPMI SEL entry in warning state. 72 | LABELS = map[__name__:ipmi_sel_state_warning instance:ubuntu-warning] 73 | EVENT_ID = 3 74 | 75 | - eval_time: 7m 76 | alertname: IPMISELStateCritical 77 | exp_alerts: 78 | - exp_labels: 79 | severity: critical 80 | instance: ubuntu-critical 81 | event_id: 2 82 | exp_annotations: 83 | summary: IPMI system event log in critical state. (instance ubuntu-critical) 84 | description: | 85 | IPMI SEL entry in critical state. 86 | LABELS = map[__name__:ipmi_sel_state_critical instance:ubuntu-critical] 87 | EVENT_ID = 2 88 | 89 | - eval_time: 13m 90 | alertname: IPMISELStateCritical 91 | exp_alerts: 92 | - exp_labels: 93 | severity: critical 94 | instance: ubuntu-critical 95 | event_id: 3 96 | exp_annotations: 97 | summary: IPMI system event log in critical state. 
(instance ubuntu-critical) 98 | description: | 99 | IPMI SEL entry in critical state. 100 | LABELS = map[__name__:ipmi_sel_state_critical instance:ubuntu-critical] 101 | EVENT_ID = 3 102 | 103 | - eval_time: 10m 104 | alertname: IPMISELDStateWarning 105 | exp_alerts: 106 | - exp_labels: 107 | severity: warning 108 | name: ipmiseld.service 109 | instance: ubuntu-3 110 | state: failed 111 | exp_annotations: 112 | summary: IPMISELD service is not active. (instance ubuntu-3) 113 | description: | 114 | The ipmiseld service is not active, indicating a potential problem. 115 | VALUE = 1 116 | LABELS = map[__name__:node_systemd_unit_state instance:ubuntu-3 name:ipmiseld.service state:failed] 117 | 118 | - exp_labels: 119 | severity: warning 120 | name: ipmiseld.service 121 | instance: ubuntu-4 122 | state: inactive 123 | exp_annotations: 124 | summary: IPMISELD service is not active. (instance ubuntu-4) 125 | description: | 126 | The ipmiseld service is not active, indicating a potential problem. 127 | VALUE = 1 128 | LABELS = map[__name__:node_systemd_unit_state instance:ubuntu-4 name:ipmiseld.service state:inactive] 129 | 130 | - interval: 1m 131 | input_series: 132 | - series: some_other_metric{instance="ubuntu-0"} 133 | values: '1x10' 134 | 135 | alert_rule_test: 136 | - eval_time: 6m 137 | alertname: IPMISELCommandSuccessMetricsMissing 138 | exp_alerts: 139 | - exp_labels: 140 | severity: critical 141 | exp_annotations: 142 | summary: IPMI SEL command success metrics missing. (instance ) 143 | description: | 144 | The ipmi_sel_command_success metric has been missing for over 5 minutes. 145 | This may indicate IPMI SEL command timeouts, or that IPMI tools/services are not installed or supported on this hardware. 
146 | LABELS = map[] 147 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/smart.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: SMART 3 | rules: 4 | 5 | - alert: SmartNVMeDriveReliabilityDegraded 6 | # isolate the least significant three bits with % 8 7 | # check whether bit 2 (starts from bit 0) has been set with the >= 4 comparison 8 | # refer: https://en.wikipedia.org/wiki/Self-Monitoring,_Analysis_and_Reporting_Technology#Known_NVMe_S.M.A.R.T._attributes 9 | expr: smartctl_device_critical_warning % 8 >= 4 10 | for: 15m 11 | labels: 12 | severity: critical 13 | annotations: 14 | summary: SMART alert for critical warning attribute on an NVMe controller due to degradation in drive reliability. (instance {{ $labels.instance }}) 15 | description: | 16 | Drive reliability is degraded. Bit 2 of critical warning SMART attribute is set. 17 | VALUE = {{ $value }} 18 | LABELS = {{ $labels }} 19 | 20 | - alert: SmartNVMeDriveinReadOnlyMode 21 | # isolate the least significant four bits with % 16 22 | # check whether bit 3 (starts from bit 0) has been set with the >= 8 comparison 23 | # refer: https://en.wikipedia.org/wiki/Self-Monitoring,_Analysis_and_Reporting_Technology#Known_NVMe_S.M.A.R.T._attributes 24 | expr: smartctl_device_critical_warning % 16 >= 8 25 | for: 15m 26 | labels: 27 | severity: critical 28 | annotations: 29 | summary: SMART alert for critical warning attribute on an NVMe controller due to drive being in read-only mode. (instance {{ $labels.instance }}) 30 | description: | 31 | Drive is in read-only mode. Bit 3 of critical warning SMART attribute is set. 32 | VALUE = {{ $value }} 33 | LABELS = {{ $labels }} 34 | 35 | - alert: SmartHealthStatusFail 36 | expr: smartctl_device_smart_status == 0 37 | for: 2m 38 | labels: 39 | severity: critical 40 | annotations: 41 | summary: SMART health status failed for device. 
(instance {{ $labels.instance }}) 42 | description: | 43 | SMART health status failed for device. This means either that the device has already failed, or that it is predicting its own failure within the next 24 hours. 44 | VALUE = {{ $value }} 45 | LABELS = {{ $labels }} 46 | 47 | - alert: SmartExitStatusDiskFail 48 | # isolate the least significant four bits with % 16 49 | # check whether bit 3 (starts from bit 0) has been set with the >= 8 comparison 50 | # refer: https://www.smartmontools.org/browser/trunk/smartmontools/smartctl.8.in#EXIT_STATUS 51 | expr: smartctl_device_smartctl_exit_status % 16 >= 8 52 | for: 2m 53 | labels: 54 | severity: critical 55 | annotations: 56 | summary: smartctl exit status returned "DISK FAILING". (instance {{ $labels.instance }}) 57 | description: | 58 | smartctl exit status returned "DISK FAILING". Bit 3 of smartctl exit status is set. 59 | VALUE = {{ $value }} 60 | LABELS = {{ $labels }} 61 | 62 | - alert: SmartExitStatusPrefailBelowThreshold 63 | # isolate the least significant four bits with % 32 64 | # check whether bit 4 (starts from bit 0) has been set with the >= 16 comparison 65 | # refer: https://www.smartmontools.org/browser/trunk/smartmontools/smartctl.8.in#EXIT_STATUS 66 | expr: smartctl_device_smartctl_exit_status % 32 >= 16 67 | for: 2m 68 | labels: 69 | severity: warning 70 | annotations: 71 | summary: smartctl exit status reports pre-fail attribute for device is below threshold. (instance {{ $labels.instance }}) 72 | description: | 73 | smartctl exit status pre-fail attribute is below threshold. Bit 4 of smartctl exit status is set. 74 | VALUE = {{ $value }} 75 | LABELS = {{ $labels }} 76 | 77 | - alert: SmartNVMeWearoutIndicator 78 | expr: smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"} 79 | for: 15m 80 | labels: 81 | severity: critical 82 | annotations: 83 | summary: SMART alert for available spare space below threshold for NVMe device. 
(instance {{ $labels.instance }}) 84 | description: | 85 | Available spare space below threshold for NVMe device. 86 | VALUE = {{ $value }} 87 | LABELS = {{ $labels }} 88 | 89 | - alert: SmartAttributeWarning 90 | # based on https://www.backblaze.com/blog/what-smart-stats-indicate-hard-drive-failures/ 91 | expr: smartctl_device_attribute{attribute_id=~"5|187|188|197|198", attribute_value_type="raw"} > 0 92 | for: 2m 93 | labels: 94 | severity: warning 95 | annotations: 96 | summary: SMART device attribute correlating with drive failure has its raw value greater than zero. (instance {{ $labels.instance }}) 97 | description: | 98 | SMART raw value for attribute "{{ $labels.attribute_name }}" with id "{{ $labels.attribute_id }}" 99 | on device "{{ $labels.device }}" is greater than 0. 100 | VALUE = {{ $value }} 101 | LABELS = {{ $labels }} 102 | 103 | - alert: SmartNVMeDriveLifetimeWarning 104 | expr: smartctl_device_percentage_used{device=~"nvme.*"} >= 80 105 | for: 15m 106 | labels: 107 | severity: warning 108 | annotations: 109 | summary: NVMe drive is approaching its estimated lifetime (instance {{ $labels.instance }}) 110 | description: | 111 | The NVMe drive has reached 80% of its estimated lifetime. 112 | Note: A value of 100 does not indicate failure. For more details, visit https://charmhub.io/hardware-observer/docs/metrics-and-alerts-smart 113 | VALUE = {{ $value }} 114 | LABELS = {{ $labels }} 115 | 116 | - alert: SmartNVMeDriveLifetimeCritical 117 | expr: smartctl_device_percentage_used{device=~"nvme.*"} >= 90 118 | for: 15m 119 | labels: 120 | severity: critical 121 | annotations: 122 | summary: NVMe drive is close to reaching its estimated lifetime (instance {{ $labels.instance }}) 123 | description: | 124 | The NVMe drive has reached 90% of its estimated lifetime. 125 | Note: A value of 100 does not indicate failure. 
For more details, visit https://charmhub.io/hardware-observer/docs/metrics-and-alerts-smart 126 | VALUE = {{ $value }} 127 | LABELS = {{ $labels }} 128 | -------------------------------------------------------------------------------- /tests/functional/conftest.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import logging 3 | import os 4 | import platform 5 | from pathlib import Path 6 | 7 | import pytest 8 | from pytest_operator.plugin import OpsTest 9 | from utils import RESOURCES_DIR, Resource 10 | 11 | from config import HARDWARE_EXPORTER_COLLECTOR_MAPPING, TPR_RESOURCES, HWTool 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def pytest_addoption(parser): 17 | parser.addoption( 18 | "--base", 19 | type=str.lower, 20 | default="ubuntu@22.04", 21 | choices=["ubuntu@20.04", "ubuntu@22.04", "ubuntu@24.04"], 22 | help="Set base for the applications.", 23 | ) 24 | 25 | parser.addoption( 26 | "--realhw", 27 | action="store_true", 28 | help="Enable real hardware testing.", 29 | ) 30 | 31 | parser.addoption( 32 | "--nvidia", 33 | action="store_true", 34 | help="Enable NVIDIA GPU support for testing with real hardware.", 35 | ) 36 | 37 | parser.addoption( 38 | "--collectors", 39 | nargs="+", 40 | type=str.lower, 41 | default="", 42 | choices=[ 43 | "ipmi_dcmi", 44 | "ipmi_sel", 45 | "ipmi_sensor", 46 | "redfish", 47 | "mega_raid", 48 | "poweredge_raid", 49 | "lsi_sas_2", 50 | "lsi_sas_3", 51 | "hpe_ssa", 52 | ], 53 | help="Provide space-separated list of collectors for testing with real hardware.", 54 | ) 55 | 56 | 57 | def get_this_script_dir() -> Path: 58 | filename = inspect.getframeinfo(inspect.currentframe()).filename # type: ignore[arg-type] 59 | path = os.path.dirname(os.path.abspath(filename)) 60 | return Path(path) 61 | 62 | 63 | @pytest.fixture(scope="module") 64 | def bundle(ops_test: OpsTest, request, charm_path, base, provided_collectors): 65 | """Configure the bundle depending on cli 
arguments.""" 66 | bundle_template_path = get_this_script_dir() / "bundle.yaml.j2" 67 | log.info("Rendering bundle %s", bundle_template_path) 68 | bundle = ops_test.render_bundle( 69 | bundle_template_path, 70 | charm=charm_path, 71 | base=base, 72 | redfish_disable=("redfish" not in provided_collectors), 73 | resources={ 74 | "storcli-deb": "empty-resource", 75 | "perccli-deb": "empty-resource", 76 | "sas2ircu-bin": "empty-resource", 77 | "sas3ircu-bin": "empty-resource", 78 | }, 79 | ) 80 | 81 | return bundle 82 | 83 | 84 | @pytest.fixture(scope="module") 85 | def base(request): 86 | return request.config.getoption("--base") 87 | 88 | 89 | @pytest.fixture(scope="module") 90 | def nvidia_present(request): 91 | return request.config.getoption("--nvidia") 92 | 93 | 94 | @pytest.fixture(scope="module") 95 | def realhw(request): 96 | return request.config.getoption("--realhw") 97 | 98 | 99 | @pytest.fixture(scope="module") 100 | def architecture(): 101 | machine = platform.machine() 102 | if machine == "aarch64": 103 | return "arm64" 104 | return "amd64" 105 | 106 | 107 | @pytest.fixture(scope="module") 108 | def provided_collectors(request): 109 | return set(request.config.getoption("collectors")) 110 | 111 | 112 | def pytest_configure(config): 113 | config.addinivalue_line("markers", "realhw: mark test as requiring real hardware to run.") 114 | 115 | 116 | def pytest_collection_modifyitems(config, items): 117 | if not config.getoption("--realhw"): 118 | # skip hw dependent tests in TestCharmWithHW marked with "realhw" 119 | skip_hw_dependent = pytest.mark.skip( 120 | reason="Hardware dependent test. Provide collectors with the --collectors option." 
121 | ) 122 | for item in items: 123 | if "realhw" in item.keywords: 124 | item.add_marker(skip_hw_dependent) 125 | 126 | 127 | @pytest.fixture() 128 | def app(ops_test): 129 | return ops_test.model.applications["hardware-observer"] 130 | 131 | 132 | @pytest.fixture() 133 | def unit(app): 134 | return app.units[0] 135 | 136 | 137 | @pytest.fixture() 138 | def resources() -> list[Resource]: 139 | """Return list of Resource objects.""" 140 | return [ 141 | Resource( 142 | resource_name=TPR_RESOURCES.get(HWTool.STORCLI), 143 | file_name="storcli.deb", 144 | collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.STORCLI).replace( 145 | "collector.", "" 146 | ), 147 | bin_name=HWTool.STORCLI.value, 148 | ), 149 | Resource( 150 | resource_name=TPR_RESOURCES.get(HWTool.PERCCLI), 151 | file_name="perccli.deb", 152 | collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.PERCCLI).replace( 153 | "collector.", "" 154 | ), 155 | bin_name=HWTool.PERCCLI.value, 156 | ), 157 | Resource( 158 | resource_name=TPR_RESOURCES.get(HWTool.SAS2IRCU), 159 | file_name="sas2ircu", 160 | collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS2IRCU).replace( 161 | "collector.", "" 162 | ), 163 | bin_name=HWTool.SAS2IRCU.value, 164 | ), 165 | Resource( 166 | resource_name=TPR_RESOURCES.get(HWTool.SAS3IRCU), 167 | file_name="sas3ircu", 168 | collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS3IRCU).replace( 169 | "collector.", "" 170 | ), 171 | bin_name=HWTool.SAS3IRCU.value, 172 | ), 173 | ] 174 | 175 | 176 | @pytest.fixture() 177 | def required_resources(resources: list[Resource], provided_collectors: set) -> list[Resource]: 178 | """Return list of required resources to be attached as per hardware availability. 179 | 180 | Required resources will be empty if no collectors are provided. 
181 | """ 182 | required_resources = [] 183 | 184 | for resource in resources: 185 | if resource.collector_name in provided_collectors: 186 | resource.file_path = f"{RESOURCES_DIR}/{resource.file_name}" 187 | required_resources.append(resource) 188 | 189 | return required_resources 190 | 191 | 192 | @pytest.fixture(scope="module") 193 | def charm_path(base: str, architecture: str) -> Path: 194 | """Fixture to determine the charm path based on the base and architecture.""" 195 | glob_path = f"hardware-observer_*{base}-{architecture}*.charm" 196 | paths = list(Path(".").glob(glob_path)) 197 | 198 | if not paths: 199 | raise FileNotFoundError(f"The path for the charm for {base}-{architecture} is not found.") 200 | 201 | if len(paths) > 1: 202 | raise FileNotFoundError( 203 | f"Multiple charms found for {base}-{architecture}. Please provide only one." 204 | ) 205 | 206 | # The bundle will need the full path to the charm 207 | path = paths[0].absolute() 208 | log.info(f"Using charm path: {path}") 209 | return path 210 | -------------------------------------------------------------------------------- /tests/unit/test_ssdlc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | 4 | """Unit tests for SSDLC logging functionality.""" 5 | 6 | import unittest 7 | from datetime import datetime, timezone 8 | from unittest import mock 9 | 10 | from parameterized import parameterized 11 | 12 | from ssdlc import EXPORTER_NAME_TO_SERVICE, Service, SSDLCSysEvent, log_ssdlc_system_event 13 | 14 | 15 | class TestSSDLCLogging(unittest.TestCase): 16 | """Test SSDLC logging functions.""" 17 | 18 | @mock.patch("ssdlc.logger") 19 | @mock.patch("ssdlc.datetime") 20 | def test_log_ssdlc_system_event_with_exporter_name(self, mock_datetime, mock_logger): 21 | """Test logging with exporter_name string.""" 22 | # Setup mock datetime 23 | mock_now = mock.MagicMock() 24 | mock_now.isoformat.return_value = "2025-01-01T12:00:00+00:00" 25 | mock_datetime.now.return_value.astimezone.return_value = mock_now 26 | 27 | # Call the function with exporter name 28 | log_ssdlc_system_event(SSDLCSysEvent.STARTUP, "hardware-exporter") 29 | 30 | # Verify logger was called correctly 31 | mock_logger.warning.assert_called_once() 32 | logged_data = mock_logger.warning.call_args[0][0] 33 | 34 | self.assertEqual(logged_data["datetime"], "2025-01-01T12:00:00+00:00") 35 | self.assertEqual(logged_data["appid"], "service.hardware-exporter") 36 | self.assertEqual(logged_data["event"], "sys_startup:hardware-exporter") 37 | self.assertEqual(logged_data["level"], "WARN") 38 | self.assertIn("hardware observer start service", logged_data["description"]) 39 | 40 | @mock.patch("ssdlc.logger") 41 | @mock.patch("ssdlc.datetime") 42 | def test_log_ssdlc_system_event_with_different_exporter(self, mock_datetime, mock_logger): 43 | """Test logging with different exporter name.""" 44 | # Setup mock datetime 45 | mock_now = mock.MagicMock() 46 | mock_now.isoformat.return_value = "2025-01-01T12:00:00+00:00" 47 | mock_datetime.now.return_value.astimezone.return_value = mock_now 48 | 49 | # Call the function with dcgm exporter name 50 | log_ssdlc_system_event(SSDLCSysEvent.SHUTDOWN, "dcgm") 51 | 52 | # 
Verify logger was called correctly 53 | mock_logger.warning.assert_called_once() 54 | logged_data = mock_logger.warning.call_args[0][0] 55 | 56 | self.assertEqual(logged_data["datetime"], "2025-01-01T12:00:00+00:00") 57 | self.assertEqual(logged_data["appid"], "service.dcgm") 58 | self.assertEqual(logged_data["event"], "sys_shutdown:dcgm") 59 | self.assertEqual(logged_data["level"], "WARN") 60 | self.assertIn("hardware observer shutdown service", logged_data["description"]) 61 | 62 | @mock.patch("ssdlc.logger") 63 | def test_log_ssdlc_system_event_with_unknown_service(self, mock_logger): 64 | """Test logging with unknown service name.""" 65 | # Call the function with unknown service 66 | log_ssdlc_system_event(SSDLCSysEvent.STARTUP, "unknown-service") 67 | 68 | # Verify warning was logged with format string and args 69 | mock_logger.warning.assert_called_once_with( 70 | "Unknown service name: %s, skipping SSDLC logging", "unknown-service" 71 | ) 72 | 73 | @parameterized.expand( 74 | [ 75 | (SSDLCSysEvent.STARTUP, "hardware-exporter", ""), 76 | (SSDLCSysEvent.SHUTDOWN, "dcgm", ""), 77 | (SSDLCSysEvent.RESTART, "smartctl-exporter", ""), 78 | ( 79 | SSDLCSysEvent.CRASH, 80 | "hardware-exporter", 81 | "Connection timeout", 82 | ), 83 | ] 84 | ) 85 | @mock.patch("ssdlc.logger") 86 | @mock.patch("ssdlc.datetime") 87 | def test_log_ssdlc_system_event_all_events( 88 | self, event, service_name, msg, mock_datetime, mock_logger 89 | ): 90 | """Test logging all event types.""" 91 | # Setup mock datetime 92 | mock_now = mock.MagicMock() 93 | mock_now.isoformat.return_value = "2025-01-01T12:00:00+00:00" 94 | mock_datetime.now.return_value.astimezone.return_value = mock_now 95 | 96 | # Call the function 97 | log_ssdlc_system_event(event, service_name, msg) 98 | 99 | # Verify logger was called 100 | mock_logger.warning.assert_called_once() 101 | logged_data = mock_logger.warning.call_args[0][0] 102 | 103 | self.assertEqual(logged_data["datetime"], "2025-01-01T12:00:00+00:00") 104 
| self.assertEqual(logged_data["appid"], f"service.{service_name}") 105 | self.assertEqual(logged_data["event"], f"{event.value}:{service_name}") 106 | self.assertEqual(logged_data["level"], "WARN") 107 | self.assertIsInstance(logged_data["description"], str) 108 | if msg: 109 | self.assertIn(msg, logged_data["description"]) 110 | 111 | def test_exporter_name_to_service_mapping(self): 112 | """Test that all exporters are mapped correctly.""" 113 | self.assertEqual( 114 | EXPORTER_NAME_TO_SERVICE["hardware-exporter"], 115 | Service.HARDWARE_EXPORTER, 116 | ) 117 | self.assertEqual( 118 | EXPORTER_NAME_TO_SERVICE["dcgm"], 119 | Service.DCGM_EXPORTER, 120 | ) 121 | self.assertEqual( 122 | EXPORTER_NAME_TO_SERVICE["smartctl-exporter"], 123 | Service.SMARTCTL_EXPORTER, 124 | ) 125 | 126 | @mock.patch("ssdlc.logger") 127 | @mock.patch("ssdlc.datetime") 128 | def test_log_ssdlc_system_event_with_additional_message(self, mock_datetime, mock_logger): 129 | """Test logging with additional message.""" 130 | # Setup mock datetime 131 | mock_now = mock.MagicMock() 132 | mock_now.isoformat.return_value = "2025-01-01T12:00:00+00:00" 133 | mock_datetime.now.return_value.astimezone.return_value = mock_now 134 | 135 | # Call with additional message 136 | additional_msg = "Service failed due to network error" 137 | log_ssdlc_system_event(SSDLCSysEvent.CRASH, "hardware-exporter", additional_msg) 138 | 139 | # Verify the additional message is included 140 | logged_data = mock_logger.warning.call_args[0][0] 141 | self.assertIn(additional_msg, logged_data["description"]) 142 | 143 | @mock.patch("ssdlc.logger") 144 | @mock.patch("ssdlc.datetime") 145 | def test_log_ssdlc_system_event_datetime_format(self, mock_datetime, mock_logger): 146 | """Test that datetime is in ISO 8601 format with timezone.""" 147 | # Use a real datetime to test formatting 148 | test_time = datetime(2025, 1, 15, 14, 30, 45, tzinfo=timezone.utc) 149 | mock_datetime.now.return_value.astimezone.return_value = 
test_time 150 | 151 | log_ssdlc_system_event(SSDLCSysEvent.STARTUP, "hardware-exporter") 152 | 153 | logged_data = mock_logger.warning.call_args[0][0] 154 | # Verify ISO 8601 format with timezone 155 | self.assertRegex( 156 | logged_data["datetime"], 157 | r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2}", 158 | ) 159 | 160 | 161 | if __name__ == "__main__": 162 | unittest.main() 163 | -------------------------------------------------------------------------------- /tests/unit/test_alert_rules/test_redfish.yaml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - ../../../src/prometheus_alert_rules_dynamic/redfish.yaml 3 | 4 | evaluation_interval: 1m 5 | 6 | tests: 7 | - interval: 1m 8 | input_series: 9 | - series: redfish_call_success{instance="ubuntu-0"} 10 | values: "0x15" 11 | 12 | alert_rule_test: 13 | - eval_time: 10m 14 | alertname: RedfishCallFailed 15 | exp_alerts: 16 | - exp_labels: 17 | severity: warning 18 | instance: ubuntu-0 19 | exp_annotations: 20 | summary: Call to the Redfish API failed. (instance ubuntu-0) 21 | description: | 22 | Failure in calling the Redfish API. 23 | VALUE = 0 24 | LABELS = map[__name__:redfish_call_success instance:ubuntu-0] 25 | 26 | - interval: 1m 27 | input_series: 28 | - series: redfish_service_available{instance="ubuntu-1"} 29 | values: "0x15" 30 | 31 | alert_rule_test: 32 | - eval_time: 10m 33 | alertname: RedfishServiceUnavailable 34 | exp_alerts: 35 | - exp_labels: 36 | severity: warning 37 | instance: ubuntu-1 38 | exp_annotations: 39 | summary: No redfish services available. (instance ubuntu-1) 40 | description: | 41 | No redfish services available. 
42 | VALUE = 0 43 | LABELS = map[__name__:redfish_service_available instance:ubuntu-1] 44 | 45 | - interval: 1m 46 | input_series: 47 | - series: redfish_sensor_info{instance="ubuntu-2", health="Unhealthy", reading="82%"} 48 | values: "1x15" 49 | 50 | alert_rule_test: 51 | - eval_time: 10m 52 | alertname: RedfishSensorHealthNotOk 53 | exp_alerts: 54 | - exp_labels: 55 | severity: critical 56 | instance: ubuntu-2 57 | health: Unhealthy 58 | reading: 82% 59 | exp_annotations: 60 | summary: Redfish sensor health not Ok. (instance ubuntu-2) 61 | description: | 62 | Redfish sensor health not Ok. 63 | SENSOR_READING = 82% 64 | LABELS = map[__name__:redfish_sensor_info health:Unhealthy instance:ubuntu-2 reading:82%] 65 | 66 | - interval: 1m 67 | input_series: 68 | - series: redfish_processor_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", processor_id="p1", model="processor-model-1"} 69 | values: "1x15" 70 | 71 | alert_rule_test: 72 | - eval_time: 10m 73 | alertname: RedfishProcessorHealthNotOk 74 | exp_alerts: 75 | - exp_labels: 76 | severity: critical 77 | instance: ubuntu-1 78 | health: Unhealthy 79 | system_id: s1 80 | processor_id: p1 81 | model: processor-model-1 82 | exp_annotations: 83 | summary: Redfish processor health not OK. (instance ubuntu-1) 84 | description: | 85 | Redfish processor health not OK. 
86 | LABELS = map[__name__:redfish_processor_info health:Unhealthy instance:ubuntu-1 model:processor-model-1 processor_id:p1 system_id:s1] 87 | 88 | - interval: 1m 89 | input_series: 90 | - series: redfish_storage_controller_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", storage_id="stor1", controller_id="ctrl1"} 91 | values: "1x15" 92 | 93 | alert_rule_test: 94 | - eval_time: 10m 95 | alertname: RedfishStorageControllerHealthNotOk 96 | exp_alerts: 97 | - exp_labels: 98 | severity: critical 99 | instance: ubuntu-1 100 | health: Unhealthy 101 | system_id: s1 102 | storage_id: stor1 103 | controller_id: ctrl1 104 | exp_annotations: 105 | summary: Redfish storage controller health not OK. (instance ubuntu-1) 106 | description: | 107 | Redfish storage controller health not OK. 108 | LABELS = map[__name__:redfish_storage_controller_info controller_id:ctrl1 health:Unhealthy instance:ubuntu-1 storage_id:stor1 system_id:s1] 109 | 110 | - interval: 1m 111 | input_series: 112 | - series: redfish_chassis_info{instance="ubuntu-1", health="Unhealthy", chassis_id="ch1", model="chassis-model1"} 113 | values: "1x15" 114 | 115 | alert_rule_test: 116 | - eval_time: 10m 117 | alertname: RedfishChassisHealthNotOk 118 | exp_alerts: 119 | - exp_labels: 120 | severity: critical 121 | instance: ubuntu-1 122 | health: Unhealthy 123 | chassis_id: ch1 124 | model: chassis-model1 125 | exp_annotations: 126 | summary: Redfish chassis health not OK. (instance ubuntu-1) 127 | description: | 128 | Redfish chassis health not OK. 
129 | LABELS = map[__name__:redfish_chassis_info chassis_id:ch1 health:Unhealthy instance:ubuntu-1 model:chassis-model1] 130 | 131 | - interval: 1m 132 | input_series: 133 | - series: redfish_storage_drive_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", state="Enabled", storage_id="stor1", drive_id="dr1"} 134 | values: "1x15" 135 | 136 | alert_rule_test: 137 | - eval_time: 10m 138 | alertname: RedfishStorageDriveHealthNotOk 139 | exp_alerts: 140 | - exp_labels: 141 | severity: critical 142 | instance: ubuntu-1 143 | health: Unhealthy 144 | system_id: s1 145 | storage_id: stor1 146 | drive_id: dr1 147 | state: Enabled 148 | exp_annotations: 149 | summary: Redfish storage drive health not OK. (instance ubuntu-1) 150 | description: | 151 | Redfish storage drive health not OK. 152 | LABELS = map[__name__:redfish_storage_drive_info drive_id:dr1 health:Unhealthy instance:ubuntu-1 state:Enabled storage_id:stor1 system_id:s1] 153 | 154 | 155 | - interval: 1m 156 | input_series: 157 | - series: redfish_memory_dimm_info{instance="ubuntu-1", health="Unhealthy", system_id="s1", memory_id="mem1"} 158 | values: "1x15" 159 | 160 | alert_rule_test: 161 | - eval_time: 10m 162 | alertname: RedfishMemoryDimmHealthNotOk 163 | exp_alerts: 164 | - exp_labels: 165 | severity: critical 166 | instance: ubuntu-1 167 | health: Unhealthy 168 | system_id: s1 169 | memory_id: mem1 170 | exp_annotations: 171 | summary: Redfish memory dimm health not OK. (instance ubuntu-1) 172 | description: | 173 | Redfish memory dimm health not OK. 
174 | LABELS = map[__name__:redfish_memory_dimm_info health:Unhealthy instance:ubuntu-1 memory_id:mem1 system_id:s1] 175 | 176 | - interval: 1m 177 | input_series: 178 | - series: redfish_smart_storage_health{instance="ubuntu-1"} 179 | values: "0x15" 180 | 181 | alert_rule_test: 182 | - eval_time: 10m 183 | alertname: RedfishSmartStorageHealthNotOk 184 | exp_alerts: 185 | - exp_labels: 186 | severity: critical 187 | instance: ubuntu-1 188 | exp_annotations: 189 | summary: Redfish smart storage health not OK. (instance ubuntu-1) 190 | description: | 191 | Redfish smart storage health not OK. 192 | VALUE = 0 193 | LABELS = map[__name__:redfish_smart_storage_health instance:ubuntu-1] 194 | -------------------------------------------------------------------------------- /tests/integration/test_cos_integration.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2024 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | import asyncio 6 | import json 7 | import logging 8 | import subprocess 9 | from pathlib import Path 10 | 11 | import pytest 12 | from mock_data import EXPECTED_ALERTS 13 | from pytest_operator.plugin import OpsTest 14 | from tenacity import AsyncRetrying, RetryError, stop_after_attempt, wait_fixed 15 | from utils import Alert 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | @pytest.mark.abort_on_fail 21 | @pytest.mark.skip_if_deployed 22 | async def test_setup_and_deploy(base, channel, lxd_ctl, k8s_ctl, lxd_model, k8s_model): 23 | """Setup models and then deploy Hardware Observer and COS.""" 24 | await _deploy_cos(channel, k8s_ctl, k8s_model) 25 | 26 | await _deploy_hardware_observer(base, channel, lxd_model) 27 | 28 | await _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_model, lxd_model) 29 | 30 | # This verifies that the cross-controller relation with COS is successful 31 | assert lxd_model.applications["grafana-agent"].status == "active" 32 | 33 | 34 | async def test_alerts(ops_test: OpsTest, lxd_model, k8s_model): 35 | """Verify that the required alerts are fired.""" 36 | await _disable_hardware_exporter(ops_test, lxd_model) 37 | await _export_mock_metrics(lxd_model) 38 | 39 | # Run juju action to get the ip address that traefik is configured to serve on 40 | returncode, stdout, stderr = await ops_test.run( 41 | "juju", 42 | "run", 43 | "--format", 44 | "json", 45 | "traefik/0", 46 | "show-proxied-endpoints", 47 | ) 48 | json_data = json.loads(stdout) 49 | proxied_endpoints = json.loads(json_data["traefik/0"]["results"]["proxied-endpoints"]) 50 | prometheus_url = proxied_endpoints["prometheus/0"]["url"] 51 | prometheus_alerts_endpoint = f"{prometheus_url}/api/v1/alerts" 52 | 53 | cmd = ["curl", prometheus_alerts_endpoint] 54 | 55 | # Sometimes alerts take some time to show after the metrics are exposed on the host. 
# Additionally, some alerts have a longer duration, like 5m, and they take some time to 57 | # transition to `firing` state. 58 | # So retrying for up to 15 minutes.
"--onefile", 117 | str(Path(__file__).parent.resolve() / "export_mock_metrics.py"), 118 | ] 119 | try: 120 | subprocess.run(bundle_cmd) 121 | except subprocess.CalledProcessError: 122 | logger.error("Failed to bundle export_mock_metrics") 123 | raise 124 | 125 | # scp the executable to hardware-observer unit 126 | await hardware_observer_unit.scp_to("./dist/export_mock_metrics", "/home/ubuntu") 127 | 128 | # Run the executable in the background without waiting. 129 | run_export_mock_metrics_cmd = "/home/ubuntu/export_mock_metrics" 130 | await hardware_observer_unit.run(run_export_mock_metrics_cmd) 131 | 132 | 133 | async def _deploy_cos(channel, ctl, model): 134 | """Deploy COS on the existing k8s cloud.""" 135 | # Deploying via CLI because of https://github.com/juju/python-libjuju/issues/1032. 136 | cmd = [ 137 | "juju", 138 | "deploy", 139 | "cos-lite", 140 | "--channel", 141 | channel, 142 | "--trust", 143 | "-m", 144 | f"{ctl.controller_name}:{model.name}", 145 | "--overlay", 146 | str(Path(__file__).parent.resolve() / "offers-overlay.yaml"), 147 | ] 148 | subprocess.run(cmd, check=True) 149 | 150 | 151 | async def _deploy_hardware_observer(base, channel, model): 152 | """Deploy Hardware Observer and Grafana Agent on the existing lxd cloud.""" 153 | await asyncio.gather( 154 | # Principal Ubuntu 155 | model.deploy("ubuntu", num_units=1, base=base, channel=channel), 156 | # Hardware Observer 157 | model.deploy("hardware-observer", base=base, num_units=0, channel=channel), 158 | # Grafana Agent 159 | model.deploy("grafana-agent", num_units=0, base=base, channel=channel), 160 | ) 161 | 162 | await model.add_relation("ubuntu:juju-info", "hardware-observer:general-info") 163 | await model.add_relation("hardware-observer:cos-agent", "grafana-agent:cos-agent") 164 | await model.add_relation("ubuntu:juju-info", "grafana-agent:juju-info") 165 | 166 | await model.block_until(lambda: model.applications["hardware-observer"].status == "active") 167 | 168 | 169 | async def 
_add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_model, lxd_model): 170 | """Add relations between Grafana Agent and COS.""" 171 | cos_saas_names = ["prometheus-receive-remote-write", "loki-logging", "grafana-dashboards"] 172 | for saas in cos_saas_names: 173 | # Using juju cli since Model.consume() from libjuju causes error. 174 | # https://github.com/juju/python-libjuju/issues/1031 175 | cmd = [ 176 | "juju", 177 | "consume", 178 | "--model", 179 | f"{lxd_ctl.controller_name}:{k8s_model.name}", 180 | f"{k8s_ctl.controller_name}:admin/{k8s_model.name}.{saas}", 181 | ] 182 | subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 183 | await lxd_model.add_relation("grafana-agent", saas), 184 | 185 | # `idle_period` needs to be greater than the scrape interval to make sure metrics ingested. 186 | await asyncio.gather( 187 | # First, we wait for the critical phase to pass with raise_on_error=False. 188 | # (In CI, using github runners, we often see unreproducible hook failures.) 189 | lxd_model.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False), 190 | k8s_model.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False), 191 | ) 192 | 193 | await asyncio.gather( 194 | # Then we wait for "active", without raise_on_error=False, so the test fails sooner in case 195 | # there is a persistent error status. 196 | lxd_model.wait_for_idle(status="active", timeout=7200, idle_period=180), 197 | k8s_model.wait_for_idle(status="active", timeout=7200, idle_period=180), 198 | ) 199 | -------------------------------------------------------------------------------- /src/gpu_metrics/dcgm_metrics.csv: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # [ WARNING ] 3 | # Configuration file maintained by Juju. Local changes may be overwritten. 
4 | ############################################################################### 5 | 6 | # Selected metrics for dcgm-exporter 7 | # Default metric list https://github.com/NVIDIA/dcgm-exporter/blob/main/etc/default-counters.csv 8 | 9 | # Format 10 | # If line starts with a '#' it is considered a comment 11 | # Boolean values decode to - 1 = enabled 0 = disabled 12 | # DCGM FIELD, Prometheus metric type, help message 13 | 14 | 15 | 16 | 17 | # DEFAULT METRICS 18 | # Clocks 19 | DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). 20 | DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). 21 | 22 | # Temperature 23 | DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). 24 | DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). 25 | 26 | # Power 27 | DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). 28 | DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). 29 | 30 | # PCIE 31 | DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. 32 | DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. 33 | DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. 34 | 35 | # Utilization (the sample period varies depending on the product) 36 | DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). 37 | DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). 38 | DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). 39 | DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %). 40 | 41 | # Errors and violations 42 | DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. 43 | 44 | # Memory usage 45 | DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). 46 | DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). 
47 | 48 | # NVLink 49 | DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes 50 | 51 | # VGPU License status 52 | DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status 53 | 54 | # Remapped rows 55 | DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors 56 | DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors 57 | DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed 58 | 59 | # Static configuration information and features 60 | DCGM_FI_DRIVER_VERSION, label, Driver Version 61 | 62 | 63 | 64 | 65 | # CUSTOM METRICS 66 | # Clocks 67 | DCGM_FI_DEV_VIDEO_CLOCK, gauge, Video encoder/decoder clock (in MHz). 68 | 69 | # Temperature 70 | DCGM_FI_DEV_FAN_SPEED, gauge, Fan speed (in 0-100%) 71 | 72 | # Power 73 | DCGM_FI_DEV_POWER_USAGE_INSTANT, gauge, Current instantaneous power usage (in W). 74 | 75 | # Errors and violations 76 | DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, counter, Throttling reasons bitmask 77 | DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). 78 | DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). 79 | DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). 80 | DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). 81 | DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). 82 | DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). 83 | 84 | # Memory usage 85 | DCGM_FI_DEV_FB_RESERVED, gauge, Frame buffer memory reserved (in MB). 
86 | DCGM_FI_DEV_FB_USED_PERCENT, gauge, Frame buffer percentage used (in 0-100%) - Used/(Total - Reserved) 87 | 88 | # ECC 89 | DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. 90 | DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. 91 | DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. 92 | DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. 93 | 94 | # Retired pages 95 | DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. 96 | DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. 97 | DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. 98 | 99 | # NVLink 100 | DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. 101 | DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. 102 | DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. 103 | DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. 104 | 105 | # VGPU 106 | DCGM_FI_DEV_VGPU_UTILIZATIONS, gauge, vGPUs utilization 107 | 108 | # Bar 109 | DCGM_FI_DEV_BAR1_USED, gauge, Used BAR1 (in MB) 110 | DCGM_FI_DEV_BAR1_FREE, gauge, Free BAR1 (in MB) 111 | 112 | # DCP metrics 113 | DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. 114 | DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. 115 | DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM. 116 | DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. 117 | DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. 
DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active.
DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active.
DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active.
# NOTE: the two PCIe fields below are also listed as counters in the default
# PCIE section above with a different metric type — verify dcgm-exporter
# tolerates the duplicate field IDs, or drop one of the pairs.
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.

# Features and modes
DCGM_FI_DEV_COMPUTE_MODE, gauge, Compute mode
DCGM_FI_DEV_PERSISTENCE_MODE, gauge, Persistence mode (1 or 0)
DCGM_FI_DEV_CC_MODE, gauge, ConfidentialCompute/AmpereProtectedMemory status (1 or 0)
DCGM_FI_DEV_ECC_CURRENT, gauge, Current ECC mode
DCGM_FI_DEV_VIRTUAL_MODE, gauge, Virtualization mode
DCGM_FI_DEV_AUTOBOOST, gauge, Auto-boost enabled
DCGM_FI_DEV_BAR1_TOTAL, gauge, Total BAR1 (in MB)
DCGM_FI_DEV_MAX_SM_CLOCK, gauge, Maximum supported SM clock
DCGM_FI_DEV_MAX_MEM_CLOCK, gauge, Maximum supported Memory clock
DCGM_FI_DEV_GPU_MAX_OP_TEMP, gauge, Maximum operating temperature
DCGM_FI_DEV_SLOWDOWN_TEMP, gauge, Slowdown temperature
DCGM_FI_DEV_SHUTDOWN_TEMP, gauge, Shutdown temperature
DCGM_FI_DEV_POWER_MGMT_LIMIT, gauge, Current Power limit
DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN, gauge, Minimum Power limit
DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, gauge, Maximum Power limit
DCGM_FI_DEV_ENFORCED_POWER_LIMIT, gauge, Effective Power limit that the driver enforces after taking into account all limiters
DCGM_FI_DEV_FB_TOTAL, gauge, Total Frame buffer (in MB)
DCGM_FI_DEV_COUNT, gauge, Number of devices on the node

# Static configuration information and features
DCGM_FI_NVML_VERSION, label, NVML Version
DCGM_FI_DEV_BRAND, label, Device Brand
DCGM_FI_DEV_SERIAL, label, Device Serial Number 148 | DCGM_FI_DEV_NAME, label, Device Name 149 | DCGM_FI_DEV_MINOR_NUMBER, label, Device node minor (/dev/nvidia#) 150 | DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY, label, Cuda compute capability for the device (The major version is the upper 32 bits and the minor version is the lower 32 bits) 151 | DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version 152 | DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version 153 | DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version 154 | DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version 155 | DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device 156 | -------------------------------------------------------------------------------- /tests/unit/test_hardware.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import unittest 3 | from unittest import mock 4 | 5 | import pytest 6 | 7 | from hardware import ( 8 | get_bmc_address, 9 | get_cuda_version_from_driver, 10 | get_nvidia_driver_version, 11 | hwinfo, 12 | is_nvidia_driver_loaded, 13 | lshw, 14 | ) 15 | 16 | 17 | class TestHwinfo: 18 | @pytest.mark.parametrize( 19 | "hw_classes,expect_cmd,hwinfo_output,expect", 20 | [ 21 | ( 22 | [], 23 | ["hwinfo"], 24 | ( 25 | "" 26 | "============ start debug info ============" 27 | "random-string" 28 | "random-string" 29 | "random-string" 30 | "random-string" 31 | "=========== end debug info ============" 32 | "10: key-a\n" 33 | " [Created at pci.386]\n" 34 | " Unique ID: unique-id-a\n" 35 | " Parent ID: parent-id-a\n" 36 | "\n" 37 | "11: key-b\n" 38 | " [Created at pci.386]\n" 39 | " Unique ID: unique-id-b\n" 40 | " Parent ID: parent-id-b\n" 41 | ), 42 | { 43 | "10: key-a": ( 44 | "10: key-a\n" 45 | " [Created at pci.386]\n" 46 | " Unique ID: unique-id-a\n" 47 | " Parent ID: parent-id-a" 48 | ), 49 | "11: key-b": ( 50 | "11: key-b\n" 51 | " [Created at pci.386]\n" 52 | " Unique ID: 
unique-id-b\n" 53 | " Parent ID: parent-id-b\n" 54 | ), 55 | }, 56 | ), 57 | ( 58 | ["storage"], 59 | ["hwinfo", "--storage"], 60 | ( 61 | "" 62 | "10: key-a\n" 63 | " [Created at pci.386]\n" 64 | " Unique ID: unique-id-a\n" 65 | " Parent ID: parent-id-a\n" 66 | "\n" 67 | "11: key-b\n" 68 | " [Created at pci.386]\n" 69 | " Unique ID: unique-id-b\n" 70 | " Parent ID: parent-id-b\n" 71 | ), 72 | { 73 | "10: key-a": ( 74 | "10: key-a\n" 75 | " [Created at pci.386]\n" 76 | " Unique ID: unique-id-a\n" 77 | " Parent ID: parent-id-a" 78 | ), 79 | "11: key-b": ( 80 | "11: key-b\n" 81 | " [Created at pci.386]\n" 82 | " Unique ID: unique-id-b\n" 83 | " Parent ID: parent-id-b\n" 84 | ), 85 | }, 86 | ), 87 | ], 88 | ) 89 | @mock.patch("hardware.apt") 90 | @mock.patch("hardware.subprocess.check_output") 91 | def test_hwinfo_output( 92 | self, mock_subprocess, mock_apt, hw_classes, expect_cmd, hwinfo_output, expect 93 | ): 94 | mock_subprocess.return_value = hwinfo_output 95 | output = hwinfo(*hw_classes) 96 | mock_subprocess.assert_called_with(expect_cmd, text=True) 97 | assert output == expect 98 | 99 | 100 | class TestLshw(unittest.TestCase): 101 | @mock.patch("hardware.apt") 102 | @mock.patch("hardware.subprocess.check_output") 103 | def test_lshw_output(self, mock_subprocess, mock_apt): 104 | mock_subprocess.return_value = """[{"expected_output": 1}]""" 105 | for class_filter in [None, "storage"]: 106 | output = lshw(class_filter) 107 | if class_filter is not None: 108 | mock_subprocess.assert_called_with( 109 | f"lshw -json -c {class_filter}".split(), 110 | text=True, 111 | ) 112 | self.assertEqual(output, [{"expected_output": 1}]) 113 | else: 114 | mock_subprocess.assert_called_with( 115 | "lshw -json".split(), 116 | text=True, 117 | ) 118 | self.assertEqual(output, {"expected_output": 1}) 119 | 120 | @mock.patch("hardware.subprocess.check_output") 121 | def test_lshw_dict_output(self, mock_subprocess): 122 | mock_subprocess.return_value = """{"expected_output": 1}""" 123 
| output = lshw() 124 | mock_subprocess.assert_called_with( 125 | "lshw -json".split(), 126 | text=True, 127 | ) 128 | self.assertEqual(output, {"expected_output": 1}) 129 | 130 | @mock.patch( 131 | "hardware.subprocess.check_output", 132 | side_effect=subprocess.CalledProcessError(-1, "cmd"), 133 | return_value="[{}]", 134 | ) 135 | def test_lshw_error_handling(self, mock_subprocess): 136 | with self.assertRaises(subprocess.CalledProcessError): 137 | lshw() 138 | 139 | 140 | class TestGetBMCAddress(unittest.TestCase): 141 | @mock.patch("hardware.apt") 142 | @mock.patch("hardware.subprocess.check_output") 143 | def test_get_bmc_address(self, mock_check_output, mock_apt): 144 | mock_check_output.return_value = """ 145 | Set in Progress : Set Complete 146 | Auth Type Support : NONE MD5 PASSWORD 147 | Auth Type Enable : Callback : MD5 PASSWORD 148 | : User : MD5 PASSWORD 149 | : Operator : MD5 PASSWORD 150 | : Admin : MD5 PASSWORD 151 | : OEM : 152 | IP Address Source : Static Address 153 | IP Address : 10.244.120.100 154 | Subnet Mask : 255.255.252.0 155 | MAC Address : 5a:ba:3c:3b:b4:59 156 | SNMP Community String : 157 | BMC ARP Control : ARP Responses Enabled, Gratuitous ARP Disabled 158 | Default Gateway IP : 10.240.128.1 159 | 802.1q VLAN ID : Disabled 160 | 802.1q VLAN Priority : 0 161 | RMCP+ Cipher Suites : 0,1,2,3 162 | Cipher Suite Priv Max : XXXaXXXXXXXXXXX 163 | : X=Cipher Suite Unused 164 | : c=CALLBACK 165 | : u=USER 166 | : o=OPERATOR 167 | : a=ADMIN 168 | : O=OEM 169 | Bad Password Threshold : Not Available 170 | """.strip() 171 | 172 | output = get_bmc_address() 173 | self.assertEqual(output, "10.244.120.100") 174 | 175 | @mock.patch("hardware.apt") 176 | @mock.patch( 177 | "hardware.subprocess.check_output", 178 | side_effect=subprocess.CalledProcessError(-1, "cmd"), 179 | ) 180 | def test_get_bmc_address_error_handling(self, mock_subprocess, mock_apt): 181 | output = get_bmc_address() 182 | self.assertEqual(output, None) 183 | 184 | 185 | 
@pytest.mark.parametrize("path_exists,expected", [(True, True), (False, False)]) 186 | @mock.patch("hardware.Path.exists") 187 | def test_is_nvidia_driver_loaded(mock_path, path_exists, expected): 188 | mock_path.return_value = path_exists 189 | assert is_nvidia_driver_loaded() == expected 190 | 191 | 192 | @mock.patch("hardware.NVIDIA_DRIVER_PATH") 193 | def test_get_nvidia_driver_version(mock_driver_path): 194 | mock_driver_path.read_text.return_value = ( 195 | "NVRM version: NVIDIA UNIX x86_64 Kernel Module 570.172.08" 196 | ) 197 | result = get_nvidia_driver_version() 198 | assert result == 570 199 | 200 | 201 | @mock.patch("hardware.NVIDIA_DRIVER_PATH") 202 | def test_get_nvidia_driver_version_file_not_found(mock_driver_path): 203 | mock_driver_path.read_text.side_effect = FileNotFoundError 204 | 205 | with pytest.raises(FileNotFoundError): 206 | get_nvidia_driver_version() 207 | 208 | 209 | @pytest.mark.parametrize( 210 | "driver_version, expected", 211 | [ 212 | (590, 13), 213 | (580, 13), 214 | (570, 12), 215 | (525, 12), 216 | (500, 11), 217 | (450, 11), 218 | (400, 10), 219 | (390, 10), 220 | ], 221 | ) 222 | @mock.patch("hardware.get_nvidia_driver_version") 223 | def test_get_cuda_version_from_driver(mock_nvidia_driver, driver_version, expected): 224 | mock_nvidia_driver.return_value = driver_version 225 | assert get_cuda_version_from_driver() == expected 226 | --------------------------------------------------------------------------------