├── .wokeignore
├── .jujuignore
├── CODEOWNERS
├── .gitignore
├── .github
│   ├── renovate.json5
│   ├── workflows
│   │   ├── pull-request.yaml
│   │   ├── release.yaml
│   │   ├── tiobe-scan.yaml
│   │   ├── update-libs.yaml
│   │   ├── quality-gates.yaml
│   │   └── promote.yaml
│   ├── .jira_sync_config.yaml
│   ├── pull_request_template.md
│   └── ISSUE_TEMPLATE
│       ├── enhancement_proposal.yml
│       └── bug_report.yml
├── INTEGRATING.md
├── src
│   ├── prometheus_alert_rules
│   │   ├── always_firing_numeric.rule
│   │   └── always_firing_absent.rule
│   ├── grafana_dashboards
│   │   └── avalanche.json
│   ├── kubernetes_service.py
│   └── charm.py
├── tests
│   ├── integration
│   │   ├── test_metrics_endpoint.py
│   │   ├── test_upgrade_charm.py
│   │   ├── test_remote_write.py
│   │   ├── helpers.py
│   │   └── conftest.py
│   └── unit
│       ├── test_charm.py
│       └── test_disable_alerts.py
├── SECURITY.md
├── tox.ini
├── pyproject.toml
├── README.md
├── RELEASE.md
├── charmcraft.yaml
├── CONTRIBUTING.md
├── lib
│   └── charms
│       ├── observability_libs
│       │   └── v0
│       │       └── juju_topology.py
│       └── prometheus_k8s
│           ├── v1
│           │   └── prometheus_remote_write.py
│           └── v0
│               └── prometheus_scrape.py
└── LICENSE

/.wokeignore:
--------------------------------------------------------------------------------
1 | src/prometheus_alert_rules
2 | 
--------------------------------------------------------------------------------
/.jujuignore:
--------------------------------------------------------------------------------
1 | /venv
2 | *.py[cod]
3 | *.charm
4 | 
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @canonical/Observability
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | venv/
2 | build/
3 | *.charm
4 | 
5 | .coverage
6 | __pycache__/
7 | *.py[cod]
8 | .tox/
9 | 
--------------------------------------------------------------------------------
/.github/renovate.json5:
--------------------------------------------------------------------------------
1 | {
2 |   "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3 |   "extends": [
4 |     "github>canonical/observability//.github/renovate/charms.json5",
5 |   ],
6 | }
7 | 
--------------------------------------------------------------------------------
/.github/workflows/pull-request.yaml:
--------------------------------------------------------------------------------
1 | name: Pull Requests
2 | 
3 | on:
4 |   pull_request:
5 |     branches:
6 |       - main
7 |       - track/**
8 | 
9 | jobs:
10 |   pull-request:
11 |     name: PR
12 |     uses: canonical/observability/.github/workflows/charm-pull-request.yaml@v1
13 |     secrets: inherit
14 | 
--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
1 | name: Release Charm
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |       - track/**
8 | 
9 | jobs:
10 |   release:
11 |     uses: canonical/observability/.github/workflows/charm-release.yaml@v1
12 |     secrets: inherit
13 |     with:
14 |       default-track: dev
15 | 
--------------------------------------------------------------------------------
/INTEGRATING.md:
--------------------------------------------------------------------------------
1 | ## Integrating avalanche-operator
2 | avalanche-operator integrates with any charm that `requires` the
3 | `prometheus_scrape` interface.
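As an illustration, the requirer side of that relation could look like the following minimal sketch. It assumes the `prometheus_scrape` charm library vendored under `lib/` in this repository, and a consumer charm (hypothetical name `MyScraperCharm`) whose metadata declares a `metrics-endpoint` requirer endpoint:

```python
from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer
from ops.charm import CharmBase
from ops.main import main
from ops.model import ActiveStatus


class MyScraperCharm(CharmBase):
    """Hypothetical charm that scrapes avalanche's metrics endpoint."""

    def __init__(self, *args):
        super().__init__(*args)
        # The consumer defaults to the "metrics-endpoint" relation name
        self.metrics_consumer = MetricsEndpointConsumer(self)
        self.framework.observe(
            self.metrics_consumer.on.targets_changed, self._on_targets_changed
        )

    def _on_targets_changed(self, event):
        # Scrape jobs (and alert rules) advertised by avalanche over the relation
        jobs = self.metrics_consumer.jobs()
        self.unit.status = ActiveStatus(f"scraping {len(jobs)} job(s)")


if __name__ == "__main__":
    main(MyScraperCharm)
```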
4 | 
5 | ### Related charms
6 | #### Prometheus
7 | Avalanche is intended for load-testing [prometheus][Prometheus operator].
8 | 
9 | [Prometheus operator]: https://charmhub.io/prometheus-k8s
10 | 
--------------------------------------------------------------------------------
/.github/workflows/tiobe-scan.yaml:
--------------------------------------------------------------------------------
1 | name: Tiobe TiCS Analysis
2 | 
3 | on:
4 |   workflow_dispatch:
5 |   schedule:
6 |     - cron: "0 0 * * 1" # Runs at midnight UTC every Monday
7 | 
8 | jobs:
9 |   tics:
10 |     name: TiCS
11 |     uses: canonical/observability/.github/workflows/charm-tiobe-scan.yaml@v1
12 |     secrets: inherit
13 | 
--------------------------------------------------------------------------------
/.github/.jira_sync_config.yaml:
--------------------------------------------------------------------------------
1 | settings:
2 |   jira_project_key: "OBC"
3 |   status_mapping:
4 |     opened: Untriaged
5 |     closed: done
6 |     not_planned: rejected
7 | 
8 |   components:
9 |     - avalanche
10 | 
11 |   add_gh_comment: false
12 |   sync_description: false
13 |   sync_comments: false
14 | 
15 |   label_mapping:
16 |     "Type: Enhancement": Story
--------------------------------------------------------------------------------
/.github/workflows/update-libs.yaml:
--------------------------------------------------------------------------------
1 | name: Auto-update Charm Libraries
2 | on:
3 |   # Manual trigger
4 |   workflow_dispatch:
5 |   # Check the upstream regularly, every four hours
6 |   schedule:
7 |     - cron: "0 0,4,8,12,16,20 * * *"
8 | 
9 | jobs:
10 |   update-lib:
11 |     name: Check libraries
12 |     uses: canonical/observability/.github/workflows/charm-update-libs.yaml@v1
13 |     secrets: inherit
14 | 
15 | 
--------------------------------------------------------------------------------
/.github/workflows/quality-gates.yaml:
--------------------------------------------------------------------------------
1 | name: Quality Gates
2 | 
3 | on:
4 |   # Manual trigger
5 |   workflow_dispatch:
6 |   # Run the quality checks periodically
7 |   # FIXME: adjust the frequency as needed once we have actual gates in place
8 |   # schedule:
9 |   #   - cron: "0 0 * * Tue"
10 | 
11 | 
12 | jobs:
13 |   quality-gates:
14 |     name: Run quality gates
15 |     uses: canonical/observability/.github/workflows/charm-quality-gates.yaml@v1
16 |     secrets: inherit
17 | 
--------------------------------------------------------------------------------
/src/prometheus_alert_rules/always_firing_numeric.rule:
--------------------------------------------------------------------------------
1 | groups:
2 |   - name: AlwaysFiringDueToNumericValue
3 |     rules:
4 |       - alert: AlwaysFiringDueToNumericValue
5 |         expr: avalanche_metric_mmmmm_0_0{series_id="0"} > -1
6 |         for: 0m
7 |         labels:
8 |           severity: High
9 |         annotations:
10 |           summary: "Instance {{ $labels.instance }} dummy alarm (always firing)"
11 |           description: "{{ $labels.instance }} of job {{ $labels.job }} is firing the dummy alarm."
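# Note: avalanche emits non-negative gauge values (upstream generator default,
# stated here as an assumption), so any scraped sample satisfies `> -1`; with
# `for: 0m` the alert fires immediately, exercising the full alerting pipeline.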
12 | 
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | ## Issue
2 | 
3 | 
4 | 
5 | ## Solution
6 | 
7 | 
8 | 
9 | ## Context
10 | 
11 | 
12 | 
13 | ## Testing Instructions
14 | 
15 | 
16 | 
17 | ## Upgrade Notes
18 | 
--------------------------------------------------------------------------------
/src/prometheus_alert_rules/always_firing_absent.rule:
--------------------------------------------------------------------------------
1 | groups:
2 |   - name: AlwaysFiringDueToAbsentMetric
3 |     rules:
4 |       - alert: AlwaysFiringDueToAbsentMetric
5 |         expr: absent(some_metric_name_that_shouldnt_exist{job="non_existing_job"})
6 |         for: 0m
7 |         labels:
8 |           severity: High
9 |         annotations:
10 |           summary: "Instance {{ $labels.instance }} dummy alarm (always firing)"
11 |           description: "{{ $labels.instance }} of job {{ $labels.job }} is firing the dummy alarm."
12 | 
--------------------------------------------------------------------------------
/.github/workflows/promote.yaml:
--------------------------------------------------------------------------------
1 | name: Promote Charm
2 | 
3 | on:
4 |   workflow_dispatch:
5 |     inputs:
6 |       promotion:
7 |         type: choice
8 |         description: Channel to promote from
9 |         options:
10 |           - edge -> beta
11 |           - beta -> candidate
12 |           - candidate -> stable
13 | 
14 | jobs:
15 |   promote:
16 |     name: Promote
17 |     uses: canonical/observability/.github/workflows/charm-promote.yaml@v1
18 |     with:
19 |       promotion: ${{ github.event.inputs.promotion }}
20 |     secrets: inherit
21 | 
--------------------------------------------------------------------------------
/tests/integration/test_metrics_endpoint.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright 2021 Canonical Ltd.
3 | # See LICENSE file for licensing details.
4 | import jubilant
5 | import pytest
6 | 
7 | 
8 | @pytest.mark.abort_on_fail
9 | async def test_avalanche_is_scraped_by_prometheus(juju: jubilant.Juju, charm, charm_resources):
10 |     """Deploy avalanche together with prometheus and relate them over metrics-endpoint."""
11 |     juju.deploy(charm, "avalanche", resources=charm_resources)
12 |     juju.deploy("prometheus-k8s", "prometheus", channel="2/edge", trust=True)
13 |     juju.integrate("avalanche:metrics-endpoint", "prometheus:metrics-endpoint")
14 |     juju.wait(jubilant.all_active)
15 | 
--------------------------------------------------------------------------------
/tests/integration/test_upgrade_charm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright 2021 Canonical Ltd.
3 | # See LICENSE file for licensing details.
4 | import jubilant
5 | import pytest
6 | 
7 | 
8 | @pytest.mark.abort_on_fail
9 | async def test_upgrade_charm(juju: jubilant.Juju, charm):
10 |     """Deploy avalanche from Charmhub, then refresh to the locally built charm."""
11 |     juju.deploy(
12 |         "avalanche-k8s",
13 |         "avalanche",
14 |         channel="2/edge",
15 |         config={"metric_count": "33", "value_interval": "99999"},
16 |     )
17 |     juju.wait(jubilant.all_active)
18 |     juju.refresh("avalanche", path=charm)
19 |     juju.wait(jubilant.all_active)
20 | 
--------------------------------------------------------------------------------
/tests/integration/test_remote_write.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright 2021 Canonical Ltd.
3 | # See LICENSE file for licensing details.
4 | import jubilant
5 | import pytest
6 | 
7 | 
8 | @pytest.mark.abort_on_fail
9 | async def test_avalanche_remote_writes_to_prometheus(juju: jubilant.Juju, charm, charm_resources):
10 |     """Deploy avalanche together with prometheus and relate them over remote-write."""
11 |     juju.deploy(charm, "avalanche", resources=charm_resources)
12 |     juju.deploy("prometheus-k8s", "prometheus", channel="2/edge", trust=True)
13 |     juju.integrate("avalanche:send-remote-write", "prometheus:receive-remote-write")
14 |     juju.wait(jubilant.all_active)
15 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/enhancement_proposal.yml:
--------------------------------------------------------------------------------
1 | name: Enhancement Proposal
2 | description: File an enhancement proposal
3 | labels: ["Type: Enhancement", "Status: Triage"]
4 | body:
5 |   - type: markdown
6 |     attributes:
7 |       value: >
8 |         Thanks for taking the time to fill out this enhancement proposal! Before submitting your issue, please make
9 |         sure there isn't already a prior issue concerning this. If there is, please join that discussion instead.
10 |   - type: textarea
11 |     id: enhancement-proposal
12 |     attributes:
13 |       label: Enhancement Proposal
14 |       description: >
15 |         Describe the enhancement you would like to see in as much detail as needed.
16 |     validations:
17 |       required: true
18 | 
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | The easiest way to report a security issue is through a [GitHub Private Security Report](https://github.com/canonical/avalanche-k8s-operator/security/advisories/new)
2 | with a description of the issue, the steps you took to create the issue, affected versions, and, if known, mitigations for the issue.
3 | 
4 | Alternatively, to report a security issue via email, please email [security@ubuntu.com](mailto:security@ubuntu.com) with a description of the issue,
5 | the steps you took to create the issue, affected versions, and, if known, mitigations for the issue.
6 | 
7 | The [Ubuntu Security disclosure and embargo policy](https://ubuntu.com/security/disclosure-policy) contains more information about what you can expect
8 | when you contact us and what we expect from you.
9 | 
--------------------------------------------------------------------------------
/tests/integration/helpers.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Canonical Ltd.
2 | # See LICENSE file for licensing details.
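# NOTE: unlike the tests in this package, which drive Juju through jubilant,
# these helpers still take pytest-operator's `ops_test` fixture (python-libjuju).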
3 | 4 | import logging 5 | 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | async def get_unit_address(ops_test, app_name: str, unit_num: int) -> str: 10 | status = await ops_test.model.get_status() # noqa: F821 11 | return status["applications"][app_name]["units"][f"{app_name}/{unit_num}"]["address"] 12 | 13 | 14 | async def get_config_values(ops_test, app_name) -> dict: 15 | """Return the app's config, but filter out keys that do not have a value.""" 16 | config = await ops_test.model.applications[app_name].get_config() 17 | # Need to convert the value to string because set_config only takes strings but get_config 18 | # may return non-strings 19 | # https://github.com/juju/python-libjuju/issues/631 20 | # https://github.com/juju/python-libjuju/issues/630 21 | return {key: str(config[key]["value"]) for key in config if "value" in config[key]} 22 | -------------------------------------------------------------------------------- /tests/unit/test_charm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | import unittest 5 | 6 | from ops.model import ActiveStatus 7 | from ops.testing import Harness 8 | 9 | from charm import AvalancheCharm 10 | 11 | 12 | class TestCharm(unittest.TestCase): 13 | def setUp(self): 14 | self.harness = Harness(AvalancheCharm) 15 | self.addCleanup(self.harness.cleanup) 16 | self.harness.handle_exec('avalanche', ['/bin/avalanche', '--version'], result='0.0') 17 | self.harness.begin_with_initial_hooks() 18 | 19 | def test_services_running(self): 20 | """Check that the supplied service is running and charm is ActiveStatus.""" 21 | service = self.harness.model.unit.get_container( 22 | AvalancheCharm._container_name 23 | ).get_service(AvalancheCharm._service_name) 24 | self.assertTrue(service.is_running()) 25 | self.assertEqual(self.harness.model.unit.status, ActiveStatus()) 26 | -------------------------------------------------------------------------------- /tests/integration/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | import os 4 | from pathlib import Path 5 | from typing import Dict 6 | 7 | import jubilant 8 | import pytest 9 | import sh 10 | import yaml 11 | 12 | 13 | @pytest.fixture(scope="module") 14 | async def charm(): 15 | """Charm used for integration testing.""" 16 | if charm_file := os.environ.get("CHARM_PATH"): 17 | return Path(charm_file) 18 | 19 | charm = sh.charmcraft.pack() # type: ignore 20 | assert charm 21 | return charm 22 | 23 | 24 | @pytest.fixture(scope="module") 25 | def charm_resources(metadata_file="charmcraft.yaml") -> Dict[str, str]: 26 | with open(metadata_file, "r") as file: 27 | metadata = yaml.safe_load(file) 28 | resources = {} 29 | for res, data in metadata["resources"].items(): 30 | resources[res] = data["upstream-source"] 31 | return resources 32 | 33 | 34 | @pytest.fixture(scope="module") 35 | def juju(): 36 | keep_models: bool = os.environ.get("KEEP_MODELS") is not None 37 | with jubilant.temp_model(keep=keep_models) as juju: 38 | yield juju 39 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | 4 | [tox] 5 | skipsdist=True 6 | skip_missing_interpreters = True 7 | envlist = lint, static, unit 8 | 9 | [vars] 10 | src_path = {toxinidir}/src 11 | tst_path = {toxinidir}/tests 12 | all_path = {[vars]src_path} {[vars]tst_path} 13 | uv_flags = --frozen --isolated --extra=dev 14 | 15 | [testenv] 16 | allowlist_externals = uv 17 | basepython = python3 18 | setenv = 19 | PYTHONPATH = {toxinidir}:{toxinidir}/lib:{[vars]src_path} 20 | PYTHONBREAKPOINT=ipdb.set_trace 21 | passenv = 22 | PYTHONPATH 23 | CHARM_PATH 24 | 25 | [testenv:lock] 26 | description = Update uv.lock with the latest deps 27 | commands = 28 | uv lock --upgrade --no-cache 29 | 30 | [testenv:lint] 31 | description = Lint the code 32 | commands = 33 | uv run {[vars]uv_flags} ruff check {[vars]all_path} 34 | 35 | [testenv:static] 36 | description = Run static checks 37 | allowlist_externals = 38 | {[testenv]allowlist_externals} 39 | /usr/bin/env 40 | commands = 41 | uv run {[vars]uv_flags} pyright {[vars]all_path} 42 | 43 | [testenv:fmt] 44 | description = "Format the code" 45 | commands = 46 | uv run {[vars]uv_flags} ruff check --fix-only {[vars]all_path} 47 | 48 | [testenv:unit] 49 | description = Run unit tests 50 | allowlist_externals= 51 | {[testenv]allowlist_externals} 52 | /usr/bin/env 53 | commands = 54 | uv run {[vars]uv_flags} coverage run --source={[vars]src_path} -m pytest \ 55 | {[vars]tst_path}/unit {posargs} 56 | uv run {[vars]uv_flags} coverage report 57 | 58 | [testenv:integration] 59 | description = Run integration tests 60 | commands = 61 | uv run {[vars]uv_flags} pytest --exitfirst {[vars]tst_path}/integration {posargs} 62 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | [project]
4 | name = "avalanche-k8s"
5 | version = "0.0"
6 | requires-python = "~=3.8"
7 | 
8 | dependencies = [
9 |     "ops",
10 |     "PyYAML",
11 |     "kubernetes",
12 |     "jinja2<3",
13 |     "cryptography",
14 |     "pydantic",
15 |     "cosl",
16 | ]
17 | 
18 | [project.optional-dependencies]
19 | dev = [
20 |     # Linting
21 |     "ruff",
22 |     "codespell",
23 |     # Static
24 |     "pyright",
25 |     # Unit
26 |     "pytest",
27 |     "coverage[toml]",
28 |     "ops[testing]",
29 |     "markupsafe==2.0.1",
30 |     # Integration
31 |     "jubilant",
32 |     "sh",
33 |     "pytest-asyncio",
34 | ]
35 | 
36 | # Testing tools configuration
37 | [tool.coverage.run]
38 | branch = true
39 | 
40 | [tool.coverage.report]
41 | show_missing = true
42 | 
43 | # Linting tools configuration
44 | [tool.ruff]
45 | line-length = 99
46 | extend-exclude = ["__pycache__", "*.egg_info"]
47 | lint.select = ["E", "W", "F", "C", "N", "R", "D", "I001"]
48 | # Ignore E501: line length is handled by the formatter
49 | # Ignore D107: missing docstring in __init__
50 | lint.ignore = ["E501", "D107", "N818", "RET504"]
51 | # D100, D101, D102, D103: Ignore missing docstrings in tests
52 | lint.per-file-ignores = {"tests/*" = ["D100","D101","D102","D103"]}
53 | 
54 | [tool.ruff.lint.pydocstyle]
55 | convention = "google"
56 | 
57 | # Static analysis tools configuration
58 | [tool.pyright]
59 | extraPaths = ["lib"]
60 | pythonVersion = "3.8"
61 | pythonPlatform = "Linux"
62 | 
63 | [tool.pytest.ini_options]
64 | minversion = "6.0"
65 | log_cli_level = "INFO"
66 | asyncio_mode = "auto"
67 | addopts = "--tb=native --verbose --capture=no --log-cli-level=INFO"
68 | 
69 | [tool.codespell]
70 | skip = ".git,.tox,build,venv*"
71 | ignore-words-list = "assertIn"
72 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Avalanche Operator (k8s)
2 | 
3 | [![Charmhub Badge](https://charmhub.io/avalanche-k8s/badge.svg)](https://charmhub.io/avalanche-k8s)
4 | [![Release](https://github.com/canonical/avalanche-k8s-operator/actions/workflows/release.yaml/badge.svg)](https://github.com/canonical/avalanche-k8s-operator/actions/workflows/release.yaml)
5 | [![Discourse Status](https://img.shields.io/discourse/status?server=https%3A%2F%2Fdiscourse.charmhub.io&style=flat&label=CharmHub%20Discourse)](https://discourse.charmhub.io)
6 | 
7 | ## Description
8 | 
9 | [Avalanche][Avalanche source] is an [OpenMetrics][OpenMetrics source] endpoint
10 | load tester.
11 | 
12 | ## Usage
13 | 
14 | To use Avalanche, relate it to a charm that `requires` the
15 | `prometheus_scrape` relation interface, such as Prometheus.
16 | 
17 | For more information see [INTEGRATING](INTEGRATING.md).
18 | 
19 | You also need a working Kubernetes environment and a bootstrapped
20 | Juju controller of version 2.9+, with a model ready to use with the Kubernetes
21 | cloud.
22 | 
23 | Example deployment:
24 | 
25 | ```shell
26 | juju deploy avalanche-k8s
27 | ```
28 | 
29 | Then relate it to [prometheus][Prometheus operator]:
30 | ```shell
31 | juju deploy prometheus-k8s
32 | juju relate prometheus-k8s avalanche-k8s
33 | ```
34 | 
35 | ### Scale Out Usage
36 | To add additional Avalanche units (increasing the generated load),
37 | 
38 | ```shell
39 | juju add-unit avalanche-k8s
40 | ```
41 | 
42 | ## Relations
43 | Currently, the supported relations are:
44 | - `metrics-endpoint`, for being scraped by [prometheus][Prometheus operator]; `send-remote-write`, for pushing samples to it; and `grafana-dashboard`, for supplying a dashboard (see [charmcraft.yaml](charmcraft.yaml)).
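To sanity-check a deployed unit, you can fetch the exposition endpoint directly. A minimal sketch, assuming avalanche's upstream default port (9001) and a unit address taken from `juju status`:

```python
import urllib.request

UNIT_ADDRESS = "10.1.2.3"  # replace with the unit address from `juju status`

with urllib.request.urlopen(f"http://{UNIT_ADDRESS}:9001/metrics", timeout=10) as resp:
    exposition = resp.read().decode()

# Count how many avalanche series are currently being served
print(sum(1 for line in exposition.splitlines() if line.startswith("avalanche_metric_")))
```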
45 | 
46 | ## OCI Images
47 | This charm can be used with the following image:
48 | - `quay.io/freshtracks.io/avalanche`
49 | 
50 | 
51 | [Avalanche source]: https://github.com/open-fresh/avalanche
52 | [OpenMetrics source]: https://github.com/OpenObservability/OpenMetrics
53 | [Prometheus operator]: https://charmhub.io/prometheus-k8s
54 | 
--------------------------------------------------------------------------------
/RELEASE.md:
--------------------------------------------------------------------------------
1 | # Release Process
2 | 
3 | ## Overview
4 | 
5 | At any given time there are three revisions of the charm available on [CharmHub.io](https://charmhub.io/), for each of the following channels:
6 | 
7 | 1. `latest/stable` is a well tested production ready version of the Charm.
8 | 2. `latest/candidate` is a feature ready next version of the stable release, currently in testing.
9 | 3. `latest/edge` is the bleeding edge developer version of the charm. While we really try not to, it may break and introduce regressions.
10 | 
11 | Currently, the Avalanche charm does not make use of the `latest/beta` channel.
12 | For more information about CharmHub channels, refer to the [Juju charm store](https://discourse.charmhub.io/t/the-juju-charm-store) documentation.
13 | 
14 | ## When to create which revisions
15 | 
16 | * **Stable revisions** are done in consultation with the product manager and engineering manager, when the `candidate` revision has been well tested and is deemed ready for production.
17 | * **Candidate revisions** are done when the charm reaches a state of feature completion with respect to the next planned `stable` release.
18 | * **Edge revisions** are released at the developer's discretion, potentially every time something is merged into `main` and the unit tests pass.
19 | 
20 | ## How to publish revisions
21 | 
22 | Refer to the [Publish your operator in Charmhub](https://discourse.charmhub.io/t/publish-your-operator-in-charmhub) documentation.
23 | After a `latest/stable` release, it is expected that the version of the charm is the same as the one in `latest/candidate`, and those two channels will diverge again when we are ramping up through `latest/candidate` releases for a new `latest/stable` release.
24 | 
25 | ## A note on granularity of revisions
26 | 
27 | We believe in shipping often and with confidence.
28 | It is perfectly acceptable to have a new `latest/stable` release containing just one bug fix or a small new feature with respect to the last one.
29 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yml:
--------------------------------------------------------------------------------
1 | name: Bug Report
2 | description: File a bug report
3 | labels: ["Type: Bug", "Status: Triage"]
4 | body:
5 |   - type: markdown
6 |     attributes:
7 |       value: >
8 |         Thanks for taking the time to fill out this bug report! Before submitting your issue, please make
9 |         sure you are using the latest version of the charm. If not, please try upgrading to the latest edge release prior to
10 |         posting your report to make sure it's not already solved.
11 |   - type: textarea
12 |     id: bug-description
13 |     attributes:
14 |       label: Bug Description
15 |       description: >
16 |         Describe the bug. If applicable, add screenshots to
17 |         help explain the problem you are facing.
18 | validations: 19 | required: true 20 | - type: textarea 21 | id: reproduction 22 | attributes: 23 | label: To Reproduce 24 | description: > 25 | Please provide the output of `juju export-bundle` and step-by-step instructions for how to reproduce the behavior. 26 | A deployment diagram could be handy too. See https://discourse.charmhub.io/t/9269 for examples. 27 | placeholder: | 28 | 1. `juju deploy ...` 29 | 2. `juju relate ...` 30 | 3. `juju status --relations` 31 | validations: 32 | required: true 33 | - type: textarea 34 | id: environment 35 | attributes: 36 | label: Environment 37 | description: > 38 | We need to know a bit more about the context in which you run the charm. 39 | - Are you running Juju locally, on lxd, in multipass or on some other platform? 40 | - What track and channel you deployed the charm from (ie. `latest/edge` or similar). 41 | - Version of any applicable components, like the juju snap, the model controller, lxd, microk8s, and/or multipass. 42 | validations: 43 | required: true 44 | - type: textarea 45 | id: logs 46 | attributes: 47 | label: Relevant log output 48 | description: > 49 | Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 50 | Fetch the logs using `juju debug-log --replay` and `kubectl logs ...`. Additional details available in the juju docs 51 | at https://juju.is/docs/olm/juju-logs 52 | render: shell 53 | validations: 54 | required: true 55 | - type: textarea 56 | id: additional-context 57 | attributes: 58 | label: Additional context 59 | -------------------------------------------------------------------------------- /tests/unit/test_disable_alerts.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
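# These tests exercise the `forward_alert_rules` config option using the
# state-transition API from ops[testing]: build an input State, run a single
# event through a Context, and assert on the output relation databag.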
3 | 4 | 5 | import pytest 6 | from ops.testing import Container, Context, Exec, Relation, State 7 | 8 | import charm 9 | 10 | 11 | @pytest.fixture(scope="function") 12 | def avalanche_container(): 13 | return Container( 14 | "avalanche", 15 | can_connect=True, 16 | execs={Exec(["/bin/avalanche", "--version"], return_code=0, stdout="0.0")}, 17 | ) 18 | 19 | 20 | @pytest.mark.parametrize("forwarding", (True, False)) 21 | def test_forward_alert_rules_scrape(forwarding, avalanche_container): 22 | # GIVEN these relations 23 | prometheus_relation = Relation("send-remote-write", remote_app_name="prometheus") 24 | state = State( 25 | leader=True, 26 | containers={avalanche_container}, 27 | relations=[ 28 | prometheus_relation, 29 | ], 30 | config={"forward_alert_rules": forwarding}, 31 | ) 32 | # WHEN the charm receives a config-changed event 33 | ctx = Context( 34 | charm_type=charm.AvalancheCharm, 35 | ) 36 | with ctx(ctx.on.config_changed(), state) as mgr: 37 | output_state = mgr.run() 38 | # THEN the charm is forwarding the alerts 39 | prometheus_relation_out = output_state.get_relation(prometheus_relation.id) 40 | if forwarding: 41 | assert prometheus_relation_out.local_app_data["alert_rules"] != "{}" 42 | else: 43 | assert prometheus_relation_out.local_app_data["alert_rules"] == "{}" 44 | 45 | 46 | @pytest.mark.parametrize("forwarding", (True, False)) 47 | def test_forward_alert_rules(forwarding, avalanche_container): 48 | # GIVEN these relations 49 | prometheus_relation = Relation("send-remote-write", remote_app_name="prometheus") 50 | state = State( 51 | leader=True, 52 | containers={avalanche_container}, 53 | relations=[ 54 | prometheus_relation, 55 | ], 56 | config={"forward_alert_rules": forwarding}, 57 | ) 58 | # WHEN the charm receives a config-changed event 59 | ctx = Context( 60 | charm_type=charm.AvalancheCharm, 61 | ) 62 | with ctx(ctx.on.config_changed(), state) as mgr: 63 | output_state = mgr.run() 64 | # THEN the charm is forwarding the alerts 65 | prometheus_relation_out = output_state.get_relation(prometheus_relation.id) 66 | if forwarding: 67 | assert prometheus_relation_out.local_app_data["alert_rules"] != "{}" 68 | else: 69 | assert prometheus_relation_out.local_app_data["alert_rules"] == "{}" 70 | -------------------------------------------------------------------------------- /charmcraft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | name: avalanche-k8s 4 | type: charm 5 | summary: Load tester for openmetrics endpoints. 6 | description: Avalanche is a prometheus load tester. 7 | 8 | links: 9 | website: https://charmhub.io/avalanche-k8s 10 | source: https://github.com/canonical/avalanche-k8s-operator 11 | issues: https://github.com/canonical/avalanche-k8s-operator/issues 12 | 13 | assumes: 14 | - k8s-api 15 | 16 | platforms: 17 | ubuntu@24.04:amd64: 18 | 19 | parts: 20 | charm: 21 | source: . 
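    # The uv plugin builds the charm from pyproject.toml and uv.lock; the
    # astral-uv build snap provides uv itself, and git is required for the
    # `git describe` call in override-build below.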
22 | plugin: uv 23 | build-packages: [git] 24 | build-snaps: [astral-uv] 25 | override-build: | 26 | craftctl default 27 | git describe --always > $CRAFT_PART_INSTALL/version 28 | 29 | containers: 30 | avalanche: 31 | resource: avalanche-image 32 | 33 | resources: 34 | avalanche-image: 35 | type: oci-image 36 | description: OCI image for avalanche 37 | upstream-source: quay.io/freshtracks.io/avalanche 38 | 39 | provides: 40 | metrics-endpoint: 41 | interface: prometheus_scrape 42 | grafana-dashboard: 43 | interface: grafana_dashboard 44 | 45 | requires: 46 | send-remote-write: 47 | interface: prometheus_remote_write 48 | 49 | peers: 50 | replicas: 51 | interface: avalanche_replica 52 | 53 | config: 54 | options: 55 | metric_count: 56 | type: int 57 | description: Number of metrics to serve. 58 | default: 500 59 | label_count: 60 | type: int 61 | description: Number of labels per-metric. 62 | default: 10 63 | series_count: 64 | type: int 65 | description: Number of series per-metric. 66 | default: 10 67 | metricname_length: 68 | type: int 69 | description: Modify length of metric names. 70 | default: 5 71 | labelname_length: 72 | type: int 73 | description: Modify length of label names. 74 | default: 5 75 | value_interval: 76 | type: int 77 | description: Change series values every {interval} seconds. 78 | default: 30 79 | series_interval: 80 | type: int 81 | description: > 82 | Change series_id label values every {interval} seconds. 83 | Avalanche's CLI default value is 60, but this is too low and quickly overloads the scraper. 84 | Using 3600000 (10k hours ~ 1 year) in lieu of "inf" (never refresh). 85 | default: 36000000 86 | metric_interval: 87 | type: int 88 | description: > 89 | Change __name__ label values every {interval} seconds. 90 | Avalanche's CLI default value is 120, but this is too low and quickly overloads the scraper. 91 | Using 3600000 (10k hours ~ 1 year) in lieu of "inf" (never refresh). 92 | default: 36000000 93 | forward_alert_rules: 94 | type: boolean 95 | description: > 96 | Toggle forwarding of alert rules. 97 | default: True 98 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to avalanche-operator 2 | 3 | ![GitHub License](https://img.shields.io/github/license/canonical/avalanche-k8s-operator) 4 | ![GitHub Commit Activity](https://img.shields.io/github/commit-activity/y/canonical/avalanche-k8s-operator) 5 | ![GitHub Lines of Code](https://img.shields.io/tokei/lines/github/canonical/avalanche-k8s-operator) 6 | ![GitHub Issues](https://img.shields.io/github/issues/canonical/avalanche-k8s-operator) 7 | ![GitHub PRs](https://img.shields.io/github/issues-pr/canonical/avalanche-k8s-operator) 8 | ![GitHub Contributors](https://img.shields.io/github/contributors/canonical/avalanche-k8s-operator) 9 | ![GitHub Watchers](https://img.shields.io/github/watchers/canonical/avalanche-k8s-operator?style=social) 10 | 11 | The intended use case of this operator is to be deployed together with 12 | prometheus-operator. 13 | 14 | ## Bugs and pull requests 15 | - Generally, before developing enhancements to this charm, you should consider 16 | opening an issue explaining your use case. 
17 | - If you would like to chat with us about your use-cases or proposed
18 |   implementation, you can reach us at
19 |   [Canonical Mattermost public channel](https://chat.charmhub.io/charmhub/channels/charm-dev)
20 |   or [Discourse](https://discourse.charmhub.io/).
21 | - All enhancements require review before being merged. Apart from
22 |   code quality and test coverage, the review will also take into
23 |   account the resulting user experience for Juju administrators using
24 |   this charm.
25 | 
26 | 
27 | ## Setup
28 | 
29 | A typical setup using [snaps](https://snapcraft.io/) can be found in the
30 | [Juju docs](https://juju.is/docs/sdk/dev-setup).
31 | 
32 | ## Developing
33 | 
34 | Dependencies are managed with [uv](https://github.com/astral-sh/uv). Create
35 | and activate a development virtualenv with
36 | 
37 | ```shell
38 | uv sync --extra dev
39 | source .venv/bin/activate
40 | ```
41 | 
42 | Tests are driven through tox, so install it if you do not have it already
43 | 
44 | ```shell
45 | uv tool install tox
46 | ```
47 | 
48 | Later on, refresh the locked dependencies as needed
49 | 
50 | ```shell
51 | tox -e lock
52 | ```
53 | 
54 | ### Testing
55 | 
56 | ```shell
57 | tox -e fmt # update your code according to linting rules
58 | tox -e lint # code style
59 | tox -e static # static analysis
60 | tox -e unit # unit tests
61 | ```
62 | 
63 | tox creates a virtual environment for every tox environment defined in
64 | [tox.ini](tox.ini). To activate a tox environment for manual testing,
65 | 
66 | ```shell
67 | source .tox/unit/bin/activate
68 | ```
69 | 
70 | ## Build charm
71 | 
72 | Build the charm in this git repository using
73 | 
74 | ```shell
75 | charmcraft pack
76 | ```
77 | 
78 | ## Usage
79 | ### Tested images
80 | - [quay.io/freshtracks.io/avalanche](https://quay.io/freshtracks.io/avalanche)
81 | 
82 | ### Deploy Avalanche
83 | 
84 | ```shell
85 | juju deploy ./avalanche-k8s_*.charm \
86 |     --resource avalanche-image=quay.io/freshtracks.io/avalanche
87 | ```
88 | 
89 | ## Code overview
90 | - The main charm class is `AvalancheCharm`, which responds to config changes
91 |   (via `ConfigChangedEvent`) and application upgrades (via
92 |   `UpgradeCharmEvent`).
93 | - All lifecycle events call a common hook, `_common_exit_hook`, after executing
94 |   their own business logic. This pattern simplifies state tracking and improves
95 |   consistency (see the sketch below).
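A minimal sketch of that pattern (handler bodies elided; the real implementation lives in `src/charm.py`):

```python
from ops.charm import CharmBase


class AvalancheCharm(CharmBase):
    def __init__(self, *args):
        super().__init__(*args)
        self.framework.observe(self.on.config_changed, self._on_config_changed)
        self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm)

    def _on_config_changed(self, event):
        # ... event-specific logic ...
        self._common_exit_hook()

    def _on_upgrade_charm(self, event):
        # ... event-specific logic ...
        self._common_exit_hook()

    def _common_exit_hook(self) -> None:
        """Re-derive the desired state and update the workload and unit status in one place."""
        ...
```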
96 | 97 | ## Design choices 98 | NTA 99 | 100 | ## Roadmap 101 | TBD 102 | -------------------------------------------------------------------------------- /src/grafana_dashboards/avalanche.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "grafana", 8 | "uid": "-- Grafana --" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "type": "dashboard" 15 | } 16 | ] 17 | }, 18 | "editable": true, 19 | "fiscalYearStartMonth": 0, 20 | "graphTooltip": 0, 21 | "id": 22, 22 | "links": [], 23 | "panels": [ 24 | { 25 | "datasource": { 26 | "type": "prometheus", 27 | "uid": "PB84469DE42D2E8C3" 28 | }, 29 | "fieldConfig": { 30 | "defaults": { 31 | "color": { 32 | "mode": "palette-classic" 33 | }, 34 | "custom": { 35 | "axisBorderShow": false, 36 | "axisCenteredZero": false, 37 | "axisColorMode": "text", 38 | "axisLabel": "", 39 | "axisPlacement": "auto", 40 | "barAlignment": 0, 41 | "barWidthFactor": 0.6, 42 | "drawStyle": "line", 43 | "fillOpacity": 0, 44 | "gradientMode": "none", 45 | "hideFrom": { 46 | "legend": false, 47 | "tooltip": false, 48 | "viz": false 49 | }, 50 | "insertNulls": false, 51 | "lineInterpolation": "linear", 52 | "lineWidth": 1, 53 | "pointSize": 5, 54 | "scaleDistribution": { 55 | "type": "linear" 56 | }, 57 | "showPoints": "auto", 58 | "spanNulls": false, 59 | "stacking": { 60 | "group": "A", 61 | "mode": "none" 62 | }, 63 | "thresholdsStyle": { 64 | "mode": "off" 65 | } 66 | }, 67 | "mappings": [], 68 | "thresholds": { 69 | "mode": "absolute", 70 | "steps": [ 71 | { 72 | "color": "green" 73 | }, 74 | { 75 | "color": "red", 76 | "value": 80 77 | } 78 | ] 79 | } 80 | }, 81 | "overrides": [] 82 | }, 83 | "gridPos": { 84 | "h": 8, 85 | "w": 12, 86 | "x": 0, 87 | "y": 0 88 | }, 89 | "id": 1, 90 | "options": { 91 | "legend": { 92 | "calcs": [], 93 | "displayMode": "list", 94 | "placement": "bottom", 95 | "showLegend": true 96 | }, 97 | "tooltip": { 98 | "hideZeros": false, 99 | "mode": "single", 100 | "sort": "none" 101 | } 102 | }, 103 | "pluginVersion": "12.0.2", 104 | "targets": [ 105 | { 106 | "datasource": { 107 | "type": "prometheus", 108 | "uid": "PB84469DE42D2E8C3" 109 | }, 110 | "editorMode": "code", 111 | "expr": "avalanche_metric_mmmmm_0_0", 112 | "instant": false, 113 | "legendFormat": "__auto", 114 | "range": true, 115 | "refId": "A" 116 | } 117 | ], 118 | "title": "New panel", 119 | "type": "timeseries" 120 | } 121 | ], 122 | "preload": false, 123 | "schemaVersion": 41, 124 | "tags": [], 125 | "templating": { 126 | "list": [] 127 | }, 128 | "time": { 129 | "from": "now-15m", 130 | "to": "now" 131 | }, 132 | "timepicker": {}, 133 | "timezone": "browser", 134 | "title": "Avalanche", 135 | "uid": "", 136 | "version": 1 137 | } 138 | -------------------------------------------------------------------------------- /src/kubernetes_service.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 
5 | """Library for kubernetes services."""
6 | 
7 | from typing import List, Tuple
8 | 
9 | from kubernetes import client, config
10 | from kubernetes.client import exceptions
11 | 
12 | 
13 | class PatchFailed(RuntimeError):
14 |     """Patching the kubernetes service failed."""
15 | 
16 | 
17 | class K8sServicePatch:
18 |     """A utility for patching the Kubernetes service set up by Juju.
19 | 
20 |     Attributes:
21 |         namespace_file (str): path to the k8s namespace file in the charm container
22 |     """
23 | 
24 |     namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
25 | 
26 |     @staticmethod
27 |     def namespace() -> str:
28 |         """Read the Kubernetes namespace we're deployed in from the mounted service token.
29 | 
30 |         Returns:
31 |             str: The current Kubernetes namespace
32 |         """
33 |         with open(K8sServicePatch.namespace_file, "r") as f:
34 |             return f.read().strip()
35 | 
36 |     @staticmethod
37 |     def _k8s_auth():
38 |         """Authenticate with the Kubernetes API using an in-cluster service token.
39 | 
40 |         Raises:
41 |             PatchFailed: if no permissions to read cluster role
42 |         """
43 |         # Authenticate against the Kubernetes API using a mounted ServiceAccount token
44 |         config.load_incluster_config()
45 |         # Test the service account we've got for sufficient perms
46 |         api = client.CoreV1Api(client.ApiClient())
47 | 
48 |         try:
49 |             api.list_namespaced_service(namespace=K8sServicePatch.namespace())
50 |         except exceptions.ApiException as e:
51 |             if e.status == 403:
52 |                 raise PatchFailed(
53 |                     "No permission to read cluster role. " "Run `juju trust` on this application."
54 |                 ) from e
55 |             raise e
56 | 
57 |     @staticmethod
58 |     def _k8s_service(app: str, service_ports: List[Tuple[str, int, int]]) -> client.V1Service:
59 |         """Return a valid Kubernetes Service representation for the given application.
60 | 
61 |         Args:
62 |             app: app name
63 |             service_ports: a list of tuples (name, port, target_port) for every service port.
64 | 
65 |         Returns:
66 |             kubernetes.client.V1Service: A Kubernetes Service with correctly annotated metadata and
67 |             ports
68 |         """
69 |         ports = [
70 |             client.V1ServicePort(name=port[0], port=port[1], target_port=port[2])
71 |             for port in service_ports
72 |         ]
73 | 
74 |         ns = K8sServicePatch.namespace()
75 |         return client.V1Service(
76 |             api_version="v1",
77 |             metadata=client.V1ObjectMeta(
78 |                 namespace=ns,
79 |                 name=app,
80 |                 labels={"app.kubernetes.io/name": app},
81 |             ),
82 |             spec=client.V1ServiceSpec(
83 |                 ports=ports,
84 |                 selector={"app.kubernetes.io/name": app},
85 |             ),
86 |         )
87 | 
88 |     @staticmethod
89 |     def set_ports(app: str, service_ports: List[Tuple[str, int, int]]):
90 |         """Patch the Kubernetes service created by Juju to map the correct port.
91 | 
92 |         Currently, Juju uses port 65535 for all endpoints. This can be observed via:
93 | 
94 |             kubectl describe services -n <namespace> | grep Port -C 2
95 | 
96 |         At runtime, Pebble knows which ports the workload binds, but it does not tell Juju
97 |         to update the Kubernetes Service definition, so this method patches the gap.
98 | 
99 |         Typical usage example from within charm code (e.g. on_install):
100 | 
101 |             service_ports = [("my-app-api", 9093, 9093), ("my-app-ha", 9094, 9094)]
102 |             K8sServicePatch.set_ports(self.app.name, service_ports)
103 | 
104 |         Args:
105 |             app: app name
106 |             service_ports: a list of tuples (name, port, target_port) for every service port.
107 | 
108 |         Raises:
109 |             PatchFailed: if patching fails.
110 | """ 111 | # First ensure we're authenticated with the Kubernetes API 112 | K8sServicePatch._k8s_auth() 113 | 114 | ns = K8sServicePatch.namespace() 115 | # Set up a Kubernetes client 116 | api = client.CoreV1Api(client.ApiClient()) 117 | try: 118 | # Delete the existing service so we can redefine with correct ports 119 | # I don't think you can issue a patch that *replaces* the existing ports, 120 | # only append 121 | api.delete_namespaced_service(name=app, namespace=ns) 122 | # Recreate the service with the correct ports for the application 123 | api.create_namespaced_service( 124 | namespace=ns, body=K8sServicePatch._k8s_service(app, service_ports) 125 | ) 126 | except exceptions.ApiException as e: 127 | raise PatchFailed(f"Failed to patch k8s service: {e}") from e 128 | -------------------------------------------------------------------------------- /lib/charms/observability_libs/v0/juju_topology.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | """## Overview. 4 | 5 | This document explains how to use the `JujuTopology` class to 6 | create and consume topology information from Juju in a consistent manner. 7 | 8 | The goal of the Juju topology is to uniquely identify a piece 9 | of software running across any of your Juju-managed deployments. 10 | This is achieved by combining the following four elements: 11 | 12 | - Model name 13 | - Model UUID 14 | - Application name 15 | - Unit identifier 16 | 17 | 18 | For a more in-depth description of the concept, as well as a 19 | walk-through of it's use-case in observability, see 20 | [this blog post](https://juju.is/blog/model-driven-observability-part-2-juju-topology-metrics) 21 | on the Juju blog. 22 | 23 | ## Library Usage 24 | 25 | This library may be used to create and consume `JujuTopology` objects. 26 | The `JujuTopology` class provides three ways to create instances: 27 | 28 | ### Using the `from_charm` method 29 | 30 | Enables instantiation by supplying the charm as an argument. When 31 | creating topology objects for the current charm, this is the recommended 32 | approach. 33 | 34 | ```python 35 | topology = JujuTopology.from_charm(self) 36 | ``` 37 | 38 | ### Using the `from_dict` method 39 | 40 | Allows for instantion using a dictionary of relation data, like the 41 | `scrape_metadata` from Prometheus or the labels of an alert rule. When 42 | creating topology objects for remote charms, this is the recommended 43 | approach. 44 | 45 | ```python 46 | scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) 47 | topology = JujuTopology.from_dict(scrape_metadata) 48 | ``` 49 | 50 | ### Using the class constructor 51 | 52 | Enables instantiation using whatever values you want. While this 53 | is useful in some very specific cases, this is almost certainly not 54 | what you are looking for as setting these values manually may 55 | result in observability metrics which do not uniquely identify a 56 | charm in order to provide accurate usage reporting, alerting, 57 | horizontal scaling, or other use cases. 
58 | 59 | ```python 60 | topology = JujuTopology( 61 | model="some-juju-model", 62 | model_uuid="00000000-0000-0000-0000-000000000001", 63 | application="fancy-juju-application", 64 | unit="fancy-juju-application/0", 65 | charm_name="fancy-juju-application-k8s", 66 | ) 67 | ``` 68 | 69 | """ 70 | 71 | import warnings 72 | from collections import OrderedDict 73 | from typing import Dict, List, Optional 74 | from uuid import UUID 75 | 76 | # The unique Charmhub library identifier, never change it 77 | LIBID = "bced1658f20f49d28b88f61f83c2d232" 78 | 79 | LIBAPI = 0 80 | LIBPATCH = 7 81 | 82 | 83 | class InvalidUUIDError(Exception): 84 | """Invalid UUID was provided.""" 85 | 86 | def __init__(self, uuid: str): 87 | self.message = "'{}' is not a valid UUID.".format(uuid) 88 | super().__init__(self.message) 89 | 90 | 91 | class JujuTopology: 92 | """JujuTopology is used for storing, generating and formatting juju topology information. 93 | 94 | DEPRECATED: This class is deprecated. Use `pip install cosl` and 95 | `from cosl.juju_topology import JujuTopology` instead. 96 | """ 97 | 98 | def __init__( 99 | self, 100 | model: str, 101 | model_uuid: str, 102 | application: str, 103 | unit: Optional[str] = None, 104 | charm_name: Optional[str] = None, 105 | ): 106 | """Build a JujuTopology object. 107 | 108 | A `JujuTopology` object is used for storing and transforming 109 | Juju topology information. This information is used to 110 | annotate Prometheus scrape jobs and alert rules. Such 111 | annotation when applied to scrape jobs helps in identifying 112 | the source of the scrapped metrics. On the other hand when 113 | applied to alert rules topology information ensures that 114 | evaluation of alert expressions is restricted to the source 115 | (charm) from which the alert rules were obtained. 116 | 117 | Args: 118 | model: a string name of the Juju model 119 | model_uuid: a globally unique string identifier for the Juju model 120 | application: an application name as a string 121 | unit: a unit name as a string 122 | charm_name: name of charm as a string 123 | """ 124 | warnings.warn( 125 | """ 126 | observability_libs.v0.juju_topology is deprecated. Please import the 127 | library from `cosl` instead: https://github.com/canonical/cos-lib 128 | """, 129 | DeprecationWarning, 130 | ) 131 | if not self.is_valid_uuid(model_uuid): 132 | raise InvalidUUIDError(model_uuid) 133 | 134 | self._model = model 135 | self._model_uuid = model_uuid 136 | self._application = application 137 | self._charm_name = charm_name 138 | self._unit = unit 139 | 140 | def is_valid_uuid(self, uuid): 141 | """Validate the supplied UUID against the Juju Model UUID pattern. 142 | 143 | Args: 144 | uuid: string that needs to be checked if it is valid v4 UUID. 145 | 146 | Returns: 147 | True if parameter is a valid v4 UUID, False otherwise. 148 | """ 149 | try: 150 | return str(UUID(uuid, version=4)) == uuid 151 | except (ValueError, TypeError): 152 | return False 153 | 154 | @classmethod 155 | def from_charm(cls, charm): 156 | """Creates a JujuTopology instance by using the model data available on a charm object. 157 | 158 | Args: 159 | charm: a `CharmBase` object for which the `JujuTopology` will be constructed 160 | Returns: 161 | a `JujuTopology` object. 
162 | """ 163 | return cls( 164 | model=charm.model.name, 165 | model_uuid=charm.model.uuid, 166 | application=charm.model.app.name, 167 | unit=charm.model.unit.name, 168 | charm_name=charm.meta.name, 169 | ) 170 | 171 | @classmethod 172 | def from_dict(cls, data: dict): 173 | """Factory method for creating `JujuTopology` children from a dictionary. 174 | 175 | Args: 176 | data: a dictionary with five keys providing topology information. The keys are 177 | - "model" 178 | - "model_uuid" 179 | - "application" 180 | - "unit" 181 | - "charm_name" 182 | `unit` and `charm_name` may be empty, but will result in more limited 183 | labels. However, this allows us to support charms without workloads. 184 | 185 | Returns: 186 | a `JujuTopology` object. 187 | """ 188 | return cls( 189 | model=data["model"], 190 | model_uuid=data["model_uuid"], 191 | application=data["application"], 192 | unit=data.get("unit", ""), 193 | charm_name=data.get("charm_name", ""), 194 | ) 195 | 196 | def as_dict( 197 | self, 198 | *, 199 | remapped_keys: Optional[Dict[str, str]] = None, 200 | excluded_keys: Optional[List[str]] = None, 201 | ) -> OrderedDict: 202 | """Format the topology information into an ordered dict. 203 | 204 | Keeping the dictionary ordered is important to be able to 205 | compare dicts without having to resort to deep comparisons. 206 | 207 | Args: 208 | remapped_keys: A dictionary mapping old key names to new key names, 209 | which will be substituted when invoked. 210 | excluded_keys: A list of key names to exclude from the returned dict. 211 | uuid_length: The length to crop the UUID to. 212 | """ 213 | ret = OrderedDict( 214 | [ 215 | ("model", self.model), 216 | ("model_uuid", self.model_uuid), 217 | ("application", self.application), 218 | ("unit", self.unit), 219 | ("charm_name", self.charm_name), 220 | ] 221 | ) 222 | if excluded_keys: 223 | ret = OrderedDict({k: v for k, v in ret.items() if k not in excluded_keys}) 224 | 225 | if remapped_keys: 226 | ret = OrderedDict( 227 | (remapped_keys.get(k), v) if remapped_keys.get(k) else (k, v) 228 | for k, v in ret.items() # type: ignore 229 | ) 230 | 231 | return ret 232 | 233 | @property 234 | def identifier(self) -> str: 235 | """Format the topology information into a terse string. 236 | 237 | This crops the model UUID, making it unsuitable for comparisons against 238 | anything but other identifiers. Mainly to be used as a display name or file 239 | name where long strings might become an issue. 240 | 241 | >>> JujuTopology( \ 242 | model = "a-model", \ 243 | model_uuid = "00000000-0000-4000-8000-000000000000", \ 244 | application = "some-app", \ 245 | unit = "some-app/1" \ 246 | ).identifier 247 | 'a-model_00000000_some-app' 248 | """ 249 | parts = self.as_dict( 250 | excluded_keys=["unit", "charm_name"], 251 | ) 252 | 253 | parts["model_uuid"] = self.model_uuid_short 254 | values = parts.values() 255 | 256 | return "_".join([str(val) for val in values]).replace("/", "_") 257 | 258 | @property 259 | def label_matcher_dict(self) -> Dict[str, str]: 260 | """Format the topology information into a dict with keys having 'juju_' as prefix. 261 | 262 | Relabelled topology never includes the unit as it would then only match 263 | the leader unit (ie. the unit that produced the dict). 
264 | """ 265 | items = self.as_dict( 266 | remapped_keys={"charm_name": "charm"}, 267 | excluded_keys=["unit"], 268 | ).items() 269 | 270 | return {"juju_{}".format(key): value for key, value in items if value} 271 | 272 | @property 273 | def label_matchers(self) -> str: 274 | """Format the topology information into a promql/logql label matcher string. 275 | 276 | Topology label matchers should never include the unit as it 277 | would then only match the leader unit (ie. the unit that 278 | produced the matchers). 279 | """ 280 | items = self.label_matcher_dict.items() 281 | return ", ".join(['{}="{}"'.format(key, value) for key, value in items if value]) 282 | 283 | @property 284 | def model(self) -> str: 285 | """Getter for the juju model value.""" 286 | return self._model 287 | 288 | @property 289 | def model_uuid(self) -> str: 290 | """Getter for the juju model uuid value.""" 291 | return self._model_uuid 292 | 293 | @property 294 | def model_uuid_short(self) -> str: 295 | """Getter for the juju model value, truncated to the first eight letters.""" 296 | return self._model_uuid[:8] 297 | 298 | @property 299 | def application(self) -> str: 300 | """Getter for the juju application value.""" 301 | return self._application 302 | 303 | @property 304 | def charm_name(self) -> Optional[str]: 305 | """Getter for the juju charm name value.""" 306 | return self._charm_name 307 | 308 | @property 309 | def unit(self) -> Optional[str]: 310 | """Getter for the juju unit value.""" 311 | return self._unit 312 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /src/charm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | """Deploy Avalanche to a Kubernetes environment.""" 6 | 7 | import hashlib 8 | import logging 9 | import socket 10 | from typing import Optional, cast 11 | 12 | from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider 13 | from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider 14 | from charms.prometheus_k8s.v1.prometheus_remote_write import ( 15 | PrometheusRemoteWriteConsumer, 16 | ) 17 | from ops import main 18 | from ops.charm import CharmBase 19 | from ops.framework import StoredState 20 | from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus 21 | from ops.pebble import Layer 22 | 23 | from kubernetes_service import K8sServicePatch, PatchFailed 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | def sha256(hashable) -> str: 29 | """Use instead of the builtin hash() for repeatable values.""" 30 | if isinstance(hashable, str): 31 | hashable = hashable.encode("utf-8") 32 | return hashlib.sha256(hashable).hexdigest() 33 | 34 | 35 | class AvalancheCharm(CharmBase): 36 | """A Juju charm for Avalanche.""" 37 | 38 | _container_name = "avalanche" # automatically determined from charm name 39 | _layer_name = "avalanche" # layer label argument for container.add_layer 40 | _service_name = "avalanche" # chosen arbitrarily to match charm name 41 | _peer_relation_name = "replicas" # must match metadata.yaml peer role name 42 | _port = 9001 # metrics endpoint 43 | 44 | _stored = StoredState() 45 | 46 | def __init__(self, *args): 47 | super().__init__(*args) 48 | self._stored.set_default(servers={}, config_hash=None) 49 | 50 | self.container = self.unit.get_container(self._container_name) 51 | self.unit.set_ports(self._port) 52 | 53 | self._forward_alert_rules = cast(bool, self.config["forward_alert_rules"]) 54 | 55 | self.metrics_endpoint = MetricsEndpointProvider( 56 | self, 57 | "metrics-endpoint", 58 | jobs=[ 59 | { 60 | "job_name": self.model.app.name, 61 | "metrics_path": "/metrics", 62 | "static_configs": [{"targets": [f"*:{self.port}"]}], 63 | "scrape_interval": "15s", # TODO: move to config.yaml 64 | "scrape_timeout": "10s", 65 | } 66 | ], 67 | forward_alert_rules=self._forward_alert_rules, 68 | refresh_event=[self.on.config_changed], 69 | external_url=socket.getfqdn(), 70 | ) 71 | 72 | self.remote_write_consumer = PrometheusRemoteWriteConsumer( 73 | self, 74 | forward_alert_rules=self._forward_alert_rules, 75 | refresh_event=[self.on.config_changed], 76 | ) 77 | self.framework.observe( 78 | self.remote_write_consumer.on.endpoints_changed, # pyright: ignore 79 | self._remote_write_endpoints_changed, 80 | ) 81 | 82 | self.grafana_dashboard_provider = GrafanaDashboardProvider(self) 83 | 84 | # Core lifecycle events 85 | self.framework.observe(self.on.install, self._on_install) 86 | self.framework.observe(self.on.config_changed, self._on_config_changed) 87 | self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm) 88 | self.framework.observe( 89 | self.on.avalanche_pebble_ready, 90 | self._on_pebble_ready, # pyright: ignore 91 | ) 92 | self.framework.observe(self.on.start, self._on_start) 93 | self.framework.observe(self.on.update_status, self._on_update_status) 94 | 95 | def _common_exit_hook(self) -> None: 96 | """Event processing hook that is common to all events to ensure idempotency.""" 97 | if not self.container.can_connect(): 98 | self.unit.status = MaintenanceStatus("Waiting for pod startup to complete") 99 | return 100 | 101 | # Update pebble layer 102 | layer_changed = self._update_layer() 103 | 
service_running = ( 104 | service := self.container.get_service(self._service_name) 105 | ) and service.is_running() 106 | if layer_changed or not service_running: 107 | if not self._restart_service(): 108 | self.unit.status = BlockedStatus("Service restart failed") 109 | return 110 | 111 | if version := self._avalanche_version: 112 | self.unit.set_workload_version(version) 113 | 114 | self.unit.status = ActiveStatus() 115 | 116 | @property 117 | def _avalanche_version(self) -> Optional[str]: 118 | if not self.container.can_connect(): 119 | return None 120 | version_output, _ = self.container.exec(["/bin/avalanche", "--version"], combine_stderr=True).wait_output() 121 | # Output looks like this: 122 | # 0.3 123 | return version_output.strip() 124 | 125 | def _update_layer(self) -> bool: 126 | """Update the service layer to reflect changes in config or remote-write endpoints. 127 | 128 | The new layer is compared against the current plan, and the container is 129 | replanned only if the layer actually changed, keeping this hook idempotent. 130 | 131 | Returns: 132 | True if anything changed; False otherwise 133 | """ 134 | overlay = self._layer() 135 | plan = self.container.get_plan() 136 | 137 | is_changed = False 138 | 139 | if self._service_name not in plan.services or overlay.services != plan.services: 140 | logger.debug( 141 | "Layer changed; command: %s", 142 | overlay.services[self._service_name].command, 143 | ) 144 | is_changed = True 145 | self.container.add_layer(self._layer_name, overlay, combine=True) 146 | self.container.replan() 147 | logger.debug( 148 | "New layer's command: %s", 149 | self.container.get_plan().services.get(self._service_name).command, # pyright: ignore 150 | ) 151 | else: 152 | logger.debug("Layer unchanged") 153 | 154 | return is_changed 155 | 156 | @property 157 | def port(self): 158 | """Return the default Avalanche port.""" 159 | return self._port 160 | 161 | def _layer(self) -> Layer: 162 | """Returns the Pebble configuration layer for Avalanche.""" 163 | 164 | def _command() -> str: 165 | if endpoints := self.remote_write_consumer.endpoints: 166 | # remote-write mode TODO error out / block if both relations present 167 | # avalanche cli args support only one remote write target; take the first one 168 | logger.debug( 169 | "Going into remote write mode; remote write endpoints: %s", 170 | self.remote_write_consumer.endpoints, 171 | ) 172 | 173 | endpoint = endpoints[0]["url"] 174 | # TODO offer remote-write-interval as config option 175 | mode_args = f"--remote-url={endpoint} --remote-write-interval=15s" 176 | else: 177 | # scraped mode 178 | logger.debug("Going into scraped mode (no remote write endpoints)") 179 | 180 | mode_args = f"--port={self.port}" 181 | 182 | return " ".join( 183 | [ 184 | "/bin/avalanche", 185 | f"--metric-count={self.config['metric_count']}", 186 | f"--label-count={self.config['label_count']}", 187 | f"--series-count={self.config['series_count']}", 188 | f"--metricname-length={self.config['metricname_length']}", 189 | f"--labelname-length={self.config['labelname_length']}", 190 | f"--value-interval={self.config['value_interval']}", 191 | f"--series-interval={self.config['series_interval']}", 192 | f"--metric-interval={self.config['metric_interval']}", 193 | mode_args, 194 | ] 195 | ) 196 | 197 | return Layer( 198 | { 199 | "summary": "avalanche layer", 200 | "description": "pebble config layer for avalanche", 201 | "services": { 202 | self._service_name: { 203 | "override": "replace", 204 | "summary": "avalanche service", 205 | "startup": "enabled", 206 | "command": _command(), 207 | }, 208 | }, 209
| } 210 | ) 211 | 212 | def _on_install(self, _): 213 | """Event handler for the `install` event during which we will update the K8s service.""" 214 | self._patch_k8s_service() 215 | 216 | def _on_upgrade_charm(self, _): 217 | """Event handler for the upgrade event during which we will update the K8s service.""" 218 | # Ensure that older deployments of Avalanche run the logic to patch the K8s service 219 | self._patch_k8s_service() 220 | 221 | # After an upgrade (refresh), the unit IP address is not guaranteed to remain the same, and 222 | # the config may need updating. Call the common hook to update. 223 | self._common_exit_hook() 224 | 225 | def _patch_k8s_service(self): 226 | """Fix the Kubernetes service that was set up by Juju with correct port numbers.""" 227 | if self.unit.is_leader(): 228 | service_ports = [ 229 | (f"{self.app.name}", self._port, self._port), 230 | ] 231 | try: 232 | K8sServicePatch.set_ports(self.app.name, service_ports) 233 | except PatchFailed as e: 234 | logger.error("Unable to patch the Kubernetes service: %s", str(e)) 235 | else: 236 | logger.debug("Successfully patched the Kubernetes service") 237 | 238 | def _on_pebble_ready(self, _): 239 | """Event handler for PebbleReadyEvent.""" 240 | self._common_exit_hook() 241 | 242 | def _on_start(self, _): 243 | """Event handler for StartEvent. 244 | 245 | With Juju 2.9.5 we encountered a scenario in which pebble_ready and config_changed fired, 246 | but the IP address was not available and the status was stuck on "Waiting for IP address". 247 | Adding this hook reduces the likelihood of that scenario. 248 | """ 249 | self._common_exit_hook() 250 | 251 | def _on_config_changed(self, _): 252 | """Event handler for ConfigChangedEvent.""" 253 | self._common_exit_hook() 254 | 255 | def _on_alertmanager_config_changed(self, _): 256 | """Event handler for :class:`AvalancheAlertmanagerConfigChanged`.""" 257 | self._common_exit_hook() 258 | 259 | def _restart_service(self) -> bool: 260 | """Helper function for restarting the underlying service.""" 261 | logger.info("Restarting service %s", self._service_name) 262 | 263 | if not self.container.can_connect(): 264 | logger.error("Cannot (re)start service: container is not ready.") 265 | return False 266 | 267 | # Check if service exists, to avoid ModelError from being raised when the service does 268 | # not yet exist 269 | if not self.container.get_services().get(self._service_name): 270 | logger.error("Cannot (re)start service: service does not (yet) exist.") 271 | return False 272 | 273 | self.container.restart(self._service_name) 274 | 275 | return True 276 | 277 | def _on_update_status(self, _): 278 | """Event handler for UpdateStatusEvent. 279 | 280 | Currently a no-op; reserved for logging the list of peers, uptime and version info. 281 | """ 282 | pass 283 | 284 | def _remote_write_endpoints_changed(self, _): 285 | """Event handler for remote write endpoints_changed.""" 286 | self._common_exit_hook() 287 | 288 | 289 | if __name__ == "__main__": 290 | main(AvalancheCharm, use_juju_for_storage=True) 291 | -------------------------------------------------------------------------------- /lib/charms/prometheus_k8s/v1/prometheus_remote_write.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | """# Prometheus remote-write library. 4 | 5 | This library facilitates the integration of the prometheus_remote_write interface. 
6 | 7 | Source code can be found on GitHub at: 8 | https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s 9 | 10 | Charms that need to push data to a charm exposing the Prometheus remote_write API 11 | should use the `PrometheusRemoteWriteConsumer`. Charms that operate software exposing 12 | the Prometheus remote_write API, that is, software that can receive metrics data over 13 | remote_write, should use the `PrometheusRemoteWriteProvider`. 14 | """ 15 | 16 | import copy 17 | import json 18 | import logging 19 | import os 20 | import platform 21 | import re 22 | import socket 23 | import subprocess 24 | import tempfile 25 | from pathlib import Path 26 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union 27 | 28 | import yaml 29 | from cosl import JujuTopology 30 | from cosl.rules import AlertRules, generic_alert_groups 31 | from ops.charm import ( 32 | CharmBase, 33 | HookEvent, 34 | RelationBrokenEvent, 35 | RelationEvent, 36 | RelationMeta, 37 | RelationRole, 38 | ) 39 | from ops.framework import BoundEvent, EventBase, EventSource, Object, ObjectEvents 40 | from ops.model import Relation 41 | 42 | # The unique Charmhub library identifier, never change it 43 | LIBID = "f783823fa75f4b7880eb70f2077ec259" 44 | 45 | # Increment this major API version when introducing breaking changes 46 | LIBAPI = 1 47 | 48 | # Increment this PATCH version before using `charmcraft publish-lib` or reset 49 | # to 0 if you are raising the major API version 50 | LIBPATCH = 10 51 | 52 | PYDEPS = ["cosl"] 53 | 54 | 55 | logger = logging.getLogger(__name__) 56 | 57 | 58 | DEFAULT_RELATION_NAME = "receive-remote-write" 59 | DEFAULT_CONSUMER_NAME = "send-remote-write" 60 | RELATION_INTERFACE_NAME = "prometheus_remote_write" 61 | 62 | DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" 63 | 64 | 65 | class RelationNotFoundError(Exception): 66 | """Raised if there is no relation with the given name.""" 67 | 68 | def __init__(self, relation_name: str): 69 | self.relation_name = relation_name 70 | self.message = "No relation named '{}' found".format(relation_name) 71 | 72 | super().__init__(self.message) 73 | 74 | 75 | class RelationInterfaceMismatchError(Exception): 76 | """Raised if the relation with the given name has a different interface.""" 77 | 78 | def __init__( 79 | self, 80 | relation_name: str, 81 | expected_relation_interface: str, 82 | actual_relation_interface: str, 83 | ): 84 | self.relation_name = relation_name 85 | self.expected_relation_interface = expected_relation_interface 86 | self.actual_relation_interface = actual_relation_interface 87 | self.message = ( 88 | "The '{}' relation has '{}' as its interface rather than the expected '{}'".format( 89 | relation_name, actual_relation_interface, expected_relation_interface 90 | ) 91 | ) 92 | 93 | super().__init__(self.message) 94 | 95 | 96 | class RelationRoleMismatchError(Exception): 97 | """Raised if the relation with the given name has a different direction.""" 98 | 99 | def __init__( 100 | self, 101 | relation_name: str, 102 | expected_relation_role: RelationRole, 103 | actual_relation_role: RelationRole, 104 | ): 105 | self.relation_name = relation_name 106 | self.expected_relation_role = expected_relation_role 107 | self.actual_relation_role = actual_relation_role 108 | self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( 109 | relation_name, repr(actual_relation_role), repr(expected_relation_role) 110 | ) 111 | 112 | super().__init__(self.message)
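# Illustrative example (not part of the upstream library): these validation
# errors surface at charm construction time. A hypothetical charm whose
# metadata.yaml declares the relation with the wrong interface,
#
#     requires:
#       send-remote-write:
#         interface: http  # should be: prometheus_remote_write
#
# would raise RelationInterfaceMismatchError as soon as it instantiates
# PrometheusRemoteWriteConsumer(self), while omitting the relation stanza
# entirely would raise RelationNotFoundError instead.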
113 | 114 | 115 | class InvalidAlertRuleEvent(EventBase): 116 | """Event emitted when alert rule files are not parsable. 117 | 118 | Enables us to set a clear status on the provider. 119 | """ 120 | 121 | def __init__(self, handle, errors: str = "", valid: bool = False): 122 | super().__init__(handle) 123 | self.errors = errors 124 | self.valid = valid 125 | 126 | def snapshot(self) -> Dict: 127 | """Save alert rule information.""" 128 | return { 129 | "valid": self.valid, 130 | "errors": self.errors, 131 | } 132 | 133 | def restore(self, snapshot): 134 | """Restore alert rule information.""" 135 | self.valid = snapshot["valid"] 136 | self.errors = snapshot["errors"] 137 | 138 | 139 | def _is_official_alert_rule_format(rules_dict: dict) -> bool: 140 | """Are alert rules in the upstream format as supported by Prometheus. 141 | 142 | Alert rules in dictionary format are in "official" form if they 143 | contain a "groups" key, since this implies they contain a list of 144 | alert rule groups. 145 | 146 | Args: 147 | rules_dict: a set of alert rules in Python dictionary format 148 | 149 | Returns: 150 | True if alert rules are in official Prometheus file format. 151 | """ 152 | return "groups" in rules_dict 153 | 154 | 155 | def _is_single_alert_rule_format(rules_dict: dict) -> bool: 156 | """Are alert rules in single rule format. 157 | 158 | The Prometheus charm library supports reading of alert rules in a 159 | custom format that consists of a single alert rule per file. This 160 | does not conform to the official Prometheus alert rule file format 161 | which requires that each alert rules file consists of a list of 162 | alert rule groups and each group consists of a list of alert 163 | rules. 164 | 165 | Alert rules in dictionary form are considered to be in single rule 166 | format if, at a minimum, they contain the two keys corresponding to 167 | the alert rule name and the alert expression. 168 | 169 | Returns: 170 | True if alert rule is in single rule file format. 171 | """ 172 | # one alert rule per file 173 | return set(rules_dict) >= {"alert", "expr"} 174 | 175 | 176 | def _validate_relation_by_interface_and_direction( 177 | charm: CharmBase, 178 | relation_name: str, 179 | expected_relation_interface: str, 180 | expected_relation_role: RelationRole, 181 | ): 182 | """Verifies that a relation has the necessary characteristics. 183 | 184 | Verifies that the `relation_name` provided: (1) exists in metadata.yaml, 185 | (2) declares as interface the interface name passed as `expected_relation_interface`, 186 | and (3) has the right "direction", i.e., it is a relation that `charm` 187 | provides or requires. 188 | 189 | Args: 190 | charm: a `CharmBase` object to scan for the matching relation. 191 | relation_name: the name of the relation to be verified. 192 | expected_relation_interface: the interface name to be matched by the 193 | relation named `relation_name`. 194 | expected_relation_role: whether the `relation_name` must be either 195 | provided or required by `charm`. 196 | 197 | Raises: 198 | RelationNotFoundError: If there is no relation in the charm's metadata.yaml 199 | with the same name as provided via `relation_name` argument. 200 | RelationInterfaceMismatchError: The relation with the same name as provided 201 | via `relation_name` argument does not have the same relation interface 202 | as specified via the `expected_relation_interface` argument. 
203 | RelationRoleMismatchError: If the relation with the same name as provided 204 | via `relation_name` argument does not have the same role as specified 205 | via the `expected_relation_role` argument. 206 | """ 207 | if relation_name not in charm.meta.relations: 208 | raise RelationNotFoundError(relation_name) 209 | 210 | relation: RelationMeta = charm.meta.relations[relation_name] 211 | 212 | actual_relation_interface = relation.interface_name 213 | if actual_relation_interface != expected_relation_interface: 214 | raise RelationInterfaceMismatchError( 215 | relation_name, expected_relation_interface, actual_relation_interface or "None" 216 | ) 217 | 218 | if expected_relation_role == RelationRole.provides: 219 | if relation_name not in charm.meta.provides: 220 | raise RelationRoleMismatchError( 221 | relation_name, RelationRole.provides, RelationRole.requires 222 | ) 223 | elif expected_relation_role == RelationRole.requires: 224 | if relation_name not in charm.meta.requires: 225 | raise RelationRoleMismatchError( 226 | relation_name, RelationRole.requires, RelationRole.provides 227 | ) 228 | else: 229 | raise Exception("Unexpected RelationRole: {}".format(expected_relation_role)) 230 | 231 | 232 | class PrometheusRemoteWriteEndpointsChangedEvent(EventBase): 233 | """Event emitted when Prometheus remote_write endpoints change.""" 234 | 235 | def __init__(self, handle, relation_id): 236 | super().__init__(handle) 237 | self.relation_id = relation_id 238 | 239 | def snapshot(self): 240 | """Save scrape Prometheus remote_write information.""" 241 | return {"relation_id": self.relation_id} 242 | 243 | def restore(self, snapshot): 244 | """Restore scrape Prometheus remote_write information.""" 245 | self.relation_id = snapshot["relation_id"] 246 | 247 | 248 | class InvalidAlertRulePathError(Exception): 249 | """Raised if the alert rules folder cannot be found or is otherwise invalid.""" 250 | 251 | def __init__( 252 | self, 253 | alert_rules_absolute_path: str, 254 | message: str, 255 | ): 256 | self.alert_rules_absolute_path = alert_rules_absolute_path 257 | self.message = message 258 | 259 | super().__init__(self.message) 260 | 261 | 262 | def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str: 263 | """Resolve the provided path items against the directory of the main file. 264 | 265 | Look up the directory of the main .py file being executed. This is normally 266 | going to be the charm.py file of the charm including this library. Then, resolve 267 | the provided path elements and, if the resulting path exists and is a directory, 268 | return its absolute path; otherwise, raise `InvalidAlertRulePathError`.
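For example (hypothetical paths), resolving the default relative path
"./src/prometheus_alert_rules" against a charm whose main file is
/var/lib/juju/agents/unit-avalanche-0/charm/src/charm.py yields
/var/lib/juju/agents/unit-avalanche-0/charm/src/prometheus_alert_rules,
provided that directory exists.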
269 | """ 270 | charm_dir = Path(str(charm.charm_dir)) 271 | if not charm_dir.exists() or not charm_dir.is_dir(): 272 | # Operator Framework does not currently expose a robust 273 | # way to determine the top level charm source directory 274 | # that is consistent across deployed charms and unit tests 275 | # Hence for unit tests the current working directory is used 276 | # TODO: updated this logic when the following ticket is resolved 277 | # https://github.com/canonical/operator/issues/643 278 | charm_dir = Path(os.getcwd()) 279 | 280 | alerts_dir_path = charm_dir.absolute().joinpath(*path_elements) 281 | 282 | if not alerts_dir_path.exists(): 283 | raise InvalidAlertRulePathError(str(alerts_dir_path), "directory does not exist") 284 | if not alerts_dir_path.is_dir(): 285 | raise InvalidAlertRulePathError(str(alerts_dir_path), "is not a directory") 286 | 287 | return str(alerts_dir_path) 288 | 289 | 290 | class PrometheusRemoteWriteConsumerEvents(ObjectEvents): 291 | """Event descriptor for events raised by `PrometheusRemoteWriteConsumer`.""" 292 | 293 | endpoints_changed = EventSource(PrometheusRemoteWriteEndpointsChangedEvent) 294 | alert_rule_status_changed = EventSource(InvalidAlertRuleEvent) 295 | 296 | 297 | class PrometheusRemoteWriteConsumer(Object): 298 | """API that manages a required `prometheus_remote_write` relation. 299 | 300 | The `PrometheusRemoteWriteConsumer` is intended to be used by charms that need to push data to 301 | other charms over the Prometheus remote_write API. 302 | 303 | The `PrometheusRemoteWriteConsumer` object can be instantiated as follows in your charm: 304 | 305 | ``` 306 | from charms.prometheus_k8s.v1.prometheus_remote_write import PrometheusRemoteWriteConsumer 307 | 308 | def __init__(self, *args): 309 | ... 310 | self.remote_write_consumer = PrometheusRemoteWriteConsumer(self) 311 | ... 312 | ``` 313 | 314 | The `PrometheusRemoteWriteConsumer` assumes that, in the `metadata.yaml` of your charm, 315 | you declare a required relation as follows: 316 | 317 | ``` 318 | requires: 319 | send-remote-write: # Relation name 320 | interface: prometheus_remote_write # Relation interface 321 | ``` 322 | 323 | The charmed operator is expected to use the `PrometheusRemoteWriteConsumer` as follows: 324 | 325 | ``` 326 | def __init__(self, *args): 327 | ... 328 | self.remote_write_consumer = PrometheusRemoteWriteConsumer(self) 329 | ... 330 | 331 | self.framework.observe( 332 | self.remote_write_consumer.on.endpoints_changed, 333 | self._handle_endpoints_changed, 334 | ) 335 | ``` 336 | The `endpoints_changed` event will fire in situations such as provider ip change (e.g. 337 | relation created, provider upgrade, provider pod churn) or provider config change (e.g. 338 | metadata settings). 339 | 340 | Then, inside the logic of `_handle_endpoints_changed`, the updated endpoint list is 341 | retrieved with: 342 | 343 | ``` 344 | self.remote_write_consumer.endpoints 345 | ``` 346 | 347 | which returns a dictionary structured like the Prometheus configuration object (see 348 | https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). 349 | 350 | Regarding the default relation name, `send-remote-write`: if you choose to change it, 351 | you would need to explicitly provide it to the `PrometheusRemoteWriteConsumer` via the 352 | `relation_name` constructor argument. 
(The relation interface, on the other hand, is 353 | fixed and, if you were to change it, your charm would not be able to relate with other 354 | charms using the correct relation interface. The library prevents you from doing that by 355 | raising an exception.) 356 | 357 | In any case, it is strongly discouraged to change the relation name: having consistent 358 | relation names across charms that do similar things is good practice and more 359 | straightforward for the users of your charm. The one exception to the rule above 360 | is if your charm needs to both consume and provide a relation using the 361 | `prometheus_remote_write` interface, in which case changing the relation name to 362 | differentiate between "incoming" and "outgoing" remote write interactions is necessary. 363 | 364 | It is also possible to specify alert rules. By default, this library will search 365 | `./src/prometheus_alert_rules`, which in standard charm 366 | layouts resolves to `src/prometheus_alert_rules` under the charm root. Each set of alert rules, grouped 367 | by the topology identifier, goes into a separate `*.rule` file. 368 | 369 | If the syntax of a rule is invalid, the `PrometheusRemoteWriteConsumer` logs an error and 370 | does not load the particular rule. 371 | 372 | To avoid false positives and false negatives, the library will inject label filters 373 | automatically in the PromQL expression. For example, if the charm provides an 374 | alert rule with an `expr` like this one: 375 | 376 | ```yaml 377 | expr: up < 1 378 | ``` 379 | 380 | it will be modified with label filters ensuring that 381 | the only timeseries evaluated are those scraped from this charm, and no others. 382 | 383 | 384 | ```yaml 385 | expr: up{juju_application="traefik", 386 | juju_charm="traefik-k8s", 387 | juju_model="cos", 388 | juju_model_uuid="b5ed878d-2671-42e8-873a-e8d58c0ec325" 389 | } < 1 390 | labels: 391 | juju_application: traefik 392 | juju_charm: traefik-k8s 393 | juju_model: cos 394 | juju_model_uuid: b5ed878d-2671-42e8-873a-e8d58c0ec325 395 | ``` 396 | """ 397 | 398 | on = PrometheusRemoteWriteConsumerEvents() # pyright: ignore 399 | 400 | def __init__( 401 | self, 402 | charm: CharmBase, 403 | relation_name: str = DEFAULT_CONSUMER_NAME, 404 | alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, 405 | refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None, 406 | *, 407 | forward_alert_rules: bool = True, 408 | extra_alert_labels: Optional[Dict] = None, 409 | ): 410 | """API to manage a required relation with the `prometheus_remote_write` interface. 411 | 412 | Args: 413 | charm: The charm object that instantiated this class. 414 | relation_name: Name of the relation with the `prometheus_remote_write` interface as 415 | defined in metadata.yaml. 416 | alert_rules_path: Path of the directory containing the alert rules. 417 | refresh_event: an optional bound event or list of bound events which 418 | will be observed to re-set alerts data. 419 | forward_alert_rules: Flag to toggle forwarding of charmed alert rules. 420 | extra_alert_labels: Optional dict of extra labels to inject into alert rules. 421 | 422 | Raises: 423 | RelationNotFoundError: If there is no relation in the charm's metadata.yaml 424 | with the same name as provided via `relation_name` argument. 425 | RelationInterfaceMismatchError: The relation with the same name as provided 426 | via `relation_name` argument does not have the `prometheus_remote_write` relation 427 | interface. 
428 | RelationRoleMismatchError: If the relation with the same name as provided 429 | via `relation_name` argument does not have the `RelationRole.requires` 430 | role. 431 | """ 432 | _validate_relation_by_interface_and_direction( 433 | charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires 434 | ) 435 | 436 | try: 437 | alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path) 438 | except InvalidAlertRulePathError as e: 439 | logger.debug( 440 | "Invalid Prometheus alert rules folder at %s: %s", 441 | e.alert_rules_absolute_path, 442 | e.message, 443 | ) 444 | 445 | super().__init__(charm, relation_name) 446 | self._charm = charm 447 | self._relation_name = relation_name 448 | self._alert_rules_path = alert_rules_path 449 | self._forward_alert_rules = forward_alert_rules 450 | self._extra_alert_labels = extra_alert_labels or {} 451 | 452 | self.topology = JujuTopology.from_charm(charm) 453 | 454 | on_relation = self._charm.on[self._relation_name] 455 | 456 | self.framework.observe(on_relation.relation_joined, self._handle_endpoints_changed) 457 | self.framework.observe(on_relation.relation_changed, self._handle_endpoints_changed) 458 | self.framework.observe(on_relation.relation_departed, self._handle_endpoints_changed) 459 | self.framework.observe(on_relation.relation_broken, self._on_relation_broken) 460 | self.framework.observe(on_relation.relation_joined, self._push_alerts_on_relation_joined) 461 | self.framework.observe( 462 | self._charm.on.leader_elected, self._push_alerts_to_all_relation_databags 463 | ) 464 | self.framework.observe( 465 | self._charm.on.upgrade_charm, self._push_alerts_to_all_relation_databags 466 | ) 467 | if refresh_event: 468 | if not isinstance(refresh_event, list): 469 | refresh_event = [refresh_event] 470 | for ev in refresh_event: 471 | self.framework.observe(ev, self._push_alerts_to_all_relation_databags) 472 | 473 | def _on_relation_broken(self, event: RelationBrokenEvent) -> None: 474 | self.on.endpoints_changed.emit(relation_id=event.relation.id) 475 | 476 | def _handle_endpoints_changed(self, event: RelationEvent) -> None: 477 | if self._charm.unit.is_leader() and event.app is not None: 478 | ev = json.loads(event.relation.data[event.app].get("event", "{}")) 479 | 480 | if ev: 481 | valid = bool(ev.get("valid", True)) 482 | errors = ev.get("errors", "") 483 | 484 | if valid and not errors: 485 | self.on.alert_rule_status_changed.emit(valid=valid) 486 | else: 487 | self.on.alert_rule_status_changed.emit(valid=valid, errors=errors) 488 | 489 | self.on.endpoints_changed.emit(relation_id=event.relation.id) 490 | 491 | def _push_alerts_on_relation_joined(self, event: RelationEvent) -> None: 492 | self._push_alerts_to_relation_databag(event.relation) 493 | 494 | def _push_alerts_to_all_relation_databags(self, _: Optional[HookEvent]) -> None: 495 | for relation in self.model.relations[self._relation_name]: 496 | self._push_alerts_to_relation_databag(relation) 497 | 498 | def _push_alerts_to_relation_databag(self, relation: Relation) -> None: 499 | if not self._charm.unit.is_leader(): 500 | return 501 | 502 | alert_rules = AlertRules(query_type="promql", topology=self.topology) 503 | if self._forward_alert_rules: 504 | alert_rules.add_path(self._alert_rules_path) 505 | alert_rules.add( 506 | generic_alert_groups.aggregator_rules, group_name_prefix=self.topology.identifier 507 | ) 508 | 509 | alert_rules_as_dict = alert_rules.as_dict() 510 | 511 | if self._extra_alert_labels: 512 | alert_rules_as_dict = ( 513 | 
PrometheusRemoteWriteConsumer._inject_extra_labels_to_alert_rules( 514 | alert_rules_as_dict, self._extra_alert_labels 515 | ) 516 | ) 517 | 518 | relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) 519 | 520 | def reload_alerts(self) -> None: 521 | """Reload alert rules from disk and push to relation data.""" 522 | self._push_alerts_to_all_relation_databags(None) 523 | 524 | @staticmethod 525 | def _inject_extra_labels_to_alert_rules(rules: Dict, extra_alert_labels: Dict) -> Dict: 526 | """Return a copy of the rules dict with extra labels injected.""" 527 | result = copy.deepcopy(rules) 528 | for group in result.get("groups", []): 529 | for rule in group.get("rules", []): 530 | rule.setdefault("labels", {}).update(extra_alert_labels) 531 | return result 532 | 533 | @property 534 | def endpoints(self) -> List[Dict[str, str]]: 535 | """Remote-write config objects ready to be dropped into a Prometheus config file. 536 | 537 | The endpoints are deduplicated. 538 | 539 | The format of the dict is specified in the official prometheus docs: 540 | https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write 541 | 542 | Returns: 543 | A list of dictionaries where each dictionary provides information about 544 | a single remote_write endpoint. 545 | """ 546 | endpoints = [] 547 | for relation in self.model.relations[self._relation_name]: 548 | for unit in relation.units: 549 | if unit.app is self._charm.app: 550 | # This is a peer unit 551 | continue 552 | if not (unit_databag := relation.data.get(unit)): 553 | continue 554 | if not (remote_write := unit_databag.get("remote_write")): 555 | continue 556 | 557 | deserialized_remote_write = json.loads(remote_write) 558 | endpoints.append( 559 | { 560 | "url": deserialized_remote_write["url"], 561 | } 562 | ) 563 | 564 | # When multiple units of the remote-write server are behind an ingress 565 | # (e.g. mimir), relation data would end up with the same ingress url 566 | # for all units. 567 | # Deduplicate the endpoints by converting each dict to a tuple of 568 | # dict.items(), throwing them into a set, and then converting them 569 | # back to dictionaries 570 | deduplicated_endpoints = [dict(t) for t in {tuple(d.items()) for d in endpoints}] 571 | return deduplicated_endpoints 572 | 573 | 574 | class PrometheusRemoteWriteAlertsChangedEvent(EventBase): 575 | """Event emitted when Prometheus remote_write alerts change.""" 576 | 577 | def __init__(self, handle, relation_id): 578 | super().__init__(handle) 579 | self.relation_id = relation_id 580 | 581 | def snapshot(self): 582 | """Save Prometheus remote_write information.""" 583 | return {"relation_id": self.relation_id} 584 | 585 | def restore(self, snapshot): 586 | """Restore Prometheus remote_write information.""" 587 | self.relation_id = snapshot["relation_id"] 588 | 589 | 590 | class PrometheusRemoteWriteProviderConsumersChangedEvent(EventBase): 591 | """Event emitted when the set of remote-write consumers changes.""" 592 | 593 | 594 | class PrometheusRemoteWriteProviderEvents(ObjectEvents): 595 | """Event descriptor for events raised by `PrometheusRemoteWriteProvider`.""" 596 | 597 | alert_rules_changed = EventSource(PrometheusRemoteWriteAlertsChangedEvent) 598 | consumers_changed = EventSource(PrometheusRemoteWriteProviderConsumersChangedEvent) 599 | 600 | 601 | class PrometheusRemoteWriteProvider(Object): 602 | """API that manages a provided `prometheus_remote_write` relation. 
603 | 604 | The `PrometheusRemoteWriteProvider` is intended to be used by charms whose workloads need 605 | to receive data from other charms' workloads over the Prometheus remote_write API. 606 | 607 | The `PrometheusRemoteWriteProvider` object can be instantiated as follows in your charm: 608 | 609 | ``` 610 | from charms.prometheus_k8s.v1.prometheus_remote_write import PrometheusRemoteWriteProvider 611 | 612 | def __init__(self, *args): 613 | ... 614 | self.remote_write_provider = PrometheusRemoteWriteProvider(self) 615 | ... 616 | ``` 617 | 618 | The `PrometheusRemoteWriteProvider` assumes that, in the `metadata.yaml` of your charm, 619 | you declare a provided relation as follows: 620 | 621 | ``` 622 | provides: 623 | receive-remote-write: # Relation name 624 | interface: prometheus_remote_write # Relation interface 625 | ``` 626 | 627 | About the name of the relation managed by this library: technically, you *could* change 628 | the relation name, `receive-remote-write`, but that requires you to provide the new 629 | relation name to the `PrometheusRemoteWriteProvider` via the `relation_name` constructor 630 | argument. (The relation interface, on the other hand, is immutable and, if you were to change 631 | it, your charm would not be able to relate with other charms using the right relation 632 | interface. The library prevents you from doing that by raising an exception.) In any case, it 633 | is strongly discouraged to change the relation name: having consistent relation names across 634 | charms that do similar things is a very good thing for the people who will use your charm. 635 | The one exception to the rule above is if your charm needs to both consume and provide a 636 | relation using the `prometheus_remote_write` interface, in which case changing the relation 637 | name to differentiate between "incoming" and "outgoing" remote write interactions is necessary. 638 | """ 639 | 640 | on = PrometheusRemoteWriteProviderEvents() # pyright: ignore 641 | 642 | def __init__( 643 | self, 644 | charm: CharmBase, 645 | relation_name: str = DEFAULT_RELATION_NAME, 646 | *, 647 | server_url_func: Callable[[], str] = lambda: f"http://{socket.getfqdn()}:9090", 648 | endpoint_path: str = "/api/v1/write", 649 | ): 650 | """API to manage a provided relation with the `prometheus_remote_write` interface. 651 | 652 | Args: 653 | charm: The charm object that instantiated this class. 654 | relation_name: Name of the relation with the `prometheus_remote_write` interface as 655 | defined in metadata.yaml. 656 | server_url_func: A callable returning the URL for your prometheus server. 657 | endpoint_path: The path of the server's remote_write endpoint. 658 | 659 | Raises: 660 | RelationNotFoundError: If there is no relation in the charm's metadata.yaml 661 | with the same name as provided via `relation_name` argument. 662 | RelationInterfaceMismatchError: The relation with the same name as provided 663 | via `relation_name` argument does not have the `prometheus_remote_write` relation 664 | interface. 665 | RelationRoleMismatchError: If the relation with the same name as provided 666 | via `relation_name` argument does not have the `RelationRole.provides` 667 | role. 
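For example (illustrative only; `self._external_url` is a hypothetical
attribute holding an ingress URL), a charm fronted by an ingress could
advertise that URL instead of the pod FQDN:

```
self.remote_write_provider = PrometheusRemoteWriteProvider(
    self,
    server_url_func=lambda: self._external_url,
)
```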
668 | """ 669 | _validate_relation_by_interface_and_direction( 670 | charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides 671 | ) 672 | 673 | super().__init__(charm, relation_name) 674 | self._charm = charm 675 | self._tool = CosTool(self._charm) 676 | self._relation_name = relation_name 677 | self._get_server_url = server_url_func 678 | self._endpoint_path = endpoint_path 679 | 680 | on_relation = self._charm.on[self._relation_name] 681 | self.framework.observe( 682 | on_relation.relation_created, 683 | self._on_consumers_changed, 684 | ) 685 | self.framework.observe( 686 | on_relation.relation_joined, 687 | self._on_consumers_changed, 688 | ) 689 | self.framework.observe( 690 | on_relation.relation_changed, 691 | self._on_relation_changed, 692 | ) 693 | 694 | def _on_consumers_changed(self, event: RelationEvent) -> None: 695 | if not isinstance(event, RelationBrokenEvent): 696 | self.update_endpoint(event.relation) 697 | self.on.consumers_changed.emit() 698 | 699 | def _on_relation_changed(self, event: RelationEvent) -> None: 700 | """Flag Providers that data has changed, so they can re-read alerts.""" 701 | self.on.alert_rules_changed.emit(event.relation.id) 702 | 703 | def update_endpoint(self, relation: Optional[Relation] = None) -> None: 704 | """Triggers programmatically the update of the relation data. 705 | 706 | This method should be used when the charm relying on this library needs 707 | to update the relation data in response to something occurring outside 708 | the `prometheus_remote_write` relation lifecycle, e.g., in case of a 709 | host address change because the charmed operator becomes connected to an 710 | Ingress after the `prometheus_remote_write` relation is established. 711 | 712 | Args: 713 | relation: An optional instance of `class:ops.model.Relation` to update. 714 | If not provided, all instances of the `prometheus_remote_write` 715 | relation are updated. 716 | """ 717 | relations = [relation] if relation else self.model.relations[self._relation_name] 718 | 719 | for relation in relations: 720 | self._set_endpoint_on_relation(relation) 721 | 722 | def _set_endpoint_on_relation(self, relation: Relation) -> None: 723 | """Set the remote_write endpoint on relations. 724 | 725 | Args: 726 | relation: The relation whose data to update. 727 | """ 728 | relation.data[self._charm.unit]["remote_write"] = json.dumps( 729 | { 730 | "url": self._get_server_url().rstrip("/") + "/" + self._endpoint_path.strip("/"), 731 | } 732 | ) 733 | 734 | @property 735 | def alerts(self) -> dict: 736 | """Fetch alert rules from all relations. 737 | 738 | A Prometheus alert rules file consists of a list of "groups". Each 739 | group consists of a list of alerts (`rules`) that are sequentially 740 | executed. This method returns all the alert rules provided by each 741 | related metrics provider charm. These rules may be used to generate a 742 | separate alert rules file for each relation since the returned list 743 | of alert groups are indexed by relation ID. Also, for each relation ID 744 | associated scrape metadata such as Juju model, UUID and application 745 | name are provided so the unique name may be generated for the rules 746 | file. 
For each relation the structure of data returned is a dictionary 747 | with four keys 748 | 749 | - groups 750 | - model 751 | - model_uuid 752 | - application 753 | 754 | The value of the `groups` key is such that it may be used to generate 755 | a Prometheus alert rules file directly using `yaml.dump` but the 756 | `groups` key itself must be included as this is required by Prometheus, 757 | for example as in `yaml.safe_dump({"groups": alerts["groups"]})`. 758 | 759 | The `PrometheusRemoteWriteProvider` accepts a list of rules and these 760 | rules are all placed into one group. 761 | 762 | Returns: 763 | a dictionary mapping the name of an alert rule group to the group. 764 | """ 765 | alerts = {} # type: Dict[str, dict] # mapping b/w juju identifiers and alert rule files 766 | for relation in self._charm.model.relations[self._relation_name]: 767 | if not relation.units or not relation.app: 768 | continue 769 | 770 | alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}")) 771 | if not alert_rules: 772 | continue 773 | 774 | alert_rules = self._inject_alert_expr_labels(alert_rules) 775 | 776 | identifier, topology = self._get_identifier_by_alert_rules(alert_rules) 777 | if not topology: 778 | try: 779 | scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) 780 | identifier = JujuTopology.from_dict(scrape_metadata).identifier 781 | alerts[identifier] = self._tool.apply_label_matchers(alert_rules) # type: ignore 782 | 783 | except KeyError as e: 784 | logger.debug( 785 | "Relation %s has no 'scrape_metadata': %s", 786 | relation.id, 787 | e, 788 | ) 789 | 790 | if not identifier: 791 | logger.error( 792 | "Alert rules were found but no usable group or identifier was present." 793 | ) 794 | continue 795 | 796 | _, errmsg = self._tool.validate_alert_rules(alert_rules) 797 | if errmsg: 798 | logger.error(f"Invalid alert rule file: {errmsg}") 799 | if self._charm.unit.is_leader(): 800 | data = json.loads(relation.data[self._charm.app].get("event", "{}")) 801 | data["errors"] = errmsg 802 | relation.data[self._charm.app]["event"] = json.dumps(data) 803 | continue 804 | 805 | alerts[identifier] = alert_rules 806 | 807 | return alerts 808 | 809 | def _get_identifier_by_alert_rules( 810 | self, rules: Dict[str, Any] 811 | ) -> Tuple[Union[str, None], Union[JujuTopology, None]]: 812 | """Determine an appropriate dict key for alert rules. 813 | 814 | The key is used as the filename when writing alerts to disk, so the structure 815 | and uniqueness is important. 816 | 817 | Args: 818 | rules: a dict of alert rules 819 | Returns: 820 | A tuple containing an identifier, if found, and a JujuTopology, if it could 821 | be constructed. 822 | """ 823 | if "groups" not in rules: 824 | logger.debug("No alert groups were found in relation data") 825 | return None, None 826 | 827 | # Construct an ID based on what's in the alert rules if they have labels 828 | for group in rules["groups"]: 829 | try: 830 | labels = group["rules"][0]["labels"] 831 | topology = JujuTopology( 832 | # Don't try to safely get required constructor fields. 
There's already 833 | # a handler for KeyErrors 834 | model_uuid=labels["juju_model_uuid"], 835 | model=labels["juju_model"], 836 | application=labels["juju_application"], 837 | unit=labels.get("juju_unit", ""), 838 | charm_name=labels.get("juju_charm", ""), 839 | ) 840 | return topology.identifier, topology 841 | except KeyError: 842 | logger.debug("Alert rules were found but no usable labels were present") 843 | continue 844 | 845 | logger.warning( 846 | "No labeled alert rules were found, and no 'scrape_metadata' " 847 | "was available. Using the alert group name as filename." 848 | ) 849 | try: 850 | for group in rules["groups"]: 851 | return group["name"], None 852 | except KeyError: 853 | logger.debug("No group name was found to use as identifier") 854 | 855 | return None, None 856 | 857 | def _inject_alert_expr_labels(self, rules: Dict[str, Any]) -> Dict[str, Any]: 858 | """Iterate through alert rules and inject topology into expressions. 859 | 860 | Args: 861 | rules: a dict of alert rules 862 | """ 863 | if "groups" not in rules: 864 | return rules 865 | 866 | modified_groups = [] 867 | for group in rules["groups"]: 868 | # Copy off rules, so we don't modify an object we're iterating over 869 | rules_copy = group["rules"] 870 | for idx, rule in enumerate(rules_copy): 871 | labels = rule.get("labels") 872 | 873 | if labels: 874 | try: 875 | topology = JujuTopology( 876 | # Don't try to safely get required constructor fields. There's already 877 | # a handler for KeyErrors 878 | model_uuid=labels["juju_model_uuid"], 879 | model=labels["juju_model"], 880 | application=labels["juju_application"], 881 | unit=labels.get("juju_unit", ""), 882 | charm_name=labels.get("juju_charm", ""), 883 | ) 884 | 885 | # Inject topology and put it back in the list 886 | rule["expr"] = self._tool.inject_label_matchers( 887 | re.sub(r"%%juju_topology%%,?", "", rule["expr"]), 888 | topology.alert_expression_dict, 889 | ) 890 | except KeyError: 891 | # Some required JujuTopology key is missing. Just move on. 
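# (e.g. hand-written rules that carry only a severity label); the rule is
# kept with its original expression rather than dropped.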
892 | pass 893 | 894 | group["rules"][idx] = rule 895 | 896 | modified_groups.append(group) 897 | 898 | rules["groups"] = modified_groups 899 | return rules 900 | 901 | 902 | # Copy/pasted from prometheus_scrape.py 903 | class CosTool: 904 | """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" 905 | 906 | _path = None 907 | _disabled = False 908 | 909 | def __init__(self, charm): 910 | self._charm = charm 911 | 912 | @property 913 | def path(self): 914 | """Lazy lookup of the path of cos-tool.""" 915 | if self._disabled: 916 | return None 917 | if not self._path: 918 | self._path = self._get_tool_path() 919 | if not self._path: 920 | logger.debug("Skipping injection of juju topology as label matchers") 921 | self._disabled = True 922 | return self._path 923 | 924 | def apply_label_matchers(self, rules) -> dict: 925 | """Will apply label matchers to the expression of all alerts in all supplied groups.""" 926 | if not self.path: 927 | return rules 928 | for group in rules["groups"]: 929 | rules_in_group = group.get("rules", []) 930 | for rule in rules_in_group: 931 | topology = {} 932 | # if the user for some reason has provided juju_unit, we'll need to honor it 933 | # in most cases, however, this will be empty 934 | for label in [ 935 | "juju_model", 936 | "juju_model_uuid", 937 | "juju_application", 938 | "juju_charm", 939 | "juju_unit", 940 | ]: 941 | if label in rule["labels"]: 942 | topology[label] = rule["labels"][label] 943 | 944 | rule["expr"] = self.inject_label_matchers(rule["expr"], topology) 945 | return rules 946 | 947 | def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: 948 | """Will validate correctness of alert rules, returning a boolean and any errors.""" 949 | if not self.path: 950 | logger.debug("`cos-tool` unavailable. Not validating alert correctness.") 951 | return True, "" 952 | 953 | with tempfile.TemporaryDirectory() as tmpdir: 954 | rule_path = Path(tmpdir + "/validate_rule.yaml") 955 | rule_path.write_text(yaml.dump(rules)) 956 | 957 | args = [str(self.path), "validate", str(rule_path)] 958 | # noinspection PyBroadException 959 | try: 960 | self._exec(args) 961 | return True, "" 962 | except subprocess.CalledProcessError as e: 963 | logger.debug("Validating the rules failed: %s", e.output) 964 | return False, ", ".join( 965 | [ 966 | line 967 | for line in e.output.decode("utf8").splitlines() 968 | if "error validating" in line 969 | ] 970 | ) 971 | 972 | def inject_label_matchers(self, expression, topology) -> str: 973 | """Add label matchers to an expression.""" 974 | if not topology: 975 | return expression 976 | if not self.path: 977 | logger.debug("`cos-tool` unavailable. 
Leaving expression unchanged: %s", expression) 978 | return expression 979 | args = [str(self.path), "transform"] 980 | args.extend( 981 | ["--label-matcher={}={}".format(key, value) for key, value in topology.items()] 982 | ) 983 | 984 | args.extend(["{}".format(expression)]) 985 | # noinspection PyBroadException 986 | try: 987 | return self._exec(args) 988 | except subprocess.CalledProcessError as e: 989 | logger.debug('Applying the expression failed: "%s", falling back to the original', e) 990 | return expression 991 | 992 | def _get_tool_path(self) -> Optional[Path]: 993 | arch = platform.machine() 994 | arch = "amd64" if arch == "x86_64" else arch 995 | res = "cos-tool-{}".format(arch) 996 | try: 997 | path = Path(res).resolve(strict=True) 998 | return path 999 | except (FileNotFoundError, OSError): 1000 | logger.debug('Could not locate cos-tool at: "{}"'.format(res)) 1001 | return None 1002 | 1003 | def _exec(self, cmd) -> str: 1004 | result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 1005 | return result.stdout.decode("utf-8").strip() 1006 | -------------------------------------------------------------------------------- /lib/charms/prometheus_k8s/v0/prometheus_scrape.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | """Prometheus Scrape Library. 4 | 5 | ## Overview 6 | 7 | This document explains how to integrate with the Prometheus charm 8 | for the purpose of providing a metrics endpoint to Prometheus. It 9 | also explains how alternative implementations of the Prometheus charms 10 | may maintain the same interface and be backward compatible with all 11 | currently integrated charms. Finally, this document is the 12 | authoritative reference on the structure of relation data that is 13 | shared between Prometheus charms and any other charm that intends to 14 | provide a scrape target for Prometheus. 15 | 16 | ## Source code 17 | 18 | Source code can be found on GitHub at: 19 | https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s 20 | 21 | ## Provider Library Usage 22 | 23 | This Prometheus charm interacts with its scrape targets using its 24 | charm library. Charms seeking to expose metric endpoints for the 25 | Prometheus charm must do so using the `MetricsEndpointProvider` 26 | object from this charm library. For the simplest use cases, using the 27 | `MetricsEndpointProvider` object only requires instantiating it, 28 | typically in the constructor of your charm (the one which exposes a 29 | metrics endpoint). The `MetricsEndpointProvider` constructor requires 30 | the name of the relation over which a scrape target (metrics endpoint) 31 | is exposed to the Prometheus charm. This relation must use the 32 | `prometheus_scrape` interface. By default, the address of the metrics 33 | endpoint is set to the unit IP address by each unit of the 34 | `MetricsEndpointProvider` charm. These units set their address in 35 | response to the `PebbleReady` event of each container in the unit, 36 | since container restarts of Kubernetes charms can result in a change of 37 | IP addresses. The default name for the metrics endpoint relation is 38 | `metrics-endpoint`. It is strongly recommended to use the same 39 | relation name for consistency across charms; doing so also obviates the 40 | need for an additional constructor argument. 
The
41 | `MetricsEndpointProvider` object may be instantiated as follows
42 | 
43 |     from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider
44 | 
45 |     def __init__(self, *args):
46 |         super().__init__(*args)
47 |         ...
48 |         self.metrics_endpoint = MetricsEndpointProvider(self)
49 |         ...
50 | 
51 | Note that the first argument (`self`) to `MetricsEndpointProvider` is
52 | always a reference to the parent (scrape target) charm.
53 | 
54 | An instantiated `MetricsEndpointProvider` object will ensure that each
55 | unit of its parent charm is a scrape target for the
56 | `MetricsEndpointConsumer` (Prometheus) charm. By default
57 | `MetricsEndpointProvider` assumes each unit of the consumer charm
58 | exports its metrics at a path given by `/metrics` on port 80. These
59 | defaults may be changed by providing the `MetricsEndpointProvider`
60 | constructor an optional argument (`jobs`) that represents a
61 | Prometheus scrape job specification using Python standard data
62 | structures. This job specification is a subset of Prometheus' own
63 | [scrape
64 | configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config)
65 | format but represented using Python data structures. More than one job
66 | may be provided using the `jobs` argument. Hence `jobs` accepts a list
67 | of dictionaries where each dictionary represents one `<scrape_config>`
68 | object as described in the Prometheus documentation. The currently
69 | supported configuration subset is: `job_name`, `metrics_path`,
70 | `static_configs`.
71 | 
72 | Suppose it is required to change the port on which scraped metrics are
73 | exposed to 8000. This may be done by providing the following data
74 | structure as the value of `jobs`.
75 | 
76 | ```
77 | [
78 |     {
79 |         "static_configs": [
80 |             {
81 |                 "targets": ["*:8000"]
82 |             }
83 |         ]
84 |     }
85 | ]
86 | ```
87 | 
88 | The wildcard ("*") host specification implies that the scrape targets
89 | will automatically be set to the host addresses advertised by each
90 | unit of the consumer charm.
91 | 
92 | It is also possible to change the metrics path and scrape multiple
93 | ports, for example
94 | 
95 | ```
96 | [
97 |     {
98 |         "metrics_path": "/my-metrics-path",
99 |         "static_configs": [
100 |             {
101 |                 "targets": ["*:8000", "*:8081"],
102 |             }
103 |         ]
104 |     }
105 | ]
106 | ```
107 | 
108 | More complex scrape configurations are possible. For example
109 | 
110 | ```
111 | [
112 |     {
113 |         "static_configs": [
114 |             {
115 |                 "targets": ["10.1.32.215:7000", "*:8000"],
116 |                 "labels": {
117 |                     "some_key": "some-value"
118 |                 }
119 |             }
120 |         ]
121 |     }
122 | ]
123 | ```
124 | 
125 | This example scrapes the target "10.1.32.215" at port 7000 in addition
126 | to scraping each unit at port 8000. There is, however, one difference
127 | between wildcard targets (specified using "*") and fully qualified
128 | targets (such as "10.1.32.215"). The Prometheus charm automatically
129 | associates labels with metrics generated by each target. These labels
130 | localise the source of metrics within the Juju topology by specifying
131 | its "model name", "model UUID", "application name" and "unit
132 | name". However, the unit name is associated only with wildcard targets,
133 | not with fully qualified targets.
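
Assuming your charm keeps a reference to the provider (the attribute name
`metrics_endpoint` below is illustrative rather than required), a mixed
configuration like the previous example is passed directly to the
constructor:

```
self.metrics_endpoint = MetricsEndpointProvider(
    self,
    jobs=[
        {
            "static_configs": [
                {
                    "targets": ["10.1.32.215:7000", "*:8000"],
                    "labels": {"some_key": "some-value"},
                }
            ]
        }
    ],
)
```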
134 | 
135 | Multiple jobs with different metrics paths and labels are allowed, but
136 | each job must be given a unique name:
137 | 
138 | ```
139 | [
140 |     {
141 |         "job_name": "my-first-job",
142 |         "metrics_path": "one-path",
143 |         "static_configs": [
144 |             {
145 |                 "targets": ["*:7000"],
146 |                 "labels": {
147 |                     "some_key": "some-value"
148 |                 }
149 |             }
150 |         ]
151 |     },
152 |     {
153 |         "job_name": "my-second-job",
154 |         "metrics_path": "another-path",
155 |         "static_configs": [
156 |             {
157 |                 "targets": ["*:8000"],
158 |                 "labels": {
159 |                     "some_other_key": "some-other-value"
160 |                 }
161 |             }
162 |         ]
163 |     }
164 | ]
165 | ```
166 | 
167 | **Important:** `job_name` should be a fixed string (e.g. a hardcoded literal).
168 | For instance, if you include variable elements, like your `unit.name`, it may break
169 | the continuity of the metrics time series gathered by Prometheus when the leader unit
170 | changes (e.g. on upgrade or rescale).
171 | 
172 | Additionally, it is also technically possible, but **strongly discouraged**, to
173 | configure the following scrape-related settings, which behave as described by the
174 | [Prometheus documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config):
175 | 
176 | - `static_configs`
177 | - `scrape_interval`
178 | - `scrape_timeout`
179 | - `proxy_url`
180 | - `relabel_configs`
181 | - `metric_relabel_configs`
182 | - `sample_limit`
183 | - `label_limit`
184 | - `label_name_length_limit`
185 | - `label_value_length_limit`
186 | 
187 | The settings above are supported by the `prometheus_scrape` library only for the sake of
188 | specialized facilities like the [Prometheus Scrape Config](https://charmhub.io/prometheus-scrape-config-k8s)
189 | charm. Virtually no charms should use these settings, and charmers definitely **should not**
190 | expose them to the Juju administrator via configuration options.
191 | 
192 | ## Consumer Library Usage
193 | 
194 | The `MetricsEndpointConsumer` object may be used by Prometheus
195 | charms to manage relations with their scrape targets. For this
196 | purpose, a Prometheus charm needs to do two things
197 | 
198 | 1. Instantiate the `MetricsEndpointConsumer` object by providing it a
199 | reference to the parent (Prometheus) charm and optionally the name of
200 | the relation that the Prometheus charm uses to interact with scrape
201 | targets. This relation must conform to the `prometheus_scrape`
202 | interface, and it is strongly recommended that this relation be named
203 | `metrics-endpoint`, which is its default value.
204 | 
205 | For example a Prometheus charm may instantiate the
206 | `MetricsEndpointConsumer` in its constructor as follows
207 | 
208 |     from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer
209 | 
210 |     def __init__(self, *args):
211 |         super().__init__(*args)
212 |         ...
213 |         self.metrics_consumer = MetricsEndpointConsumer(self)
214 |         ...
215 | 
216 | 2. A Prometheus charm also needs to respond to the
217 | `TargetsChangedEvent` event of the `MetricsEndpointConsumer` by adding itself as
218 | an observer for these events, as in
219 | 
220 |     self.framework.observe(
221 |         self.metrics_consumer.on.targets_changed,
222 |         self._on_scrape_targets_changed,
223 |     )
224 | 
225 | In responding to the `TargetsChangedEvent` event, the Prometheus
226 | charm must update the Prometheus configuration so that any new scrape
227 | targets are added and/or old ones removed from the list of scraped
228 | endpoints.
For this purpose, the `MetricsEndpointConsumer` object
229 | exposes a `jobs()` method that returns a list of scrape jobs. Each
230 | element of this list is the Prometheus scrape configuration for that
231 | job. In order to update the Prometheus configuration, the Prometheus
232 | charm needs to replace the current list of jobs with the list provided
233 | by `jobs()` as follows
234 | 
235 |     def _on_scrape_targets_changed(self, event):
236 |         ...
237 |         scrape_jobs = self.metrics_consumer.jobs()
238 |         for job in scrape_jobs:
239 |             prometheus_scrape_config.append(job)
240 |         ...
241 | 
242 | ## Alerting Rules
243 | 
244 | This charm library also supports gathering alerting rules from all
245 | related `MetricsEndpointProvider` charms and enabling corresponding alerts within the
246 | Prometheus charm. Alert rules are automatically gathered by `MetricsEndpointProvider`
247 | charms when using this library, from a directory conventionally named
248 | `prometheus_alert_rules`. This directory must reside at the top level
249 | in the `src` folder of the consumer charm. Each file in this directory
250 | is assumed to be in one of two formats:
251 | - the official Prometheus alert rule format, conforming to the
252 | [Prometheus docs](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
253 | - a single rule format, which is a simplified subset of the official format,
254 | comprising a single alert rule per file, using the same YAML fields.
255 | 
256 | The file name must have one of the following extensions:
257 | - `.rule`
258 | - `.rules`
259 | - `.yml`
260 | - `.yaml`
261 | 
262 | An example of the contents of such a file in the custom single rule
263 | format is shown below.
264 | 
265 | ```
266 | alert: HighRequestLatency
267 | expr: job:request_latency_seconds:mean5m{my_key=my_value} > 0.5
268 | for: 10m
269 | labels:
270 |   severity: Medium
271 |   type: HighLatency
272 | annotations:
273 |   summary: High request latency for {{ $labels.instance }}.
274 | ```
275 | 
276 | The `MetricsEndpointProvider` will read all available alert rules and
277 | also inject "filtering labels" into the alert expressions. The
278 | filtering labels ensure that alert rules are localised to the metrics
279 | provider charm's Juju topology (application, model and its UUID). Such
280 | a topology filter is essential to ensure that alert rules submitted by
281 | one provider charm generate alerts only for that same charm. When
282 | alert rules are embedded in a charm, and the charm is deployed as a
283 | Juju application, the alert rules from that application have their
284 | expressions automatically updated to filter for metrics coming from
285 | the units of that application alone. This removes the risk of spurious
286 | evaluation, e.g., when you have multiple deployments of the same charm
287 | monitored by the same Prometheus.
288 | 
289 | Not all alerts one may want to specify can be embedded in a
290 | charm. Some alert rules will be specific to a user's use case. This is
291 | the case, for example, for alert rules based on business
292 | constraints, like expecting a certain amount of requests to a specific
293 | API every five minutes. Such alert rules can be specified via the
294 | [COS Config Charm](https://charmhub.io/cos-configuration-k8s),
295 | which allows importing alert rules and other settings like dashboards
296 | from a Git repository.
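
For reference, the single-rule example above could equivalently be shipped
in the official Prometheus "groups" format; a minimal sketch (the group
name is arbitrary):

```
groups:
- name: HighRequestLatencyGroup
  rules:
  - alert: HighRequestLatency
    expr: job:request_latency_seconds:mean5m{my_key=my_value} > 0.5
    for: 10m
    labels:
      severity: Medium
```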
297 | 298 | Gathering alert rules and generating rule files within the Prometheus 299 | charm is easily done using the `alerts()` method of 300 | `MetricsEndpointConsumer`. Alerts generated by Prometheus will 301 | automatically include Juju topology labels in the alerts. These labels 302 | indicate the source of the alert. The following labels are 303 | automatically included with each alert 304 | 305 | - `juju_model` 306 | - `juju_model_uuid` 307 | - `juju_application` 308 | 309 | ## Relation Data 310 | 311 | The Prometheus charm uses both application and unit relation data to 312 | obtain information regarding its scrape jobs, alert rules and scrape 313 | targets. This relation data is in JSON format and it closely resembles 314 | the YAML structure of Prometheus [scrape configuration] 315 | (https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config). 316 | 317 | Units of Metrics provider charms advertise their names and addresses 318 | over unit relation data using the `prometheus_scrape_unit_name` and 319 | `prometheus_scrape_unit_address` keys. While the `scrape_metadata`, 320 | `scrape_jobs` and `alert_rules` keys in application relation data 321 | of Metrics provider charms hold eponymous information. 322 | 323 | """ # noqa: W505 324 | 325 | import copy 326 | import hashlib 327 | import ipaddress 328 | import json 329 | import logging 330 | import os 331 | import platform 332 | import re 333 | import socket 334 | import subprocess 335 | import tempfile 336 | from collections import defaultdict 337 | from pathlib import Path 338 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union 339 | from urllib.parse import urlparse 340 | 341 | import yaml 342 | from cosl import JujuTopology 343 | from cosl.rules import AlertRules, generic_alert_groups 344 | from ops.charm import CharmBase, RelationRole 345 | from ops.framework import ( 346 | BoundEvent, 347 | EventBase, 348 | EventSource, 349 | Object, 350 | ObjectEvents, 351 | StoredDict, 352 | StoredList, 353 | ) 354 | from ops.model import Relation 355 | 356 | # The unique Charmhub library identifier, never change it 357 | LIBID = "bc84295fef5f4049878f07b131968ee2" 358 | 359 | # Increment this major API version when introducing breaking changes 360 | LIBAPI = 0 361 | 362 | # Increment this PATCH version before using `charmcraft publish-lib` or reset 363 | # to 0 if you are raising the major API version 364 | LIBPATCH = 56 365 | 366 | # Version 0.0.53 needed for cosl.rules.generic_alert_groups 367 | PYDEPS = ["cosl>=0.0.53"] 368 | 369 | logger = logging.getLogger(__name__) 370 | 371 | 372 | ALLOWED_KEYS = { 373 | "job_name", 374 | "metrics_path", 375 | "static_configs", 376 | "scrape_interval", 377 | "scrape_timeout", 378 | "proxy_url", 379 | "relabel_configs", 380 | "metric_relabel_configs", 381 | "sample_limit", 382 | "label_limit", 383 | "label_name_length_limit", 384 | "label_value_length_limit", 385 | "scheme", 386 | "basic_auth", 387 | "tls_config", 388 | "authorization", 389 | "params", 390 | } 391 | DEFAULT_JOB = { 392 | "metrics_path": "/metrics", 393 | "static_configs": [{"targets": ["*:80"]}], 394 | } 395 | 396 | 397 | DEFAULT_RELATION_NAME = "metrics-endpoint" 398 | RELATION_INTERFACE_NAME = "prometheus_scrape" 399 | 400 | DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" 401 | 402 | 403 | class PrometheusConfig: 404 | """A namespace for utility functions for manipulating the prometheus config dict.""" 405 | 406 | # relabel instance labels so that instance 
identifiers are globally unique 407 | # stable over unit recreation 408 | topology_relabel_config = { 409 | "source_labels": ["juju_model", "juju_model_uuid", "juju_application"], 410 | "separator": "_", 411 | "target_label": "instance", 412 | "regex": "(.*)", 413 | } 414 | 415 | topology_relabel_config_wildcard = { 416 | "source_labels": ["juju_model", "juju_model_uuid", "juju_application", "juju_unit"], 417 | "separator": "_", 418 | "target_label": "instance", 419 | "regex": "(.*)", 420 | } 421 | 422 | @staticmethod 423 | def sanitize_scrape_config(job: dict) -> dict: 424 | """Restrict permissible scrape configuration options. 425 | 426 | If job is empty then a default job is returned. The 427 | default job is 428 | 429 | ``` 430 | { 431 | "metrics_path": "/metrics", 432 | "static_configs": [{"targets": ["*:80"]}], 433 | } 434 | ``` 435 | 436 | Args: 437 | job: a dict containing a single Prometheus job 438 | specification. 439 | 440 | Returns: 441 | a dictionary containing a sanitized job specification. 442 | """ 443 | sanitized_job = DEFAULT_JOB.copy() 444 | sanitized_job.update({key: value for key, value in job.items() if key in ALLOWED_KEYS}) 445 | return sanitized_job 446 | 447 | @staticmethod 448 | def sanitize_scrape_configs(scrape_configs: List[dict]) -> List[dict]: 449 | """A vectorized version of `sanitize_scrape_config`.""" 450 | return [PrometheusConfig.sanitize_scrape_config(job) for job in scrape_configs] 451 | 452 | @staticmethod 453 | def prefix_job_names(scrape_configs: List[dict], prefix: str) -> List[dict]: 454 | """Adds the given prefix to all the job names in the given scrape_configs list.""" 455 | modified_scrape_configs = [] 456 | for scrape_config in scrape_configs: 457 | job_name = scrape_config.get("job_name") 458 | modified = scrape_config.copy() 459 | modified["job_name"] = prefix + "_" + job_name if job_name else prefix 460 | modified_scrape_configs.append(modified) 461 | 462 | return modified_scrape_configs 463 | 464 | @staticmethod 465 | def expand_wildcard_targets_into_individual_jobs( 466 | scrape_jobs: List[dict], 467 | hosts: Dict[str, Tuple[str, str]], 468 | topology: Optional[JujuTopology] = None, 469 | ) -> List[dict]: 470 | """Extract wildcard hosts from the given scrape_configs list into separate jobs. 471 | 472 | Args: 473 | scrape_jobs: list of scrape jobs. 474 | hosts: a dictionary mapping host names to host address for 475 | all units of the relation for which this job configuration 476 | must be constructed. 477 | topology: optional arg for adding topology labels to scrape targets. 478 | """ 479 | # hosts = self._relation_hosts(relation) 480 | 481 | modified_scrape_jobs = [] 482 | for job in scrape_jobs: 483 | static_configs = job.get("static_configs") 484 | if not static_configs: 485 | continue 486 | 487 | # When a single unit specified more than one wildcard target, then they are expanded 488 | # into a static_config per target 489 | non_wildcard_static_configs = [] 490 | 491 | for static_config in static_configs: 492 | targets = static_config.get("targets") 493 | if not targets: 494 | continue 495 | 496 | # All non-wildcard targets remain in the same static_config 497 | non_wildcard_targets = [] 498 | 499 | # All wildcard targets are extracted to a job per unit. If multiple wildcard 500 | # targets are specified, they remain in the same static_config (per unit). 501 | wildcard_targets = [] 502 | 503 | for target in targets: 504 | match = re.compile(r"\*(?:(:\d+))?").match(target) 505 | if match: 506 | # This is a wildcard target. 
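                        # (The regex above matches targets starting with "*",
                        # optionally followed by ":<port>", e.g. "*:8000".)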
507 | # Need to expand into separate jobs and remove it from this job here 508 | wildcard_targets.append(target) 509 | else: 510 | # This is not a wildcard target. Copy it over into its own static_config. 511 | non_wildcard_targets.append(target) 512 | 513 | # All non-wildcard targets remain in the same static_config 514 | if non_wildcard_targets: 515 | non_wildcard_static_config = static_config.copy() 516 | non_wildcard_static_config["targets"] = non_wildcard_targets 517 | 518 | if topology: 519 | # When non-wildcard targets (aka fully qualified hostnames) are specified, 520 | # there is no reliable way to determine the name (Juju topology unit name) 521 | # for such a target. Therefore labeling with Juju topology, excluding the 522 | # unit name. 523 | non_wildcard_static_config["labels"] = { 524 | **topology.label_matcher_dict, 525 | **non_wildcard_static_config.get("labels", {}), 526 | } 527 | 528 | non_wildcard_static_configs.append(non_wildcard_static_config) 529 | 530 | # Extract wildcard targets into individual jobs 531 | if wildcard_targets: 532 | for unit_name, (unit_hostname, unit_path) in hosts.items(): 533 | modified_job = job.copy() 534 | modified_job["static_configs"] = [static_config.copy()] 535 | modified_static_config = modified_job["static_configs"][0] 536 | modified_static_config["targets"] = [ 537 | target.replace("*", unit_hostname) for target in wildcard_targets 538 | ] 539 | 540 | unit_num = unit_name.split("/")[-1] 541 | job_name = modified_job.get("job_name", "unnamed-job") + "-" + unit_num 542 | modified_job["job_name"] = job_name 543 | modified_job["metrics_path"] = unit_path + ( 544 | job.get("metrics_path") or "/metrics" 545 | ) 546 | 547 | if topology: 548 | # Add topology labels 549 | modified_static_config["labels"] = { 550 | **topology.label_matcher_dict, 551 | **{"juju_unit": unit_name}, 552 | **modified_static_config.get("labels", {}), 553 | } 554 | 555 | # Instance relabeling for topology should be last in order. 556 | modified_job["relabel_configs"] = modified_job.get( 557 | "relabel_configs", [] 558 | ) + [PrometheusConfig.topology_relabel_config_wildcard] 559 | 560 | modified_scrape_jobs.append(modified_job) 561 | 562 | if non_wildcard_static_configs: 563 | modified_job = job.copy() 564 | modified_job["static_configs"] = non_wildcard_static_configs 565 | modified_job["metrics_path"] = modified_job.get("metrics_path") or "/metrics" 566 | 567 | if topology: 568 | # Instance relabeling for topology should be last in order. 569 | modified_job["relabel_configs"] = modified_job.get("relabel_configs", []) + [ 570 | PrometheusConfig.topology_relabel_config 571 | ] 572 | 573 | modified_scrape_jobs.append(modified_job) 574 | 575 | return modified_scrape_jobs 576 | 577 | @staticmethod 578 | def render_alertmanager_static_configs(alertmanagers: List[str]): 579 | """Render the alertmanager static_configs section from a list of URLs. 580 | 581 | Each target must be in the hostname:port format, and prefixes are specified in a separate 582 | key. Therefore, with ingress in place, would need to extract the path into the 583 | `path_prefix` key, which is higher up in the config hierarchy. 584 | 585 | https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config 586 | 587 | Args: 588 | alertmanagers: List of alertmanager URLs. 589 | 590 | Returns: 591 | A dict representation for the static_configs section. 592 | """ 593 | # Make sure it's a valid url so urlparse could parse it. 
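        # e.g. a bare "am.example:9093" becomes "http://am.example:9093"; without
        # a scheme, urlparse would read "am.example" as the scheme, not the host.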
594 | scheme = re.compile(r"^https?://") 595 | sanitized = [am if scheme.search(am) else "http://" + am for am in alertmanagers] 596 | 597 | # Create a mapping from paths to netlocs 598 | # Group alertmanager targets into a dictionary of lists: 599 | # {path: [netloc1, netloc2]} 600 | paths = defaultdict(list) # type: Dict[Tuple[str, str], List[str]] 601 | for parsed in map(urlparse, sanitized): 602 | path = parsed.path or "/" 603 | paths[(parsed.scheme, path)].append(parsed.netloc) 604 | 605 | return { 606 | "alertmanagers": [ 607 | { 608 | # For https we still do not render a `tls_config` section because 609 | # certs are expected to be made available by the charm via the 610 | # `update-ca-certificates` mechanism. 611 | "scheme": scheme, 612 | "path_prefix": path_prefix, 613 | "static_configs": [{"targets": netlocs}], 614 | } 615 | for (scheme, path_prefix), netlocs in paths.items() 616 | ] 617 | } 618 | 619 | 620 | class RelationNotFoundError(Exception): 621 | """Raised if there is no relation with the given name is found.""" 622 | 623 | def __init__(self, relation_name: str): 624 | self.relation_name = relation_name 625 | self.message = "No relation named '{}' found".format(relation_name) 626 | 627 | super().__init__(self.message) 628 | 629 | 630 | class RelationInterfaceMismatchError(Exception): 631 | """Raised if the relation with the given name has a different interface.""" 632 | 633 | def __init__( 634 | self, 635 | relation_name: str, 636 | expected_relation_interface: str, 637 | actual_relation_interface: str, 638 | ): 639 | self.relation_name = relation_name 640 | self.expected_relation_interface = expected_relation_interface 641 | self.actual_relation_interface = actual_relation_interface 642 | self.message = ( 643 | "The '{}' relation has '{}' as interface rather than the expected '{}'".format( 644 | relation_name, actual_relation_interface, expected_relation_interface 645 | ) 646 | ) 647 | 648 | super().__init__(self.message) 649 | 650 | 651 | class RelationRoleMismatchError(Exception): 652 | """Raised if the relation with the given name has a different role.""" 653 | 654 | def __init__( 655 | self, 656 | relation_name: str, 657 | expected_relation_role: RelationRole, 658 | actual_relation_role: RelationRole, 659 | ): 660 | self.relation_name = relation_name 661 | self.expected_relation_interface = expected_relation_role 662 | self.actual_relation_role = actual_relation_role 663 | self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( 664 | relation_name, repr(actual_relation_role), repr(expected_relation_role) 665 | ) 666 | 667 | super().__init__(self.message) 668 | 669 | 670 | class InvalidAlertRuleEvent(EventBase): 671 | """Event emitted when alert rule files are not parsable. 672 | 673 | Enables us to set a clear status on the provider. 
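
    A sketch of how the provider side emits this event (this is the event
    behind `on.alert_rule_status_changed`; the error string is illustrative):

        self.on.alert_rule_status_changed.emit(valid=False, errors="...")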
674 | """ 675 | 676 | def __init__(self, handle, errors: str = "", valid: bool = False): 677 | super().__init__(handle) 678 | self.errors = errors 679 | self.valid = valid 680 | 681 | def snapshot(self) -> Dict: 682 | """Save alert rule information.""" 683 | return { 684 | "valid": self.valid, 685 | "errors": self.errors, 686 | } 687 | 688 | def restore(self, snapshot): 689 | """Restore alert rule information.""" 690 | self.valid = snapshot["valid"] 691 | self.errors = snapshot["errors"] 692 | 693 | 694 | class InvalidScrapeJobEvent(EventBase): 695 | """Event emitted when alert rule files are not valid.""" 696 | 697 | def __init__(self, handle, errors: str = ""): 698 | super().__init__(handle) 699 | self.errors = errors 700 | 701 | def snapshot(self) -> Dict: 702 | """Save error information.""" 703 | return {"errors": self.errors} 704 | 705 | def restore(self, snapshot): 706 | """Restore error information.""" 707 | self.errors = snapshot["errors"] 708 | 709 | 710 | class MetricsEndpointProviderEvents(ObjectEvents): 711 | """Events raised by :class:`InvalidAlertRuleEvent`s.""" 712 | 713 | alert_rule_status_changed = EventSource(InvalidAlertRuleEvent) 714 | invalid_scrape_job = EventSource(InvalidScrapeJobEvent) 715 | 716 | 717 | def _type_convert_stored(obj): 718 | """Convert Stored* to their appropriate types, recursively.""" 719 | if isinstance(obj, StoredList): 720 | return list(map(_type_convert_stored, obj)) 721 | if isinstance(obj, StoredDict): 722 | rdict = {} # type: Dict[Any, Any] 723 | for k in obj.keys(): 724 | rdict[k] = _type_convert_stored(obj[k]) 725 | return rdict 726 | return obj 727 | 728 | 729 | def _validate_relation_by_interface_and_direction( 730 | charm: CharmBase, 731 | relation_name: str, 732 | expected_relation_interface: str, 733 | expected_relation_role: RelationRole, 734 | ): 735 | """Verifies that a relation has the necessary characteristics. 736 | 737 | Verifies that the `relation_name` provided: (1) exists in metadata.yaml, 738 | (2) declares as interface the interface name passed as `relation_interface` 739 | and (3) has the right "direction", i.e., it is a relation that `charm` 740 | provides or requires. 741 | 742 | Args: 743 | charm: a `CharmBase` object to scan for the matching relation. 744 | relation_name: the name of the relation to be verified. 745 | expected_relation_interface: the interface name to be matched by the 746 | relation named `relation_name`. 747 | expected_relation_role: whether the `relation_name` must be either 748 | provided or required by `charm`. 749 | 750 | Raises: 751 | RelationNotFoundError: If there is no relation in the charm's metadata.yaml 752 | with the same name as provided via `relation_name` argument. 753 | RelationInterfaceMismatchError: The relation with the same name as provided 754 | via `relation_name` argument does not have the same relation interface 755 | as specified via the `expected_relation_interface` argument. 756 | RelationRoleMismatchError: If the relation with the same name as provided 757 | via `relation_name` argument does not have the same role as specified 758 | via the `expected_relation_role` argument. 
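
    For example, the provider in this module effectively guards its own
    construction with:

        _validate_relation_by_interface_and_direction(
            charm, "metrics-endpoint", "prometheus_scrape", RelationRole.provides
        )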
759 | """ 760 | if relation_name not in charm.meta.relations: 761 | raise RelationNotFoundError(relation_name) 762 | 763 | relation = charm.meta.relations[relation_name] 764 | 765 | actual_relation_interface = relation.interface_name 766 | if actual_relation_interface != expected_relation_interface: 767 | raise RelationInterfaceMismatchError( 768 | relation_name, expected_relation_interface, actual_relation_interface or "None" 769 | ) 770 | 771 | if expected_relation_role == RelationRole.provides: 772 | if relation_name not in charm.meta.provides: 773 | raise RelationRoleMismatchError( 774 | relation_name, RelationRole.provides, RelationRole.requires 775 | ) 776 | elif expected_relation_role == RelationRole.requires: 777 | if relation_name not in charm.meta.requires: 778 | raise RelationRoleMismatchError( 779 | relation_name, RelationRole.requires, RelationRole.provides 780 | ) 781 | else: 782 | raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) 783 | 784 | 785 | class InvalidAlertRulePathError(Exception): 786 | """Raised if the alert rules folder cannot be found or is otherwise invalid.""" 787 | 788 | def __init__( 789 | self, 790 | alert_rules_absolute_path: Path, 791 | message: str, 792 | ): 793 | self.alert_rules_absolute_path = alert_rules_absolute_path 794 | self.message = message 795 | 796 | super().__init__(self.message) 797 | 798 | 799 | class TargetsChangedEvent(EventBase): 800 | """Event emitted when Prometheus scrape targets change.""" 801 | 802 | def __init__(self, handle, relation_id): 803 | super().__init__(handle) 804 | self.relation_id = relation_id 805 | 806 | def snapshot(self): 807 | """Save scrape target relation information.""" 808 | return {"relation_id": self.relation_id} 809 | 810 | def restore(self, snapshot): 811 | """Restore scrape target relation information.""" 812 | self.relation_id = snapshot["relation_id"] 813 | 814 | 815 | class MonitoringEvents(ObjectEvents): 816 | """Event descriptor for events raised by `MetricsEndpointConsumer`.""" 817 | 818 | targets_changed = EventSource(TargetsChangedEvent) 819 | 820 | 821 | class MetricsEndpointConsumer(Object): 822 | """A Prometheus based Monitoring service.""" 823 | 824 | on = MonitoringEvents() # pyright: ignore 825 | 826 | def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): 827 | """A Prometheus based Monitoring service. 828 | 829 | Args: 830 | charm: a `CharmBase` instance that manages this 831 | instance of the Prometheus service. 832 | relation_name: an optional string name of the relation between `charm` 833 | and the Prometheus charmed service. The default is "metrics-endpoint". 834 | It is strongly advised not to change the default, so that people 835 | deploying your charm will have a consistent experience with all 836 | other charms that consume metrics endpoints. 837 | 838 | Raises: 839 | RelationNotFoundError: If there is no relation in the charm's metadata.yaml 840 | with the same name as provided via `relation_name` argument. 841 | RelationInterfaceMismatchError: The relation with the same name as provided 842 | via `relation_name` argument does not have the `prometheus_scrape` relation 843 | interface. 844 | RelationRoleMismatchError: If the relation with the same name as provided 845 | via `relation_name` argument does not have the `RelationRole.requires` 846 | role. 
847 | """ 848 | _validate_relation_by_interface_and_direction( 849 | charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires 850 | ) 851 | 852 | super().__init__(charm, relation_name) 853 | self._charm = charm 854 | self._relation_name = relation_name 855 | self._tool = CosTool(self._charm) 856 | events = self._charm.on[relation_name] 857 | self.framework.observe(events.relation_changed, self._on_metrics_provider_relation_changed) 858 | self.framework.observe( 859 | events.relation_departed, self._on_metrics_provider_relation_departed 860 | ) 861 | 862 | def _on_metrics_provider_relation_changed(self, event): 863 | """Handle changes with related metrics providers. 864 | 865 | Anytime there are changes in relations between Prometheus 866 | and metrics provider charms the Prometheus charm is informed, 867 | through a `TargetsChangedEvent` event. The Prometheus charm can 868 | then choose to update its scrape configuration. 869 | 870 | Args: 871 | event: a `CharmEvent` in response to which the Prometheus 872 | charm must update its scrape configuration. 873 | """ 874 | rel_id = event.relation.id 875 | 876 | self.on.targets_changed.emit(relation_id=rel_id) 877 | 878 | def _on_metrics_provider_relation_departed(self, event): 879 | """Update job config when a metrics provider departs. 880 | 881 | When a metrics provider departs the Prometheus charm is informed 882 | through a `TargetsChangedEvent` event so that it can update its 883 | scrape configuration to ensure that the departed metrics provider 884 | is removed from the list of scrape jobs and 885 | 886 | Args: 887 | event: a `CharmEvent` that indicates a metrics provider 888 | unit has departed. 889 | """ 890 | rel_id = event.relation.id 891 | self.on.targets_changed.emit(relation_id=rel_id) 892 | 893 | def jobs(self) -> list: 894 | """Fetch the list of scrape jobs. 895 | 896 | Returns: 897 | A list consisting of all the static scrape configurations 898 | for each related `MetricsEndpointProvider` that has specified 899 | its scrape targets. 900 | """ 901 | scrape_jobs = [] 902 | 903 | for relation in self._charm.model.relations[self._relation_name]: 904 | static_scrape_jobs = self._static_scrape_config(relation) 905 | if static_scrape_jobs: 906 | # Duplicate job names will cause validate_scrape_jobs to fail. 907 | # Therefore we need to dedupe here and after all jobs are collected. 908 | static_scrape_jobs = _dedupe_job_names(static_scrape_jobs) 909 | try: 910 | self._tool.validate_scrape_jobs(static_scrape_jobs) 911 | except subprocess.CalledProcessError as e: 912 | if self._charm.unit.is_leader(): 913 | data = json.loads(relation.data[self._charm.app].get("event", "{}")) 914 | data["scrape_job_errors"] = str(e) 915 | relation.data[self._charm.app]["event"] = json.dumps(data) 916 | else: 917 | scrape_jobs.extend(static_scrape_jobs) 918 | 919 | scrape_jobs = _dedupe_job_names(scrape_jobs) 920 | 921 | return scrape_jobs 922 | 923 | @property 924 | def alerts(self) -> dict: 925 | """Fetch alerts for all relations. 926 | 927 | A Prometheus alert rules file consists of a list of "groups". Each 928 | group consists of a list of alerts (`rules`) that are sequentially 929 | executed. This method returns all the alert rules provided by each 930 | related metrics provider charm. These rules may be used to generate a 931 | separate alert rules file for each relation since the returned list 932 | of alert groups are indexed by that relations Juju topology identifier. 
933 |         The Juju topology identifier string includes substrings that identify
934 |         alert rule related metadata such as the Juju model, model UUID and the
935 |         application name from where the alert rule originates. Since this
936 |         topology identifier is globally unique, it may be used, for instance, as
937 |         the name for the file into which the list of alert rule groups are
938 |         written. For each relation, the structure of data returned is a dictionary
939 |         representation of a standard Prometheus rules file:
940 | 
941 |             {"groups": [{"name": ...}, ...]}
942 | 
943 |         per the official Prometheus documentation
944 |         https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
945 | 
946 |         The value of the `groups` key is such that it may be used to generate
947 |         a Prometheus alert rules file directly using `yaml.dump` but the
948 |         `groups` key itself must be included as this is required by Prometheus.
949 | 
950 |         For example the list of alert rule groups returned by this property may
951 |         be written into files consumed by Prometheus as follows
952 | 
953 |         ```
954 |         for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts.items():
955 |             filename = "juju_" + topology_identifier + ".rules"
956 |             path = os.path.join(PROMETHEUS_RULES_DIR, filename)
957 |             rules = yaml.safe_dump(alert_rule_groups)
958 |             container.push(path, rules, make_dirs=True)
959 |         ```
960 | 
961 |         Returns:
962 |             A dictionary mapping the Juju topology identifier of the source charm to
963 |             its list of alert rule groups.
964 |         """
965 |         alerts = {}  # type: Dict[str, dict]  # mapping b/w juju identifiers and alert rule files
966 |         for relation in self._charm.model.relations[self._relation_name]:
967 |             if not relation.units or not relation.app:
968 |                 continue
969 | 
970 |             alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}"))
971 |             if not alert_rules:
972 |                 continue
973 | 
974 |             alert_rules = self._inject_alert_expr_labels(alert_rules)
975 | 
976 |             identifier, topology = self._get_identifier_by_alert_rules(alert_rules)
977 |             if not topology:
978 |                 try:
979 |                     scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"])
980 |                     identifier = JujuTopology.from_dict(scrape_metadata).identifier
981 | 
982 |                 except KeyError as e:
983 |                     logger.debug(
984 |                         "Relation %s has no 'scrape_metadata': %s",
985 |                         relation.id,
986 |                         e,
987 |                     )
988 | 
989 |             if not identifier:
990 |                 logger.error(
991 |                     "Alert rules were found but no usable group or identifier was present."
992 |                 )
993 |                 continue
994 | 
995 |             # We need to append the relation info to the identifier. This is to allow for cases where there are two
996 |             # relations which eventually scrape the same application. Issue #551.
997 |             identifier = f"{identifier}_{relation.name}_{relation.id}"
998 | 
999 |             alerts[identifier] = alert_rules
1000 | 
1001 |             _, errmsg = self._tool.validate_alert_rules(alert_rules)
1002 |             if errmsg:
1003 |                 if alerts[identifier]:
1004 |                     del alerts[identifier]
1005 |                 if self._charm.unit.is_leader():
1006 |                     data = json.loads(relation.data[self._charm.app].get("event", "{}"))
1007 |                     data["errors"] = errmsg
1008 |                     relation.data[self._charm.app]["event"] = json.dumps(data)
1009 |                 continue
1010 | 
1011 |         return alerts
1012 | 
1013 |     def _get_identifier_by_alert_rules(
1014 |         self, rules: dict
1015 |     ) -> Tuple[Union[str, None], Union[JujuTopology, None]]:
1016 |         """Determine an appropriate dict key for alert rules.
1017 | 1018 | The key is used as the filename when writing alerts to disk, so the structure 1019 | and uniqueness is important. 1020 | 1021 | Args: 1022 | rules: a dict of alert rules 1023 | Returns: 1024 | A tuple containing an identifier, if found, and a JujuTopology, if it could 1025 | be constructed. 1026 | """ 1027 | if "groups" not in rules: 1028 | logger.debug("No alert groups were found in relation data") 1029 | return None, None 1030 | 1031 | # Construct an ID based on what's in the alert rules if they have labels 1032 | for group in rules["groups"]: 1033 | try: 1034 | labels = group["rules"][0]["labels"] 1035 | topology = JujuTopology( 1036 | # Don't try to safely get required constructor fields. There's already 1037 | # a handler for KeyErrors 1038 | model_uuid=labels["juju_model_uuid"], 1039 | model=labels["juju_model"], 1040 | application=labels["juju_application"], 1041 | unit=labels.get("juju_unit", ""), 1042 | charm_name=labels.get("juju_charm", ""), 1043 | ) 1044 | return topology.identifier, topology 1045 | except KeyError: 1046 | logger.debug("Alert rules were found but no usable labels were present") 1047 | continue 1048 | 1049 | logger.warning( 1050 | "No labeled alert rules were found, and no 'scrape_metadata' " 1051 | "was available. Using the alert group name as filename." 1052 | ) 1053 | try: 1054 | for group in rules["groups"]: 1055 | return group["name"], None 1056 | except KeyError: 1057 | logger.debug("No group name was found to use as identifier") 1058 | 1059 | return None, None 1060 | 1061 | def _inject_alert_expr_labels(self, rules: Dict[str, Any]) -> Dict[str, Any]: 1062 | """Iterate through alert rules and inject topology into expressions. 1063 | 1064 | Args: 1065 | rules: a dict of alert rules 1066 | """ 1067 | if "groups" not in rules: 1068 | return rules 1069 | 1070 | modified_groups = [] 1071 | for group in rules["groups"]: 1072 | # Copy off rules, so we don't modify an object we're iterating over 1073 | rules_copy = group["rules"] 1074 | for idx, rule in enumerate(rules_copy): 1075 | labels = rule.get("labels") 1076 | 1077 | if labels: 1078 | try: 1079 | topology = JujuTopology( 1080 | # Don't try to safely get required constructor fields. There's already 1081 | # a handler for KeyErrors 1082 | model_uuid=labels["juju_model_uuid"], 1083 | model=labels["juju_model"], 1084 | application=labels["juju_application"], 1085 | unit=labels.get("juju_unit", ""), 1086 | charm_name=labels.get("juju_charm", ""), 1087 | ) 1088 | 1089 | # Inject topology and put it back in the list 1090 | rule["expr"] = self._tool.inject_label_matchers( 1091 | re.sub(r"%%juju_topology%%,?", "", rule["expr"]), 1092 | topology.alert_expression_dict, 1093 | ) 1094 | except KeyError: 1095 | # Some required JujuTopology key is missing. Just move on. 1096 | pass 1097 | 1098 | group["rules"][idx] = rule 1099 | 1100 | modified_groups.append(group) 1101 | 1102 | rules["groups"] = modified_groups 1103 | return rules 1104 | 1105 | def _static_scrape_config(self, relation) -> list: 1106 | """Generate the static scrape configuration for a single relation. 1107 | 1108 | If the relation data includes `scrape_metadata` then the value 1109 | of this key is used to annotate the scrape jobs with Juju 1110 | Topology labels before returning them. 1111 | 1112 | Args: 1113 | relation: an `ops.model.Relation` object whose static 1114 | scrape configuration is required. 1115 | 1116 | Returns: 1117 | A list (possibly empty) of scrape jobs. 
Each job is a 1118 | valid Prometheus scrape configuration for that job, 1119 | represented as a Python dictionary. 1120 | """ 1121 | if not relation.units: 1122 | return [] 1123 | 1124 | scrape_configs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) 1125 | 1126 | if not scrape_configs: 1127 | return [] 1128 | 1129 | scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) 1130 | 1131 | if not scrape_metadata: 1132 | return scrape_configs 1133 | 1134 | topology = JujuTopology.from_dict(scrape_metadata) 1135 | 1136 | job_name_prefix = "juju_{}_prometheus_scrape".format(topology.identifier) 1137 | scrape_configs = PrometheusConfig.prefix_job_names(scrape_configs, job_name_prefix) 1138 | scrape_configs = PrometheusConfig.sanitize_scrape_configs(scrape_configs) 1139 | 1140 | hosts = self._relation_hosts(relation) 1141 | 1142 | scrape_configs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs( 1143 | scrape_configs, hosts, topology 1144 | ) 1145 | 1146 | # For https scrape targets we still do not render a `tls_config` section because certs 1147 | # are expected to be made available by the charm via the `update-ca-certificates` mechanism. 1148 | return scrape_configs 1149 | 1150 | def _relation_hosts(self, relation: Relation) -> Dict[str, Tuple[str, str]]: 1151 | """Returns a mapping from unit names to (address, path) tuples, for the given relation.""" 1152 | hosts = {} 1153 | for unit in relation.units: 1154 | if not (unit_databag := relation.data.get(unit)): 1155 | continue 1156 | 1157 | unit_path = unit_databag.get("prometheus_scrape_unit_path", "") 1158 | # TODO deprecate and remove unit.name 1159 | unit_name = unit_databag.get("prometheus_scrape_unit_name") or unit.name 1160 | # TODO deprecate and remove "prometheus_scrape_host" 1161 | unit_address = unit_databag.get("prometheus_scrape_unit_address") or unit_databag.get( 1162 | "prometheus_scrape_host" 1163 | ) 1164 | 1165 | if not (unit_name and unit_address): 1166 | continue 1167 | 1168 | hosts.update({unit_name: (unit_address, unit_path)}) 1169 | 1170 | return hosts 1171 | 1172 | def _target_parts(self, target) -> list: 1173 | """Extract host and port from a wildcard target. 1174 | 1175 | Args: 1176 | target: a string specifying a scrape target. A 1177 | scrape target is expected to have the format 1178 | "host:port". The host part may be a wildcard 1179 | "*" and the port part can be missing (along 1180 | with ":") in which case port is set to 80. 1181 | 1182 | Returns: 1183 | a list with target host and port as in [host, port] 1184 | """ 1185 | if ":" in target: 1186 | parts = target.split(":") 1187 | else: 1188 | parts = [target, "80"] 1189 | 1190 | return parts 1191 | 1192 | 1193 | def _dedupe_job_names(jobs: List[dict]): 1194 | """Deduplicate a list of dicts by appending a hash to the value of the 'job_name' key. 1195 | 1196 | Additionally, fully de-duplicate any identical jobs. 
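
    A sketch of the renaming behaviour (each suffix is the SHA-256 of the
    job's JSON; hashes shortened here for illustration):

        [{"job_name": "foo", "metrics_path": "/a"},
         {"job_name": "foo", "metrics_path": "/b"}]
        # becomes
        [{"job_name": "foo_3f8a...", "metrics_path": "/a"},
         {"job_name": "foo_9c41...", "metrics_path": "/b"}]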
1197 | 
1198 |     Args:
1199 |         jobs: A list of Prometheus scrape jobs
1200 |     """
1201 |     jobs_copy = copy.deepcopy(jobs)
1202 | 
1203 |     # Convert to a dict with job names as keys
1204 |     # I think this line is O(n^2) but it should be okay given the list sizes
1205 |     jobs_dict = {
1206 |         job["job_name"]: list(filter(lambda x: x["job_name"] == job["job_name"], jobs_copy))
1207 |         for job in jobs_copy
1208 |     }
1209 | 
1210 |     # If multiple jobs have the same name, convert the name to "name_<hash>"
1211 |     for key in jobs_dict:
1212 |         if len(jobs_dict[key]) > 1:
1213 |             for job in jobs_dict[key]:
1214 |                 job_json = json.dumps(job)
1215 |                 hashed = hashlib.sha256(job_json.encode()).hexdigest()
1216 |                 job["job_name"] = "{}_{}".format(job["job_name"], hashed)
1217 |     new_jobs = []
1218 |     for key in jobs_dict:
1219 |         new_jobs.extend(list(jobs_dict[key]))
1220 | 
1221 |     # Deduplicate jobs which are equal
1222 |     # Again this is O(n^2), but it should be okay
1223 |     deduped_jobs = []
1224 |     seen = []
1225 |     for job in new_jobs:
1226 |         job_json = json.dumps(job)
1227 |         hashed = hashlib.sha256(job_json.encode()).hexdigest()
1228 |         if hashed in seen:
1229 |             continue
1230 |         seen.append(hashed)
1231 |         deduped_jobs.append(job)
1232 | 
1233 |     return deduped_jobs
1234 | 
1235 | 
1236 | def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str:
1237 |     """Resolve the provided path items against the directory of the main file.
1238 | 
1239 |     Look up the directory of the `main.py` file being executed. This is normally
1240 |     going to be the charm.py file of the charm including this library. Then, resolve
1241 |     the provided path elements and, if the resulting path exists and is a directory,
1242 |     return its absolute path; otherwise, raise an exception.
1243 | 
1244 |     Raises:
1245 |         InvalidAlertRulePathError, if the path does not exist or is not a directory.
1246 |     """
1247 |     charm_dir = Path(str(charm.charm_dir))
1248 |     if not charm_dir.exists() or not charm_dir.is_dir():
1249 |         # Operator Framework does not currently expose a robust
1250 |         # way to determine the top level charm source directory
1251 |         # that is consistent across deployed charms and unit tests
1252 |         # Hence for unit tests the current working directory is used
1253 |         # TODO: update this logic when the following ticket is resolved
1254 |         # https://github.com/canonical/operator/issues/643
1255 |         charm_dir = Path(os.getcwd())
1256 | 
1257 |     alerts_dir_path = charm_dir.absolute().joinpath(*path_elements)
1258 | 
1259 |     if not alerts_dir_path.exists():
1260 |         raise InvalidAlertRulePathError(alerts_dir_path, "directory does not exist")
1261 |     if not alerts_dir_path.is_dir():
1262 |         raise InvalidAlertRulePathError(alerts_dir_path, "is not a directory")
1263 | 
1264 |     return str(alerts_dir_path)
1265 | 
1266 | 
1267 | class MetricsEndpointProvider(Object):
1268 |     """A metrics endpoint for Prometheus."""
1269 | 
1270 |     on = MetricsEndpointProviderEvents()  # pyright: ignore
1271 | 
1272 |     def __init__(
1273 |         self,
1274 |         charm,
1275 |         relation_name: str = DEFAULT_RELATION_NAME,
1276 |         jobs=None,
1277 |         alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH,
1278 |         refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None,
1279 |         external_url: str = "",
1280 |         lookaside_jobs_callable: Optional[Callable] = None,
1281 |         *,
1282 |         forward_alert_rules: bool = True,
1283 |     ):
1284 |         """Construct a metrics provider for a Prometheus charm.
1285 | 
1286 |         If your charm exposes a Prometheus metrics endpoint, the
1287 |         `MetricsEndpointProvider` object enables your charm to easily
1288 |         communicate how to reach that metrics endpoint.
1289 | 
1290 |         By default, a charm instantiating this object has the metrics
1291 |         endpoints of each of its units scraped by the related Prometheus
1292 |         charms. The scraped metrics are automatically tagged by the
1293 |         Prometheus charms with Juju topology data via the
1294 |         `juju_model_name`, `juju_model_uuid`, `juju_application_name`
1295 |         and `juju_unit` labels. To support such tagging `MetricsEndpointProvider`
1296 |         automatically forwards scrape metadata to a `MetricsEndpointConsumer`
1297 |         (Prometheus charm).
1298 | 
1299 |         Scrape targets provided by `MetricsEndpointProvider` can be
1300 |         customized when instantiating this object. For example in the
1301 |         case of a charm exposing the metrics endpoint for each of its
1302 |         units on port 8080 and the `/metrics` path, the
1303 |         `MetricsEndpointProvider` can be instantiated as follows:
1304 | 
1305 |             self.metrics_endpoint_provider = MetricsEndpointProvider(
1306 |                 self,
1307 |                 jobs=[{
1308 |                     "static_configs": [{"targets": ["*:8080"]}],
1309 |                 }])
1310 | 
1311 |         The notation `*:<port>` means "scrape each unit of this charm on port
1312 |         `<port>`".
1313 | 
1314 |         In case the metrics endpoints are not on the standard `/metrics` path,
1315 |         a custom path can be specified as follows:
1316 | 
1317 |             self.metrics_endpoint_provider = MetricsEndpointProvider(
1318 |                 self,
1319 |                 jobs=[{
1320 |                     "metrics_path": "/my/strange/metrics/path",
1321 |                     "static_configs": [{"targets": ["*:8080"]}],
1322 |                 }])
1323 | 
1324 |         Note how the `jobs` argument is a list: this allows you to expose multiple
1325 |         combinations of paths ("metrics_path") and targets ("static_configs") in case your charm
1326 |         exposes multiple endpoints, which could happen, for example, when you have
1327 |         multiple workload containers, with applications in each needing to be scraped.
1328 |         The structure of the objects in the `jobs` list is one-to-one with the
1329 |         `scrape_config` configuration item of Prometheus' own configuration (see
1330 |         https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
1331 |         ), but with only a subset of the fields allowed. The permitted fields are
1332 |         listed in the `ALLOWED_KEYS` object in this charm library module.
1333 | 
1334 |         It is also possible to specify alert rules. By default, this library will look
1335 |         into the `prometheus_alert_rules` directory under the charm source, which in a standard charm
1336 |         layout resolves to `src/prometheus_alert_rules`. Each alert rule goes into a
1337 |         separate `*.rule` file. If the syntax of a rule is invalid,
1338 |         the `MetricsEndpointProvider` logs an error and does not load the particular
1339 |         rule.
1340 | 
1341 |         To avoid false positives and negatives in the evaluation of alert rules,
1342 |         all ingested alert rule expressions are automatically qualified using Juju
1343 |         Topology filters. This ensures that alert rules provided by your charm trigger
1344 |         alerts based only on data scraped from your charm.
For example, an alert rule
1345 |         such as the following
1346 | 
1347 |             alert: UnitUnavailable
1348 |             expr: up < 1
1349 |             for: 0m
1350 | 
1351 |         will be automatically transformed into something along the lines of the following
1352 | 
1353 |             alert: UnitUnavailable
1354 |             expr: up{juju_model=<model_name>, juju_model_uuid=<model_uuid>, juju_application=<application_name>} < 1
1355 |             for: 0m
1356 | 
1357 |         An attempt will be made to validate alert rules prior to loading them into Prometheus.
1358 |         If they are invalid, an event will be emitted from this object which charms can respond
1359 |         to in order to set a meaningful status for administrators.
1360 | 
1361 |         This can be observed via `consumer.on.alert_rule_status_changed`, which contains:
1362 |         - The error(s) encountered when validating as `errors`
1363 |         - A `valid` attribute, which can be used to reset the state of charms if alert rules
1364 |           are updated via another mechanism (e.g. `cos-config`) and refreshed.
1365 | 
1366 |         Args:
1367 |             charm: a `CharmBase` object that manages this
1368 |                 `MetricsEndpointProvider` object. Typically, this is
1369 |                 `self` in the instantiating class.
1370 |             relation_name: an optional string name of the relation between `charm`
1371 |                 and the Prometheus charmed service. The default is "metrics-endpoint".
1372 |                 It is strongly advised not to change the default, so that people
1373 |                 deploying your charm will have a consistent experience with all
1374 |                 other charms that provide metrics endpoints.
1375 |             jobs: an optional list of dictionaries where each
1376 |                 dictionary represents the Prometheus scrape
1377 |                 configuration for a single job. When not provided, a
1378 |                 default scrape configuration is provided for the
1379 |                 `/metrics` endpoint polling all units of the charm on port `80`
1380 |                 using the `MetricsEndpointProvider` object.
1381 |             alert_rules_path: an optional path for the location of alert rules
1382 |                 files. Defaults to "./prometheus_alert_rules",
1383 |                 resolved relative to the directory hosting the charm entry file.
1384 |                 The alert rules are automatically updated on charm upgrade.
1385 |             forward_alert_rules: a boolean flag to toggle forwarding of charmed alert rules.
1386 |             refresh_event: an optional bound event or list of bound events which
1387 |                 will be observed to re-set scrape job data (IP address and others)
1388 |             external_url: an optional argument that represents an external url that
1389 |                 can be generated by an Ingress or a Proxy.
1390 |             lookaside_jobs_callable: an optional `Callable` which should be invoked
1391 |                 when the job configuration is built as a secondary mapping. The callable
1392 |                 should return a `List[Dict]` which is syntactically identical to the
1393 |                 `jobs` parameter, but can be updated out of step with the initialization of
1394 |                 this library without disrupting the 'global' job spec.
1395 | 
1396 |         Raises:
1397 |             RelationNotFoundError: If there is no relation in the charm's metadata.yaml
1398 |                 with the same name as provided via `relation_name` argument.
1399 |             RelationInterfaceMismatchError: The relation with the same name as provided
1400 |                 via `relation_name` argument does not have the `prometheus_scrape` relation
1401 |                 interface.
1402 |             RelationRoleMismatchError: If the relation with the same name as provided
1403 |                 via `relation_name` argument does not have the `RelationRole.provides`
1404 |                 role.
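
        As a sketch, a charm could surface these validation events to the
        administrator by observing them (the attribute and handler names here
        are illustrative, not mandated by the library):

            self.framework.observe(
                self.metrics_endpoint_provider.on.alert_rule_status_changed,
                self._on_alert_rule_status_changed,
            )
            self.framework.observe(
                self.metrics_endpoint_provider.on.invalid_scrape_job,
                self._on_invalid_scrape_job,
            )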
1405 | """ 1406 | _validate_relation_by_interface_and_direction( 1407 | charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides 1408 | ) 1409 | 1410 | try: 1411 | alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path) 1412 | except InvalidAlertRulePathError as e: 1413 | logger.debug( 1414 | "Invalid Prometheus alert rules folder at %s: %s", 1415 | e.alert_rules_absolute_path, 1416 | e.message, 1417 | ) 1418 | 1419 | super().__init__(charm, relation_name) 1420 | self.topology = JujuTopology.from_charm(charm) 1421 | 1422 | self._charm = charm 1423 | self._alert_rules_path = alert_rules_path 1424 | self._forward_alert_rules = forward_alert_rules 1425 | self._relation_name = relation_name 1426 | # sanitize job configurations to the supported subset of parameters 1427 | jobs = [] if jobs is None else jobs 1428 | self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) 1429 | 1430 | if external_url: 1431 | external_url = ( 1432 | external_url if urlparse(external_url).scheme else ("http://" + external_url) 1433 | ) 1434 | self.external_url = external_url 1435 | self._lookaside_jobs = lookaside_jobs_callable 1436 | 1437 | events = self._charm.on[self._relation_name] 1438 | self.framework.observe(events.relation_changed, self._on_relation_changed) 1439 | 1440 | if not refresh_event: 1441 | # FIXME remove once podspec charms are verified. 1442 | # `self.set_scrape_job_spec()` is called every re-init so this should not be needed. 1443 | if len(self._charm.meta.containers) == 1: 1444 | if "kubernetes" in self._charm.meta.series: 1445 | # This is a podspec charm 1446 | refresh_event = [self._charm.on.update_status] 1447 | else: 1448 | # This is a sidecar/pebble charm 1449 | container = list(self._charm.meta.containers.values())[0] 1450 | refresh_event = [self._charm.on[container.name.replace("-", "_")].pebble_ready] 1451 | else: 1452 | logger.warning( 1453 | "%d containers are present in metadata.yaml and " 1454 | "refresh_event was not specified. Defaulting to update_status. " 1455 | "Metrics IP may not be set in a timely fashion.", 1456 | len(self._charm.meta.containers), 1457 | ) 1458 | refresh_event = [self._charm.on.update_status] 1459 | 1460 | else: 1461 | if not isinstance(refresh_event, list): 1462 | refresh_event = [refresh_event] 1463 | 1464 | self.framework.observe(events.relation_joined, self.set_scrape_job_spec) 1465 | for ev in refresh_event: 1466 | self.framework.observe(ev, self.set_scrape_job_spec) 1467 | 1468 | def _on_relation_changed(self, event): 1469 | """Check for alert rule messages in the relation data before moving on.""" 1470 | if self._charm.unit.is_leader(): 1471 | ev = json.loads(event.relation.data[event.app].get("event", "{}")) 1472 | 1473 | if ev: 1474 | valid = bool(ev.get("valid", True)) 1475 | errors = ev.get("errors", "") 1476 | 1477 | if valid and not errors: 1478 | self.on.alert_rule_status_changed.emit(valid=valid) 1479 | else: 1480 | self.on.alert_rule_status_changed.emit(valid=valid, errors=errors) 1481 | 1482 | scrape_errors = ev.get("scrape_job_errors", None) 1483 | if scrape_errors: 1484 | self.on.invalid_scrape_job.emit(errors=scrape_errors) 1485 | 1486 | def update_scrape_job_spec(self, jobs): 1487 | """Update scrape job specification.""" 1488 | self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) 1489 | self.set_scrape_job_spec() 1490 | 1491 | def set_scrape_job_spec(self, _=None): 1492 | """Ensure scrape target information is made available to prometheus. 
1493 | 
1494 | When a metrics provider charm is related to a Prometheus charm, the
1495 | metrics provider sets specification and metadata related to its own
1496 | scrape configuration. This information is set using Juju application
1497 | data. In addition, each unit of the metrics provider also sets its own
1498 | host address in Juju unit relation data.
1499 | """
1500 | self._set_unit_ip()
1501 | 
1502 | if not self._charm.unit.is_leader():
1503 | return
1504 | 
1505 | alert_rules = AlertRules(query_type="promql", topology=self.topology)
1506 | if self._forward_alert_rules:
1507 | alert_rules.add_path(self._alert_rules_path, recursive=True)
1508 | alert_rules.add(
1509 | copy.deepcopy(generic_alert_groups.application_rules),
1510 | group_name_prefix=self.topology.identifier,
1511 | )
1512 | alert_rules_as_dict = alert_rules.as_dict()
1513 | 
1514 | for relation in self._charm.model.relations[self._relation_name]:
1515 | relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata)
1516 | relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs)
1517 | 
1518 | # Update relation data with the string representation of the rule file.
1519 | # Juju topology is already included in the "scrape_metadata" field above.
1520 | # The consumer side of the relation uses this information to name the rules file
1521 | # that is written to the filesystem.
1522 | relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict)
1523 | 
1524 | def _set_unit_ip(self, _=None):
1525 | """Set unit host address.
1526 | 
1527 | Each time a metrics provider charm container is restarted it updates its own
1528 | host address in the unit relation data for the Prometheus charm.
1529 | 
1530 | The only argument specified is an event, and it is ignored. This is for
1531 | expediency, so that this method can also be used as an event handler,
1532 | although no access to the event is actually needed.
1533 | """
1534 | for relation in self._charm.model.relations[self._relation_name]:
1535 | unit_ip = str(self._charm.model.get_binding(relation).network.bind_address)
1536 | 
1537 | # TODO store the entire URL in relation data, instead of only select URL parts.
1538 | 
1539 | if self.external_url:
1540 | parsed = urlparse(self.external_url)
1541 | unit_address = parsed.hostname
1542 | path = parsed.path
1543 | elif self._is_valid_unit_address(unit_ip):
1544 | unit_address = unit_ip
1545 | path = ""
1546 | else:
1547 | unit_address = socket.getfqdn()
1548 | path = ""
1549 | 
1550 | relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = unit_address
1551 | relation.data[self._charm.unit]["prometheus_scrape_unit_path"] = path
1552 | relation.data[self._charm.unit]["prometheus_scrape_unit_name"] = str(
1553 | self._charm.model.unit.name
1554 | )
1555 | 
1556 | def _is_valid_unit_address(self, address: str) -> bool:
1557 | """Validate a unit address.
1558 | 
1559 | At present only IP address validation is supported, but
1560 | this may be extended to DNS addresses also, as needed.
1561 | 
1562 | Args:
1563 | address: a string representing a unit address
1564 | """
1565 | try:
1566 | _ = ipaddress.ip_address(address)
1567 | except ValueError:
1568 | return False
1569 | 
1570 | return True
1571 | 
1572 | @property
1573 | def _scrape_jobs(self) -> list:
1574 | """Fetch list of scrape jobs.
1575 | 
1576 | Returns:
1577 | A list of dictionaries, where each dictionary specifies a
1578 | single scrape job for Prometheus.
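
For illustration, a sketch of one such dictionary, mirroring the library's
default job (the exact fields of DEFAULT_JOB are assumed here, not quoted
from its definition):

    {
        "metrics_path": "/metrics",
        "static_configs": [{"targets": ["*:80"]}],
    }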
1579 | """ 1580 | jobs = self._jobs or [] 1581 | if callable(self._lookaside_jobs): 1582 | jobs.extend(PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs())) 1583 | return jobs or [DEFAULT_JOB] 1584 | 1585 | @property 1586 | def _scrape_metadata(self) -> dict: 1587 | """Generate scrape metadata. 1588 | 1589 | Returns: 1590 | Scrape configuration metadata for this metrics provider charm. 1591 | """ 1592 | return self.topology.as_dict() 1593 | 1594 | 1595 | class PrometheusRulesProvider(Object): 1596 | """Forward rules to Prometheus. 1597 | 1598 | This object may be used to forward rules to Prometheus. At present it only supports 1599 | forwarding alert rules. This is unlike :class:`MetricsEndpointProvider`, which 1600 | is used for forwarding both scrape targets and associated alert rules. This object 1601 | is typically used when there is a desire to forward rules that apply globally (across 1602 | all deployed charms and units) rather than to a single charm. All rule files are 1603 | forwarded using the same 'prometheus_scrape' interface that is also used by 1604 | `MetricsEndpointProvider`. 1605 | 1606 | Args: 1607 | charm: A charm instance that `provides` a relation with the `prometheus_scrape` interface. 1608 | relation_name: Name of the relation in `metadata.yaml` that 1609 | has the `prometheus_scrape` interface. 1610 | dir_path: Root directory for the collection of rule files. 1611 | recursive: Whether to scan for rule files recursively. 1612 | """ 1613 | 1614 | def __init__( 1615 | self, 1616 | charm: CharmBase, 1617 | relation_name: str = DEFAULT_RELATION_NAME, 1618 | dir_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, 1619 | recursive=True, 1620 | ): 1621 | super().__init__(charm, relation_name) 1622 | self._charm = charm 1623 | self._relation_name = relation_name 1624 | self._recursive = recursive 1625 | 1626 | try: 1627 | dir_path = _resolve_dir_against_charm_path(charm, dir_path) 1628 | except InvalidAlertRulePathError as e: 1629 | logger.debug( 1630 | "Invalid Prometheus alert rules folder at %s: %s", 1631 | e.alert_rules_absolute_path, 1632 | e.message, 1633 | ) 1634 | self.dir_path = dir_path 1635 | 1636 | events = self._charm.on[self._relation_name] 1637 | event_sources = [ 1638 | events.relation_joined, 1639 | events.relation_changed, 1640 | self._charm.on.leader_elected, 1641 | self._charm.on.upgrade_charm, 1642 | ] 1643 | 1644 | for event_source in event_sources: 1645 | self.framework.observe(event_source, self._update_relation_data) 1646 | 1647 | def _reinitialize_alert_rules(self): 1648 | """Reloads alert rules and updates all relations.""" 1649 | self._update_relation_data(None) 1650 | 1651 | def _update_relation_data(self, _): 1652 | """Update application relation data with alert rules for all relations.""" 1653 | if not self._charm.unit.is_leader(): 1654 | return 1655 | 1656 | alert_rules = AlertRules(query_type="promql") 1657 | alert_rules.add_path(self.dir_path, recursive=self._recursive) 1658 | alert_rules_as_dict = alert_rules.as_dict() 1659 | 1660 | logger.info("Updating relation data with rule files from disk") 1661 | for relation in self._charm.model.relations[self._relation_name]: 1662 | relation.data[self._charm.app]["alert_rules"] = json.dumps( 1663 | alert_rules_as_dict, 1664 | sort_keys=True, # sort, to prevent unnecessary relation_changed events 1665 | ) 1666 | 1667 | class CosTool: 1668 | """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" 1669 | 1670 | _path = None 1671 | _disabled = False 1672 | 
1673 | def __init__(self, charm):
1674 | self._charm = charm
1675 | 
1676 | @property
1677 | def path(self):
1678 | """Lazy lookup of the path of cos-tool."""
1679 | if self._disabled:
1680 | return None
1681 | if not self._path:
1682 | self._path = self._get_tool_path()
1683 | if not self._path:
1684 | logger.debug("Skipping injection of juju topology as label matchers")
1685 | self._disabled = True
1686 | return self._path
1687 | 
1688 | def apply_label_matchers(self, rules) -> dict:
1689 | """Apply label matchers to the expression of all alerts in all supplied groups."""
1690 | if not self.path:
1691 | return rules
1692 | for group in rules["groups"]:
1693 | rules_in_group = group.get("rules", [])
1694 | for rule in rules_in_group:
1695 | topology = {}
1696 | # if the user for some reason has provided juju_unit, we'll need to honor it
1697 | # in most cases, however, this will be empty
1698 | for label in [
1699 | "juju_model",
1700 | "juju_model_uuid",
1701 | "juju_application",
1702 | "juju_charm",
1703 | "juju_unit",
1704 | ]:
1705 | if label in rule.get("labels", {}):  # tolerate rules without a "labels" mapping
1706 | topology[label] = rule["labels"][label]
1707 | 
1708 | rule["expr"] = self.inject_label_matchers(rule["expr"], topology)
1709 | return rules
1710 | 
1711 | def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]:
1712 | """Validate correctness of alert rules, returning a boolean and any errors."""
1713 | if not self.path:
1714 | logger.debug("`cos-tool` unavailable. Not validating alert correctness.")
1715 | return True, ""
1716 | 
1717 | with tempfile.TemporaryDirectory() as tmpdir:
1718 | rule_path = Path(tmpdir, "validate_rule.yaml")
1719 | rule_path.write_text(yaml.dump(rules))
1720 | 
1721 | args = [str(self.path), "validate", str(rule_path)]
1722 | # noinspection PyBroadException
1723 | try:
1724 | self._exec(args)
1725 | return True, ""
1726 | except subprocess.CalledProcessError as e:
1727 | logger.debug("Validating the rules failed: %s", e.output.decode("utf8"))
1728 | return False, ", ".join(
1729 | [
1730 | line
1731 | for line in e.output.decode("utf8").splitlines()
1732 | if "error validating" in line
1733 | ]
1734 | )
1735 | 
1736 | def validate_scrape_jobs(self, jobs: list) -> bool:
1737 | """Validate scrape jobs using cos-tool."""
1738 | if not self.path:
1739 | logger.debug("`cos-tool` unavailable. Not validating scrape jobs.")
1740 | return True
1741 | conf = {"scrape_configs": jobs}
1742 | with tempfile.NamedTemporaryFile() as tmpfile:
1743 | with open(tmpfile.name, "w") as f:
1744 | f.write(yaml.safe_dump(conf))
1745 | try:
1746 | self._exec([str(self.path), "validate-config", tmpfile.name])
1747 | except subprocess.CalledProcessError as e:
1748 | logger.error("Validating scrape jobs failed: %s", e.output)
1749 | raise
1750 | return True
1751 | 
1752 | def inject_label_matchers(self, expression, topology) -> str:
1753 | """Add label matchers to an expression."""
1754 | if not topology:
1755 | return expression
1756 | if not self.path:
1757 | logger.debug("`cos-tool` unavailable. Leaving expression unchanged: %s", expression)
1758 | return expression
1759 | args = [str(self.path), "transform"]
1760 | args.extend(
1761 | ["--label-matcher={}={}".format(key, value) for key, value in topology.items()]
1762 | )
1763 | 
1764 | args.append(expression)
1765 | # noinspection PyBroadException
1766 | try:
1767 | return self._exec(args)
1768 | except subprocess.CalledProcessError as e:
1769 | logger.debug('Applying the expression failed: "%s", falling back to the original', e)
1770 | return expression
1771 | 
1772 | def _get_tool_path(self) -> Optional[Path]:
1773 | arch = platform.machine()
1774 | arch = "amd64" if arch == "x86_64" else arch
1775 | res = "cos-tool-{}".format(arch)
1776 | try:
1777 | path = Path(res).resolve(strict=True)
1778 | return path
1779 | except OSError:  # FileNotFoundError is a subclass of OSError
1780 | logger.debug('Could not locate cos-tool at: "%s"', res)
1781 | return None
1782 | 
1783 | def _exec(self, cmd) -> str:
1784 | result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
1785 | return result.stdout.decode("utf-8").strip()
1786 | 
--------------------------------------------------------------------------------