├── .wokeignore
├── .jujuignore
├── CODEOWNERS
├── .gitignore
├── .github
│   ├── renovate.json5
│   ├── workflows
│   │   ├── pull-request.yaml
│   │   ├── release.yaml
│   │   ├── tiobe-scan.yaml
│   │   ├── update-libs.yaml
│   │   ├── quality-gates.yaml
│   │   └── promote.yaml
│   ├── .jira_sync_config.yaml
│   ├── pull_request_template.md
│   └── ISSUE_TEMPLATE
│       ├── enhancement_proposal.yml
│       └── bug_report.yml
├── INTEGRATING.md
├── src
│   ├── prometheus_alert_rules
│   │   ├── always_firing_numeric.rule
│   │   └── always_firing_absent.rule
│   ├── grafana_dashboards
│   │   └── avalanche.json
│   ├── kubernetes_service.py
│   └── charm.py
├── tests
│   ├── integration
│   │   ├── test_metrics_endpoint.py
│   │   ├── test_upgrade_charm.py
│   │   ├── test_remote_write.py
│   │   ├── helpers.py
│   │   └── conftest.py
│   └── unit
│       ├── test_charm.py
│       └── test_disable_alerts.py
├── SECURITY.md
├── tox.ini
├── pyproject.toml
├── README.md
├── RELEASE.md
├── charmcraft.yaml
├── CONTRIBUTING.md
├── lib
│   └── charms
│       ├── observability_libs
│       │   └── v0
│       │       └── juju_topology.py
│       └── prometheus_k8s
│           ├── v1
│           │   └── prometheus_remote_write.py
│           └── v0
│               └── prometheus_scrape.py
└── LICENSE

/.wokeignore:
--------------------------------------------------------------------------------
1 | src/prometheus_alert_rules
2 | 
--------------------------------------------------------------------------------
/.jujuignore:
--------------------------------------------------------------------------------
1 | /venv
2 | *.py[cod]
3 | *.charm
4 | 
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @canonical/Observability
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | venv/
2 | build/
3 | *.charm
4 | 
5 | .coverage
6 | __pycache__/
7 | *.py[cod]
8 | .tox/
9 | 
--------------------------------------------------------------------------------
/.github/renovate.json5:
--------------------------------------------------------------------------------
1 | {
2 |   "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3 |   "extends": [
4 |     "github>canonical/observability//.github/renovate/charms.json5",
5 |   ],
6 | }
7 | 
--------------------------------------------------------------------------------
/.github/workflows/pull-request.yaml:
--------------------------------------------------------------------------------
1 | name: Pull Requests
2 | 
3 | on:
4 |   pull_request:
5 |     branches:
6 |       - main
7 |       - track/**
8 | 
9 | jobs:
10 |   pull-request:
11 |     name: PR
12 |     uses: canonical/observability/.github/workflows/charm-pull-request.yaml@v1
13 |     secrets: inherit
14 | 
--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
1 | name: Release Charm
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |       - track/**
8 | 
9 | jobs:
10 |   release:
11 |     uses: canonical/observability/.github/workflows/charm-release.yaml@v1
12 |     secrets: inherit
13 |     with:
14 |       default-track: dev
15 | 
--------------------------------------------------------------------------------
/INTEGRATING.md:
--------------------------------------------------------------------------------
1 | ## Integrating avalanche-operator
2 | avalanche-operator integrates with any charm that `requires` the
3 | `prometheus_scrape` interface.
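As an illustration, the requirer side of that relation could look like the following minimal sketch. It assumes the `prometheus_scrape` charm library vendored under `lib/` in this repository, and a consumer charm (hypothetical name `MyScraperCharm`) whose metadata declares a `metrics-endpoint` requirer endpoint:

```python
from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer
from ops.charm import CharmBase
from ops.main import main
from ops.model import ActiveStatus


class MyScraperCharm(CharmBase):
    """Hypothetical charm that scrapes avalanche's metrics endpoint."""

    def __init__(self, *args):
        super().__init__(*args)
        # The consumer defaults to the "metrics-endpoint" relation name
        self.metrics_consumer = MetricsEndpointConsumer(self)
        self.framework.observe(
            self.metrics_consumer.on.targets_changed, self._on_targets_changed
        )

    def _on_targets_changed(self, event):
        # Scrape jobs (and alert rules) advertised by avalanche over the relation
        jobs = self.metrics_consumer.jobs()
        self.unit.status = ActiveStatus(f"scraping {len(jobs)} job(s)")


if __name__ == "__main__":
    main(MyScraperCharm)
```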
4 | 
5 | ### Related charms
6 | #### Prometheus
7 | Avalanche is intended for load-testing [prometheus][Prometheus operator].
8 | 
9 | [Prometheus operator]: https://charmhub.io/prometheus-k8s
10 | 
--------------------------------------------------------------------------------
/.github/workflows/tiobe-scan.yaml:
--------------------------------------------------------------------------------
1 | name: Tiobe TiCS Analysis
2 | 
3 | on:
4 |   workflow_dispatch:
5 |   schedule:
6 |     - cron: "0 0 * * 1" # Runs at midnight UTC every Monday
7 | 
8 | jobs:
9 |   tics:
10 |     name: TiCS
11 |     uses: canonical/observability/.github/workflows/charm-tiobe-scan.yaml@v1
12 |     secrets: inherit
13 | 
--------------------------------------------------------------------------------
/.github/.jira_sync_config.yaml:
--------------------------------------------------------------------------------
1 | settings:
2 |   jira_project_key: "OBC"
3 |   status_mapping:
4 |     opened: Untriaged
5 |     closed: done
6 |     not_planned: rejected
7 | 
8 |   components:
9 |     - avalanche
10 | 
11 |   add_gh_comment: false
12 |   sync_description: false
13 |   sync_comments: false
14 | 
15 |   label_mapping:
16 |     "Type: Enhancement": Story
--------------------------------------------------------------------------------
/.github/workflows/update-libs.yaml:
--------------------------------------------------------------------------------
1 | name: Auto-update Charm Libraries
2 | on:
3 |   # Manual trigger
4 |   workflow_dispatch:
5 |   # Check the upstream regularly, every four hours
6 |   schedule:
7 |     - cron: "0 0,4,8,12,16,20 * * *"
8 | 
9 | jobs:
10 |   update-lib:
11 |     name: Check libraries
12 |     uses: canonical/observability/.github/workflows/charm-update-libs.yaml@v1
13 |     secrets: inherit
14 | 
15 | 
--------------------------------------------------------------------------------
/.github/workflows/quality-gates.yaml:
--------------------------------------------------------------------------------
1 | name: Quality Gates
2 | 
3 | on:
4 |   # Manual trigger
5 |   workflow_dispatch:
6 |   # Run the quality checks periodically
7 |   # FIXME: adjust the frequency as needed once we have actual gates in place
8 |   # schedule:
9 |   #   - cron: "0 0 * * Tue"
10 | 
11 | 
12 | jobs:
13 |   quality-gates:
14 |     name: Run quality gates
15 |     uses: canonical/observability/.github/workflows/charm-quality-gates.yaml@v1
16 |     secrets: inherit
17 | 
--------------------------------------------------------------------------------
/src/prometheus_alert_rules/always_firing_numeric.rule:
--------------------------------------------------------------------------------
1 | groups:
2 |   - name: AlwaysFiringDueToNumericValue
3 |     rules:
4 |       - alert: AlwaysFiringDueToNumericValue
5 |         expr: avalanche_metric_mmmmm_0_0{series_id="0"} > -1
6 |         for: 0m
7 |         labels:
8 |           severity: High
9 |         annotations:
10 |           summary: "Instance {{ $labels.instance }} dummy alarm (always firing)"
11 |           description: "{{ $labels.instance }} of job {{ $labels.job }} is firing the dummy alarm."
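# Note: avalanche emits non-negative gauge values (upstream generator default,
# stated here as an assumption), so any scraped sample satisfies `> -1`; with
# `for: 0m` the alert fires immediately, exercising the full alerting pipeline.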
12 | 
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | ## Issue
2 | 
3 | 
4 | 
5 | ## Solution
6 | 
7 | 
8 | 
9 | ## Context
10 | 
11 | 
12 | 
13 | ## Testing Instructions
14 | 
15 | 
16 | 
17 | ## Upgrade Notes
18 | 
--------------------------------------------------------------------------------
/src/prometheus_alert_rules/always_firing_absent.rule:
--------------------------------------------------------------------------------
1 | groups:
2 |   - name: AlwaysFiringDueToAbsentMetric
3 |     rules:
4 |       - alert: AlwaysFiringDueToAbsentMetric
5 |         expr: absent(some_metric_name_that_shouldnt_exist{job="non_existing_job"})
6 |         for: 0m
7 |         labels:
8 |           severity: High
9 |         annotations:
10 |           summary: "Instance {{ $labels.instance }} dummy alarm (always firing)"
11 |           description: "{{ $labels.instance }} of job {{ $labels.job }} is firing the dummy alarm."
12 | 
--------------------------------------------------------------------------------
/.github/workflows/promote.yaml:
--------------------------------------------------------------------------------
1 | name: Promote Charm
2 | 
3 | on:
4 |   workflow_dispatch:
5 |     inputs:
6 |       promotion:
7 |         type: choice
8 |         description: Channel to promote from
9 |         options:
10 |           - edge -> beta
11 |           - beta -> candidate
12 |           - candidate -> stable
13 | 
14 | jobs:
15 |   promote:
16 |     name: Promote
17 |     uses: canonical/observability/.github/workflows/charm-promote.yaml@v1
18 |     with:
19 |       promotion: ${{ github.event.inputs.promotion }}
20 |     secrets: inherit
21 | 
--------------------------------------------------------------------------------
/tests/integration/test_metrics_endpoint.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright 2021 Canonical Ltd.
3 | # See LICENSE file for licensing details.
4 | import jubilant
5 | import pytest
6 | 
7 | 
8 | @pytest.mark.abort_on_fail
9 | async def test_avalanche_is_scraped_by_prometheus(juju: jubilant.Juju, charm, charm_resources):
10 |     """Deploy avalanche together with prometheus and relate them over metrics-endpoint."""
11 |     juju.deploy(charm, "avalanche", resources=charm_resources)
12 |     juju.deploy("prometheus-k8s", "prometheus", channel="2/edge", trust=True)
13 |     juju.integrate("avalanche:metrics-endpoint", "prometheus:metrics-endpoint")
14 |     juju.wait(jubilant.all_active)
15 | 
--------------------------------------------------------------------------------
/tests/integration/test_upgrade_charm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright 2021 Canonical Ltd.
3 | # See LICENSE file for licensing details.
4 | import jubilant
5 | import pytest
6 | 
7 | 
8 | @pytest.mark.abort_on_fail
9 | async def test_upgrade_charm(juju: jubilant.Juju, charm):
10 |     """Deploy avalanche from Charmhub, then refresh to the locally built charm."""
11 |     juju.deploy(
12 |         "avalanche-k8s",
13 |         "avalanche",
14 |         channel="2/edge",
15 |         config={"metric_count": "33", "value_interval": "99999"},
16 |     )
17 |     juju.wait(jubilant.all_active)
18 |     juju.refresh("avalanche", path=charm)
19 |     juju.wait(jubilant.all_active)
20 | 
--------------------------------------------------------------------------------
/tests/integration/test_remote_write.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright 2021 Canonical Ltd.
3 | # See LICENSE file for licensing details.
4 | import jubilant
5 | import pytest
6 | 
7 | 
8 | @pytest.mark.abort_on_fail
9 | async def test_avalanche_remote_writes_to_prometheus(juju: jubilant.Juju, charm, charm_resources):
10 |     """Deploy avalanche together with prometheus and relate them over remote-write."""
11 |     juju.deploy(charm, "avalanche", resources=charm_resources)
12 |     juju.deploy("prometheus-k8s", "prometheus", channel="2/edge", trust=True)
13 |     juju.integrate("avalanche:send-remote-write", "prometheus:receive-remote-write")
14 |     juju.wait(jubilant.all_active)
15 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/enhancement_proposal.yml:
--------------------------------------------------------------------------------
1 | name: Enhancement Proposal
2 | description: File an enhancement proposal
3 | labels: ["Type: Enhancement", "Status: Triage"]
4 | body:
5 |   - type: markdown
6 |     attributes:
7 |       value: >
8 |         Thanks for taking the time to fill out this enhancement proposal! Before submitting your issue, please make
9 |         sure there isn't already a prior issue concerning this. If there is, please join that discussion instead.
10 |   - type: textarea
11 |     id: enhancement-proposal
12 |     attributes:
13 |       label: Enhancement Proposal
14 |       description: >
15 |         Describe the enhancement you would like to see in as much detail as needed.
16 |     validations:
17 |       required: true
18 | 
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | The easiest way to report a security issue is through a [GitHub Private Security Report](https://github.com/canonical/avalanche-k8s-operator/security/advisories/new)
2 | with a description of the issue, the steps you took to create the issue, affected versions, and, if known, mitigations for the issue.
3 | 
4 | Alternatively, to report a security issue via email, please email [security@ubuntu.com](mailto:security@ubuntu.com) with a description of the issue,
5 | the steps you took to create the issue, affected versions, and, if known, mitigations for the issue.
6 | 
7 | The [Ubuntu Security disclosure and embargo policy](https://ubuntu.com/security/disclosure-policy) contains more information about what you can expect
8 | when you contact us and what we expect from you.
9 | 
--------------------------------------------------------------------------------
/tests/integration/helpers.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Canonical Ltd.
2 | # See LICENSE file for licensing details.
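# NOTE: unlike the tests in this package, which drive Juju through jubilant,
# these helpers still take pytest-operator's `ops_test` fixture (python-libjuju).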
3 | 4 | import logging 5 | 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | async def get_unit_address(ops_test, app_name: str, unit_num: int) -> str: 10 | status = await ops_test.model.get_status() # noqa: F821 11 | return status["applications"][app_name]["units"][f"{app_name}/{unit_num}"]["address"] 12 | 13 | 14 | async def get_config_values(ops_test, app_name) -> dict: 15 | """Return the app's config, but filter out keys that do not have a value.""" 16 | config = await ops_test.model.applications[app_name].get_config() 17 | # Need to convert the value to string because set_config only takes strings but get_config 18 | # may return non-strings 19 | # https://github.com/juju/python-libjuju/issues/631 20 | # https://github.com/juju/python-libjuju/issues/630 21 | return {key: str(config[key]["value"]) for key in config if "value" in config[key]} 22 | -------------------------------------------------------------------------------- /tests/unit/test_charm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | import unittest 5 | 6 | from ops.model import ActiveStatus 7 | from ops.testing import Harness 8 | 9 | from charm import AvalancheCharm 10 | 11 | 12 | class TestCharm(unittest.TestCase): 13 | def setUp(self): 14 | self.harness = Harness(AvalancheCharm) 15 | self.addCleanup(self.harness.cleanup) 16 | self.harness.handle_exec('avalanche', ['/bin/avalanche', '--version'], result='0.0') 17 | self.harness.begin_with_initial_hooks() 18 | 19 | def test_services_running(self): 20 | """Check that the supplied service is running and charm is ActiveStatus.""" 21 | service = self.harness.model.unit.get_container( 22 | AvalancheCharm._container_name 23 | ).get_service(AvalancheCharm._service_name) 24 | self.assertTrue(service.is_running()) 25 | self.assertEqual(self.harness.model.unit.status, ActiveStatus()) 26 | -------------------------------------------------------------------------------- /tests/integration/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | import os 4 | from pathlib import Path 5 | from typing import Dict 6 | 7 | import jubilant 8 | import pytest 9 | import sh 10 | import yaml 11 | 12 | 13 | @pytest.fixture(scope="module") 14 | async def charm(): 15 | """Charm used for integration testing.""" 16 | if charm_file := os.environ.get("CHARM_PATH"): 17 | return Path(charm_file) 18 | 19 | charm = sh.charmcraft.pack() # type: ignore 20 | assert charm 21 | return charm 22 | 23 | 24 | @pytest.fixture(scope="module") 25 | def charm_resources(metadata_file="charmcraft.yaml") -> Dict[str, str]: 26 | with open(metadata_file, "r") as file: 27 | metadata = yaml.safe_load(file) 28 | resources = {} 29 | for res, data in metadata["resources"].items(): 30 | resources[res] = data["upstream-source"] 31 | return resources 32 | 33 | 34 | @pytest.fixture(scope="module") 35 | def juju(): 36 | keep_models: bool = os.environ.get("KEEP_MODELS") is not None 37 | with jubilant.temp_model(keep=keep_models) as juju: 38 | yield juju 39 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | 4 | [tox] 5 | skipsdist=True 6 | skip_missing_interpreters = True 7 | envlist = lint, static, unit 8 | 9 | [vars] 10 | src_path = {toxinidir}/src 11 | tst_path = {toxinidir}/tests 12 | all_path = {[vars]src_path} {[vars]tst_path} 13 | uv_flags = --frozen --isolated --extra=dev 14 | 15 | [testenv] 16 | allowlist_externals = uv 17 | basepython = python3 18 | setenv = 19 | PYTHONPATH = {toxinidir}:{toxinidir}/lib:{[vars]src_path} 20 | PYTHONBREAKPOINT=ipdb.set_trace 21 | passenv = 22 | PYTHONPATH 23 | CHARM_PATH 24 | 25 | [testenv:lock] 26 | description = Update uv.lock with the latest deps 27 | commands = 28 | uv lock --upgrade --no-cache 29 | 30 | [testenv:lint] 31 | description = Lint the code 32 | commands = 33 | uv run {[vars]uv_flags} ruff check {[vars]all_path} 34 | 35 | [testenv:static] 36 | description = Run static checks 37 | allowlist_externals = 38 | {[testenv]allowlist_externals} 39 | /usr/bin/env 40 | commands = 41 | uv run {[vars]uv_flags} pyright {[vars]all_path} 42 | 43 | [testenv:fmt] 44 | description = "Format the code" 45 | commands = 46 | uv run {[vars]uv_flags} ruff check --fix-only {[vars]all_path} 47 | 48 | [testenv:unit] 49 | description = Run unit tests 50 | allowlist_externals= 51 | {[testenv]allowlist_externals} 52 | /usr/bin/env 53 | commands = 54 | uv run {[vars]uv_flags} coverage run --source={[vars]src_path} -m pytest \ 55 | {[vars]tst_path}/unit {posargs} 56 | uv run {[vars]uv_flags} coverage report 57 | 58 | [testenv:integration] 59 | description = Run integration tests 60 | commands = 61 | uv run {[vars]uv_flags} pytest --exitfirst {[vars]tst_path}/integration {posargs} 62 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | [project]
4 | name = "avalanche-k8s"
5 | version = "0.0"
6 | requires-python = "~=3.8"
7 | 
8 | dependencies = [
9 |     "ops",
10 |     "PyYAML",
11 |     "kubernetes",
12 |     "jinja2<3",
13 |     "cryptography",
14 |     "pydantic",
15 |     "cosl",
16 | ]
17 | 
18 | [project.optional-dependencies]
19 | dev = [
20 |     # Linting
21 |     "ruff",
22 |     "codespell",
23 |     # Static
24 |     "pyright",
25 |     # Unit
26 |     "pytest",
27 |     "coverage[toml]",
28 |     "ops[testing]",
29 |     "markupsafe==2.0.1",
30 |     # Integration
31 |     "jubilant",
32 |     "sh",
33 |     "pytest-asyncio",
34 | ]
35 | 
36 | # Testing tools configuration
37 | [tool.coverage.run]
38 | branch = true
39 | 
40 | [tool.coverage.report]
41 | show_missing = true
42 | 
43 | # Linting tools configuration
44 | [tool.ruff]
45 | line-length = 99
46 | extend-exclude = ["__pycache__", "*.egg_info"]
47 | lint.select = ["E", "W", "F", "C", "N", "R", "D", "I001"]
48 | # Ignore E501: line length is handled by the formatter
49 | # Ignore D107: missing docstring in __init__
50 | lint.ignore = ["E501", "D107", "N818", "RET504"]
51 | # D100, D101, D102, D103: Ignore missing docstrings in tests
52 | lint.per-file-ignores = {"tests/*" = ["D100","D101","D102","D103"]}
53 | 
54 | [tool.ruff.lint.pydocstyle]
55 | convention = "google"
56 | 
57 | # Static analysis tools configuration
58 | [tool.pyright]
59 | extraPaths = ["lib"]
60 | pythonVersion = "3.8"
61 | pythonPlatform = "Linux"
62 | 
63 | [tool.pytest.ini_options]
64 | minversion = "6.0"
65 | log_cli_level = "INFO"
66 | asyncio_mode = "auto"
67 | addopts = "--tb=native --verbose --capture=no --log-cli-level=INFO"
68 | 
69 | [tool.codespell]
70 | skip = ".git,.tox,build,venv*"
71 | ignore-words-list = "assertIn"
72 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Avalanche Operator (k8s)
2 | 
3 | [![Charmhub Badge](https://charmhub.io/avalanche-k8s/badge.svg)](https://charmhub.io/avalanche-k8s)
4 | [![Release](https://github.com/canonical/avalanche-k8s-operator/actions/workflows/release.yaml/badge.svg)](https://github.com/canonical/avalanche-k8s-operator/actions/workflows/release.yaml)
5 | [![Discourse Status](https://img.shields.io/discourse/status?server=https%3A%2F%2Fdiscourse.charmhub.io&style=flat&label=CharmHub%20Discourse)](https://discourse.charmhub.io)
6 | 
7 | ## Description
8 | 
9 | [Avalanche][Avalanche source] is an [OpenMetrics][OpenMetrics source] endpoint
10 | load tester.
11 | 
12 | ## Usage
13 | 
14 | To use Avalanche, relate it to a charm that `requires` the
15 | `prometheus_scrape` relation interface, such as Prometheus.
16 | 
17 | For more information see [INTEGRATING](INTEGRATING.md).
18 | 
19 | You also need a working Kubernetes environment and a bootstrapped
20 | Juju controller of version 2.9+, with a model ready to use with the Kubernetes
21 | cloud.
22 | 
23 | Example deployment:
24 | 
25 | ```shell
26 | juju deploy avalanche-k8s
27 | ```
28 | 
29 | Then relate it to [prometheus][Prometheus operator]:
30 | ```shell
31 | juju deploy prometheus-k8s
32 | juju relate prometheus-k8s avalanche-k8s
33 | ```
34 | 
35 | ### Scale Out Usage
36 | To add additional Avalanche units (increasing the generated load),
37 | 
38 | ```shell
39 | juju add-unit avalanche-k8s
40 | ```
41 | 
42 | ## Relations
43 | Currently, the supported relations are:
44 | - `metrics-endpoint`, for being scraped by [prometheus][Prometheus operator]; `send-remote-write`, for pushing samples to it; and `grafana-dashboard`, for supplying a dashboard (see [charmcraft.yaml](charmcraft.yaml)).
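To sanity-check a deployed unit, you can fetch the exposition endpoint directly. A minimal sketch, assuming avalanche's upstream default port (9001) and a unit address taken from `juju status`:

```python
import urllib.request

UNIT_ADDRESS = "10.1.2.3"  # replace with the unit address from `juju status`

with urllib.request.urlopen(f"http://{UNIT_ADDRESS}:9001/metrics", timeout=10) as resp:
    exposition = resp.read().decode()

# Count how many avalanche series are currently being served
print(sum(1 for line in exposition.splitlines() if line.startswith("avalanche_metric_")))
```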
45 | 
46 | ## OCI Images
47 | This charm can be used with the following image:
48 | - `quay.io/freshtracks.io/avalanche`
49 | 
50 | 
51 | [Avalanche source]: https://github.com/open-fresh/avalanche
52 | [OpenMetrics source]: https://github.com/OpenObservability/OpenMetrics
53 | [Prometheus operator]: https://charmhub.io/prometheus-k8s
54 | 
--------------------------------------------------------------------------------
/RELEASE.md:
--------------------------------------------------------------------------------
1 | # Release Process
2 | 
3 | ## Overview
4 | 
5 | At any given time there are three revisions of the charm available on [CharmHub.io](https://charmhub.io/), for each of the following channels:
6 | 
7 | 1. `latest/stable` is a well tested production ready version of the Charm.
8 | 2. `latest/candidate` is a feature ready next version of the stable release, currently in testing.
9 | 3. `latest/edge` is the bleeding edge developer version of the charm. While we really try not to, it may break and introduce regressions.
10 | 
11 | Currently, the Avalanche charm does not make use of the `latest/beta` channel.
12 | For more information about CharmHub channels, refer to the [Juju charm store](https://discourse.charmhub.io/t/the-juju-charm-store) documentation.
13 | 
14 | ## When to create which revisions
15 | 
16 | * **Stable revisions** are done in consultation with the product manager and engineering manager, when the `candidate` revision has been well tested and is deemed ready for production.
17 | * **Candidate revisions** are done when the charm reaches a state of feature completion with respect to the next planned `stable` release.
18 | * **Edge revisions** are released at the developer's discretion, potentially every time something is merged into `main` and the unit tests pass.
19 | 
20 | ## How to publish revisions
21 | 
22 | Refer to the [Publish your operator in Charmhub](https://discourse.charmhub.io/t/publish-your-operator-in-charmhub) documentation.
23 | After a `latest/stable` release, it is expected that the version of the charm is the same as the one in `latest/candidate`, and those two channels will diverge again when we are ramping up through `latest/candidate` releases for a new `latest/stable` release.
24 | 
25 | ## A note on granularity of revisions
26 | 
27 | We believe in shipping often and with confidence.
28 | It is perfectly acceptable to have a new `latest/stable` release containing just one bug fix or a small new feature with respect to the last one.
29 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yml:
--------------------------------------------------------------------------------
1 | name: Bug Report
2 | description: File a bug report
3 | labels: ["Type: Bug", "Status: Triage"]
4 | body:
5 |   - type: markdown
6 |     attributes:
7 |       value: >
8 |         Thanks for taking the time to fill out this bug report! Before submitting your issue, please make
9 |         sure you are using the latest version of the charm. If not, please try upgrading to the latest edge release prior to
10 |         posting your report to make sure it's not already solved.
11 |   - type: textarea
12 |     id: bug-description
13 |     attributes:
14 |       label: Bug Description
15 |       description: >
16 |         Describe the bug. If applicable, add screenshots to
17 |         help explain the problem you are facing.
18 | validations: 19 | required: true 20 | - type: textarea 21 | id: reproduction 22 | attributes: 23 | label: To Reproduce 24 | description: > 25 | Please provide the output of `juju export-bundle` and step-by-step instructions for how to reproduce the behavior. 26 | A deployment diagram could be handy too. See https://discourse.charmhub.io/t/9269 for examples. 27 | placeholder: | 28 | 1. `juju deploy ...` 29 | 2. `juju relate ...` 30 | 3. `juju status --relations` 31 | validations: 32 | required: true 33 | - type: textarea 34 | id: environment 35 | attributes: 36 | label: Environment 37 | description: > 38 | We need to know a bit more about the context in which you run the charm. 39 | - Are you running Juju locally, on lxd, in multipass or on some other platform? 40 | - What track and channel you deployed the charm from (ie. `latest/edge` or similar). 41 | - Version of any applicable components, like the juju snap, the model controller, lxd, microk8s, and/or multipass. 42 | validations: 43 | required: true 44 | - type: textarea 45 | id: logs 46 | attributes: 47 | label: Relevant log output 48 | description: > 49 | Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 50 | Fetch the logs using `juju debug-log --replay` and `kubectl logs ...`. Additional details available in the juju docs 51 | at https://juju.is/docs/olm/juju-logs 52 | render: shell 53 | validations: 54 | required: true 55 | - type: textarea 56 | id: additional-context 57 | attributes: 58 | label: Additional context 59 | -------------------------------------------------------------------------------- /tests/unit/test_disable_alerts.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
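# These tests exercise the `forward_alert_rules` config option using the
# state-transition API from ops[testing]: build an input State, run a single
# event through a Context, and assert on the output relation databag.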
3 | 4 | 5 | import pytest 6 | from ops.testing import Container, Context, Exec, Relation, State 7 | 8 | import charm 9 | 10 | 11 | @pytest.fixture(scope="function") 12 | def avalanche_container(): 13 | return Container( 14 | "avalanche", 15 | can_connect=True, 16 | execs={Exec(["/bin/avalanche", "--version"], return_code=0, stdout="0.0")}, 17 | ) 18 | 19 | 20 | @pytest.mark.parametrize("forwarding", (True, False)) 21 | def test_forward_alert_rules_scrape(forwarding, avalanche_container): 22 | # GIVEN these relations 23 | prometheus_relation = Relation("send-remote-write", remote_app_name="prometheus") 24 | state = State( 25 | leader=True, 26 | containers={avalanche_container}, 27 | relations=[ 28 | prometheus_relation, 29 | ], 30 | config={"forward_alert_rules": forwarding}, 31 | ) 32 | # WHEN the charm receives a config-changed event 33 | ctx = Context( 34 | charm_type=charm.AvalancheCharm, 35 | ) 36 | with ctx(ctx.on.config_changed(), state) as mgr: 37 | output_state = mgr.run() 38 | # THEN the charm is forwarding the alerts 39 | prometheus_relation_out = output_state.get_relation(prometheus_relation.id) 40 | if forwarding: 41 | assert prometheus_relation_out.local_app_data["alert_rules"] != "{}" 42 | else: 43 | assert prometheus_relation_out.local_app_data["alert_rules"] == "{}" 44 | 45 | 46 | @pytest.mark.parametrize("forwarding", (True, False)) 47 | def test_forward_alert_rules(forwarding, avalanche_container): 48 | # GIVEN these relations 49 | prometheus_relation = Relation("send-remote-write", remote_app_name="prometheus") 50 | state = State( 51 | leader=True, 52 | containers={avalanche_container}, 53 | relations=[ 54 | prometheus_relation, 55 | ], 56 | config={"forward_alert_rules": forwarding}, 57 | ) 58 | # WHEN the charm receives a config-changed event 59 | ctx = Context( 60 | charm_type=charm.AvalancheCharm, 61 | ) 62 | with ctx(ctx.on.config_changed(), state) as mgr: 63 | output_state = mgr.run() 64 | # THEN the charm is forwarding the alerts 65 | prometheus_relation_out = output_state.get_relation(prometheus_relation.id) 66 | if forwarding: 67 | assert prometheus_relation_out.local_app_data["alert_rules"] != "{}" 68 | else: 69 | assert prometheus_relation_out.local_app_data["alert_rules"] == "{}" 70 | -------------------------------------------------------------------------------- /charmcraft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | name: avalanche-k8s 4 | type: charm 5 | summary: Load tester for openmetrics endpoints. 6 | description: Avalanche is a prometheus load tester. 7 | 8 | links: 9 | website: https://charmhub.io/avalanche-k8s 10 | source: https://github.com/canonical/avalanche-k8s-operator 11 | issues: https://github.com/canonical/avalanche-k8s-operator/issues 12 | 13 | assumes: 14 | - k8s-api 15 | 16 | platforms: 17 | ubuntu@24.04:amd64: 18 | 19 | parts: 20 | charm: 21 | source: . 
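    # The uv plugin builds the charm from pyproject.toml and uv.lock; the
    # astral-uv build snap provides uv itself, and git is required for the
    # `git describe` call in override-build below.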
22 | plugin: uv 23 | build-packages: [git] 24 | build-snaps: [astral-uv] 25 | override-build: | 26 | craftctl default 27 | git describe --always > $CRAFT_PART_INSTALL/version 28 | 29 | containers: 30 | avalanche: 31 | resource: avalanche-image 32 | 33 | resources: 34 | avalanche-image: 35 | type: oci-image 36 | description: OCI image for avalanche 37 | upstream-source: quay.io/freshtracks.io/avalanche 38 | 39 | provides: 40 | metrics-endpoint: 41 | interface: prometheus_scrape 42 | grafana-dashboard: 43 | interface: grafana_dashboard 44 | 45 | requires: 46 | send-remote-write: 47 | interface: prometheus_remote_write 48 | 49 | peers: 50 | replicas: 51 | interface: avalanche_replica 52 | 53 | config: 54 | options: 55 | metric_count: 56 | type: int 57 | description: Number of metrics to serve. 58 | default: 500 59 | label_count: 60 | type: int 61 | description: Number of labels per-metric. 62 | default: 10 63 | series_count: 64 | type: int 65 | description: Number of series per-metric. 66 | default: 10 67 | metricname_length: 68 | type: int 69 | description: Modify length of metric names. 70 | default: 5 71 | labelname_length: 72 | type: int 73 | description: Modify length of label names. 74 | default: 5 75 | value_interval: 76 | type: int 77 | description: Change series values every {interval} seconds. 78 | default: 30 79 | series_interval: 80 | type: int 81 | description: > 82 | Change series_id label values every {interval} seconds. 83 | Avalanche's CLI default value is 60, but this is too low and quickly overloads the scraper. 84 | Using 3600000 (10k hours ~ 1 year) in lieu of "inf" (never refresh). 85 | default: 36000000 86 | metric_interval: 87 | type: int 88 | description: > 89 | Change __name__ label values every {interval} seconds. 90 | Avalanche's CLI default value is 120, but this is too low and quickly overloads the scraper. 91 | Using 3600000 (10k hours ~ 1 year) in lieu of "inf" (never refresh). 92 | default: 36000000 93 | forward_alert_rules: 94 | type: boolean 95 | description: > 96 | Toggle forwarding of alert rules. 97 | default: True 98 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to avalanche-operator 2 | 3 | ![GitHub License](https://img.shields.io/github/license/canonical/avalanche-k8s-operator) 4 | ![GitHub Commit Activity](https://img.shields.io/github/commit-activity/y/canonical/avalanche-k8s-operator) 5 | ![GitHub Lines of Code](https://img.shields.io/tokei/lines/github/canonical/avalanche-k8s-operator) 6 | ![GitHub Issues](https://img.shields.io/github/issues/canonical/avalanche-k8s-operator) 7 | ![GitHub PRs](https://img.shields.io/github/issues-pr/canonical/avalanche-k8s-operator) 8 | ![GitHub Contributors](https://img.shields.io/github/contributors/canonical/avalanche-k8s-operator) 9 | ![GitHub Watchers](https://img.shields.io/github/watchers/canonical/avalanche-k8s-operator?style=social) 10 | 11 | The intended use case of this operator is to be deployed together with 12 | prometheus-operator. 13 | 14 | ## Bugs and pull requests 15 | - Generally, before developing enhancements to this charm, you should consider 16 | opening an issue explaining your use case. 
17 | - If you would like to chat with us about your use-cases or proposed
18 |   implementation, you can reach us at
19 |   [Canonical Mattermost public channel](https://chat.charmhub.io/charmhub/channels/charm-dev)
20 |   or [Discourse](https://discourse.charmhub.io/).
21 | - All enhancements require review before being merged. Apart from
22 |   code quality and test coverage, the review will also take into
23 |   account the resulting user experience for Juju administrators using
24 |   this charm.
25 | 
26 | 
27 | ## Setup
28 | 
29 | A typical setup using [snaps](https://snapcraft.io/) can be found in the
30 | [Juju docs](https://juju.is/docs/sdk/dev-setup).
31 | 
32 | ## Developing
33 | 
34 | Dependencies are managed with [uv](https://github.com/astral-sh/uv). Create
35 | and activate a development virtualenv with
36 | 
37 | ```shell
38 | uv sync --extra dev
39 | source .venv/bin/activate
40 | ```
41 | 
42 | Tests are driven through tox, so install it if you do not have it already
43 | 
44 | ```shell
45 | uv tool install tox
46 | ```
47 | 
48 | Later on, refresh the locked dependencies as needed
49 | 
50 | ```shell
51 | tox -e lock
52 | ```
53 | 
54 | ### Testing
55 | 
56 | ```shell
57 | tox -e fmt # update your code according to linting rules
58 | tox -e lint # code style
59 | tox -e static # static analysis
60 | tox -e unit # unit tests
61 | ```
62 | 
63 | tox creates a virtual environment for every tox environment defined in
64 | [tox.ini](tox.ini). To activate a tox environment for manual testing,
65 | 
66 | ```shell
67 | source .tox/unit/bin/activate
68 | ```
69 | 
70 | ## Build charm
71 | 
72 | Build the charm in this git repository using
73 | 
74 | ```shell
75 | charmcraft pack
76 | ```
77 | 
78 | ## Usage
79 | ### Tested images
80 | - [quay.io/freshtracks.io/avalanche](https://quay.io/freshtracks.io/avalanche)
81 | 
82 | ### Deploy Avalanche
83 | 
84 | ```shell
85 | juju deploy ./avalanche-k8s_*.charm \
86 |     --resource avalanche-image=quay.io/freshtracks.io/avalanche
87 | ```
88 | 
89 | ## Code overview
90 | - The main charm class is `AvalancheCharm`, which responds to config changes
91 |   (via `ConfigChangedEvent`) and application upgrades (via
92 |   `UpgradeCharmEvent`).
93 | - All lifecycle events call a common hook, `_common_exit_hook`, after executing
94 |   their own business logic. This pattern simplifies state tracking and improves
95 |   consistency (see the sketch below).
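A minimal sketch of that pattern (handler bodies elided; the real implementation lives in `src/charm.py`):

```python
from ops.charm import CharmBase


class AvalancheCharm(CharmBase):
    def __init__(self, *args):
        super().__init__(*args)
        self.framework.observe(self.on.config_changed, self._on_config_changed)
        self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm)

    def _on_config_changed(self, event):
        # ... event-specific logic ...
        self._common_exit_hook()

    def _on_upgrade_charm(self, event):
        # ... event-specific logic ...
        self._common_exit_hook()

    def _common_exit_hook(self) -> None:
        """Re-derive the desired state and update the workload and unit status in one place."""
        ...
```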
96 | 97 | ## Design choices 98 | NTA 99 | 100 | ## Roadmap 101 | TBD 102 | -------------------------------------------------------------------------------- /src/grafana_dashboards/avalanche.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "grafana", 8 | "uid": "-- Grafana --" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "type": "dashboard" 15 | } 16 | ] 17 | }, 18 | "editable": true, 19 | "fiscalYearStartMonth": 0, 20 | "graphTooltip": 0, 21 | "id": 22, 22 | "links": [], 23 | "panels": [ 24 | { 25 | "datasource": { 26 | "type": "prometheus", 27 | "uid": "PB84469DE42D2E8C3" 28 | }, 29 | "fieldConfig": { 30 | "defaults": { 31 | "color": { 32 | "mode": "palette-classic" 33 | }, 34 | "custom": { 35 | "axisBorderShow": false, 36 | "axisCenteredZero": false, 37 | "axisColorMode": "text", 38 | "axisLabel": "", 39 | "axisPlacement": "auto", 40 | "barAlignment": 0, 41 | "barWidthFactor": 0.6, 42 | "drawStyle": "line", 43 | "fillOpacity": 0, 44 | "gradientMode": "none", 45 | "hideFrom": { 46 | "legend": false, 47 | "tooltip": false, 48 | "viz": false 49 | }, 50 | "insertNulls": false, 51 | "lineInterpolation": "linear", 52 | "lineWidth": 1, 53 | "pointSize": 5, 54 | "scaleDistribution": { 55 | "type": "linear" 56 | }, 57 | "showPoints": "auto", 58 | "spanNulls": false, 59 | "stacking": { 60 | "group": "A", 61 | "mode": "none" 62 | }, 63 | "thresholdsStyle": { 64 | "mode": "off" 65 | } 66 | }, 67 | "mappings": [], 68 | "thresholds": { 69 | "mode": "absolute", 70 | "steps": [ 71 | { 72 | "color": "green" 73 | }, 74 | { 75 | "color": "red", 76 | "value": 80 77 | } 78 | ] 79 | } 80 | }, 81 | "overrides": [] 82 | }, 83 | "gridPos": { 84 | "h": 8, 85 | "w": 12, 86 | "x": 0, 87 | "y": 0 88 | }, 89 | "id": 1, 90 | "options": { 91 | "legend": { 92 | "calcs": [], 93 | "displayMode": "list", 94 | "placement": "bottom", 95 | "showLegend": true 96 | }, 97 | "tooltip": { 98 | "hideZeros": false, 99 | "mode": "single", 100 | "sort": "none" 101 | } 102 | }, 103 | "pluginVersion": "12.0.2", 104 | "targets": [ 105 | { 106 | "datasource": { 107 | "type": "prometheus", 108 | "uid": "PB84469DE42D2E8C3" 109 | }, 110 | "editorMode": "code", 111 | "expr": "avalanche_metric_mmmmm_0_0", 112 | "instant": false, 113 | "legendFormat": "__auto", 114 | "range": true, 115 | "refId": "A" 116 | } 117 | ], 118 | "title": "New panel", 119 | "type": "timeseries" 120 | } 121 | ], 122 | "preload": false, 123 | "schemaVersion": 41, 124 | "tags": [], 125 | "templating": { 126 | "list": [] 127 | }, 128 | "time": { 129 | "from": "now-15m", 130 | "to": "now" 131 | }, 132 | "timepicker": {}, 133 | "timezone": "browser", 134 | "title": "Avalanche", 135 | "uid": "", 136 | "version": 1 137 | } 138 | -------------------------------------------------------------------------------- /src/kubernetes_service.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 
5 | """Library for kubernetes services."""
6 | 
7 | from typing import List, Tuple
8 | 
9 | from kubernetes import client, config
10 | from kubernetes.client import exceptions
11 | 
12 | 
13 | class PatchFailed(RuntimeError):
14 |     """Patching the kubernetes service failed."""
15 | 
16 | 
17 | class K8sServicePatch:
18 |     """A utility for patching the Kubernetes service set up by Juju.
19 | 
20 |     Attributes:
21 |         namespace_file (str): path to the k8s namespace file in the charm container
22 |     """
23 | 
24 |     namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
25 | 
26 |     @staticmethod
27 |     def namespace() -> str:
28 |         """Read the Kubernetes namespace we're deployed in from the mounted service token.
29 | 
30 |         Returns:
31 |             str: The current Kubernetes namespace
32 |         """
33 |         with open(K8sServicePatch.namespace_file, "r") as f:
34 |             return f.read().strip()
35 | 
36 |     @staticmethod
37 |     def _k8s_auth():
38 |         """Authenticate with the Kubernetes API using an in-cluster service token.
39 | 
40 |         Raises:
41 |             PatchFailed: if no permissions to read cluster role
42 |         """
43 |         # Authenticate against the Kubernetes API using a mounted ServiceAccount token
44 |         config.load_incluster_config()
45 |         # Test the service account we've got for sufficient perms
46 |         api = client.CoreV1Api(client.ApiClient())
47 | 
48 |         try:
49 |             api.list_namespaced_service(namespace=K8sServicePatch.namespace())
50 |         except exceptions.ApiException as e:
51 |             if e.status == 403:
52 |                 raise PatchFailed(
53 |                     "No permission to read cluster role. " "Run `juju trust` on this application."
54 |                 ) from e
55 |             raise e
56 | 
57 |     @staticmethod
58 |     def _k8s_service(app: str, service_ports: List[Tuple[str, int, int]]) -> client.V1Service:
59 |         """Return a valid Kubernetes Service representation for the given application.
60 | 
61 |         Args:
62 |             app: app name
63 |             service_ports: a list of tuples (name, port, target_port) for every service port.
64 | 
65 |         Returns:
66 |             kubernetes.client.V1Service: A Kubernetes Service with correctly annotated metadata and
67 |             ports
68 |         """
69 |         ports = [
70 |             client.V1ServicePort(name=port[0], port=port[1], target_port=port[2])
71 |             for port in service_ports
72 |         ]
73 | 
74 |         ns = K8sServicePatch.namespace()
75 |         return client.V1Service(
76 |             api_version="v1",
77 |             metadata=client.V1ObjectMeta(
78 |                 namespace=ns,
79 |                 name=app,
80 |                 labels={"app.kubernetes.io/name": app},
81 |             ),
82 |             spec=client.V1ServiceSpec(
83 |                 ports=ports,
84 |                 selector={"app.kubernetes.io/name": app},
85 |             ),
86 |         )
87 | 
88 |     @staticmethod
89 |     def set_ports(app: str, service_ports: List[Tuple[str, int, int]]):
90 |         """Patch the Kubernetes service created by Juju to map the correct port.
91 | 
92 |         Currently, Juju uses port 65535 for all endpoints. This can be observed via:
93 | 
94 |             kubectl describe services -n <namespace> | grep Port -C 2
95 | 
96 |         At runtime, Pebble knows which ports the workload binds, but it does not tell Juju
97 |         to update the Kubernetes Service definition, so this method patches the gap.
98 | 
99 |         Typical usage example from within charm code (e.g. on_install):
100 | 
101 |             service_ports = [("my-app-api", 9093, 9093), ("my-app-ha", 9094, 9094)]
102 |             K8sServicePatch.set_ports(self.app.name, service_ports)
103 | 
104 |         Args:
105 |             app: app name
106 |             service_ports: a list of tuples (name, port, target_port) for every service port.
107 | 
108 |         Raises:
109 |             PatchFailed: if patching fails.
110 | """ 111 | # First ensure we're authenticated with the Kubernetes API 112 | K8sServicePatch._k8s_auth() 113 | 114 | ns = K8sServicePatch.namespace() 115 | # Set up a Kubernetes client 116 | api = client.CoreV1Api(client.ApiClient()) 117 | try: 118 | # Delete the existing service so we can redefine with correct ports 119 | # I don't think you can issue a patch that *replaces* the existing ports, 120 | # only append 121 | api.delete_namespaced_service(name=app, namespace=ns) 122 | # Recreate the service with the correct ports for the application 123 | api.create_namespaced_service( 124 | namespace=ns, body=K8sServicePatch._k8s_service(app, service_ports) 125 | ) 126 | except exceptions.ApiException as e: 127 | raise PatchFailed(f"Failed to patch k8s service: {e}") from e 128 | -------------------------------------------------------------------------------- /lib/charms/observability_libs/v0/juju_topology.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | """## Overview. 4 | 5 | This document explains how to use the `JujuTopology` class to 6 | create and consume topology information from Juju in a consistent manner. 7 | 8 | The goal of the Juju topology is to uniquely identify a piece 9 | of software running across any of your Juju-managed deployments. 10 | This is achieved by combining the following four elements: 11 | 12 | - Model name 13 | - Model UUID 14 | - Application name 15 | - Unit identifier 16 | 17 | 18 | For a more in-depth description of the concept, as well as a 19 | walk-through of it's use-case in observability, see 20 | [this blog post](https://juju.is/blog/model-driven-observability-part-2-juju-topology-metrics) 21 | on the Juju blog. 22 | 23 | ## Library Usage 24 | 25 | This library may be used to create and consume `JujuTopology` objects. 26 | The `JujuTopology` class provides three ways to create instances: 27 | 28 | ### Using the `from_charm` method 29 | 30 | Enables instantiation by supplying the charm as an argument. When 31 | creating topology objects for the current charm, this is the recommended 32 | approach. 33 | 34 | ```python 35 | topology = JujuTopology.from_charm(self) 36 | ``` 37 | 38 | ### Using the `from_dict` method 39 | 40 | Allows for instantion using a dictionary of relation data, like the 41 | `scrape_metadata` from Prometheus or the labels of an alert rule. When 42 | creating topology objects for remote charms, this is the recommended 43 | approach. 44 | 45 | ```python 46 | scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) 47 | topology = JujuTopology.from_dict(scrape_metadata) 48 | ``` 49 | 50 | ### Using the class constructor 51 | 52 | Enables instantiation using whatever values you want. While this 53 | is useful in some very specific cases, this is almost certainly not 54 | what you are looking for as setting these values manually may 55 | result in observability metrics which do not uniquely identify a 56 | charm in order to provide accurate usage reporting, alerting, 57 | horizontal scaling, or other use cases. 
58 | 59 | ```python 60 | topology = JujuTopology( 61 | model="some-juju-model", 62 | model_uuid="00000000-0000-0000-0000-000000000001", 63 | application="fancy-juju-application", 64 | unit="fancy-juju-application/0", 65 | charm_name="fancy-juju-application-k8s", 66 | ) 67 | ``` 68 | 69 | """ 70 | 71 | import warnings 72 | from collections import OrderedDict 73 | from typing import Dict, List, Optional 74 | from uuid import UUID 75 | 76 | # The unique Charmhub library identifier, never change it 77 | LIBID = "bced1658f20f49d28b88f61f83c2d232" 78 | 79 | LIBAPI = 0 80 | LIBPATCH = 7 81 | 82 | 83 | class InvalidUUIDError(Exception): 84 | """Invalid UUID was provided.""" 85 | 86 | def __init__(self, uuid: str): 87 | self.message = "'{}' is not a valid UUID.".format(uuid) 88 | super().__init__(self.message) 89 | 90 | 91 | class JujuTopology: 92 | """JujuTopology is used for storing, generating and formatting juju topology information. 93 | 94 | DEPRECATED: This class is deprecated. Use `pip install cosl` and 95 | `from cosl.juju_topology import JujuTopology` instead. 96 | """ 97 | 98 | def __init__( 99 | self, 100 | model: str, 101 | model_uuid: str, 102 | application: str, 103 | unit: Optional[str] = None, 104 | charm_name: Optional[str] = None, 105 | ): 106 | """Build a JujuTopology object. 107 | 108 | A `JujuTopology` object is used for storing and transforming 109 | Juju topology information. This information is used to 110 | annotate Prometheus scrape jobs and alert rules. Such 111 | annotation when applied to scrape jobs helps in identifying 112 | the source of the scrapped metrics. On the other hand when 113 | applied to alert rules topology information ensures that 114 | evaluation of alert expressions is restricted to the source 115 | (charm) from which the alert rules were obtained. 116 | 117 | Args: 118 | model: a string name of the Juju model 119 | model_uuid: a globally unique string identifier for the Juju model 120 | application: an application name as a string 121 | unit: a unit name as a string 122 | charm_name: name of charm as a string 123 | """ 124 | warnings.warn( 125 | """ 126 | observability_libs.v0.juju_topology is deprecated. Please import the 127 | library from `cosl` instead: https://github.com/canonical/cos-lib 128 | """, 129 | DeprecationWarning, 130 | ) 131 | if not self.is_valid_uuid(model_uuid): 132 | raise InvalidUUIDError(model_uuid) 133 | 134 | self._model = model 135 | self._model_uuid = model_uuid 136 | self._application = application 137 | self._charm_name = charm_name 138 | self._unit = unit 139 | 140 | def is_valid_uuid(self, uuid): 141 | """Validate the supplied UUID against the Juju Model UUID pattern. 142 | 143 | Args: 144 | uuid: string that needs to be checked if it is valid v4 UUID. 145 | 146 | Returns: 147 | True if parameter is a valid v4 UUID, False otherwise. 148 | """ 149 | try: 150 | return str(UUID(uuid, version=4)) == uuid 151 | except (ValueError, TypeError): 152 | return False 153 | 154 | @classmethod 155 | def from_charm(cls, charm): 156 | """Creates a JujuTopology instance by using the model data available on a charm object. 157 | 158 | Args: 159 | charm: a `CharmBase` object for which the `JujuTopology` will be constructed 160 | Returns: 161 | a `JujuTopology` object. 
162 | """ 163 | return cls( 164 | model=charm.model.name, 165 | model_uuid=charm.model.uuid, 166 | application=charm.model.app.name, 167 | unit=charm.model.unit.name, 168 | charm_name=charm.meta.name, 169 | ) 170 | 171 | @classmethod 172 | def from_dict(cls, data: dict): 173 | """Factory method for creating `JujuTopology` children from a dictionary. 174 | 175 | Args: 176 | data: a dictionary with five keys providing topology information. The keys are 177 | - "model" 178 | - "model_uuid" 179 | - "application" 180 | - "unit" 181 | - "charm_name" 182 | `unit` and `charm_name` may be empty, but will result in more limited 183 | labels. However, this allows us to support charms without workloads. 184 | 185 | Returns: 186 | a `JujuTopology` object. 187 | """ 188 | return cls( 189 | model=data["model"], 190 | model_uuid=data["model_uuid"], 191 | application=data["application"], 192 | unit=data.get("unit", ""), 193 | charm_name=data.get("charm_name", ""), 194 | ) 195 | 196 | def as_dict( 197 | self, 198 | *, 199 | remapped_keys: Optional[Dict[str, str]] = None, 200 | excluded_keys: Optional[List[str]] = None, 201 | ) -> OrderedDict: 202 | """Format the topology information into an ordered dict. 203 | 204 | Keeping the dictionary ordered is important to be able to 205 | compare dicts without having to resort to deep comparisons. 206 | 207 | Args: 208 | remapped_keys: A dictionary mapping old key names to new key names, 209 | which will be substituted when invoked. 210 | excluded_keys: A list of key names to exclude from the returned dict. 211 | uuid_length: The length to crop the UUID to. 212 | """ 213 | ret = OrderedDict( 214 | [ 215 | ("model", self.model), 216 | ("model_uuid", self.model_uuid), 217 | ("application", self.application), 218 | ("unit", self.unit), 219 | ("charm_name", self.charm_name), 220 | ] 221 | ) 222 | if excluded_keys: 223 | ret = OrderedDict({k: v for k, v in ret.items() if k not in excluded_keys}) 224 | 225 | if remapped_keys: 226 | ret = OrderedDict( 227 | (remapped_keys.get(k), v) if remapped_keys.get(k) else (k, v) 228 | for k, v in ret.items() # type: ignore 229 | ) 230 | 231 | return ret 232 | 233 | @property 234 | def identifier(self) -> str: 235 | """Format the topology information into a terse string. 236 | 237 | This crops the model UUID, making it unsuitable for comparisons against 238 | anything but other identifiers. Mainly to be used as a display name or file 239 | name where long strings might become an issue. 240 | 241 | >>> JujuTopology( \ 242 | model = "a-model", \ 243 | model_uuid = "00000000-0000-4000-8000-000000000000", \ 244 | application = "some-app", \ 245 | unit = "some-app/1" \ 246 | ).identifier 247 | 'a-model_00000000_some-app' 248 | """ 249 | parts = self.as_dict( 250 | excluded_keys=["unit", "charm_name"], 251 | ) 252 | 253 | parts["model_uuid"] = self.model_uuid_short 254 | values = parts.values() 255 | 256 | return "_".join([str(val) for val in values]).replace("/", "_") 257 | 258 | @property 259 | def label_matcher_dict(self) -> Dict[str, str]: 260 | """Format the topology information into a dict with keys having 'juju_' as prefix. 261 | 262 | Relabelled topology never includes the unit as it would then only match 263 | the leader unit (ie. the unit that produced the dict). 
264 | """ 265 | items = self.as_dict( 266 | remapped_keys={"charm_name": "charm"}, 267 | excluded_keys=["unit"], 268 | ).items() 269 | 270 | return {"juju_{}".format(key): value for key, value in items if value} 271 | 272 | @property 273 | def label_matchers(self) -> str: 274 | """Format the topology information into a promql/logql label matcher string. 275 | 276 | Topology label matchers should never include the unit as it 277 | would then only match the leader unit (ie. the unit that 278 | produced the matchers). 279 | """ 280 | items = self.label_matcher_dict.items() 281 | return ", ".join(['{}="{}"'.format(key, value) for key, value in items if value]) 282 | 283 | @property 284 | def model(self) -> str: 285 | """Getter for the juju model value.""" 286 | return self._model 287 | 288 | @property 289 | def model_uuid(self) -> str: 290 | """Getter for the juju model uuid value.""" 291 | return self._model_uuid 292 | 293 | @property 294 | def model_uuid_short(self) -> str: 295 | """Getter for the juju model value, truncated to the first eight letters.""" 296 | return self._model_uuid[:8] 297 | 298 | @property 299 | def application(self) -> str: 300 | """Getter for the juju application value.""" 301 | return self._application 302 | 303 | @property 304 | def charm_name(self) -> Optional[str]: 305 | """Getter for the juju charm name value.""" 306 | return self._charm_name 307 | 308 | @property 309 | def unit(self) -> Optional[str]: 310 | """Getter for the juju unit value.""" 311 | return self._unit 312 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /src/charm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | """Deploy Avalanche to a Kubernetes environment.""" 6 | 7 | import hashlib 8 | import logging 9 | import socket 10 | from typing import Optional, cast 11 | 12 | from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider 13 | from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider 14 | from charms.prometheus_k8s.v1.prometheus_remote_write import ( 15 | PrometheusRemoteWriteConsumer, 16 | ) 17 | from ops import main 18 | from ops.charm import CharmBase 19 | from ops.framework import StoredState 20 | from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus 21 | from ops.pebble import Layer 22 | 23 | from kubernetes_service import K8sServicePatch, PatchFailed 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | def sha256(hashable) -> str: 29 | """Use instead of the builtin hash() for repeatable values.""" 30 | if isinstance(hashable, str): 31 | hashable = hashable.encode("utf-8") 32 | return hashlib.sha256(hashable).hexdigest() 33 | 34 | 35 | class AvalancheCharm(CharmBase): 36 | """A Juju charm for Avalanche.""" 37 | 38 | _container_name = "avalanche" # automatically determined from charm name 39 | _layer_name = "avalanche" # layer label argument for container.add_layer 40 | _service_name = "avalanche" # chosen arbitrarily to match charm name 41 | _peer_relation_name = "replicas" # must match metadata.yaml peer role name 42 | _port = 9001 # metrics endpoint 43 | 44 | _stored = StoredState() 45 | 46 | def __init__(self, *args): 47 | super().__init__(*args) 48 | self._stored.set_default(servers={}, config_hash=None) 49 | 50 | self.container = self.unit.get_container(self._container_name) 51 | self.unit.set_ports(self._port) 52 | 53 | self._forward_alert_rules = cast(bool, self.config["forward_alert_rules"]) 54 | 55 | self.metrics_endpoint = MetricsEndpointProvider( 56 | self, 57 | "metrics-endpoint", 58 | jobs=[ 59 | { 60 | "job_name": self.model.app.name, 61 | "metrics_path": "/metrics", 62 | "static_configs": [{"targets": [f"*:{self.port}"]}], 63 | "scrape_interval": "15s", # TODO: move to config.yaml 64 | "scrape_timeout": "10s", 65 | } 66 | ], 67 | forward_alert_rules=self._forward_alert_rules, 68 | refresh_event=[self.on.config_changed], 69 | external_url=socket.getfqdn(), 70 | ) 71 | 72 | self.remote_write_consumer = PrometheusRemoteWriteConsumer( 73 | self, 74 | forward_alert_rules=self._forward_alert_rules, 75 | refresh_event=[self.on.config_changed], 76 | ) 77 | self.framework.observe( 78 | self.remote_write_consumer.on.endpoints_changed, # pyright: ignore 79 | self._remote_write_endpoints_changed, 80 | ) 81 | 82 | self.grafana_dashboard_provider = GrafanaDashboardProvider(self) 83 | 84 | # Core lifecycle events 85 | self.framework.observe(self.on.install, self._on_install) 86 | self.framework.observe(self.on.config_changed, self._on_config_changed) 87 | self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm) 88 | self.framework.observe( 89 | self.on.avalanche_pebble_ready, 90 | self._on_pebble_ready, # pyright: ignore 91 | ) 92 | self.framework.observe(self.on.start, self._on_start) 93 | self.framework.observe(self.on.update_status, self._on_update_status) 94 | 95 | def _common_exit_hook(self) -> None: 96 | """Event processing hook that is common to all events to ensure idempotency.""" 97 | if not self.container.can_connect(): 98 | self.unit.status = MaintenanceStatus("Waiting for pod startup to complete") 99 | return 100 | 101 | # Update pebble layer 102 | layer_changed = self._update_layer() 103 | 
service_running = ( 104 | service := self.container.get_service(self._service_name) 105 | ) and service.is_running() 106 | if layer_changed or not service_running: 107 | if not self._restart_service(): 108 | self.unit.status = BlockedStatus("Service restart failed") 109 | return 110 | 111 | if version := self._avalanche_version: 112 | self.unit.set_workload_version(version) 113 | 114 | self.unit.status = ActiveStatus() 115 | 116 | @property 117 | def _avalanche_version(self) -> Optional[str]: 118 | if not self.container.can_connect(): 119 | return None 120 | version_output, _ = self.container.exec(["/bin/avalanche", "--version"], combine_stderr=True).wait_output() 121 | # Output looks like this: 122 | # 0.3 123 | return version_output.strip() 124 | 125 | def _update_layer(self) -> bool: 126 | """Update the service layer to reflect changes in config or remote-write endpoints. 127 | 128 | The new layer is compared against the current plan, and the container is 129 | replanned only if the layer actually changed, keeping this hook idempotent. 130 | 131 | Returns: 132 | True if anything changed; False otherwise 133 | """ 134 | overlay = self._layer() 135 | plan = self.container.get_plan() 136 | 137 | is_changed = False 138 | 139 | if self._service_name not in plan.services or overlay.services != plan.services: 140 | logger.debug( 141 | "Layer changed; command: %s", 142 | overlay.services[self._service_name].command, 143 | ) 144 | is_changed = True 145 | self.container.add_layer(self._layer_name, overlay, combine=True) 146 | self.container.replan() 147 | logger.debug( 148 | "New layer's command: %s", 149 | self.container.get_plan().services.get(self._service_name).command, # pyright: ignore 150 | ) 151 | else: 152 | logger.debug("Layer unchanged") 153 | 154 | return is_changed 155 | 156 | @property 157 | def port(self): 158 | """Return the default Avalanche port.""" 159 | return self._port 160 | 161 | def _layer(self) -> Layer: 162 | """Returns the Pebble configuration layer for Avalanche.""" 163 | 164 | def _command() -> str: 165 | if endpoints := self.remote_write_consumer.endpoints: 166 | # remote-write mode TODO error out / block if both relations present 167 | # avalanche cli args support only one remote write target; take the first one 168 | logger.debug( 169 | "Going into remote write mode; remote write endpoints: %s", 170 | self.remote_write_consumer.endpoints, 171 | ) 172 | 173 | endpoint = endpoints[0]["url"] 174 | # TODO offer remote-write-interval as config option 175 | mode_args = f"--remote-url={endpoint} --remote-write-interval=15s" 176 | else: 177 | # scraped mode 178 | logger.debug("Going into scraped mode (no remote write endpoints)") 179 | 180 | mode_args = f"--port={self.port}" 181 | 182 | return " ".join( 183 | [ 184 | "/bin/avalanche", 185 | f"--metric-count={self.config['metric_count']}", 186 | f"--label-count={self.config['label_count']}", 187 | f"--series-count={self.config['series_count']}", 188 | f"--metricname-length={self.config['metricname_length']}", 189 | f"--labelname-length={self.config['labelname_length']}", 190 | f"--value-interval={self.config['value_interval']}", 191 | f"--series-interval={self.config['series_interval']}", 192 | f"--metric-interval={self.config['metric_interval']}", 193 | mode_args, 194 | ] 195 | ) 196 | 197 | return Layer( 198 | { 199 | "summary": "avalanche layer", 200 | "description": "pebble config layer for avalanche", 201 | "services": { 202 | self._service_name: { 203 | "override": "replace", 204 | "summary": "avalanche service", 205 | "startup": "enabled", 206 | "command": _command(), 207 | }, 208 | }, 209
| } 210 | ) 211 | 212 | def _on_install(self, _): 213 | """Event handler for the `install` event during which we will update the K8s service.""" 214 | self._patch_k8s_service() 215 | 216 | def _on_upgrade_charm(self, _): 217 | """Event handler for the upgrade event during which we will update the K8s service.""" 218 | # Ensure that older deployments of Avalanche run the logic to patch the K8s service 219 | self._patch_k8s_service() 220 | 221 | # After an upgrade (refresh), the unit IP address is not guaranteed to remain the same, and 222 | # the config may need updating. Call the common hook to update. 223 | self._common_exit_hook() 224 | 225 | def _patch_k8s_service(self): 226 | """Fix the Kubernetes service that was set up by Juju with correct port numbers.""" 227 | if self.unit.is_leader(): 228 | service_ports = [ 229 | (f"{self.app.name}", self._port, self._port), 230 | ] 231 | try: 232 | K8sServicePatch.set_ports(self.app.name, service_ports) 233 | except PatchFailed as e: 234 | logger.error("Unable to patch the Kubernetes service: %s", str(e)) 235 | else: 236 | logger.debug("Successfully patched the Kubernetes service") 237 | 238 | def _on_pebble_ready(self, _): 239 | """Event handler for PebbleReadyEvent.""" 240 | self._common_exit_hook() 241 | 242 | def _on_start(self, _): 243 | """Event handler for StartEvent. 244 | 245 | With Juju 2.9.5 we encountered a scenario in which pebble_ready and config_changed fired, 246 | but the IP address was not available and the status was stuck on "Waiting for IP address". 247 | Adding this hook reduces the likelihood of that scenario. 248 | """ 249 | self._common_exit_hook() 250 | 251 | def _on_config_changed(self, _): 252 | """Event handler for ConfigChangedEvent.""" 253 | self._common_exit_hook() 254 | 255 | def _on_alertmanager_config_changed(self, _): 256 | """Event handler for :class:`AvalancheAlertmanagerConfigChanged`.""" 257 | self._common_exit_hook() 258 | 259 | def _restart_service(self) -> bool: 260 | """Helper function for restarting the underlying service.""" 261 | logger.info("Restarting service %s", self._service_name) 262 | 263 | if not self.container.can_connect(): 264 | logger.error("Cannot (re)start service: container is not ready.") 265 | return False 266 | 267 | # Check if service exists, to avoid ModelError from being raised when the service does 268 | # not yet exist 269 | if not self.container.get_services().get(self._service_name): 270 | logger.error("Cannot (re)start service: service does not (yet) exist.") 271 | return False 272 | 273 | self.container.restart(self._service_name) 274 | 275 | return True 276 | 277 | def _on_update_status(self, _): 278 | """Event handler for UpdateStatusEvent. 279 | 280 | Currently a no-op; reserved for logging the list of peers, uptime and version info. 281 | """ 282 | pass 283 | 284 | def _remote_write_endpoints_changed(self, _): 285 | """Event handler for remote write endpoints_changed.""" 286 | self._common_exit_hook() 287 | 288 | 289 | if __name__ == "__main__": 290 | main(AvalancheCharm, use_juju_for_storage=True) 291 | -------------------------------------------------------------------------------- /lib/charms/prometheus_k8s/v1/prometheus_remote_write.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | """# Prometheus remote-write library. 4 | 5 | This library facilitates the integration of the prometheus_remote_write interface. 
6 | 7 | Source code can be found on GitHub at: 8 | https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s 9 | 10 | Charms that need to push data to a charm exposing the Prometheus remote_write API 11 | should use the `PrometheusRemoteWriteConsumer`. Charms that operate software exposing 12 | the Prometheus remote_write API, that is, software that can receive metrics data over 13 | remote_write, should use the `PrometheusRemoteWriteProvider`. 14 | """ 15 | 16 | import copy 17 | import json 18 | import logging 19 | import os 20 | import platform 21 | import re 22 | import socket 23 | import subprocess 24 | import tempfile 25 | from pathlib import Path 26 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union 27 | 28 | import yaml 29 | from cosl import JujuTopology 30 | from cosl.rules import AlertRules, generic_alert_groups 31 | from ops.charm import ( 32 | CharmBase, 33 | HookEvent, 34 | RelationBrokenEvent, 35 | RelationEvent, 36 | RelationMeta, 37 | RelationRole, 38 | ) 39 | from ops.framework import BoundEvent, EventBase, EventSource, Object, ObjectEvents 40 | from ops.model import Relation 41 | 42 | # The unique Charmhub library identifier, never change it 43 | LIBID = "f783823fa75f4b7880eb70f2077ec259" 44 | 45 | # Increment this major API version when introducing breaking changes 46 | LIBAPI = 1 47 | 48 | # Increment this PATCH version before using `charmcraft publish-lib` or reset 49 | # to 0 if you are raising the major API version 50 | LIBPATCH = 10 51 | 52 | PYDEPS = ["cosl"] 53 | 54 | 55 | logger = logging.getLogger(__name__) 56 | 57 | 58 | DEFAULT_RELATION_NAME = "receive-remote-write" 59 | DEFAULT_CONSUMER_NAME = "send-remote-write" 60 | RELATION_INTERFACE_NAME = "prometheus_remote_write" 61 | 62 | DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" 63 | 64 | 65 | class RelationNotFoundError(Exception): 66 | """Raised if there is no relation with the given name.""" 67 | 68 | def __init__(self, relation_name: str): 69 | self.relation_name = relation_name 70 | self.message = "No relation named '{}' found".format(relation_name) 71 | 72 | super().__init__(self.message) 73 | 74 | 75 | class RelationInterfaceMismatchError(Exception): 76 | """Raised if the relation with the given name has a different interface.""" 77 | 78 | def __init__( 79 | self, 80 | relation_name: str, 81 | expected_relation_interface: str, 82 | actual_relation_interface: str, 83 | ): 84 | self.relation_name = relation_name 85 | self.expected_relation_interface = expected_relation_interface 86 | self.actual_relation_interface = actual_relation_interface 87 | self.message = ( 88 | "The '{}' relation has '{}' as its interface rather than the expected '{}'".format( 89 | relation_name, actual_relation_interface, expected_relation_interface 90 | ) 91 | ) 92 | 93 | super().__init__(self.message) 94 | 95 | 96 | class RelationRoleMismatchError(Exception): 97 | """Raised if the relation with the given name has a different direction.""" 98 | 99 | def __init__( 100 | self, 101 | relation_name: str, 102 | expected_relation_role: RelationRole, 103 | actual_relation_role: RelationRole, 104 | ): 105 | self.relation_name = relation_name 106 | self.expected_relation_role = expected_relation_role 107 | self.actual_relation_role = actual_relation_role 108 | self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( 109 | relation_name, repr(actual_relation_role), repr(expected_relation_role) 110 | ) 111 | 112 | super().__init__(self.message)
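# Illustrative example (not part of the upstream library): these validation
# errors surface at charm construction time. A hypothetical charm whose
# metadata.yaml declares the relation with the wrong interface,
#
#     requires:
#       send-remote-write:
#         interface: http  # should be: prometheus_remote_write
#
# would raise RelationInterfaceMismatchError as soon as it instantiates
# PrometheusRemoteWriteConsumer(self), while omitting the relation stanza
# entirely would raise RelationNotFoundError instead.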
113 | 114 | 115 | class InvalidAlertRuleEvent(EventBase): 116 | """Event emitted when alert rule files are not parsable. 117 | 118 | Enables us to set a clear status on the provider. 119 | """ 120 | 121 | def __init__(self, handle, errors: str = "", valid: bool = False): 122 | super().__init__(handle) 123 | self.errors = errors 124 | self.valid = valid 125 | 126 | def snapshot(self) -> Dict: 127 | """Save alert rule information.""" 128 | return { 129 | "valid": self.valid, 130 | "errors": self.errors, 131 | } 132 | 133 | def restore(self, snapshot): 134 | """Restore alert rule information.""" 135 | self.valid = snapshot["valid"] 136 | self.errors = snapshot["errors"] 137 | 138 | 139 | def _is_official_alert_rule_format(rules_dict: dict) -> bool: 140 | """Are alert rules in the upstream format as supported by Prometheus. 141 | 142 | Alert rules in dictionary format are in "official" form if they 143 | contain a "groups" key, since this implies they contain a list of 144 | alert rule groups. 145 | 146 | Args: 147 | rules_dict: a set of alert rules in Python dictionary format 148 | 149 | Returns: 150 | True if alert rules are in official Prometheus file format. 151 | """ 152 | return "groups" in rules_dict 153 | 154 | 155 | def _is_single_alert_rule_format(rules_dict: dict) -> bool: 156 | """Are alert rules in single rule format. 157 | 158 | The Prometheus charm library supports reading of alert rules in a 159 | custom format that consists of a single alert rule per file. This 160 | does not conform to the official Prometheus alert rule file format 161 | which requires that each alert rules file consists of a list of 162 | alert rule groups and each group consists of a list of alert 163 | rules. 164 | 165 | Alert rules in dictionary form are considered to be in single rule 166 | format if, at a minimum, they contain the two keys corresponding to 167 | the alert rule name and the alert expression. 168 | 169 | Returns: 170 | True if alert rule is in single rule file format. 171 | """ 172 | # one alert rule per file 173 | return set(rules_dict) >= {"alert", "expr"} 174 | 175 | 176 | def _validate_relation_by_interface_and_direction( 177 | charm: CharmBase, 178 | relation_name: str, 179 | expected_relation_interface: str, 180 | expected_relation_role: RelationRole, 181 | ): 182 | """Verifies that a relation has the necessary characteristics. 183 | 184 | Verifies that the `relation_name` provided: (1) exists in metadata.yaml, 185 | (2) declares as interface the interface name passed as `expected_relation_interface`, 186 | and (3) has the right "direction", i.e., it is a relation that `charm` 187 | provides or requires. 188 | 189 | Args: 190 | charm: a `CharmBase` object to scan for the matching relation. 191 | relation_name: the name of the relation to be verified. 192 | expected_relation_interface: the interface name to be matched by the 193 | relation named `relation_name`. 194 | expected_relation_role: whether the `relation_name` must be either 195 | provided or required by `charm`. 196 | 197 | Raises: 198 | RelationNotFoundError: If there is no relation in the charm's metadata.yaml 199 | with the same name as provided via `relation_name` argument. 200 | RelationInterfaceMismatchError: The relation with the same name as provided 201 | via `relation_name` argument does not have the same relation interface 202 | as specified via the `expected_relation_interface` argument. 
203 | RelationRoleMismatchError: If the relation with the same name as provided 204 | via `relation_name` argument does not have the same role as specified 205 | via the `expected_relation_role` argument. 206 | """ 207 | if relation_name not in charm.meta.relations: 208 | raise RelationNotFoundError(relation_name) 209 | 210 | relation: RelationMeta = charm.meta.relations[relation_name] 211 | 212 | actual_relation_interface = relation.interface_name 213 | if actual_relation_interface != expected_relation_interface: 214 | raise RelationInterfaceMismatchError( 215 | relation_name, expected_relation_interface, actual_relation_interface or "None" 216 | ) 217 | 218 | if expected_relation_role == RelationRole.provides: 219 | if relation_name not in charm.meta.provides: 220 | raise RelationRoleMismatchError( 221 | relation_name, RelationRole.provides, RelationRole.requires 222 | ) 223 | elif expected_relation_role == RelationRole.requires: 224 | if relation_name not in charm.meta.requires: 225 | raise RelationRoleMismatchError( 226 | relation_name, RelationRole.requires, RelationRole.provides 227 | ) 228 | else: 229 | raise Exception("Unexpected RelationRole: {}".format(expected_relation_role)) 230 | 231 | 232 | class PrometheusRemoteWriteEndpointsChangedEvent(EventBase): 233 | """Event emitted when Prometheus remote_write endpoints change.""" 234 | 235 | def __init__(self, handle, relation_id): 236 | super().__init__(handle) 237 | self.relation_id = relation_id 238 | 239 | def snapshot(self): 240 | """Save scrape Prometheus remote_write information.""" 241 | return {"relation_id": self.relation_id} 242 | 243 | def restore(self, snapshot): 244 | """Restore scrape Prometheus remote_write information.""" 245 | self.relation_id = snapshot["relation_id"] 246 | 247 | 248 | class InvalidAlertRulePathError(Exception): 249 | """Raised if the alert rules folder cannot be found or is otherwise invalid.""" 250 | 251 | def __init__( 252 | self, 253 | alert_rules_absolute_path: str, 254 | message: str, 255 | ): 256 | self.alert_rules_absolute_path = alert_rules_absolute_path 257 | self.message = message 258 | 259 | super().__init__(self.message) 260 | 261 | 262 | def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str: 263 | """Resolve the provided path items against the directory of the main file. 264 | 265 | Look up the directory of the main .py file being executed. This is normally 266 | going to be the charm.py file of the charm including this library. Then, resolve 267 | the provided path elements and, if the resulting path exists and is a directory, 268 | return its absolute path; otherwise, raise `InvalidAlertRulePathError`.
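For example (hypothetical paths), resolving the default relative path
"./src/prometheus_alert_rules" against a charm whose main file is
/var/lib/juju/agents/unit-avalanche-0/charm/src/charm.py yields
/var/lib/juju/agents/unit-avalanche-0/charm/src/prometheus_alert_rules,
provided that directory exists.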
269 | """ 270 | charm_dir = Path(str(charm.charm_dir)) 271 | if not charm_dir.exists() or not charm_dir.is_dir(): 272 | # Operator Framework does not currently expose a robust 273 | # way to determine the top level charm source directory 274 | # that is consistent across deployed charms and unit tests 275 | # Hence for unit tests the current working directory is used 276 | # TODO: updated this logic when the following ticket is resolved 277 | # https://github.com/canonical/operator/issues/643 278 | charm_dir = Path(os.getcwd()) 279 | 280 | alerts_dir_path = charm_dir.absolute().joinpath(*path_elements) 281 | 282 | if not alerts_dir_path.exists(): 283 | raise InvalidAlertRulePathError(str(alerts_dir_path), "directory does not exist") 284 | if not alerts_dir_path.is_dir(): 285 | raise InvalidAlertRulePathError(str(alerts_dir_path), "is not a directory") 286 | 287 | return str(alerts_dir_path) 288 | 289 | 290 | class PrometheusRemoteWriteConsumerEvents(ObjectEvents): 291 | """Event descriptor for events raised by `PrometheusRemoteWriteConsumer`.""" 292 | 293 | endpoints_changed = EventSource(PrometheusRemoteWriteEndpointsChangedEvent) 294 | alert_rule_status_changed = EventSource(InvalidAlertRuleEvent) 295 | 296 | 297 | class PrometheusRemoteWriteConsumer(Object): 298 | """API that manages a required `prometheus_remote_write` relation. 299 | 300 | The `PrometheusRemoteWriteConsumer` is intended to be used by charms that need to push data to 301 | other charms over the Prometheus remote_write API. 302 | 303 | The `PrometheusRemoteWriteConsumer` object can be instantiated as follows in your charm: 304 | 305 | ``` 306 | from charms.prometheus_k8s.v1.prometheus_remote_write import PrometheusRemoteWriteConsumer 307 | 308 | def __init__(self, *args): 309 | ... 310 | self.remote_write_consumer = PrometheusRemoteWriteConsumer(self) 311 | ... 312 | ``` 313 | 314 | The `PrometheusRemoteWriteConsumer` assumes that, in the `metadata.yaml` of your charm, 315 | you declare a required relation as follows: 316 | 317 | ``` 318 | requires: 319 | send-remote-write: # Relation name 320 | interface: prometheus_remote_write # Relation interface 321 | ``` 322 | 323 | The charmed operator is expected to use the `PrometheusRemoteWriteConsumer` as follows: 324 | 325 | ``` 326 | def __init__(self, *args): 327 | ... 328 | self.remote_write_consumer = PrometheusRemoteWriteConsumer(self) 329 | ... 330 | 331 | self.framework.observe( 332 | self.remote_write_consumer.on.endpoints_changed, 333 | self._handle_endpoints_changed, 334 | ) 335 | ``` 336 | The `endpoints_changed` event will fire in situations such as provider ip change (e.g. 337 | relation created, provider upgrade, provider pod churn) or provider config change (e.g. 338 | metadata settings). 339 | 340 | Then, inside the logic of `_handle_endpoints_changed`, the updated endpoint list is 341 | retrieved with: 342 | 343 | ``` 344 | self.remote_write_consumer.endpoints 345 | ``` 346 | 347 | which returns a dictionary structured like the Prometheus configuration object (see 348 | https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). 349 | 350 | Regarding the default relation name, `send-remote-write`: if you choose to change it, 351 | you would need to explicitly provide it to the `PrometheusRemoteWriteConsumer` via the 352 | `relation_name` constructor argument. 
(The relation interface, on the other hand, is 353 | fixed and, if you were to change it, your charm would not be able to relate with other 354 | charms using the correct relation interface. The library prevents you from doing that by 355 | raising an exception.) 356 | 357 | In any case, it is strongly discouraged to change the relation name: having consistent 358 | relation names across charms that do similar things is good practice and more 359 | straightforward for the users of your charm. The one exception to the rule above 360 | is if your charm needs to both consume and provide a relation using the 361 | `prometheus_remote_write` interface, in which case changing the relation name to 362 | differentiate between "incoming" and "outgoing" remote write interactions is necessary. 363 | 364 | It is also possible to specify alert rules. By default, this library will search 365 | `./src/prometheus_alert_rules`, which in standard charm 366 | layouts resolves to `src/prometheus_alert_rules` under the charm root. Each set of alert rules, grouped 367 | by the topology identifier, goes into a separate `*.rule` file. 368 | 369 | If the syntax of a rule is invalid, the `PrometheusRemoteWriteConsumer` logs an error and 370 | does not load the particular rule. 371 | 372 | To avoid false positives and false negatives, the library will inject label filters 373 | automatically in the PromQL expression. For example, if the charm provides an 374 | alert rule with an `expr` like this one: 375 | 376 | ```yaml 377 | expr: up < 1 378 | ``` 379 | 380 | it will be modified with label filters ensuring that 381 | the only timeseries evaluated are those scraped from this charm, and no others. 382 | 383 | 384 | ```yaml 385 | expr: up{juju_application="traefik", 386 | juju_charm="traefik-k8s", 387 | juju_model="cos", 388 | juju_model_uuid="b5ed878d-2671-42e8-873a-e8d58c0ec325" 389 | } < 1 390 | labels: 391 | juju_application: traefik 392 | juju_charm: traefik-k8s 393 | juju_model: cos 394 | juju_model_uuid: b5ed878d-2671-42e8-873a-e8d58c0ec325 395 | ``` 396 | """ 397 | 398 | on = PrometheusRemoteWriteConsumerEvents() # pyright: ignore 399 | 400 | def __init__( 401 | self, 402 | charm: CharmBase, 403 | relation_name: str = DEFAULT_CONSUMER_NAME, 404 | alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, 405 | refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None, 406 | *, 407 | forward_alert_rules: bool = True, 408 | extra_alert_labels: Optional[Dict] = None, 409 | ): 410 | """API to manage a required relation with the `prometheus_remote_write` interface. 411 | 412 | Args: 413 | charm: The charm object that instantiated this class. 414 | relation_name: Name of the relation with the `prometheus_remote_write` interface as 415 | defined in metadata.yaml. 416 | alert_rules_path: Path of the directory containing the alert rules. 417 | refresh_event: an optional bound event or list of bound events which 418 | will be observed to re-set alerts data. 419 | forward_alert_rules: Flag to toggle forwarding of charmed alert rules. 420 | extra_alert_labels: Optional dict of extra labels to inject into alert rules. 421 | 422 | Raises: 423 | RelationNotFoundError: If there is no relation in the charm's metadata.yaml 424 | with the same name as provided via `relation_name` argument. 425 | RelationInterfaceMismatchError: The relation with the same name as provided 426 | via `relation_name` argument does not have the `prometheus_remote_write` relation 427 | interface. 
428 | RelationRoleMismatchError: If the relation with the same name as provided 429 | via `relation_name` argument does not have the `RelationRole.requires` 430 | role. 431 | """ 432 | _validate_relation_by_interface_and_direction( 433 | charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires 434 | ) 435 | 436 | try: 437 | alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path) 438 | except InvalidAlertRulePathError as e: 439 | logger.debug( 440 | "Invalid Prometheus alert rules folder at %s: %s", 441 | e.alert_rules_absolute_path, 442 | e.message, 443 | ) 444 | 445 | super().__init__(charm, relation_name) 446 | self._charm = charm 447 | self._relation_name = relation_name 448 | self._alert_rules_path = alert_rules_path 449 | self._forward_alert_rules = forward_alert_rules 450 | self._extra_alert_labels = extra_alert_labels or {} 451 | 452 | self.topology = JujuTopology.from_charm(charm) 453 | 454 | on_relation = self._charm.on[self._relation_name] 455 | 456 | self.framework.observe(on_relation.relation_joined, self._handle_endpoints_changed) 457 | self.framework.observe(on_relation.relation_changed, self._handle_endpoints_changed) 458 | self.framework.observe(on_relation.relation_departed, self._handle_endpoints_changed) 459 | self.framework.observe(on_relation.relation_broken, self._on_relation_broken) 460 | self.framework.observe(on_relation.relation_joined, self._push_alerts_on_relation_joined) 461 | self.framework.observe( 462 | self._charm.on.leader_elected, self._push_alerts_to_all_relation_databags 463 | ) 464 | self.framework.observe( 465 | self._charm.on.upgrade_charm, self._push_alerts_to_all_relation_databags 466 | ) 467 | if refresh_event: 468 | if not isinstance(refresh_event, list): 469 | refresh_event = [refresh_event] 470 | for ev in refresh_event: 471 | self.framework.observe(ev, self._push_alerts_to_all_relation_databags) 472 | 473 | def _on_relation_broken(self, event: RelationBrokenEvent) -> None: 474 | self.on.endpoints_changed.emit(relation_id=event.relation.id) 475 | 476 | def _handle_endpoints_changed(self, event: RelationEvent) -> None: 477 | if self._charm.unit.is_leader() and event.app is not None: 478 | ev = json.loads(event.relation.data[event.app].get("event", "{}")) 479 | 480 | if ev: 481 | valid = bool(ev.get("valid", True)) 482 | errors = ev.get("errors", "") 483 | 484 | if valid and not errors: 485 | self.on.alert_rule_status_changed.emit(valid=valid) 486 | else: 487 | self.on.alert_rule_status_changed.emit(valid=valid, errors=errors) 488 | 489 | self.on.endpoints_changed.emit(relation_id=event.relation.id) 490 | 491 | def _push_alerts_on_relation_joined(self, event: RelationEvent) -> None: 492 | self._push_alerts_to_relation_databag(event.relation) 493 | 494 | def _push_alerts_to_all_relation_databags(self, _: Optional[HookEvent]) -> None: 495 | for relation in self.model.relations[self._relation_name]: 496 | self._push_alerts_to_relation_databag(relation) 497 | 498 | def _push_alerts_to_relation_databag(self, relation: Relation) -> None: 499 | if not self._charm.unit.is_leader(): 500 | return 501 | 502 | alert_rules = AlertRules(query_type="promql", topology=self.topology) 503 | if self._forward_alert_rules: 504 | alert_rules.add_path(self._alert_rules_path) 505 | alert_rules.add( 506 | generic_alert_groups.aggregator_rules, group_name_prefix=self.topology.identifier 507 | ) 508 | 509 | alert_rules_as_dict = alert_rules.as_dict() 510 | 511 | if self._extra_alert_labels: 512 | alert_rules_as_dict = ( 513 | 
PrometheusRemoteWriteConsumer._inject_extra_labels_to_alert_rules( 514 | alert_rules_as_dict, self._extra_alert_labels 515 | ) 516 | ) 517 | 518 | relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) 519 | 520 | def reload_alerts(self) -> None: 521 | """Reload alert rules from disk and push to relation data.""" 522 | self._push_alerts_to_all_relation_databags(None) 523 | 524 | @staticmethod 525 | def _inject_extra_labels_to_alert_rules(rules: Dict, extra_alert_labels: Dict) -> Dict: 526 | """Return a copy of the rules dict with extra labels injected.""" 527 | result = copy.deepcopy(rules) 528 | for group in result.get("groups", []): 529 | for rule in group.get("rules", []): 530 | rule.setdefault("labels", {}).update(extra_alert_labels) 531 | return result 532 | 533 | @property 534 | def endpoints(self) -> List[Dict[str, str]]: 535 | """Remote-write config objects ready to be dropped into a Prometheus config file. 536 | 537 | The endpoints are deduplicated. 538 | 539 | The format of the dict is specified in the official prometheus docs: 540 | https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write 541 | 542 | Returns: 543 | A list of dictionaries where each dictionary provides information about 544 | a single remote_write endpoint. 545 | """ 546 | endpoints = [] 547 | for relation in self.model.relations[self._relation_name]: 548 | for unit in relation.units: 549 | if unit.app is self._charm.app: 550 | # This is a peer unit 551 | continue 552 | if not (unit_databag := relation.data.get(unit)): 553 | continue 554 | if not (remote_write := unit_databag.get("remote_write")): 555 | continue 556 | 557 | deserialized_remote_write = json.loads(remote_write) 558 | endpoints.append( 559 | { 560 | "url": deserialized_remote_write["url"], 561 | } 562 | ) 563 | 564 | # When multiple units of the remote-write server are behind an ingress 565 | # (e.g. mimir), relation data would end up with the same ingress url 566 | # for all units. 567 | # Deduplicate the endpoints by converting each dict to a tuple of 568 | # dict.items(), throwing them into a set, and then converting them 569 | # back to dictionaries 570 | deduplicated_endpoints = [dict(t) for t in {tuple(d.items()) for d in endpoints}] 571 | return deduplicated_endpoints 572 | 573 | 574 | class PrometheusRemoteWriteAlertsChangedEvent(EventBase): 575 | """Event emitted when Prometheus remote_write alerts change.""" 576 | 577 | def __init__(self, handle, relation_id): 578 | super().__init__(handle) 579 | self.relation_id = relation_id 580 | 581 | def snapshot(self): 582 | """Save Prometheus remote_write information.""" 583 | return {"relation_id": self.relation_id} 584 | 585 | def restore(self, snapshot): 586 | """Restore Prometheus remote_write information.""" 587 | self.relation_id = snapshot["relation_id"] 588 | 589 | 590 | class PrometheusRemoteWriteProviderConsumersChangedEvent(EventBase): 591 | """Event emitted when the set of remote-write consumers changes.""" 592 | 593 | 594 | class PrometheusRemoteWriteProviderEvents(ObjectEvents): 595 | """Event descriptor for events raised by `PrometheusRemoteWriteProvider`.""" 596 | 597 | alert_rules_changed = EventSource(PrometheusRemoteWriteAlertsChangedEvent) 598 | consumers_changed = EventSource(PrometheusRemoteWriteProviderConsumersChangedEvent) 599 | 600 | 601 | class PrometheusRemoteWriteProvider(Object): 602 | """API that manages a provided `prometheus_remote_write` relation. 
603 | 604 | The `PrometheusRemoteWriteProvider` is intended to be used by charms whose workloads need 605 | to receive data from other charms' workloads over the Prometheus remote_write API. 606 | 607 | The `PrometheusRemoteWriteProvider` object can be instantiated as follows in your charm: 608 | 609 | ``` 610 | from charms.prometheus_k8s.v1.prometheus_remote_write import PrometheusRemoteWriteProvider 611 | 612 | def __init__(self, *args): 613 | ... 614 | self.remote_write_provider = PrometheusRemoteWriteProvider(self) 615 | ... 616 | ``` 617 | 618 | The `PrometheusRemoteWriteProvider` assumes that, in the `metadata.yaml` of your charm, 619 | you declare a provided relation as follows: 620 | 621 | ``` 622 | provides: 623 | receive-remote-write: # Relation name 624 | interface: prometheus_remote_write # Relation interface 625 | ``` 626 | 627 | About the name of the relation managed by this library: technically, you *could* change 628 | the relation name, `receive-remote-write`, but that requires you to provide the new 629 | relation name to the `PrometheusRemoteWriteProvider` via the `relation_name` constructor 630 | argument. (The relation interface, on the other hand, is immutable and, if you were to change 631 | it, your charm would not be able to relate with other charms using the right relation 632 | interface. The library prevents you from doing that by raising an exception.) In any case, it 633 | is strongly discouraged to change the relation name: having consistent relation names across 634 | charms that do similar things is a very good thing for the people who will use your charm. 635 | The one exception to the rule above is if your charm needs to both consume and provide a 636 | relation using the `prometheus_remote_write` interface, in which case changing the relation 637 | name to differentiate between "incoming" and "outgoing" remote write interactions is necessary. 638 | """ 639 | 640 | on = PrometheusRemoteWriteProviderEvents() # pyright: ignore 641 | 642 | def __init__( 643 | self, 644 | charm: CharmBase, 645 | relation_name: str = DEFAULT_RELATION_NAME, 646 | *, 647 | server_url_func: Callable[[], str] = lambda: f"http://{socket.getfqdn()}:9090", 648 | endpoint_path: str = "/api/v1/write", 649 | ): 650 | """API to manage a provided relation with the `prometheus_remote_write` interface. 651 | 652 | Args: 653 | charm: The charm object that instantiated this class. 654 | relation_name: Name of the relation with the `prometheus_remote_write` interface as 655 | defined in metadata.yaml. 656 | server_url_func: A callable returning the URL for your prometheus server. 657 | endpoint_path: The path of the server's remote_write endpoint. 658 | 659 | Raises: 660 | RelationNotFoundError: If there is no relation in the charm's metadata.yaml 661 | with the same name as provided via `relation_name` argument. 662 | RelationInterfaceMismatchError: The relation with the same name as provided 663 | via `relation_name` argument does not have the `prometheus_remote_write` relation 664 | interface. 665 | RelationRoleMismatchError: If the relation with the same name as provided 666 | via `relation_name` argument does not have the `RelationRole.provides` 667 | role. 
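For example (illustrative only; `self._external_url` is a hypothetical
attribute holding an ingress URL), a charm fronted by an ingress could
advertise that URL instead of the pod FQDN:

```
self.remote_write_provider = PrometheusRemoteWriteProvider(
    self,
    server_url_func=lambda: self._external_url,
)
```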
668 | """ 669 | _validate_relation_by_interface_and_direction( 670 | charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides 671 | ) 672 | 673 | super().__init__(charm, relation_name) 674 | self._charm = charm 675 | self._tool = CosTool(self._charm) 676 | self._relation_name = relation_name 677 | self._get_server_url = server_url_func 678 | self._endpoint_path = endpoint_path 679 | 680 | on_relation = self._charm.on[self._relation_name] 681 | self.framework.observe( 682 | on_relation.relation_created, 683 | self._on_consumers_changed, 684 | ) 685 | self.framework.observe( 686 | on_relation.relation_joined, 687 | self._on_consumers_changed, 688 | ) 689 | self.framework.observe( 690 | on_relation.relation_changed, 691 | self._on_relation_changed, 692 | ) 693 | 694 | def _on_consumers_changed(self, event: RelationEvent) -> None: 695 | if not isinstance(event, RelationBrokenEvent): 696 | self.update_endpoint(event.relation) 697 | self.on.consumers_changed.emit() 698 | 699 | def _on_relation_changed(self, event: RelationEvent) -> None: 700 | """Flag Providers that data has changed, so they can re-read alerts.""" 701 | self.on.alert_rules_changed.emit(event.relation.id) 702 | 703 | def update_endpoint(self, relation: Optional[Relation] = None) -> None: 704 | """Triggers programmatically the update of the relation data. 705 | 706 | This method should be used when the charm relying on this library needs 707 | to update the relation data in response to something occurring outside 708 | the `prometheus_remote_write` relation lifecycle, e.g., in case of a 709 | host address change because the charmed operator becomes connected to an 710 | Ingress after the `prometheus_remote_write` relation is established. 711 | 712 | Args: 713 | relation: An optional instance of `class:ops.model.Relation` to update. 714 | If not provided, all instances of the `prometheus_remote_write` 715 | relation are updated. 716 | """ 717 | relations = [relation] if relation else self.model.relations[self._relation_name] 718 | 719 | for relation in relations: 720 | self._set_endpoint_on_relation(relation) 721 | 722 | def _set_endpoint_on_relation(self, relation: Relation) -> None: 723 | """Set the remote_write endpoint on relations. 724 | 725 | Args: 726 | relation: The relation whose data to update. 727 | """ 728 | relation.data[self._charm.unit]["remote_write"] = json.dumps( 729 | { 730 | "url": self._get_server_url().rstrip("/") + "/" + self._endpoint_path.strip("/"), 731 | } 732 | ) 733 | 734 | @property 735 | def alerts(self) -> dict: 736 | """Fetch alert rules from all relations. 737 | 738 | A Prometheus alert rules file consists of a list of "groups". Each 739 | group consists of a list of alerts (`rules`) that are sequentially 740 | executed. This method returns all the alert rules provided by each 741 | related metrics provider charm. These rules may be used to generate a 742 | separate alert rules file for each relation since the returned list 743 | of alert groups are indexed by relation ID. Also, for each relation ID 744 | associated scrape metadata such as Juju model, UUID and application 745 | name are provided so the unique name may be generated for the rules 746 | file. 
For each relation the structure of data returned is a dictionary 747 | with four keys 748 | 749 | - groups 750 | - model 751 | - model_uuid 752 | - application 753 | 754 | The value of the `groups` key is such that it may be used to generate 755 | a Prometheus alert rules file directly using `yaml.dump` but the 756 | `groups` key itself must be included as this is required by Prometheus, 757 | for example as in `yaml.safe_dump({"groups": alerts["groups"]})`. 758 | 759 | The `PrometheusRemoteWriteProvider` accepts a list of rules and these 760 | rules are all placed into one group. 761 | 762 | Returns: 763 | a dictionary mapping the name of an alert rule group to the group. 764 | """ 765 | alerts = {} # type: Dict[str, dict] # mapping b/w juju identifiers and alert rule files 766 | for relation in self._charm.model.relations[self._relation_name]: 767 | if not relation.units or not relation.app: 768 | continue 769 | 770 | alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}")) 771 | if not alert_rules: 772 | continue 773 | 774 | alert_rules = self._inject_alert_expr_labels(alert_rules) 775 | 776 | identifier, topology = self._get_identifier_by_alert_rules(alert_rules) 777 | if not topology: 778 | try: 779 | scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) 780 | identifier = JujuTopology.from_dict(scrape_metadata).identifier 781 | alerts[identifier] = self._tool.apply_label_matchers(alert_rules) # type: ignore 782 | 783 | except KeyError as e: 784 | logger.debug( 785 | "Relation %s has no 'scrape_metadata': %s", 786 | relation.id, 787 | e, 788 | ) 789 | 790 | if not identifier: 791 | logger.error( 792 | "Alert rules were found but no usable group or identifier was present." 793 | ) 794 | continue 795 | 796 | _, errmsg = self._tool.validate_alert_rules(alert_rules) 797 | if errmsg: 798 | logger.error(f"Invalid alert rule file: {errmsg}") 799 | if self._charm.unit.is_leader(): 800 | data = json.loads(relation.data[self._charm.app].get("event", "{}")) 801 | data["errors"] = errmsg 802 | relation.data[self._charm.app]["event"] = json.dumps(data) 803 | continue 804 | 805 | alerts[identifier] = alert_rules 806 | 807 | return alerts 808 | 809 | def _get_identifier_by_alert_rules( 810 | self, rules: Dict[str, Any] 811 | ) -> Tuple[Union[str, None], Union[JujuTopology, None]]: 812 | """Determine an appropriate dict key for alert rules. 813 | 814 | The key is used as the filename when writing alerts to disk, so the structure 815 | and uniqueness is important. 816 | 817 | Args: 818 | rules: a dict of alert rules 819 | Returns: 820 | A tuple containing an identifier, if found, and a JujuTopology, if it could 821 | be constructed. 822 | """ 823 | if "groups" not in rules: 824 | logger.debug("No alert groups were found in relation data") 825 | return None, None 826 | 827 | # Construct an ID based on what's in the alert rules if they have labels 828 | for group in rules["groups"]: 829 | try: 830 | labels = group["rules"][0]["labels"] 831 | topology = JujuTopology( 832 | # Don't try to safely get required constructor fields. 
There's already 833 | # a handler for KeyErrors 834 | model_uuid=labels["juju_model_uuid"], 835 | model=labels["juju_model"], 836 | application=labels["juju_application"], 837 | unit=labels.get("juju_unit", ""), 838 | charm_name=labels.get("juju_charm", ""), 839 | ) 840 | return topology.identifier, topology 841 | except KeyError: 842 | logger.debug("Alert rules were found but no usable labels were present") 843 | continue 844 | 845 | logger.warning( 846 | "No labeled alert rules were found, and no 'scrape_metadata' " 847 | "was available. Using the alert group name as filename." 848 | ) 849 | try: 850 | for group in rules["groups"]: 851 | return group["name"], None 852 | except KeyError: 853 | logger.debug("No group name was found to use as identifier") 854 | 855 | return None, None 856 | 857 | def _inject_alert_expr_labels(self, rules: Dict[str, Any]) -> Dict[str, Any]: 858 | """Iterate through alert rules and inject topology into expressions. 859 | 860 | Args: 861 | rules: a dict of alert rules 862 | """ 863 | if "groups" not in rules: 864 | return rules 865 | 866 | modified_groups = [] 867 | for group in rules["groups"]: 868 | # Copy off rules, so we don't modify an object we're iterating over 869 | rules_copy = group["rules"] 870 | for idx, rule in enumerate(rules_copy): 871 | labels = rule.get("labels") 872 | 873 | if labels: 874 | try: 875 | topology = JujuTopology( 876 | # Don't try to safely get required constructor fields. There's already 877 | # a handler for KeyErrors 878 | model_uuid=labels["juju_model_uuid"], 879 | model=labels["juju_model"], 880 | application=labels["juju_application"], 881 | unit=labels.get("juju_unit", ""), 882 | charm_name=labels.get("juju_charm", ""), 883 | ) 884 | 885 | # Inject topology and put it back in the list 886 | rule["expr"] = self._tool.inject_label_matchers( 887 | re.sub(r"%%juju_topology%%,?", "", rule["expr"]), 888 | topology.alert_expression_dict, 889 | ) 890 | except KeyError: 891 | # Some required JujuTopology key is missing. Just move on. 
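# (e.g. hand-written rules that carry only a severity label); the rule is
# kept with its original expression rather than dropped.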
892 | pass 893 | 894 | group["rules"][idx] = rule 895 | 896 | modified_groups.append(group) 897 | 898 | rules["groups"] = modified_groups 899 | return rules 900 | 901 | 902 | # Copy/pasted from prometheus_scrape.py 903 | class CosTool: 904 | """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" 905 | 906 | _path = None 907 | _disabled = False 908 | 909 | def __init__(self, charm): 910 | self._charm = charm 911 | 912 | @property 913 | def path(self): 914 | """Lazy lookup of the path of cos-tool.""" 915 | if self._disabled: 916 | return None 917 | if not self._path: 918 | self._path = self._get_tool_path() 919 | if not self._path: 920 | logger.debug("Skipping injection of juju topology as label matchers") 921 | self._disabled = True 922 | return self._path 923 | 924 | def apply_label_matchers(self, rules) -> dict: 925 | """Will apply label matchers to the expression of all alerts in all supplied groups.""" 926 | if not self.path: 927 | return rules 928 | for group in rules["groups"]: 929 | rules_in_group = group.get("rules", []) 930 | for rule in rules_in_group: 931 | topology = {} 932 | # if the user for some reason has provided juju_unit, we'll need to honor it 933 | # in most cases, however, this will be empty 934 | for label in [ 935 | "juju_model", 936 | "juju_model_uuid", 937 | "juju_application", 938 | "juju_charm", 939 | "juju_unit", 940 | ]: 941 | if label in rule["labels"]: 942 | topology[label] = rule["labels"][label] 943 | 944 | rule["expr"] = self.inject_label_matchers(rule["expr"], topology) 945 | return rules 946 | 947 | def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: 948 | """Will validate correctness of alert rules, returning a boolean and any errors.""" 949 | if not self.path: 950 | logger.debug("`cos-tool` unavailable. Not validating alert correctness.") 951 | return True, "" 952 | 953 | with tempfile.TemporaryDirectory() as tmpdir: 954 | rule_path = Path(tmpdir + "/validate_rule.yaml") 955 | rule_path.write_text(yaml.dump(rules)) 956 | 957 | args = [str(self.path), "validate", str(rule_path)] 958 | # noinspection PyBroadException 959 | try: 960 | self._exec(args) 961 | return True, "" 962 | except subprocess.CalledProcessError as e: 963 | logger.debug("Validating the rules failed: %s", e.output) 964 | return False, ", ".join( 965 | [ 966 | line 967 | for line in e.output.decode("utf8").splitlines() 968 | if "error validating" in line 969 | ] 970 | ) 971 | 972 | def inject_label_matchers(self, expression, topology) -> str: 973 | """Add label matchers to an expression.""" 974 | if not topology: 975 | return expression 976 | if not self.path: 977 | logger.debug("`cos-tool` unavailable. 
Leaving expression unchanged: %s", expression) 978 | return expression 979 | args = [str(self.path), "transform"] 980 | args.extend( 981 | ["--label-matcher={}={}".format(key, value) for key, value in topology.items()] 982 | ) 983 | 984 | args.extend(["{}".format(expression)]) 985 | # noinspection PyBroadException 986 | try: 987 | return self._exec(args) 988 | except subprocess.CalledProcessError as e: 989 | logger.debug('Applying the expression failed: "%s", falling back to the original', e) 990 | return expression 991 | 992 | def _get_tool_path(self) -> Optional[Path]: 993 | arch = platform.machine() 994 | arch = "amd64" if arch == "x86_64" else arch 995 | res = "cos-tool-{}".format(arch) 996 | try: 997 | path = Path(res).resolve(strict=True) 998 | return path 999 | except (FileNotFoundError, OSError): 1000 | logger.debug('Could not locate cos-tool at: "{}"'.format(res)) 1001 | return None 1002 | 1003 | def _exec(self, cmd) -> str: 1004 | result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 1005 | return result.stdout.decode("utf-8").strip() 1006 | -------------------------------------------------------------------------------- /lib/charms/prometheus_k8s/v0/prometheus_scrape.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | """Prometheus Scrape Library. 4 | 5 | ## Overview 6 | 7 | This document explains how to integrate with the Prometheus charm 8 | for the purpose of providing a metrics endpoint to Prometheus. It 9 | also explains how alternative implementations of the Prometheus charms 10 | may maintain the same interface and be backward compatible with all 11 | currently integrated charms. Finally, this document is the 12 | authoritative reference on the structure of relation data that is 13 | shared between Prometheus charms and any other charm that intends to 14 | provide a scrape target for Prometheus. 15 | 16 | ## Source code 17 | 18 | Source code can be found on GitHub at: 19 | https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s 20 | 21 | ## Provider Library Usage 22 | 23 | This Prometheus charm interacts with its scrape targets using its 24 | charm library. Charms seeking to expose metric endpoints for the 25 | Prometheus charm must do so using the `MetricsEndpointProvider` 26 | object from this charm library. For the simplest use cases, using the 27 | `MetricsEndpointProvider` object only requires instantiating it, 28 | typically in the constructor of your charm (the one which exposes a 29 | metrics endpoint). The `MetricsEndpointProvider` constructor requires 30 | the name of the relation over which a scrape target (metrics endpoint) 31 | is exposed to the Prometheus charm. This relation must use the 32 | `prometheus_scrape` interface. By default, the address of the metrics 33 | endpoint is set to the unit IP address by each unit of the 34 | `MetricsEndpointProvider` charm. These units set their address in 35 | response to the `PebbleReady` event of each container in the unit, 36 | since container restarts of Kubernetes charms can result in a change of 37 | IP addresses. The default name for the metrics endpoint relation is 38 | `metrics-endpoint`. It is strongly recommended to use the same 39 | relation name for consistency across charms; doing so also obviates the 40 | need for an additional constructor argument. 
The
41 | `MetricsEndpointProvider` object may be instantiated as follows
42 | 
43 |     from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider
44 | 
45 |     def __init__(self, *args):
46 |         super().__init__(*args)
47 |         ...
48 |         self.metrics_endpoint = MetricsEndpointProvider(self)
49 |         ...
50 | 
51 | Note that the first argument (`self`) to `MetricsEndpointProvider` is
52 | always a reference to the parent (scrape target) charm.
53 | 
54 | An instantiated `MetricsEndpointProvider` object will ensure that each
55 | unit of its parent charm is a scrape target for the
56 | `MetricsEndpointConsumer` (Prometheus) charm. By default
57 | `MetricsEndpointProvider` assumes each unit of the consumer charm
58 | exports its metrics at a path given by `/metrics` on port 80. These
59 | defaults may be changed by providing the `MetricsEndpointProvider`
60 | constructor an optional argument (`jobs`) that represents a
61 | Prometheus scrape job specification using Python standard data
62 | structures. This job specification is a subset of Prometheus' own
63 | [scrape
64 | configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config)
65 | format but represented using Python data structures. More than one job
66 | may be provided using the `jobs` argument. Hence `jobs` accepts a list
67 | of dictionaries where each dictionary represents one `<scrape_config>`
68 | object as described in the Prometheus documentation. The currently
69 | supported configuration subset is: `job_name`, `metrics_path`,
70 | `static_configs`.
71 | 
72 | Suppose it is required to change the port on which scraped metrics are
73 | exposed to 8000. This may be done by providing the following data
74 | structure as the value of `jobs`.
75 | 
76 | ```
77 | [
78 |     {
79 |         "static_configs": [
80 |             {
81 |                 "targets": ["*:8000"]
82 |             }
83 |         ]
84 |     }
85 | ]
86 | ```
87 | 
88 | The wildcard ("*") host specification implies that the scrape targets
89 | will automatically be set to the host addresses advertised by each
90 | unit of the consumer charm.
91 | 
92 | It is also possible to change the metrics path and scrape multiple
93 | ports, for example
94 | 
95 | ```
96 | [
97 |     {
98 |         "metrics_path": "/my-metrics-path",
99 |         "static_configs": [
100 |             {
101 |                 "targets": ["*:8000", "*:8081"],
102 |             }
103 |         ]
104 |     }
105 | ]
106 | ```
107 | 
108 | More complex scrape configurations are possible. For example
109 | 
110 | ```
111 | [
112 |     {
113 |         "static_configs": [
114 |             {
115 |                 "targets": ["10.1.32.215:7000", "*:8000"],
116 |                 "labels": {
117 |                     "some_key": "some-value"
118 |                 }
119 |             }
120 |         ]
121 |     }
122 | ]
123 | ```
124 | 
125 | This example scrapes the target "10.1.32.215" at port 7000 in addition
126 | to scraping each unit at port 8000. There is, however, one difference
127 | between wildcard targets (specified using "*") and fully qualified
128 | targets (such as "10.1.32.215"). The Prometheus charm automatically
129 | associates labels with metrics generated by each target. These labels
130 | localise the source of metrics within the Juju topology by specifying
131 | its "model name", "model UUID", "application name" and "unit
132 | name". However, the unit name is associated only with wildcard targets,
133 | not with fully qualified targets.
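
Assuming your charm keeps a reference to the provider (the attribute name
`metrics_endpoint` below is illustrative rather than required), a mixed
configuration like the previous example is passed directly to the
constructor:

```
self.metrics_endpoint = MetricsEndpointProvider(
    self,
    jobs=[
        {
            "static_configs": [
                {
                    "targets": ["10.1.32.215:7000", "*:8000"],
                    "labels": {"some_key": "some-value"},
                }
            ]
        }
    ],
)
```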
134 | 
135 | Multiple jobs with different metrics paths and labels are allowed, but
136 | each job must be given a unique name:
137 | 
138 | ```
139 | [
140 |     {
141 |         "job_name": "my-first-job",
142 |         "metrics_path": "one-path",
143 |         "static_configs": [
144 |             {
145 |                 "targets": ["*:7000"],
146 |                 "labels": {
147 |                     "some_key": "some-value"
148 |                 }
149 |             }
150 |         ]
151 |     },
152 |     {
153 |         "job_name": "my-second-job",
154 |         "metrics_path": "another-path",
155 |         "static_configs": [
156 |             {
157 |                 "targets": ["*:8000"],
158 |                 "labels": {
159 |                     "some_other_key": "some-other-value"
160 |                 }
161 |             }
162 |         ]
163 |     }
164 | ]
165 | ```
166 | 
167 | **Important:** `job_name` should be a fixed string (e.g. a hardcoded literal).
168 | For instance, if you include variable elements, like your `unit.name`, it may break
169 | the continuity of the metrics time series gathered by Prometheus when the leader unit
170 | changes (e.g. on upgrade or rescale).
171 | 
172 | Additionally, it is also technically possible, but **strongly discouraged**, to
173 | configure the following scrape-related settings, which behave as described by the
174 | [Prometheus documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config):
175 | 
176 | - `static_configs`
177 | - `scrape_interval`
178 | - `scrape_timeout`
179 | - `proxy_url`
180 | - `relabel_configs`
181 | - `metric_relabel_configs`
182 | - `sample_limit`
183 | - `label_limit`
184 | - `label_name_length_limit`
185 | - `label_value_length_limit`
186 | 
187 | The settings above are supported by the `prometheus_scrape` library only for the sake of
188 | specialized facilities like the [Prometheus Scrape Config](https://charmhub.io/prometheus-scrape-config-k8s)
189 | charm. Virtually no charms should use these settings, and charmers definitely **should not**
190 | expose them to the Juju administrator via configuration options.
191 | 
192 | ## Consumer Library Usage
193 | 
194 | The `MetricsEndpointConsumer` object may be used by Prometheus
195 | charms to manage relations with their scrape targets. For this
196 | purpose, a Prometheus charm needs to do two things
197 | 
198 | 1. Instantiate the `MetricsEndpointConsumer` object by providing it a
199 | reference to the parent (Prometheus) charm and optionally the name of
200 | the relation that the Prometheus charm uses to interact with scrape
201 | targets. This relation must conform to the `prometheus_scrape`
202 | interface, and it is strongly recommended that this relation be named
203 | `metrics-endpoint`, which is its default value.
204 | 
205 | For example a Prometheus charm may instantiate the
206 | `MetricsEndpointConsumer` in its constructor as follows
207 | 
208 |     from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer
209 | 
210 |     def __init__(self, *args):
211 |         super().__init__(*args)
212 |         ...
213 |         self.metrics_consumer = MetricsEndpointConsumer(self)
214 |         ...
215 | 
216 | 2. A Prometheus charm also needs to respond to the
217 | `TargetsChangedEvent` event of the `MetricsEndpointConsumer` by adding itself as
218 | an observer for these events, as in
219 | 
220 |     self.framework.observe(
221 |         self.metrics_consumer.on.targets_changed,
222 |         self._on_scrape_targets_changed,
223 |     )
224 | 
225 | In responding to the `TargetsChangedEvent` event, the Prometheus
226 | charm must update the Prometheus configuration so that any new scrape
227 | targets are added and/or old ones removed from the list of scraped
228 | endpoints.
For this purpose, the `MetricsEndpointConsumer` object
229 | exposes a `jobs()` method that returns a list of scrape jobs. Each
230 | element of this list is the Prometheus scrape configuration for that
231 | job. In order to update the Prometheus configuration, the Prometheus
232 | charm needs to replace the current list of jobs with the list provided
233 | by `jobs()` as follows
234 | 
235 |     def _on_scrape_targets_changed(self, event):
236 |         ...
237 |         scrape_jobs = self.metrics_consumer.jobs()
238 |         for job in scrape_jobs:
239 |             prometheus_scrape_config.append(job)
240 |         ...
241 | 
242 | ## Alerting Rules
243 | 
244 | This charm library also supports gathering alerting rules from all
245 | related `MetricsEndpointProvider` charms and enabling corresponding alerts within the
246 | Prometheus charm. Alert rules are automatically gathered by `MetricsEndpointProvider`
247 | charms when using this library, from a directory conventionally named
248 | `prometheus_alert_rules`. This directory must reside at the top level
249 | in the `src` folder of the consumer charm. Each file in this directory
250 | is assumed to be in one of two formats:
251 | - the official Prometheus alert rule format, conforming to the
252 | [Prometheus docs](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
253 | - a single rule format, which is a simplified subset of the official format,
254 | comprising a single alert rule per file, using the same YAML fields.
255 | 
256 | The file name must have one of the following extensions:
257 | - `.rule`
258 | - `.rules`
259 | - `.yml`
260 | - `.yaml`
261 | 
262 | An example of the contents of such a file in the custom single rule
263 | format is shown below.
264 | 
265 | ```
266 | alert: HighRequestLatency
267 | expr: job:request_latency_seconds:mean5m{my_key=my_value} > 0.5
268 | for: 10m
269 | labels:
270 |   severity: Medium
271 |   type: HighLatency
272 | annotations:
273 |   summary: High request latency for {{ $labels.instance }}.
274 | ```
275 | 
276 | The `MetricsEndpointProvider` will read all available alert rules and
277 | also inject "filtering labels" into the alert expressions. The
278 | filtering labels ensure that alert rules are localised to the metrics
279 | provider charm's Juju topology (application, model and its UUID). Such
280 | a topology filter is essential to ensure that alert rules submitted by
281 | one provider charm generate alerts only for that same charm. When
282 | alert rules are embedded in a charm, and the charm is deployed as a
283 | Juju application, the alert rules from that application have their
284 | expressions automatically updated to filter for metrics coming from
285 | the units of that application alone. This removes the risk of spurious
286 | evaluation, e.g., when you have multiple deployments of the same charm
287 | monitored by the same Prometheus.
288 | 
289 | Not all alerts one may want to specify can be embedded in a
290 | charm. Some alert rules will be specific to a user's use case. This is
291 | the case, for example, for alert rules based on business
292 | constraints, like expecting a certain amount of requests to a specific
293 | API every five minutes. Such alert rules can be specified via the
294 | [COS Config Charm](https://charmhub.io/cos-configuration-k8s),
295 | which allows importing alert rules and other settings like dashboards
296 | from a Git repository.
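
For reference, the single-rule example above could equivalently be shipped
in the official Prometheus "groups" format; a minimal sketch (the group
name is arbitrary):

```
groups:
- name: HighRequestLatencyGroup
  rules:
  - alert: HighRequestLatency
    expr: job:request_latency_seconds:mean5m{my_key=my_value} > 0.5
    for: 10m
    labels:
      severity: Medium
```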
297 | 298 | Gathering alert rules and generating rule files within the Prometheus 299 | charm is easily done using the `alerts()` method of 300 | `MetricsEndpointConsumer`. Alerts generated by Prometheus will 301 | automatically include Juju topology labels in the alerts. These labels 302 | indicate the source of the alert. The following labels are 303 | automatically included with each alert 304 | 305 | - `juju_model` 306 | - `juju_model_uuid` 307 | - `juju_application` 308 | 309 | ## Relation Data 310 | 311 | The Prometheus charm uses both application and unit relation data to 312 | obtain information regarding its scrape jobs, alert rules and scrape 313 | targets. This relation data is in JSON format and it closely resembles 314 | the YAML structure of Prometheus [scrape configuration] 315 | (https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config). 316 | 317 | Units of Metrics provider charms advertise their names and addresses 318 | over unit relation data using the `prometheus_scrape_unit_name` and 319 | `prometheus_scrape_unit_address` keys. While the `scrape_metadata`, 320 | `scrape_jobs` and `alert_rules` keys in application relation data 321 | of Metrics provider charms hold eponymous information. 322 | 323 | """ # noqa: W505 324 | 325 | import copy 326 | import hashlib 327 | import ipaddress 328 | import json 329 | import logging 330 | import os 331 | import platform 332 | import re 333 | import socket 334 | import subprocess 335 | import tempfile 336 | from collections import defaultdict 337 | from pathlib import Path 338 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union 339 | from urllib.parse import urlparse 340 | 341 | import yaml 342 | from cosl import JujuTopology 343 | from cosl.rules import AlertRules, generic_alert_groups 344 | from ops.charm import CharmBase, RelationRole 345 | from ops.framework import ( 346 | BoundEvent, 347 | EventBase, 348 | EventSource, 349 | Object, 350 | ObjectEvents, 351 | StoredDict, 352 | StoredList, 353 | ) 354 | from ops.model import Relation 355 | 356 | # The unique Charmhub library identifier, never change it 357 | LIBID = "bc84295fef5f4049878f07b131968ee2" 358 | 359 | # Increment this major API version when introducing breaking changes 360 | LIBAPI = 0 361 | 362 | # Increment this PATCH version before using `charmcraft publish-lib` or reset 363 | # to 0 if you are raising the major API version 364 | LIBPATCH = 56 365 | 366 | # Version 0.0.53 needed for cosl.rules.generic_alert_groups 367 | PYDEPS = ["cosl>=0.0.53"] 368 | 369 | logger = logging.getLogger(__name__) 370 | 371 | 372 | ALLOWED_KEYS = { 373 | "job_name", 374 | "metrics_path", 375 | "static_configs", 376 | "scrape_interval", 377 | "scrape_timeout", 378 | "proxy_url", 379 | "relabel_configs", 380 | "metric_relabel_configs", 381 | "sample_limit", 382 | "label_limit", 383 | "label_name_length_limit", 384 | "label_value_length_limit", 385 | "scheme", 386 | "basic_auth", 387 | "tls_config", 388 | "authorization", 389 | "params", 390 | } 391 | DEFAULT_JOB = { 392 | "metrics_path": "/metrics", 393 | "static_configs": [{"targets": ["*:80"]}], 394 | } 395 | 396 | 397 | DEFAULT_RELATION_NAME = "metrics-endpoint" 398 | RELATION_INTERFACE_NAME = "prometheus_scrape" 399 | 400 | DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" 401 | 402 | 403 | class PrometheusConfig: 404 | """A namespace for utility functions for manipulating the prometheus config dict.""" 405 | 406 | # relabel instance labels so that instance 
identifiers are globally unique 407 | # stable over unit recreation 408 | topology_relabel_config = { 409 | "source_labels": ["juju_model", "juju_model_uuid", "juju_application"], 410 | "separator": "_", 411 | "target_label": "instance", 412 | "regex": "(.*)", 413 | } 414 | 415 | topology_relabel_config_wildcard = { 416 | "source_labels": ["juju_model", "juju_model_uuid", "juju_application", "juju_unit"], 417 | "separator": "_", 418 | "target_label": "instance", 419 | "regex": "(.*)", 420 | } 421 | 422 | @staticmethod 423 | def sanitize_scrape_config(job: dict) -> dict: 424 | """Restrict permissible scrape configuration options. 425 | 426 | If job is empty then a default job is returned. The 427 | default job is 428 | 429 | ``` 430 | { 431 | "metrics_path": "/metrics", 432 | "static_configs": [{"targets": ["*:80"]}], 433 | } 434 | ``` 435 | 436 | Args: 437 | job: a dict containing a single Prometheus job 438 | specification. 439 | 440 | Returns: 441 | a dictionary containing a sanitized job specification. 442 | """ 443 | sanitized_job = DEFAULT_JOB.copy() 444 | sanitized_job.update({key: value for key, value in job.items() if key in ALLOWED_KEYS}) 445 | return sanitized_job 446 | 447 | @staticmethod 448 | def sanitize_scrape_configs(scrape_configs: List[dict]) -> List[dict]: 449 | """A vectorized version of `sanitize_scrape_config`.""" 450 | return [PrometheusConfig.sanitize_scrape_config(job) for job in scrape_configs] 451 | 452 | @staticmethod 453 | def prefix_job_names(scrape_configs: List[dict], prefix: str) -> List[dict]: 454 | """Adds the given prefix to all the job names in the given scrape_configs list.""" 455 | modified_scrape_configs = [] 456 | for scrape_config in scrape_configs: 457 | job_name = scrape_config.get("job_name") 458 | modified = scrape_config.copy() 459 | modified["job_name"] = prefix + "_" + job_name if job_name else prefix 460 | modified_scrape_configs.append(modified) 461 | 462 | return modified_scrape_configs 463 | 464 | @staticmethod 465 | def expand_wildcard_targets_into_individual_jobs( 466 | scrape_jobs: List[dict], 467 | hosts: Dict[str, Tuple[str, str]], 468 | topology: Optional[JujuTopology] = None, 469 | ) -> List[dict]: 470 | """Extract wildcard hosts from the given scrape_configs list into separate jobs. 471 | 472 | Args: 473 | scrape_jobs: list of scrape jobs. 474 | hosts: a dictionary mapping host names to host address for 475 | all units of the relation for which this job configuration 476 | must be constructed. 477 | topology: optional arg for adding topology labels to scrape targets. 478 | """ 479 | # hosts = self._relation_hosts(relation) 480 | 481 | modified_scrape_jobs = [] 482 | for job in scrape_jobs: 483 | static_configs = job.get("static_configs") 484 | if not static_configs: 485 | continue 486 | 487 | # When a single unit specified more than one wildcard target, then they are expanded 488 | # into a static_config per target 489 | non_wildcard_static_configs = [] 490 | 491 | for static_config in static_configs: 492 | targets = static_config.get("targets") 493 | if not targets: 494 | continue 495 | 496 | # All non-wildcard targets remain in the same static_config 497 | non_wildcard_targets = [] 498 | 499 | # All wildcard targets are extracted to a job per unit. If multiple wildcard 500 | # targets are specified, they remain in the same static_config (per unit). 501 | wildcard_targets = [] 502 | 503 | for target in targets: 504 | match = re.compile(r"\*(?:(:\d+))?").match(target) 505 | if match: 506 | # This is a wildcard target. 
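                        # (The regex above matches targets starting with "*",
                        # optionally followed by ":<port>", e.g. "*:8000".)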
507 | # Need to expand into separate jobs and remove it from this job here 508 | wildcard_targets.append(target) 509 | else: 510 | # This is not a wildcard target. Copy it over into its own static_config. 511 | non_wildcard_targets.append(target) 512 | 513 | # All non-wildcard targets remain in the same static_config 514 | if non_wildcard_targets: 515 | non_wildcard_static_config = static_config.copy() 516 | non_wildcard_static_config["targets"] = non_wildcard_targets 517 | 518 | if topology: 519 | # When non-wildcard targets (aka fully qualified hostnames) are specified, 520 | # there is no reliable way to determine the name (Juju topology unit name) 521 | # for such a target. Therefore labeling with Juju topology, excluding the 522 | # unit name. 523 | non_wildcard_static_config["labels"] = { 524 | **topology.label_matcher_dict, 525 | **non_wildcard_static_config.get("labels", {}), 526 | } 527 | 528 | non_wildcard_static_configs.append(non_wildcard_static_config) 529 | 530 | # Extract wildcard targets into individual jobs 531 | if wildcard_targets: 532 | for unit_name, (unit_hostname, unit_path) in hosts.items(): 533 | modified_job = job.copy() 534 | modified_job["static_configs"] = [static_config.copy()] 535 | modified_static_config = modified_job["static_configs"][0] 536 | modified_static_config["targets"] = [ 537 | target.replace("*", unit_hostname) for target in wildcard_targets 538 | ] 539 | 540 | unit_num = unit_name.split("/")[-1] 541 | job_name = modified_job.get("job_name", "unnamed-job") + "-" + unit_num 542 | modified_job["job_name"] = job_name 543 | modified_job["metrics_path"] = unit_path + ( 544 | job.get("metrics_path") or "/metrics" 545 | ) 546 | 547 | if topology: 548 | # Add topology labels 549 | modified_static_config["labels"] = { 550 | **topology.label_matcher_dict, 551 | **{"juju_unit": unit_name}, 552 | **modified_static_config.get("labels", {}), 553 | } 554 | 555 | # Instance relabeling for topology should be last in order. 556 | modified_job["relabel_configs"] = modified_job.get( 557 | "relabel_configs", [] 558 | ) + [PrometheusConfig.topology_relabel_config_wildcard] 559 | 560 | modified_scrape_jobs.append(modified_job) 561 | 562 | if non_wildcard_static_configs: 563 | modified_job = job.copy() 564 | modified_job["static_configs"] = non_wildcard_static_configs 565 | modified_job["metrics_path"] = modified_job.get("metrics_path") or "/metrics" 566 | 567 | if topology: 568 | # Instance relabeling for topology should be last in order. 569 | modified_job["relabel_configs"] = modified_job.get("relabel_configs", []) + [ 570 | PrometheusConfig.topology_relabel_config 571 | ] 572 | 573 | modified_scrape_jobs.append(modified_job) 574 | 575 | return modified_scrape_jobs 576 | 577 | @staticmethod 578 | def render_alertmanager_static_configs(alertmanagers: List[str]): 579 | """Render the alertmanager static_configs section from a list of URLs. 580 | 581 | Each target must be in the hostname:port format, and prefixes are specified in a separate 582 | key. Therefore, with ingress in place, would need to extract the path into the 583 | `path_prefix` key, which is higher up in the config hierarchy. 584 | 585 | https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config 586 | 587 | Args: 588 | alertmanagers: List of alertmanager URLs. 589 | 590 | Returns: 591 | A dict representation for the static_configs section. 592 | """ 593 | # Make sure it's a valid url so urlparse could parse it. 
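        # e.g. a bare "am.example:9093" becomes "http://am.example:9093"; without
        # a scheme, urlparse would read "am.example" as the scheme, not the host.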
594 | scheme = re.compile(r"^https?://") 595 | sanitized = [am if scheme.search(am) else "http://" + am for am in alertmanagers] 596 | 597 | # Create a mapping from paths to netlocs 598 | # Group alertmanager targets into a dictionary of lists: 599 | # {path: [netloc1, netloc2]} 600 | paths = defaultdict(list) # type: Dict[Tuple[str, str], List[str]] 601 | for parsed in map(urlparse, sanitized): 602 | path = parsed.path or "/" 603 | paths[(parsed.scheme, path)].append(parsed.netloc) 604 | 605 | return { 606 | "alertmanagers": [ 607 | { 608 | # For https we still do not render a `tls_config` section because 609 | # certs are expected to be made available by the charm via the 610 | # `update-ca-certificates` mechanism. 611 | "scheme": scheme, 612 | "path_prefix": path_prefix, 613 | "static_configs": [{"targets": netlocs}], 614 | } 615 | for (scheme, path_prefix), netlocs in paths.items() 616 | ] 617 | } 618 | 619 | 620 | class RelationNotFoundError(Exception): 621 | """Raised if there is no relation with the given name is found.""" 622 | 623 | def __init__(self, relation_name: str): 624 | self.relation_name = relation_name 625 | self.message = "No relation named '{}' found".format(relation_name) 626 | 627 | super().__init__(self.message) 628 | 629 | 630 | class RelationInterfaceMismatchError(Exception): 631 | """Raised if the relation with the given name has a different interface.""" 632 | 633 | def __init__( 634 | self, 635 | relation_name: str, 636 | expected_relation_interface: str, 637 | actual_relation_interface: str, 638 | ): 639 | self.relation_name = relation_name 640 | self.expected_relation_interface = expected_relation_interface 641 | self.actual_relation_interface = actual_relation_interface 642 | self.message = ( 643 | "The '{}' relation has '{}' as interface rather than the expected '{}'".format( 644 | relation_name, actual_relation_interface, expected_relation_interface 645 | ) 646 | ) 647 | 648 | super().__init__(self.message) 649 | 650 | 651 | class RelationRoleMismatchError(Exception): 652 | """Raised if the relation with the given name has a different role.""" 653 | 654 | def __init__( 655 | self, 656 | relation_name: str, 657 | expected_relation_role: RelationRole, 658 | actual_relation_role: RelationRole, 659 | ): 660 | self.relation_name = relation_name 661 | self.expected_relation_interface = expected_relation_role 662 | self.actual_relation_role = actual_relation_role 663 | self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( 664 | relation_name, repr(actual_relation_role), repr(expected_relation_role) 665 | ) 666 | 667 | super().__init__(self.message) 668 | 669 | 670 | class InvalidAlertRuleEvent(EventBase): 671 | """Event emitted when alert rule files are not parsable. 672 | 673 | Enables us to set a clear status on the provider. 
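
    A sketch of how the provider side emits this event (this is the event
    behind `on.alert_rule_status_changed`; the error string is illustrative):

        self.on.alert_rule_status_changed.emit(valid=False, errors="...")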
674 | """ 675 | 676 | def __init__(self, handle, errors: str = "", valid: bool = False): 677 | super().__init__(handle) 678 | self.errors = errors 679 | self.valid = valid 680 | 681 | def snapshot(self) -> Dict: 682 | """Save alert rule information.""" 683 | return { 684 | "valid": self.valid, 685 | "errors": self.errors, 686 | } 687 | 688 | def restore(self, snapshot): 689 | """Restore alert rule information.""" 690 | self.valid = snapshot["valid"] 691 | self.errors = snapshot["errors"] 692 | 693 | 694 | class InvalidScrapeJobEvent(EventBase): 695 | """Event emitted when alert rule files are not valid.""" 696 | 697 | def __init__(self, handle, errors: str = ""): 698 | super().__init__(handle) 699 | self.errors = errors 700 | 701 | def snapshot(self) -> Dict: 702 | """Save error information.""" 703 | return {"errors": self.errors} 704 | 705 | def restore(self, snapshot): 706 | """Restore error information.""" 707 | self.errors = snapshot["errors"] 708 | 709 | 710 | class MetricsEndpointProviderEvents(ObjectEvents): 711 | """Events raised by :class:`InvalidAlertRuleEvent`s.""" 712 | 713 | alert_rule_status_changed = EventSource(InvalidAlertRuleEvent) 714 | invalid_scrape_job = EventSource(InvalidScrapeJobEvent) 715 | 716 | 717 | def _type_convert_stored(obj): 718 | """Convert Stored* to their appropriate types, recursively.""" 719 | if isinstance(obj, StoredList): 720 | return list(map(_type_convert_stored, obj)) 721 | if isinstance(obj, StoredDict): 722 | rdict = {} # type: Dict[Any, Any] 723 | for k in obj.keys(): 724 | rdict[k] = _type_convert_stored(obj[k]) 725 | return rdict 726 | return obj 727 | 728 | 729 | def _validate_relation_by_interface_and_direction( 730 | charm: CharmBase, 731 | relation_name: str, 732 | expected_relation_interface: str, 733 | expected_relation_role: RelationRole, 734 | ): 735 | """Verifies that a relation has the necessary characteristics. 736 | 737 | Verifies that the `relation_name` provided: (1) exists in metadata.yaml, 738 | (2) declares as interface the interface name passed as `relation_interface` 739 | and (3) has the right "direction", i.e., it is a relation that `charm` 740 | provides or requires. 741 | 742 | Args: 743 | charm: a `CharmBase` object to scan for the matching relation. 744 | relation_name: the name of the relation to be verified. 745 | expected_relation_interface: the interface name to be matched by the 746 | relation named `relation_name`. 747 | expected_relation_role: whether the `relation_name` must be either 748 | provided or required by `charm`. 749 | 750 | Raises: 751 | RelationNotFoundError: If there is no relation in the charm's metadata.yaml 752 | with the same name as provided via `relation_name` argument. 753 | RelationInterfaceMismatchError: The relation with the same name as provided 754 | via `relation_name` argument does not have the same relation interface 755 | as specified via the `expected_relation_interface` argument. 756 | RelationRoleMismatchError: If the relation with the same name as provided 757 | via `relation_name` argument does not have the same role as specified 758 | via the `expected_relation_role` argument. 
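
    For example, the provider in this module effectively guards its own
    construction with:

        _validate_relation_by_interface_and_direction(
            charm, "metrics-endpoint", "prometheus_scrape", RelationRole.provides
        )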
759 | """ 760 | if relation_name not in charm.meta.relations: 761 | raise RelationNotFoundError(relation_name) 762 | 763 | relation = charm.meta.relations[relation_name] 764 | 765 | actual_relation_interface = relation.interface_name 766 | if actual_relation_interface != expected_relation_interface: 767 | raise RelationInterfaceMismatchError( 768 | relation_name, expected_relation_interface, actual_relation_interface or "None" 769 | ) 770 | 771 | if expected_relation_role == RelationRole.provides: 772 | if relation_name not in charm.meta.provides: 773 | raise RelationRoleMismatchError( 774 | relation_name, RelationRole.provides, RelationRole.requires 775 | ) 776 | elif expected_relation_role == RelationRole.requires: 777 | if relation_name not in charm.meta.requires: 778 | raise RelationRoleMismatchError( 779 | relation_name, RelationRole.requires, RelationRole.provides 780 | ) 781 | else: 782 | raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) 783 | 784 | 785 | class InvalidAlertRulePathError(Exception): 786 | """Raised if the alert rules folder cannot be found or is otherwise invalid.""" 787 | 788 | def __init__( 789 | self, 790 | alert_rules_absolute_path: Path, 791 | message: str, 792 | ): 793 | self.alert_rules_absolute_path = alert_rules_absolute_path 794 | self.message = message 795 | 796 | super().__init__(self.message) 797 | 798 | 799 | class TargetsChangedEvent(EventBase): 800 | """Event emitted when Prometheus scrape targets change.""" 801 | 802 | def __init__(self, handle, relation_id): 803 | super().__init__(handle) 804 | self.relation_id = relation_id 805 | 806 | def snapshot(self): 807 | """Save scrape target relation information.""" 808 | return {"relation_id": self.relation_id} 809 | 810 | def restore(self, snapshot): 811 | """Restore scrape target relation information.""" 812 | self.relation_id = snapshot["relation_id"] 813 | 814 | 815 | class MonitoringEvents(ObjectEvents): 816 | """Event descriptor for events raised by `MetricsEndpointConsumer`.""" 817 | 818 | targets_changed = EventSource(TargetsChangedEvent) 819 | 820 | 821 | class MetricsEndpointConsumer(Object): 822 | """A Prometheus based Monitoring service.""" 823 | 824 | on = MonitoringEvents() # pyright: ignore 825 | 826 | def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): 827 | """A Prometheus based Monitoring service. 828 | 829 | Args: 830 | charm: a `CharmBase` instance that manages this 831 | instance of the Prometheus service. 832 | relation_name: an optional string name of the relation between `charm` 833 | and the Prometheus charmed service. The default is "metrics-endpoint". 834 | It is strongly advised not to change the default, so that people 835 | deploying your charm will have a consistent experience with all 836 | other charms that consume metrics endpoints. 837 | 838 | Raises: 839 | RelationNotFoundError: If there is no relation in the charm's metadata.yaml 840 | with the same name as provided via `relation_name` argument. 841 | RelationInterfaceMismatchError: The relation with the same name as provided 842 | via `relation_name` argument does not have the `prometheus_scrape` relation 843 | interface. 844 | RelationRoleMismatchError: If the relation with the same name as provided 845 | via `relation_name` argument does not have the `RelationRole.requires` 846 | role. 
847 | """ 848 | _validate_relation_by_interface_and_direction( 849 | charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires 850 | ) 851 | 852 | super().__init__(charm, relation_name) 853 | self._charm = charm 854 | self._relation_name = relation_name 855 | self._tool = CosTool(self._charm) 856 | events = self._charm.on[relation_name] 857 | self.framework.observe(events.relation_changed, self._on_metrics_provider_relation_changed) 858 | self.framework.observe( 859 | events.relation_departed, self._on_metrics_provider_relation_departed 860 | ) 861 | 862 | def _on_metrics_provider_relation_changed(self, event): 863 | """Handle changes with related metrics providers. 864 | 865 | Anytime there are changes in relations between Prometheus 866 | and metrics provider charms the Prometheus charm is informed, 867 | through a `TargetsChangedEvent` event. The Prometheus charm can 868 | then choose to update its scrape configuration. 869 | 870 | Args: 871 | event: a `CharmEvent` in response to which the Prometheus 872 | charm must update its scrape configuration. 873 | """ 874 | rel_id = event.relation.id 875 | 876 | self.on.targets_changed.emit(relation_id=rel_id) 877 | 878 | def _on_metrics_provider_relation_departed(self, event): 879 | """Update job config when a metrics provider departs. 880 | 881 | When a metrics provider departs the Prometheus charm is informed 882 | through a `TargetsChangedEvent` event so that it can update its 883 | scrape configuration to ensure that the departed metrics provider 884 | is removed from the list of scrape jobs and 885 | 886 | Args: 887 | event: a `CharmEvent` that indicates a metrics provider 888 | unit has departed. 889 | """ 890 | rel_id = event.relation.id 891 | self.on.targets_changed.emit(relation_id=rel_id) 892 | 893 | def jobs(self) -> list: 894 | """Fetch the list of scrape jobs. 895 | 896 | Returns: 897 | A list consisting of all the static scrape configurations 898 | for each related `MetricsEndpointProvider` that has specified 899 | its scrape targets. 900 | """ 901 | scrape_jobs = [] 902 | 903 | for relation in self._charm.model.relations[self._relation_name]: 904 | static_scrape_jobs = self._static_scrape_config(relation) 905 | if static_scrape_jobs: 906 | # Duplicate job names will cause validate_scrape_jobs to fail. 907 | # Therefore we need to dedupe here and after all jobs are collected. 908 | static_scrape_jobs = _dedupe_job_names(static_scrape_jobs) 909 | try: 910 | self._tool.validate_scrape_jobs(static_scrape_jobs) 911 | except subprocess.CalledProcessError as e: 912 | if self._charm.unit.is_leader(): 913 | data = json.loads(relation.data[self._charm.app].get("event", "{}")) 914 | data["scrape_job_errors"] = str(e) 915 | relation.data[self._charm.app]["event"] = json.dumps(data) 916 | else: 917 | scrape_jobs.extend(static_scrape_jobs) 918 | 919 | scrape_jobs = _dedupe_job_names(scrape_jobs) 920 | 921 | return scrape_jobs 922 | 923 | @property 924 | def alerts(self) -> dict: 925 | """Fetch alerts for all relations. 926 | 927 | A Prometheus alert rules file consists of a list of "groups". Each 928 | group consists of a list of alerts (`rules`) that are sequentially 929 | executed. This method returns all the alert rules provided by each 930 | related metrics provider charm. These rules may be used to generate a 931 | separate alert rules file for each relation since the returned list 932 | of alert groups are indexed by that relations Juju topology identifier. 
933 |         The Juju topology identifier string includes substrings that identify
934 |         alert rule related metadata such as the Juju model, model UUID and the
935 |         application name from where the alert rule originates. Since this
936 |         topology identifier is globally unique, it may be used, for instance, as
937 |         the name for the file into which the list of alert rule groups are
938 |         written. For each relation, the structure of data returned is a dictionary
939 |         representation of a standard Prometheus rules file:
940 | 
941 |             {"groups": [{"name": ...}, ...]}
942 | 
943 |         per the official Prometheus documentation
944 |         https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
945 | 
946 |         The value of the `groups` key is such that it may be used to generate
947 |         a Prometheus alert rules file directly using `yaml.dump` but the
948 |         `groups` key itself must be included as this is required by Prometheus.
949 | 
950 |         For example the list of alert rule groups returned by this property may
951 |         be written into files consumed by Prometheus as follows
952 | 
953 |         ```
954 |         for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts.items():
955 |             filename = "juju_" + topology_identifier + ".rules"
956 |             path = os.path.join(PROMETHEUS_RULES_DIR, filename)
957 |             rules = yaml.safe_dump(alert_rule_groups)
958 |             container.push(path, rules, make_dirs=True)
959 |         ```
960 | 
961 |         Returns:
962 |             A dictionary mapping the Juju topology identifier of the source charm to
963 |             its list of alert rule groups.
964 |         """
965 |         alerts = {}  # type: Dict[str, dict]  # mapping b/w juju identifiers and alert rule files
966 |         for relation in self._charm.model.relations[self._relation_name]:
967 |             if not relation.units or not relation.app:
968 |                 continue
969 | 
970 |             alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}"))
971 |             if not alert_rules:
972 |                 continue
973 | 
974 |             alert_rules = self._inject_alert_expr_labels(alert_rules)
975 | 
976 |             identifier, topology = self._get_identifier_by_alert_rules(alert_rules)
977 |             if not topology:
978 |                 try:
979 |                     scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"])
980 |                     identifier = JujuTopology.from_dict(scrape_metadata).identifier
981 | 
982 |                 except KeyError as e:
983 |                     logger.debug(
984 |                         "Relation %s has no 'scrape_metadata': %s",
985 |                         relation.id,
986 |                         e,
987 |                     )
988 | 
989 |             if not identifier:
990 |                 logger.error(
991 |                     "Alert rules were found but no usable group or identifier was present."
992 |                 )
993 |                 continue
994 | 
995 |             # We need to append the relation info to the identifier. This is to allow for cases where there are two
996 |             # relations which eventually scrape the same application. Issue #551.
997 |             identifier = f"{identifier}_{relation.name}_{relation.id}"
998 | 
999 |             alerts[identifier] = alert_rules
1000 | 
1001 |             _, errmsg = self._tool.validate_alert_rules(alert_rules)
1002 |             if errmsg:
1003 |                 if alerts[identifier]:
1004 |                     del alerts[identifier]
1005 |                 if self._charm.unit.is_leader():
1006 |                     data = json.loads(relation.data[self._charm.app].get("event", "{}"))
1007 |                     data["errors"] = errmsg
1008 |                     relation.data[self._charm.app]["event"] = json.dumps(data)
1009 |                 continue
1010 | 
1011 |         return alerts
1012 | 
1013 |     def _get_identifier_by_alert_rules(
1014 |         self, rules: dict
1015 |     ) -> Tuple[Union[str, None], Union[JujuTopology, None]]:
1016 |         """Determine an appropriate dict key for alert rules.
1017 | 1018 | The key is used as the filename when writing alerts to disk, so the structure 1019 | and uniqueness is important. 1020 | 1021 | Args: 1022 | rules: a dict of alert rules 1023 | Returns: 1024 | A tuple containing an identifier, if found, and a JujuTopology, if it could 1025 | be constructed. 1026 | """ 1027 | if "groups" not in rules: 1028 | logger.debug("No alert groups were found in relation data") 1029 | return None, None 1030 | 1031 | # Construct an ID based on what's in the alert rules if they have labels 1032 | for group in rules["groups"]: 1033 | try: 1034 | labels = group["rules"][0]["labels"] 1035 | topology = JujuTopology( 1036 | # Don't try to safely get required constructor fields. There's already 1037 | # a handler for KeyErrors 1038 | model_uuid=labels["juju_model_uuid"], 1039 | model=labels["juju_model"], 1040 | application=labels["juju_application"], 1041 | unit=labels.get("juju_unit", ""), 1042 | charm_name=labels.get("juju_charm", ""), 1043 | ) 1044 | return topology.identifier, topology 1045 | except KeyError: 1046 | logger.debug("Alert rules were found but no usable labels were present") 1047 | continue 1048 | 1049 | logger.warning( 1050 | "No labeled alert rules were found, and no 'scrape_metadata' " 1051 | "was available. Using the alert group name as filename." 1052 | ) 1053 | try: 1054 | for group in rules["groups"]: 1055 | return group["name"], None 1056 | except KeyError: 1057 | logger.debug("No group name was found to use as identifier") 1058 | 1059 | return None, None 1060 | 1061 | def _inject_alert_expr_labels(self, rules: Dict[str, Any]) -> Dict[str, Any]: 1062 | """Iterate through alert rules and inject topology into expressions. 1063 | 1064 | Args: 1065 | rules: a dict of alert rules 1066 | """ 1067 | if "groups" not in rules: 1068 | return rules 1069 | 1070 | modified_groups = [] 1071 | for group in rules["groups"]: 1072 | # Copy off rules, so we don't modify an object we're iterating over 1073 | rules_copy = group["rules"] 1074 | for idx, rule in enumerate(rules_copy): 1075 | labels = rule.get("labels") 1076 | 1077 | if labels: 1078 | try: 1079 | topology = JujuTopology( 1080 | # Don't try to safely get required constructor fields. There's already 1081 | # a handler for KeyErrors 1082 | model_uuid=labels["juju_model_uuid"], 1083 | model=labels["juju_model"], 1084 | application=labels["juju_application"], 1085 | unit=labels.get("juju_unit", ""), 1086 | charm_name=labels.get("juju_charm", ""), 1087 | ) 1088 | 1089 | # Inject topology and put it back in the list 1090 | rule["expr"] = self._tool.inject_label_matchers( 1091 | re.sub(r"%%juju_topology%%,?", "", rule["expr"]), 1092 | topology.alert_expression_dict, 1093 | ) 1094 | except KeyError: 1095 | # Some required JujuTopology key is missing. Just move on. 1096 | pass 1097 | 1098 | group["rules"][idx] = rule 1099 | 1100 | modified_groups.append(group) 1101 | 1102 | rules["groups"] = modified_groups 1103 | return rules 1104 | 1105 | def _static_scrape_config(self, relation) -> list: 1106 | """Generate the static scrape configuration for a single relation. 1107 | 1108 | If the relation data includes `scrape_metadata` then the value 1109 | of this key is used to annotate the scrape jobs with Juju 1110 | Topology labels before returning them. 1111 | 1112 | Args: 1113 | relation: an `ops.model.Relation` object whose static 1114 | scrape configuration is required. 1115 | 1116 | Returns: 1117 | A list (possibly empty) of scrape jobs. 
Each job is a 1118 | valid Prometheus scrape configuration for that job, 1119 | represented as a Python dictionary. 1120 | """ 1121 | if not relation.units: 1122 | return [] 1123 | 1124 | scrape_configs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) 1125 | 1126 | if not scrape_configs: 1127 | return [] 1128 | 1129 | scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) 1130 | 1131 | if not scrape_metadata: 1132 | return scrape_configs 1133 | 1134 | topology = JujuTopology.from_dict(scrape_metadata) 1135 | 1136 | job_name_prefix = "juju_{}_prometheus_scrape".format(topology.identifier) 1137 | scrape_configs = PrometheusConfig.prefix_job_names(scrape_configs, job_name_prefix) 1138 | scrape_configs = PrometheusConfig.sanitize_scrape_configs(scrape_configs) 1139 | 1140 | hosts = self._relation_hosts(relation) 1141 | 1142 | scrape_configs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs( 1143 | scrape_configs, hosts, topology 1144 | ) 1145 | 1146 | # For https scrape targets we still do not render a `tls_config` section because certs 1147 | # are expected to be made available by the charm via the `update-ca-certificates` mechanism. 1148 | return scrape_configs 1149 | 1150 | def _relation_hosts(self, relation: Relation) -> Dict[str, Tuple[str, str]]: 1151 | """Returns a mapping from unit names to (address, path) tuples, for the given relation.""" 1152 | hosts = {} 1153 | for unit in relation.units: 1154 | if not (unit_databag := relation.data.get(unit)): 1155 | continue 1156 | 1157 | unit_path = unit_databag.get("prometheus_scrape_unit_path", "") 1158 | # TODO deprecate and remove unit.name 1159 | unit_name = unit_databag.get("prometheus_scrape_unit_name") or unit.name 1160 | # TODO deprecate and remove "prometheus_scrape_host" 1161 | unit_address = unit_databag.get("prometheus_scrape_unit_address") or unit_databag.get( 1162 | "prometheus_scrape_host" 1163 | ) 1164 | 1165 | if not (unit_name and unit_address): 1166 | continue 1167 | 1168 | hosts.update({unit_name: (unit_address, unit_path)}) 1169 | 1170 | return hosts 1171 | 1172 | def _target_parts(self, target) -> list: 1173 | """Extract host and port from a wildcard target. 1174 | 1175 | Args: 1176 | target: a string specifying a scrape target. A 1177 | scrape target is expected to have the format 1178 | "host:port". The host part may be a wildcard 1179 | "*" and the port part can be missing (along 1180 | with ":") in which case port is set to 80. 1181 | 1182 | Returns: 1183 | a list with target host and port as in [host, port] 1184 | """ 1185 | if ":" in target: 1186 | parts = target.split(":") 1187 | else: 1188 | parts = [target, "80"] 1189 | 1190 | return parts 1191 | 1192 | 1193 | def _dedupe_job_names(jobs: List[dict]): 1194 | """Deduplicate a list of dicts by appending a hash to the value of the 'job_name' key. 1195 | 1196 | Additionally, fully de-duplicate any identical jobs. 
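
    A sketch of the renaming behaviour (each suffix is the SHA-256 of the
    job's JSON; hashes shortened here for illustration):

        [{"job_name": "foo", "metrics_path": "/a"},
         {"job_name": "foo", "metrics_path": "/b"}]
        # becomes
        [{"job_name": "foo_3f8a...", "metrics_path": "/a"},
         {"job_name": "foo_9c41...", "metrics_path": "/b"}]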
1197 | 
1198 |     Args:
1199 |         jobs: A list of Prometheus scrape jobs
1200 |     """
1201 |     jobs_copy = copy.deepcopy(jobs)
1202 | 
1203 |     # Convert to a dict with job names as keys
1204 |     # I think this line is O(n^2) but it should be okay given the list sizes
1205 |     jobs_dict = {
1206 |         job["job_name"]: list(filter(lambda x: x["job_name"] == job["job_name"], jobs_copy))
1207 |         for job in jobs_copy
1208 |     }
1209 | 
1210 |     # If multiple jobs have the same name, convert the name to "name_<hash>"
1211 |     for key in jobs_dict:
1212 |         if len(jobs_dict[key]) > 1:
1213 |             for job in jobs_dict[key]:
1214 |                 job_json = json.dumps(job)
1215 |                 hashed = hashlib.sha256(job_json.encode()).hexdigest()
1216 |                 job["job_name"] = "{}_{}".format(job["job_name"], hashed)
1217 |     new_jobs = []
1218 |     for key in jobs_dict:
1219 |         new_jobs.extend(list(jobs_dict[key]))
1220 | 
1221 |     # Deduplicate jobs which are equal
1222 |     # Again this is O(n^2), but it should be okay
1223 |     deduped_jobs = []
1224 |     seen = []
1225 |     for job in new_jobs:
1226 |         job_json = json.dumps(job)
1227 |         hashed = hashlib.sha256(job_json.encode()).hexdigest()
1228 |         if hashed in seen:
1229 |             continue
1230 |         seen.append(hashed)
1231 |         deduped_jobs.append(job)
1232 | 
1233 |     return deduped_jobs
1234 | 
1235 | 
1236 | def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str:
1237 |     """Resolve the provided path items against the directory of the main file.
1238 | 
1239 |     Look up the directory of the `main.py` file being executed. This is normally
1240 |     going to be the charm.py file of the charm including this library. Then, resolve
1241 |     the provided path elements and, if the resulting path exists and is a directory,
1242 |     return its absolute path; otherwise, raise an exception.
1243 | 
1244 |     Raises:
1245 |         InvalidAlertRulePathError, if the path does not exist or is not a directory.
1246 |     """
1247 |     charm_dir = Path(str(charm.charm_dir))
1248 |     if not charm_dir.exists() or not charm_dir.is_dir():
1249 |         # Operator Framework does not currently expose a robust
1250 |         # way to determine the top level charm source directory
1251 |         # that is consistent across deployed charms and unit tests
1252 |         # Hence for unit tests the current working directory is used
1253 |         # TODO: update this logic when the following ticket is resolved
1254 |         # https://github.com/canonical/operator/issues/643
1255 |         charm_dir = Path(os.getcwd())
1256 | 
1257 |     alerts_dir_path = charm_dir.absolute().joinpath(*path_elements)
1258 | 
1259 |     if not alerts_dir_path.exists():
1260 |         raise InvalidAlertRulePathError(alerts_dir_path, "directory does not exist")
1261 |     if not alerts_dir_path.is_dir():
1262 |         raise InvalidAlertRulePathError(alerts_dir_path, "is not a directory")
1263 | 
1264 |     return str(alerts_dir_path)
1265 | 
1266 | 
1267 | class MetricsEndpointProvider(Object):
1268 |     """A metrics endpoint for Prometheus."""
1269 | 
1270 |     on = MetricsEndpointProviderEvents()  # pyright: ignore
1271 | 
1272 |     def __init__(
1273 |         self,
1274 |         charm,
1275 |         relation_name: str = DEFAULT_RELATION_NAME,
1276 |         jobs=None,
1277 |         alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH,
1278 |         refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None,
1279 |         external_url: str = "",
1280 |         lookaside_jobs_callable: Optional[Callable] = None,
1281 |         *,
1282 |         forward_alert_rules: bool = True,
1283 |     ):
1284 |         """Construct a metrics provider for a Prometheus charm.
1285 | 
1286 |         If your charm exposes a Prometheus metrics endpoint, the
1287 |         `MetricsEndpointProvider` object enables your charm to easily
1288 |         communicate how to reach that metrics endpoint.
1289 | 
1290 |         By default, a charm instantiating this object has the metrics
1291 |         endpoints of each of its units scraped by the related Prometheus
1292 |         charms. The scraped metrics are automatically tagged by the
1293 |         Prometheus charms with Juju topology data via the
1294 |         `juju_model_name`, `juju_model_uuid`, `juju_application_name`
1295 |         and `juju_unit` labels. To support such tagging `MetricsEndpointProvider`
1296 |         automatically forwards scrape metadata to a `MetricsEndpointConsumer`
1297 |         (Prometheus charm).
1298 | 
1299 |         Scrape targets provided by `MetricsEndpointProvider` can be
1300 |         customized when instantiating this object. For example in the
1301 |         case of a charm exposing the metrics endpoint for each of its
1302 |         units on port 8080 and the `/metrics` path, the
1303 |         `MetricsEndpointProvider` can be instantiated as follows:
1304 | 
1305 |             self.metrics_endpoint_provider = MetricsEndpointProvider(
1306 |                 self,
1307 |                 jobs=[{
1308 |                     "static_configs": [{"targets": ["*:8080"]}],
1309 |                 }])
1310 | 
1311 |         The notation `*:<port>` means "scrape each unit of this charm on port
1312 |         `<port>`".
1313 | 
1314 |         In case the metrics endpoints are not on the standard `/metrics` path,
1315 |         a custom path can be specified as follows:
1316 | 
1317 |             self.metrics_endpoint_provider = MetricsEndpointProvider(
1318 |                 self,
1319 |                 jobs=[{
1320 |                     "metrics_path": "/my/strange/metrics/path",
1321 |                     "static_configs": [{"targets": ["*:8080"]}],
1322 |                 }])
1323 | 
1324 |         Note how the `jobs` argument is a list: this allows you to expose multiple
1325 |         combinations of paths ("metrics_path") and targets ("static_configs") in case your charm
1326 |         exposes multiple endpoints, which could happen, for example, when you have
1327 |         multiple workload containers, with applications in each needing to be scraped.
1328 |         The structure of the objects in the `jobs` list is one-to-one with the
1329 |         `scrape_config` configuration item of Prometheus' own configuration (see
1330 |         https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
1331 |         ), but with only a subset of the fields allowed. The permitted fields are
1332 |         listed in the `ALLOWED_KEYS` object in this charm library module.
1333 | 
1334 |         It is also possible to specify alert rules. By default, this library will look
1335 |         into the `prometheus_alert_rules` directory under the charm source, which in a standard charm
1336 |         layout resolves to `src/prometheus_alert_rules`. Each alert rule goes into a
1337 |         separate `*.rule` file. If the syntax of a rule is invalid,
1338 |         the `MetricsEndpointProvider` logs an error and does not load the particular
1339 |         rule.
1340 | 
1341 |         To avoid false positives and negatives in the evaluation of alert rules,
1342 |         all ingested alert rule expressions are automatically qualified using Juju
1343 |         Topology filters. This ensures that alert rules provided by your charm trigger
1344 |         alerts based only on data scraped from your charm.
For example, an alert rule
1345 |         such as the following
1346 | 
1347 |             alert: UnitUnavailable
1348 |             expr: up < 1
1349 |             for: 0m
1350 | 
1351 |         will be automatically transformed into something along the lines of the following
1352 | 
1353 |             alert: UnitUnavailable
1354 |             expr: up{juju_model=<model_name>, juju_model_uuid=<model_uuid>, juju_application=<application_name>} < 1
1355 |             for: 0m
1356 | 
1357 |         An attempt will be made to validate alert rules prior to loading them into Prometheus.
1358 |         If they are invalid, an event will be emitted from this object which charms can respond
1359 |         to in order to set a meaningful status for administrators.
1360 | 
1361 |         This can be observed via `consumer.on.alert_rule_status_changed`, which contains:
1362 |         - The error(s) encountered when validating as `errors`
1363 |         - A `valid` attribute, which can be used to reset the state of charms if alert rules
1364 |           are updated via another mechanism (e.g. `cos-config`) and refreshed.
1365 | 
1366 |         Args:
1367 |             charm: a `CharmBase` object that manages this
1368 |                 `MetricsEndpointProvider` object. Typically, this is
1369 |                 `self` in the instantiating class.
1370 |             relation_name: an optional string name of the relation between `charm`
1371 |                 and the Prometheus charmed service. The default is "metrics-endpoint".
1372 |                 It is strongly advised not to change the default, so that people
1373 |                 deploying your charm will have a consistent experience with all
1374 |                 other charms that provide metrics endpoints.
1375 |             jobs: an optional list of dictionaries where each
1376 |                 dictionary represents the Prometheus scrape
1377 |                 configuration for a single job. When not provided, a
1378 |                 default scrape configuration is provided for the
1379 |                 `/metrics` endpoint polling all units of the charm on port `80`
1380 |                 using the `MetricsEndpointProvider` object.
1381 |             alert_rules_path: an optional path for the location of alert rules
1382 |                 files. Defaults to "./prometheus_alert_rules",
1383 |                 resolved relative to the directory hosting the charm entry file.
1384 |                 The alert rules are automatically updated on charm upgrade.
1385 |             forward_alert_rules: a boolean flag to toggle forwarding of charmed alert rules.
1386 |             refresh_event: an optional bound event or list of bound events which
1387 |                 will be observed to re-set scrape job data (IP address and others)
1388 |             external_url: an optional argument that represents an external url that
1389 |                 can be generated by an Ingress or a Proxy.
1390 |             lookaside_jobs_callable: an optional `Callable` which should be invoked
1391 |                 when the job configuration is built as a secondary mapping. The callable
1392 |                 should return a `List[Dict]` which is syntactically identical to the
1393 |                 `jobs` parameter, but can be updated out of step with the initialization of
1394 |                 this library without disrupting the 'global' job spec.
1395 | 
1396 |         Raises:
1397 |             RelationNotFoundError: If there is no relation in the charm's metadata.yaml
1398 |                 with the same name as provided via `relation_name` argument.
1399 |             RelationInterfaceMismatchError: The relation with the same name as provided
1400 |                 via `relation_name` argument does not have the `prometheus_scrape` relation
1401 |                 interface.
1402 |             RelationRoleMismatchError: If the relation with the same name as provided
1403 |                 via `relation_name` argument does not have the `RelationRole.provides`
1404 |                 role.
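
        As a sketch, a charm could surface these validation events to the
        administrator by observing them (the attribute and handler names here
        are illustrative, not mandated by the library):

            self.framework.observe(
                self.metrics_endpoint_provider.on.alert_rule_status_changed,
                self._on_alert_rule_status_changed,
            )
            self.framework.observe(
                self.metrics_endpoint_provider.on.invalid_scrape_job,
                self._on_invalid_scrape_job,
            )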
1405 | """ 1406 | _validate_relation_by_interface_and_direction( 1407 | charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides 1408 | ) 1409 | 1410 | try: 1411 | alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path) 1412 | except InvalidAlertRulePathError as e: 1413 | logger.debug( 1414 | "Invalid Prometheus alert rules folder at %s: %s", 1415 | e.alert_rules_absolute_path, 1416 | e.message, 1417 | ) 1418 | 1419 | super().__init__(charm, relation_name) 1420 | self.topology = JujuTopology.from_charm(charm) 1421 | 1422 | self._charm = charm 1423 | self._alert_rules_path = alert_rules_path 1424 | self._forward_alert_rules = forward_alert_rules 1425 | self._relation_name = relation_name 1426 | # sanitize job configurations to the supported subset of parameters 1427 | jobs = [] if jobs is None else jobs 1428 | self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) 1429 | 1430 | if external_url: 1431 | external_url = ( 1432 | external_url if urlparse(external_url).scheme else ("http://" + external_url) 1433 | ) 1434 | self.external_url = external_url 1435 | self._lookaside_jobs = lookaside_jobs_callable 1436 | 1437 | events = self._charm.on[self._relation_name] 1438 | self.framework.observe(events.relation_changed, self._on_relation_changed) 1439 | 1440 | if not refresh_event: 1441 | # FIXME remove once podspec charms are verified. 1442 | # `self.set_scrape_job_spec()` is called every re-init so this should not be needed. 1443 | if len(self._charm.meta.containers) == 1: 1444 | if "kubernetes" in self._charm.meta.series: 1445 | # This is a podspec charm 1446 | refresh_event = [self._charm.on.update_status] 1447 | else: 1448 | # This is a sidecar/pebble charm 1449 | container = list(self._charm.meta.containers.values())[0] 1450 | refresh_event = [self._charm.on[container.name.replace("-", "_")].pebble_ready] 1451 | else: 1452 | logger.warning( 1453 | "%d containers are present in metadata.yaml and " 1454 | "refresh_event was not specified. Defaulting to update_status. " 1455 | "Metrics IP may not be set in a timely fashion.", 1456 | len(self._charm.meta.containers), 1457 | ) 1458 | refresh_event = [self._charm.on.update_status] 1459 | 1460 | else: 1461 | if not isinstance(refresh_event, list): 1462 | refresh_event = [refresh_event] 1463 | 1464 | self.framework.observe(events.relation_joined, self.set_scrape_job_spec) 1465 | for ev in refresh_event: 1466 | self.framework.observe(ev, self.set_scrape_job_spec) 1467 | 1468 | def _on_relation_changed(self, event): 1469 | """Check for alert rule messages in the relation data before moving on.""" 1470 | if self._charm.unit.is_leader(): 1471 | ev = json.loads(event.relation.data[event.app].get("event", "{}")) 1472 | 1473 | if ev: 1474 | valid = bool(ev.get("valid", True)) 1475 | errors = ev.get("errors", "") 1476 | 1477 | if valid and not errors: 1478 | self.on.alert_rule_status_changed.emit(valid=valid) 1479 | else: 1480 | self.on.alert_rule_status_changed.emit(valid=valid, errors=errors) 1481 | 1482 | scrape_errors = ev.get("scrape_job_errors", None) 1483 | if scrape_errors: 1484 | self.on.invalid_scrape_job.emit(errors=scrape_errors) 1485 | 1486 | def update_scrape_job_spec(self, jobs): 1487 | """Update scrape job specification.""" 1488 | self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) 1489 | self.set_scrape_job_spec() 1490 | 1491 | def set_scrape_job_spec(self, _=None): 1492 | """Ensure scrape target information is made available to prometheus. 
1493 | 
1494 | When a metrics provider charm is related to a Prometheus charm, the
1495 | metrics provider sets specification and metadata related to its own
1496 | scrape configuration. This information is set using Juju application
1497 | data. In addition, each unit of the metrics provider also sets its own
1498 | host address in Juju unit relation data.
1499 | """
1500 | self._set_unit_ip()
1501 | 
1502 | if not self._charm.unit.is_leader():
1503 | return
1504 | 
1505 | alert_rules = AlertRules(query_type="promql", topology=self.topology)
1506 | if self._forward_alert_rules:
1507 | alert_rules.add_path(self._alert_rules_path, recursive=True)
1508 | alert_rules.add(
1509 | copy.deepcopy(generic_alert_groups.application_rules),
1510 | group_name_prefix=self.topology.identifier,
1511 | )
1512 | alert_rules_as_dict = alert_rules.as_dict()
1513 | 
1514 | for relation in self._charm.model.relations[self._relation_name]:
1515 | relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata)
1516 | relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs)
1517 | 
1518 | # Update relation data with the string representation of the rule file.
1519 | # Juju topology is already included in the "scrape_metadata" field above.
1520 | # The consumer side of the relation uses this information to name the rules file
1521 | # that is written to the filesystem.
1522 | relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict)
1523 | 
1524 | def _set_unit_ip(self, _=None):
1525 | """Set unit host address.
1526 | 
1527 | Each time a metrics provider charm container is restarted it updates its own
1528 | host address in the unit relation data for the Prometheus charm.
1529 | 
1530 | The only argument specified is an event, and it is ignored. This is for
1531 | expediency, so that this method can also be used as an event handler,
1532 | although no access to the event is actually needed.
1533 | """
1534 | for relation in self._charm.model.relations[self._relation_name]:
1535 | unit_ip = str(self._charm.model.get_binding(relation).network.bind_address)
1536 | 
1537 | # TODO store the entire URL in relation data, instead of only select URL parts.
1538 | 
1539 | if self.external_url:
1540 | parsed = urlparse(self.external_url)
1541 | unit_address = parsed.hostname
1542 | path = parsed.path
1543 | elif self._is_valid_unit_address(unit_ip):
1544 | unit_address = unit_ip
1545 | path = ""
1546 | else:
1547 | unit_address = socket.getfqdn()
1548 | path = ""
1549 | 
1550 | relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = unit_address
1551 | relation.data[self._charm.unit]["prometheus_scrape_unit_path"] = path
1552 | relation.data[self._charm.unit]["prometheus_scrape_unit_name"] = str(
1553 | self._charm.model.unit.name
1554 | )
1555 | 
1556 | def _is_valid_unit_address(self, address: str) -> bool:
1557 | """Validate a unit address.
1558 | 
1559 | At present only IP address validation is supported, but
1560 | this may be extended to DNS addresses also, as needed.
1561 | 
1562 | Args:
1563 | address: a string representing a unit address
1564 | """
1565 | try:
1566 | _ = ipaddress.ip_address(address)
1567 | except ValueError:
1568 | return False
1569 | 
1570 | return True
1571 | 
1572 | @property
1573 | def _scrape_jobs(self) -> list:
1574 | """Fetch list of scrape jobs.
1575 | 
1576 | Returns:
1577 | A list of dictionaries, where each dictionary specifies a
1578 | single scrape job for Prometheus.
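
For illustration, a sketch of one such dictionary, mirroring the library's
default job (the exact fields of DEFAULT_JOB are assumed here, not quoted
from its definition):

    {
        "metrics_path": "/metrics",
        "static_configs": [{"targets": ["*:80"]}],
    }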
1579 | """ 1580 | jobs = self._jobs or [] 1581 | if callable(self._lookaside_jobs): 1582 | jobs.extend(PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs())) 1583 | return jobs or [DEFAULT_JOB] 1584 | 1585 | @property 1586 | def _scrape_metadata(self) -> dict: 1587 | """Generate scrape metadata. 1588 | 1589 | Returns: 1590 | Scrape configuration metadata for this metrics provider charm. 1591 | """ 1592 | return self.topology.as_dict() 1593 | 1594 | 1595 | class PrometheusRulesProvider(Object): 1596 | """Forward rules to Prometheus. 1597 | 1598 | This object may be used to forward rules to Prometheus. At present it only supports 1599 | forwarding alert rules. This is unlike :class:`MetricsEndpointProvider`, which 1600 | is used for forwarding both scrape targets and associated alert rules. This object 1601 | is typically used when there is a desire to forward rules that apply globally (across 1602 | all deployed charms and units) rather than to a single charm. All rule files are 1603 | forwarded using the same 'prometheus_scrape' interface that is also used by 1604 | `MetricsEndpointProvider`. 1605 | 1606 | Args: 1607 | charm: A charm instance that `provides` a relation with the `prometheus_scrape` interface. 1608 | relation_name: Name of the relation in `metadata.yaml` that 1609 | has the `prometheus_scrape` interface. 1610 | dir_path: Root directory for the collection of rule files. 1611 | recursive: Whether to scan for rule files recursively. 1612 | """ 1613 | 1614 | def __init__( 1615 | self, 1616 | charm: CharmBase, 1617 | relation_name: str = DEFAULT_RELATION_NAME, 1618 | dir_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, 1619 | recursive=True, 1620 | ): 1621 | super().__init__(charm, relation_name) 1622 | self._charm = charm 1623 | self._relation_name = relation_name 1624 | self._recursive = recursive 1625 | 1626 | try: 1627 | dir_path = _resolve_dir_against_charm_path(charm, dir_path) 1628 | except InvalidAlertRulePathError as e: 1629 | logger.debug( 1630 | "Invalid Prometheus alert rules folder at %s: %s", 1631 | e.alert_rules_absolute_path, 1632 | e.message, 1633 | ) 1634 | self.dir_path = dir_path 1635 | 1636 | events = self._charm.on[self._relation_name] 1637 | event_sources = [ 1638 | events.relation_joined, 1639 | events.relation_changed, 1640 | self._charm.on.leader_elected, 1641 | self._charm.on.upgrade_charm, 1642 | ] 1643 | 1644 | for event_source in event_sources: 1645 | self.framework.observe(event_source, self._update_relation_data) 1646 | 1647 | def _reinitialize_alert_rules(self): 1648 | """Reloads alert rules and updates all relations.""" 1649 | self._update_relation_data(None) 1650 | 1651 | def _update_relation_data(self, _): 1652 | """Update application relation data with alert rules for all relations.""" 1653 | if not self._charm.unit.is_leader(): 1654 | return 1655 | 1656 | alert_rules = AlertRules(query_type="promql") 1657 | alert_rules.add_path(self.dir_path, recursive=self._recursive) 1658 | alert_rules_as_dict = alert_rules.as_dict() 1659 | 1660 | logger.info("Updating relation data with rule files from disk") 1661 | for relation in self._charm.model.relations[self._relation_name]: 1662 | relation.data[self._charm.app]["alert_rules"] = json.dumps( 1663 | alert_rules_as_dict, 1664 | sort_keys=True, # sort, to prevent unnecessary relation_changed events 1665 | ) 1666 | 1667 | class CosTool: 1668 | """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" 1669 | 1670 | _path = None 1671 | _disabled = False 1672 | 
1673 | def __init__(self, charm):
1674 | self._charm = charm
1675 | 
1676 | @property
1677 | def path(self):
1678 | """Lazy lookup of the path of cos-tool."""
1679 | if self._disabled:
1680 | return None
1681 | if not self._path:
1682 | self._path = self._get_tool_path()
1683 | if not self._path:
1684 | logger.debug("Skipping injection of juju topology as label matchers")
1685 | self._disabled = True
1686 | return self._path
1687 | 
1688 | def apply_label_matchers(self, rules) -> dict:
1689 | """Apply label matchers to the expression of all alerts in all supplied groups."""
1690 | if not self.path:
1691 | return rules
1692 | for group in rules["groups"]:
1693 | rules_in_group = group.get("rules", [])
1694 | for rule in rules_in_group:
1695 | topology = {}
1696 | # if the user for some reason has provided juju_unit, we'll need to honor it
1697 | # in most cases, however, this will be empty
1698 | for label in [
1699 | "juju_model",
1700 | "juju_model_uuid",
1701 | "juju_application",
1702 | "juju_charm",
1703 | "juju_unit",
1704 | ]:
1705 | if label in rule.get("labels", {}):  # tolerate rules without a "labels" mapping
1706 | topology[label] = rule["labels"][label]
1707 | 
1708 | rule["expr"] = self.inject_label_matchers(rule["expr"], topology)
1709 | return rules
1710 | 
1711 | def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]:
1712 | """Validate correctness of alert rules, returning a boolean and any errors."""
1713 | if not self.path:
1714 | logger.debug("`cos-tool` unavailable. Not validating alert correctness.")
1715 | return True, ""
1716 | 
1717 | with tempfile.TemporaryDirectory() as tmpdir:
1718 | rule_path = Path(tmpdir, "validate_rule.yaml")
1719 | rule_path.write_text(yaml.dump(rules))
1720 | 
1721 | args = [str(self.path), "validate", str(rule_path)]
1722 | # noinspection PyBroadException
1723 | try:
1724 | self._exec(args)
1725 | return True, ""
1726 | except subprocess.CalledProcessError as e:
1727 | logger.debug("Validating the rules failed: %s", e.output.decode("utf8"))
1728 | return False, ", ".join(
1729 | [
1730 | line
1731 | for line in e.output.decode("utf8").splitlines()
1732 | if "error validating" in line
1733 | ]
1734 | )
1735 | 
1736 | def validate_scrape_jobs(self, jobs: list) -> bool:
1737 | """Validate scrape jobs using cos-tool."""
1738 | if not self.path:
1739 | logger.debug("`cos-tool` unavailable. Not validating scrape jobs.")
1740 | return True
1741 | conf = {"scrape_configs": jobs}
1742 | with tempfile.NamedTemporaryFile() as tmpfile:
1743 | with open(tmpfile.name, "w") as f:
1744 | f.write(yaml.safe_dump(conf))
1745 | try:
1746 | self._exec([str(self.path), "validate-config", tmpfile.name])
1747 | except subprocess.CalledProcessError as e:
1748 | logger.error("Validating scrape jobs failed: %s", e.output)
1749 | raise
1750 | return True
1751 | 
1752 | def inject_label_matchers(self, expression, topology) -> str:
1753 | """Add label matchers to an expression."""
1754 | if not topology:
1755 | return expression
1756 | if not self.path:
1757 | logger.debug("`cos-tool` unavailable. Leaving expression unchanged: %s", expression)
1758 | return expression
1759 | args = [str(self.path), "transform"]
1760 | args.extend(
1761 | ["--label-matcher={}={}".format(key, value) for key, value in topology.items()]
1762 | )
1763 | 
1764 | args.append(expression)
1765 | # noinspection PyBroadException
1766 | try:
1767 | return self._exec(args)
1768 | except subprocess.CalledProcessError as e:
1769 | logger.debug('Applying the expression failed: "%s", falling back to the original', e)
1770 | return expression
1771 | 
1772 | def _get_tool_path(self) -> Optional[Path]:
1773 | arch = platform.machine()
1774 | arch = "amd64" if arch == "x86_64" else arch
1775 | res = "cos-tool-{}".format(arch)
1776 | try:
1777 | path = Path(res).resolve(strict=True)
1778 | return path
1779 | except OSError:  # FileNotFoundError is a subclass of OSError
1780 | logger.debug('Could not locate cos-tool at: "%s"', res)
1781 | return None
1782 | 
1783 | def _exec(self, cmd) -> str:
1784 | result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
1785 | return result.stdout.decode("utf-8").strip()
1786 | 
--------------------------------------------------------------------------------