├── .jujuignore ├── CODEOWNERS ├── tests ├── unit │ ├── test_config │ │ ├── alertmanager_empty.yml │ │ ├── test_templates.tmpl │ │ ├── alertmanager_invalid.yml │ │ ├── alertmanager.yml │ │ └── alertmanager_with_templates.yml │ ├── conftest.py │ ├── test_brute_isolated.py │ ├── helpers.py │ ├── test_self_scrape_jobs.py │ ├── test_server_scheme.py │ ├── test_external_url.py │ ├── test_alertmanager_client.py │ ├── test_remote_configuration_requirer.py │ ├── test_push_config_to_workload_on_startup.py │ ├── test_consumer.py │ ├── test_remote_configuration_provider.py │ └── test_charm.py ├── integration │ ├── remote_configuration_tester │ │ ├── lib │ │ │ └── charms │ │ │ │ └── alertmanager_k8s │ │ │ │ └── v0 │ │ │ │ └── .gitkeep │ │ ├── charmcraft.yaml │ │ ├── pyproject.toml │ │ └── src │ │ │ └── charm.py │ ├── am_config.yaml │ ├── test_kubectl_delete.py │ ├── test_grafana_source.py │ ├── test_persistence.py │ ├── conftest.py │ ├── test_rescale_charm.py │ ├── test_upgrade_charm.py │ ├── test_remote_configuration.py │ ├── test_tls_web.py │ ├── test_templates.py │ └── helpers.py └── manual │ └── bundle_1_e2e_tls.yaml ├── .wokeignore ├── terraform ├── versions.tf ├── main.tf ├── outputs.tf ├── variables.tf └── README.md ├── .github ├── renovate.json5 ├── workflows │ ├── pull-request.yaml │ ├── release.yaml │ ├── tiobe-scan.yaml │ ├── update-libs.yaml │ ├── quality-gates.yaml │ └── promote.yaml ├── .jira_sync_config.yaml ├── pull_request_template.md └── ISSUE_TEMPLATE │ ├── enhancement_proposal.yml │ └── bug_report.yml ├── src ├── prometheus_alert_rules │ ├── heartbeat.rule │ ├── alertmanager_notifications_failed.rule │ └── alertmanager_configuration_reload_failure.rule ├── config_builder.py └── alertmanager.py ├── SECURITY.md ├── .gitignore ├── RELEASE.md ├── tox.ini ├── pyproject.toml ├── icon.svg ├── INTEGRATING.md ├── CONTRIBUTING.md ├── README.md ├── lib └── charms │ └── catalogue_k8s │ └── v1 │ └── catalogue.py ├── charmcraft.yaml └── LICENSE /.jujuignore: -------------------------------------------------------------------------------- 1 | /venv 2 | **/__pycache__ 3 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @canonical/Observability 2 | -------------------------------------------------------------------------------- /tests/unit/test_config/alertmanager_empty.yml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.wokeignore: -------------------------------------------------------------------------------- 1 | tests/integration/remote_configuration_tester/lib 2 | -------------------------------------------------------------------------------- /tests/integration/remote_configuration_tester/lib/charms/alertmanager_k8s/v0/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/test_config/test_templates.tmpl: -------------------------------------------------------------------------------- 1 | {{define "myTemplate"}}do something else{{end}} -------------------------------------------------------------------------------- /tests/unit/test_config/alertmanager_invalid.yml: -------------------------------------------------------------------------------- 1 | just: 2 | some: 3 | placeholder: config 4 | which: 5 | - is 6 | - 
not 7 | - valid 8 | -------------------------------------------------------------------------------- /terraform/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.5" 3 | required_providers { 4 | juju = { 5 | source = "juju/juju" 6 | version = "~> 1.0" 7 | } 8 | } 9 | } -------------------------------------------------------------------------------- /.github/renovate.json5: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "github>canonical/observability//.github/renovate/charms.json5", 5 | ], 6 | } 7 | -------------------------------------------------------------------------------- /tests/integration/am_config.yaml: -------------------------------------------------------------------------------- 1 | route: 2 | receiver: test_receiver 3 | group_by: 4 | - alertname 5 | group_wait: 1234s 6 | group_interval: 4321s 7 | repeat_interval: 1111h 8 | receivers: 9 | - name: test_receiver 10 | -------------------------------------------------------------------------------- /.github/workflows/pull-request.yaml: -------------------------------------------------------------------------------- 1 | name: Pull Requests 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | - track/** 8 | 9 | jobs: 10 | pull-request: 11 | name: PR 12 | uses: canonical/observability/.github/workflows/charm-pull-request.yaml@v1 13 | secrets: inherit 14 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release Charm 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - track/** 8 | 9 | jobs: 10 | release: 11 | uses: canonical/observability/.github/workflows/charm-release.yaml@v1 12 | secrets: inherit 13 | with: 14 | default-track: dev 15 | -------------------------------------------------------------------------------- /.github/workflows/tiobe-scan.yaml: -------------------------------------------------------------------------------- 1 | name: Tiobe TiCS Analysis 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 0 * * 1" # Runs at midnight UTC every Monday 7 | 8 | jobs: 9 | tics: 10 | name: TiCs 11 | uses: canonical/observability/.github/workflows/charm-tiobe-scan.yaml@v1 12 | secrets: inherit 13 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/heartbeat.rule: -------------------------------------------------------------------------------- 1 | # Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 2 | groups: 3 | - name: Watchdog 4 | rules: 5 | - alert: Watchdog 6 | expr: vector(1) 7 | labels: 8 | severity: none 9 | annotations: 10 | summary: Continuously firing alert to ensure Alertmanager is working 11 | -------------------------------------------------------------------------------- /.github/.jira_sync_config.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | jira_project_key: "OBC" 3 | status_mapping: 4 | opened: Untriaged 5 | closed: done 6 | not_planned: rejected 7 | 8 | components: 9 | - alertmanager 10 | 11 | add_gh_comment: false 12 | sync_description: false 13 | sync_comments: false 14 | 15 | label_mapping: 16 | "Type: Enhancement": Story 17 | 
-------------------------------------------------------------------------------- /.github/workflows/update-libs.yaml: -------------------------------------------------------------------------------- 1 | name: Auto-update Charm Libraries 2 | on: 3 | # Manual trigger 4 | workflow_dispatch: 5 | # Check regularly the upstream every four hours 6 | schedule: 7 | - cron: "0 0,4,8,12,16,20 * * *" 8 | 9 | jobs: 10 | update-lib: 11 | name: Check libraries 12 | uses: canonical/observability/.github/workflows/charm-update-libs.yaml@v1 13 | secrets: inherit 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/test_config/alertmanager.yml: -------------------------------------------------------------------------------- 1 | global: 2 | http_config: 3 | tls_config: 4 | insecure_skip_verify: true 5 | receivers: 6 | - name: placeholder 7 | webhook_configs: 8 | - url: http://127.0.0.1:5001/ 9 | route: 10 | group_by: 11 | - juju_application 12 | - juju_model 13 | - juju_model_uuid 14 | group_interval: 5m 15 | group_wait: 30s 16 | receiver: placeholder 17 | repeat_interval: 1h 18 | -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | resource "juju_application" "alertmanager" { 2 | name = var.app_name 3 | config = var.config 4 | constraints = var.constraints 5 | model_uuid = var.model_uuid 6 | storage_directives = var.storage_directives 7 | trust = true 8 | units = var.units 9 | 10 | charm { 11 | name = "alertmanager-k8s" 12 | channel = var.channel 13 | revision = var.revision 14 | } 15 | } -------------------------------------------------------------------------------- /.github/workflows/quality-gates.yaml: -------------------------------------------------------------------------------- 1 | name: Quality Gates 2 | 3 | on: 4 | # Manual trigger 5 | workflow_dispatch: 6 | # Run the quality checks periodically 7 | # FIXME: adjust the frequency as needed once we have actual gates in place 8 | # schedule: 9 | # - cron: "0 0 * * Tue" 10 | 11 | 12 | jobs: 13 | quality-gates: 14 | name: Run quality gates 15 | uses: canonical/observability/.github/workflows/charm-quality-gates.yaml@v1 16 | secrets: inherit 17 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Issue 2 | 3 | 4 | 5 | ## Solution 6 | 7 | 8 | 9 | ## Context 10 | 11 | 12 | 13 | ## Testing Instructions 14 | 15 | 16 | 17 | ## Upgrade Notes 18 | 19 | -------------------------------------------------------------------------------- /tests/unit/test_config/alertmanager_with_templates.yml: -------------------------------------------------------------------------------- 1 | global: 2 | http_config: 3 | tls_config: 4 | insecure_skip_verify: true 5 | receivers: 6 | - name: placeholder 7 | webhook_configs: 8 | - url: http://127.0.0.1:5001/ 9 | route: 10 | group_by: 11 | - juju_application 12 | - juju_model 13 | - juju_model_uuid 14 | group_interval: 5m 15 | group_wait: 30s 16 | receiver: placeholder 17 | repeat_interval: 1h 18 | templates: 19 | - ./tests/unit/test_config/test_templates.tmpl 20 | -------------------------------------------------------------------------------- /.github/workflows/promote.yaml: -------------------------------------------------------------------------------- 1 | name: Promote Charm 2 | 3 | on: 4 | 
workflow_dispatch: 5 | inputs: 6 | promotion: 7 | type: choice 8 | description: Channel to promote from 9 | options: 10 | - edge -> beta 11 | - beta -> candidate 12 | - candidate -> stable 13 | 14 | jobs: 15 | promote: 16 | name: Promote 17 | uses: canonical/observability/.github/workflows/charm-promote.yaml@v1 18 | with: 19 | promotion: ${{ github.event.inputs.promotion }} 20 | secrets: inherit 21 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/alertmanager_notifications_failed.rule: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: AlertmanagerNotificationsFailed 3 | rules: 4 | - alert: AlertmanagerNotificationsFailed 5 | expr: rate(alertmanager_notifications_failed_total{integration=~".*"}[5m]) > 0 6 | for: 0m 7 | labels: 8 | severity: warning 9 | annotations: 10 | summary: Alertmanager notifications failure (application {{ $labels.juju_application }} in model {{ $labels.juju_model }}) 11 | description: | 12 | Alertmanager notifications failure 13 | VALUE = {{ $value }} 14 | LABELS = {{ $labels }} 15 | -------------------------------------------------------------------------------- /terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | output "app_name" { 2 | value = juju_application.alertmanager.name 3 | } 4 | 5 | output "endpoints" { 6 | value = { 7 | # Requires 8 | catalogue = "catalogue", 9 | certificates = "certificates", 10 | ingress = "ingress", 11 | tracing = "tracing", 12 | remote_configuration = "remote-configuration" 13 | 14 | # Provides 15 | alerting = "alerting" 16 | karma_dashboard = "karma-dashboard" 17 | self_metrics_endpoint = "self-metrics-endpoint" 18 | grafana_dashboard = "grafana-dashboard" 19 | grafana_source = "grafana-source" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/alertmanager_configuration_reload_failure.rule: -------------------------------------------------------------------------------- 1 | # Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 2 | groups: 3 | - name: AlertmanagerConfigurationReloadFailure 4 | rules: 5 | - alert: AlertmanagerConfigurationReloadFailure 6 | expr: alertmanager_config_last_reload_successful{} != 1 7 | for: 0m 8 | labels: 9 | severity: warning 10 | annotations: 11 | summary: Alertmanager configuration reload failure (application {{ $labels.juju_application }} in model {{ $labels.juju_model }}) 12 | description: | 13 | Alertmanager configuration reload error 14 | VALUE = {{ $value }} 15 | LABELS = {{ $labels }} 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement_proposal.yml: -------------------------------------------------------------------------------- 1 | name: Enhancement Proposal 2 | description: File an enhancement proposal 3 | labels: ["Type: Enhancement", "Status: Triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | Thanks for taking the time to fill out this enhancement proposal! Before submitting your issue, please make 9 | sure there isn't already a prior issue concerning this. If there is, please join that discussion instead. 10 | - type: textarea 11 | id: enhancement-proposal 12 | attributes: 13 | label: Enhancement Proposal 14 | description: > 15 | Describe the enhancement you would like to see in as much detail as needed. 
16 | validations: 17 | required: true 18 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | The easiest way to report a security issue is through a [Github Private Security Report](https://github.com/canonical/alertmanager-k8s-operator/security/advisories/new) 2 | with a description of the issue, the steps you took to create the issue, affected versions, and, if known, mitigations for the issue. 3 | 4 | Alternatively, to report a security issue via email, please email [security@ubuntu.com](mailto:security@ubuntu.com) with a description of the issue, 5 | the steps you took to create the issue, affected versions, and, if known, mitigations for the issue. 6 | 7 | The [Ubuntu Security disclosure and embargo policy](https://ubuntu.com/security/disclosure-policy) contains more information about what you can expect 8 | when you contact us and what we expect from you. 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | build/ 3 | *.charm 4 | *.orig 5 | .coverage 6 | **/__pycache__/ 7 | *.py[cod] 8 | .hypothesis/ 9 | .idea/ 10 | .tox/ 11 | .mypy_cache 12 | **/*.egg-info/ 13 | 14 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 15 | # password, private keys, and other secrets. These should not be part of version 16 | # control as they are data points which are potentially sensitive and subject 17 | # to change depending on the environment. 18 | *.tfvars 19 | *.tfvars.json 20 | 21 | # Ignore override files as they are usually used to override resources locally and so 22 | # are not checked in 23 | .terraform 24 | override.tf 25 | override.tf.json 26 | *_override.tf 27 | *_override.tf.json 28 | 29 | # Include override files you do wish to add to version control using negated pattern 30 | # !example_override.tf 31 | 32 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 33 | # example: *tfplan* 34 | 35 | # Ignore CLI configuration files 36 | .terraformrc 37 | terraform.rc 38 | .terraform.lock.hcl 39 | 40 | *.tfstate 41 | *.tfstate.backup -------------------------------------------------------------------------------- /tests/integration/remote_configuration_tester/charmcraft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | name: remote-configuration-tester 4 | type: charm 5 | summary: A charm to test the Alertmanager Remote Configuration library 6 | description: A charm to test the Alertmanager Remote Configuration library 7 | 8 | platforms: 9 | ubuntu@24.04:amd64: 10 | 11 | parts: 12 | charm: 13 | source: . 
14 | plugin: uv 15 | build-packages: [git] 16 | build-snaps: [astral-uv] 17 | 18 | containers: 19 | remote-configuration-tester: 20 | resource: remote-configuration-tester-image 21 | mounts: 22 | - storage: config 23 | location: /etc/alertmanager 24 | 25 | storage: 26 | config: 27 | type: filesystem 28 | location: /etc/alertmanager 29 | 30 | resources: 31 | remote-configuration-tester-image: 32 | type: oci-image 33 | description: upstream docker image for remote-configuration-tester 34 | upstream-source: python:slim 35 | 36 | provides: 37 | remote-configuration: 38 | interface: alertmanager_remote_configuration 39 | 40 | config: 41 | options: 42 | config_file: 43 | type: string 44 | default: "" 45 | description: | 46 | Alertmanager configuration file (yaml). 47 | -------------------------------------------------------------------------------- /tests/unit/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | from charms.tempo_coordinator_k8s.v0.charm_tracing import charm_tracing_disabled 5 | from ops.testing import Context 6 | 7 | from src.alertmanager import WorkloadManager 8 | from src.charm import AlertmanagerCharm 9 | 10 | 11 | @pytest.fixture(autouse=True) 12 | def patch_buffer_file_for_charm_tracing(tmp_path): 13 | with patch( 14 | "charms.tempo_coordinator_k8s.v0.charm_tracing.BUFFER_DEFAULT_CACHE_FILE_NAME", 15 | str(tmp_path / "foo.json"), 16 | ): 17 | yield 18 | 19 | 20 | @pytest.fixture(autouse=True) 21 | def silence_tracing(): 22 | with charm_tracing_disabled(): 23 | yield 24 | 25 | 26 | def tautology(*_, **__) -> bool: 27 | return True 28 | 29 | 30 | @pytest.fixture(autouse=True) 31 | def alertmanager_charm(): 32 | with patch("lightkube.core.client.GenericSyncClient"), patch.multiple( 33 | "charm.KubernetesComputeResourcesPatch", 34 | _namespace="test-namespace", 35 | _patch=tautology, 36 | is_ready=tautology, 37 | ), patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")), patch.object( 38 | WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0") 39 | ), patch("subprocess.run"): 40 | yield AlertmanagerCharm 41 | 42 | 43 | @pytest.fixture(scope="function") 44 | def context(alertmanager_charm): 45 | return Context(charm_type=alertmanager_charm) 46 | -------------------------------------------------------------------------------- /terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "app_name" { 2 | description = "Name to give the deployed application" 3 | type = string 4 | default = "alertmanager" 5 | } 6 | 7 | variable "channel" { 8 | description = "Channel that the charm is deployed from" 9 | type = string 10 | } 11 | 12 | variable "config" { 13 | description = "Map of the charm configuration options" 14 | type = map(string) 15 | default = {} 16 | } 17 | 18 | # We use constraints to set AntiAffinity in K8s 19 | # https://discourse.charmhub.io/t/pod-priority-and-affinity-in-juju-charms/4091/13?u=jose 20 | variable "constraints" { 21 | description = "String listing constraints for this application" 22 | type = string 23 | # FIXME: Passing an empty constraints value to the Juju Terraform provider currently 24 | # causes the operation to fail due to https://github.com/juju/terraform-provider-juju/issues/344 25 | default = "arch=amd64" 26 | } 27 | 28 | variable "model_uuid" { 29 | description = "Reference to an existing model resource or data source for the model to deploy to" 30 | 
type = string 31 | } 32 | 33 | variable "revision" { 34 | description = "Revision number of the charm" 35 | type = number 36 | default = null 37 | } 38 | 39 | variable "storage_directives" { 40 | description = "Map of storage used by the application, which defaults to 1 GB, allocated by Juju" 41 | type = map(string) 42 | default = {} 43 | } 44 | 45 | variable "units" { 46 | description = "Unit count/scale" 47 | type = number 48 | default = 1 49 | } 50 | -------------------------------------------------------------------------------- /tests/manual/bundle_1_e2e_tls.yaml: -------------------------------------------------------------------------------- 1 | bundle: kubernetes 2 | applications: 3 | alertmanager: 4 | charm: ../../alertmanager-k8s_ubuntu-20.04-amd64.charm 5 | series: focal 6 | resources: 7 | alertmanager-image: ghcr.io/canonical/alertmanager:0.25.0 8 | scale: 1 9 | trust: true 10 | prometheus: 11 | charm: prometheus-k8s 12 | channel: edge 13 | scale: 1 14 | trust: true 15 | avalanche: 16 | # The avalanche charm has always-firing alerts that can be used to verify prometheus is able to 17 | # post alerts to alertmanager. 18 | charm: avalanche-k8s 19 | channel: edge 20 | scale: 1 21 | trust: true 22 | options: 23 | metric_count: 10 24 | series_count: 2 25 | local-ca: 26 | charm: self-signed-certificates 27 | channel: edge 28 | scale: 1 29 | external-ca: 30 | charm: self-signed-certificates 31 | channel: edge 32 | scale: 1 33 | traefik: 34 | charm: traefik-k8s 35 | channel: edge 36 | series: focal 37 | scale: 1 38 | trust: true 39 | relations: 40 | - - traefik:ingress 41 | - alertmanager:ingress 42 | - - local-ca:send-ca-cert 43 | - traefik:receive-ca-cert 44 | - - local-ca:certificates 45 | - alertmanager:certificates 46 | - - local-ca:certificates 47 | - prometheus:certificates 48 | - - traefik:certificates 49 | - external-ca:certificates 50 | - - alertmanager:alerting 51 | - prometheus:alertmanager 52 | - - traefik:ingress-per-unit 53 | - prometheus:ingress 54 | - - alertmanager:self-metrics-endpoint 55 | - prometheus:metrics-endpoint 56 | - - avalanche:metrics-endpoint 57 | - prometheus:metrics-endpoint 58 | -------------------------------------------------------------------------------- /tests/integration/remote_configuration_tester/pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | [project] 4 | name = "remote-configuration-tester" 5 | version = "0.0" 6 | requires-python = "~=3.8" 7 | 8 | dependencies = [ 9 | "ops", 10 | "pyyaml", 11 | "jsonschema", 12 | "requests", 13 | ] 14 | 15 | # Testing tools configuration 16 | [tool.coverage.run] 17 | branch = true 18 | 19 | [tool.coverage.report] 20 | show_missing = true 21 | 22 | # Formatting tools configuration 23 | [tool.black] 24 | line-length = 99 25 | target-version = ["py38"] 26 | 27 | # Linting tools configuration 28 | [tool.ruff] 29 | line-length = 99 30 | extend-exclude = ["__pycache__", "*.egg_info"] 31 | 32 | [tool.ruff.lint] 33 | select = ["E", "W", "F", "C", "N", "R", "D", "I001"] 34 | # Ignore E501 because using black creates errors with this 35 | # Ignore D107 Missing docstring in __init__ 36 | ignore = ["E501", "D107", "N818", "RET504"] 37 | # D100, D101, D102, D103: Ignore missing docstrings in tests 38 | per-file-ignores = {"tests/*" = ["D100","D101","D102","D103"]} 39 | 40 | [tool.ruff.lint.pydocstyle] 41 | convention = "google" 42 | 43 | # Static analysis tools configuration 44 | [tool.pyright] 45 | extraPaths = ["src", "lib"] 46 | pythonVersion = "3.8" 47 | pythonPlatform = "All" 48 | exclude = [ 49 | "tests/integration/remote_configuration_tester/**", 50 | ] 51 | 52 | [tool.pytest.ini_options] 53 | minversion = "6.0" 54 | log_cli_level = "INFO" 55 | asyncio_mode = "auto" 56 | addopts = "--tb=native --verbose --capture=no --log-cli-level=INFO" 57 | 58 | [tool.codespell] 59 | skip = ".git,.tox,build,venv*" 60 | ignore-words-list = "assertIn" 61 | -------------------------------------------------------------------------------- /tests/integration/test_kubectl_delete.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2022 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
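# Note: this module exercises self-healing. The charm is deployed from a local path, its pod is
# deleted with kubectl, and the test then waits for a replacement unit to appear and for
# Alertmanager to respond again.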
4 | 5 | 6 | import logging 7 | from pathlib import Path 8 | 9 | import pytest 10 | import sh 11 | import yaml 12 | from helpers import is_alertmanager_up 13 | from pytest_operator.plugin import OpsTest 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) 18 | app_name = METADATA["name"] 19 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 20 | 21 | 22 | @pytest.mark.abort_on_fail 23 | async def test_deploy_from_local_path(ops_test: OpsTest, charm_under_test): 24 | """Deploy the charm-under-test.""" 25 | assert ops_test.model 26 | logger.debug("deploy local charm") 27 | 28 | await ops_test.model.deploy( 29 | charm_under_test, application_name=app_name, resources=resources, trust=True 30 | ) 31 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 32 | await is_alertmanager_up(ops_test, app_name) 33 | 34 | 35 | @pytest.mark.abort_on_fail 36 | async def test_kubectl_delete_pod(ops_test: OpsTest): 37 | assert ops_test.model 38 | assert ops_test.model_name 39 | pod_name = f"{app_name}-0" 40 | 41 | sh.kubectl.delete.pod(pod_name, namespace=ops_test.model_name) # pyright: ignore 42 | 43 | application = ops_test.model.applications[app_name] 44 | assert application 45 | await ops_test.model.block_until(lambda: len(application.units) > 0) 46 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 47 | assert await is_alertmanager_up(ops_test, app_name) 48 | -------------------------------------------------------------------------------- /tests/integration/remote_configuration_tester/src/charm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2022 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | """A Charm to functionally test the Alertmanager Operator.""" 6 | 7 | import logging 8 | import typing 9 | 10 | from charms.alertmanager_k8s.v0.alertmanager_remote_configuration import ( 11 | ConfigReadError, 12 | RemoteConfigurationProvider, 13 | ) 14 | from ops.charm import CharmBase, PebbleReadyEvent 15 | from ops.main import main 16 | from ops.model import ActiveStatus, WaitingStatus 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class AlertmanagerTesterCharm(CharmBase): 22 | """A Charm to functionally test the Alertmanager Operator.""" 23 | 24 | ALERTMANAGER_CONFIG_FILE = "/etc/alertmanager/alertmanager.yml" 25 | 26 | def __init__(self, *args): 27 | super().__init__(*args) 28 | self.container = self.unit.get_container("remote-configuration-tester") 29 | 30 | try: 31 | self.remote_configuration_consumer = RemoteConfigurationProvider.with_config_file( 32 | charm=self, config_file=self.ALERTMANAGER_CONFIG_FILE 33 | ) 34 | except ConfigReadError: 35 | pass 36 | 37 | self.framework.observe(self.on.remote_configuration_tester_pebble_ready, self._on_ready) 38 | 39 | def _on_ready(self, event: PebbleReadyEvent) -> None: 40 | if not self.container.can_connect(): 41 | self.unit.status = WaitingStatus("Waiting for the container to be ready") 42 | event.defer() 43 | return 44 | self.container.push( 45 | self.ALERTMANAGER_CONFIG_FILE, typing.cast(str, self.config["config_file"]) 46 | ) 47 | self.unit.status = ActiveStatus() 48 | 49 | 50 | if __name__ == "__main__": 51 | main(AlertmanagerTesterCharm) 52 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release Process 2 | 3 | ## Overview 4 | 5 | At any given time there are three revisions of the Alertmanager charm [available on CharmHub.io](https://charmhub.io/alertmanager-k8s), for each of the following channels: 6 | 7 | 1. `latest/stable` is a well tested production ready version of the Charm. 8 | 2. `latest/candidate` is a feature ready next version of the stable release, currently in testing. 9 | 3. `latest/edge` is the bleeding edge developer version of the charm. While we really try not to, it may break and introduce regressions. 10 | 11 | Currently, the Alertmanager charm does not make use of the `latest/beta` channel. 12 | For more information about CharmHub channels, refer to the [Juju charm store](https://discourse.charmhub.io/t/the-juju-charm-store) documentation. 13 | 14 | ## When to create which revisions 15 | 16 | * **Stable revisions** are done in consultation with product manager and engineering manager when the `candidate` revision has been well tested and is deemed ready for production. 17 | * **Candidate revisions** are done when the charm reaches a state of feature completion with respect to the next planned `stable` release. 18 | * **Edge revisions** are released at the developer's discretion, potentially every time something is merged into `main` and the unit tests pass. 19 | 20 | ## How to publish revisions 21 | 22 | Refer to the [Publish your operator in Charmhub](https://discourse.charmhub.io/t/publish-your-operator-in-charmhub) documentation. 23 | After a `latest/stable` release, it is expected that the version of the charm is the same as the one in `latest/candidate`, and those two channels will diverge again when we are ramping up through `latest/candidate` releases for a new `latest/stable` release. 
24 | 25 | ## A note on granularity of revisions 26 | 27 | We believe in shipping often and with confidence. 28 | It is perfectly acceptable to have a new `latest/stable` release containing just one bug fix or a small new feature with respect to the last one. 29 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | [tox] 5 | skipsdist=True 6 | skip_missing_interpreters = True 7 | envlist = lint, static, unit 8 | 9 | [vars] 10 | src_path = {toxinidir}/src 11 | tst_path = {toxinidir}/tests 12 | lib_path = {toxinidir}/lib/charms/alertmanager_k8s 13 | all_path = {[vars]src_path} {[vars]tst_path} {[vars]lib_path} 14 | uv_flags = --frozen --isolated --extra=dev 15 | 16 | [testenv] 17 | allowlist_externals = uv 18 | basepython = python3 19 | setenv = 20 | PYTHONPATH = {toxinidir}:{toxinidir}/lib:{[vars]src_path} 21 | PYTHONBREAKPOINT=ipdb.set_trace 22 | PY_COLORS=1 23 | passenv = 24 | PYTHONPATH 25 | CHARM_PATH 26 | 27 | [testenv:lock] 28 | description = Update uv.lock with the latest deps 29 | commands = 30 | uv lock --upgrade --no-cache 31 | 32 | [testenv:lint] 33 | description = Lint the code 34 | commands = 35 | uv run {[vars]uv_flags} ruff check {[vars]all_path} 36 | 37 | [testenv:static] 38 | description = Run static checks 39 | allowlist_externals = 40 | {[testenv]allowlist_externals} 41 | /usr/bin/env 42 | commands = 43 | uv run {[vars]uv_flags} pyright {[vars]all_path} 44 | /usr/bin/env sh -c 'for m in $(git diff main --name-only {[vars]lib_path}); do if ! git diff main $m | grep -q "+LIBPATCH\|+LIBAPI"; then echo "You forgot to bump the version on $m!"; exit 1; fi; done' 45 | 46 | [testenv:fmt] 47 | description = "Format the code" 48 | commands = 49 | uv run {[vars]uv_flags} ruff check --fix-only {[vars]all_path} 50 | 51 | [testenv:unit] 52 | description = Run unit tests 53 | setenv = 54 | {[testenv]setenv} 55 | JUJU_VERSION=3.0.3 56 | passenv = 57 | PYTHONPATH 58 | allowlist_externals = 59 | {[testenv]allowlist_externals} 60 | /usr/bin/env 61 | commands = 62 | uv run {[vars]uv_flags} coverage run --source={[vars]src_path},{[vars]lib_path} -m pytest \ 63 | {[vars]tst_path}/unit {posargs} 64 | uv run {[vars]uv_flags} coverage report 65 | 66 | [testenv:integration] 67 | description = Run integration tests 68 | commands = 69 | uv run {[vars]uv_flags} pytest --exitfirst {[vars]tst_path}/integration {posargs} 70 | -------------------------------------------------------------------------------- /tests/unit/test_brute_isolated.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | from helpers import add_relation_sequence, begin_with_initial_hooks_isolated 8 | from ops.testing import Context, Relation, State 9 | 10 | """Some brute-force tests, so that other tests can remain focused.""" 11 | 12 | 13 | def test_startup_shutdown_sequence(context: Context): 14 | state = begin_with_initial_hooks_isolated(context) 15 | state = context.run(context.on.update_status(), state) 16 | 17 | for peer_rel in state.get_relations("replicas"): 18 | state = context.run(context.on.relation_departed(peer_rel, remote_unit=2), state) 19 | 20 | state = context.run(context.on.stop(), state) 21 | context.run(context.on.remove(), state) 22 | 23 | 24 | @pytest.mark.parametrize("fqdn", ["localhost", "am-0.endpoints.cluster.local"]) 25 | @pytest.mark.parametrize("leader", [True, False]) 26 | class TestAlertingRelationDataUniformity: 27 | """Scenario: The charm is related to several different prometheus apps.""" 28 | 29 | @pytest.fixture 30 | def post_startup(self, context, fqdn, leader) -> State: 31 | with patch("socket.getfqdn", new=lambda *args: fqdn): 32 | state = begin_with_initial_hooks_isolated(context, leader=leader) 33 | 34 | # Add several relations TODO: how to obtain the next rel_id automatically? 35 | prom_rels = [Relation("alerting", id=rel_id) for rel_id in (10, 11, 12)] 36 | for prom_rel in prom_rels: 37 | state = add_relation_sequence(context, state, prom_rel) 38 | return state 39 | 40 | def test_relation_data_is_the_same_for_all_related_apps(self, post_startup, fqdn): 41 | # GIVEN an isolated alertmanager charm after the startup sequence is complete 42 | state = post_startup 43 | 44 | # THEN the "alerting" relation data has the same contents for all related apps 45 | relations = state.get_relations("alerting") 46 | for i in range(1, len(relations)): 47 | assert relations[0].local_unit_data == relations[i].local_unit_data 48 | assert relations[0].local_app_data == relations[i].local_app_data 49 | -------------------------------------------------------------------------------- /terraform/README.md: -------------------------------------------------------------------------------- 1 | # Terraform module for alertmanager-k8s 2 | 3 | This is a Terraform module facilitating the deployment of alertmanager-k8s, using the [Terraform juju provider](https://github.com/juju/terraform-provider-juju/). For more information, refer to the provider [documentation](https://registry.terraform.io/providers/juju/juju/latest/docs). 4 | 5 | 6 | ## Requirements 7 | 8 | | Name | Version | 9 | |------|---------| 10 | | [terraform](#requirement\_terraform) | >= 1.5 | 11 | | [juju](#requirement\_juju) | ~> 1.0 | 12 | 13 | ## Providers 14 | 15 | | Name | Version | 16 | |------|---------| 17 | | [juju](#provider\_juju) | ~> 1.0 | 18 | 19 | ## Modules 20 | 21 | No modules. 
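## Usage

A minimal usage sketch is shown below; the `source` reference, channel, and model UUID are placeholders to adapt to your environment (only `channel` and `model_uuid` have no defaults):

```hcl
module "alertmanager" {
  # Placeholder source: point this at the terraform directory of this repository,
  # either via a local path or a git reference such as the one below.
  source = "git::https://github.com/canonical/alertmanager-k8s-operator//terraform"

  model_uuid = var.model_uuid # UUID of an existing Juju model
  channel    = "latest/edge"
  units      = 1
}
```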
22 | 23 | ## Inputs 24 | 25 | | Name | Description | Type | Default | Required | 26 | |------|-------------|------|---------|:--------:| 27 | | [app\_name](#input\_app\_name) | Name to give the deployed application | `string` | `"alertmanager"` | no | 28 | | [channel](#input\_channel) | Channel that the charm is deployed from | `string` | n/a | yes | 29 | | [config](#input\_config) | Map of the charm configuration options | `map(string)` | `{}` | no | 30 | | [constraints](#input\_constraints) | String listing constraints for this application | `string` | `"arch=amd64"` | no | 31 | | [model\_uuid](#input\_model\_uuid) | Reference to an existing model resource or data source for the model to deploy to | `string` | n/a | yes | 32 | | [revision](#input\_revision) | Revision number of the charm | `number` | `null` | no | 33 | | [storage\_directives](#input\_storage\_directives) | Map of storage used by the application, which defaults to 1 GB, allocated by Juju | `map(string)` | `{}` | no | 34 | | [units](#input\_units) | Unit count/scale | `number` | `1` | no | 35 | 36 | ## Outputs 37 | 38 | | Name | Description | 39 | |------|-------------| 40 | | [app\_name](#output\_app\_name) | n/a | 41 | | [endpoints](#output\_endpoints) | n/a | 42 | 43 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | labels: ["Type: Bug", "Status: Triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | Thanks for taking the time to fill out this bug report! Before submitting your issue, please make 9 | sure you are using the latest version of the charm. If not, please try upgrading to the latest edge release prior to 10 | posting your report to make sure it's not already solved. 11 | - type: textarea 12 | id: bug-description 13 | attributes: 14 | label: Bug Description 15 | description: > 16 | If applicable, add screenshots to 17 | help explain the problem you are facing. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: reproduction 22 | attributes: 23 | label: To Reproduce 24 | description: > 25 | Please provide the output of `juju export-bundle` and step-by-step instructions for how to reproduce the behavior. 26 | A deployment diagram could be handy too. See https://discourse.charmhub.io/t/9269 for examples. 27 | placeholder: | 28 | 1. `juju deploy ...` 29 | 2. `juju relate ...` 30 | 3. `juju status --relations` 31 | validations: 32 | required: true 33 | - type: textarea 34 | id: environment 35 | attributes: 36 | label: Environment 37 | description: > 38 | We need to know a bit more about the context in which you run the charm. 39 | - Are you running Juju locally, on lxd, in multipass or on some other platform? 40 | - What track and channel you deployed the charm from (ie. `latest/edge` or similar). 41 | - Version of any applicable components, like the juju snap, the model controller, lxd, microk8s, and/or multipass. 42 | validations: 43 | required: true 44 | - type: textarea 45 | id: logs 46 | attributes: 47 | label: Relevant log output 48 | description: > 49 | Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 50 | Fetch the logs using `juju debug-log --replay` and `kubectl logs ...`. 
Additional details available in the juju docs 51 | at https://juju.is/docs/olm/juju-logs 52 | render: shell 53 | validations: 54 | required: true 55 | - type: textarea 56 | id: additional-context 57 | attributes: 58 | label: Additional context 59 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | [project] 4 | name = "alertmanager-k8s" 5 | version = "0.0" 6 | requires-python = "~=3.8" 7 | 8 | dependencies = [ 9 | "ops", 10 | "pyyaml", 11 | "lightkube>=0.11", # observability_libs 12 | "lightkube-models", # observability_libs 13 | "jsonschema", # traefik_k8s, tls_certificates 14 | "cryptography", # tls_certificates 15 | "pydantic>=2", # traefik_k8s.v2.ingress 16 | "opentelemetry-exporter-otlp-proto-http>=1.21.0", # tracing 17 | "tenacity", 18 | "cosl", 19 | "charmed-service-mesh-helpers>=0.2.0", 20 | "lightkube-extensions@git+https://github.com/canonical/lightkube-extensions.git@main", 21 | ] 22 | 23 | [project.optional-dependencies] 24 | dev = [ 25 | # Linting 26 | "ruff", 27 | "codespell", 28 | # Static 29 | "pyright<1.1.399", # 1.1.399 vendors typeshed that dropped Python 3.8 support 30 | # Unit 31 | "pytest", 32 | "coverage[toml]", 33 | "deepdiff", 34 | "hypothesis", 35 | "validators>=0.21.2", 36 | "ops[testing]", 37 | "pytest-interface-tester>0.3", 38 | # Integration 39 | "juju<=3.3.0,>=3.0", 40 | "websockets<14.0", 41 | "pytest-operator", 42 | "pytest-httpserver", 43 | "sh", 44 | ] 45 | 46 | # Testing tools configuration 47 | [tool.coverage.run] 48 | branch = true 49 | 50 | [tool.coverage.report] 51 | show_missing = true 52 | 53 | # Formatting tools configuration 54 | [tool.black] 55 | line-length = 99 56 | target-version = ["py38"] 57 | 58 | # Linting tools configuration 59 | [tool.ruff] 60 | line-length = 99 61 | extend-exclude = ["__pycache__", "*.egg_info"] 62 | 63 | [tool.ruff.lint] 64 | select = ["E", "W", "F", "C", "N", "R", "D", "I001"] 65 | # Ignore E501 because using black creates errors with this 66 | # Ignore D107 Missing docstring in __init__ 67 | ignore = ["E501", "D107", "N818", "RET504"] 68 | # D100, D101, D102, D103: Ignore missing docstrings in tests 69 | per-file-ignores = {"tests/*" = ["D100","D101","D102","D103"]} 70 | 71 | [tool.ruff.lint.pydocstyle] 72 | convention = "google" 73 | 74 | # Static analysis tools configuration 75 | [tool.pyright] 76 | extraPaths = ["src", "lib"] 77 | pythonVersion = "3.8" 78 | pythonPlatform = "All" 79 | exclude = [ 80 | "tests/integration/remote_configuration_tester/**", 81 | ] 82 | 83 | [tool.pytest.ini_options] 84 | minversion = "6.0" 85 | log_cli_level = "INFO" 86 | asyncio_mode = "auto" 87 | addopts = "--tb=native --verbose --capture=no --log-cli-level=INFO" 88 | 89 | [tool.codespell] 90 | skip = ".git,.tox,build,venv*" 91 | ignore-words-list = "assertIn" 92 | -------------------------------------------------------------------------------- /icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | image/svg+xml -------------------------------------------------------------------------------- /tests/integration/test_grafana_source.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from pathlib import Path 4 | 5 | import pytest 6 | import yaml 7 | from helpers import grafana_datasources 8 | from 
pytest_operator.plugin import OpsTest 9 | from tenacity import retry, stop_after_attempt, wait_fixed 10 | 11 | # pyright: reportAttributeAccessIssue = false 12 | # pyright: reportOptionalMemberAccess = false 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) 17 | app_name = METADATA["name"] 18 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 19 | 20 | """We need to ensure that, even if there are multiple units for Alertmanager, only one is shown as a datasouce in Grafana. 21 | We use this test to simulate multiple units of Alertmanager, and then check that only the leader has the key `grafana_source_host` written to relation data with Grafana. 22 | """ 23 | 24 | @pytest.mark.abort_on_fail 25 | async def test_build_and_deploy(ops_test: OpsTest, charm_under_test): 26 | """Build the charm-under-test, deploy the charm from charmhub, and upgrade from path.""" 27 | await asyncio.gather( 28 | ops_test.model.deploy(charm_under_test, "am", resources=resources, trust=True, num_units=2), 29 | ops_test.model.deploy("grafana-k8s", "grafana", channel="2/edge", trust=True), 30 | ) 31 | 32 | await ops_test.model.add_relation("grafana:grafana-source", "am") 33 | await ops_test.model.wait_for_idle(apps=["am", "grafana"], status="active") 34 | 35 | @retry(wait=wait_fixed(10), stop=stop_after_attempt(6)) 36 | async def test_grafana_datasources(ops_test: OpsTest): 37 | # We have 2 units of Alertmanager, but only one datasource should be shown as a Grafana source. 38 | datasources = await grafana_datasources(ops_test, "grafana") 39 | assert len(datasources) == 1 40 | 41 | # The datasource URL should point to the service, not to a specific pod unit. 42 | # This check is safe, because we name the application `am` and we're not using TLS, so the service will always start with `http://am-endpoints`. 43 | assert datasources[0]["url"].startswith("http://am-endpoints") 44 | 45 | @pytest.mark.abort_on_fail 46 | async def test_deploy_and_integrate_traefik(ops_test: OpsTest): 47 | """Build the charm-under-test, deploy the charm from charmhub, and upgrade from path.""" 48 | await ops_test.model.deploy("traefik-k8s", "traefik", channel="edge", trust=True) 49 | 50 | await ops_test.model.add_relation("traefik:ingress", "am") 51 | await ops_test.model.wait_for_idle(apps=["am", "grafana", "traefik"], status="active") 52 | 53 | async def test_grafana_datasources_when_ingress_available(ops_test: OpsTest): 54 | # We have 2 units of Alertmanager, but only one datasource should be shown as a Grafana source. 55 | datasources = await grafana_datasources(ops_test, "grafana") 56 | assert len(datasources) == 1 57 | 58 | assert "am-endpoints" not in datasources[0]["url"] 59 | -------------------------------------------------------------------------------- /tests/integration/test_persistence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2022 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
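# Note: this module verifies that user-created silences persist across a charm upgrade. A silence
# is created through the Alertmanager API, the application is refreshed from a locally built
# charm, and the same silence is expected to still be present afterwards.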
4 | 5 | import logging 6 | from datetime import datetime, timedelta, timezone 7 | from pathlib import Path 8 | 9 | import pytest 10 | import sh 11 | import yaml 12 | from helpers import get_unit_address, is_alertmanager_up 13 | from pytest_operator.plugin import OpsTest 14 | 15 | from src.alertmanager_client import Alertmanager 16 | 17 | # pyright: reportAttributeAccessIssue = false 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) 22 | app_name = METADATA["name"] 23 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 24 | 25 | 26 | @pytest.mark.abort_on_fail 27 | async def test_silences_persist_across_upgrades(ops_test: OpsTest, charm_under_test, httpserver): 28 | assert ops_test.model 29 | # deploy alertmanager charm from charmhub 30 | logger.info("deploy charm from charmhub") 31 | sh.juju.deploy("alertmanager-k8s", model=ops_test.model.name, channel="2/edge", trust=True) 32 | await ops_test.model.wait_for_idle( 33 | apps=[app_name], status="active", timeout=1000, raise_on_error=False 34 | ) 35 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=30) 36 | 37 | # set a silencer for an alert and check it is set 38 | unit_address = await get_unit_address(ops_test, app_name, 0) 39 | alertmanager = Alertmanager(f"http://{unit_address}:9093") 40 | 41 | silence_start = datetime.now(timezone.utc) 42 | silence_end = silence_start + timedelta(minutes=30) 43 | matchers = [ 44 | { 45 | "name": "alertname", 46 | "value": "fake-alert", 47 | "isRegex": False, 48 | } 49 | ] 50 | alertmanager.set_silences(matchers, silence_start, silence_end) 51 | silences_before = alertmanager.get_silences() 52 | assert len(silences_before) 53 | 54 | application = ops_test.model.applications[app_name] 55 | assert application 56 | await ops_test.model.block_until(lambda: len(application.units) > 0) 57 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 58 | assert await is_alertmanager_up(ops_test, app_name) 59 | 60 | # upgrade alertmanger using charm built locally 61 | logger.info("upgrade deployed charm with local charm %s", charm_under_test) 62 | sh.juju.refresh(app_name, model=ops_test.model.name, path=charm_under_test) 63 | await ops_test.model.wait_for_idle( 64 | apps=[app_name], status="active", timeout=1000, raise_on_error=False 65 | ) 66 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=30) 67 | assert await is_alertmanager_up(ops_test, app_name) 68 | 69 | # check silencer is still set 70 | unit_address = await get_unit_address(ops_test, app_name, 0) 71 | alertmanager = Alertmanager(f"http://{unit_address}:9093") 72 | silences_after = alertmanager.get_silences() 73 | assert len(silences_after) 74 | 75 | assert silences_before == silences_after 76 | -------------------------------------------------------------------------------- /tests/unit/helpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | """Helper functions for writing tests.""" 6 | 7 | import dataclasses 8 | from unittest.mock import patch 9 | 10 | from ops.testing import Container, Context, Exec, PeerRelation, Relation, State 11 | 12 | 13 | def no_op(*_, **__) -> None: 14 | pass 15 | 16 | 17 | def tautology(*_, **__) -> bool: 18 | return True 19 | 20 | 21 | def cli_arg(plan, cli_opt): 22 | plan_dict = plan.to_dict() 23 | args = plan_dict["services"]["alertmanager"]["command"].split() 24 | for arg in args: 25 | opt_list = arg.split("=") 26 | if len(opt_list) == 2 and opt_list[0] == cli_opt: 27 | return opt_list[1] 28 | if len(opt_list) == 1 and opt_list[0] == cli_opt: 29 | return opt_list[0] 30 | return None 31 | 32 | 33 | k8s_resource_multipatch = patch.multiple( 34 | "charm.KubernetesComputeResourcesPatch", 35 | _namespace="test-namespace", 36 | _patch=tautology, 37 | is_ready=tautology, 38 | ) 39 | 40 | 41 | def begin_with_initial_hooks_isolated(context: Context, *, leader: bool = True) -> State: 42 | container = Container( 43 | "alertmanager", 44 | can_connect=False, 45 | execs={ 46 | Exec(["update-ca-certificates", "--fresh"]), 47 | Exec( 48 | ["alertmanager", "--version"], 49 | stdout="alertmanager, version 0.23.0 (branch: HEAD, ...", 50 | ), 51 | Exec(["/usr/bin/amtool", "check-config", "/etc/alertmanager/alertmanager.yml"]), 52 | }, 53 | ) 54 | state = State(config={"config_file": ""}, containers=[container]) 55 | peer_rel = PeerRelation("replicas") 56 | 57 | state = context.run(context.on.install(), state) 58 | 59 | state = dataclasses.replace(state, relations=[peer_rel]) 60 | state = context.run(context.on.relation_created(peer_rel), state) 61 | 62 | if leader: 63 | state = dataclasses.replace(state, leader=True) 64 | state = context.run(context.on.leader_elected(), state) 65 | else: 66 | state = dataclasses.replace(state, leader=False) 67 | 68 | state = context.run(context.on.config_changed(), state) 69 | 70 | # state = state.with_can_connect("alertmanger") 71 | container = dataclasses.replace(container, can_connect=True) 72 | state = dataclasses.replace(state, containers=[container]) 73 | state = context.run(context.on.pebble_ready(container), state) 74 | 75 | state = context.run(context.on.start(), state) 76 | 77 | return state 78 | 79 | 80 | def add_relation_sequence(context: Context, state: State, relation: Relation): 81 | """Helper to simulate a relation-added sequence.""" 82 | # TODO consider adding to scenario.sequences 83 | state_with_relation = dataclasses.replace(state, relations={*state.relations, relation}) 84 | state_after_relation_created = context.run(context.on.relation_created(relation), state_with_relation) 85 | state_after_relation_joined = context.run(context.on.relation_joined(relation), state_after_relation_created) 86 | state_after_relation_changed = context.run( 87 | context.on.relation_changed(state_after_relation_joined.get_relation(relation.id)), 88 | state_after_relation_joined, 89 | ) 90 | return state_after_relation_changed 91 | -------------------------------------------------------------------------------- /tests/integration/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | import functools 6 | import logging 7 | import os 8 | import socket 9 | from collections import defaultdict 10 | from datetime import datetime 11 | from pathlib import Path 12 | 13 | import juju.utils 14 | import pytest 15 | from pytest_operator.plugin import OpsTest 16 | 17 | PYTEST_HTTP_SERVER_PORT = 8000 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class Store(defaultdict): 22 | def __init__(self): 23 | super(Store, self).__init__(Store) 24 | 25 | def __getattr__(self, key): 26 | """Override __getattr__ so dot syntax works on keys.""" 27 | try: 28 | return self[key] 29 | except KeyError: 30 | raise AttributeError(key) 31 | 32 | def __setattr__(self, key, value): 33 | """Override __setattr__ so dot syntax works on keys.""" 34 | self[key] = value 35 | 36 | 37 | store = Store() 38 | 39 | 40 | def timed_memoizer(func): 41 | @functools.wraps(func) 42 | async def wrapper(*args, **kwargs): 43 | fname = func.__qualname__ 44 | logger.info("Started: %s" % fname) 45 | start_time = datetime.now() 46 | if fname in store.keys(): 47 | ret = store[fname] 48 | else: 49 | logger.info("Return for {} not cached".format(fname)) 50 | ret = await func(*args, **kwargs) 51 | store[fname] = ret 52 | logger.info("Finished: {} in: {} seconds".format(fname, datetime.now() - start_time)) 53 | return ret 54 | 55 | return wrapper 56 | 57 | 58 | @pytest.fixture(scope="module") 59 | @timed_memoizer 60 | async def charm_under_test(ops_test: OpsTest) -> Path: 61 | """Charm used for integration testing.""" 62 | if charm_file := os.environ.get("CHARM_PATH"): 63 | return Path(charm_file) 64 | 65 | path_to_built_charm = await ops_test.build_charm(".", verbosity="debug") 66 | return path_to_built_charm 67 | 68 | 69 | @pytest.fixture(scope="session") 70 | def httpserver_listen_address(): 71 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 72 | s.settimeout(0) 73 | try: 74 | # ip address does not need to be reachable 75 | s.connect(("8.8.8.8", 1)) 76 | local_ip_address = s.getsockname()[0] 77 | except Exception: 78 | local_ip_address = "127.0.0.1" 79 | finally: 80 | s.close() 81 | return local_ip_address, PYTEST_HTTP_SERVER_PORT 82 | 83 | 84 | @pytest.fixture(autouse=True, scope="module") 85 | async def setup_env(ops_test: OpsTest): 86 | assert ops_test.model 87 | # Prevent "update-status" from interfering with the test: 88 | # - if fired "too quickly", traefik will flip between active/idle and maintenance; 89 | # - make sure charm code does not rely on update-status for correct operation. 90 | await ops_test.model.set_config( 91 | {"update-status-hook-interval": "60m", "logging-config": "=WARNING; unit=DEBUG"} 92 | ) 93 | 94 | 95 | @pytest.fixture(scope="module") 96 | def temp_dir(tmp_path_factory): 97 | return tmp_path_factory.mktemp("data") 98 | 99 | @pytest.fixture(scope="module", autouse=True) 100 | def patch_pylibjuju_series_2404(): 101 | juju.utils.ALL_SERIES_VERSIONS["noble"] = "24.04" 102 | juju.utils.UBUNTU_SERIES["noble"] = "24.04" 103 | 104 | yield 105 | 106 | del juju.utils.ALL_SERIES_VERSIONS["noble"] 107 | del juju.utils.UBUNTU_SERIES["noble"] 108 | -------------------------------------------------------------------------------- /tests/unit/test_self_scrape_jobs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
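# Note: these tests cover the charm's self-monitoring scrape job. The advertised static targets
# must include the unit itself and, once peers join the "replicas" relation, every peer address,
# all using the scheme reported by the charm.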
4 | import unittest 5 | from unittest.mock import PropertyMock, patch 6 | 7 | from helpers import k8s_resource_multipatch 8 | from ops.testing import Harness 9 | 10 | from alertmanager import WorkloadManager 11 | from charm import AlertmanagerCharm 12 | 13 | 14 | class TestWithInitialHooks(unittest.TestCase): 15 | container_name: str = "alertmanager" 16 | 17 | @patch("lightkube.core.client.GenericSyncClient") 18 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 19 | @k8s_resource_multipatch 20 | @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) 21 | def setUp(self, *unused): 22 | self.harness = Harness(AlertmanagerCharm) 23 | self.addCleanup(self.harness.cleanup) 24 | 25 | self.harness.set_leader(True) 26 | self.app_name = "am" 27 | # Create the peer relation before running harness.begin_with_initial_hooks(), because 28 | # otherwise it will create it for you and we don't know the rel_id 29 | self.peer_rel_id = self.harness.add_relation("replicas", self.app_name) 30 | 31 | self.harness.begin_with_initial_hooks() 32 | 33 | @patch.object(AlertmanagerCharm, "_internal_url", new_callable=PropertyMock) 34 | @patch.object(AlertmanagerCharm, "_scheme", new_callable=PropertyMock) 35 | def test_self_scraping_job_with_no_peers(self, _mock_scheme, _mock_internal_url): 36 | scheme = "https" 37 | _mock_scheme.return_value = scheme 38 | url_no_scheme = f"test-internal.url:{self.harness.charm._ports.api}" 39 | _mock_internal_url.return_value = f"{scheme}://{url_no_scheme}" 40 | jobs_expected = [ 41 | { 42 | "metrics_path": "/metrics", 43 | "scheme": scheme, 44 | "static_configs": [{"targets": [url_no_scheme]}], 45 | } 46 | ] 47 | 48 | jobs = self.harness.charm.self_scraping_job 49 | self.assertEqual(jobs, jobs_expected) 50 | 51 | @patch.object(WorkloadManager, "check_config") 52 | @patch.object(AlertmanagerCharm, "_internal_url", new_callable=PropertyMock) 53 | @patch.object(AlertmanagerCharm, "_scheme", new_callable=PropertyMock) 54 | def test_self_scraping_job_with_peers( 55 | self, _mock_scheme, _mock_internal_url, _mock_check_config 56 | ): 57 | scheme = "https" 58 | _mock_scheme.return_value = scheme 59 | 60 | targets = [ 61 | f"test-internal-0.url:{self.harness.charm._ports.api}", 62 | f"test-internal-1.url:{self.harness.charm._ports.api}", 63 | f"test-internal-2.url:{self.harness.charm._ports.api}", 64 | ] 65 | metrics_path = "/metrics" 66 | _mock_internal_url.return_value = f"{scheme}://{targets[0]}" 67 | 68 | jobs_expected = [ 69 | { 70 | "metrics_path": metrics_path, 71 | "scheme": scheme, 72 | "static_configs": [{"targets": targets}], 73 | } 74 | ] 75 | 76 | # Add peers 77 | for i, target in enumerate(targets[1:], 1): 78 | unit_name = f"{self.app_name}/{i}" 79 | self.harness.add_relation_unit(self.peer_rel_id, unit_name) 80 | self.harness.update_relation_data( 81 | self.peer_rel_id, unit_name, {"private_address": f"{scheme}://{target}"} 82 | ) 83 | 84 | jobs = self.harness.charm.self_scraping_job 85 | self.assertEqual(jobs_expected, jobs) 86 | -------------------------------------------------------------------------------- /tests/integration/test_rescale_charm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | """This test module tests rescaling. 6 | 7 | 1. Deploys multiple units of the charm under test and waits for them to become active 8 | 2. 
Reset and repeat the above until the leader unit is not the zero unit 9 | 3. Scales up the application by a few units and waits for them to become active 10 | 4. Scales down the application to below the leader unit, to trigger a leadership change event 11 | """ 12 | 13 | import logging 14 | from pathlib import Path 15 | 16 | import pytest 17 | import yaml 18 | from helpers import block_until_leader_elected, get_leader_unit_num, is_alertmanager_up 19 | from pytest_operator.plugin import OpsTest 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) 24 | app_name = METADATA["name"] 25 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 26 | 27 | 28 | # @pytest.mark.abort_on_fail 29 | @pytest.mark.xfail 30 | async def test_deploy_multiple_units(ops_test: OpsTest, charm_under_test): 31 | """Deploy the charm-under-test.""" 32 | assert ops_test.model 33 | logger.info("build charm from local source folder") 34 | 35 | logger.info("deploy charm") 36 | await ops_test.model.deploy( 37 | charm_under_test, application_name=app_name, resources=resources, num_units=10, trust=True 38 | ) 39 | await block_until_leader_elected(ops_test, app_name) 40 | 41 | if await get_leader_unit_num(ops_test, app_name) == 0: 42 | # We're unlucky this time: unit/0 is the leader, which means no scale down could trigger a 43 | # leadership change event. 44 | # Fail the test instead of model.reset() and repeat, because this hangs on github actions. 45 | logger.info("Elected leader is unit/0 - resetting and repeating") 46 | assert 0, "No luck in electing a leader that is not the zero unit. Try re-running?" 47 | 48 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 49 | 50 | 51 | # @pytest.mark.abort_on_fail 52 | @pytest.mark.xfail 53 | async def test_scale_down_to_single_unit_with_leadership_change(ops_test: OpsTest): 54 | """Scale down below current leader to trigger a leadership change event.""" 55 | assert ops_test.model 56 | application = ops_test.model.applications[app_name] 57 | assert application 58 | await application.scale(scale=1) 59 | await ops_test.model.wait_for_idle( 60 | apps=[app_name], status="active", timeout=1000, wait_for_exact_units=1 61 | ) 62 | assert await is_alertmanager_up(ops_test, app_name) 63 | 64 | 65 | # @pytest.mark.abort_on_fail 66 | @pytest.mark.xfail 67 | async def test_scale_up_from_single_unit(ops_test: OpsTest): 68 | """Add a few more units.""" 69 | assert ops_test.model 70 | application = ops_test.model.applications[app_name] 71 | assert application 72 | await application.scale(scale_change=2) 73 | await ops_test.model.wait_for_idle( 74 | apps=[app_name], status="active", timeout=1000, wait_for_exact_units=3 75 | ) 76 | assert await is_alertmanager_up(ops_test, app_name) 77 | 78 | 79 | # @pytest.mark.abort_on_fail 80 | @pytest.mark.xfail 81 | async def test_scale_down_to_single_unit_without_leadership_change(ops_test): 82 | """Remove a few units.""" 83 | await ops_test.model.applications[app_name].scale(scale_change=-2) 84 | await ops_test.model.wait_for_idle( 85 | apps=[app_name], status="active", timeout=1000, wait_for_exact_units=1 86 | ) 87 | assert await is_alertmanager_up(ops_test, app_name) 88 | -------------------------------------------------------------------------------- /tests/integration/test_upgrade_charm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 
Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | """This test module tests alertmanager upgrade with and without relations present. 6 | 7 | 1. Deploy the charm under test _from charmhub_. 8 | 2. Refresh with locally built charm. 9 | 3. Add all supported relations. 10 | 4. Refresh with locally built charm. 11 | 5. Add unit and refresh again (test multi unit upgrade with relations). 12 | """ 13 | 14 | import logging 15 | from pathlib import Path 16 | 17 | import pytest 18 | import sh 19 | import yaml 20 | from helpers import is_alertmanager_up 21 | from pytest_operator.plugin import OpsTest 22 | 23 | # pyright: reportAttributeAccessIssue = false 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) 28 | app_name = METADATA["name"] 29 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 30 | 31 | 32 | @pytest.mark.abort_on_fail 33 | async def test_setup_env(ops_test: OpsTest): 34 | assert ops_test.model 35 | await ops_test.model.set_config( 36 | {"update-status-hook-interval": "60m", "logging-config": "=WARNING; unit=DEBUG"} 37 | ) 38 | 39 | 40 | @pytest.mark.abort_on_fail 41 | async def test_upgrade_edge_with_local_in_isolation(ops_test: OpsTest, charm_under_test): 42 | """Build the charm-under-test, deploy the charm from charmhub, and upgrade from path.""" 43 | logger.info("deploy charm from charmhub") 44 | assert ops_test.model 45 | sh.juju.deploy(app_name, model=ops_test.model.name, channel="2/edge", trust=True) 46 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 47 | 48 | logger.info("upgrade deployed charm with local charm %s", charm_under_test) 49 | application = ops_test.model.applications[app_name] 50 | assert application 51 | sh.juju.refresh(app_name, model=ops_test.model.name, path=charm_under_test) 52 | await ops_test.model.wait_for_idle( 53 | apps=[app_name], status="active", timeout=1000, raise_on_error=False 54 | ) 55 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=30) 56 | assert await is_alertmanager_up(ops_test, app_name) 57 | 58 | 59 | @pytest.mark.abort_on_fail 60 | async def test_upgrade_local_with_local_with_relations(ops_test: OpsTest, charm_under_test): 61 | # Deploy related apps 62 | assert ops_test.model 63 | sh.juju.deploy( 64 | "prometheus-k8s", "prom", model=ops_test.model.name, channel="2/edge", trust=True 65 | ) 66 | sh.juju.deploy("karma-k8s", "karma", model=ops_test.model.name, channel="2/edge", trust=True) 67 | 68 | # Relate apps 69 | sh.juju.relate(app_name, "prom:alertmanager", model=ops_test.model.name) 70 | sh.juju.relate(app_name, "karma", model=ops_test.model.name) 71 | 72 | # Refresh from path 73 | application = ops_test.model.applications[app_name] 74 | assert application 75 | sh.juju.refresh(app_name, model=ops_test.model.name, path=charm_under_test) 76 | await ops_test.model.wait_for_idle( 77 | apps=[app_name, "prom", "karma"], 78 | status="active", 79 | timeout=2500, 80 | raise_on_error=False, 81 | ) 82 | assert await is_alertmanager_up(ops_test, app_name) 83 | 84 | 85 | @pytest.mark.abort_on_fail 86 | async def test_upgrade_with_multiple_units(ops_test: OpsTest, charm_under_test): 87 | assert ops_test.model 88 | # Add unit 89 | application = ops_test.model.applications[app_name] 90 | assert application 91 | await application.scale(scale_change=1) 92 | await ops_test.model.wait_for_idle( 93 | apps=[app_name, "prom", "karma"], 
status="active", timeout=1000 94 | ) 95 | 96 | # Refresh from path 97 | sh.juju.refresh(app_name, model=ops_test.model.name, path=charm_under_test) 98 | await ops_test.model.wait_for_idle( 99 | apps=[app_name, "prom", "karma"], status="active", timeout=2500 100 | ) 101 | assert await is_alertmanager_up(ops_test, app_name) 102 | -------------------------------------------------------------------------------- /tests/unit/test_server_scheme.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | """Feature: The workload's scheme is reflected in the pebble command and in relation data. 5 | 6 | This feature spans: 7 | - manifest generation (pebble layer) 8 | - schema generation (alertmanager_dispatch provider) 9 | 10 | The alertmanager server can serve over HTTP or HTTPS. The requirer side of the relation may be 11 | design to take URL parts rather than a full URL. Prometheus takes URL parts and would need to 12 | generate its "alertmanagers" config section differently depending on the scheme. 13 | """ 14 | 15 | import json 16 | from unittest.mock import patch 17 | 18 | import pytest 19 | from helpers import add_relation_sequence, begin_with_initial_hooks_isolated 20 | from ops.testing import Relation, State 21 | 22 | 23 | @pytest.mark.parametrize("fqdn", ["localhost", "am-0.endpoints.cluster.local"]) 24 | @pytest.mark.parametrize("leader", [True, False]) 25 | class TestServerScheme: 26 | """Scenario: The workload is deployed to operate in HTTP mode, then switched to HTTPS.""" 27 | 28 | @pytest.fixture 29 | def initial_state(self, context, fqdn, leader) -> State: # pyright: ignore 30 | """This is the initial state for this test class.""" 31 | # GIVEN an isolated alertmanager charm after the startup sequence is complete 32 | 33 | # No "tls-certificates" relation, no config options 34 | with patch("socket.getfqdn", new=lambda *args: fqdn): 35 | state = begin_with_initial_hooks_isolated(context, leader=leader) 36 | 37 | # Add relation 38 | prom_rel = Relation("alerting", id=10) 39 | state = add_relation_sequence(context, state, prom_rel) 40 | yield state # keep the patch active for so long as this fixture is needed # pyright:ignore 41 | 42 | def test_initial_state_has_http_scheme_in_pebble_layer(self, context, initial_state, fqdn): 43 | # THEN the pebble command has 'http' and the correct hostname in the 'web.external-url' arg 44 | container = initial_state.get_container("alertmanager") 45 | command = container.layers["alertmanager"].services["alertmanager"].command 46 | assert f"--web.external-url=http://{fqdn}:9093" in command 47 | 48 | @pytest.mark.xfail 49 | def test_pebble_layer_scheme_becomes_https_if_tls_relation_added( 50 | self, context, initial_state, fqdn 51 | ): 52 | # WHEN a tls_certificates relation joins 53 | ca = Relation( 54 | "certificates", 55 | id=100, 56 | remote_app_data={ 57 | "certificates": json.dumps( 58 | [ 59 | { 60 | # fixme: the problem is: instead of "placeholder" here we need a forward ref to the 61 | # CSR that AM will generate on certificates_relation_joined. 62 | # Otherwise, as it stands, charms/tls_certificates_interface/v2/tls_certificates.py:1336 will not find 63 | # this csr and ignore it. Hence no handlers are triggered. 
64 | "certificate": "placeholder", 65 | "certificate_signing_request": "placeholder", 66 | "ca": "placeholder", 67 | "chain": ["first", "second"], 68 | } 69 | ] 70 | ) 71 | }, 72 | ) # TODO figure out how to easily figure out structure of remote data 73 | state = add_relation_sequence(context, initial_state, ca) 74 | # TODO figure out why relation-changed observer in tls_certificates is not being called 75 | 76 | # THEN the pebble command has 'https' in the 'web.external-url' arg 77 | container = state.get_container("alertmanager") 78 | command = container.layers["alertmanager"].services["alertmanager"].command 79 | assert f"--web.external-url=https://{fqdn}:9093" in command 80 | 81 | def test_alerting_relation_data_scheme(self, initial_state, fqdn): 82 | # FIXME: should rely on interface tests for this kind of test. 83 | 84 | # THEN the "alerting" relation data has 'http' and the correct hostname 85 | relation = initial_state.get_relations("alerting")[0] 86 | assert relation.local_unit_data["public_address"] == f"{fqdn}:9093" 87 | assert relation.local_unit_data["scheme"] == "http" 88 | 89 | # WHEN a tls_certificates relation joins 90 | # TODO 91 | 92 | # THEN the "alerting" relation data has 'http' and the correct hostname 93 | # TODO 94 | 95 | def test_self_monitoring_scrape_job_scheme(self, fqdn, leader): 96 | # TODO 97 | pass 98 | -------------------------------------------------------------------------------- /INTEGRATING.md: -------------------------------------------------------------------------------- 1 | # Integrating alertmanager-k8s 2 | 3 | Alermanager can handle different types of relations in the `provides` side and in the `requires` side. 4 | 5 | ## Provides 6 | 7 | ### Alerting 8 | 9 | ```yaml 10 | alerting: 11 | interface: alertmanager_dispatch 12 | ``` 13 | 14 | Over the 15 | [`alertmanager_dispatch`](https://charmhub.io/alertmanager-k8s/libraries/alertmanager_dispatch) 16 | relation interface Alermanager can be related to charms that can forward alerts to it, 17 | for example: [Prometheus][Prometheus operator], [Loki][Loki operator]. 18 | 19 | ``` 20 | juju relate alertmanager-k8s:alerting prometheus-k8s:alerting 21 | ``` 22 | 23 | ### Karma dashboard 24 | 25 | ```yaml 26 | karma-dashboard: 27 | interface: karma_dashboard 28 | ``` 29 | 30 | The [`karma_dashboard`](https://charmhub.io/karma-k8s/libraries/karma_dashboard) 31 | relation interface links an entire Alertmanager cluster to a 32 | [Karma](https://charmhub.io/karma-k8s) dashboard. 33 | Scaling alertmanager would automatically cause karma to group alerts by 34 | cluster. 35 | 36 | ``` 37 | juju relate alertmanager-k8s:karma_dashboard karma-k8s:karma_dashboard 38 | ``` 39 | 40 | ### Self metrics endpoint 41 | 42 | 43 | ```yaml 44 | self-metrics-endpoint: 45 | interface: prometheus_scrape 46 | ``` 47 | This Alertmanager charm may forward information about its metrics endpoint and associated alert rules to a Prometheus charm over the `self-metrics-endpoint` relation using the [`prometheus_scrape`](https://charmhub.io/prometheus-k8s/libraries/prometheus_scrape) interface. 
In order for these metrics to be aggregated by the remote Prometheus charm, all that is required is to relate the two charms: 48 | 49 | ```bash 50 | juju relate alertmanager-k8s:self-metrics-endpoint prometheus:metrics-endpoint 51 | ``` 52 | 53 | 54 | ### Grafana dashboard 55 | 56 | ```yaml 57 | grafana-dashboard: 58 | interface: grafana_dashboard 59 | ``` 60 | 61 | Over the `grafana-dashboard` relation using the [`grafana_dashboard`](https://charmhub.io/grafana-k8s/libraries/grafana_dashboard) interface, this Alertmanager charm also provides meaningful dashboards about its metrics to be shown in a [Grafana charm](https://charmhub.io/grafana-k8s). 62 | 63 | In order to add these dashboards to Grafana, all that is required is to relate the two charms in the following way: 64 | 65 | ```bash 66 | juju relate alertmanager-k8s:grafana-dashboard grafana-k8s:grafana-dashboard 67 | ``` 68 | 69 | ### Grafana source 70 | 71 | ```yaml 72 | grafana-source: 73 | interface: grafana_datasource 74 | ``` 75 | 76 | This charm may provide a data source to Grafana through the `grafana-source` relation using the [`grafana_datasource`](https://charmhub.io/grafana-k8s/libraries/grafana_source) interface. 77 | 78 | ``` 79 | juju relate alertmanager-k8s:grafana-source grafana-k8s:grafana-source 80 | ``` 81 | 82 | ## Requires 83 | 84 | 85 | ### Ingress 86 | 87 | ```yaml 88 | ingress: 89 | interface: ingress 90 | limit: 1 91 | ``` 92 | 93 | Interactions with the Alertmanager charm cannot be assumed to originate within the same Juju model, let alone the same Kubernetes cluster, or even the same Juju cloud. Hence the charm also supports an Ingress relation. 94 | 95 | Since Alertmanager units automatically form a cluster, the charm only needs a "per app" Ingress. The ingress relation is provided by the [traefik-k8s](https://charmhub.io/traefik-k8s) charm, and this Alertmanager charm supports it over the [`ingress`](https://charmhub.io/traefik-k8s/libraries/ingress) interface. 96 | 97 | 98 | ``` 99 | juju relate alertmanager-k8s:ingress traefik-k8s:ingress 100 | ``` 101 | 102 | ### Remote Configuration 103 | 104 | ```yaml 105 | remote-configuration: 106 | interface: alertmanager_remote_configuration 107 | limit: 1 108 | ``` 109 | 110 | The remote configuration relation offers the option of configuring Alertmanager via relation data. 111 | This method assumes that another charm provides the configuration 112 | (e.g. [alertmanager-configurer-k8s]). 113 | 114 | Remote configuration and local configuration (using the charm's config parameters) are mutually 115 | exclusive. If configuration is provided through both channels simultaneously, the charm will go 116 | into `Blocked` state, awaiting conflict resolution by the user.
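For example, one way to resolve such a conflict is to clear the locally set option, leaving only the remote configuration in place. This is a sketch, assuming the local configuration was supplied through the charm's `config_file` option:

```bash
juju config alertmanager-k8s --reset config_file
```

To set up remote configuration, relate this charm to a configuration provider: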
117 | 118 | ```bash 119 | juju relate alertmanager-k8s:remote-configuration SOME_PROVIDER_CHARM:remote-configuration 120 | ``` 121 | 122 | ```mermaid 123 | graph LR 124 | 125 | subgraph observability["Observability"] 126 | alertmanager[Alertmanager] 127 | end 128 | 129 | subgraph alertmanager-configurer["Remote Configuration Provider"] 130 | am_config[alertmanager-configurer-k8s] 131 | end 132 | 133 | am_config --->|remote_configuration| alertmanager 134 | user{User} -.-> |REST API CALLS| am_config 135 | ``` 136 | 137 | [Loki operator]: https://charmhub.io/loki-k8s 138 | [Prometheus operator]: https://charmhub.io/prometheus-k8s 139 | [Karma operator]: https://charmhub.io/karma-k8s/ 140 | [alertmanager-configurer-k8s]: https://github.com/canonical/alertmanager-configurer-k8s-operator 141 | -------------------------------------------------------------------------------- /tests/integration/test_remote_configuration.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2022 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | """This test module tests remote configuration support in Alertmanager. 6 | 7 | 0. Deploy `alertmanager-k8s` and `remote-configuration-tester`. 8 | 1. Create `remote-configuration` relation. 9 | 2. Verify that the configuration provided by `remote-configuration-tester` has been applied in 10 | `alertmanager-k8s`. 11 | """ 12 | 13 | import os 14 | import shutil 15 | from pathlib import Path 16 | 17 | import helpers 18 | import pytest 19 | import sh 20 | import yaml 21 | from deepdiff import DeepDiff # type: ignore[import] 22 | from pytest_operator.plugin import OpsTest 23 | 24 | METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) 25 | APP_NAME = METADATA["name"] 26 | RESOURCES = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 27 | 28 | TESTER_CHARM_PATH = "./tests/integration/remote_configuration_tester" 29 | TESTER_APP_METADATA = yaml.safe_load( 30 | Path(os.path.join(TESTER_CHARM_PATH, "charmcraft.yaml")).read_text() 31 | ) 32 | TESTER_APP_NAME = TESTER_APP_METADATA["name"] 33 | TESTER_APP_RESOURCES = { 34 | f"{TESTER_APP_NAME}-image": TESTER_APP_METADATA["resources"][f"{TESTER_APP_NAME}-image"][ 35 | "upstream-source" 36 | ] 37 | } 38 | 39 | TESTER_CHARM_CONFIG = """route: 40 | receiver: test_receiver 41 | group_by: 42 | - alertname 43 | group_wait: 1234s 44 | group_interval: 4321s 45 | repeat_interval: 1111h 46 | receivers: 47 | - name: test_receiver 48 | """ 49 | 50 | 51 | @pytest.fixture(scope="module") 52 | async def tester_charm(ops_test: OpsTest): 53 | assert ops_test.model 54 | _copy_alertmanager_remote_configuration_library_into_tester_charm() 55 | tester_charm = await ops_test.build_charm(TESTER_CHARM_PATH) 56 | await ops_test.model.deploy( 57 | tester_charm, 58 | resources=TESTER_APP_RESOURCES, 59 | application_name=TESTER_APP_NAME, 60 | config={"config_file": TESTER_CHARM_CONFIG}, 61 | trust=True, 62 | ) 63 | await ops_test.model.wait_for_idle(apps=[TESTER_APP_NAME], status="active", timeout=1000) 64 | 65 | 66 | @pytest.fixture(scope="module") 67 | @pytest.mark.abort_on_fail 68 | async def setup(ops_test: OpsTest, charm_under_test, tester_charm): 69 | assert ops_test.model 70 | await ops_test.model.deploy( 71 | charm_under_test, 72 | resources=RESOURCES, 73 | application_name=APP_NAME, 74 | trust=True, 75 | ) 76 | await ops_test.model.wait_for_idle( 77 | apps=[APP_NAME, TESTER_APP_NAME], status="active", timeout=1000 78 
| ) 79 | 80 | 81 | @pytest.mark.abort_on_fail 82 | async def test_remote_configuration_applied_on_relation_created(ops_test: OpsTest, setup): 83 | assert ops_test.model 84 | await ops_test.model.add_relation( 85 | relation1=f"{APP_NAME}:remote-configuration", relation2=TESTER_APP_NAME 86 | ) 87 | expected_config = _add_juju_details_to_alertmanager_config(TESTER_CHARM_CONFIG) 88 | await ops_test.model.wait_for_idle( 89 | apps=[APP_NAME], 90 | status="active", 91 | timeout=1000, 92 | idle_period=5, 93 | ) 94 | 95 | _, actual_config, _ = await helpers.get_alertmanager_config_from_file( 96 | ops_test=ops_test, 97 | app_name=APP_NAME, 98 | container_name="alertmanager", 99 | config_file_path="/etc/alertmanager/alertmanager.yml", 100 | ) 101 | 102 | assert ( 103 | DeepDiff( 104 | yaml.safe_load(actual_config), 105 | yaml.safe_load(expected_config), 106 | ignore_order=True, 107 | ) 108 | == {} 109 | ) 110 | 111 | 112 | @pytest.mark.abort_on_fail 113 | async def test_remote_configuration_file_wrongly_applied(ops_test: OpsTest, setup): 114 | assert ops_test.model 115 | sh.juju( # pyright: ignore 116 | [ 117 | "config", 118 | f"{APP_NAME}", 119 | "-m", 120 | ops_test.model_name, 121 | "config_file=tests/integration/am_config.yaml", 122 | ] 123 | ) 124 | 125 | await ops_test.model.wait_for_idle( 126 | apps=[APP_NAME], 127 | status="blocked", 128 | timeout=1000, 129 | idle_period=5, 130 | ) 131 | 132 | 133 | def _copy_alertmanager_remote_configuration_library_into_tester_charm(): 134 | """Ensure that the tester charm uses the current Alertmanager Remote Configuration library.""" 135 | library_path = "lib/charms/alertmanager_k8s/v0/alertmanager_remote_configuration.py" 136 | install_path = "tests/integration/remote_configuration_tester/" + library_path 137 | shutil.copyfile(library_path, install_path) 138 | 139 | 140 | def _add_juju_details_to_alertmanager_config(config: str) -> str: 141 | juju_details = ["juju_application", "juju_model", "juju_model_uuid"] 142 | config_dict = yaml.safe_load(config) 143 | group_by = config_dict["route"]["group_by"] 144 | group_by.extend(juju_details) 145 | config_dict["route"]["group_by"] = group_by 146 | return yaml.safe_dump(config_dict) 147 | -------------------------------------------------------------------------------- /src/config_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | 4 | """Config builder for charmed alertmanager.""" 5 | 6 | import logging 7 | from dataclasses import dataclass 8 | from typing import Optional 9 | 10 | import yaml 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class ConfigError(Exception): 16 | """Custom exception for failed config updates.""" 17 | 18 | 19 | default_config = { 20 | "global": {"http_config": {"tls_config": {"insecure_skip_verify": False}}}, 21 | "route": { 22 | "group_wait": "30s", 23 | "group_interval": "5m", 24 | "repeat_interval": "1h", 25 | "receiver": "placeholder", 26 | }, 27 | "receivers": [{"name": "placeholder"}], 28 | } 29 | 30 | 31 | @dataclass(frozen=True) 32 | class ConfigSuite: 33 | """Represents all the configuration files managed by this module, and their contents.""" 34 | 35 | alertmanager: str 36 | web: Optional[str] 37 | templates: Optional[str] 38 | amtool: str 39 | 40 | 41 | class ConfigBuilder: 42 | """A 'config builder' for alertmanager.""" 43 | 44 | def __init__( 45 | self, 46 | *, 47 | api_port: int = 9093, 48 | web_route_prefix: Optional[str] = None, 49 | ): 50 | self._api_port = api_port 51 | 52 | # Sanitize `web_route_prefix` so it has a leading `/` and no trailing `/` 53 | web_route_prefix = web_route_prefix.strip("/") if web_route_prefix else "" 54 | self._web_route_prefix = "/" + web_route_prefix 55 | 56 | self._config = default_config.copy() 57 | self._templates = None 58 | self._templates_path = "/etc/alertmanager/templates.tmpl" 59 | 60 | self._cert_file_path = None 61 | self._key_file_path = None 62 | 63 | def set_config(self, config: Optional[dict]): 64 | """Set the main config file contents.""" 65 | if config is not None: 66 | self._config = config 67 | return self 68 | 69 | def set_templates(self, templates: Optional[str], path: Optional[str] = None): 70 | """Set templates.""" 71 | if templates is not None: 72 | self._templates = templates 73 | if path: 74 | self._templates_path = path 75 | return self 76 | 77 | def set_tls_server_config(self, *, cert_file_path: str, key_file_path: str): 78 | """Set TLS server config.""" 79 | self._cert_file_path = cert_file_path 80 | self._key_file_path = key_file_path 81 | return self 82 | 83 | @property 84 | def _alertmanager_config(self) -> str: 85 | config = self._config.copy() 86 | 87 | # On disk, alertmanager rewrites the config and automatically adds an empty placeholder, 88 | # `templates: []`, so `get` is more robust than `if "templates" in config`. 89 | if config.get("templates"): 90 | logger.error( 91 | "alertmanager config file must not have a 'templates' section; " 92 | "use the 'templates' config option instead." 93 | ) 94 | raise ConfigError("Invalid config file: use charm's 'templates' config option instead") 95 | 96 | if self._templates: 97 | config["templates"] = [self._templates_path] 98 | 99 | # add juju topology to "group_by" 100 | # `route` is a mandatory field so don't need to be too careful 101 | route = config.get("route", {}) 102 | group_by = set(route.get("group_by", [])) 103 | 104 | # The special value '...' disables aggregation entirely. Do not add topology in that case. 
105 | # Ref: https://prometheus.io/docs/alerting/latest/configuration/#route 106 | if group_by != {"..."}: 107 | group_by = list(group_by.union(["juju_application", "juju_model", "juju_model_uuid"])) 108 | route["group_by"] = list(group_by) 109 | config["route"] = route 110 | return yaml.safe_dump(config) 111 | 112 | @property 113 | def _amtool_config(self) -> str: 114 | # When amtool is run, it is always in the same container as alertmanager so we can use 115 | # `localhost` in the url. 116 | url = f"http://localhost:{self._api_port}" + self._web_route_prefix 117 | # Make sure url ends with `/` 118 | url = url.rstrip("/") + "/" 119 | return yaml.safe_dump({"alertmanager.url": url}) 120 | 121 | @property 122 | def _web_config(self) -> Optional[str]: 123 | if self._cert_file_path and self._key_file_path: 124 | web_config = { 125 | # https://prometheus.io/docs/prometheus/latest/configuration/https/ 126 | "tls_server_config": { 127 | # Certificate and key files for server to use to authenticate to client. 128 | "cert_file": self._cert_file_path, 129 | "key_file": self._key_file_path, 130 | }, 131 | } 132 | return yaml.safe_dump(web_config) 133 | if self._cert_file_path or self._key_file_path: 134 | raise ConfigError("Must provide both cert and key files") 135 | return None 136 | 137 | def build(self) -> ConfigSuite: 138 | """Return the entire config suite rendered.""" 139 | return ConfigSuite( 140 | alertmanager=self._alertmanager_config, 141 | web=self._web_config, 142 | templates=self._templates, 143 | amtool=self._amtool_config, 144 | ) 145 | -------------------------------------------------------------------------------- /tests/integration/test_tls_web.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2023 Ubuntu 3 | # See LICENSE file for licensing details. 
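# Integration tests for serving over TLS: deploy alertmanager alongside a local CA charm, then check that the web-config, cert and key files are created, that the server certificate's SAN contains the unit FQDN, and that the HTTPS endpoint remains reachable after a refresh.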
4 | 5 | import logging 6 | from pathlib import Path 7 | from types import SimpleNamespace 8 | 9 | import pytest 10 | import sh 11 | import yaml 12 | from helpers import curl, get_unit_address 13 | from pytest_operator.plugin import OpsTest 14 | 15 | # pyright: reportAttributeAccessIssue = false 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) 20 | alertmanager_image_rev = METADATA["resources"]["alertmanager-image"]["upstream-source"] 21 | am = SimpleNamespace(name="alertmanager", scale=1) 22 | ca = SimpleNamespace(name="ca") 23 | 24 | # FIXME change scale to 2 once the tls_certificate lib issue is fixed 25 | # https://github.com/canonical/tls-certificates-interface/issues/57 26 | 27 | 28 | @pytest.mark.abort_on_fail 29 | async def test_build_and_deploy(ops_test: OpsTest, charm_under_test): 30 | """Deploy 2 alertmanager units, related to a local CA.""" 31 | assert ops_test.model 32 | # Deploy the charm and wait for active/idle status 33 | sh.juju.deploy( 34 | charm_under_test, 35 | "alertmanager", 36 | f"--num-units={am.scale}", 37 | model=ops_test.model.name, 38 | resource=f"alertmanager-image={alertmanager_image_rev}", 39 | trust=True, 40 | ) 41 | sh.juju.deploy("self-signed-certificates", "ca", model=ops_test.model.name, channel="edge") 42 | sh.juju.relate("alertmanager:certificates", "ca", model=ops_test.model.name) 43 | 44 | await ops_test.model.wait_for_idle( 45 | apps=["alertmanager", "ca"], 46 | status="active", 47 | raise_on_error=False, 48 | timeout=600, 49 | idle_period=30, 50 | ) 51 | 52 | 53 | @pytest.mark.abort_on_fail 54 | async def test_tls_files_created(ops_test: OpsTest): 55 | """Make sure charm code created web-config, cert and key files.""" 56 | # juju ssh --container alertmanager am/0 ls /etc/alertmanager/ 57 | config_path = "/etc/alertmanager/" 58 | for i in range(am.scale): 59 | unit_name = f"{am.name}/{i}" 60 | rc, stdout, stderr = await ops_test.juju( 61 | "ssh", "--container", "alertmanager", unit_name, "ls", f"{config_path}" 62 | ) 63 | logger.info("%s: contents of %s: %s", unit_name, config_path, stdout or stderr) 64 | 65 | 66 | @pytest.mark.abort_on_fail 67 | async def test_server_cert(ops_test: OpsTest): 68 | """Inspect server cert and confirm `X509v3 Subject Alternative Name` field is as expected.""" 69 | # echo \ 70 | # | openssl s_client -showcerts -servername $IPADDR:9093 -connect $IPADDR:9093 2>/dev/null \ 71 | # | openssl x509 -inform pem -noout -text 72 | am_ip_addrs = [await get_unit_address(ops_test, am.name, i) for i in range(am.scale)] 73 | for am_ip in am_ip_addrs: 74 | cmd = [ 75 | "sh", 76 | "-c", 77 | f"echo | openssl s_client -showcerts -servername {am_ip}:9093 -connect {am_ip}:9093 2>/dev/null | openssl x509 -inform pem -noout -text", 78 | ] 79 | retcode, stdout, stderr = await ops_test.run(*cmd) 80 | fqdn = f"{am.name}-0.{am.name}-endpoints.{ops_test.model_name}.svc.cluster.local" 81 | assert fqdn in stdout 82 | 83 | 84 | @pytest.mark.abort_on_fail 85 | async def test_https_reachable(ops_test: OpsTest, temp_dir): 86 | """Make sure alertmanager's https endpoint is reachable using curl and ca cert.""" 87 | for i in range(am.scale): 88 | # Save CA cert locally 89 | # juju show-unit am/0 --format yaml | yq '.am/0."relation-info"[0]."local-unit".data.ca' > /tmp/cacert.pem 90 | # juju run ca/0 get-ca-certificate --format json | jq -r '."ca/0".results."ca-certificate"' > internal.cert 91 | cmd = [ 92 | "sh", 93 | "-c", 94 | f'juju run {ca.name}/0 get-ca-certificate --format json 
| jq -r \'."{ca.name}/0".results."ca-certificate"\'', 95 | ] 96 | logger.info("Obtaining CA cert with command: %s", " ".join(cmd)) 97 | retcode, stdout, stderr = await ops_test.run(*cmd) 98 | cert = stdout 99 | cert_path = temp_dir / "local.cert" 100 | with open(cert_path, "wt") as f: 101 | f.writelines(cert) 102 | 103 | # Confirm alertmanager TLS endpoint reachable 104 | # curl --fail-with-body --capath /tmp --cacert /tmp/cacert.pem https://alertmanager.local:9093/-/ready 105 | ip_addr = await get_unit_address(ops_test, am.name, i) 106 | fqdn = f"{am.name}-0.{am.name}-endpoints.{ops_test.model_name}.svc.cluster.local" 107 | response = await curl( 108 | ops_test, 109 | cert_dir=temp_dir, 110 | cert_path=cert_path, 111 | ip_addr=ip_addr, 112 | mock_url=f"https://{fqdn}:9093/-/ready", 113 | ) 114 | assert "OK" in response 115 | 116 | 117 | @pytest.mark.abort_on_fail 118 | async def test_https_still_reachable_after_refresh(ops_test: OpsTest, charm_under_test, temp_dir): 119 | """Make sure alertmanager's https endpoint is still reachable after an upgrade.""" 120 | assert ops_test.model 121 | sh.juju.refresh("alertmanager", model=ops_test.model.name, path=charm_under_test) 122 | await ops_test.model.wait_for_idle( 123 | apps=["alertmanager", "ca"], 124 | status="active", 125 | raise_on_error=False, 126 | timeout=600, 127 | idle_period=30, 128 | ) 129 | await test_https_reachable(ops_test, temp_dir) 130 | -------------------------------------------------------------------------------- /tests/integration/test_templates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2022 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | import json 6 | import logging 7 | import time 8 | from pathlib import Path 9 | 10 | import pytest 11 | import sh 12 | import yaml 13 | from helpers import is_alertmanager_up 14 | from pytest_operator.plugin import OpsTest 15 | from werkzeug.wrappers import Request, Response 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) 20 | app_name = METADATA["name"] 21 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 22 | receiver_name = "fake-receiver" 23 | 24 | # Define the template to use for testing the charm correctly passes it to the workload. 
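# The callback id is derived from the current epoch time so that each test run injects a unique value, which the receiver-side assertion at the end of this module can match unambiguously.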
25 | callback_id = str(int(time.time())) # The slack callback id 26 | template = r'{{ define "slack.default.callbackid" }}' + callback_id + "{{ end }}" 27 | 28 | 29 | @pytest.mark.abort_on_fail 30 | async def test_build_and_deploy(ops_test: OpsTest, charm_under_test): 31 | # deploy charm from local source folder 32 | assert ops_test.model 33 | await ops_test.model.deploy( 34 | charm_under_test, resources=resources, application_name=app_name, trust=True 35 | ) 36 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 37 | application = ops_test.model.applications[app_name] 38 | assert application 39 | assert application.units[0].workload_status == "active" 40 | assert await is_alertmanager_up(ops_test, app_name) 41 | 42 | 43 | @pytest.mark.abort_on_fail 44 | async def test_configure_alertmanager_with_templates(ops_test: OpsTest, httpserver): 45 | # define the alertmanager configuration 46 | assert ops_test.model 47 | aconfig = { 48 | "global": {"http_config": {"tls_config": {"insecure_skip_verify": True}}}, 49 | "route": { 50 | "group_by": ["alertname"], 51 | "group_wait": "3s", 52 | "group_interval": "5m", 53 | "repeat_interval": "1h", 54 | "receiver": receiver_name, 55 | }, 56 | "receivers": [ 57 | { 58 | "name": receiver_name, 59 | "slack_configs": [ 60 | { 61 | "api_url": httpserver.url_for("/"), 62 | "channel": "test", 63 | "text": r"https://localhost/alerts/{{ .GroupLabels.alertname }}", 64 | } 65 | ], 66 | } 67 | ], 68 | } 69 | 70 | # set alertmanager configuration and template file 71 | application = ops_test.model.applications[app_name] 72 | assert application 73 | await application.set_config( 74 | {"config_file": yaml.safe_dump(aconfig), "templates_file": template} 75 | ) 76 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=60) 77 | 78 | 79 | @pytest.mark.abort_on_fail 80 | async def test_receiver_gets_alert(ops_test: OpsTest, httpserver): 81 | request_from_alertmanager = None 82 | 83 | def request_handler(request: Request): 84 | """A request handler. 
85 | 86 | Alertmanager's POST request to a slack server looks like this: 87 | 88 | {'attachments': [{'callback_id': '2', 89 | 'color': 'danger', 90 | 'fallback': '[FIRING:1] fake-alert alertmanager-k8s ' 91 | 'test-templates-klzm 1234 | ' 92 | 'http://alertmanager-k8s-0.fqdn:9093/#/alerts?receiver=name', 93 | 'footer': '', 94 | 'mrkdwn_in': ['fallback', 'pretext', 'text'], 95 | 'text': 'https://localhost/alerts/fake-alert', 96 | 'title': '[FIRING:1] fake-alert alertmanager-k8s ' 97 | 'test-templates-klzm 1234 ', 98 | 'title_link': 'http://alertmanager-k8s-0.fqdn:9093/#/alerts?receiver=name'}], 99 | 'channel': 'test', 100 | 'username': 'Alertmanager'} 101 | """ 102 | nonlocal request_from_alertmanager 103 | response = Response("OK", status=200, content_type="text/plain") 104 | request_from_alertmanager = json.loads(request.data.decode("utf-8")) 105 | logger.info("Got Request Data : %s", request_from_alertmanager) 106 | return response 107 | 108 | # set the alert 109 | with httpserver.wait(timeout=120) as waiting: 110 | # expect an alert to be forwarded to the receiver 111 | httpserver.expect_oneshot_request("/", method="POST").respond_with_handler(request_handler) 112 | 113 | # Use amtool to fire a stand-in alert 114 | sh.juju( # pyright: ignore 115 | [ 116 | "ssh", 117 | "-m", 118 | ops_test.model_name, 119 | "--container", 120 | "alertmanager", 121 | f"{app_name}/0", 122 | "amtool", 123 | "alert", 124 | "add", 125 | "foo", 126 | "node=bar", 127 | "status=firing", 128 | "juju_model_uuid=1234", 129 | f"juju_application={app_name}", 130 | "juju_model=model_name", 131 | "--annotation=summary=summary", 132 | ] 133 | ) 134 | 135 | # check receiver got an alert 136 | assert waiting.result 137 | assert request_from_alertmanager["attachments"][0]["callback_id"] == callback_id # type: ignore 138 | -------------------------------------------------------------------------------- /tests/unit/test_external_url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | import logging 6 | import unittest 7 | from typing import Optional 8 | from unittest.mock import patch 9 | 10 | import ops 11 | import yaml 12 | from helpers import cli_arg, k8s_resource_multipatch 13 | from ops.testing import Harness 14 | 15 | from alertmanager import WorkloadManager 16 | from charm import AlertmanagerCharm 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | ops.testing.SIMULATE_CAN_CONNECT = True # pyright: ignore 21 | CONTAINER_NAME = "alertmanager" 22 | SERVICE_NAME = AlertmanagerCharm._service_name 23 | 24 | 25 | class TestExternalUrl(unittest.TestCase): 26 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 27 | @patch("socket.getfqdn", new=lambda *args: "fqdn") 28 | @k8s_resource_multipatch 29 | @patch("lightkube.core.client.GenericSyncClient") 30 | @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) 31 | def setUp(self, *unused): 32 | self.harness = Harness(AlertmanagerCharm) 33 | self.harness.set_model_name(self.__class__.__name__) 34 | self.addCleanup(self.harness.cleanup) 35 | self.harness.set_leader(True) 36 | 37 | # Peer relation 38 | self.app_name = "alertmanager-k8s" 39 | self.peer_rel_id = self.harness.add_relation("replicas", self.app_name) 40 | 41 | # Regular relation 42 | self.rel_id = self.harness.add_relation("alerting", "otherapp") 43 | self.harness.add_relation_unit(self.rel_id, "otherapp/0") 44 | 45 | self.harness.begin_with_initial_hooks() 46 | self.fqdn_url = f"http://fqdn:{self.harness.charm.api_port}" 47 | 48 | def get_url_cli_arg(self) -> Optional[str]: 49 | plan = self.harness.get_container_pebble_plan(CONTAINER_NAME) 50 | return cli_arg(plan, "--web.external-url") 51 | 52 | def get_cluster_args(self): 53 | plan = self.harness.get_container_pebble_plan(CONTAINER_NAME).to_dict() 54 | args = plan.get("services", {}).get(SERVICE_NAME, {}).get("command", "").split() 55 | cluster_args = filter(lambda s: s.startswith("--cluster.peer="), args) 56 | cluster_args = sorted((s.split("=")[1] for s in cluster_args)) 57 | return cluster_args 58 | 59 | def is_service_running(self) -> bool: 60 | # service = plan.services.get(self.harness.charm._service_name) 61 | service = self.harness.model.unit.get_container(CONTAINER_NAME).get_service(SERVICE_NAME) 62 | return service.is_running() 63 | 64 | @unittest.skip("https://github.com/canonical/operator/issues/736") 65 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 66 | @patch("socket.getfqdn", new=lambda *args: "fqdn") 67 | @k8s_resource_multipatch 68 | def test_traefik_overrides_fqdn(self): 69 | """The config option for external url must override all other external urls.""" 70 | # GIVEN a charm with the fqdn as its external URL 71 | self.assertEqual(self.get_url_cli_arg(), self.fqdn_url) 72 | self.assertTrue(self.is_service_running()) 73 | self.assertEqual(self.harness.charm._external_url, self.fqdn_url) 74 | 75 | # WHEN a relation with traefik is formed but ingress isn't ready 76 | rel_id = self.harness.add_relation("ingress", "traefik-app") 77 | self.harness.add_relation_unit(rel_id, "traefik-app/0") 78 | 79 | # THEN there is no change to the cli arg 80 | self.assertEqual(self.get_url_cli_arg(), self.fqdn_url) 81 | self.assertTrue(self.is_service_running()) 82 | self.assertEqual(self.harness.charm._external_url, self.fqdn_url) 83 | 84 | # WHEN ingress becomes available 85 | external_url_ingress = "http://foo.bar.ingress:80/path/to/mdl-alertmanager-k8s" 86 | app_data = {"ingress": yaml.safe_dump({"url": 
external_url_ingress})} 87 | self.harness.update_relation_data(rel_id, "traefik-app", app_data) 88 | 89 | # THEN the external url from the ingress relation overrides the fqdn 90 | self.assertEqual(self.get_url_cli_arg(), external_url_ingress) 91 | self.assertTrue(self.is_service_running()) 92 | 93 | # NOTE intentionally not emptying out relation data manually 94 | # FIXME: figure out if we do or do not need to manually empty out relation-data 95 | # before relation-broken is emitted. 96 | # https://github.com/canonical/operator/issues/888 97 | app_data = {"ingress": ""} 98 | self.harness.update_relation_data(rel_id, "traefik-app", app_data) 99 | 100 | # AND WHEN the traefik relation is removed 101 | self.harness.remove_relation_unit(rel_id, "traefik-app/0") 102 | self.harness.remove_relation(rel_id) 103 | 104 | # THEN the fqdn is used as external url 105 | self.assertEqual(self.get_url_cli_arg(), self.fqdn_url) 106 | 107 | @unittest.skip("https://github.com/canonical/operator/issues/736") 108 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 109 | @patch("socket.getfqdn", new=lambda *args: "fqdn-0") 110 | @k8s_resource_multipatch 111 | def test_cluster_addresses(self, *_): 112 | # GIVEN an alertmanager charm with 3 units in total 113 | for u in [1, 2]: 114 | unit_name = self.app_name + f"/{u}" 115 | self.harness.add_relation_unit(self.peer_rel_id, unit_name) 116 | self.harness.update_relation_data( 117 | self.peer_rel_id, unit_name, {"private_address": f"http://fqdn-{u}:9093"} 118 | ) 119 | 120 | # THEN the `--cluster.peer` args are made up of the hostname and HA port 121 | cluster_args = self.get_cluster_args() 122 | self.assertEqual(cluster_args, ["fqdn-1:9094", "fqdn-2:9094"]) # cluster is on ha-port 123 | -------------------------------------------------------------------------------- /tests/unit/test_alertmanager_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | import json 6 | import unittest 7 | from datetime import datetime, timedelta, timezone 8 | from unittest.mock import patch 9 | 10 | from alertmanager_client import Alertmanager, AlertmanagerBadResponse 11 | 12 | 13 | class TestAlertmanagerAPIClient(unittest.TestCase): 14 | def setUp(self): 15 | self.path = "custom/path" 16 | self.api = Alertmanager(f"http://address:12345/{self.path}/") 17 | 18 | def test_base_url(self): 19 | """Check that regardless of the passed url, base_url ends with a slash.""" 20 | api_with_slash = Alertmanager(f"http://address:12345/{self.path}/") 21 | self.assertEqual(f"http://address:12345/{self.path}/", api_with_slash.base_url) 22 | api_without_slash = Alertmanager(f"http://address:12345/{self.path}") 23 | self.assertEqual(f"http://address:12345/{self.path}/", api_without_slash.base_url) 24 | 25 | @patch("alertmanager_client.urllib.request.urlopen") 26 | def test_reload_succeed(self, urlopen_mock): 27 | urlopen_mock.return_value.code = 200 28 | urlopen_mock.return_value.reason = "OK" 29 | 30 | self.api.reload() 31 | urlopen_mock.assert_called() 32 | 33 | @patch("alertmanager_client.urllib.request.urlopen") 34 | def test_status_succeed(self, urlopen_mock): 35 | urlopen_mock.return_value.read = lambda: json.dumps({"status": "fake"}) 36 | urlopen_mock.return_value.code = 200 37 | urlopen_mock.return_value.reason = "OK" 38 | 39 | status = self.api.status() 40 | self.assertIsNotNone(status) 41 | self.assertDictEqual({"status": "fake"}, status) 42 | 43 | def test_reload_and_status_fail(self): 44 | def mock_connection_error(*args, **kwargs): 45 | import urllib.error 46 | 47 | raise urllib.error.HTTPError( 48 | url="mock://url", 49 | code=500, 50 | msg="mock msg", 51 | hdrs={"mock hdr": "mock smth"}, # type: ignore[arg-type] 52 | fp=None, 53 | ) 54 | 55 | with patch("alertmanager_client.urllib.request.urlopen", mock_connection_error): 56 | self.assertRaises(AlertmanagerBadResponse, self.api.reload) 57 | 58 | with patch("alertmanager_client.urllib.request.urlopen", mock_connection_error): 59 | self.assertRaises(AlertmanagerBadResponse, self.api.status) 60 | 61 | @patch("alertmanager_client.urllib.request.urlopen") 62 | def test_version(self, urlopen_mock): 63 | urlopen_mock.return_value.read = lambda: json.dumps({"versionInfo": {"version": "0.1.2"}}) 64 | urlopen_mock.return_value.code = 200 65 | urlopen_mock.return_value.reason = "OK" 66 | 67 | self.assertEqual(self.api.version, "0.1.2") 68 | 69 | @patch("alertmanager_client.urllib.request.urlopen") 70 | def test_alerts_can_be_set(self, urlopen_mock): 71 | msg = "HTTP 200 OK" 72 | urlopen_mock.return_value = msg 73 | alerts = [ 74 | { 75 | "startsAt": datetime.now().isoformat("T"), 76 | "status": "firing", 77 | "annotations": { 78 | "summary": "A fake alert", 79 | }, 80 | "labels": { 81 | "alertname": "fake alert", 82 | }, 83 | } 84 | ] 85 | status = self.api.set_alerts(alerts) 86 | urlopen_mock.assert_called() 87 | self.assertEqual(status, msg) 88 | 89 | @patch("alertmanager_client.urllib.request.urlopen") 90 | def test_available_alerts_are_returned(self, urlopen_mock): 91 | fake_alerts = [ 92 | { 93 | "labels": {"name": "fake-alert"}, 94 | "startsAt": datetime.now().isoformat("T"), 95 | } 96 | ] 97 | urlopen_mock.return_value.read = lambda: json.dumps(fake_alerts) 98 | urlopen_mock.return_value.code = 200 99 | urlopen_mock.return_value.reason = "OK" 100 | 101 | alerts = self.api.get_alerts() 102 | self.assertListEqual(alerts, fake_alerts) 103 | 104 | @patch("alertmanager_client.urllib.request.urlopen") 105 | def 
test_silences_can_be_set(self, urlopen_mock): 106 | msg = "HTTP 200 OK" 107 | urlopen_mock.return_value = msg 108 | matchers = [ 109 | { 110 | "name": "alertname", 111 | "value": "fake-alert", 112 | "isRegex": False, 113 | } 114 | ] 115 | silence_start = datetime.now(timezone.utc) 116 | silence_end = silence_start + timedelta(minutes=60) 117 | status = self.api.set_silences( 118 | matchers=matchers, start_time=silence_start, end_time=silence_end 119 | ) 120 | urlopen_mock.assert_called() 121 | self.assertEqual(status, msg) 122 | 123 | @patch("alertmanager_client.urllib.request.urlopen") 124 | def test_available_silences_are_returned(self, urlopen_mock): 125 | fake_silences = [ 126 | { 127 | "id": "fake-silencer", 128 | "status": {"state": "active"}, 129 | "startsAt": datetime.now().isoformat("T"), 130 | "endsAt": (datetime.now() + timedelta(minutes=60)).isoformat("T"), 131 | "matchers": [ 132 | { 133 | "name": "alertname", 134 | "value": "fake-alert", 135 | "isRegex": False, 136 | } 137 | ], 138 | } 139 | ] 140 | urlopen_mock.return_value.read = lambda: json.dumps(fake_silences) 141 | urlopen_mock.return_value.code = 200 142 | urlopen_mock.return_value.reason = "OK" 143 | 144 | alerts = self.api.get_silences() 145 | self.assertListEqual(alerts, fake_silences) 146 | 147 | @patch("alertmanager_client.urllib.request.urlopen") 148 | def test_silences_can_be_deleted(self, urlopen_mock): 149 | msg = "HTTP 200 OK" 150 | urlopen_mock.return_value = msg 151 | 152 | status = self.api.delete_silence("fake-id") 153 | urlopen_mock.assert_called() 154 | self.assertEqual(status, msg) 155 | -------------------------------------------------------------------------------- /tests/unit/test_remote_configuration_requirer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | 4 | import json 5 | import logging 6 | import unittest 7 | from typing import cast 8 | from unittest.mock import patch 9 | 10 | import yaml 11 | from charms.alertmanager_k8s.v0.alertmanager_remote_configuration import ( 12 | DEFAULT_RELATION_NAME, 13 | ) 14 | from deepdiff import DeepDiff # type: ignore[import] 15 | from helpers import k8s_resource_multipatch 16 | from ops import testing 17 | from ops.model import BlockedStatus 18 | 19 | from alertmanager import WorkloadManager 20 | from charm import AlertmanagerCharm 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | testing.SIMULATE_CAN_CONNECT = True # pyright: ignore 25 | 26 | TEST_ALERTMANAGER_CONFIG_FILE = "/test/rules/dir/config_file.yml" 27 | TEST_ALERTMANAGER_DEFAULT_CONFIG = """route: 28 | receiver: placeholder 29 | receivers: 30 | - name: placeholder 31 | """ 32 | TEST_ALERTMANAGER_REMOTE_CONFIG = """receivers: 33 | - name: test_receiver 34 | route: 35 | receiver: test_receiver 36 | group_by: 37 | - alertname 38 | group_wait: 1234s 39 | group_interval: 4321s 40 | repeat_interval: 1111h 41 | """ 42 | 43 | 44 | @patch("subprocess.run") 45 | class TestAlertmanagerRemoteConfigurationRequirer(unittest.TestCase): 46 | @patch("subprocess.run") 47 | @patch("lightkube.core.client.GenericSyncClient") 48 | @patch.object(AlertmanagerCharm, "_update_ca_certs", lambda *a, **kw: None) 49 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 50 | @k8s_resource_multipatch 51 | def setUp(self, *_) -> None: 52 | self.harness = testing.Harness(AlertmanagerCharm) 53 | self.addCleanup(self.harness.cleanup) 54 | self.harness.set_leader(True) 55 | 56 | self.harness.handle_exec("alertmanager", ["update-ca-certificates", "--fresh"], result="") 57 | self.harness.handle_exec( 58 | "alertmanager", 59 | [WorkloadManager._amtool_path, "check-config", AlertmanagerCharm._config_path], 60 | result="", 61 | ) 62 | 63 | # TODO: Once we're on ops 2.0.0+ this can be removed as begin_with_initial_hooks() 64 | # now does it. 65 | self.harness.set_can_connect("alertmanager", True) 66 | 67 | # In ops 2.0.0+, we need to mock the version, as begin_with_initial_hooks() now triggers 68 | # pebble-ready, which attempts to obtain the workload version. 
69 | patcher = patch.object( 70 | WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0") 71 | ) 72 | self.mock_version = patcher.start() 73 | self.addCleanup(patcher.stop) 74 | 75 | self.harness.begin_with_initial_hooks() 76 | 77 | self.relation_id = self.harness.add_relation( 78 | DEFAULT_RELATION_NAME, "remote-config-provider" 79 | ) 80 | self.harness.add_relation_unit(self.relation_id, "remote-config-provider/0") 81 | 82 | @k8s_resource_multipatch 83 | def test_valid_config_pushed_to_relation_data_bag_updates_alertmanager_config( 84 | self, 85 | *_, 86 | ): 87 | expected_config = remote_config = yaml.safe_load(TEST_ALERTMANAGER_REMOTE_CONFIG) 88 | # add juju topology to "group_by" 89 | route = cast(dict, expected_config.get("route", {})) 90 | route["group_by"] = list( 91 | set(route.get("group_by", [])).union( 92 | ["juju_application", "juju_model", "juju_model_uuid"] 93 | ) 94 | ) 95 | expected_config["route"] = route 96 | 97 | self.harness.update_relation_data( 98 | relation_id=self.relation_id, 99 | app_or_unit="remote-config-provider", 100 | key_values={"alertmanager_config": json.dumps(remote_config)}, 101 | ) 102 | config = self.harness.charm.container.pull(self.harness.charm._config_path) 103 | 104 | self.assertEqual( 105 | DeepDiff(yaml.safe_load(config.read()), expected_config, ignore_order=True), 106 | {}, 107 | ) 108 | 109 | @k8s_resource_multipatch 110 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 111 | def test_configs_available_from_both_relation_data_bag_and_charm_config_block_charm( 112 | self, 113 | *_, 114 | ): 115 | sample_remote_config = yaml.safe_load(TEST_ALERTMANAGER_REMOTE_CONFIG) 116 | self.harness.update_relation_data( 117 | relation_id=self.relation_id, 118 | app_or_unit="remote-config-provider", 119 | key_values={"alertmanager_config": json.dumps(sample_remote_config)}, 120 | ) 121 | self.harness.update_config({"config_file": TEST_ALERTMANAGER_DEFAULT_CONFIG}) 122 | 123 | self.assertEqual( 124 | self.harness.charm.unit.status, BlockedStatus("Multiple configs detected") 125 | ) 126 | 127 | @patch("config_builder.default_config", yaml.safe_load(TEST_ALERTMANAGER_DEFAULT_CONFIG)) 128 | @k8s_resource_multipatch 129 | def test_invalid_config_pushed_to_the_relation_data_bag_does_not_update_alertmanager_config( 130 | self, 131 | *_, 132 | ): 133 | invalid_config = yaml.safe_load("some: invalid_config") 134 | 135 | self.harness.update_relation_data( 136 | relation_id=self.relation_id, 137 | app_or_unit="remote-config-provider", 138 | key_values={"alertmanager_config": json.dumps(invalid_config)}, 139 | ) 140 | config = self.harness.charm.container.pull(self.harness.charm._config_path) 141 | 142 | self.assertNotIn("invalid_config", yaml.safe_load(config.read())) 143 | 144 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 145 | @k8s_resource_multipatch 146 | def test_templates_pushed_to_relation_data_bag_are_saved_to_templates_file_in_alertmanager( 147 | self, 148 | *_, 149 | ): 150 | sample_remote_config = yaml.safe_load(TEST_ALERTMANAGER_REMOTE_CONFIG) 151 | test_template = '{{define "myTemplate"}}do something{{end}}' 152 | 153 | self.harness.update_relation_data( 154 | relation_id=self.relation_id, 155 | app_or_unit="remote-config-provider", 156 | key_values={ 157 | "alertmanager_config": json.dumps(sample_remote_config), 158 | "alertmanager_templates": json.dumps([test_template]), 159 | }, 160 | ) 161 | updated_templates = 
self.harness.charm.container.pull(self.harness.charm._templates_path) 162 | 163 | self.assertEqual(updated_templates.read(), test_template) 164 | -------------------------------------------------------------------------------- /tests/unit/test_push_config_to_workload_on_startup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | import logging 6 | import unittest 7 | from unittest.mock import patch 8 | 9 | import hypothesis.strategies as st 10 | import ops 11 | import validators 12 | import yaml 13 | from helpers import k8s_resource_multipatch 14 | from hypothesis import given 15 | from ops.model import ActiveStatus, BlockedStatus 16 | from ops.testing import Harness 17 | 18 | from alertmanager import WorkloadManager 19 | from charm import AlertmanagerCharm 20 | 21 | logger = logging.getLogger(__name__) 22 | ops.testing.SIMULATE_CAN_CONNECT = True # pyright: ignore 23 | CONTAINER_NAME = "alertmanager" 24 | 25 | 26 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("0.0.0", "")) 27 | @patch("subprocess.run") 28 | class TestPushConfigToWorkloadOnStartup(unittest.TestCase): 29 | """Feature: Push config to workload on startup. 30 | 31 | Background: Charm starts up with initial hooks. 32 | """ 33 | 34 | @patch("subprocess.run") 35 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("0.0.0", "")) 36 | @k8s_resource_multipatch 37 | @patch("lightkube.core.client.GenericSyncClient") 38 | @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) 39 | def setUp(self, *_): 40 | self.harness = Harness(AlertmanagerCharm) 41 | self.addCleanup(self.harness.cleanup) 42 | 43 | # self.harness.charm.app.name does not exist before .begin() 44 | # https://github.com/canonical/operator/issues/675 45 | self.app_name = "alertmanager-k8s" 46 | self.peer_rel_id = self.harness.add_relation("replicas", self.app_name) 47 | self.harness.begin_with_initial_hooks() 48 | 49 | @given(st.booleans()) 50 | def test_single_unit_cluster(self, is_leader, _): 51 | """Scenario: Current unit is the only unit present.""" 52 | # WHEN only one unit is 53 | self.assertEqual(self.harness.model.app.planned_units(), 1) 54 | self.harness.set_leader(is_leader) 55 | 56 | # THEN amtool config is rendered 57 | amtool_config = yaml.safe_load( 58 | self.harness.charm.container.pull(self.harness.charm._amtool_config_path) 59 | ) 60 | self.assertTrue(validators.url(amtool_config["alertmanager.url"], simple_host=True)) 61 | 62 | # AND alertmanager config is rendered 63 | am_config = yaml.safe_load( 64 | self.harness.charm.container.pull(self.harness.charm._config_path) 65 | ) 66 | self.assertGreaterEqual(am_config.keys(), {"global", "route", "receivers"}) 67 | 68 | # AND path to config file is part of pebble layer command 69 | command = ( 70 | self.harness.get_container_pebble_plan(self.harness.charm._container_name) 71 | .services[self.harness.charm._service_name] 72 | .command 73 | ) 74 | self.assertIn(f"--config.file={self.harness.charm._config_path}", command) 75 | 76 | # AND peer clusters cli arg is not present in pebble layer command 77 | self.assertNotIn("--cluster.peer=", command) 78 | 79 | @unittest.skip("https://github.com/canonical/operator/issues/736") 80 | @k8s_resource_multipatch 81 | def test_multi_unit_cluster(self, *_): 82 | """Scenario: Current unit is a part of a multi-unit cluster.""" 83 | self.harness.set_leader(False) 84 | 
85 | # WHEN multiple units are present 86 | num_units = 3 87 | for i in range(1, num_units): 88 | self.harness.add_relation_unit(self.peer_rel_id, f"{self.app_name}/{i}") 89 | self.harness.update_relation_data( 90 | self.peer_rel_id, 91 | f"{self.app_name}/{i}", 92 | {"private_address": f"http://fqdn-{i}"}, 93 | ) 94 | 95 | self.assertEqual(self.harness.model.app.planned_units(), num_units) 96 | 97 | # THEN peer clusters cli arg is present in pebble layer command 98 | command = ( 99 | self.harness.get_container_pebble_plan(self.harness.charm._container_name) 100 | .services[self.harness.charm._service_name] 101 | .command 102 | ) 103 | self.assertIn("--cluster.peer=", command) 104 | 105 | def test_charm_blocks_on_connection_error(self, *_): 106 | self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus) 107 | self.harness.set_can_connect(CONTAINER_NAME, False) 108 | self.harness.update_config({"templates_file": "doesn't matter"}) 109 | self.assertNotIsInstance(self.harness.charm.unit.status, ActiveStatus) 110 | 111 | 112 | @patch("subprocess.run") 113 | class TestInvalidConfig(unittest.TestCase): 114 | """Feature: Charm must block when invalid config is provided. 115 | 116 | Background: alertmanager exits when config is invalid, so this must be guarded against, 117 | otherwise pebble will keep trying to restart it, resulting in an idle crash-loop. 118 | """ 119 | 120 | def setUp(self): 121 | self.harness = Harness(AlertmanagerCharm) 122 | self.addCleanup(self.harness.cleanup) 123 | 124 | self.harness.handle_exec("alertmanager", ["update-ca-certificates", "--fresh"], result="") 125 | 126 | @k8s_resource_multipatch 127 | @patch("lightkube.core.client.GenericSyncClient") 128 | @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) 129 | def test_charm_blocks_on_invalid_config_on_startup(self, *_): 130 | # GIVEN an invalid config file 131 | self.harness.update_config({"config_file": "templates: [wrong]"}) 132 | 133 | # WHEN the charm starts 134 | self.harness.begin_with_initial_hooks() 135 | 136 | # THEN the charm goes into blocked status 137 | self.assertIsInstance(self.harness.charm.unit.status, BlockedStatus) 138 | 139 | @k8s_resource_multipatch 140 | @patch("lightkube.core.client.GenericSyncClient") 141 | @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) 142 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("0.0.0", "")) 143 | def test_charm_blocks_on_invalid_config_changed(self, *_): 144 | # GIVEN a valid configuration 145 | self.harness.update_config({"config_file": "templates: []"}) 146 | 147 | # WHEN the charm starts 148 | self.harness.begin_with_initial_hooks() 149 | 150 | # THEN the charm goes into active status 151 | self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus) 152 | 153 | # AND WHEN the config is updated and invalid (mocked below) 154 | self.harness.update_config({"config_file": "templates: [wrong]"}) 155 | 156 | # THEN the charm goes into blocked status 157 | self.assertIsInstance(self.harness.charm.unit.status, BlockedStatus) 158 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to alertmanager-k8s 2 | ![GitHub](https://img.shields.io/github/license/canonical/alertmanager-k8s-operator) 3 | ![GitHub commit activity](https://img.shields.io/github/commit-activity/y/canonical/alertmanager-k8s-operator) 4 | 
![GitHub](https://img.shields.io/tokei/lines/github/canonical/alertmanager-k8s-operator)
5 | ![GitHub](https://img.shields.io/github/issues/canonical/alertmanager-k8s-operator)
6 | ![GitHub](https://img.shields.io/github/issues-pr/canonical/alertmanager-k8s-operator) ![GitHub](https://img.shields.io/github/contributors/canonical/alertmanager-k8s-operator) ![GitHub](https://img.shields.io/github/watchers/canonical/alertmanager-k8s-operator?style=social)
7 | 
8 | ## Overview
9 | 
10 | This document explains the processes and practices recommended for
11 | contributing enhancements or bug fixes to the Alertmanager Charmed Operator.
12 | 
13 | The intended use case of this operator is to be deployed as part of the
14 | [COS Lite] bundle, although that is not necessary.
15 | 
16 | 
17 | ## Setup
18 | 
19 | A typical setup using [snaps](https://snapcraft.io/) can be found in the
20 | [Juju docs](https://juju.is/docs/sdk/dev-setup).
21 | 
22 | 
23 | ## Developing
24 | 
25 | - Prior to getting started on a pull request, we encourage you to first open an
26 | issue explaining the use case or bug.
27 | This gives other contributors a chance to weigh in early in the process.
28 | - To author PRs you should be familiar with [juju](https://juju.is/#what-is-juju)
29 | and [how operators are written](https://juju.is/docs/sdk).
30 | - The best way to get a head start is to join the conversation on our
31 | [Mattermost channel] or [Discourse].
32 | - All enhancements require review before being merged. Besides the
33 | code quality and test coverage, the review will also take into
34 | account the resulting user experience for Juju administrators using
35 | this charm. To be able to merge, you will have to rebase
36 | onto the `main` branch. We do this to avoid merge commits and to have a
37 | linear Git history.
38 | - We use [`tox`](https://tox.wiki/en/latest/#) to manage all virtualenvs for
39 | the development lifecycle.
40 | 
41 | 
42 | ### Testing
43 | Unit tests are written with the Operator Framework [test harness] and
44 | integration tests are written using [pytest-operator] and [python-libjuju].
45 | 
46 | The default test environments - lint, static and unit - will run if you start
47 | `tox` without arguments.
48 | 
49 | You can also manually run a specific test environment:
50 | 
51 | ```shell
52 | tox -e fmt              # update your code according to linting rules
53 | tox -e lint             # code style
54 | tox -e static           # static analysis
55 | tox -e unit             # unit tests
56 | tox -e integration      # integration tests
57 | tox -e integration-lma  # integration tests for the lma-light bundle
58 | ```
59 | 
60 | `tox` creates a virtual environment for every tox environment defined in
61 | [tox.ini](tox.ini). To activate a tox environment for manual testing,
62 | 
63 | ```shell
64 | source .tox/unit/bin/activate
65 | ```
66 | 
67 | 
68 | #### Manual testing
69 | Alerts can be created using
70 | [`amtool`](https://manpages.debian.org/testing/prometheus-alertmanager/amtool.1.en.html),
71 | 
72 | ```shell
73 | amtool alert add alertname=oops service="my-service" severity=warning \
74 | instance="oops.example.net" --annotation=summary="High latency is high!" \
75 | --generator-url="http://prometheus.int.example.net"
76 | ```
77 | 
78 | or using [Alertmanager's HTTP API][Alertmanager API browser],
79 | [for example](https://gist.github.com/cherti/61ec48deaaab7d288c9fcf17e700853a):
80 | 
81 | ```shell
82 | alertmanager_ip=$(juju status alertmanager/0 --format=json | \
83 | jq -r ".applications.alertmanager.units.\"alertmanager/0\".address")
84 | name="oops"  # alert name used in the payload below
85 | curl -XPOST http://$alertmanager_ip:9093/api/v1/alerts -d "[{
86 | \"status\": \"firing\",
87 | \"labels\": {
88 | \"alertname\": \"$name\",
89 | \"service\": \"my-service\",
90 | \"severity\":\"warning\",
91 | \"instance\": \"$name.example.net\"
92 | },
93 | \"annotations\": {
94 | \"summary\": \"High latency is high!\"
95 | },
96 | \"generatorURL\": \"http://prometheus.int.example.net\"
97 | }]"
98 | ```
99 | 
100 | The alert should then be listed,
101 | 
102 | ```shell
103 | curl http://$alertmanager_ip:9093/api/v1/alerts
104 | ```
105 | 
106 | and visible on a karma dashboard, if configured.
107 | 
108 | Relations between alertmanager and prometheus can be verified by
109 | [querying prometheus](https://prometheus.io/docs/prometheus/latest/querying/api/#alertmanagers)
110 | for active alertmanagers:
111 | 
112 | ```shell
113 | curl -X GET "http://$prom_ip:9090/api/v1/alertmanagers"
114 | ```
115 | 
116 | ## Build charm
117 | 
118 | Build the charm in this git repository using
119 | 
120 | ```shell
121 | charmcraft pack
122 | ```
123 | 
124 | which will create a `*.charm` file you can deploy with:
125 | 
126 | ```shell
127 | juju deploy ./alertmanager-k8s.charm \
128 | --resource alertmanager-image=ubuntu/prometheus-alertmanager \
129 | --config config_file='@path/to/alertmanager.yml' \
130 | --config templates_file='@path/to/templates.tmpl'
131 | ```
132 | 
133 | 
134 | ## Code overview
135 | - The main charm class is `AlertmanagerCharm`, which responds to config changes
136 | (via `ConfigChangedEvent`) and cluster changes (via `RelationJoinedEvent`,
137 | `RelationChangedEvent` and `RelationDepartedEvent`).
138 | - All lifecycle events call a common hook, `_common_exit_hook`, after executing
139 | their own business logic. This pattern simplifies state tracking and improves
140 | consistency.
141 | - On startup, the charm waits for `PebbleReadyEvent` and for an IP address to
142 | become available before starting the alertmanager service and declaring
143 | `ActiveStatus`. If the provided configuration is invalid, the charm goes into
144 | blocked state.
145 | 
146 | ## Design choices
147 | - The `alertmanager.yml` config file is created in its entirety by the charm
148 | code on startup (the default `alertmanager.yml` is overwritten). This is done
149 | to maintain consistency across OCI images.
150 | - Hot reload via the alertmanager HTTP API is used whenever possible instead of
151 | service restart, to minimize downtime.
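
For reference, Alertmanager's management API also exposes a reload endpoint, so a configuration
reload can be triggered by hand while experimenting. This is only a manual-testing aid (a sketch,
assuming `$alertmanager_ip` is set as in the examples above):

```shell
# Ask the running Alertmanager instance to re-read its configuration file
curl -XPOST http://$alertmanager_ip:9093/-/reload
```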
152 | 153 | 154 | [Alertmanager API browser]: https://petstore.swagger.io/?url=https://raw.githubusercontent.com/prometheus/alertmanager/main/api/v2/openapi.yaml 155 | [gh:Prometheus operator]: https://github.com/canonical/prometheus-operator 156 | [Prometheus operator]: https://charmhub.io/prometheus-k8s 157 | [COS Lite]: https://charmhub.io/cos-lite 158 | [Mattermost channel]: https://chat.charmhub.io/charmhub/channels/observability 159 | [Discourse]: https://discourse.charmhub.io/tag/alertmanager 160 | [test harness]: https://ops.readthedocs.io/en/latest/#module-ops.testing 161 | [pytest-operator]: https://github.com/charmed-kubernetes/pytest-operator/blob/main/docs/reference.md 162 | [python-libjuju]: https://pythonlibjuju.readthedocs.io/en/latest/ 163 | -------------------------------------------------------------------------------- /tests/unit/test_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | import textwrap 6 | import unittest 7 | 8 | import ops 9 | from charms.alertmanager_k8s.v1.alertmanager_dispatch import AlertmanagerConsumer 10 | from ops.charm import CharmBase 11 | from ops.framework import StoredState 12 | from ops.testing import Harness 13 | 14 | ops.testing.SIMULATE_CAN_CONNECT = True # pyright: ignore 15 | 16 | 17 | class SampleConsumerCharm(CharmBase): 18 | """Mimic bare functionality of AlertmanagerCharm needed to test the consumer.""" 19 | 20 | # define custom metadata - without this the harness would parse the metadata.yaml in this repo, 21 | # which would result in expressions like self.harness.model.app.name to return 22 | # "alertmanager-k8s", which is not what we want in a consumer test 23 | metadata_yaml = textwrap.dedent( 24 | """ 25 | name: SampleConsumerCharm 26 | containers: 27 | consumer-charm: 28 | resource: consumer-charm-image 29 | resources: 30 | consumer-charm-image: 31 | type: oci-image 32 | requires: 33 | alerting: 34 | interface: alertmanager_dispatch 35 | peers: 36 | replicas: 37 | interface: consumer_charm_replica 38 | """ 39 | ) 40 | _stored = StoredState() 41 | 42 | def __init__(self, *args, **kwargs): 43 | super().__init__(*args) 44 | # relation name must match metadata 45 | self.alertmanager_lib = AlertmanagerConsumer(self, relation_name="alerting") 46 | 47 | self.framework.observe( 48 | self.alertmanager_lib.on.cluster_changed, self._on_alertmanager_cluster_changed 49 | ) 50 | 51 | self._stored.set_default(alertmanagers=[], cluster_changed_emitted=0) 52 | 53 | def _on_alertmanager_cluster_changed(self, _): 54 | self._stored.cluster_changed_emitted += 1 55 | self._stored.alertmanagers = self.alertmanager_lib.get_cluster_info() 56 | 57 | 58 | class TestConsumer(unittest.TestCase): 59 | def setUp(self): 60 | self.harness = Harness(SampleConsumerCharm, meta=SampleConsumerCharm.metadata_yaml) 61 | self.addCleanup(self.harness.cleanup) 62 | self.harness.set_leader(True) 63 | self.harness.begin_with_initial_hooks() 64 | 65 | def _relate_to_alertmanager(self) -> int: 66 | """Create relation between 'this app' and a hypothetical (remote) alertmanager.""" 67 | rel_id = self.harness.add_relation(relation_name="alerting", remote_app="am") 68 | return rel_id 69 | 70 | def _add_alertmanager_units(self, rel_id: int, num_units: int, start_with=0): 71 | for i in range(start_with, start_with + num_units): 72 | remote_unit_name = f"am/{i}" 73 | self.harness.add_relation_unit(rel_id, remote_unit_name) 
74 | self.harness.update_relation_data( 75 | rel_id, remote_unit_name, {"public_address": f"10.20.30.{i}"} 76 | ) 77 | 78 | return rel_id 79 | 80 | def test_cluster_updated_after_alertmanager_units_join(self): 81 | # before 82 | self.assertEqual(set(), self.harness.charm.alertmanager_lib.get_cluster_info()) 83 | num_events = self.harness.charm._stored.cluster_changed_emitted 84 | 85 | # add relation 86 | rel_id = self._relate_to_alertmanager() 87 | self._add_alertmanager_units(rel_id, num_units=2) 88 | 89 | # after 90 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 91 | self.assertSetEqual( 92 | {"http://10.20.30.0", "http://10.20.30.1"}, 93 | self.harness.charm.alertmanager_lib.get_cluster_info(), 94 | ) 95 | 96 | num_events = self.harness.charm._stored.cluster_changed_emitted 97 | 98 | # add another unit 99 | self._add_alertmanager_units(rel_id, num_units=1, start_with=2) 100 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 101 | self.assertSetEqual( 102 | {"http://10.20.30.0", "http://10.20.30.1", "http://10.20.30.2"}, 103 | self.harness.charm.alertmanager_lib.get_cluster_info(), 104 | ) 105 | 106 | def test_cluster_updated_after_alertmanager_unit_leaves(self): 107 | num_events = self.harness.charm._stored.cluster_changed_emitted 108 | 109 | # add relation 110 | rel_id = self._relate_to_alertmanager() 111 | self._add_alertmanager_units(rel_id, num_units=4) 112 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 113 | before = self.harness.charm.alertmanager_lib.get_cluster_info() 114 | self.assertEqual(len(before), 4) 115 | 116 | num_events = self.harness.charm._stored.cluster_changed_emitted 117 | 118 | # remove alertmanager units 119 | self.harness.remove_relation_unit(rel_id, "am/3") 120 | self.harness.remove_relation_unit(rel_id, "am/2") 121 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 122 | after = self.harness.charm.alertmanager_lib.get_cluster_info() 123 | self.assertSetEqual(after, {"http://10.20.30.0", "http://10.20.30.1"}) 124 | 125 | num_events = self.harness.charm._stored.cluster_changed_emitted 126 | 127 | # remove all remaining units 128 | self.harness.remove_relation_unit(rel_id, "am/1") 129 | self.harness.remove_relation_unit(rel_id, "am/0") 130 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 131 | after = self.harness.charm.alertmanager_lib.get_cluster_info() 132 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 133 | self.assertSetEqual(after, set()) 134 | 135 | def test_cluster_is_empty_after_relation_breaks(self): 136 | # add relation 137 | rel_id = self._relate_to_alertmanager() 138 | self._add_alertmanager_units(rel_id, num_units=4) 139 | before = self.harness.charm.alertmanager_lib.get_cluster_info() 140 | self.assertEqual(len(before), 4) 141 | 142 | num_events = self.harness.charm._stored.cluster_changed_emitted 143 | 144 | # remove relation 145 | self.harness.remove_relation(rel_id) 146 | after = self.harness.charm.alertmanager_lib.get_cluster_info() 147 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 148 | self.assertSetEqual(set(), after) 149 | 150 | def test_relation_changed(self): 151 | # add relation 152 | rel_id = self._relate_to_alertmanager() 153 | self._add_alertmanager_units(rel_id, num_units=2) 154 | 155 | # update remote unit's relation data (emulates upgrade-charm) 156 | 
self.harness.update_relation_data(rel_id, "am/1", {"public_address": "90.80.70.60"}) 157 | self.assertSetEqual( 158 | {"http://10.20.30.0", "http://90.80.70.60"}, 159 | self.harness.charm.alertmanager_lib.get_cluster_info(), 160 | ) 161 | -------------------------------------------------------------------------------- /tests/unit/test_remote_configuration_provider.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | import json 5 | import logging 6 | import unittest 7 | from unittest.mock import PropertyMock, patch 8 | 9 | import yaml 10 | from charms.alertmanager_k8s.v0.alertmanager_remote_configuration import ( 11 | DEFAULT_RELATION_NAME, 12 | ConfigReadError, 13 | RemoteConfigurationProvider, 14 | ) 15 | from ops import testing 16 | from ops.charm import CharmBase, CharmEvents 17 | from ops.framework import EventBase, EventSource, StoredState 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | testing.SIMULATE_CAN_CONNECT = True # pyright: ignore 22 | 23 | TEST_APP_NAME = "provider-tester" 24 | METADATA = f""" 25 | name: {TEST_APP_NAME} 26 | provides: 27 | {DEFAULT_RELATION_NAME}: 28 | interface: alertmanager_remote_configuration 29 | """ 30 | TEST_ALERTMANAGER_CONFIG_WITHOUT_TEMPLATES_FILE_PATH = "./tests/unit/test_config/alertmanager.yml" 31 | TEST_ALERTMANAGER_CONFIG_WITH_TEMPLATES_FILE_PATH = ( 32 | "./tests/unit/test_config/alertmanager_with_templates.yml" 33 | ) 34 | TEST_ALERTMANAGER_INVALID_CONFIG_FILE_PATH = "./tests/unit/test_config/alertmanager_invalid.yml" 35 | TEST_ALERTMANAGER_TEMPLATES_FILE_PATH = "./tests/unit/test_config/test_templates.tmpl" 36 | TESTER_CHARM = "test_remote_configuration_provider.RemoteConfigurationProviderCharm" 37 | 38 | 39 | class AlertmanagerConfigFileChangedEvent(EventBase): 40 | pass 41 | 42 | 43 | class AlertmanagerConfigFileChangedCharmEvents(CharmEvents): 44 | alertmanager_config_file_changed = EventSource(AlertmanagerConfigFileChangedEvent) 45 | 46 | 47 | class RemoteConfigurationProviderCharm(CharmBase): 48 | ALERTMANAGER_CONFIG_FILE = TEST_ALERTMANAGER_CONFIG_WITHOUT_TEMPLATES_FILE_PATH 49 | 50 | on = AlertmanagerConfigFileChangedCharmEvents() # pyright: ignore 51 | _stored = StoredState() 52 | 53 | def __init__(self, *args): 54 | super().__init__(*args) 55 | self._stored.set_default(configuration_broken_emitted=0) 56 | 57 | alertmanager_config = RemoteConfigurationProvider.load_config_file( 58 | self.ALERTMANAGER_CONFIG_FILE 59 | ) 60 | self.remote_configuration_provider = RemoteConfigurationProvider( 61 | charm=self, 62 | alertmanager_config=alertmanager_config, 63 | relation_name=DEFAULT_RELATION_NAME, 64 | ) 65 | 66 | self.framework.observe(self.on.alertmanager_config_file_changed, self._update_config) 67 | self.framework.observe( 68 | self.remote_configuration_provider.on.configuration_broken, 69 | self._on_configuration_broken, 70 | ) 71 | 72 | def _update_config(self, _): 73 | try: 74 | alertmanager_config = RemoteConfigurationProvider.load_config_file( 75 | self.ALERTMANAGER_CONFIG_FILE 76 | ) 77 | self.remote_configuration_provider.update_relation_data_bag(alertmanager_config) 78 | except ConfigReadError: 79 | logger.warning("Error reading Alertmanager config file.") 80 | 81 | def _on_configuration_broken(self, _): 82 | self._stored.configuration_broken_emitted += 1 83 | 84 | 85 | class TestAlertmanagerRemoteConfigurationProvider(unittest.TestCase): 86 | def setUp(self) -> None: 87 | self.harness = 
testing.Harness(RemoteConfigurationProviderCharm, meta=METADATA) 88 | self.addCleanup(self.harness.cleanup) 89 | self.harness.set_leader(True) 90 | self.harness.begin_with_initial_hooks() 91 | 92 | def test_config_without_templates_updates_only_alertmanager_config_in_the_data_bag(self): 93 | with open(TEST_ALERTMANAGER_CONFIG_WITHOUT_TEMPLATES_FILE_PATH, "r") as config_yaml: 94 | expected_config = yaml.safe_load(config_yaml) 95 | 96 | relation_id = self.harness.add_relation(DEFAULT_RELATION_NAME, "requirer") 97 | self.harness.add_relation_unit(relation_id, "requirer/0") 98 | 99 | self.assertEqual( 100 | json.loads( 101 | self.harness.get_relation_data(relation_id, TEST_APP_NAME)["alertmanager_config"] 102 | ), 103 | expected_config, 104 | ) 105 | self.assertEqual( 106 | json.loads( 107 | self.harness.get_relation_data(relation_id, TEST_APP_NAME)[ 108 | "alertmanager_templates" 109 | ] 110 | ), 111 | [], 112 | ) 113 | 114 | @patch(f"{TESTER_CHARM}.ALERTMANAGER_CONFIG_FILE", new_callable=PropertyMock) 115 | def test_config_with_templates_updates_both_alertmanager_config_and_alertmanager_templates_in_the_data_bag( # noqa: E501 116 | self, patched_alertmanager_config_file 117 | ): 118 | patched_alertmanager_config_file.return_value = ( 119 | TEST_ALERTMANAGER_CONFIG_WITH_TEMPLATES_FILE_PATH 120 | ) 121 | with open(TEST_ALERTMANAGER_TEMPLATES_FILE_PATH, "r") as templates_file: 122 | expected_templates = templates_file.readlines() 123 | relation_id = self.harness.add_relation(DEFAULT_RELATION_NAME, "requirer") 124 | self.harness.add_relation_unit(relation_id, "requirer/0") 125 | 126 | self.harness.charm.on.alertmanager_config_file_changed.emit() 127 | 128 | self.assertEqual( 129 | json.loads( 130 | self.harness.get_relation_data(relation_id, TEST_APP_NAME)[ 131 | "alertmanager_templates" 132 | ] 133 | ), 134 | expected_templates, 135 | ) 136 | 137 | @patch(f"{TESTER_CHARM}.ALERTMANAGER_CONFIG_FILE", new_callable=PropertyMock) 138 | def test_invalid_config_emits_remote_configuration_broken_event( 139 | self, patched_alertmanager_config_file 140 | ): 141 | num_events = self.harness.charm._stored.configuration_broken_emitted 142 | patched_alertmanager_config_file.return_value = TEST_ALERTMANAGER_INVALID_CONFIG_FILE_PATH 143 | relation_id = self.harness.add_relation(DEFAULT_RELATION_NAME, "requirer") 144 | self.harness.add_relation_unit(relation_id, "requirer/0") 145 | 146 | self.harness.charm.on.alertmanager_config_file_changed.emit() 147 | 148 | self.assertGreater( 149 | self.harness.charm._stored.configuration_broken_emitted, 150 | num_events, 151 | ) 152 | 153 | @patch(f"{TESTER_CHARM}.ALERTMANAGER_CONFIG_FILE", new_callable=PropertyMock) 154 | def test_invalid_config_clears_relation_data_bag(self, patched_alertmanager_config_file): 155 | patched_alertmanager_config_file.return_value = TEST_ALERTMANAGER_INVALID_CONFIG_FILE_PATH 156 | relation_id = self.harness.add_relation(DEFAULT_RELATION_NAME, "requirer") 157 | self.harness.add_relation_unit(relation_id, "requirer/0") 158 | 159 | self.harness.charm.on.alertmanager_config_file_changed.emit() 160 | 161 | with self.assertRaises(KeyError): 162 | _ = self.harness.get_relation_data(relation_id, TEST_APP_NAME)["alertmanager_config"] 163 | 164 | @patch(f"{TESTER_CHARM}.ALERTMANAGER_CONFIG_FILE", new_callable=PropertyMock) 165 | def test_empty_config_file_clears_relation_data_bag(self, patched_alertmanager_config_file): 166 | test_config_file = "./tests/unit/test_config/alertmanager_empty.yml" 167 | patched_alertmanager_config_file.return_value = 
test_config_file 168 | relation_id = self.harness.add_relation(DEFAULT_RELATION_NAME, "requirer") 169 | self.harness.add_relation_unit(relation_id, "requirer/0") 170 | 171 | self.harness.charm.on.alertmanager_config_file_changed.emit() 172 | 173 | with self.assertRaises(KeyError): 174 | _ = self.harness.get_relation_data(relation_id, TEST_APP_NAME)["alertmanager_config"] 175 | -------------------------------------------------------------------------------- /tests/integration/helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | """Helper functions for writing tests.""" 5 | 6 | import asyncio 7 | import grp 8 | import json 9 | import logging 10 | import urllib.request 11 | from typing import Dict, Optional, Tuple 12 | from urllib.parse import urlparse 13 | 14 | import requests 15 | from juju.unit import Unit 16 | from pytest_operator.plugin import OpsTest 17 | from requests.auth import HTTPBasicAuth 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | async def get_unit_address(ops_test: OpsTest, app_name: str, unit_num: int) -> str: 23 | """Get private address of a unit.""" 24 | assert ops_test.model 25 | status = await ops_test.model.get_status() # noqa: F821 26 | return status["applications"][app_name]["units"][f"{app_name}/{unit_num}"]["address"] 27 | 28 | 29 | def interleave(l1: list, l2: list) -> list: 30 | """Interleave two lists. 31 | 32 | >>> interleave([1,2,3], ['a', 'b', 'c']) 33 | [1, 'a', 2, 'b', 3, 'c'] 34 | 35 | Reference: https://stackoverflow.com/a/11125298/3516684 36 | """ 37 | return [x for t in zip(l1, l2) for x in t] 38 | 39 | 40 | async def cli_upgrade_from_path_and_wait( 41 | ops_test: OpsTest, 42 | path: str, 43 | alias: str, 44 | resources: Optional[Dict[str, str]] = None, 45 | wait_for_status: Optional[str] = None, 46 | ): 47 | assert ops_test.model 48 | if resources is None: 49 | resources = {} 50 | 51 | resource_pairs = [f"{k}={v}" for k, v in resources.items()] 52 | resource_arg_prefixes = ["--resource"] * len(resource_pairs) 53 | resource_args = interleave(resource_arg_prefixes, resource_pairs) 54 | 55 | cmd = [ 56 | "juju", 57 | "refresh", 58 | "--path", 59 | path, 60 | alias, 61 | *resource_args, 62 | ] 63 | 64 | retcode, stdout, stderr = await ops_test.run(*cmd) 65 | assert retcode == 0, f"Upgrade failed: {(stderr or stdout).strip()}" 66 | logger.info(stdout) 67 | await ops_test.model.wait_for_idle(apps=[alias], status=wait_for_status, timeout=120) 68 | 69 | 70 | async def get_leader_unit_num(ops_test: OpsTest, app_name: str): 71 | assert ops_test.model 72 | application = ops_test.model.applications[app_name] 73 | assert application 74 | units = application.units 75 | is_leader = [await units[i].is_leader_from_status() for i in range(len(units))] 76 | logger.info("Leaders: %s", is_leader) 77 | return is_leader.index(True) 78 | 79 | 80 | async def is_leader_elected(ops_test: OpsTest, app_name: str): 81 | assert ops_test.model 82 | application = ops_test.model.applications[app_name] 83 | assert application 84 | units = application.units 85 | return any([await units[i].is_leader_from_status() for i in range(len(units))]) 86 | 87 | 88 | async def block_until_leader_elected(ops_test: OpsTest, app_name: str): 89 | # await ops_test.model.block_until(is_leader_elected) 90 | # block_until does not take async (yet?) 
https://github.com/juju/python-libjuju/issues/609 91 | while not await is_leader_elected(ops_test, app_name): 92 | await asyncio.sleep(5) 93 | 94 | 95 | def uk8s_group() -> str: 96 | try: 97 | # Classically confined microk8s 98 | uk8s_group = grp.getgrnam("microk8s").gr_name 99 | except KeyError: 100 | # Strictly confined microk8s 101 | uk8s_group = "snap_microk8s" 102 | return uk8s_group 103 | 104 | 105 | async def is_alertmanage_unit_up(ops_test: OpsTest, app_name: str, unit_num: int): 106 | address = await get_unit_address(ops_test, app_name, unit_num) 107 | url = f"http://{address}:9093" 108 | logger.info("am public address: %s", url) 109 | 110 | response = urllib.request.urlopen(f"{url}/api/v2/status", data=None, timeout=2.0) 111 | return response.code == 200 and "versionInfo" in json.loads(response.read()) 112 | 113 | 114 | async def is_alertmanager_up(ops_test: OpsTest, app_name: str): 115 | assert ops_test.model 116 | application = ops_test.model.applications[app_name] 117 | assert application 118 | return all( 119 | [ 120 | await is_alertmanage_unit_up(ops_test, app_name, unit_num) 121 | for unit_num in range(len(application.units)) 122 | ] 123 | ) 124 | 125 | 126 | async def get_alertmanager_config_from_file( 127 | ops_test: OpsTest, app_name: str, container_name: str, config_file_path: str 128 | ) -> Tuple[Optional[int], str, str]: 129 | rc, stdout, stderr = await ops_test.juju( 130 | "ssh", "--container", f"{container_name}", f"{app_name}/0", "cat", f"{config_file_path}" 131 | ) 132 | return rc, stdout, stderr 133 | 134 | 135 | async def deploy_literal_bundle(ops_test: OpsTest, bundle: str): 136 | run_args = [ 137 | "juju", 138 | "deploy", 139 | "--trust", 140 | "-m", 141 | ops_test.model_name, 142 | str(ops_test.render_bundle(bundle)), 143 | ] 144 | 145 | retcode, stdout, stderr = await ops_test.run(*run_args) 146 | assert retcode == 0, f"Deploy failed: {(stderr or stdout).strip()}" 147 | logger.info(stdout) 148 | 149 | 150 | async def curl(ops_test: OpsTest, *, cert_dir: str, cert_path: str, ip_addr: str, mock_url: str): 151 | p = urlparse(mock_url) 152 | 153 | # Tell curl to resolve the mock url as traefik's IP (to avoid using a custom DNS 154 | # server). This is needed because the certificate issued by the CA would have that same 155 | # hostname as the subject, and for TLS to succeed, the target url's hostname must match 156 | # the one in the certificate. 157 | cmd = [ 158 | "curl", 159 | "-s", 160 | "--fail-with-body", 161 | "--resolve", 162 | f"{p.hostname}:{p.port or 443}:{ip_addr}", 163 | "--capath", 164 | str(cert_dir), 165 | "--cacert", 166 | str(cert_path), 167 | mock_url, 168 | ] 169 | logger.info("cURL command: '%s'", " ".join(cmd)) 170 | rc, stdout, stderr = await ops_test.run(*cmd) 171 | logger.info("%s: %s", mock_url, (rc, stdout, stderr)) 172 | assert rc == 0, ( 173 | f"curl exited with rc={rc} for {mock_url}; " 174 | "non-zero return code means curl encountered a >= 400 HTTP code" 175 | ) 176 | return stdout 177 | 178 | async def grafana_password(ops_test: OpsTest, app_name: str) -> str: 179 | """Get the admin password. Memoize it to reduce turnaround time. 
180 | 181 | Args: 182 | ops_test: pytest-operator plugin 183 | app_name: string name of application 184 | 185 | Returns: 186 | admin password as a string 187 | """ 188 | leader: Optional[Unit] = None 189 | for unit in ops_test.model.applications[app_name].units: # type: ignore 190 | is_leader = await unit.is_leader_from_status() 191 | if is_leader: 192 | leader = unit 193 | break 194 | 195 | assert leader 196 | action = await leader.run_action("get-admin-password") 197 | action = await action.wait() 198 | return action.results["admin-password"] 199 | 200 | async def grafana_datasources(ops_test: OpsTest, app_name: str) -> "list[dict]": 201 | """Get the datasources configured in Grafana. 202 | 203 | A sample response from Grafana's /api/datasources endpoint is a list of datasources, similar to below. 204 | 205 | [{"id":1,"uid":"ABC","orgId":1,"name":"", 206 | "type":"alertmanager","typeName":"Alertmanager", 207 | "typeLogoUrl":"public/app/plugins/datasource/alertmanager/img/logo.svg","access":"proxy", 208 | "url":"","user":"","database":"","basicAuth":false,"isDefault":false, 209 | "jsonData":{"implementation":"prometheus","timeout":300},"readOnly":true}}, ...] 210 | 211 | Args: 212 | ops_test: pytest-operator plugin 213 | app_name: string name of application 214 | Returns: 215 | number of datasources as an integer 216 | """ 217 | address = await get_unit_address(ops_test, app_name, 0) 218 | url = f"http://{address}:3000/api/datasources" 219 | 220 | admin_password = await grafana_password(ops_test, app_name) 221 | response = requests.get( 222 | url, 223 | auth=HTTPBasicAuth("admin", admin_password), 224 | ) 225 | response.raise_for_status() 226 | datasources = response.json() 227 | return datasources 228 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Alertmanager Operator (k8s) 2 | [![Charmhub Badge](https://charmhub.io/alertmanager-k8s/badge.svg)](https://charmhub.io/alertmanager-k8s) 3 | [![Release](https://github.com/canonical/alertmanager-k8s-operator/actions/workflows/release.yaml/badge.svg)](https://github.com/canonical/alertmanager-k8s-operator/actions/workflows/release.yaml) 4 | [![Discourse Status](https://img.shields.io/discourse/status?server=https%3A%2F%2Fdiscourse.charmhub.io&style=flat&label=CharmHub%20Discourse)](https://discourse.charmhub.io) 5 | 6 | [Charmed Alertmanager (alertmanager-k8s)][Alertmanager operator] is a charm for 7 | [Alertmanager]. 8 | 9 | The charm imposes configurable resource limits on the workload, can be readily 10 | integrated with alert sources such as [prometheus][Prometheus operator] or 11 | [loki][Loki operator], and comes with built-in alert rules and dashboards for 12 | self-monitoring. 13 | 14 | It is an essential part of the [COS Lite bundle]. 
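
For example, once an alert source such as Prometheus is running in the same model, hooking it up
to Alertmanager is a single Juju command (a sketch; the `prometheus-k8s:alertmanager` endpoint
name is assumed from that charm and may vary between revisions):

```shell
juju integrate alertmanager-k8s:alerting prometheus-k8s:alertmanager
```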
15 | 16 | 17 | [Alertmanager]: https://prometheus.io/docs/alerting/latest/alertmanager/ 18 | [COS Lite bundle]: https://charmhub.io/cos-lite 19 | [Loki operator]: https://charmhub.io/loki-k8s 20 | [Prometheus operator]: https://charmhub.io/prometheus-k8s 21 | [Alertmanager operator]: https://charmhub.io/alertmanager-k8s 22 | 23 | 24 | ## Getting started 25 | 26 | ### Basic deployment 27 | 28 | Once you have a controller and model ready, you can deploy alertmanager 29 | using the Juju CLI: 30 | 31 | ```shell 32 | juju deploy --channel=beta alertmanager-k8s 33 | ``` 34 | 35 | The available [channels](https://snapcraft.io/docs/channels) are listed at the top 36 | of [the page](https://charmhub.io/alertmanager-k8s) and can also be retrieved with 37 | Charmcraft CLI: 38 | 39 | ```shell 40 | $ charmcraft status alertmanager-k8s 41 | 42 | Track Base Channel Version Revision Resources 43 | latest ubuntu 20.04 (amd64) stable - - - 44 | candidate - - - 45 | beta 9 9 alertmanager-image (r1) 46 | edge 9 9 alertmanager-image (r1) 47 | ``` 48 | 49 | Once the Charmed Operator is deployed, the status can be checked by running: 50 | 51 | ```shell 52 | juju status --relations --storage --color 53 | ``` 54 | 55 | 56 | ### Configuration 57 | 58 | In order to have alerts dispatched to your receiver(s) of choice, 59 | a [configuration file](https://www.prometheus.io/docs/alerting/latest/configuration/) 60 | must be provided to Alertmanager using the 61 | [`config_file`](https://charmhub.io/alertmanager-k8s/configure#config_file) option: 62 | 63 | ```shell 64 | juju config alertmanager-k8s \ 65 | config_file='@path/to/alertmanager.yml' 66 | ``` 67 | 68 | Note that if you use templates, you should use the `templates_file` config option 69 | instead of having a `templates` section in your `yaml` configuration file. 70 | (This is a slight deviation from the official alertmanager config spec.) 71 | 72 | 73 | Use the [`templates_file`](https://charmhub.io/alertmanager-k8s/configure#templates_file) 74 | option to push templates that are being used by the configuration file: 75 | 76 | ```shell 77 | juju config alertmanager-k8s \ 78 | config_file='@path/to/alertmanager.yml' \ 79 | templates_file='@path/to/templates.tmpl' 80 | ``` 81 | 82 | All templates need to go into this single config option, instead of 83 | the 'templates' section of the main configuration file. The templates will be 84 | pushed to the workload container, and the configuration file will be updated 85 | accordingly. 86 | 87 | Refer to the 88 | [official templates documentation](https://prometheus.io/docs/alerting/latest/notification_examples/) 89 | for more details. 90 | 91 | 92 | To verify Alertmanager is using the expected configuration you can use the 93 | [`show-config`](https://charmhub.io/alertmanager-k8s/actions#show-config) action: 94 | 95 | ```shell 96 | juju run-action alertmanager-k8s/0 show-config --wait 97 | ``` 98 | 99 | 100 | ### Dashboard and HTTP API 101 | 102 | The Alertmanager dashboard and 103 | [HTTP API](https://www.prometheus.io/docs/alerting/latest/management_api/) 104 | can be accessed at the default port (9093) on the Alertmanager IP address, 105 | which is determinable with a `juju status` command. 
106 | 107 | To obtain the load-balanaced application IP, 108 | 109 | ```shell 110 | juju status alertmanager-k8s --format=json \ 111 | | jq -r '.applications."alertmanager-k8s".address' 112 | ``` 113 | 114 | Similarly, to obtain an individual unit's IP address: 115 | 116 | ```shell 117 | juju status alertmanager-k8s --format=json \ 118 | | jq -r '.applications."alertmanager-k8s".units."alertmanager-k8s/0".address' 119 | ``` 120 | 121 | So, if you navigate to these IPs you will get the Alertmanager dashboard: 122 | 123 | ![alertmanager-ui](https://github.com/user-attachments/assets/9eb0c006-ef57-476e-9341-46e076b596c4) 124 | 125 | ## Clustering 126 | 127 | ### Forming a cluster 128 | 129 | Alertmanager [supports clustering](https://www.prometheus.io/docs/alerting/latest/alertmanager/#high-availability) 130 | and all you need to do to create/update a cluster is to rescale the application. This can be done in two ways. 131 | 132 | Let's say we have one alertmanager unit running and we want to scale the deployment to three units. 133 | 134 | With `juju add-unit` we can achieve that using the `--num-units` argument and the number of units we want to add: 135 | 136 | ```shell 137 | juju add-unit alertmanager-k8s --num-units 2 138 | ``` 139 | 140 | or using `juju scale-application` and the total number of units we want: 141 | 142 | ```shell 143 | juju scale-application alertmanager-k8s 3 144 | ``` 145 | 146 | Regardless of which of the two options you use, `juju status --relations --color` will show you the status of the cluster. 147 | 148 | 149 | Internally, HA is achieved by providing each Alertmanager instance at least one IP address of another instance. The cluster would then auto-update with subsequent changes to the units present. 150 | 151 | ### Verification 152 | 153 | 154 | 155 | #### Pebble plan 156 | Cluster information is passed to Alertmanager via [`--cluster.peer` command line arguments](https://github.com/prometheus/alertmanager#high-availability). This can be verified by looking at the current pebble plan: 157 | 158 | ```shell 159 | > juju exec --unit alertmanager-k8s/0 -- \ 160 | PEBBLE_SOCKET=/charm/containers/alertmanager/pebble.socket \ 161 | pebble plan 162 | 163 | services: 164 | alertmanager: 165 | summary: alertmanager service 166 | startup: enabled 167 | override: replace 168 | command: alertmanager --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/alertmanager --web.listen-address=:9093 --cluster.listen-address=0.0.0.0:9094 --cluster.peer=10.1.179.220:9094 --cluster.peer=10.1.179.221:9094 169 | ``` 170 | #### HTTP API 171 | To manually verify a cluster is indeed formed, you can query the alertmanager HTTP API directly: 172 | 173 | ```shell 174 | > curl -s $ALERTMANAGER_IP:9093/api/v1/status \ 175 | | jq '.data.clusterStatus.peers[].address' 176 | "10.1.179.220:9094" 177 | "10.1.179.221:9094" 178 | "10.1.179.217:9094" 179 | ``` 180 | 181 | 182 | ## OCI Images 183 | This charm is published on Charmhub with alertmanager images from 184 | [ubuntu/prometheus-alertmanager], however, it should also work with the 185 | official [quay.io/prometheus/alertmanager]. 186 | 187 | To try the charm with a different image you can use `juju refresh`. 
For example: 188 | 189 | ```shell 190 | juju refresh alertmanager-k8s \ 191 | --resource alertmanager-image=quay.io/prometheus/alertmanager 192 | ``` 193 | 194 | 195 | [ubuntu/prometheus-alertmanager]: https://hub.docker.com/r/ubuntu/prometheus-alertmanager 196 | [quay.io/prometheus/alertmanager]: https://quay.io/repository/prometheus/alertmanager?tab=tags 197 | 198 | 199 | ## Official alertmanager documentation 200 | 201 | For further details about Alertmanager configuration and usage, please refer to 202 | the [official Alertmanager documentation](https://www.prometheus.io/docs/alerting/latest/overview/). 203 | 204 | 205 | ## Additional Information 206 | - [Logging, Monitoring, and Alerting](https://discourse.ubuntu.com/t/logging-monitoring-and-alerting/19151) (LMA) - 207 | a tutorial for running Prometheus, Grafana and Alertmanager with LXD. 208 | - [Alertmanager README](https://github.com/prometheus/alertmanager) 209 | - [PromCon 2018: Life of an Alert](https://youtube.com/watch?v=PUdjca23Qa4) 210 | -------------------------------------------------------------------------------- /lib/charms/catalogue_k8s/v1/catalogue.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | """Charm for providing services catalogues to bundles or sets of charms. 5 | 6 | This charm library contains two classes (CatalogueProvider and CatalogueConsumer) that handle 7 | both sides of the `catalogue` relation interface. 8 | 9 | ### CatalogueConsumer 10 | 11 | The Consumer allows sending catalogue items to a Catalogue charm. 12 | 13 | Adding it to your charm is very simple: 14 | 15 | ``` 16 | from charms.catalogue_k8s.v1.catalogue import ( 17 | CatalogueConsumer, 18 | CatalogueItem, 19 | ) 20 | 21 | ... 22 | self.catalogue = CatalogueConsumer( 23 | charm=self, 24 | relation_name="catalogue", # optional 25 | item=CatalogueItem( 26 | name="myapp", 27 | url=myapp_url, 28 | icon="rainbow", 29 | description="This is a rainbow app!" 30 | ) 31 | ) 32 | ``` 33 | 34 | The relevant events listeners are already registered by the CatalogueConsumer object. 35 | 36 | ### CatalogueProvider 37 | 38 | The Provider helps you receive catalogue items from other charms to display them however you like. 39 | 40 | To implement this in your charm: 41 | 42 | ``` 43 | from charms.catalogue_k8s.v1.catalogue import CatalogueProvider 44 | 45 | ... 46 | self.catalogue = CatalogueProvider( 47 | charm=self, 48 | relation_name="catalogue", # optional 49 | ) 50 | ``` 51 | 52 | 53 | The relevant events listeners are already registered by the CatalogueProvider object. 54 | """ 55 | 56 | import ipaddress 57 | import json 58 | import logging 59 | from typing import Dict, Optional 60 | 61 | from ops.charm import CharmBase 62 | from ops.framework import EventBase, EventSource, Object, ObjectEvents 63 | 64 | LIBID = "fa28b361293b46668bcd1f209ada6983" 65 | LIBAPI = 1 66 | LIBPATCH = 3 67 | 68 | DEFAULT_RELATION_NAME = "catalogue" 69 | 70 | logger = logging.getLogger(__name__) 71 | 72 | 73 | class CatalogueItem: 74 | """`CatalogueItem` represents an application entry sent to a catalogue. 75 | 76 | icon (str): An Iconify Material Design Icon (MDI) string. 77 | (See: https://icon-sets.iconify.design/mdi for more details). 78 | api_docs (str): A URL to the docs relevant to this item (upstream or otherwise). 
79 | api_endpoints (dict): A dictionary containing API information, where: 80 | - The key is a description or name of the endpoint (e.g., "Alerts"). 81 | - The value is the actual address of the endpoint (e.g., "'http://1.2.3.4:1234/api/v1/targets/metadata'"). 82 | - Example for setting the api_endpoints attr: 83 | api_endpoints={"Alerts": f"{self.external_url}/api/v1/alerts"} 84 | """ 85 | 86 | def __init__(self, name: str, url: str, icon: str, description: str = "", api_docs: str = "", api_endpoints: Optional[Dict[str,str]] = None): 87 | self.name = name 88 | self.url = url 89 | self.icon = icon 90 | self.description = description 91 | self.api_docs = api_docs 92 | self.api_endpoints = api_endpoints 93 | 94 | 95 | class CatalogueConsumer(Object): 96 | """`CatalogueConsumer` is used to send over a `CatalogueItem`.""" 97 | 98 | def __init__( 99 | self, 100 | charm, 101 | relation_name: str = DEFAULT_RELATION_NAME, 102 | item: Optional[CatalogueItem] = None, 103 | ): 104 | super().__init__(charm, relation_name) 105 | self._charm = charm 106 | self._relation_name = relation_name 107 | self._item = item 108 | 109 | events = self._charm.on[self._relation_name] 110 | self.framework.observe(events.relation_joined, self._on_relation_changed) 111 | self.framework.observe(events.relation_broken, self._on_relation_changed) 112 | self.framework.observe(events.relation_changed, self._on_relation_changed) 113 | self.framework.observe(events.relation_departed, self._on_relation_changed) 114 | self.framework.observe(events.relation_created, self._on_relation_changed) 115 | 116 | def _on_relation_changed(self, _): 117 | self._update_relation_data() 118 | 119 | def _update_relation_data(self): 120 | if not self._charm.unit.is_leader(): 121 | return 122 | 123 | if not self._item: 124 | return 125 | 126 | for relation in self._charm.model.relations[self._relation_name]: 127 | relation.data[self._charm.model.app]["name"] = self._item.name 128 | relation.data[self._charm.model.app]["description"] = self._item.description 129 | relation.data[self._charm.model.app]["url"] = self.unit_address(relation) 130 | relation.data[self._charm.model.app]["icon"] = self._item.icon 131 | relation.data[self._charm.model.app]["api_docs"] = self._item.api_docs 132 | relation.data[self._charm.model.app]["api_endpoints"] = json.dumps(self._item.api_endpoints) 133 | 134 | def update_item(self, item: CatalogueItem): 135 | """Update the catalogue item.""" 136 | self._item = item 137 | self._update_relation_data() 138 | 139 | def unit_address(self, relation): 140 | """Return the unit address of the consumer, on which it is reachable. 141 | 142 | Requires ingress to be connected for it to be routable. 143 | """ 144 | if self._item and self._item.url: 145 | return self._item.url 146 | return "" 147 | 148 | def _is_valid_unit_address(self, address: str) -> bool: 149 | """Validate a unit address. 150 | 151 | At present only IP address validation is supported, but 152 | this may be extended to DNS addresses also, as needed. 
153 | 154 | Args: 155 | address: a string representing a unit address 156 | 157 | """ 158 | try: 159 | _ = ipaddress.ip_address(address) 160 | except ValueError: 161 | return False 162 | 163 | return True 164 | 165 | 166 | class CatalogueItemsChangedEvent(EventBase): 167 | """Event emitted when the catalogue entries change.""" 168 | 169 | def __init__(self, handle, items): 170 | super().__init__(handle) 171 | self.items = items 172 | 173 | def snapshot(self): 174 | """Save catalogue entries information.""" 175 | return {"items": self.items} 176 | 177 | def restore(self, snapshot): 178 | """Restore catalogue entries information.""" 179 | self.items = snapshot["items"] 180 | 181 | 182 | class CatalogueEvents(ObjectEvents): 183 | """Events raised by `CatalogueConsumer`.""" 184 | 185 | items_changed = EventSource(CatalogueItemsChangedEvent) 186 | 187 | 188 | class CatalogueProvider(Object): 189 | """`CatalogueProvider` is the side of the relation that serves the actual service catalogue.""" 190 | 191 | on = CatalogueEvents() # pyright: ignore 192 | 193 | def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): 194 | super().__init__(charm, relation_name) 195 | self._charm = charm 196 | self._relation_name = relation_name 197 | events = self._charm.on[self._relation_name] 198 | self.framework.observe(events.relation_changed, self._on_relation_changed) 199 | self.framework.observe(events.relation_joined, self._on_relation_changed) 200 | self.framework.observe(events.relation_departed, self._on_relation_changed) 201 | self.framework.observe(events.relation_broken, self._on_relation_broken) 202 | 203 | def _on_relation_broken(self, event): 204 | self.on.items_changed.emit(items=self.items) # pyright: ignore 205 | 206 | def _on_relation_changed(self, event): 207 | self.on.items_changed.emit(items=self.items) # pyright: ignore 208 | 209 | @property 210 | def items(self): 211 | """A list of apps sent over relation data.""" 212 | return [ 213 | { 214 | "name": relation.data[relation.app].get("name", ""), 215 | "url": relation.data[relation.app].get("url", ""), 216 | "icon": relation.data[relation.app].get("icon", ""), 217 | "description": relation.data[relation.app].get("description", ""), 218 | "api_docs": relation.data[relation.app].get("api_docs", ""), 219 | "api_endpoints": json.loads(relation.data[relation.app].get("api_endpoints", "{}")), 220 | } 221 | for relation in self._charm.model.relations[self._relation_name] 222 | if relation.app and relation.units 223 | ] 224 | -------------------------------------------------------------------------------- /charmcraft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | name: alertmanager-k8s 4 | type: charm 5 | summary: Alertmanager handles alerts sent by client applications. 6 | description: > 7 | Alertmanager handles alerts sent by client applications such as the Prometheus server. 8 | It takes care of deduplicating, grouping, and routing them to the correct receiver integrations 9 | such as email, PagerDuty, or OpsGenie. It also takes care of silencing and inhibition of alerts. 
10 | 11 | links: 12 | documentation: https://discourse.charmhub.io/t/alertmanager-k8s-docs-index/5788 13 | website: https://charmhub.io/alertmanager-k8s 14 | source: https://github.com/canonical/alertmanager-k8s-operator 15 | issues: https://github.com/canonical/alertmanager-k8s-operator/issues 16 | 17 | assumes: 18 | - k8s-api 19 | - juju >= 3.6 20 | 21 | platforms: 22 | ubuntu@24.04:amd64: 23 | 24 | parts: 25 | charm: 26 | source: . 27 | plugin: uv 28 | build-packages: [git] 29 | build-snaps: [astral-uv] 30 | override-build: | 31 | craftctl default 32 | git describe --always > $CRAFT_PART_INSTALL/version 33 | 34 | containers: 35 | alertmanager: # container key used by pebble 36 | resource: alertmanager-image 37 | mounts: 38 | - storage: data 39 | # nflogs and silences files go here. With a mounted storage for silences, they persist 40 | # across container restarts. 41 | # This path is passed to alertmanager via the `--storage.path` cli argument. 42 | location: /alertmanager 43 | 44 | storage: 45 | data: 46 | type: filesystem 47 | description: > 48 | Storage path passed to alertmanager via --storage.path argument and used for nflog and silences snapshot 49 | 50 | provides: 51 | alerting: 52 | # The provider (alertmanager) adds the following key-value pair to the relation data bag of 53 | # every alertmanager unit: 54 | # "public_address": : 55 | interface: alertmanager_dispatch 56 | optional: true 57 | description: | 58 | Integrates with other charms to send notifications when alert rules are triggered. 59 | karma-dashboard: 60 | interface: karma_dashboard 61 | optional: true 62 | description: | 63 | Links an entire Alertmanager cluster to a Karma[1] dashboard. 64 | Scaling alertmanager would automatically cause karma to group alerts by cluster. 65 | 66 | [1] https://charmhub.io/karma-k8s 67 | self-metrics-endpoint: 68 | interface: prometheus_scrape 69 | optional: true 70 | description: | 71 | Exposes the Prometheus metrics endpoint providing telemetry about the Alertmanager instance. 72 | grafana-dashboard: 73 | interface: grafana_dashboard 74 | optional: true 75 | description: | 76 | Forwards the built-in Grafana dashboard(s) for monitoring Alertmanager. 77 | grafana-source: 78 | interface: grafana_datasource 79 | optional: true 80 | description: | 81 | Configures Grafana to be able to use this Alertmanager instance as a datasource. 82 | provide-cmr-mesh: # server-side-for-cmr-mesh 83 | interface: cross_model_mesh 84 | description: | 85 | Allow cross-model applications to make HTTP requests to alertmanager via the service mesh. 86 | This relation provides additional data required by the service mesh to create cross-model authorization policies 87 | 88 | Announce a subset of juju topology to the other side because a CMR obfuscates identity. 89 | Each pair of charm would need a separate relation of this kind, e.g. otelcol to loki and to prom. 90 | 91 | To make use of this relation, you also must have either the service-mesh relation in place (e.g. istio-beacon) or 92 | have the istio-beacon enroll the entire model (via its config option). 93 | (The service_mesh charm library manages both of these relations.) 94 | 95 | requires: 96 | ingress: 97 | interface: ingress 98 | optional: true 99 | limit: 1 100 | description: | 101 | Alertmanager typically needs a "per app" ingress, which is available in the traefik charm[1]. 
102 | 103 | [1] https://charmhub.io/traefik-k8s 104 | remote-configuration: 105 | interface: alertmanager_remote_configuration 106 | optional: true 107 | limit: 1 108 | catalogue: 109 | interface: catalogue 110 | optional: true 111 | description: Add Alertmanager as an item to a Catalogue charm. 112 | certificates: 113 | interface: tls-certificates 114 | optional: true 115 | limit: 1 116 | description: | 117 | Certificate and key files for the alertmanager server to use to authenticate to client. 118 | tracing: 119 | interface: tracing 120 | optional: true 121 | limit: 1 122 | description: | 123 | Enables sending workload traces to a distributed tracing backend such as Tempo. 124 | service-mesh: 125 | limit: 1 126 | interface: service_mesh 127 | description: | 128 | Subscribe this charm into a service mesh and create authorization policies. 129 | We forward to the beacon our authorization policies. 130 | The beacon sends the pod and service labels required by this charm to join the mesh. 131 | This relation is a pre-requisite for using the provide-cmr-mesh relation. 132 | require-cmr-mesh: 133 | # TODO: remove this relation when this is fixed: 134 | # https://github.com/canonical/istio-beacon-k8s-operator/issues/91 135 | interface: cross_model_mesh 136 | description: | 137 | Allow a cross-model application access to alertmanager via the service mesh. 138 | This relation provides additional data required by the service mesh to enforce cross-model authorization policies. 139 | 140 | peers: 141 | replicas: 142 | interface: alertmanager_replica 143 | # assumed network type: private 144 | 145 | resources: 146 | alertmanager-image: 147 | type: oci-image 148 | description: | 149 | OCI image for alertmanager. This charms makes the following assumptions about the image: 150 | - location of executable "alertmanager" is in the path 151 | - has `update-ca-certificates` 152 | upstream-source: ubuntu/alertmanager@sha256:368985dfd680291f1888cc339afa7a097981ccb33b3398598e18f0dda2027573 # renovate: oci-image tag: 0.28.0-24.04 153 | 154 | config: 155 | options: 156 | config_file: 157 | type: string 158 | default: "" 159 | description: > 160 | Alertmanager configuration file (yaml), with the exclusion of the templates section. 161 | To send the contents of a file to this configuration option, the symbol `@` must be used. 162 | 163 | Usage: `juju config alertmanager config_file=@alertmanager.yaml` 164 | 165 | For more information on configuring the Alertmanager, refer to: 166 | https://www.prometheus.io/docs/alerting/latest/configuration/ 167 | templates_file: 168 | type: string 169 | default: "" 170 | description: > 171 | Alertmanager templates definition file. This is a slight deviation from the official 172 | alertmanager config spec. All templates need to go into this single config option, instead of 173 | the 'templates' section of the main configuration file. The templates will be pushed to the 174 | workload container, and the configuration file will be updated accordingly. Templates can't 175 | be used without `config_file`. 176 | Refer to https://prometheus.io/docs/alerting/latest/notification_examples/ for more details 177 | on templates. 178 | web_external_url: 179 | type: string 180 | default: "" 181 | description: | 182 | DEPRECATED. This config option is no longer used, in favor of "skipPrefix". 183 | 184 | The URL under which Alertmanager is externally reachable (for example, if 185 | Alertmanager is served via a manually configured ingress). 
186 | 187 | This config option is used for the `--web.external-url` alertmanager cli 188 | argument. If this charm config option is provided, it takes precedence over the 189 | URL provided over the "ingress" relation. 190 | 191 | Note: this config option shouldn't be included when you're using the "ingress" 192 | relation (e.g. traefik) - the charm will automatically assign an external url 193 | to `--web.external-url` when related to an ingress provider. 194 | 195 | This should be a complete URI, including scheme, or a fully qualified subpath 196 | starting with `/`. 197 | If Alertmanager is being served directly from the root of a fully-qualified 198 | host or a bare A record, this may be omitted. 199 | If the URL has a path portion, Alertmanager will use it to prefix all HTTP 200 | endpoints. 201 | cpu: 202 | description: | 203 | K8s cpu resource limit, e.g. "1" or "500m". Default is unset (no limit). This value is used 204 | for the "limits" portion of the resource requirements (the "requests" portion is 205 | automatically deduced from it). 206 | See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ 207 | type: string 208 | memory: 209 | description: | 210 | K8s memory resource limit, e.g. "1Gi". Default is unset (no limit). This value is used 211 | for the "limits" portion of the resource requirements (the "requests" portion is 212 | automatically deduced from it). 213 | See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ 214 | type: string 215 | 216 | actions: 217 | show-config: 218 | description: Show alertmanager config file. 219 | check-config: 220 | description: | 221 | Run `amtool` inside the workload to validate the configuration file, and 222 | return the resulting output. This can be useful for troubleshooting. 223 | 224 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /tests/unit/test_charm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | import unittest 5 | from unittest.mock import patch 6 | 7 | import ops 8 | import yaml 9 | from helpers import k8s_resource_multipatch 10 | from ops import pebble 11 | from ops.model import ActiveStatus, BlockedStatus 12 | from ops.testing import Harness 13 | 14 | from alertmanager import WorkloadManager 15 | from charm import AlertmanagerCharm 16 | 17 | ops.testing.SIMULATE_CAN_CONNECT = True # pyright: ignore 18 | 19 | 20 | class TestWithInitialHooks(unittest.TestCase): 21 | container_name: str = "alertmanager" 22 | 23 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 24 | @patch("socket.getfqdn", new=lambda *args: "fqdn") 25 | @k8s_resource_multipatch 26 | @patch("lightkube.core.client.GenericSyncClient") 27 | @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) 28 | def setUp(self, *unused): 29 | self.harness = Harness(AlertmanagerCharm) 30 | self.addCleanup(self.harness.cleanup) 31 | 32 | self.relation_id = self.harness.add_relation("alerting", "otherapp") 33 | self.harness.add_relation_unit(self.relation_id, "otherapp/0") 34 | self.harness.set_leader(True) 35 | 36 | self.harness.begin_with_initial_hooks() 37 | 38 | def test_num_peers(self): 39 | self.assertEqual(0, len(self.harness.charm.peer_relation.units)) # type: ignore 40 | 41 | @patch("socket.getfqdn", new=lambda *args: "fqdn") 42 | def test_pebble_layer_added(self, *unused): 43 | plan = self.harness.get_container_pebble_plan(self.container_name) 44 | 45 | # Check we've got the plan as expected 46 | self.assertIsNotNone(plan.services) 47 | self.assertIsNotNone(service := plan.services.get(self.harness.charm._service_name)) 48 | self.assertIsNotNone(command := service.command) # pyright: ignore 49 | 50 | # Check command is as expected 51 | self.assertEqual( 52 | plan.services, self.harness.charm.alertmanager_workload._alertmanager_layer().services 53 | ) 54 | 55 | # Check command contains key arguments 56 | self.assertIn("--config.file", command) 57 | self.assertIn("--storage.path", command) 58 | self.assertIn("--web.listen-address", command) 59 | self.assertIn("--cluster.listen-address", command) 60 | 61 | # Check the service was started 62 | service = self.harness.model.unit.get_container("alertmanager").get_service("alertmanager") 63 | self.assertTrue(service.is_running()) 64 | 65 | @patch("socket.getfqdn", new=lambda *args: "fqdn") 66 | def test_relation_data_provides_public_address(self): 67 | # to suppress mypy error: Item "None" of "Optional[Any]" has no attribute "get_relation" 68 | model = self.harness.charm.framework.model 69 | assert model is not None 70 | 71 | rel = model.get_relation("alerting", self.relation_id) 72 | assert rel is not None # for static checker 73 | expected_address = "fqdn:{}".format(self.harness.charm.api_port) 74 | expected_rel_data = { 75 | "url": "http://fqdn:9093", 76 | "public_address": expected_address, 77 | "scheme": "http", 78 | } 79 | self.assertEqual(expected_rel_data, rel.data[self.harness.charm.unit]) 80 | 81 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 82 | @k8s_resource_multipatch 83 | @patch.object(AlertmanagerCharm, "_update_ca_certs", lambda *a, **kw: None) 84 | def test_topology_added_if_user_provided_config_without_group_by(self, *unused): 85 | new_config = yaml.dump({"not a real config": "but good enough for testing"}) 86 | self.harness.update_config({"config_file": new_config}) 87 | updated_config = yaml.safe_load( 88 | 
self.harness.charm.container.pull(self.harness.charm._config_path) 89 | ) 90 | 91 | self.assertEqual(updated_config["not a real config"], "but good enough for testing") 92 | self.assertListEqual( 93 | sorted(updated_config["route"]["group_by"]), 94 | sorted(["juju_model", "juju_application", "juju_model_uuid"]), 95 | ) 96 | 97 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 98 | @k8s_resource_multipatch 99 | @patch.object(AlertmanagerCharm, "_update_ca_certs", lambda *a, **kw: None) 100 | def test_topology_added_if_user_provided_config_with_group_by(self, *unused): 101 | new_config = yaml.dump({"route": {"group_by": ["alertname", "juju_model"]}}) 102 | self.harness.update_config({"config_file": new_config}) 103 | updated_config = yaml.safe_load( 104 | self.harness.charm.container.pull(self.harness.charm._config_path) 105 | ) 106 | 107 | self.assertListEqual( 108 | sorted(updated_config["route"]["group_by"]), 109 | sorted(["alertname", "juju_model", "juju_application", "juju_model_uuid"]), 110 | ) 111 | 112 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 113 | @k8s_resource_multipatch 114 | @patch.object(AlertmanagerCharm, "_update_ca_certs", lambda *a, **kw: None) 115 | def test_topology_is_not_added_if_user_provided_config_with_ellipsis(self, *unused): 116 | """The special value '...' effectively disables aggregation entirely. 117 | 118 | Ref: https://prometheus.io/docs/alerting/latest/configuration/#route 119 | """ 120 | new_config = yaml.dump({"route": {"group_by": ["..."]}}) 121 | self.harness.update_config({"config_file": new_config}) 122 | updated_config = yaml.safe_load( 123 | self.harness.charm.container.pull(self.harness.charm._config_path) 124 | ) 125 | 126 | self.assertListEqual( 127 | updated_config["route"]["group_by"], 128 | sorted(["..."]), 129 | ) 130 | 131 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 132 | @k8s_resource_multipatch 133 | @patch.object(AlertmanagerCharm, "_update_ca_certs", lambda *a, **kw: None) 134 | def test_charm_blocks_if_user_provided_config_with_templates(self, *unused): 135 | new_config = yaml.dump({"templates": ["/what/ever/*.tmpl"]}) 136 | self.harness.update_config({"config_file": new_config}) 137 | self.assertIsInstance(self.harness.charm.unit.status, BlockedStatus) 138 | 139 | new_config = yaml.dump({}) 140 | self.harness.update_config({"config_file": new_config}) 141 | self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus) 142 | 143 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 144 | @k8s_resource_multipatch 145 | @patch.object(AlertmanagerCharm, "_update_ca_certs", lambda *a, **kw: None) 146 | def test_templates_file_not_created_if_user_provides_templates_without_config(self, *unused): 147 | templates = '{{ define "some.tmpl.variable" }}whatever it is{{ end}}' 148 | self.harness.update_config({"templates_file": templates}) 149 | 150 | # The testing harness's pull() used to raise FileNotFoundError, but 151 | # now it (correctly) raises pebble.PathError as per the real system, 152 | # so catch both. 153 | # TODO: update to just pebble.PathError when ops 2.1 is released. 
154 | with self.assertRaises((pebble.PathError, FileNotFoundError)): 155 | self.harness.charm.container.pull(self.harness.charm._templates_path) 156 | 157 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 158 | @k8s_resource_multipatch 159 | @patch.object(AlertmanagerCharm, "_update_ca_certs", lambda *a, **kw: None) 160 | def test_templates_section_added_if_user_provided_templates(self, *unused): 161 | new_config = yaml.dump({"route": {"group_by": ["alertname", "juju_model"]}}) 162 | self.harness.update_config({"config_file": new_config}) 163 | templates = '{{ define "some.tmpl.variable" }}whatever it is{{ end}}' 164 | self.harness.update_config({"templates_file": templates}) 165 | updated_templates = self.harness.charm.container.pull(self.harness.charm._templates_path) 166 | self.assertEqual(templates, updated_templates.read()) 167 | 168 | updated_config = yaml.safe_load( 169 | self.harness.charm.container.pull(self.harness.charm._config_path) 170 | ) 171 | self.assertEqual(updated_config["templates"], [f"{self.harness.charm._templates_path}"]) 172 | 173 | 174 | class TestWithoutInitialHooks(unittest.TestCase): 175 | container_name: str = "alertmanager" 176 | 177 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 178 | @k8s_resource_multipatch 179 | @patch("lightkube.core.client.GenericSyncClient") 180 | def setUp(self, *unused): 181 | self.harness = Harness(AlertmanagerCharm) 182 | self.addCleanup(self.harness.cleanup) 183 | 184 | self.relation_id = self.harness.add_relation("alerting", "otherapp") 185 | self.harness.add_relation_unit(self.relation_id, "otherapp/0") 186 | self.harness.set_leader(True) 187 | 188 | self.harness.begin() 189 | self.harness.add_relation("replicas", "alertmanager") 190 | 191 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 192 | @k8s_resource_multipatch 193 | @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) 194 | def test_unit_status_around_pebble_ready(self, *unused): 195 | # before pebble_ready, status should be "maintenance" 196 | self.assertIsInstance(self.harness.charm.unit.status, ops.model.MaintenanceStatus) 197 | 198 | # after pebble_ready, status should be "active" 199 | self.harness.container_pebble_ready(self.container_name) 200 | self.assertIsInstance(self.harness.charm.unit.status, ops.model.ActiveStatus) 201 | 202 | self.assertEqual(self.harness.model.unit.name, "alertmanager-k8s/0") 203 | 204 | 205 | class TestActions(unittest.TestCase): 206 | container_name: str = "alertmanager" 207 | 208 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 209 | @patch("socket.getfqdn", new=lambda *args: "fqdn") 210 | @k8s_resource_multipatch 211 | @patch("lightkube.core.client.GenericSyncClient") 212 | @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) 213 | def setUp(self, *unused): 214 | self.harness = Harness(AlertmanagerCharm) 215 | self.addCleanup(self.harness.cleanup) 216 | 217 | self.harness.set_leader(True) 218 | self.harness.begin_with_initial_hooks() 219 | 220 | @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) 221 | @k8s_resource_multipatch 222 | @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) 223 | def test_show_config(self, *_unused): 224 | tls_paths = { 225 | self.harness.charm._server_cert_path, 226 | self.harness.charm._ca_cert_path, 227 | self.harness.charm._key_path, 228 | } 229 | 230 | # GIVEN an 
isolated charm (see setUp, decorator) 231 | # WHEN the "show-config" action runs 232 | results = self.harness.run_action("show-config").results 233 | 234 | # THEN the result is a dict some keys 235 | self.assertEqual(results.keys(), {"path", "content", "configs"}) 236 | 237 | # AND configs DOES NOT contain cert-related entries 238 | # results.configs is a list of dicts, [{"path": ..., "content": ...}, {...}, ...]. 239 | paths_rendered = {d["path"] for d in yaml.safe_load(results["configs"])} 240 | for filepath in tls_paths: 241 | self.assertNotIn(filepath, paths_rendered) 242 | 243 | # AND GIVEN a tls relation is in place 244 | rel_id = self.harness.add_relation("certificates", "ca") 245 | self.harness.add_relation_unit(rel_id, "ca/0") 246 | # AND cert files are on disk 247 | for filepath in tls_paths: 248 | self.harness.model.unit.get_container("alertmanager").push( 249 | filepath, "test", make_dirs=True 250 | ) 251 | 252 | # WHEN the "show-config" action runs 253 | results = self.harness.run_action("show-config").results 254 | 255 | # THEN the result is a dict with the same keys as before 256 | self.assertEqual(results.keys(), {"path", "content", "configs"}) 257 | 258 | # AND configs contains cert-related entries 259 | paths_rendered = {d["path"] for d in yaml.safe_load(results["configs"])} 260 | for filepath in tls_paths: 261 | self.assertIn(filepath, paths_rendered) 262 | -------------------------------------------------------------------------------- /src/alertmanager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2023 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | """Workload manager for alertmanaqger.""" 6 | 7 | import logging 8 | import os 9 | import re 10 | from typing import Callable, Dict, List, Optional, Tuple 11 | 12 | from ops.framework import Object 13 | from ops.model import Container 14 | from ops.pebble import ( # type: ignore 15 | ChangeError, 16 | ExecError, 17 | Layer, 18 | ) 19 | 20 | from alertmanager_client import Alertmanager, AlertmanagerBadResponse 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class ConfigFileSystemState: 26 | """Class representing the configuration state in a filesystem.""" 27 | 28 | def __init__(self, manifest: Optional[Dict[str, Optional[str]]] = None): 29 | self._manifest = manifest.copy() if manifest else {} 30 | 31 | @property 32 | def manifest(self) -> Dict[str, Optional[str]]: 33 | """Return a copy of the planned manifest.""" 34 | return self._manifest.copy() 35 | 36 | def add_file(self, path: str, content: str): 37 | """Add a file to the configuration.""" 38 | # `None` means it needs to be removed (if present). If paths changed across an upgrade, 39 | # to prevent stale files from remaining (if were previously written to persistent 40 | # storage), hard-code the old paths to None to guarantee their removal. 
41 | self._manifest[path] = content 42 | 43 | def delete_file(self, path: str): 44 | """Add a file to the configuration.""" 45 | self._manifest[path] = None 46 | 47 | def apply(self, container: Container): 48 | """Apply this manifest onto a container.""" 49 | for filepath, content in self._manifest.items(): 50 | if content is None: 51 | container.remove_path(filepath, recursive=True) 52 | else: 53 | container.push(filepath, content, make_dirs=True) 54 | 55 | 56 | class WorkloadManagerError(Exception): 57 | """Base class for exceptions raised by WorkloadManager.""" 58 | 59 | 60 | class ConfigUpdateFailure(WorkloadManagerError): 61 | """Custom exception for failed config updates.""" 62 | 63 | 64 | class ContainerNotReady(WorkloadManagerError): 65 | """Raised when an operation is run that presumes the container being ready..""" 66 | 67 | 68 | class WorkloadManager(Object): 69 | """Workload manager for alertmanager.""" 70 | 71 | _layer_name = _service_name = _exe_name = "alertmanager" 72 | 73 | # path, inside the workload container for alertmanager data, e.g. 'nflogs', 'silences'. 74 | _storage_path = "/alertmanager" 75 | 76 | _amtool_path = "/usr/bin/amtool" 77 | 78 | def __init__( 79 | self, 80 | charm, 81 | *, 82 | container_name: str, 83 | peer_netlocs: List[str], 84 | api_port: int, 85 | ha_port: int, 86 | web_external_url: str, 87 | web_route_prefix: str, 88 | config_path: str, 89 | web_config_path: str, 90 | tls_enabled: Callable[[], bool], 91 | cafile: Optional[str], 92 | ): 93 | # Must inherit from ops 'Object' to be able to register events. 94 | super().__init__(charm, f"{self.__class__.__name__}-{container_name}") 95 | 96 | self._unit = charm.unit 97 | 98 | self._service_name = self._container_name = container_name 99 | self._container = charm.unit.get_container(container_name) 100 | 101 | self._peer_netlocs = peer_netlocs 102 | 103 | self._api_port = api_port 104 | self._ha_port = ha_port 105 | self.api = Alertmanager(endpoint_url=web_external_url, cafile=cafile) 106 | self._web_external_url = web_external_url 107 | self._web_route_prefix = web_route_prefix 108 | self._config_path = config_path 109 | self._web_config_path = web_config_path 110 | self._is_tls_enabled = tls_enabled 111 | 112 | # turn the container name to a valid Python identifier 113 | snake_case_container_name = self._container_name.replace("-", "_") 114 | charm.framework.observe( 115 | charm.on[snake_case_container_name].pebble_ready, 116 | self._on_pebble_ready, 117 | ) 118 | charm.framework.observe(charm.on.stop, self._on_stop) 119 | 120 | @property 121 | def is_ready(self): 122 | """Is the workload ready to be interacted with?""" 123 | return self._container.can_connect() 124 | 125 | def _on_pebble_ready(self, _): 126 | if version := self._alertmanager_version: 127 | self._unit.set_workload_version(version) 128 | else: 129 | logger.debug( 130 | "Cannot set workload version at this time: could not get Alertmanager version." 131 | ) 132 | 133 | def _on_stop(self, _): 134 | self._unit.set_workload_version("") 135 | 136 | @property 137 | def _alertmanager_version(self) -> Optional[str]: 138 | """Returns the version of Alertmanager. 139 | 140 | Returns: 141 | A string equal to the Alertmanager version. 142 | """ 143 | if not self.is_ready: 144 | return None 145 | version_output, _ = self._container.exec( 146 | [self._exe_name, "--version"], timeout=30 147 | ).wait_output() 148 | # Output looks like this: 149 | # alertmanager, version 0.23.0 (branch: HEAD, ... 
150 | result = re.search(r"version (\d*\.\d*\.\d*)", version_output) 151 | if result is None: 152 | return result 153 | return result.group(1) 154 | 155 | def check_config(self) -> Tuple[str, str]: 156 | """Check config with amtool. 157 | 158 | Returns stdout, stderr. 159 | """ 160 | if not self.is_ready: 161 | raise ContainerNotReady( 162 | "cannot check config: alertmanager workload container not ready" 163 | ) 164 | proc = self._container.exec( 165 | [self._amtool_path, "check-config", self._config_path], timeout=30 166 | ) 167 | try: 168 | output, err = proc.wait_output() 169 | except ExecError as e: 170 | output, err = str(e.stdout), str(e.stderr) 171 | # let ChangeError raise 172 | return output, err 173 | 174 | def _alertmanager_layer(self) -> Layer: 175 | """Returns Pebble configuration layer for alertmanager.""" 176 | 177 | def _command(): 178 | """Returns full command line to start alertmanager.""" 179 | # cluster listen netloc - empty string disables HA mode 180 | listen_netloc_arg = "" if len(self._peer_netlocs) == 0 else f"0.0.0.0:{self._ha_port}" 181 | 182 | # The chosen port in the cluster.listen-address flag is the port that needs to be 183 | # specified in the cluster.peer flag of the other peers. 184 | # Assuming all replicas use the same port. 185 | # Sorting for repeatability in comparing between service layers. 186 | peer_cmd_args = " ".join( 187 | sorted([f"--cluster.peer={netloc}" for netloc in self._peer_netlocs]) 188 | ) 189 | web_config_arg = ( 190 | f"--web.config.file={self._web_config_path} " if self._is_tls_enabled() else "" 191 | ) 192 | return ( 193 | f"{self._exe_name} " 194 | f"--config.file={self._config_path} " 195 | f"--storage.path={self._storage_path} " 196 | f"--web.listen-address=:{self._api_port} " 197 | f"--cluster.listen-address={listen_netloc_arg} " 198 | f"--web.external-url={self._web_external_url} " 199 | f"--web.route-prefix={self._web_route_prefix} " 200 | f"{web_config_arg}" 201 | f"{peer_cmd_args}" 202 | ) 203 | 204 | def _environment(): 205 | return { 206 | "https_proxy": os.environ.get("JUJU_CHARM_HTTPS_PROXY", ""), 207 | "http_proxy": os.environ.get("JUJU_CHARM_HTTP_PROXY", ""), 208 | "no_proxy": os.environ.get("JUJU_CHARM_NO_PROXY", ""), 209 | } 210 | 211 | return Layer( 212 | { 213 | "summary": "alertmanager layer", 214 | "description": "pebble config layer for alertmanager", 215 | "services": { 216 | self._service_name: { 217 | "override": "replace", 218 | "summary": "alertmanager service", 219 | "command": _command(), 220 | "startup": "enabled", 221 | "environment": _environment(), 222 | } 223 | }, 224 | } 225 | ) 226 | 227 | def update_layer(self) -> None: 228 | """Update service layer to reflect changes in peers (replicas).""" 229 | if not self.is_ready: 230 | raise ContainerNotReady("cannot update layer") 231 | 232 | overlay = self._alertmanager_layer() 233 | 234 | self._container.add_layer(self._layer_name, overlay, combine=True) 235 | try: 236 | # If a config is invalid then alertmanager would exit immediately. 237 | # This would be caught by pebble (default timeout is 30 sec) and a ChangeError 238 | # would be raised. 239 | self._container.replan() 240 | except ChangeError as e: 241 | logger.error( 242 | "Failed to replan; pebble plan: %s; %s", 243 | self._container.get_plan().to_dict(), 244 | str(e), 245 | ) 246 | 247 | def update_config(self, manifest: ConfigFileSystemState) -> None: 248 | """Update alertmanager config files to reflect changes in configuration. 
249 | 250 | After pushing a new config, a hot-reload is attempted. If hot-reload fails, the service is 251 | restarted. 252 | 253 | Raises: 254 | ConfigUpdateFailure, if failed to update configuration file. 255 | """ 256 | if not self.is_ready: 257 | raise ContainerNotReady("cannot update config") 258 | 259 | logger.debug("applying config changes") 260 | manifest.apply(self._container) 261 | 262 | # Validate with amtool and raise if bad 263 | try: 264 | self.check_config() 265 | except WorkloadManagerError as e: 266 | raise ConfigUpdateFailure("Failed to validate config (run check-config action)") from e 267 | 268 | def restart_service(self) -> bool: 269 | """Helper function for restarting the underlying service. 270 | 271 | Returns: 272 | True if restart succeeded; False otherwise. 273 | """ 274 | logger.info("Restarting service %s", self._service_name) 275 | 276 | if not self.is_ready: 277 | logger.error("Cannot (re)start service: container is not ready.") 278 | return False 279 | 280 | # Check if service exists, to avoid ModelError from being raised when the service does 281 | # not exist, 282 | if not self._container.get_plan().services.get(self._service_name): 283 | logger.error("Cannot (re)start service: service does not (yet) exist.") 284 | return False 285 | 286 | self._container.restart(self._service_name) 287 | 288 | return True 289 | 290 | def reload(self) -> None: 291 | """Trigger a hot-reload of the configuration (or service restart). 292 | 293 | Raises: 294 | ConfigUpdateFailure, if the reload (or restart) fails. 295 | """ 296 | if not self.is_ready: 297 | raise ContainerNotReady("cannot reload") 298 | 299 | # Obtain a "before" snapshot of the config from the server. 300 | # This is different from `config` above because alertmanager adds in a bunch of details 301 | # such as: 302 | # 303 | # smtp_hello: localhost 304 | # smtp_require_tls: true 305 | # pagerduty_url: https://events.pagerduty.com/v2/enqueue 306 | # opsgenie_api_url: https://api.opsgenie.com/ 307 | # wechat_api_url: https://qyapi.weixin.qq.com/cgi-bin/ 308 | # victorops_api_url: https://alert.victorops.com/integrations/generic/20131114/alert/ 309 | # 310 | # The snapshot is needed to determine if reloading took place. 311 | try: 312 | config_from_server_before = self.api.config() 313 | except AlertmanagerBadResponse: 314 | config_from_server_before = None 315 | 316 | # Send an HTTP POST to alertmanager to hot-reload the config. 317 | # This reduces down-time compared to restarting the service. 318 | try: 319 | self.api.reload() 320 | except AlertmanagerBadResponse as e: 321 | logger.warning("config reload via HTTP POST failed: %s", str(e)) 322 | # hot-reload failed so attempting a service restart 323 | if not self.restart_service(): 324 | raise ConfigUpdateFailure( 325 | "Is config valid? hot reload and service restart failed." 326 | ) 327 | 328 | # Obtain an "after" snapshot of the config from the server. 329 | try: 330 | config_from_server_after = self.api.config() 331 | except AlertmanagerBadResponse: 332 | config_from_server_after = None 333 | 334 | if config_from_server_before is None or config_from_server_after is None: 335 | logger.warning("cannot determine if reload succeeded") 336 | elif config_from_server_before == config_from_server_after: 337 | logger.warning("config remained the same after a reload") 338 | --------------------------------------------------------------------------------
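
The `update_config()` and `reload()` methods of WorkloadManager above are meant to be driven together by the charm: push the rendered files, validate them with `amtool`, then hot-reload (falling back to a Pebble service restart). A minimal sketch of that flow, assuming the classes defined in src/alertmanager.py; the `workload` instance, the file paths, and the `push_and_reload` helper name are illustrative assumptions, not part of this repository:

# Illustrative sketch only -- the paths and the pre-built `workload` object are assumptions.
from typing import Optional

from alertmanager import ConfigFileSystemState, WorkloadManager


def push_and_reload(
    workload: WorkloadManager, rendered_config: str, templates: Optional[str]
) -> None:
    manifest = ConfigFileSystemState()
    manifest.add_file("/etc/alertmanager/alertmanager.yml", rendered_config)  # hypothetical path
    if templates is not None:
        manifest.add_file("/etc/alertmanager/templates.tmpl", templates)  # hypothetical path
    else:
        # A `None` entry in the manifest guarantees a stale file is removed
        # from the workload container when the manifest is applied.
        manifest.delete_file("/etc/alertmanager/templates.tmpl")

    # Pushes the manifest into the container and validates it with
    # `amtool check-config`; raises ConfigUpdateFailure on invalid config.
    workload.update_config(manifest)

    # Hot-reloads via the Alertmanager HTTP API, falling back to a service
    # restart; raises ConfigUpdateFailure if both attempts fail.
    workload.reload()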