├── .jujuignore ├── CODEOWNERS ├── .gitignore ├── requirements.txt ├── actions.yaml ├── tests ├── unit │ ├── helpers.py │ ├── test_alertmanager_client.py │ ├── charm │ │ └── test_push_config_to_workload_on_startup.py │ ├── test_charm.py │ └── test_consumer.py └── integration │ ├── conftest.py │ ├── test_kubectl_delete.py │ ├── test_update_status_pressure.py │ ├── test_config_changed_modifies_file.py │ ├── test_upgrade_charm.py │ ├── test_rescale_charm.py │ ├── helpers.py │ ├── test_rerelate_alertmanager_dispatch_metrics_endpoint.py │ └── test_templates.py ├── charmcraft.yaml ├── src ├── prometheus_alert_rules │ ├── alertmanager_notifications_failed.rule │ ├── alertmanager_missing.rule │ └── alertmanager_configuration_reload_failure.rule ├── alertmanager_client.py └── charm.py ├── .github ├── ISSUE_TEMPLATE │ ├── enhancement_proposal.yml │ └── bug_report.yml └── workflows │ ├── promote.yaml │ ├── pull-request.yaml │ ├── issues.yml │ ├── codeql-analysis.yml │ └── release-edge.yaml ├── config.yaml ├── INTEGRATING.md ├── RELEASE.md ├── metadata.yaml ├── pyproject.toml ├── icon.svg ├── tox.ini ├── CONTRIBUTING.md ├── README.md ├── lib └── charms │ ├── alertmanager_k8s │ └── v0 │ │ └── alertmanager_dispatch.py │ ├── observability_libs │ └── v0 │ │ └── kubernetes_service_patch.py │ ├── karma_k8s │ └── v0 │ │ └── karma_dashboard.py │ └── grafana_k8s │ └── v0 │ └── grafana_source.py └── LICENSE /.jujuignore: -------------------------------------------------------------------------------- 1 | /venv 2 | **/__pycache__ 3 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @sed-i @abuelodelanada @rbarry82 @balbirthomas @dstathis @simskij 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | build/ 3 | *.charm 4 | *.orig 5 | .coverage 6 | **/__pycache__/ 7 | *.py[cod] 8 | .idea/ 9 | .tox/ 10 | .mypy_cache -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | ops 5 | PyYAML 6 | lightkube 7 | lightkube-models 8 | -------------------------------------------------------------------------------- /actions.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | show-config: 5 | description: Show alertmanager config file. 6 | -------------------------------------------------------------------------------- /tests/unit/helpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | """Helper functions for writing tests.""" 6 | 7 | 8 | def no_op(*args, **kwargs) -> None: 9 | pass 10 | 11 | 12 | def tautology(*args, **kwargs) -> bool: 13 | return True 14 | -------------------------------------------------------------------------------- /charmcraft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
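# Descriptive note (added): this file drives `charmcraft pack`. The charm is built on
# and runs on Ubuntu 20.04, and git is listed as a build package, presumably so that
# charmcraft can fetch or resolve any git-based build dependencies during packing.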
3 | 4 | type: charm 5 | bases: 6 | - build-on: 7 | - name: "ubuntu" 8 | channel: "20.04" 9 | run-on: 10 | - name: "ubuntu" 11 | channel: "20.04" 12 | parts: 13 | charm: 14 | build-packages: 15 | - git 16 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/alertmanager_notifications_failed.rule: -------------------------------------------------------------------------------- 1 | alert: AlertmanagerNotificationsFailed 2 | expr: alertmanager_notifications_failed_total{integration=~".*"} != 0 3 | for: 0m 4 | labels: 5 | severity: warning 6 | annotations: 7 | summary: Alertmanager notifications failure (instance {{ $labels.instance }}) 8 | description: | 9 | Alertmanager notifications failure 10 | VALUE = {{ $value }} 11 | LABELS = {{ $labels }} 12 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/alertmanager_missing.rule: -------------------------------------------------------------------------------- 1 | # Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 2 | alert: AlertmanagerJobMissing 3 | expr: absent(up{}) 4 | for: 0m 5 | labels: 6 | severity: warning 7 | annotations: 8 | summary: Alertmanager job missing (instance {{ $labels.instance }}) 9 | description: | 10 | A Alertmanager job has disappeared 11 | VALUE = {{ $value }} 12 | LABELS = {{ $labels }} 13 | -------------------------------------------------------------------------------- /src/prometheus_alert_rules/alertmanager_configuration_reload_failure.rule: -------------------------------------------------------------------------------- 1 | # Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 2 | alert: AlertmanagerConfigurationReloadFailure 3 | expr: alertmanager_config_last_reload_successful{} != 1 4 | for: 0m 5 | labels: 6 | severity: warning 7 | annotations: 8 | summary: Alertmanager configuration reload failure (instance {{ $labels.instance }}) 9 | description: | 10 | Alertmanager configuration reload error 11 | VALUE = {{ $value }} 12 | LABELS = {{ $labels }} 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement_proposal.yml: -------------------------------------------------------------------------------- 1 | name: Enhancement Proposal 2 | description: File an enhancement proposal 3 | labels: ["Type: Enhancement", "Status: Triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | Thanks for taking the time to fill out this enhancement proposal! Before submitting your issue, please make 9 | sure there isn't already a prior issue concerning this. If there is, please join that discussion instead. 10 | - type: textarea 11 | id: enhancement-proposal 12 | attributes: 13 | label: Enhancement Proposal 14 | description: > 15 | Describe the enhancement you would like to see in as much detail as needed. 16 | validations: 17 | required: true 18 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | options: 5 | config_file: 6 | type: string 7 | description: > 8 | Alertmanager configuration file (yaml), with the exclusion of the templates section. 9 | Refer to https://www.prometheus.io/docs/alerting/latest/configuration/ for full details. 
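    # Illustrative usage (file name is hypothetical, not part of this repo):
    #   juju config alertmanager-k8s config_file="$(cat alertmanager.yml)"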
10 | default: "" 11 | templates_file: 12 | type: string 13 | description: > 14 | Alertmanager templates definition file. This is a slight deviation from the official 15 | alertmanager config spec. All templates need to go into this single config option, instead of 16 | the 'templates' section of the main configuration file. The templates will be pushed to the 17 | workload container, and the configuration file will be updated accordingly. 18 | Refer to https://prometheus.io/docs/alerting/latest/notification_examples/ for more details 19 | on templates. 20 | default: "" 21 | -------------------------------------------------------------------------------- /INTEGRATING.md: -------------------------------------------------------------------------------- 1 | # Integrating alertmanager-k8s 2 | 3 | ## Provides 4 | 5 | ### alertmanager_dispatch 6 | 7 | Any charm that implements the 8 | [`alertmanager_dispatch`](https://charmhub.io/alertmanager-k8s/libraries/alertmanager_dispatch) 9 | relation interface can be related to this charm for forwarding alerts to alertmanager, 10 | for example: [Prometheus][Prometheus operator], [Loki][Loki operator]. 11 | 12 | ``` 13 | juju relate alertmanager-k8s prometheus-k8s 14 | ``` 15 | 16 | ### karma_dashboard 17 | The [`karma_dashboard`](https://charmhub.io/karma-k8s/libraries/karma_dashboard) 18 | relation interface links an entire Alertmanager cluster to a 19 | [Karma][Karma operator] dashboard. 20 | Scaling alertmanager would automatically cause karma to group alerts by 21 | cluster. 22 | 23 | ``` 24 | juju relate alertmanager-k8s karma-k8s 25 | ``` 26 | 27 | ## Requires 28 | None. 29 | 30 | [Loki operator]: https://charmhub.io/loki-k8s 31 | [Prometheus operator]: https://charmhub.io/prometheus-k8s 32 | [Karma operator]: https://charmhub.io/karma-k8s/ 33 | -------------------------------------------------------------------------------- /tests/integration/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
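# Descriptive note (added): this conftest supplies two fixtures shared by the
# integration tests -- `charm_under_test`, which builds the local charm once per test
# module, and `httpserver_listen_address`, which determines the host's outbound IP
# (falling back to 127.0.0.1) so pytest-httpserver can listen on a non-localhost address.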
4 | 5 | import socket 6 | from pathlib import Path 7 | 8 | import pytest 9 | from pytest_operator.plugin import OpsTest 10 | 11 | PYTEST_HTTP_SERVER_PORT = 8000 12 | 13 | 14 | @pytest.fixture(scope="module") 15 | async def charm_under_test(ops_test: OpsTest) -> Path: 16 | """Charm used for integration testing.""" 17 | path_to_built_charm = await ops_test.build_charm(".") 18 | 19 | return path_to_built_charm 20 | 21 | 22 | @pytest.fixture(scope="session") 23 | def httpserver_listen_address(): 24 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 25 | s.settimeout(0) 26 | try: 27 | # ip address does not need to be reachable 28 | s.connect(("8.8.8.8", 1)) 29 | local_ip_address = s.getsockname()[0] 30 | except Exception: 31 | local_ip_address = "127.0.0.1" 32 | finally: 33 | s.close() 34 | return (local_ip_address, PYTEST_HTTP_SERVER_PORT) 35 | -------------------------------------------------------------------------------- /.github/workflows/promote.yaml: -------------------------------------------------------------------------------- 1 | name: Promote Charm 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | promotion: 7 | type: choice 8 | description: Channel to promote from 9 | options: 10 | - edge -> beta 11 | - beta -> candidate 12 | - candidate -> stable 13 | 14 | jobs: 15 | promote: 16 | name: Promote Charm 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | - name: Set target channel 22 | env: 23 | PROMOTE_FROM: ${{ github.event.inputs.promotion }} 24 | run: | 25 | if [ "${PROMOTE_FROM}" == "edge -> beta" ]; then 26 | echo "promote-from=edge" >> ${GITHUB_ENV} 27 | echo "promote-to=beta" >> ${GITHUB_ENV} 28 | elif [ "${PROMOTE_FROM}" == "beta -> candidate" ]; then 29 | echo "promote-from=beta" >> ${GITHUB_ENV} 30 | echo "promote-to=candidate" >> ${GITHUB_ENV} 31 | elif [ "${PROMOTE_FROM}" == "candidate -> stable" ]; then 32 | echo "promote-from=candidate" >> ${GITHUB_ENV} 33 | echo "promote-to=stable" >> ${GITHUB_ENV} 34 | fi 35 | - name: Promote Charm 36 | uses: canonical/charming-actions/release-charm@1.0.3 37 | with: 38 | credentials: ${{ secrets.CHARMHUB_TOKEN }} 39 | github-token: ${{ secrets.GITHUB_TOKEN }} 40 | destination-channel: latest/${{ env.promote-to }} 41 | origin-channel: latest/${{ env.promote-from }} 42 | charmcraft-channel: latest/stable 43 | -------------------------------------------------------------------------------- /tests/integration/test_kubectl_delete.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2022 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
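# Descriptive note (added): this module exercises self-healing. The locally built charm
# is deployed, its pod is deleted out-of-band via `microk8s.kubectl delete pod`, and the
# test waits for the unit to be recreated and for Alertmanager to report as up again.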
4 | 5 | 6 | import logging 7 | from pathlib import Path 8 | 9 | import pytest 10 | import yaml 11 | from helpers import is_alertmanager_up 12 | from pytest_operator.plugin import OpsTest 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) 17 | app_name = METADATA["name"] 18 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 19 | 20 | 21 | @pytest.mark.abort_on_fail 22 | async def test_deploy_from_local_path(ops_test: OpsTest, charm_under_test): 23 | """Deploy the charm-under-test.""" 24 | logger.debug("deploy local charm") 25 | 26 | await ops_test.model.deploy(charm_under_test, application_name=app_name, resources=resources) 27 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 28 | await is_alertmanager_up(ops_test, app_name) 29 | 30 | 31 | @pytest.mark.abort_on_fail 32 | async def test_kubectl_delete_pod(ops_test: OpsTest): 33 | pod_name = f"{app_name}-0" 34 | 35 | cmd = [ 36 | "sg", 37 | "microk8s", 38 | "-c", 39 | " ".join(["microk8s.kubectl", "delete", "pod", "-n", ops_test.model_name, pod_name]), 40 | ] 41 | 42 | logger.debug( 43 | "Removing pod '%s' from model '%s' with cmd: %s", pod_name, ops_test.model_name, cmd 44 | ) 45 | 46 | retcode, stdout, stderr = await ops_test.run(*cmd) 47 | assert retcode == 0, f"kubectl failed: {(stderr or stdout).strip()}" 48 | logger.debug(stdout) 49 | await ops_test.model.block_until(lambda: len(ops_test.model.applications[app_name].units) > 0) 50 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 51 | assert await is_alertmanager_up(ops_test, app_name) 52 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release Process 2 | 3 | ## Overview 4 | 5 | At any given time there are three revisions of the Alertmanager charm [available on CharmHub.io](https://charmhub.io/alertmanager-k8s), for each of the following channels: 6 | 7 | 1. `latest/stable` is a well tested production ready version of the Charm. 8 | 2. `latest/candidate` is a feature ready next version of the stable release, currently in testing. 9 | 3. `latest/edge` is the bleeding edge developer version of the charm. While we really try not to, it may break and introduce regressions. 10 | 11 | Currently, the Alertmanager charm does not make use of the `latest/beta` channel. 12 | For more information about CharmHub channels, refer to the [Juju charm store](https://discourse.charmhub.io/t/the-juju-charm-store) documentation. 13 | 14 | ## When to create which revisions 15 | 16 | * **Stable revisions** are done in consultation with product manager and engineering manager when the `candidate` revision has been well tested and is deemed ready for production. 17 | * **Candidate revisions** are done when the charm reaches a state of feature completion with respect to the next planned `stable` release. 18 | * **Edge revisions** are released at the developer's discretion, potentially every time something is merged into `main` and the unit tests pass. 19 | 20 | ## How to publish revisions 21 | 22 | Refer to the [Publish your operator in Charmhub](https://discourse.charmhub.io/t/publish-your-operator-in-charmhub) documentation. 
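As an illustrative sketch (the revision number here is hypothetical), promoting an
already-uploaded revision is a single `charmcraft` invocation:

```
charmcraft release alertmanager-k8s --revision=42 --channel=latest/candidate
```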
23 | After a `latest/stable` release, it is expected that the version of the charm is the same as the one in `latest/candidate`, and those two channels will diverge again when we are ramping up through `latest/candidate` releases for a new `latest/stable` release. 24 | 25 | ## A note on granularity of revisions 26 | 27 | We believe in shipping often and with confidence. 28 | It is perfectly acceptable to have a new `latest/stable` release containing just one bug fix or a small new feature with respect to the last one. 29 | -------------------------------------------------------------------------------- /.github/workflows/pull-request.yaml: -------------------------------------------------------------------------------- 1 | name: Pull Request 2 | on: 3 | pull_request: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | lib-check: 9 | name: Static analysis of /lib for Python 3.5 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v2 14 | - name: Set up Python 3.5 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.5 18 | - name: Install dependencies 19 | run: python3 -m pip install tox 20 | - name: Run static analysis for /lib for 3.5 21 | run: tox -vve static-lib 22 | static-analysis: 23 | name: Static analysis 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v2 28 | - name: Install dependencies 29 | run: python3 -m pip install tox 30 | - name: Run static analysis (charm) 31 | run: tox -vve static-charm 32 | - name: Run static analysis (unit tests) 33 | run: tox -vve static-unit 34 | - name: Run static analysis (integration tests) 35 | run: tox -vve static-integration 36 | lint: 37 | name: Lint 38 | runs-on: ubuntu-latest 39 | steps: 40 | - name: Checkout 41 | uses: actions/checkout@v2 42 | - name: Install dependencies 43 | run: python3 -m pip install tox 44 | - name: Run linters 45 | run: tox -vve lint 46 | unit-test: 47 | name: Unit tests 48 | runs-on: ubuntu-latest 49 | steps: 50 | - name: Checkout 51 | uses: actions/checkout@v2 52 | - name: Install dependencies 53 | run: python -m pip install tox 54 | - name: Run tests 55 | run: tox -vve unit 56 | integration-test-microk8s: 57 | name: Integration tests (microk8s) 58 | runs-on: ubuntu-latest 59 | steps: 60 | - name: Checkout 61 | uses: actions/checkout@v2 62 | - name: Setup operator environment 63 | uses: charmed-kubernetes/actions-operator@main 64 | with: 65 | provider: microk8s 66 | - name: Run alertmanger tests 67 | run: tox -vve integration 68 | -------------------------------------------------------------------------------- /metadata.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | name: alertmanager-k8s 5 | 6 | summary: | 7 | Kubernetes charm for Alertmanager. 8 | 9 | description: | 10 | Alertmanager handles alerts sent by client applications such as the Prometheus server. 11 | It takes care of deduplicating, grouping, and routing them to the correct receiver integrations 12 | such as email, PagerDuty, or OpsGenie. It also takes care of silencing and inhibition of alerts. 13 | 14 | docs: https://discourse.charmhub.io/t/alertmanager-k8s-docs-index/5788 15 | 16 | # workload containers 17 | containers: 18 | alertmanager: # container key used by pebble 19 | resource: alertmanager-image 20 | mounts: 21 | - storage: data 22 | # nflogs and silences files go here. 
With a mounted storage for silences, they persist 23 | # across container restarts. 24 | # This path is passed to alertmanager via the `--storage.path` cli argument. 25 | location: /alertmanager 26 | 27 | # oci-image resources for each container defined above 28 | resources: 29 | alertmanager-image: 30 | type: oci-image 31 | description: OCI image for alertmanager 32 | upstream-source: ubuntu/prometheus-alertmanager:0.23-22.04_beta 33 | provides: 34 | alerting: 35 | # The provider (alertmanager) adds the following key-value pair to the relation data bag of 36 | # every alertmanager unit: 37 | # "public_address": : 38 | interface: alertmanager_dispatch 39 | # assumed network type: private 40 | karma-dashboard: 41 | interface: karma_dashboard 42 | self-metrics-endpoint: 43 | interface: prometheus_scrape 44 | grafana-dashboard: 45 | interface: grafana_dashboard 46 | grafana-source: 47 | interface: grafana_datasource 48 | 49 | peers: 50 | replicas: 51 | interface: alertmanager_replica 52 | # assumed network type: private 53 | 54 | storage: 55 | data: 56 | type: filesystem 57 | description: > 58 | Storage path passed to alertmanager via --storage.path argument and used for nflog and silences snapshot 59 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | # Testing tools configuration 5 | [tool.coverage.run] 6 | branch = true 7 | 8 | [tool.coverage.report] 9 | show_missing = true 10 | 11 | # Formatting tools configuration 12 | [tool.black] 13 | line-length = 99 14 | target-version = ["py38"] 15 | 16 | [tool.isort] 17 | profile = "black" 18 | 19 | # Linting tools configuration 20 | [tool.flake8] 21 | max-line-length = 99 22 | max-doc-length = 99 23 | max-complexity = 10 24 | exclude = [".git", "__pycache__", ".tox", "build", "dist", "*.egg_info", "venv"] 25 | select = ["E", "W", "F", "C", "N", "R", "D", "H"] 26 | # Ignore W503, E501 because using black creates errors with this 27 | # Ignore D107 Missing docstring in __init__ 28 | ignore = ["W503", "E501", "D107"] 29 | # D100, D101, D102, D103: Ignore missing docstrings in tests 30 | per-file-ignores = ["tests/*:D100,D101,D102,D103"] 31 | docstring-convention = "google" 32 | # Check for properly formatted copyright header in each file 33 | copyright-check = "True" 34 | copyright-author = "Canonical Ltd." 
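# Illustrative example (added note) of a header accepted by the regexp below:
#   # Copyright 2021 Canonical Ltd.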
35 | copyright-regexp = "Copyright\\s\\d{4}([-,]\\d{4})*\\s+%(author)s" 36 | 37 | # Static analysis tools configuration 38 | [tool.mypy] 39 | pretty = true 40 | python_version = 3.8 41 | mypy_path = "$MYPY_CONFIG_FILE_DIR/src:$MYPY_CONFIG_FILE_DIR/lib" 42 | follow_imports = "normal" 43 | warn_redundant_casts = true 44 | warn_unused_ignores = true 45 | warn_unused_configs = true 46 | show_traceback = true 47 | show_error_codes = true 48 | namespace_packages = true 49 | explicit_package_bases = true 50 | check_untyped_defs = true 51 | allow_redefinition = true 52 | 53 | # Ignore libraries that do not have type hint nor stubs 54 | [[tool.mypy.overrides]] 55 | module = ["ops.*", "lightkube.*", "git.*", "pytest_operator.*", "validators.*"] 56 | ignore_missing_imports = true 57 | 58 | [[tool.mypy.overrides]] 59 | module = ["charms.grafana_k8s.*", "charms.observability_libs.*"] 60 | follow_imports = "silent" 61 | warn_unused_ignores = false 62 | 63 | [tool.pytest.ini_options] 64 | minversion = "6.0" 65 | log_cli_level = "INFO" 66 | asyncio_mode = "auto" -------------------------------------------------------------------------------- /.github/workflows/issues.yml: -------------------------------------------------------------------------------- 1 | name: Issues 2 | 3 | on: [issues] 4 | 5 | jobs: 6 | update: 7 | name: Update Issue 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Dump Github Context 11 | run: | 12 | echo '${{ toJSON(github) }}' 13 | echo "update=false" >> $GITHUB_ENV 14 | 15 | if [ ${{ github.event_name }} != "issues" ]; then 16 | echo "This action only operates on issues" 17 | exit 0 18 | fi 19 | 20 | echo "update=true" >> $GITHUB_ENV 21 | - name: Determine action 22 | run: | 23 | if [ ${{ github.event.action }} == "opened" ]; then 24 | echo "action=open" >> $GITHUB_ENV 25 | fi 26 | if [ ${{ github.event.action }} == "reopened" ]; then 27 | echo "action=reopen" >> $GITHUB_ENV 28 | fi 29 | if [ ${{ github.event.action }} == "closed" ]; then 30 | echo "action=close" >> $GITHUB_ENV 31 | fi 32 | - name: Determine type 33 | run: | 34 | if ${{ contains(github.event.*.labels.*.name, 'Type: Bug') }}; then 35 | echo "type=bug" >> $GITHUB_ENV 36 | else 37 | echo "type=story" >> $GITHUB_ENV 38 | fi 39 | - name: Update 40 | if: ${{ env.update == 'true' }} 41 | run: | 42 | id="${{ github.event.issue.html_url }}" 43 | title="${{ github.event.issue.title }}" 44 | component="alertmanager" 45 | 46 | description="Opened by ${{ github.event.issue.user.login }}." 47 | 48 | data=$(jq -n \ 49 | --arg id "$id" \ 50 | --arg action "${{ env.action }}" \ 51 | --arg title "$title" \ 52 | --arg description "$description" \ 53 | --arg component "$component" \ 54 | --arg type "${{ env.type }}" \ 55 | '{data: {id: $id, action: $action, title: $title, description: $description, component: $component, type: $type}}') 56 | 57 | curl -X POST -H 'Content-type: application/json' --data "${data}" "${{ secrets.JIRA_URL }}" 58 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | labels: ["Type: Bug", "Status: Triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | Thanks for taking the time to fill out this bug report! Before submitting your issue, please make 9 | sure you are using the latest version of the charm. 
If not, please switch to this image prior to 10 | posting your report to make sure it's not already solved. 11 | - type: textarea 12 | id: bug-description 13 | attributes: 14 | label: Bug Description 15 | description: > 16 | If applicable, add screenshots to help explain your problem. If applicable, add screenshots to 17 | help explain the problem you are facing. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: reproduction 22 | attributes: 23 | label: To Reproduce 24 | description: > 25 | Please provide a step-by-step instruction of how to reproduce the behavior. 26 | placeholder: | 27 | 1. `juju deploy ...` 28 | 2. `juju relate ...` 29 | 3. `juju status --relations` 30 | validations: 31 | required: true 32 | - type: textarea 33 | id: environment 34 | attributes: 35 | label: Environment 36 | description: > 37 | We need to know a bit more about the context in which you run the charm. 38 | - Are you running Juju locally, on lxd, in multipass or on some other platform? 39 | - What track and channel you deployed the charm from (ie. `latest/edge` or similar). 40 | - Version of any applicable components, like the juju snap, the model controller, lxd, microk8s, and/or multipass. 41 | validations: 42 | required: true 43 | - type: textarea 44 | id: logs 45 | attributes: 46 | label: Relevant log output 47 | description: > 48 | Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 49 | Fetch the logs using `juju debug-log --replay` and `kubectl logs ...`. Additional details available in the juju docs 50 | at https://juju.is/docs/olm/juju-logs 51 | render: shell 52 | validations: 53 | required: true 54 | - type: textarea 55 | id: additional-context 56 | attributes: 57 | label: Additional context 58 | 59 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | name: "CodeQL" 7 | 8 | on: 9 | push: 10 | branches: [main] 11 | pull_request: 12 | # The branches below must be a subset of the branches above 13 | branches: [main] 14 | 15 | permissions: 16 | security-events: 17 | write 18 | 19 | jobs: 20 | analyze: 21 | name: Analyze 22 | runs-on: ubuntu-latest 23 | 24 | strategy: 25 | fail-fast: false 26 | matrix: 27 | # Override automatic language detection by changing the below list 28 | # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python'] 29 | language: ['python'] 30 | # Learn more... 31 | # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection 32 | 33 | steps: 34 | - name: Checkout repository 35 | uses: actions/checkout@v2 36 | 37 | # Initializes the CodeQL tools for scanning. 38 | - name: Initialize CodeQL 39 | uses: github/codeql-action/init@v1 40 | with: 41 | languages: ${{ matrix.language }} 42 | # If you wish to specify custom queries, you can do so here or in a config file. 43 | # By default, queries listed here will override any specified in a config file. 44 | # Prefix the list here with "+" to use these queries and those in the config file. 
45 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 46 | 47 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 48 | # If this step fails, then you should remove it and run the build manually (see below) 49 | - name: Autobuild 50 | uses: github/codeql-action/autobuild@v1 51 | 52 | # ℹ️ Command-line programs to run using the OS shell. 53 | # 📚 https://git.io/JvXDl 54 | 55 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 56 | # and modify them (or add more) to build your code if your project 57 | # uses a compiled language 58 | 59 | #- run: | 60 | # make bootstrap 61 | # make release 62 | 63 | - name: Perform CodeQL Analysis 64 | uses: github/codeql-action/analyze@v1 65 | -------------------------------------------------------------------------------- /.github/workflows/release-edge.yaml: -------------------------------------------------------------------------------- 1 | name: Release to Edge 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | lib-check: 9 | name: Static analysis of /lib for Python 3.5 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v2 14 | - name: Set up Python 3.5 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.5 18 | - name: Install dependencies 19 | run: python3 -m pip install tox 20 | - name: Run static analysis for /lib for 3.5 21 | run: tox -vve static-lib 22 | static-analysis: 23 | name: Static analysis 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v2 28 | - name: Install dependencies 29 | run: python3 -m pip install tox 30 | - name: Run static analysis (charm) 31 | run: tox -vve static-charm 32 | - name: Run static analysis (unit tests) 33 | run: tox -vve static-unit 34 | - name: Run static analysis (integration tests) 35 | run: tox -vve static-integration 36 | lint: 37 | name: Lint 38 | runs-on: ubuntu-latest 39 | steps: 40 | - name: Checkout 41 | uses: actions/checkout@v2 42 | - name: Install dependencies 43 | run: python3 -m pip install tox 44 | - name: Run linters 45 | run: tox -vve lint 46 | unit-test: 47 | name: Unit tests 48 | runs-on: ubuntu-latest 49 | steps: 50 | - name: Checkout 51 | uses: actions/checkout@v2 52 | - name: Install dependencies 53 | run: python -m pip install tox 54 | - name: Run tests 55 | run: tox -vve unit 56 | integration-test: 57 | name: Integration tests (microk8s) 58 | runs-on: ubuntu-latest 59 | steps: 60 | - name: Checkout 61 | uses: actions/checkout@v2 62 | - name: Setup operator environment 63 | uses: charmed-kubernetes/actions-operator@main 64 | with: 65 | provider: microk8s 66 | - name: Run alertmanger tests 67 | run: tox -vve integration 68 | release-to-charmhub: 69 | name: Release to CharmHub 70 | needs: 71 | - static-analysis 72 | - lib-check 73 | - lint 74 | - unit-test 75 | - integration-test 76 | runs-on: ubuntu-latest 77 | steps: 78 | - name: Checkout 79 | uses: actions/checkout@v2 80 | with: 81 | fetch-depth: 0 82 | - name: Select charmhub channel 83 | uses: canonical/charming-actions/channel@1.0.0 84 | id: channel 85 | - name: Upload charm to charmhub 86 | uses: canonical/charming-actions/upload-charm@1.0.0 87 | with: 88 | credentials: "${{ secrets.CHARMHUB_TOKEN }}" 89 | github-token: "${{ secrets.GITHUB_TOKEN }}" 90 | channel: "${{ steps.channel.outputs.name }}" 91 | -------------------------------------------------------------------------------- /icon.svg: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | image/svg+xml -------------------------------------------------------------------------------- /tests/integration/test_update_status_pressure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | """This test module tests common lifecycle behaviors under frequent update-status hook firing. 6 | 7 | 0. Set update-status frequency to the minimum possible 8 | 1. Deploys and relate the charm-under-test 9 | 2. Remove related app(s) 10 | """ 11 | 12 | import asyncio 13 | import logging 14 | from pathlib import Path 15 | 16 | import pytest 17 | import yaml 18 | from helpers import is_alertmanager_up 19 | from pytest_operator.plugin import OpsTest 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) 24 | app_name = METADATA["name"] 25 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 26 | 27 | 28 | @pytest.mark.abort_on_fail 29 | async def test_setup_env(ops_test: OpsTest): 30 | await ops_test.model.set_config( 31 | {"update-status-hook-interval": "10s", "logging-config": "=WARNING; unit=DEBUG"} 32 | ) 33 | 34 | 35 | @pytest.mark.abort_on_fail 36 | async def test_deploy_multiple_units(ops_test: OpsTest, charm_under_test): 37 | """Deploy the charm-under-test.""" 38 | logger.info("build charm from local source folder") 39 | 40 | logger.info("deploy charms") 41 | await asyncio.gather( 42 | ops_test.model.deploy( 43 | charm_under_test, application_name=app_name, resources=resources, num_units=2 44 | ), 45 | ops_test.model.deploy( 46 | "ch:prometheus-k8s", application_name="prom", channel="edge", trust=True 47 | ), 48 | ) 49 | 50 | await asyncio.gather( 51 | ops_test.model.add_relation(f"{app_name}:alerting", "prom"), 52 | ops_test.model.wait_for_idle(status="active", timeout=2500), 53 | ) 54 | 55 | assert await is_alertmanager_up(ops_test, app_name) 56 | 57 | 58 | @pytest.mark.abort_on_fail 59 | async def test_remove_related_app(ops_test: OpsTest): 60 | await ops_test.model.applications["prom"].remove() 61 | # Block until it is really gone. Added after an itest failed when tried to redeploy: 62 | # juju.errors.JujuError: ['cannot add application "related-app": application already exists'] 63 | await ops_test.model.block_until(lambda: "prom" not in ops_test.model.applications) 64 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=300) 65 | assert await is_alertmanager_up(ops_test, app_name) 66 | 67 | 68 | @pytest.mark.abort_on_fail 69 | async def test_wait_through_a_few_update_status_cycles(ops_test: OpsTest): 70 | await asyncio.sleep(60) # should be longer than the update-status period 71 | 72 | # "Disable" update-status so the charm gets a chance to become idle for long enough for 73 | # wait_for_idle to succeed 74 | await ops_test.model.set_config({"update-status-hook-interval": "60m"}) 75 | 76 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=300) 77 | -------------------------------------------------------------------------------- /tests/integration/test_config_changed_modifies_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | """This test module tests change in alertmanager config. 6 | 7 | 1. Deploy the charm under test with default config and wait for it to become active. 8 | 2. Make a config change and expect reload to be triggered. 9 | 3. Confirm changes applied. 10 | """ 11 | 12 | import logging 13 | from pathlib import Path 14 | 15 | import pytest 16 | import yaml 17 | from helpers import get_unit_address, is_alertmanager_up 18 | from pytest_operator.plugin import OpsTest 19 | 20 | from alertmanager_client import Alertmanager 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) 25 | app_name = METADATA["name"] 26 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 27 | 28 | 29 | @pytest.mark.abort_on_fail 30 | async def test_build_and_deploy(ops_test: OpsTest, charm_under_test): 31 | """Build the charm-under-test and deploy it together with related charms. 32 | 33 | Assert on the unit status before any relations/configurations take place. 34 | """ 35 | # deploy charm from local source folder 36 | await ops_test.model.deploy(charm_under_test, resources=resources, application_name=app_name) 37 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 38 | assert ops_test.model.applications[app_name].units[0].workload_status == "active" 39 | assert await is_alertmanager_up(ops_test, app_name) 40 | 41 | 42 | async def test_update_config(ops_test: OpsTest): 43 | # Obtain a "before" snapshot of the config from the server. 44 | client = Alertmanager(await get_unit_address(ops_test, app_name, 0)) 45 | config_from_server_before = client.config() 46 | # Make sure the defaults is what we expect them to be (this is only a partial check, but an 47 | # easy one). 48 | assert "receivers" in config_from_server_before 49 | 50 | def rename_toplevel_receiver(config: dict, new_name: str): 51 | old_name = config["route"]["receiver"] 52 | config["route"]["receiver"] = new_name 53 | 54 | for receiver in config["receivers"]: 55 | if receiver["name"] == old_name: 56 | receiver["name"] = new_name 57 | 58 | # Modify the default config 59 | config = config_from_server_before.copy() 60 | receiver_name = config["route"]["receiver"] 61 | rename_toplevel_receiver(config, receiver_name * 2) 62 | 63 | await ops_test.model.applications[app_name].set_config({"config_file": yaml.safe_dump(config)}) 64 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=60) 65 | 66 | # Obtain an "after" snapshot of the config from the server. 67 | config_from_server_after = client.config() 68 | # Make sure the current config is what we expect it to be (this is only a partial check, but an 69 | # easy one). 70 | assert config_from_server_after["receivers"] == config["receivers"] 71 | -------------------------------------------------------------------------------- /tests/unit/test_alertmanager_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | import json 6 | import unittest 7 | from datetime import datetime 8 | from unittest.mock import patch 9 | 10 | from alertmanager_client import Alertmanager, AlertmanagerBadResponse 11 | 12 | 13 | class TestAlertmanagerAPIClient(unittest.TestCase): 14 | def setUp(self): 15 | self.api = Alertmanager("address", 12345) 16 | 17 | def test_base_url(self): 18 | self.assertEqual("http://address:12345/", self.api.base_url) 19 | 20 | @patch("alertmanager_client.urllib.request.urlopen") 21 | def test_reload_succeed(self, urlopen_mock): 22 | urlopen_mock.return_value.code = 200 23 | urlopen_mock.return_value.reason = "OK" 24 | 25 | self.api.reload() 26 | urlopen_mock.assert_called() 27 | 28 | @patch("alertmanager_client.urllib.request.urlopen") 29 | def test_status_succeed(self, urlopen_mock): 30 | urlopen_mock.return_value.read = lambda: json.dumps({"status": "fake"}) 31 | urlopen_mock.return_value.code = 200 32 | urlopen_mock.return_value.reason = "OK" 33 | 34 | status = self.api.status() 35 | self.assertIsNotNone(status) 36 | self.assertDictEqual({"status": "fake"}, status) 37 | 38 | def test_reload_and_status_fail(self): 39 | def mock_connection_error(*args, **kwargs): 40 | import urllib.error 41 | 42 | raise urllib.error.HTTPError( 43 | url="mock://url", 44 | code=500, 45 | msg="mock msg", 46 | hdrs={"mock hdr": "mock smth"}, # type: ignore[arg-type] 47 | fp=None, 48 | ) 49 | 50 | with patch("alertmanager_client.urllib.request.urlopen", mock_connection_error): 51 | self.assertRaises(AlertmanagerBadResponse, self.api.reload) 52 | 53 | with patch("alertmanager_client.urllib.request.urlopen", mock_connection_error): 54 | self.assertRaises(AlertmanagerBadResponse, self.api.status) 55 | 56 | @patch("alertmanager_client.urllib.request.urlopen") 57 | def test_version(self, urlopen_mock): 58 | urlopen_mock.return_value.read = lambda: json.dumps({"versionInfo": {"version": "0.1.2"}}) 59 | urlopen_mock.return_value.code = 200 60 | urlopen_mock.return_value.reason = "OK" 61 | 62 | self.assertEqual(self.api.version, "0.1.2") 63 | 64 | @patch("alertmanager_client.urllib.request.urlopen") 65 | def test_alerts_can_be_set(self, urlopen_mock): 66 | msg = "HTTP 200 OK" 67 | urlopen_mock.return_value = msg 68 | alerts = [ 69 | { 70 | "startsAt": datetime.now().isoformat("T"), 71 | "status": "firing", 72 | "annotations": { 73 | "summary": "A fake alert", 74 | }, 75 | "labels": { 76 | "alertname": "fake alert", 77 | }, 78 | } 79 | ] 80 | status = self.api.set_alerts(alerts) 81 | urlopen_mock.assert_called() 82 | self.assertEqual(status, msg) 83 | -------------------------------------------------------------------------------- /tests/integration/test_upgrade_charm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | """This test module tests alertmanager upgrade with and without relations present. 6 | 7 | 1. Deploy the charm under test _from charmhub_. 8 | 2. Refresh with locally built charm. 9 | 3. Add all supported relations. 10 | 4. Refresh with locally built charm. 11 | 5. Add unit and refresh again (test multi unit upgrade with relations). 
12 | """ 13 | 14 | import asyncio 15 | import logging 16 | from pathlib import Path 17 | 18 | import pytest 19 | import yaml 20 | from helpers import is_alertmanager_up 21 | from pytest_operator.plugin import OpsTest 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) 26 | app_name = METADATA["name"] 27 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 28 | 29 | 30 | @pytest.mark.abort_on_fail 31 | async def test_upgrade_edge_with_local_in_isolation(ops_test: OpsTest, charm_under_test): 32 | """Build the charm-under-test, deploy the charm from charmhub, and upgrade from path.""" 33 | logger.info("deploy charm from charmhub") 34 | await ops_test.model.deploy("ch:alertmanager-k8s", application_name=app_name, channel="edge") 35 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 36 | 37 | logger.info("upgrade deployed charm with local charm %s", charm_under_test) 38 | await ops_test.model.applications[app_name].refresh(path=charm_under_test, resources=resources) 39 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 40 | assert await is_alertmanager_up(ops_test, app_name) 41 | 42 | 43 | @pytest.mark.abort_on_fail 44 | async def test_upgrade_local_with_local_with_relations(ops_test: OpsTest, charm_under_test): 45 | # Deploy related apps 46 | await asyncio.gather( 47 | ops_test.model.deploy( 48 | "ch:prometheus-k8s", application_name="prom", channel="edge", trust=True 49 | ), 50 | ops_test.model.deploy("ch:karma-k8s", application_name="karma", channel="edge"), 51 | ) 52 | 53 | # Relate apps 54 | await asyncio.gather( 55 | ops_test.model.add_relation(app_name, "prom:alertmanager"), 56 | ops_test.model.add_relation(app_name, "karma"), 57 | ) 58 | 59 | # Refresh from path 60 | await ops_test.model.applications[app_name].refresh(path=charm_under_test, resources=resources) 61 | await ops_test.model.wait_for_idle( 62 | apps=[app_name, "prom", "karma"], status="active", timeout=2500 63 | ) 64 | assert await is_alertmanager_up(ops_test, app_name) 65 | 66 | 67 | @pytest.mark.abort_on_fail 68 | async def test_upgrade_with_multiple_units(ops_test: OpsTest, charm_under_test): 69 | # Add unit 70 | await ops_test.model.applications[app_name].scale(scale_change=1) 71 | await ops_test.model.wait_for_idle( 72 | apps=[app_name, "prom", "karma"], status="active", timeout=1000 73 | ) 74 | 75 | # Refresh from path 76 | await ops_test.model.applications[app_name].refresh(path=charm_under_test, resources=resources) 77 | await ops_test.model.wait_for_idle( 78 | apps=[app_name, "prom", "karma"], status="active", timeout=2500 79 | ) 80 | assert await is_alertmanager_up(ops_test, app_name) 81 | -------------------------------------------------------------------------------- /tests/integration/test_rescale_charm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | """This test module tests rescaling. 6 | 7 | 1. Deploys multiple units of the charm under test and waits for them to become active 8 | 2. Reset and repeat the above until the leader unit is not the zero unit 9 | 3. Scales up the application by a few units and waits for them to become active 10 | 4. 
Scales down the application to below the leader unit, to trigger a leadership change event 11 | """ 12 | 13 | 14 | import logging 15 | from pathlib import Path 16 | 17 | import pytest 18 | import yaml 19 | from helpers import block_until_leader_elected, get_leader_unit_num, is_alertmanager_up 20 | from pytest_operator.plugin import OpsTest 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) 25 | app_name = METADATA["name"] 26 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 27 | 28 | 29 | # @pytest.mark.abort_on_fail 30 | @pytest.mark.xfail 31 | async def test_deploy_multiple_units(ops_test: OpsTest, charm_under_test): 32 | """Deploy the charm-under-test.""" 33 | logger.info("build charm from local source folder") 34 | 35 | logger.info("deploy charm") 36 | await ops_test.model.deploy( 37 | charm_under_test, application_name=app_name, resources=resources, num_units=10 38 | ) 39 | await block_until_leader_elected(ops_test, app_name) 40 | 41 | if await get_leader_unit_num(ops_test, app_name) == 0: 42 | # We're unlucky this time: unit/0 is the leader, which means no scale down could trigger a 43 | # leadership change event. 44 | # Fail the test instead of model.reset() and repeat, because this hangs on github actions. 45 | logger.info("Elected leader is unit/0 - resetting and repeating") 46 | assert 0, "No luck in electing a leader that is not the zero unit. Try re-running?" 47 | 48 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 49 | 50 | 51 | # @pytest.mark.abort_on_fail 52 | @pytest.mark.xfail 53 | async def test_scale_down_to_single_unit_with_leadership_change(ops_test: OpsTest): 54 | """Scale down below current leader to trigger a leadership change event.""" 55 | await ops_test.model.applications[app_name].scale(scale=1) 56 | await ops_test.model.wait_for_idle( 57 | apps=[app_name], status="active", timeout=1000, wait_for_exact_units=1 58 | ) 59 | assert await is_alertmanager_up(ops_test, app_name) 60 | 61 | 62 | # @pytest.mark.abort_on_fail 63 | @pytest.mark.xfail 64 | async def test_scale_up_from_single_unit(ops_test: OpsTest): 65 | """Add a few more units.""" 66 | await ops_test.model.applications[app_name].scale(scale_change=2) 67 | await ops_test.model.wait_for_idle( 68 | apps=[app_name], status="active", timeout=1000, wait_for_exact_units=3 69 | ) 70 | assert await is_alertmanager_up(ops_test, app_name) 71 | 72 | 73 | # @pytest.mark.abort_on_fail 74 | @pytest.mark.xfail 75 | async def test_scale_down_to_single_unit_without_leadership_change(ops_test): 76 | """Remove a few units.""" 77 | await ops_test.model.applications[app_name].scale(scale_change=-2) 78 | await ops_test.model.wait_for_idle( 79 | apps=[app_name], status="active", timeout=1000, wait_for_exact_units=1 80 | ) 81 | assert await is_alertmanager_up(ops_test, app_name) 82 | -------------------------------------------------------------------------------- /tests/integration/helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
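# Descriptive note (added): the most commonly used helpers below are
# get_unit_address(), is_alertmanager_up() (which polls /api/v2/status on port 9093 for
# every unit) and block_until_leader_elected(), which works around
# model.block_until not accepting coroutine predicates.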
3 | 4 | """Helper functions for writing tests.""" 5 | 6 | import asyncio 7 | import json 8 | import logging 9 | import urllib.request 10 | from typing import Dict 11 | 12 | from pytest_operator.plugin import OpsTest 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | async def get_unit_address(ops_test: OpsTest, app_name: str, unit_num: int) -> str: 18 | """Get private address of a unit.""" 19 | status = await ops_test.model.get_status() # noqa: F821 20 | return status["applications"][app_name]["units"][f"{app_name}/{unit_num}"]["address"] 21 | 22 | 23 | def interleave(l1: list, l2: list) -> list: 24 | """Interleave two lists. 25 | 26 | >>> interleave([1,2,3], ['a', 'b', 'c']) 27 | [1, 'a', 2, 'b', 3, 'c'] 28 | 29 | Reference: https://stackoverflow.com/a/11125298/3516684 30 | """ 31 | return [x for t in zip(l1, l2) for x in t] 32 | 33 | 34 | async def cli_upgrade_from_path_and_wait( 35 | ops_test: OpsTest, 36 | path: str, 37 | alias: str, 38 | resources: Dict[str, str] = None, 39 | wait_for_status: str = None, 40 | ): 41 | if resources is None: 42 | resources = {} 43 | 44 | resource_pairs = [f"{k}={v}" for k, v in resources.items()] 45 | resource_arg_prefixes = ["--resource"] * len(resource_pairs) 46 | resource_args = interleave(resource_arg_prefixes, resource_pairs) 47 | 48 | cmd = [ 49 | "juju", 50 | "refresh", 51 | "--path", 52 | path, 53 | alias, 54 | *resource_args, 55 | ] 56 | 57 | retcode, stdout, stderr = await ops_test.run(*cmd) 58 | assert retcode == 0, f"Upgrade failed: {(stderr or stdout).strip()}" 59 | logger.info(stdout) 60 | await ops_test.model.wait_for_idle(apps=[alias], status=wait_for_status, timeout=120) 61 | 62 | 63 | async def get_leader_unit_num(ops_test: OpsTest, app_name: str): 64 | units = ops_test.model.applications[app_name].units 65 | is_leader = [await units[i].is_leader_from_status() for i in range(len(units))] 66 | logger.info("Leaders: %s", is_leader) 67 | return is_leader.index(True) 68 | 69 | 70 | async def is_leader_elected(ops_test: OpsTest, app_name: str): 71 | units = ops_test.model.applications[app_name].units 72 | return any([await units[i].is_leader_from_status() for i in range(len(units))]) 73 | 74 | 75 | async def block_until_leader_elected(ops_test: OpsTest, app_name: str): 76 | # await ops_test.model.block_until(is_leader_elected) 77 | # block_until does not take async (yet?) https://github.com/juju/python-libjuju/issues/609 78 | while not await is_leader_elected(ops_test, app_name): 79 | await asyncio.sleep(5) 80 | 81 | 82 | async def is_alertmanage_unit_up(ops_test: OpsTest, app_name: str, unit_num: int): 83 | address = await get_unit_address(ops_test, app_name, unit_num) 84 | url = f"http://{address}:9093" 85 | logger.info("am public address: %s", url) 86 | 87 | response = urllib.request.urlopen(f"{url}/api/v2/status", data=None, timeout=2.0) 88 | return response.code == 200 and "versionInfo" in json.loads(response.read()) 89 | 90 | 91 | async def is_alertmanager_up(ops_test: OpsTest, app_name: str): 92 | return all( 93 | [ 94 | await is_alertmanage_unit_up(ops_test, app_name, unit_num) 95 | for unit_num in range(len(ops_test.model.applications[app_name].units)) 96 | ] 97 | ) 98 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 
3 | 4 | [tox] 5 | skipsdist=True 6 | skip_missing_interpreters = True 7 | envlist = lint, static-{charm,lib,unit,integration}, unit 8 | 9 | [vars] 10 | src_path = {toxinidir}/src 11 | tst_path = {toxinidir}/tests 12 | lib_path = {toxinidir}/lib/charms/alertmanager_k8s 13 | all_path = {[vars]src_path} {[vars]tst_path} {[vars]lib_path} 14 | 15 | [testenv] 16 | basepython = python3 17 | setenv = 18 | PYTHONPATH = {toxinidir}:{toxinidir}/lib:{[vars]src_path} 19 | PYTHONBREAKPOINT=ipdb.set_trace 20 | PY_COLORS=1 21 | passenv = 22 | PYTHONPATH 23 | HOME 24 | PATH 25 | CHARM_BUILD_DIR 26 | MODEL_SETTINGS 27 | HTTP_PROXY 28 | HTTPS_PROXY 29 | NO_PROXY 30 | 31 | [testenv:fmt] 32 | description = Apply coding style standards to code 33 | deps = 34 | black 35 | isort 36 | commands = 37 | isort {[vars]all_path} 38 | black {[vars]all_path} 39 | 40 | [testenv:lint] 41 | description = Check code against coding style standards 42 | deps = 43 | black 44 | flake8 45 | flake8-docstrings 46 | flake8-copyright 47 | flake8-builtins 48 | pyproject-flake8 49 | pep8-naming 50 | isort 51 | codespell 52 | commands = 53 | codespell {[vars]lib_path} 54 | codespell . --skip .git --skip .tox --skip build --skip lib --skip venv --skip .mypy_cache 55 | # pflake8 wrapper supports config from pyproject.toml 56 | pflake8 {[vars]all_path} 57 | isort --check-only --diff {[vars]all_path} 58 | black --check --diff {[vars]all_path} 59 | 60 | [testenv:static-{charm,lib,unit,integration}] 61 | description = Run static analysis checks 62 | setenv = 63 | unit: MYPYPATH = {[vars]tst_path}/unit 64 | integration: MYPYPATH = {[vars]tst_path}/integration 65 | deps = 66 | mypy 67 | types-PyYAML 68 | types-setuptools 69 | types-toml 70 | # pip-check-reqs does not yet work with recent pip 71 | pip-check-reqs 72 | charm: pip<=21.1.3 73 | charm: -r{toxinidir}/requirements.txt 74 | lib: git+https://github.com/canonical/operator#egg=ops 75 | unit: {[testenv:unit]deps} 76 | integration: {[testenv:integration]deps} 77 | commands = 78 | charm: pip-missing-reqs {toxinidir}/src {toxinidir}/lib --requirements-file={toxinidir}/requirements.txt 79 | charm: pip-extra-reqs {toxinidir}/src {toxinidir}/lib --requirements-file={toxinidir}/requirements.txt 80 | charm: mypy {[vars]src_path} {posargs} 81 | lib: mypy --python-version 3.5 {[vars]lib_path} {posargs} 82 | unit: mypy {[vars]tst_path}/unit {posargs} 83 | integration: mypy {[vars]tst_path}/integration {posargs} 84 | 85 | [testenv:unit] 86 | description = Run unit tests 87 | deps = 88 | pytest 89 | coverage[toml] 90 | hypothesis 91 | validators 92 | -r{toxinidir}/requirements.txt 93 | commands = 94 | coverage run \ 95 | --source={[vars]src_path},{[vars]lib_path} \ 96 | -m pytest -v --tb native --log-cli-level=INFO -s {posargs} {[vars]tst_path}/unit 97 | coverage report 98 | 99 | [testenv:integration] 100 | description = Run integration tests 101 | deps = 102 | #git+https://github.com/juju/python-libjuju.git 103 | juju 104 | pytest 105 | #git+https://github.com/charmed-kubernetes/pytest-operator.git 106 | pytest-operator 107 | pytest-httpserver 108 | commands = 109 | pytest -v --tb native --log-cli-level=INFO -s {posargs} {toxinidir}/tests/integration 110 | 111 | [testenv:integration-bundle] 112 | description = Run cos-lite bundle integration tests but with alertmanager built from source 113 | bundle_dir = {envtmpdir}/cos-lite-bundle 114 | deps = 115 | # deps from cos-lite bundle - these are needed here because running pytest on the bundle 116 | jinja2 117 | 
#git+https://github.com/juju/python-libjuju.git 118 | juju 119 | pytest 120 | #git+https://github.com/charmed-kubernetes/pytest-operator.git 121 | pytest-operator 122 | allowlist_externals = 123 | git 124 | commands = 125 | git clone --single-branch --depth=1 https://github.com/canonical/cos-light-bundle.git {[testenv:integration-bundle]bundle_dir} 126 | # run pytest on the integration tests of the cos-lite bundle, but override alertmanager with 127 | # path to this source dir 128 | pytest -v --tb native --log-cli-level=INFO -s --alertmanager={toxinidir} {posargs} {[testenv:integration-bundle]bundle_dir}/tests/integration 129 | -------------------------------------------------------------------------------- /tests/integration/test_rerelate_alertmanager_dispatch_metrics_endpoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | """This test module tests alertmanager response to related apps being removed and re-related. 6 | 7 | 1. Deploy the charm under test and a related app (Promethes) relate them using 8 | `alertmanager_dispatch` and `prometheus_scrape` interfaces and wait for them to become idle. 9 | 2. Remove the relation. 10 | 3. Re-add the relation. 11 | 4. Remove the related application. 12 | 5. Redeploy the related application and add the relation back again. 13 | """ 14 | 15 | import asyncio 16 | import logging 17 | from pathlib import Path 18 | 19 | import pytest 20 | import yaml 21 | from helpers import is_alertmanager_up 22 | from pytest_operator.plugin import OpsTest 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) 27 | app_name = METADATA["name"] 28 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 29 | related_app = "related-app" 30 | 31 | 32 | @pytest.mark.abort_on_fail 33 | async def test_build_and_deploy(ops_test: OpsTest, charm_under_test): 34 | """Build the charm-under-test and deploy it together with related charms.""" 35 | await asyncio.gather( 36 | ops_test.model.deploy( 37 | charm_under_test, resources=resources, application_name=app_name, num_units=2 38 | ), 39 | ops_test.model.deploy( 40 | "ch:prometheus-k8s", application_name=related_app, channel="edge", trust=True 41 | ), 42 | ) 43 | 44 | await ops_test.model.add_relation(app_name, f"{related_app}:alertmanager") 45 | await ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=2500) 46 | 47 | assert await is_alertmanager_up(ops_test, app_name) 48 | 49 | await ops_test.model.add_relation(app_name, f"{related_app}:metrics-endpoint") 50 | await ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=1000) 51 | 52 | assert await is_alertmanager_up(ops_test, app_name) 53 | 54 | 55 | @pytest.mark.abort_on_fail 56 | async def test_remove_relation(ops_test: OpsTest): 57 | await ops_test.model.applications[app_name].remove_relation("alerting", related_app) 58 | await ops_test.model.applications[app_name].remove_relation( 59 | "self-metrics-endpoint", related_app 60 | ) 61 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 62 | assert await is_alertmanager_up(ops_test, app_name) 63 | 64 | 65 | @pytest.mark.abort_on_fail 66 | async def test_rerelate(ops_test: OpsTest): 67 | await ops_test.model.add_relation(app_name, f"{related_app}:alertmanager") 68 | await 
ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=1000) 69 | assert await is_alertmanager_up(ops_test, app_name) 70 | 71 | await ops_test.model.add_relation(app_name, f"{related_app}:metrics-endpoint") 72 | await ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=1000) 73 | assert await is_alertmanager_up(ops_test, app_name) 74 | 75 | 76 | @pytest.mark.abort_on_fail 77 | async def test_remove_related_app(ops_test: OpsTest): 78 | await ops_test.model.applications[related_app].remove() 79 | # Block until it is really gone. Added after an itest failed when tried to redeploy: 80 | # juju.errors.JujuError: ['cannot add application "related-app": application already exists'] 81 | await ops_test.model.block_until(lambda: related_app not in ops_test.model.applications) 82 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 83 | assert await is_alertmanager_up(ops_test, app_name) 84 | 85 | 86 | @pytest.mark.abort_on_fail 87 | async def test_rerelate_app(ops_test: OpsTest): 88 | await ops_test.model.deploy( 89 | "ch:prometheus-k8s", application_name=related_app, channel="edge", trust=True 90 | ) 91 | await ops_test.model.add_relation(app_name, f"{related_app}:alertmanager") 92 | await ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=1000) 93 | assert await is_alertmanager_up(ops_test, app_name) 94 | 95 | await ops_test.model.add_relation(app_name, f"{related_app}:metrics-endpoint") 96 | await ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=1000) 97 | assert await is_alertmanager_up(ops_test, app_name) 98 | -------------------------------------------------------------------------------- /tests/unit/charm/test_push_config_to_workload_on_startup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | import logging 6 | import unittest 7 | from unittest.mock import patch 8 | 9 | import hypothesis.strategies as st 10 | import validators 11 | import yaml 12 | from helpers import tautology 13 | from hypothesis import given 14 | from ops.testing import Harness 15 | 16 | from charm import Alertmanager, AlertmanagerCharm 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class TestPushConfigToWorkloadOnStartup(unittest.TestCase): 22 | """Feature: Push config to workload on startup. 23 | 24 | Background: Charm starts up with initial hooks. 
25 | """ 26 | 27 | @patch.object(Alertmanager, "reload", tautology) 28 | @patch("charm.KubernetesServicePatch", lambda *a, **kw: None) 29 | def setUp(self, *_): 30 | self.harness = Harness(AlertmanagerCharm) 31 | self.addCleanup(self.harness.cleanup) 32 | 33 | # self.harness.charm.app.name does not exist before .begin() 34 | # https://github.com/canonical/operator/issues/675 35 | # self.peer_rel_id = self.harness.add_relation("replicas", self.app_name) 36 | self.app_name = "alertmanager-k8s" 37 | self.peer_rel_id = self.harness.add_relation("replicas", self.app_name) 38 | self.harness.begin_with_initial_hooks() 39 | 40 | @given(st.booleans()) 41 | def test_single_unit_cluster(self, is_leader): 42 | """Scenario: Current unit is the only unit present.""" 43 | # WHEN only one unit is 44 | self.assertEqual(self.harness.model.app.planned_units(), 1) 45 | self.harness.set_leader(is_leader) 46 | 47 | # THEN amtool config is rendered 48 | amtool_config = yaml.safe_load( 49 | self.harness.charm.container.pull(self.harness.charm._amtool_config_path) 50 | ) 51 | self.assertTrue(validators.url(amtool_config["alertmanager.url"])) 52 | 53 | # AND alertmanager config is rendered 54 | am_config = yaml.safe_load( 55 | self.harness.charm.container.pull(self.harness.charm._config_path) 56 | ) 57 | self.assertGreaterEqual(am_config.keys(), {"global", "route", "receivers"}) 58 | 59 | # AND path to config file is part of pebble layer command 60 | command = ( 61 | self.harness.get_container_pebble_plan(self.harness.charm._container_name) 62 | .services[self.harness.charm._service_name] 63 | .command 64 | ) 65 | self.assertIn(f"--config.file={self.harness.charm._config_path}", command) 66 | 67 | # AND peer clusters cli arg is not present in pebble layer command 68 | self.assertNotIn("--cluster.peer=", command) 69 | 70 | @given(st.booleans(), st.integers(2, 10)) 71 | def test_multi_unit_cluster(self, is_leader, num_units): 72 | """Scenario: Current unit is a part of a multi-unit cluster.""" 73 | # without the try-finally, if any assertion fails, then hypothesis would reenter without 74 | # the cleanup, carrying forward the units that were previously added 75 | try: 76 | self.assertEqual(self.harness.model.app.planned_units(), 1) 77 | 78 | # WHEN multiple units are present 79 | for i in range(1, num_units): 80 | self.harness.add_relation_unit(self.peer_rel_id, f"{self.app_name}/{i}") 81 | self.harness.update_relation_data( 82 | self.peer_rel_id, 83 | f"{self.app_name}/{i}", 84 | {"private_address": f"{2*i}.{2*i}.{2*i}.{2*i}"}, 85 | ) 86 | 87 | self.assertEqual(self.harness.model.app.planned_units(), num_units) 88 | self.harness.set_leader(is_leader) 89 | 90 | # THEN peer clusters cli arg is present in pebble layer command 91 | command = ( 92 | self.harness.get_container_pebble_plan(self.harness.charm._container_name) 93 | .services[self.harness.charm._service_name] 94 | .command 95 | ) 96 | self.assertIn("--cluster.peer=", command) 97 | 98 | finally: 99 | # cleanup added units to prep for reentry by hypothesis' strategy 100 | for i in reversed(range(1, num_units)): 101 | self.harness.remove_relation_unit(self.peer_rel_id, f"{self.app_name}/{i}") 102 | -------------------------------------------------------------------------------- /tests/integration/test_templates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2022 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | import json 6 | import logging 7 | from datetime import datetime, timedelta, timezone 8 | from pathlib import Path 9 | 10 | import pytest 11 | import yaml 12 | from helpers import get_unit_address, is_alertmanager_up 13 | from pytest_operator.plugin import OpsTest 14 | from werkzeug.wrappers import Request, Response 15 | 16 | from alertmanager_client import Alertmanager 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) 21 | app_name = METADATA["name"] 22 | resources = {"alertmanager-image": METADATA["resources"]["alertmanager-image"]["upstream-source"]} 23 | 24 | 25 | def request_handler(request: Request): 26 | response = Response("OK", status=200, content_type="text/plain") 27 | logger.info("Got Request Data : %s", json.loads(request.data.decode("utf-8"))) 28 | return response 29 | 30 | 31 | @pytest.mark.abort_on_fail 32 | async def test_receiver_gets_alert(ops_test: OpsTest, charm_under_test, httpserver): 33 | 34 | # deploy charm from local source folder 35 | await ops_test.model.deploy(charm_under_test, resources=resources, application_name=app_name) 36 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) 37 | assert ops_test.model.applications[app_name].units[0].workload_status == "active" 38 | assert await is_alertmanager_up(ops_test, app_name) 39 | 40 | # define the alertmanager configuration 41 | receiver_name = "fake-receiver" 42 | aconfig = { 43 | "global": {"http_config": {"tls_config": {"insecure_skip_verify": True}}}, 44 | "route": { 45 | "group_by": ["alertname"], 46 | "group_wait": "3s", 47 | "group_interval": "5m", 48 | "repeat_interval": "1h", 49 | "receiver": receiver_name, 50 | }, 51 | "receivers": [ 52 | { 53 | "name": receiver_name, 54 | "slack_configs": [ 55 | { 56 | "api_url": httpserver.url_for("/"), 57 | "channel": "test", 58 | "text": r"https://localhost/alerts/{{ .GroupLabels.alertname }}", 59 | } 60 | ], 61 | } 62 | ], 63 | } 64 | 65 | # use a template to define the slack callback id 66 | atemplate = r'{{ define "slack.default.callbackid" }}2{{ end }}' 67 | # set alertmanager configuration and template file 68 | await ops_test.model.applications[app_name].set_config( 69 | {"config_file": yaml.safe_dump(aconfig), "templates_file": atemplate} 70 | ) 71 | await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=60) 72 | 73 | # create an alert 74 | start_time = datetime.now(timezone.utc) 75 | end_time = start_time + timedelta(minutes=5) 76 | alert_name = "fake-alert" 77 | model_uuid = "1234" 78 | alerts = [ 79 | { 80 | "startsAt": start_time.isoformat("T"), 81 | "endsAt": end_time.isoformat("T"), 82 | "status": "firing", 83 | "annotations": { 84 | "summary": "A fake alert", 85 | }, 86 | "labels": { 87 | "juju_model_uuid": model_uuid, 88 | "juju_application": app_name, 89 | "juju_model": ops_test.model_name, 90 | "alertname": alert_name, 91 | }, 92 | "generatorURL": f"http://localhost/{alert_name}", 93 | } 94 | ] 95 | 96 | # define the expected slack notification for the alert 97 | expected_notification = { 98 | "channel": "test", 99 | "username": "Alertmanager", 100 | "attachments": [ 101 | { 102 | "title": f"[FIRING:1] {alert_name} {app_name} {ops_test.model_name} {model_uuid} ", 103 | "title_link": f"http://{app_name}-0:9093/#/alerts?receiver={receiver_name}", 104 | "text": f"https://localhost/alerts/{alert_name}", 105 | "fallback": f"[FIRING:1] {alert_name} {app_name} {ops_test.model_name} {model_uuid} | " 106 | 
f"http://{app_name}-0:9093/#/alerts?receiver={receiver_name}", 107 | "callback_id": "2", 108 | "footer": "", 109 | "color": "danger", 110 | "mrkdwn_in": ["fallback", "pretext", "text"], 111 | } 112 | ], 113 | } 114 | 115 | # set the alert 116 | with httpserver.wait(timeout=120) as waiting: 117 | # expect an alert to be forwarded to the receiver 118 | httpserver.expect_oneshot_request( 119 | "/", method="POST", json=expected_notification 120 | ).respond_with_handler(request_handler) 121 | client_address = await get_unit_address(ops_test, app_name, 0) 122 | amanager = Alertmanager(address=client_address) 123 | amanager.set_alerts(alerts) 124 | 125 | # check receiver got an alert 126 | assert waiting.result 127 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to alertmanager-k8s 2 | 3 | ## Overview 4 | 5 | This documents explains the processes and practices recommended for 6 | contributing enhancements or bug fixing to the Alertmanager Charmed Operator. 7 | 8 | The intended use case of this operator is to be deployed as part of the 9 | [COS Lite] bundle, although that is not necessary. 10 | 11 | 12 | ## Setup 13 | 14 | A typical setup using [snaps](https://snapcraft.io/) can be found in the 15 | [Juju docs](https://juju.is/docs/sdk/dev-setup). 16 | 17 | 18 | ## Developing 19 | 20 | - Prior to getting started on a pull request, we first encourage you to open an 21 | issue explaining the use case or bug. 22 | This gives other contributors a chance to weigh in early in the process. 23 | - To author PRs you should be familiar with [juju](https://juju.is/#what-is-juju) 24 | and [how operators are written](https://juju.is/docs/sdk). 25 | - The best way to get a head start is to join the conversation on our 26 | [Mattermost channel] or [Discourse]. 27 | - All enhancements require review before being merged. Besides the 28 | code quality and test coverage, the review will also take into 29 | account the resulting user experience for Juju administrators using 30 | this charm. To be able to merge you would have to rebase 31 | onto the `main` branch. We do this to avoid merge commits and to have a 32 | linear Git history. 33 | - We use [`tox`](https://tox.wiki/en/latest/#) to manage all virtualenvs for 34 | the development lifecycle. 35 | 36 | 37 | ### Testing 38 | Unit tests are written with the Operator Framework [test harness] and 39 | integration tests are written using [pytest-operator] and [python-libjuju]. 40 | 41 | The default test environments - lint, static and unit - will run if you start 42 | `tox` without arguments. 43 | 44 | You can also manually run a specific test environment: 45 | 46 | ```shell 47 | tox -e fmt # update your code according to linting rules 48 | tox -e lint # code style 49 | tox -e static # static analysis 50 | tox -e unit # unit tests 51 | tox -e integration # integration tests 52 | tox -e integration-lma # integration tests for the lma-light bundle 53 | ``` 54 | 55 | `tox` creates a virtual environment for every tox environment defined in 56 | [tox.ini](tox.ini). 
To activate a tox environment for manual testing, 57 | 58 | ```shell 59 | source .tox/unit/bin/activate 60 | ``` 61 | 62 | 63 | #### Manual testing 64 | Alerts can be created using 65 | [`amtool`](https://manpages.debian.org/testing/prometheus-alertmanager/amtool.1.en.html), 66 | 67 | ```shell 68 | amtool alert add alertname=oops service="my-service" severity=warning \ 69 | instance="oops.example.net" --annotation=summary="High latency is high!" \ 70 | --generator-url="http://prometheus.int.example.net" 71 | ``` 72 | 73 | or using [Alertmanager's HTTP API][Alertmanager API browser], 74 | [for example](https://gist.github.com/cherti/61ec48deaaab7d288c9fcf17e700853a): 75 | 76 | ```shell 77 | alertmanager_ip=$(juju status alertmanager/0 --format=json | \ 78 | jq -r ".applications.alertmanager.units.\"alertmanager/0\".address") 79 | 80 | curl -XPOST http://$alertmanager_ip:9093/api/v1/alerts -d "[{ 81 | \"status\": \"firing\", 82 | \"labels\": { 83 | \"alertname\": \"$name\", 84 | \"service\": \"my-service\", 85 | \"severity\":\"warning\", 86 | \"instance\": \"$name.example.net\" 87 | }, 88 | \"annotations\": { 89 | \"summary\": \"High latency is high!\" 90 | }, 91 | \"generatorURL\": \"http://prometheus.int.example.net\" 92 | }]" 93 | ``` 94 | 95 | The alert should then be listed, 96 | 97 | ```shell 98 | curl http://$alertmanager_ip:9093/api/v1/alerts 99 | ``` 100 | 101 | and visible on a karma dashboard, if configured. 102 | 103 | Relations between alertmanager and prometheus can be verified by 104 | [querying prometheus](https://prometheus.io/docs/prometheus/latest/querying/api/#alertmanagers) 105 | for active alertmanagers: 106 | 107 | ```shell 108 | curl -X GET "http://$prom_ip:9090/api/v1/alertmanagers" 109 | ``` 110 | 111 | ## Build charm 112 | 113 | Build the charm in this git repository using 114 | 115 | ```shell 116 | charmcraft pack 117 | ``` 118 | 119 | which will create a `*.charm` file you can deploy with: 120 | 121 | ```shell 122 | juju deploy ./alertmanager-k8s.charm \ 123 | --resource alertmanager-image=ubuntu/prometheus-alertmanager \ 124 | --config config_file='@path/to/alertmanager.yml' \ 125 | --config templates_file='@path/to/templates.tmpl' 126 | ``` 127 | 128 | 129 | ## Code overview 130 | - The main charm class is `AlertmanagerCharm`, which responds to config changes 131 | (via `ConfigChangedEvent`) and cluster changes (via `RelationJoinedEvent`, 132 | `RelationChangedEvent` and `RelationDepartedEvent`). 133 | - All lifecycle events call a common hook, `_common_exit_hook`, after executing 134 | their own business logic. This pattern simplifies state tracking and improves 135 | consistency. 136 | - On startup, the charm waits for `PebbleReadyEvent` and for an IP address to 137 | become available before starting the alertmanager service and declaring 138 | `ActiveStatus`. If the user-provided configuration contains a `templates` section, 139 | the charm goes into blocked state (templates must be provided via the `templates_file` option instead). 140 | 141 | ## Design choices 142 | - The `alertmanager.yml` config file is created in its entirety by the charm 143 | code on startup (the default `alertmanager.yml` is overwritten). This is done 144 | to maintain consistency across OCI images. 145 | - Hot reload via the alertmanager HTTP API is used whenever possible instead of 146 | service restart, to minimize downtime (see the example below).
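For reference, the same hot-reload endpoint used by the charm can also be hit manually (a sketch, assuming the default API port 9093 and the `$alertmanager_ip` variable from the manual-testing section above):

```shell
curl -X POST http://$alertmanager_ip:9093/-/reload
```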
147 | 148 | 149 | [Alertmanager API browser]: https://petstore.swagger.io/?url=https://raw.githubusercontent.com/prometheus/alertmanager/master/api/v2/openapi.yaml 150 | [gh:Prometheus operator]: https://github.com/canonical/prometheus-operator 151 | [Prometheus operator]: https://charmhub.io/prometheus-k8s 152 | [COS Lite]: https://charmhub.io/cos-lite 153 | [Mattermost channel]: https://chat.charmhub.io/charmhub/channels/observability 154 | [Discourse]: https://discourse.charmhub.io/tag/alertmanager 155 | [test harness]: https://ops.readthedocs.io/en/latest/#module-ops.testing 156 | [pytest-operator]: https://github.com/charmed-kubernetes/pytest-operator/blob/main/docs/reference.md 157 | [python-libjuju]: https://pythonlibjuju.readthedocs.io/en/latest/ 158 | -------------------------------------------------------------------------------- /tests/unit/test_charm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | import unittest 6 | from unittest.mock import patch 7 | 8 | import ops 9 | import yaml 10 | from helpers import tautology 11 | from ops.model import ActiveStatus, BlockedStatus 12 | from ops.testing import Harness 13 | 14 | from charm import Alertmanager, AlertmanagerCharm 15 | 16 | 17 | class TestWithInitialHooks(unittest.TestCase): 18 | container_name: str = "alertmanager" 19 | 20 | @patch.object(Alertmanager, "reload", tautology) 21 | @patch("charm.KubernetesServicePatch", lambda x, y: None) 22 | @patch("socket.getfqdn", new=lambda *args: "fqdn") 23 | def setUp(self, *unused): 24 | self.harness = Harness(AlertmanagerCharm) 25 | self.addCleanup(self.harness.cleanup) 26 | 27 | self.relation_id = self.harness.add_relation("alerting", "otherapp") 28 | self.harness.add_relation_unit(self.relation_id, "otherapp/0") 29 | self.harness.set_leader(True) 30 | 31 | self.harness.begin_with_initial_hooks() 32 | 33 | def test_num_peers(self): 34 | self.assertEqual(0, len(self.harness.charm.peer_relation.units)) # type: ignore 35 | 36 | def test_pebble_layer_added(self, *unused): 37 | self.harness.container_pebble_ready(self.container_name) 38 | plan = self.harness.get_container_pebble_plan(self.container_name) 39 | 40 | # Check we've got the plan as expected 41 | self.assertIsNotNone(plan.services) 42 | self.assertIsNotNone(service := plan.services.get(self.harness.charm._service_name)) 43 | self.assertIsNotNone(command := service.command) 44 | 45 | # Check command is as expected 46 | self.assertEqual(plan.services, self.harness.charm._alertmanager_layer().services) 47 | 48 | # Check command contains key arguments 49 | self.assertIn("--config.file", command) 50 | self.assertIn("--storage.path", command) 51 | self.assertIn("--web.listen-address", command) 52 | self.assertIn("--cluster.listen-address", command) 53 | 54 | # Check the service was started 55 | service = self.harness.model.unit.get_container("alertmanager").get_service("alertmanager") 56 | self.assertTrue(service.is_running()) 57 | 58 | def test_relation_data_provides_public_address(self): 59 | # to suppress mypy error: Item "None" of "Optional[Any]" has no attribute "get_relation" 60 | model = self.harness.charm.framework.model 61 | assert model is not None 62 | 63 | rel = model.get_relation("alerting", self.relation_id) 64 | expected_address = "fqdn:{}".format(self.harness.charm.alertmanager_provider.api_port) 65 | self.assertEqual({"public_address": expected_address}, 
rel.data[self.harness.charm.unit]) # type: ignore 66 | 67 | def test_topology_added_if_user_provided_config_without_group_by(self, *unused): 68 | self.harness.container_pebble_ready(self.container_name) 69 | 70 | new_config = yaml.dump({"not a real config": "but good enough for testing"}) 71 | self.harness.update_config({"config_file": new_config}) 72 | updated_config = yaml.safe_load( 73 | self.harness.charm.container.pull(self.harness.charm._config_path) 74 | ) 75 | 76 | self.assertEqual(updated_config["not a real config"], "but good enough for testing") 77 | self.assertListEqual( 78 | sorted(updated_config["route"]["group_by"]), 79 | sorted(["juju_model", "juju_application", "juju_model_uuid"]), 80 | ) 81 | 82 | def test_topology_added_if_user_provided_config_with_group_by(self, *unused): 83 | self.harness.container_pebble_ready(self.container_name) 84 | 85 | new_config = yaml.dump({"route": {"group_by": ["alertname", "juju_model"]}}) 86 | self.harness.update_config({"config_file": new_config}) 87 | updated_config = yaml.safe_load( 88 | self.harness.charm.container.pull(self.harness.charm._config_path) 89 | ) 90 | 91 | self.assertListEqual( 92 | sorted(updated_config["route"]["group_by"]), 93 | sorted(["alertname", "juju_model", "juju_application", "juju_model_uuid"]), 94 | ) 95 | 96 | def test_charm_blocks_if_user_provided_config_with_templates(self, *unused): 97 | self.harness.container_pebble_ready(self.container_name) 98 | 99 | new_config = yaml.dump({"templates": ["/what/ever/*.tmpl"]}) 100 | self.harness.update_config({"config_file": new_config}) 101 | self.assertIsInstance(self.harness.charm.unit.status, BlockedStatus) 102 | 103 | new_config = yaml.dump({}) 104 | self.harness.update_config({"config_file": new_config}) 105 | self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus) 106 | 107 | def test_templates_section_added_if_user_provided_templates(self, *unused): 108 | self.harness.container_pebble_ready(self.container_name) 109 | 110 | templates = '{{ define "some.tmpl.variable" }}whatever it is{{ end}}' 111 | self.harness.update_config({"templates_file": templates}) 112 | updated_templates = self.harness.charm.container.pull(self.harness.charm._templates_path) 113 | self.assertEqual(templates, updated_templates.read()) 114 | 115 | updated_config = yaml.safe_load( 116 | self.harness.charm.container.pull(self.harness.charm._config_path) 117 | ) 118 | self.assertEqual(updated_config["templates"], [f"{self.harness.charm._templates_path}"]) 119 | 120 | 121 | class TestWithoutInitialHooks(unittest.TestCase): 122 | container_name: str = "alertmanager" 123 | 124 | @patch.object(Alertmanager, "reload", tautology) 125 | @patch("charm.KubernetesServicePatch", lambda x, y: None) 126 | def setUp(self, *unused): 127 | self.harness = Harness(AlertmanagerCharm) 128 | self.addCleanup(self.harness.cleanup) 129 | 130 | self.relation_id = self.harness.add_relation("alerting", "otherapp") 131 | self.harness.add_relation_unit(self.relation_id, "otherapp/0") 132 | self.harness.set_leader(True) 133 | 134 | self.harness.begin() 135 | self.harness.add_relation("replicas", "alertmanager") 136 | 137 | def test_unit_status_around_pebble_ready(self, *unused): 138 | # before pebble_ready, status should be "maintenance" 139 | self.assertIsInstance(self.harness.charm.unit.status, ops.model.MaintenanceStatus) 140 | 141 | # after pebble_ready, status should be "active" 142 | self.harness.container_pebble_ready(self.container_name) 143 | self.assertIsInstance(self.harness.charm.unit.status, 
ops.model.ActiveStatus) 144 | 145 | self.assertEqual(self.harness.model.unit.name, "alertmanager-k8s/0") 146 | -------------------------------------------------------------------------------- /tests/unit/test_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | import textwrap 6 | import unittest 7 | 8 | from charms.alertmanager_k8s.v0.alertmanager_dispatch import AlertmanagerConsumer 9 | from ops.charm import CharmBase 10 | from ops.framework import StoredState 11 | from ops.testing import Harness 12 | 13 | 14 | class DummyConsumerCharm(CharmBase): 15 | """Mimic bare functionality of AlertmanagerCharm needed to test the consumer.""" 16 | 17 | # define custom metadata - without this the harness would parse the metadata.yaml in this repo, 18 | # which would result in expressions like self.harness.model.app.name to return 19 | # "alertmanager-k8s", which is not what we want in a consumer test 20 | metadata_yaml = textwrap.dedent( 21 | """ 22 | name: DummyConsumerCharm 23 | containers: 24 | consumer-charm: 25 | resource: consumer-charm-image 26 | resources: 27 | consumer-charm-image: 28 | type: oci-image 29 | requires: 30 | alerting: 31 | interface: alertmanager_dispatch 32 | peers: 33 | replicas: 34 | interface: consumer_charm_replica 35 | """ 36 | ) 37 | _stored = StoredState() 38 | 39 | def __init__(self, *args, **kwargs): 40 | super().__init__(*args) 41 | # relation name must match metadata 42 | self.alertmanager_lib = AlertmanagerConsumer(self, relation_name="alerting") 43 | 44 | self.framework.observe( 45 | self.alertmanager_lib.on.cluster_changed, self._on_alertmanager_cluster_changed 46 | ) 47 | 48 | self._stored.set_default(alertmanagers=[], cluster_changed_emitted=0) 49 | 50 | def _on_alertmanager_cluster_changed(self, _): 51 | self._stored.cluster_changed_emitted += 1 52 | self._stored.alertmanagers = self.alertmanager_lib.get_cluster_info() 53 | 54 | 55 | class TestConsumer(unittest.TestCase): 56 | def setUp(self): 57 | self.harness = Harness(DummyConsumerCharm, meta=DummyConsumerCharm.metadata_yaml) 58 | self.addCleanup(self.harness.cleanup) 59 | self.harness.set_leader(True) 60 | self.harness.begin_with_initial_hooks() 61 | 62 | def _relate_to_alertmanager(self) -> int: 63 | """Create relation between 'this app' and a hypothetical (remote) alertmanager.""" 64 | rel_id = self.harness.add_relation(relation_name="alerting", remote_app="am") 65 | return rel_id 66 | 67 | def _add_alertmanager_units(self, rel_id: int, num_units: int, start_with=0): 68 | for i in range(start_with, start_with + num_units): 69 | remote_unit_name = f"am/{i}" 70 | self.harness.add_relation_unit(rel_id, remote_unit_name) 71 | self.harness.update_relation_data( 72 | rel_id, remote_unit_name, {"public_address": f"10.20.30.{i}"} 73 | ) 74 | 75 | return rel_id 76 | 77 | def test_cluster_updated_after_alertmanager_units_join(self): 78 | # before 79 | self.assertEqual([], self.harness.charm.alertmanager_lib.get_cluster_info()) 80 | num_events = self.harness.charm._stored.cluster_changed_emitted 81 | 82 | # add relation 83 | rel_id = self._relate_to_alertmanager() 84 | self._add_alertmanager_units(rel_id, num_units=2) 85 | 86 | # after 87 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 88 | self.assertListEqual( 89 | ["10.20.30.0", "10.20.30.1"], self.harness.charm.alertmanager_lib.get_cluster_info() 90 | ) 91 | 92 | 
num_events = self.harness.charm._stored.cluster_changed_emitted 93 | 94 | # add another unit 95 | self._add_alertmanager_units(rel_id, num_units=1, start_with=2) 96 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 97 | self.assertListEqual( 98 | ["10.20.30.0", "10.20.30.1", "10.20.30.2"], 99 | self.harness.charm.alertmanager_lib.get_cluster_info(), 100 | ) 101 | 102 | def test_cluster_updated_after_alertmanager_unit_leaves(self): 103 | num_events = self.harness.charm._stored.cluster_changed_emitted 104 | 105 | # add relation 106 | rel_id = self._relate_to_alertmanager() 107 | self._add_alertmanager_units(rel_id, num_units=4) 108 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 109 | before = self.harness.charm.alertmanager_lib.get_cluster_info() 110 | self.assertEqual(len(before), 4) 111 | 112 | num_events = self.harness.charm._stored.cluster_changed_emitted 113 | 114 | # remove alertmanager units 115 | self.harness.remove_relation_unit(rel_id, "am/3") 116 | self.harness.remove_relation_unit(rel_id, "am/2") 117 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 118 | after = self.harness.charm.alertmanager_lib.get_cluster_info() 119 | self.assertListEqual(after, ["10.20.30.0", "10.20.30.1"]) 120 | 121 | num_events = self.harness.charm._stored.cluster_changed_emitted 122 | 123 | # remove all remaining units 124 | self.harness.remove_relation_unit(rel_id, "am/1") 125 | self.harness.remove_relation_unit(rel_id, "am/0") 126 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 127 | after = self.harness.charm.alertmanager_lib.get_cluster_info() 128 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 129 | self.assertListEqual(after, []) 130 | 131 | def test_cluster_is_empty_after_relation_breaks(self): 132 | # add relation 133 | rel_id = self._relate_to_alertmanager() 134 | self._add_alertmanager_units(rel_id, num_units=4) 135 | before = self.harness.charm.alertmanager_lib.get_cluster_info() 136 | self.assertEqual(len(before), 4) 137 | 138 | num_events = self.harness.charm._stored.cluster_changed_emitted 139 | 140 | # remove relation 141 | self.harness.remove_relation(rel_id) 142 | after = self.harness.charm.alertmanager_lib.get_cluster_info() 143 | self.assertGreater(self.harness.charm._stored.cluster_changed_emitted, num_events) 144 | self.assertListEqual([], after) 145 | 146 | def test_relation_changed(self): 147 | # add relation 148 | rel_id = self._relate_to_alertmanager() 149 | self._add_alertmanager_units(rel_id, num_units=2) 150 | 151 | # update remote unit's relation data (emulates upgrade-charm) 152 | self.harness.update_relation_data(rel_id, "am/1", {"public_address": "90.80.70.60"}) 153 | self.assertListEqual( 154 | ["10.20.30.0", "90.80.70.60"], self.harness.charm.alertmanager_lib.get_cluster_info() 155 | ) 156 | -------------------------------------------------------------------------------- /src/alertmanager_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 
4 | 5 | """Client library for Alertmanager API.""" 6 | 7 | import json 8 | import logging 9 | import time 10 | import urllib.error 11 | import urllib.parse 12 | import urllib.request 13 | from typing import Optional 14 | 15 | import yaml 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class AlertmanagerBadResponse(RuntimeError): 21 | """A catch-all exception type to indicate 'no reply', regardless of the reason.""" 22 | 23 | 24 | class Alertmanager: 25 | """Alertmanager HTTP API client.""" 26 | 27 | def __init__(self, address: str = "localhost", port: int = 9093, timeout=2.0): 28 | self.base_url = f"http://{address}:{port}/" 29 | self.timeout = timeout 30 | 31 | def reload(self) -> bool: 32 | """Send a POST request to hot-reload the config. 33 | 34 | This reduces downtime compared to restarting the service. 35 | 36 | Returns: 37 | True if reload succeeded (returned 200 OK); False otherwise. 38 | """ 39 | url = urllib.parse.urljoin(self.base_url, "/-/reload") 40 | # for an empty POST request, the `data` arg must be b"" to tell urlopen it's a POST 41 | if resp := self._open(url, data=b"", timeout=self.timeout): 42 | logger.warning("reload: POST returned a non-empty response: %s", resp) 43 | return False 44 | return True 45 | 46 | @staticmethod 47 | def _open(url: str, data: Optional[bytes], timeout: float) -> bytes: 48 | """Send a request using urlopen. 49 | 50 | Args: 51 | url: target url for the request 52 | data: bytes to send to target 53 | timeout: duration in seconds after which to return, regardless of the result 54 | 55 | Raises: 56 | AlertmanagerBadResponse: If no response or invalid response, regardless of the reason. 57 | """ 58 | for retry in reversed(range(3)): 59 | try: 60 | response = urllib.request.urlopen(url, data, timeout) 61 | if response.code == 200 and response.reason == "OK": 62 | return response.read() 63 | elif retry == 0: 64 | raise AlertmanagerBadResponse( 65 | f"Bad response (code={response.code}, reason={response.reason})" 66 | ) 67 | 68 | except (ValueError, urllib.error.HTTPError, urllib.error.URLError) as e: 69 | if retry == 0: 70 | raise AlertmanagerBadResponse("Bad response") from e 71 | 72 | time.sleep(0.2) 73 | 74 | assert False, "unreachable" # help mypy (https://github.com/python/mypy/issues/8964) 75 | 76 | def status(self) -> dict: 77 | """Obtain status information from the alertmanager server.
78 | 79 | Typical output: 80 | { 81 | "cluster": { 82 | "peers": [], 83 | "status": "disabled" 84 | }, 85 | "config": { 86 | "original": "global: [...]" 87 | }, 88 | "uptime": "2021-08-31T14:15:31.613Z", 89 | "versionInfo": { 90 | "branch": "HEAD", 91 | "buildDate": "20210324-17:46:50", 92 | "buildUser": "root@lgw01-amd64-031", 93 | "goVersion": "go1.14.15", 94 | "revision": "4c6c03ebfe21009c546e4d1e9b92c371d67c021d", 95 | "version": "0.21.0" 96 | } 97 | } 98 | """ 99 | url = urllib.parse.urljoin(self.base_url, "/api/v2/status") 100 | try: 101 | # the `data` arg must be None to tell urlopen it's a GET 102 | return json.loads(self._open(url, data=None, timeout=self.timeout)) 103 | except (TypeError, json.decoder.JSONDecodeError) as e: 104 | raise AlertmanagerBadResponse("Response is not a JSON string") from e 105 | 106 | @property 107 | def version(self) -> str: 108 | """Obtain version number from the alertmanager server.""" 109 | try: 110 | return self.status()["versionInfo"]["version"] 111 | except KeyError as e: 112 | raise AlertmanagerBadResponse("Unexpected response") from e 113 | 114 | def config(self) -> dict: 115 | """Obtain config from the alertmanager server. 116 | 117 | Typical output (here displayed in yaml format): 118 | global: 119 | resolve_timeout: 5m 120 | http_config: 121 | tls_config: 122 | insecure_skip_verify: true 123 | smtp_hello: localhost 124 | smtp_require_tls: true 125 | pagerduty_url: https://events.pagerduty.com/v2/enqueue 126 | opsgenie_api_url: https://api.opsgenie.com/ 127 | wechat_api_url: https://qyapi.weixin.qq.com/cgi-bin/ 128 | victorops_api_url: https://alert.victorops.com/integrations/generic/20131114/alert/ 129 | route: 130 | receiver: dummy 131 | group_by: 132 | - juju_application 133 | - juju_model 134 | - juju_model_uuid 135 | group_wait: 30s 136 | group_interval: 5m 137 | repeat_interval: 1h 138 | receivers: 139 | - name: dummy 140 | webhook_configs: 141 | - send_resolved: true 142 | http_config: 143 | tls_config: 144 | insecure_skip_verify: true 145 | url: http://127.0.0.1:5001/ 146 | max_alerts: 0 147 | templates: [] 148 | """ 149 | try: 150 | config = self.status()["config"]["original"] 151 | except KeyError as e: 152 | raise AlertmanagerBadResponse("Unexpected response") from e 153 | 154 | try: 155 | return yaml.safe_load(config) 156 | except yaml.YAMLError as e: 157 | raise AlertmanagerBadResponse("Response is not a YAML string") from e 158 | 159 | def _post( 160 | self, url: str, post_data: bytes, headers: dict = None, timeout: int = None 161 | ) -> bytes: 162 | """Make a HTTP POST request to Alertmanager. 163 | 164 | Args: 165 | url: string URL where POST request is sent. 166 | post_data: encoded string (bytes) of data to be posted. 167 | headers: dictionary containing HTTP headers to be used for POST request. 168 | timeout: numeric timeout value in seconds. 169 | 170 | Returns: 171 | urllib response object. 
172 | """ 173 | response = "".encode("utf-8") 174 | timeout = timeout or self.timeout 175 | request = urllib.request.Request(url, headers=headers or {}, data=post_data, method="POST") 176 | 177 | try: 178 | response = urllib.request.urlopen(request, timeout=timeout) 179 | except urllib.error.HTTPError as error: 180 | logger.debug( 181 | "Failed posting to %s, reason: %s", 182 | url, 183 | error.reason, 184 | ) 185 | except urllib.error.URLError as error: 186 | logger.debug("Invalid URL %s : %s", url, error) 187 | except TimeoutError: 188 | logger.debug("Request timeout during posting to URL %s", url) 189 | return response 190 | 191 | def set_alerts(self, alerts: list) -> bytes: 192 | """Send a set of new alerts to alertmanger. 193 | 194 | Args: 195 | alerts: a list of alerts to be set. Format of this list is 196 | described here https://prometheus.io/docs/alerting/latest/clients/. 197 | 198 | Returns: 199 | urllib response object. 200 | """ 201 | url = urllib.parse.urljoin(self.base_url, "/api/v1/alerts") 202 | headers = {"Content-Type": "application/json"} 203 | post_data = json.dumps(alerts).encode("utf-8") 204 | response = self._post(url, post_data, headers=headers) 205 | 206 | return response 207 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Alertmanager Operator (k8s) 2 | 3 | [![Test Suite](https://github.com/canonical/alertmanager-k8s-operator/actions/workflows/release-edge.yaml/badge.svg)](https://github.com/canonical/alertmanager-k8s-operator/actions/workflows/release-edge.yaml) 4 | ![Discourse status](https://img.shields.io/discourse/status?server=https%3A%2F%2Fdiscourse.charmhub.io&style=flat) 5 | 6 | This Charmed Operator handles instantiation, scaling, configuration, and Day 2 7 | operations specific to [Alertmanager]. 8 | 9 | This operator drives the Alertmanager application, and it can be composed with 10 | other operators to deliver a complex application or service, 11 | such as [COS Lite][COS Lite bundle]. 12 | 13 | Alertmanager receives alerts from supporting applications, such as 14 | [Prometheus][Prometheus operator] or [Loki][Loki operator], then deduplicates, 15 | groups and routes them to the configured receiver(s). 
16 | 17 | 18 | [Alertmanager]: https://prometheus.io/docs/alerting/latest/alertmanager/ 19 | [COS Lite bundle]: https://charmhub.io/cos-lite 20 | [Loki operator]: https://charmhub.io/loki-k8s 21 | [Prometheus operator]: https://charmhub.io/prometheus-k8s 22 | 23 | 24 | ## Getting started 25 | 26 | ### Basic deployment 27 | 28 | Once you have a controller and model ready, you can deploy alertmanager 29 | using the Juju CLI: 30 | 31 | ```shell 32 | juju deploy --channel=beta alertmanager-k8s 33 | ``` 34 | 35 | The available [channels](https://snapcraft.io/docs/channels) are listed at the top 36 | of [the page](https://charmhub.io/alertmanager-k8s) and can also be retrieved with 37 | Charmcraft CLI: 38 | 39 | ```shell 40 | $ charmcraft status alertmanager-k8s 41 | 42 | Track Base Channel Version Revision Resources 43 | latest ubuntu 20.04 (amd64) stable - - - 44 | candidate - - - 45 | beta 9 9 alertmanager-image (r1) 46 | edge 9 9 alertmanager-image (r1) 47 | ``` 48 | 49 | Once the Charmed Operator is deployed, the status can be checked by running: 50 | 51 | ```shell 52 | juju status --relations --storage --color 53 | ``` 54 | 55 | 56 | ### Configuration 57 | 58 | In order to have alerts dispatched to your receiver(s) of choice, 59 | a [configuration file](https://www.prometheus.io/docs/alerting/latest/configuration/) 60 | must be provided to Alertmanager using the 61 | [`config_file`](https://charmhub.io/alertmanager-k8s/configure#config_file) option: 62 | 63 | ```shell 64 | juju config alertmanager-k8s \ 65 | config_file='@path/to/alertmanager.yml' 66 | ``` 67 | 68 | Note that if you use templates, you should use the `templates_file` config option 69 | instead of having a `templates` section in your `yaml` configuration file. 70 | (This is a slight deviation from the official alertmanager config spec.) 71 | 72 | 73 | Use the [`templates_file`](https://charmhub.io/alertmanager-k8s/configure#templates_file) 74 | option to push templates that are being used by the configuration file: 75 | 76 | ```shell 77 | juju config alertmanager-k8s \ 78 | config_file='@path/to/alertmanager.yml' \ 79 | templates_file='@path/to/templates.tmpl' 80 | ``` 81 | 82 | All templates need to go into this single config option, instead of 83 | the 'templates' section of the main configuration file. The templates will be 84 | pushed to the workload container, and the configuration file will be updated 85 | accordingly. 86 | 87 | Refer to the 88 | [official templates documentation](https://prometheus.io/docs/alerting/latest/notification_examples/) 89 | for more details. 90 | 91 | 92 | To verify Alertmanager is using the expected configuration you can use the 93 | [`show-config`](https://charmhub.io/alertmanager-k8s/actions#show-config) action: 94 | 95 | ```shell 96 | juju run-action alertmanager-k8s/0 show-config --wait 97 | ``` 98 | 99 | 100 | ### Dashboard and HTTP API 101 | 102 | The Alertmanager dashboard and 103 | [HTTP API](https://www.prometheus.io/docs/alerting/latest/management_api/) 104 | can be accessed at the default port (9093) on the Alertmanager IP address, 105 | which is determinable with a `juju status` command. 
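For example, once you have an IP address (see the commands below), you can probe Alertmanager's management API endpoints (a sketch; `<alertmanager-ip>` is a placeholder):

```shell
curl http://<alertmanager-ip>:9093/-/healthy
curl http://<alertmanager-ip>:9093/-/ready
```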
106 | 107 | To obtain the load-balanced application IP, 108 | 109 | ```shell 110 | juju status alertmanager-k8s --format=json \ 111 | | jq -r '.applications."alertmanager-k8s".address' 112 | ``` 113 | 114 | Similarly, to obtain an individual unit's IP address: 115 | 116 | ```shell 117 | juju status alertmanager-k8s --format=json \ 118 | | jq -r '.applications."alertmanager-k8s".units."alertmanager-k8s/0".address' 119 | ``` 120 | 121 | 122 | ## Clustering 123 | 124 | ### Forming a cluster 125 | 126 | Alertmanager [supports clustering](https://www.prometheus.io/docs/alerting/latest/alertmanager/#high-availability) 127 | and all you need to do to create/update a cluster is to rescale the application to the desired number 128 | of units using `add-unit`: 129 | 130 | ```shell 131 | juju add-unit alertmanager-k8s 132 | ``` 133 | 134 | or using `scale-application`: 135 | 136 | ```shell 137 | juju scale-application alertmanager-k8s 3 138 | ``` 139 | 140 | Internally, HA is achieved by providing each Alertmanager instance with at least one IP address of another instance. The cluster then auto-updates with subsequent changes to the units present. 141 | 142 | ### Verification 143 | #### Pebble plan 144 | Cluster information is passed to Alertmanager via [`--cluster.peer` command line arguments](https://github.com/prometheus/alertmanager#high-availability). This can be verified by looking at the current pebble plan: 145 | 146 | ```shell 147 | $ juju exec --unit alertmanager-k8s/0 -- \ 148 | PEBBLE_SOCKET=/charm/containers/alertmanager/pebble.socket \ 149 | pebble plan 150 | 151 | services: 152 | alertmanager: 153 | summary: alertmanager service 154 | startup: enabled 155 | override: replace 156 | command: alertmanager --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/alertmanager --web.listen-address=:9093 --cluster.listen-address=0.0.0.0:9094 --cluster.peer=10.1.179.220:9094 --cluster.peer=10.1.179.221:9094 157 | ``` 158 | #### HTTP API 159 | To manually verify a cluster is indeed formed, you can query the alertmanager HTTP API directly: 160 | 161 | ```shell 162 | $ curl -s $ALERTMANAGER_IP:9093/api/v1/status \ 163 | | jq '.data.clusterStatus.peers[].address' 164 | "10.1.179.220:9094" 165 | "10.1.179.221:9094" 166 | "10.1.179.217:9094" 167 | ``` 168 | 169 | 170 | ## OCI Images 171 | This charm is published on Charmhub with alertmanager images from 172 | [ubuntu/prometheus-alertmanager], however, it should also work with the 173 | official [quay.io/prometheus/alertmanager]. 174 | 175 | To try the charm with a different image you can use `juju refresh`. For example: 176 | 177 | ```shell 178 | juju refresh alertmanager-k8s \ 179 | --resource alertmanager-image=quay.io/prometheus/alertmanager 180 | ``` 181 | 182 | (Note: currently, refreshing to a different image only works when deploying from a local 183 | charm - [lp/1954462](https://bugs.launchpad.net/juju/+bug/1954462).) 184 | 185 | ### Resource revisions 186 | Workload images are archived on charmhub by revision number.
187 | 188 | | Resource | Revision | Image | 189 | |--------------------|:--------:|-------------------| 190 | | alertmanager-image | r1 | [0.21-20.04_beta] | 191 | 192 | You can use `charmcraft` to see the mapping between charm revisions and resource revisions: 193 | 194 | ```shell 195 | charmcraft status alertmanager-k8s 196 | ``` 197 | 198 | [ubuntu/prometheus-alertmanager]: https://hub.docker.com/r/ubuntu/prometheus-alertmanager 199 | [quay.io/prometheus/alertmanager]: https://quay.io/repository/prometheus/alertmanager?tab=tags 200 | [0.21-20.04_beta]: https://hub.docker.com/layers/ubuntu/prometheus-alertmanager/0.21-20.04_beta/images/sha256-1418c677768887c2c717d043c9cb8397a32552a61354cb98c25cef23eeeb2b3f?context=explore 201 | 202 | 203 | ## Official alertmanager documentation 204 | 205 | For further details about Alertmanager configuration and usage, please refer to 206 | the [official Alertmanager documentation](https://www.prometheus.io/docs/alerting/latest/overview/). 207 | 208 | 209 | ## Additional Information 210 | - [Logging, Monitoring, and Alerting](https://discourse.ubuntu.com/t/logging-monitoring-and-alerting/19151) (LMA) - 211 | a tutorial for running Prometheus, Grafana and Alertmanager with LXD. 212 | - [Alertmanager README](https://github.com/prometheus/alertmanager) 213 | - [PromCon 2018: Life of an Alert](https://youtube.com/watch?v=PUdjca23Qa4) 214 | -------------------------------------------------------------------------------- /lib/charms/alertmanager_k8s/v0/alertmanager_dispatch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | """# Alertmanager library. 5 | 6 | This library is designed to be used by a charm consuming or providing the `alertmanager_dispatch` 7 | relation interface. 8 | 9 | This library is published as part of the 10 | [Alertmanager charm](https://charmhub.io/alertmanager-k8s). 11 | 12 | You can file bugs [here](https://github.com/canonical/alertmanager-operator/issues)! 13 | 14 | A typical example of including this library might be: 15 | 16 | ```python 17 | # ... 18 | from charms.alertmanager_k8s.v0.alertmanager_dispatch import AlertmanagerConsumer 19 | 20 | class SomeApplication(CharmBase): 21 | def __init__(self, *args): 22 | # ... 23 | self.alertmanager_consumer = AlertmanagerConsumer(self, relation_name="alertmanager") 24 | # ... 25 | ``` 26 | """ 27 | import logging 28 | import socket 29 | from typing import List 30 | 31 | import ops 32 | from ops.charm import CharmBase, RelationEvent, RelationJoinedEvent, RelationRole 33 | from ops.framework import EventBase, EventSource, Object, ObjectEvents 34 | from ops.model import Relation 35 | 36 | # The unique Charmhub library identifier, never change it 37 | LIBID = "37f1ca6f8fe84e3092ebbf6dc2885310" 38 | 39 | # Increment this major API version when introducing breaking changes 40 | LIBAPI = 0 41 | 42 | # Increment this PATCH version before using `charmcraft publish-lib` or reset 43 | # to 0 if you are raising the major API version 44 | LIBPATCH = 4 45 | 46 | # Set to match metadata.yaml 47 | INTERFACE_NAME = "alertmanager_dispatch" 48 | 49 | logger = logging.getLogger(__name__) 50 | 51 | 52 | class ClusterChanged(EventBase): 53 | """Event raised when an alertmanager cluster is changed. 54 | 55 | If an alertmanager unit is added to or removed from a relation, 56 | then a :class:`ClusterChanged` event should be emitted. 
57 | """ 58 | 59 | 60 | class AlertmanagerConsumerEvents(ObjectEvents): 61 | """Event descriptor for events raised by `AlertmanagerConsumer`.""" 62 | 63 | cluster_changed = EventSource(ClusterChanged) 64 | 65 | 66 | class RelationManagerBase(Object): 67 | """Base class that represents relation ends ("provides" and "requires"). 68 | 69 | :class:`RelationManagerBase` is used to create a relation manager. This is done by inheriting 70 | from :class:`RelationManagerBase` and customising the subclass as required. 71 | 72 | Attributes: 73 | name (str): consumer's relation name 74 | """ 75 | 76 | def __init__(self, charm: CharmBase, relation_name: str, relation_role: RelationRole): 77 | super().__init__(charm, relation_name) 78 | self.charm = charm 79 | self._validate_relation(relation_name, relation_role) 80 | self.name = relation_name 81 | 82 | def _validate_relation(self, relation_name: str, relation_role: RelationRole): 83 | try: 84 | if self.charm.meta.relations[relation_name].role != relation_role: 85 | raise ValueError( 86 | "Relation '{}' in the charm's metadata.yaml must be '{}' " 87 | "to be managed by this library, but instead it is '{}'".format( 88 | relation_name, 89 | relation_role, 90 | self.charm.meta.relations[relation_name].role, 91 | ) 92 | ) 93 | if self.charm.meta.relations[relation_name].interface_name != INTERFACE_NAME: 94 | raise ValueError( 95 | "Relation '{}' in the charm's metadata.yaml must use the '{}' interface " 96 | "to be managed by this library, but instead it is '{}'".format( 97 | relation_name, 98 | INTERFACE_NAME, 99 | self.charm.meta.relations[relation_name].interface_name, 100 | ) 101 | ) 102 | except KeyError: 103 | raise ValueError( 104 | "Relation '{}' is not in the charm's metadata.yaml".format(relation_name) 105 | ) 106 | 107 | 108 | class AlertmanagerConsumer(RelationManagerBase): 109 | """A "consumer" handler to be used by charms that relate to Alertmanager (the 'requires' side). 110 | 111 | To have your charm consume alertmanager cluster data, declare the interface's use in your 112 | charm's metadata.yaml file: 113 | 114 | ```yaml 115 | requires: 116 | alertmanager: 117 | interface: alertmanager_dispatch 118 | ``` 119 | 120 | A typical example of importing this library might be 121 | 122 | ```python 123 | from charms.alertmanager_k8s.v0.alertmanager_dispatch import AlertmanagerConsumer 124 | ``` 125 | 126 | In your charm's `__init__` method: 127 | 128 | ```python 129 | self.alertmanager_consumer = AlertmanagerConsumer(self, relation_name="alertmanager") 130 | ``` 131 | 132 | Every change in the alertmanager cluster emits a :class:`ClusterChanged` event that the 133 | consumer charm can register and handle, for example: 134 | 135 | ``` 136 | self.framework.observe(self.alertmanager_consumer.on.cluster_changed, 137 | self._on_alertmanager_cluster_changed) 138 | ``` 139 | 140 | The updated alertmanager cluster can then be obtained via the `get_cluster_info` method. 141 | 142 | This consumer library expects the consumer charm to observe the `cluster_changed` event.
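A minimal handler might look like this (a sketch only — what you do with the updated addresses is up to the consumer charm):

```python
def _on_alertmanager_cluster_changed(self, _):
    # get_cluster_info() returns a sorted list of alertmanager unit addresses
    alertmanagers = self.alertmanager_consumer.get_cluster_info()
    # ... reconfigure the workload with the updated addresses ...
```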
143 | 144 | Arguments: 145 | charm (CharmBase): consumer charm 146 | relation_name (str): from consumer's metadata.yaml 147 | 148 | Attributes: 149 | charm (CharmBase): consumer charm 150 | """ 151 | 152 | on = AlertmanagerConsumerEvents() 153 | 154 | def __init__(self, charm: CharmBase, relation_name: str = "alerting"): 155 | super().__init__(charm, relation_name, RelationRole.requires) 156 | 157 | self.framework.observe( 158 | self.charm.on[self.name].relation_changed, self._on_relation_changed 159 | ) 160 | self.framework.observe( 161 | self.charm.on[self.name].relation_departed, 162 | self._on_relation_departed, 163 | ) 164 | self.framework.observe(self.charm.on[self.name].relation_broken, self._on_relation_broken) 165 | 166 | def _on_relation_changed(self, event: ops.charm.RelationChangedEvent): 167 | """This hook notifies the charm that there may have been changes to the cluster.""" 168 | if event.unit: # event.unit may be `None` in the case of app data change 169 | # inform consumer about the change 170 | self.on.cluster_changed.emit() 171 | 172 | def get_cluster_info(self) -> List[str]: 173 | """Returns a list of ip addresses of all the alertmanager units.""" 174 | alertmanagers = [] # type: List[str] 175 | relation = self.charm.model.get_relation(self.name) 176 | if not relation: 177 | return alertmanagers 178 | for unit in relation.units: 179 | address = relation.data[unit].get("public_address") 180 | if address: 181 | alertmanagers.append(address) 182 | return sorted(alertmanagers) 183 | 184 | def _on_relation_departed(self, _): 185 | """This hook notifies the charm that there may have been changes to the cluster.""" 186 | self.on.cluster_changed.emit() 187 | 188 | def _on_relation_broken(self, _): 189 | """This hook notifies the charm that a relation has been completely removed.""" 190 | # inform consumer about the change 191 | self.on.cluster_changed.emit() 192 | 193 | 194 | class AlertmanagerProvider(RelationManagerBase): 195 | """A "provider" handler to be used by charms that relate to Alertmanager (the 'provides' side). 196 | 197 | To have your charm provide alertmanager cluster data, declare the interface's use in your 198 | charm's metadata.yaml file: 199 | 200 | ```yaml 201 | provides: 202 | alerting: 203 | interface: alertmanager_dispatch 204 | ``` 205 | 206 | A typical example of importing this library might be 207 | 208 | ```python 209 | from charms.alertmanager_k8s.v0.alertmanager_dispatch import AlertmanagerProvider 210 | ``` 211 | 212 | In your charm's `__init__` method: 213 | 214 | ```python 215 | self.alertmanager_provider = AlertmanagerProvider(self, self._relation_name, self._api_port) 216 | ``` 217 | 218 | Then inform consumers on any update to alertmanager cluster data via 219 | 220 | ```python 221 | self.alertmanager_provider.update_relation_data() 222 | ``` 223 | 224 | This provider auto-registers relation events on behalf of the main Alertmanager charm. 
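For example, a provider charm could refresh the data bags of all existing relations after an upgrade (a sketch — observing `upgrade_charm` for this purpose is only an illustration):

```python
def _on_upgrade_charm(self, _):
    # With no event argument, all existing "alerting" relations are updated.
    self.alertmanager_provider.update_relation_data()
```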
225 | 226 | Arguments: 227 | charm (CharmBase): consumer charm 228 | relation_name (str): relation name (not interface name) 229 | api_port (int): alertmanager server's api port; this is needed here to avoid accessing 230 | charm constructs directly 231 | 232 | Attributes: 233 | charm (CharmBase): the Alertmanager charm 234 | """ 235 | 236 | def __init__(self, charm, relation_name: str = "alerting", api_port: int = 9093): 237 | super().__init__(charm, relation_name, RelationRole.provides) 238 | 239 | self._api_port = api_port 240 | 241 | events = self.charm.on[self.name] 242 | 243 | # No need to observe `relation_departed` or `relation_broken`: data bags are auto-updated 244 | # so both events are address on the consumer side. 245 | self.framework.observe(events.relation_joined, self._on_relation_joined) 246 | 247 | @property 248 | def api_port(self): 249 | """Get the API port number to use for alertmanager.""" 250 | return self._api_port 251 | 252 | def _on_relation_joined(self, event: RelationJoinedEvent): 253 | """This hook stores the public address of the newly-joined "alerting" relation. 254 | 255 | This is needed for consumers such as prometheus, which should be aware of all alertmanager 256 | instances. 257 | """ 258 | self.update_relation_data(event) 259 | 260 | def _generate_relation_data(self, relation: Relation): 261 | """Helper function to generate relation data in the correct format.""" 262 | public_address = "{}:{}".format(socket.getfqdn(), self.api_port) 263 | return {"public_address": public_address} 264 | 265 | def update_relation_data(self, event: RelationEvent = None): 266 | """Helper function for updating relation data bags. 267 | 268 | This function can be used in two different ways: 269 | - update relation data bag of a given event (e.g. a newly joined relation); 270 | - update relation data for all relations 271 | 272 | Args: 273 | event: The event whose data bag needs to be updated. If it is None, update data bags of 274 | all relations. 275 | """ 276 | if event is None: 277 | # update all existing relation data 278 | # a single consumer charm's unit may be related to multiple providers 279 | if self.name in self.charm.model.relations: 280 | for relation in self.charm.model.relations[self.name]: 281 | # Sometimes (e.g. when an app is removed with `--force`), there is a dangling 282 | # relation, for which we get the following error: 283 | # ops.model.ModelError: b'ERROR relation 17 not found (not found)\n' 284 | # when trying to `network-get alerting`. 285 | relation.data[self.charm.unit].update(self._generate_relation_data(relation)) 286 | 287 | else: 288 | # update relation data only for the newly joined relation 289 | event.relation.data[self.charm.unit].update( 290 | self._generate_relation_data(event.relation) 291 | ) 292 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /lib/charms/observability_libs/v0/kubernetes_service_patch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | """# KubernetesServicePatch Library. 5 | 6 | This library is designed to enable developers to more simply patch the Kubernetes Service created 7 | by Juju during the deployment of a sidecar charm. When sidecar charms are deployed, Juju creates a 8 | service named after the application in the namespace (named after the Juju model). This service by 9 | default contains a "placeholder" port, which is 65536/TCP. 10 | 11 | When modifying the default set of resources managed by Juju, one must consider the lifecycle of the 12 | charm. In this case, any modifications to the default service (created during deployment), will 13 | be overwritten during a charm upgrade. 14 | 15 | When initialised, this library binds a handler to the parent charm's `install` and `upgrade_charm` 16 | events which applies the patch to the cluster. This should ensure that the service ports are 17 | correct throughout the charm's life. 18 | 19 | The constructor simply takes a reference to the parent charm, and a list of tuples that each define 20 | a port for the service, where each tuple contains: 21 | 22 | - a name for the port 23 | - port for the service to listen on 24 | - optionally: a targetPort for the service (the port in the container!) 25 | - optionally: a nodePort for the service (for NodePort or LoadBalancer services only!) 26 | - optionally: a name of the service (in case service name needs to be patched as well) 27 | 28 | ## Getting Started 29 | 30 | To get started using the library, you just need to fetch the library using `charmcraft`. **Note 31 | that you also need to add `lightkube` and `lightkube-models` to your charm's `requirements.txt`.** 32 | 33 | ```shell 34 | cd some-charm 35 | charmcraft fetch-lib charms.observability_libs.v0.kubernetes_service_patch 36 | echo <<-EOF >> requirements.txt 37 | lightkube 38 | lightkube-models 39 | EOF 40 | ``` 41 | 42 | Then, to initialise the library: 43 | 44 | For ClusterIP services: 45 | ```python 46 | # ... 47 | from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch 48 | 49 | class SomeCharm(CharmBase): 50 | def __init__(self, *args): 51 | # ... 52 | self.service_patcher = KubernetesServicePatch(self, [(f"{self.app.name}", 8080)]) 53 | # ... 54 | ``` 55 | 56 | For LoadBalancer/NodePort services: 57 | ```python 58 | # ... 59 | from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch 60 | 61 | class SomeCharm(CharmBase): 62 | def __init__(self, *args): 63 | # ... 64 | self.service_patcher = KubernetesServicePatch( 65 | self, [(f"{self.app.name}", 443, 443, 30666)], "LoadBalancer" 66 | ) 67 | # ... 
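        # Note (a hedged aside): given the constructor signature defined further down
        # in this file, (charm, ports, service_name=None, service_type="ClusterIP", ...),
        # a bare third positional argument binds to `service_name`, so the service type
        # is more safely passed by keyword:
        #
        #   self.service_patcher = KubernetesServicePatch(
        #       self, [(f"{self.app.name}", 443, 443, 30666)], service_type="LoadBalancer"
        #   )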
68 | ``` 69 | 70 | Additionally, you may wish to use mocks in your charm's unit testing to ensure that the library 71 | does not try to make any API calls, or open any files during testing that are unlikely to be 72 | present, and could break your tests. The easiest way to do this is during your test `setUp`: 73 | 74 | ```python 75 | # ... 76 | 77 | @patch("charm.KubernetesServicePatch", lambda x, y: None) 78 | def setUp(self, *unused): 79 | self.harness = Harness(SomeCharm) 80 | # ... 81 | ``` 82 | """ 83 | 84 | import logging 85 | from types import MethodType 86 | from typing import Literal, Sequence, Tuple, Union 87 | 88 | from lightkube import ApiError, Client 89 | from lightkube.models.core_v1 import ServicePort, ServiceSpec 90 | from lightkube.models.meta_v1 import ObjectMeta 91 | from lightkube.resources.core_v1 import Service 92 | from lightkube.types import PatchType 93 | from ops.charm import CharmBase 94 | from ops.framework import Object 95 | 96 | logger = logging.getLogger(__name__) 97 | 98 | # The unique Charmhub library identifier, never change it 99 | LIBID = "0042f86d0a874435adef581806cddbbb" 100 | 101 | # Increment this major API version when introducing breaking changes 102 | LIBAPI = 0 103 | 104 | # Increment this PATCH version before using `charmcraft publish-lib` or reset 105 | # to 0 if you are raising the major API version 106 | LIBPATCH = 6 107 | 108 | PortDefinition = Union[Tuple[str, int], Tuple[str, int, int], Tuple[str, int, int, int]] 109 | ServiceType = Literal["ClusterIP", "LoadBalancer"] 110 | 111 | 112 | class KubernetesServicePatch(Object): 113 | """A utility for patching the Kubernetes service set up by Juju.""" 114 | 115 | def __init__( 116 | self, 117 | charm: CharmBase, 118 | ports: Sequence[PortDefinition], 119 | service_name: str = None, 120 | service_type: ServiceType = "ClusterIP", 121 | additional_labels: dict = None, 122 | additional_selectors: dict = None, 123 | additional_annotations: dict = None, 124 | ): 125 | """Constructor for KubernetesServicePatch. 126 | 127 | Args: 128 | charm: the charm that is instantiating the library. 129 | ports: a list of tuples (name, port, targetPort, nodePort) for every service port. 130 | service_name: allows setting custom name to the patched service. If none given, 131 | application name will be used. 132 | service_type: desired type of K8s service. Default value is in line with ServiceSpec's 133 | default value. 134 | additional_labels: Labels to be added to the kubernetes service (by default only 135 | "app.kubernetes.io/name" is set to the service name) 136 | additional_selectors: Selectors to be added to the kubernetes service (by default only 137 | "app.kubernetes.io/name" is set to the service name) 138 | additional_annotations: Annotations to be added to the kubernetes service. 
139 | """ 140 | super().__init__(charm, "kubernetes-service-patch") 141 | self.charm = charm 142 | self.service_name = service_name if service_name else self._app 143 | self.service = self._service_object( 144 | ports, 145 | service_name, 146 | service_type, 147 | additional_labels, 148 | additional_selectors, 149 | additional_annotations, 150 | ) 151 | 152 | # Make mypy type checking happy that self._patch is a method 153 | assert isinstance(self._patch, MethodType) 154 | # Ensure this patch is applied during the 'install' and 'upgrade-charm' events 155 | self.framework.observe(charm.on.install, self._patch) 156 | self.framework.observe(charm.on.upgrade_charm, self._patch) 157 | 158 | def _service_object( 159 | self, 160 | ports: Sequence[PortDefinition], 161 | service_name: str = None, 162 | service_type: ServiceType = "ClusterIP", 163 | additional_labels: dict = None, 164 | additional_selectors: dict = None, 165 | additional_annotations: dict = None, 166 | ) -> Service: 167 | """Creates a valid Service representation. 168 | 169 | Args: 170 | ports: a list of tuples of the form (name, port) or (name, port, targetPort) 171 | or (name, port, targetPort, nodePort) for every service port. If the 'targetPort' 172 | is omitted, it is assumed to be equal to 'port', with the exception of NodePort 173 | and LoadBalancer services, where all port numbers have to be specified. 174 | service_name: allows setting custom name to the patched service. If none given, 175 | application name will be used. 176 | service_type: desired type of K8s service. Default value is in line with ServiceSpec's 177 | default value. 178 | additional_labels: Labels to be added to the kubernetes service (by default only 179 | "app.kubernetes.io/name" is set to the service name) 180 | additional_selectors: Selectors to be added to the kubernetes service (by default only 181 | "app.kubernetes.io/name" is set to the service name) 182 | additional_annotations: Annotations to be added to the kubernetes service. 183 | 184 | Returns: 185 | Service: A valid representation of a Kubernetes Service with the correct ports. 186 | """ 187 | if not service_name: 188 | service_name = self._app 189 | labels = {"app.kubernetes.io/name": self._app} 190 | if additional_labels: 191 | labels.update(additional_labels) 192 | selector = {"app.kubernetes.io/name": self._app} 193 | if additional_selectors: 194 | selector.update(additional_selectors) 195 | return Service( 196 | apiVersion="v1", 197 | kind="Service", 198 | metadata=ObjectMeta( 199 | namespace=self._namespace, 200 | name=service_name, 201 | labels=labels, 202 | annotations=additional_annotations, # type: ignore[arg-type] 203 | ), 204 | spec=ServiceSpec( 205 | selector=selector, 206 | ports=[ 207 | ServicePort( 208 | name=p[0], 209 | port=p[1], 210 | targetPort=p[2] if len(p) > 2 else p[1], # type: ignore[misc] 211 | nodePort=p[3] if len(p) > 3 else None, # type: ignore[arg-type, misc] 212 | ) 213 | for p in ports 214 | ], 215 | type=service_type, 216 | ), 217 | ) 218 | 219 | def _patch(self, _) -> None: 220 | """Patch the Kubernetes service created by Juju to map the correct port. 221 | 222 | Raises: 223 | PatchFailed: if patching fails due to lack of permissions, or otherwise. 
224 | """ 225 | if not self.charm.unit.is_leader(): 226 | return 227 | 228 | client = Client() 229 | try: 230 | if self.service_name != self._app: 231 | self._delete_and_create_service(client) 232 | client.patch(Service, self.service_name, self.service, patch_type=PatchType.MERGE) 233 | except ApiError as e: 234 | if e.status.code == 403: 235 | logger.error("Kubernetes service patch failed: `juju trust` this application.") 236 | else: 237 | logger.error("Kubernetes service patch failed: %s", str(e)) 238 | else: 239 | logger.info("Kubernetes service '%s' patched successfully", self._app) 240 | 241 | def _delete_and_create_service(self, client: Client): 242 | service = client.get(Service, self._app, namespace=self._namespace) 243 | service.metadata.name = self.service_name # type: ignore[attr-defined] 244 | service.metadata.resourceVersion = service.metadata.uid = None # type: ignore[attr-defined] # noqa: E501 245 | client.delete(Service, self._app, namespace=self._namespace) 246 | client.create(service) 247 | 248 | def is_patched(self) -> bool: 249 | """Reports if the service patch has been applied. 250 | 251 | Returns: 252 | bool: A boolean indicating if the service patch has been applied. 253 | """ 254 | client = Client() 255 | # Get the relevant service from the cluster 256 | service = client.get(Service, name=self.service_name, namespace=self._namespace) 257 | # Construct a list of expected ports, should the patch be applied 258 | expected_ports = [(p.port, p.targetPort) for p in self.service.spec.ports] 259 | # Construct a list in the same manner, using the fetched service 260 | fetched_ports = [(p.port, p.targetPort) for p in service.spec.ports] # type: ignore[attr-defined] # noqa: E501 261 | return expected_ports == fetched_ports 262 | 263 | @property 264 | def _app(self) -> str: 265 | """Name of the current Juju application. 266 | 267 | Returns: 268 | str: A string containing the name of the current Juju application. 269 | """ 270 | return self.charm.app.name 271 | 272 | @property 273 | def _namespace(self) -> str: 274 | """The Kubernetes namespace we're running in. 275 | 276 | Returns: 277 | str: A string containing the name of the current Kubernetes namespace. 278 | """ 279 | with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") as f: 280 | return f.read().strip() 281 | -------------------------------------------------------------------------------- /lib/charms/karma_k8s/v0/karma_dashboard.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | """# Karma library. 5 | 6 | This library is designed to be used by a charm consuming or providing the karma-dashboard relation. 7 | This library is published as part of the [Karma charm](https://charmhub.io/karma-k8s). 8 | 9 | You can file bugs [here](https://github.com/canonical/karma-operator/issues)! 10 | 11 | A typical example of including this library might be: 12 | 13 | ```python 14 | # ... 15 | from charms.karma_k8s.v0.karma_dashboard import KarmaConsumer 16 | 17 | class SomeApplication(CharmBase): 18 | def __init__(self, *args): 19 | # ... 20 | self.karma_consumer = KarmaConsumer(self, "dashboard") 21 | # ... 
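        # A typical next step (sketch): watch for changes in the set of related
        # alertmanager servers, e.g. to regenerate the Karma configuration.
        self.framework.observe(
            self.karma_consumer.on.alertmanager_config_changed,
            self._on_alertmanager_config_changed,
        )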
22 | ``` 23 | """ 24 | 25 | import logging 26 | from typing import Dict, List, Optional 27 | 28 | import ops.charm 29 | from ops.charm import CharmBase, RelationJoinedEvent, RelationRole 30 | from ops.framework import EventBase, EventSource, Object, ObjectEvents, StoredState 31 | 32 | # The unique Charmhub library identifier, never change it 33 | LIBID = "98f9dc00f7ff4b1197895886bdd92037" 34 | 35 | # Increment this major API version when introducing breaking changes 36 | LIBAPI = 0 37 | 38 | # Increment this PATCH version before using `charmcraft publish-lib` or reset 39 | # to 0 if you are raising the major API version 40 | LIBPATCH = 3 41 | 42 | # Set to match metadata.yaml 43 | INTERFACE_NAME = "karma_dashboard" 44 | 45 | logger = logging.getLogger(__name__) 46 | 47 | 48 | class KarmaAlertmanagerConfig: 49 | """A helper class for alertmanager server configuration for Karma. 50 | 51 | Refer to the Karma documentation for full details: 52 | https://github.com/prymitive/karma/blob/main/docs/CONFIGURATION.md#alertmanagers 53 | """ 54 | 55 | required_fields = {"name", "uri"} 56 | optional_fields = {"cluster"} 57 | _supported_fields = required_fields | optional_fields 58 | 59 | @staticmethod 60 | def is_valid(config: Dict[str, str]) -> bool: 61 | """Validate alertmanager server configuration for Karma. 62 | 63 | Args: 64 | config: target configuration to be validated. 65 | 66 | Returns: 67 | True if all required keys are present and all remaining keys are supported optional 68 | fields; False otherwise. 69 | """ 70 | all_required = all(key in config for key in KarmaAlertmanagerConfig.required_fields) 71 | all_supported = all(key in KarmaAlertmanagerConfig._supported_fields for key in config) 72 | return all_required and all_supported 73 | 74 | @staticmethod 75 | def from_dict(data: Dict[str, str]) -> Dict[str, str]: 76 | """Generate alertmanager server configuration from the given dict. 77 | 78 | Configuration is constructed by creating a subset of the provided dictionary that contains 79 | only the supported fields. 80 | 81 | Args: 82 | data: a dict that may contain alertmanager server configuration for Karma. 83 | 84 | Returns: 85 | A subset of `data` that contains all the supported fields found in `data`, if the 86 | resulting subset makes a valid configuration; False otherwise. 87 | """ 88 | config = {k: data[k] for k in data if k in KarmaAlertmanagerConfig.required_fields} 89 | optional_config = { 90 | k: data[k] for k in data if data[k] and k in KarmaAlertmanagerConfig.optional_fields 91 | } 92 | config.update(optional_config) 93 | return config if KarmaAlertmanagerConfig.is_valid(config) else {} 94 | 95 | @staticmethod 96 | def build(name: str, url: str, *, cluster=None) -> Dict[str, str]: 97 | """Build alertmanager server configuration for Karma. 98 | 99 | Args: 100 | name: name for the alertmanager unit. 101 | url: url of the alertmanager api server (including scheme and port) 102 | cluster: name of a cluster to which the alertmanager unit belongs to (optional) 103 | 104 | Returns: 105 | Alertmanager server configuration for Karma. 106 | """ 107 | return KarmaAlertmanagerConfig.from_dict({"name": name, "uri": url, "cluster": cluster}) 108 | 109 | 110 | class KarmaAlertmanagerConfigChanged(EventBase): 111 | """Event raised when karma configuration is changed. 112 | 113 | If an alertmanager unit is added to or removed from a relation, 114 | then a :class:`KarmaAlertmanagerConfigChanged` should be emitted. 
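    For illustration, a hedged sketch of the kind of server configuration whose
    addition or removal would trigger this event (all values are invented):

    ```python
    config = KarmaAlertmanagerConfig.build(
        "alertmanager-k8s/0", "http://am.example.com:9093", cluster="some-model_alertmanager-k8s"
    )
    # config == {"name": "alertmanager-k8s/0",
    #            "uri": "http://am.example.com:9093",
    #            "cluster": "some-model_alertmanager-k8s"}
    assert KarmaAlertmanagerConfig.is_valid(config)
    ```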
115 | """ 116 | 117 | 118 | class KarmaConsumerEvents(ObjectEvents): 119 | """Event descriptor for events raised by `AlertmanagerConsumer`.""" 120 | 121 | alertmanager_config_changed = EventSource(KarmaAlertmanagerConfigChanged) 122 | 123 | 124 | class RelationManagerBase(Object): 125 | """Base class that represents relation ends ("provides" and "requires"). 126 | 127 | :class:`RelationManagerBase` is used to create a relation manager. This is done by inheriting 128 | from :class:`RelationManagerBase` and customising the sub class as required. 129 | 130 | Attributes: 131 | name (str): consumer's relation name 132 | """ 133 | 134 | def __init__(self, charm: CharmBase, relation_name, relation_role: RelationRole): 135 | super().__init__(charm, relation_name) 136 | self.charm = charm 137 | self._validate_relation(relation_name, relation_role) 138 | self.name = relation_name 139 | 140 | def _validate_relation(self, relation_name: str, relation_role: RelationRole): 141 | try: 142 | if self.charm.meta.relations[relation_name].role != relation_role: 143 | raise ValueError( 144 | "Relation '{}' in the charm's metadata.yaml must be '{}' " 145 | "to be managed by this library, but instead it is '{}'".format( 146 | relation_name, 147 | relation_role, 148 | self.charm.meta.relations[relation_name].role, 149 | ) 150 | ) 151 | if self.charm.meta.relations[relation_name].interface_name != INTERFACE_NAME: 152 | raise ValueError( 153 | "Relation '{}' in the charm's metadata.yaml must use the '{}' interface " 154 | "to be managed by this library, but instead it is '{}'".format( 155 | relation_name, 156 | INTERFACE_NAME, 157 | self.charm.meta.relations[relation_name].interface_name, 158 | ) 159 | ) 160 | except KeyError: 161 | raise ValueError( 162 | "Relation '{}' is not in the charm's metadata.yaml".format(relation_name) 163 | ) 164 | 165 | 166 | class KarmaConsumer(RelationManagerBase): 167 | """A "consumer" handler to be used by the Karma charm (the 'requires' side). 168 | 169 | This library offers the interface needed in order to forward Alertmanager URLs and associated 170 | information to the Karma application. 171 | 172 | To have your charm provide URLs to Karma, declare the interface's use in your charm's 173 | metadata.yaml file: 174 | 175 | ```yaml 176 | provides: 177 | karma-dashboard: 178 | interface: karma_dashboard 179 | ``` 180 | 181 | A typical example of importing this library might be 182 | 183 | ```python 184 | from charms.alertmanager_karma.v0.karma_dashboard import KarmaConsumer 185 | ``` 186 | 187 | In your charm's `__init__` method: 188 | 189 | ```python 190 | self.karma_consumer = KarmaConsumer(self, "dashboard") 191 | ``` 192 | 193 | The consumer charm is expected to observe and respond to the 194 | :class:`KarmaAlertmanagerConfigChanged` event, for example: 195 | 196 | ```python 197 | self.framework.observe( 198 | self.karma_consumer.on.alertmanager_config_changed, self._on_alertmanager_config_changed 199 | ) 200 | ``` 201 | 202 | This consumer observes relation joined, changed and departed events on behalf of the charm. 
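    A minimal handler might look like the following sketch; it relies only on the
    accessors described below, and `_update_config` stands in for whatever the Karma
    charm does to render and reload its configuration (hypothetical helper):

    ```python
    def _on_alertmanager_config_changed(self, _):
        if not self.karma_consumer.config_valid:
            # BlockedStatus comes from ops.model
            self.unit.status = BlockedStatus("waiting for at least one alertmanager server")
            return
        servers = self.karma_consumer.get_alertmanager_servers()
        self._update_config(servers)  # hypothetical charm helper
    ```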
203 | 204 | From charm code you can then obtain the list of proxied alertmanagers via: 205 | 206 | ```python 207 | alertmanagers = self.karma_consumer.get_alertmanager_servers() 208 | ``` 209 | 210 | Arguments: 211 | charm (CharmBase): consumer charm 212 | name (str): from consumer's metadata.yaml 213 | 214 | Attributes: 215 | relation_charm (CharmBase): consumer charm 216 | """ 217 | 218 | on = KarmaConsumerEvents() 219 | 220 | def __init__(self, charm, relation_name: str = "karma-dashboard"): 221 | super().__init__(charm, relation_name, RelationRole.requires) 222 | self.charm = charm 223 | 224 | events = self.charm.on[self.name] 225 | self.framework.observe(events.relation_changed, self._on_relation_changed) 226 | self.framework.observe(events.relation_departed, self._on_relation_departed) 227 | 228 | def get_alertmanager_servers(self) -> List[Dict[str, str]]: 229 | """Return configuration data for all related alertmanager servers. 230 | 231 | The exact spec is described in the Karma project documentation 232 | https://github.com/prymitive/karma/blob/main/docs/CONFIGURATION.md#alertmanagers 233 | Every item in the returned list represents an item under the "servers" yaml section. 234 | 235 | Returns: 236 | List of server configurations, in the format prescribed by the Karma project 237 | """ 238 | servers = [] 239 | 240 | logger.debug("relations for %s: %s", self.name, self.charm.model.relations[self.name]) 241 | for relation in self.charm.model.relations[self.name]: 242 | # get data from related application 243 | for key in relation.data: 244 | if key is not self.charm.unit and isinstance(key, ops.charm.model.Unit): 245 | data = relation.data[key] 246 | config = KarmaAlertmanagerConfig.from_dict(data) 247 | if config and config not in servers: 248 | servers.append(config) 249 | 250 | return servers # TODO sorted 251 | 252 | def _on_relation_changed(self, _): 253 | """Event handler for RelationChangedEvent.""" 254 | self.on.alertmanager_config_changed.emit() 255 | 256 | def _on_relation_departed(self, _): 257 | """Hook is called when a unit leaves, but another unit may still be present.""" 258 | # At this point the unit data bag of the departing unit is gone from relation data 259 | self.on.alertmanager_config_changed.emit() 260 | 261 | @property 262 | def config_valid(self) -> bool: 263 | """Check if the current configuration is valid. 264 | 265 | Returns: 266 | True if the currently stored configuration for an alertmanager target is valid; False 267 | otherwise. 268 | """ 269 | # karma will fail starting without alertmanager server(s), which would cause pebble to 270 | # error out. 271 | 272 | # check that there is at least one alertmanager server configured 273 | servers = self.get_alertmanager_servers() 274 | return len(servers) > 0 275 | 276 | 277 | class KarmaProvider(RelationManagerBase): 278 | """A "provider" handler to be used by charms that relate to Karma (the 'provides' side). 279 | 280 | This library offers the interface needed in order to provide Alertmanager URLs and associated 281 | information to the Karma application. 
282 | 283 | To have your charm provide URLs to Karma, declare the interface's use in your charm's 284 | metadata.yaml file: 285 | 286 | ```yaml 287 | provides: 288 | karma-dashboard: 289 | interface: karma_dashboard 290 | ``` 291 | 292 | A typical example of importing this library might be 293 | 294 | ```python 295 | from charms.karma_k8s.v0.karma_dashboard import KarmaProvider 296 | ``` 297 | 298 | In your charm's `__init__` method: 299 | 300 | ```python 301 | self.karma_provider = KarmaProvider(self, "karma-dashboard") 302 | ``` 303 | 304 | The provider charm is expected to set the target URL via the consumer library, for example in 305 | config-changed: 306 | 307 | self.karma_provider.target = "http://whatever:9093" 308 | 309 | The provider charm can then obtain the configured IP address, for example: 310 | 311 | self.unit.status = ActiveStatus("Proxying {}".format(self.karma_provider.target)) 312 | 313 | Arguments: 314 | charm (CharmBase): consumer charm 315 | relation_name (str): relation name from consumer's metadata.yaml 316 | 317 | Attributes: 318 | charm (CharmBase): consumer charm 319 | """ 320 | 321 | _stored = StoredState() 322 | 323 | def __init__(self, charm, relation_name: str = "dashboard"): 324 | super().__init__(charm, relation_name, RelationRole.provides) 325 | self.charm = charm 326 | 327 | # StoredState is used for holding the target URL. 328 | # It is needed here because the target URL may be set by the consumer before any 329 | # "karma-dashboard" relation is joined, in which case there are no relation unit data bags 330 | # available for storing the target URL. 331 | self._stored.set_default(config={}) 332 | 333 | events = self.charm.on[self.name] 334 | self.framework.observe(events.relation_joined, self._on_relation_joined) 335 | 336 | def _on_relation_joined(self, event: RelationJoinedEvent): 337 | self._update_relation_data(event) 338 | 339 | @property 340 | def config_valid(self) -> bool: 341 | """Check if the current configuration is valid. 342 | 343 | Returns: 344 | True if the currently stored configuration for an alertmanager target is valid; False 345 | otherwise. 346 | """ 347 | return KarmaAlertmanagerConfig.is_valid(self._stored.config) 348 | 349 | @property 350 | def target(self) -> Optional[str]: 351 | """str: Alertmanager URL to be used by Karma.""" 352 | return self._stored.config.get("uri", None) 353 | 354 | @target.setter 355 | def target(self, url: str) -> None: 356 | """Configure an alertmanager target server to be used by Karma. 357 | 358 | Apart from the server's URL, the server configuration is determined from the juju topology. 359 | 360 | Args: 361 | url: Complete URL (scheme and port) of the target alertmanager server. 362 | 363 | Returns: 364 | None. 365 | """ 366 | name = self.charm.unit.name 367 | cluster = "{}_{}".format(self.charm.model.name, self.charm.app.name) 368 | config = KarmaAlertmanagerConfig.build(name, url, cluster=cluster) 369 | if not config: 370 | logger.warning("Invalid config: {%s, %s}", name, url) 371 | return 372 | 373 | self._stored.config.update(config) 374 | 375 | # target changed - must update all relation data 376 | self._update_relation_data() 377 | 378 | def _update_relation_data(self, event: RelationJoinedEvent = None): 379 | """Helper function for updating relation data bags. 380 | 381 | This function can be used in two different ways: 382 | - update relation data bag of a given event (e.g. 
a newly joined relation); 383 | - update relation data for all relations 384 | 385 | Args: 386 | event: The event whose data bag needs to be updated. If it is None, update data bags of 387 | all relations. 388 | """ 389 | if event is None: 390 | # update all existing relation data 391 | # a single consumer charm's unit may be related to multiple karma dashboards 392 | if self.name in self.charm.model.relations: 393 | for relation in self.charm.model.relations[self.name]: 394 | relation.data[self.charm.unit].update(self._stored.config) 395 | else: 396 | # update relation data only for the newly joined relation 397 | event.relation.data[self.charm.unit].update(self._stored.config) 398 | -------------------------------------------------------------------------------- /src/charm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2021 Canonical Ltd. 3 | # See LICENSE file for licensing details. 4 | 5 | """A Juju charm for alertmanager.""" 6 | 7 | import hashlib 8 | import logging 9 | import socket 10 | from typing import List, Optional, cast 11 | 12 | import yaml 13 | from charms.alertmanager_k8s.v0.alertmanager_dispatch import AlertmanagerProvider 14 | from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider 15 | from charms.grafana_k8s.v0.grafana_source import GrafanaSourceProvider 16 | from charms.karma_k8s.v0.karma_dashboard import KarmaProvider 17 | from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch 18 | from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider 19 | from ops.charm import ActionEvent, CharmBase 20 | from ops.framework import StoredState 21 | from ops.main import main 22 | from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, Relation 23 | from ops.pebble import Layer, PathError, ProtocolError 24 | 25 | from alertmanager_client import Alertmanager, AlertmanagerBadResponse 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def sha256(hashable) -> str: 31 | """Use instead of the builtin hash() for repeatable values.""" 32 | if isinstance(hashable, str): 33 | hashable = hashable.encode("utf-8") 34 | return hashlib.sha256(hashable).hexdigest() 35 | 36 | 37 | class ConfigUpdateFailure(RuntimeError): 38 | """Custom exception for failed config updates.""" 39 | 40 | 41 | class AlertmanagerCharm(CharmBase): 42 | """A Juju charm for alertmanager. 
43 | 44 | Attributes: 45 | api: an API client instance for communicating with the alertmanager workload 46 | server 47 | """ 48 | 49 | # Container name is automatically determined from charm name 50 | # Layer name is used for the layer label argument in container.add_layer 51 | # Service name matches charm name for consistency 52 | _container_name = _layer_name = _service_name = "alertmanager" 53 | _relation_name = "alerting" 54 | _peer_relation_name = "replicas" # must match metadata.yaml peer role name 55 | _api_port = 9093 # port to listen on for the web interface and API 56 | _ha_port = 9094 # port for HA-communication between multiple instances of alertmanager 57 | 58 | # path, inside the workload container, to the alertmanager and amtool configuration files 59 | # the amalgamated templates file goes in the same folder as the main configuration file 60 | _config_path = "/etc/alertmanager/alertmanager.yml" 61 | _templates_path = "/etc/alertmanager/templates.tmpl" 62 | _amtool_config_path = "/etc/amtool/config.yml" 63 | 64 | # path, inside the workload container for alertmanager data, e.g. 'nflogs', 'silences'. 65 | _storage_path = "/alertmanager" 66 | 67 | _stored = StoredState() 68 | 69 | def __init__(self, *args): 70 | super().__init__(*args) 71 | self._stored.set_default(config_hash=None, launched_with_peers=False) 72 | self.api = Alertmanager(port=self._api_port) 73 | 74 | self.alertmanager_provider = AlertmanagerProvider( 75 | self, self._relation_name, self._api_port 76 | ) 77 | self.grafana_dashboard_provider = GrafanaDashboardProvider(charm=self) 78 | self.grafana_source_provider = GrafanaSourceProvider( 79 | charm=self, 80 | source_type="alertmanager", 81 | source_url=self.api_address, 82 | ) 83 | self.karma_provider = KarmaProvider(self, "karma-dashboard") 84 | 85 | self.service_patcher = KubernetesServicePatch( 86 | self, 87 | [ 88 | (f"{self.app.name}", self._api_port, self._api_port), 89 | (f"{self.app.name}-ha", self._ha_port, self._ha_port), 90 | ], 91 | ) 92 | 93 | # Self-monitoring 94 | self._scraping = MetricsEndpointProvider( 95 | self, 96 | relation_name="self-metrics-endpoint", 97 | jobs=[{"static_configs": [{"targets": [f"*:{self._api_port}"]}]}], 98 | ) 99 | 100 | self.container = self.unit.get_container(self._container_name) 101 | 102 | # Core lifecycle events 103 | self.framework.observe(self.on.config_changed, self._on_config_changed) 104 | self.framework.observe(self.on.alertmanager_pebble_ready, self._on_pebble_ready) 105 | self.framework.observe(self.on.start, self._on_start) 106 | self.framework.observe(self.on.update_status, self._on_update_status) 107 | self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm) 108 | 109 | # Peer relation events 110 | self.framework.observe( 111 | self.on[self._peer_relation_name].relation_joined, self._on_peer_relation_joined 112 | ) 113 | self.framework.observe( 114 | self.on[self._peer_relation_name].relation_changed, self._on_peer_relation_changed 115 | ) 116 | 117 | # Action events 118 | self.framework.observe(self.on.show_config_action, self._on_show_config_action) 119 | 120 | def _on_show_config_action(self, event: ActionEvent): 121 | """Hook for the show-config action.""" 122 | event.log(f"Fetching {self._config_path}") 123 | if not self.container.can_connect(): 124 | event.fail("Container not ready") 125 | 126 | try: 127 | content = self.container.pull(self._config_path) 128 | # juju requires keys to be lowercase alphanumeric (can't use self._config_path) 129 | event.set_results({"path": 
self._config_path, "content": content.read()}) 130 | except (ProtocolError, PathError) as e: 131 | event.fail(str(e)) 132 | 133 | @property 134 | def api_port(self) -> int: 135 | """Get the API port number to use for alertmanager (default: 9093).""" 136 | return self._api_port 137 | 138 | @property 139 | def peer_relation(self) -> Optional["Relation"]: 140 | """Helper function for obtaining the peer relation object. 141 | 142 | Returns: peer relation object 143 | (NOTE: would return None if called too early, e.g. during install). 144 | """ 145 | return self.model.get_relation(self._peer_relation_name) 146 | 147 | def _alertmanager_layer(self) -> Layer: 148 | """Returns Pebble configuration layer for alertmanager.""" 149 | 150 | def _command(): 151 | """Returns full command line to start alertmanager.""" 152 | peer_addresses = self._get_peer_addresses() 153 | 154 | # cluster listen address - empty string disables HA mode 155 | listen_address_arg = "" if len(peer_addresses) == 0 else f"0.0.0.0:{self._ha_port}" 156 | 157 | # The chosen port in the cluster.listen-address flag is the port that needs to be 158 | # specified in the cluster.peer flag of the other peers. 159 | # Assuming all replicas use the same port. 160 | # Sorting for repeatability in comparing between service layers. 161 | peer_cmd_args = " ".join( 162 | sorted([f"--cluster.peer={address}" for address in peer_addresses]) 163 | ) 164 | return ( 165 | f"alertmanager " 166 | f"--config.file={self._config_path} " 167 | f"--storage.path={self._storage_path} " 168 | f"--web.listen-address=:{self._api_port} " 169 | f"--cluster.listen-address={listen_address_arg} " 170 | f"{peer_cmd_args}" 171 | ) 172 | 173 | return Layer( 174 | { 175 | "summary": "alertmanager layer", 176 | "description": "pebble config layer for alertmanager", 177 | "services": { 178 | self._service_name: { 179 | "override": "replace", 180 | "summary": "alertmanager service", 181 | "command": _command(), 182 | "startup": "enabled", 183 | } 184 | }, 185 | } 186 | ) 187 | 188 | def _restart_service(self) -> bool: 189 | """Helper function for restarting the underlying service. 190 | 191 | Returns: 192 | True if restart succeeded; False otherwise. 193 | """ 194 | logger.info("Restarting service %s", self._service_name) 195 | 196 | if not self.container.can_connect(): 197 | logger.error("Cannot (re)start service: container is not ready.") 198 | return False 199 | 200 | # Check if service exists, to avoid ModelError from being raised when the service does 201 | # not exist, 202 | if not self.container.get_plan().services.get(self._service_name): 203 | logger.error("Cannot (re)start service: service does not (yet) exist.") 204 | return False 205 | 206 | self.container.restart(self._service_name) 207 | 208 | # Update "launched with peers" flag. 209 | # The service should be restarted when peers joined if this is False. 210 | plan = self.container.get_plan() 211 | service = plan.services.get(self._service_name) 212 | self._stored.launched_with_peers = "--cluster.peer" in service.command 213 | 214 | return True 215 | 216 | def _update_layer(self, restart: bool) -> bool: 217 | """Update service layer to reflect changes in peers (replicas). 218 | 219 | Args: 220 | restart: a flag indicating if the service should be restarted if a change was detected. 
221 | 222 | Returns: 223 | True if anything changed; False otherwise 224 | """ 225 | overlay = self._alertmanager_layer() 226 | plan = self.container.get_plan() 227 | 228 | if self._service_name not in plan.services or overlay.services != plan.services: 229 | self.container.add_layer(self._layer_name, overlay, combine=True) 230 | 231 | if restart: 232 | self._restart_service() 233 | 234 | return True 235 | 236 | return False 237 | 238 | @property 239 | def _default_config(self) -> dict: 240 | return { 241 | "global": {"http_config": {"tls_config": {"insecure_skip_verify": True}}}, 242 | "route": { 243 | "group_wait": "30s", 244 | "group_interval": "5m", 245 | "repeat_interval": "1h", 246 | "receiver": "dummy", 247 | }, 248 | "receivers": [ 249 | {"name": "dummy", "webhook_configs": [{"url": "http://127.0.0.1:5001/"}]} 250 | ], 251 | } 252 | 253 | def _update_config(self) -> None: 254 | """Update alertmanager.yml config file to reflect changes in configuration. 255 | 256 | After pushing a new config, a hot-reload is attempted. If hot-reload fails, the service is 257 | restarted. 258 | 259 | Raises: 260 | ConfigUpdateFailure, if failed to update configuration file. 261 | """ 262 | # update amtool config file 263 | amtool_config = yaml.safe_dump({"alertmanager.url": f"http://localhost:{self.api_port}"}) 264 | self.container.push(self._amtool_config_path, amtool_config, make_dirs=True) 265 | 266 | # if no config provided, use default config with a dummy receiver 267 | config = yaml.safe_load(self.config["config_file"]) or self._default_config 268 | 269 | if config.get("templates", []): 270 | logger.error( 271 | "alertmanager config file must not have a 'templates' section; " 272 | "use the 'templates' config option instead." 273 | ) 274 | raise ConfigUpdateFailure( 275 | "Invalid config file: use charm's 'templates' config option instead" 276 | ) 277 | 278 | # add templates, if any 279 | if templates := self.config["templates_file"]: 280 | config["templates"] = [f"{self._templates_path}"] 281 | self.container.push(self._templates_path, templates, make_dirs=True) 282 | 283 | # add juju topology to "group_by" 284 | route = cast(dict, config.get("route", {})) 285 | route["group_by"] = list( 286 | set(route.get("group_by", [])).union( 287 | ["juju_application", "juju_model", "juju_model_uuid"] 288 | ) 289 | ) 290 | config["route"] = route 291 | 292 | config_yaml = yaml.safe_dump(config) 293 | config_hash = sha256(config_yaml) 294 | 295 | if config_hash == self._stored.config_hash: 296 | logger.debug("no change in config") 297 | return 298 | 299 | logger.debug("config changed") 300 | self._push_config_and_reload(config_yaml) 301 | self._stored.config_hash = config_hash 302 | 303 | def _push_config_and_reload(self, config_yaml): 304 | """Push config into workload container, and trigger a hot-reload (or service restart). 305 | 306 | Args: 307 | config_yaml: contents of the new config file. 308 | 309 | Raises: 310 | ConfigUpdateFailure, if config update fails. 311 | """ 312 | self.container.push(self._config_path, config_yaml, make_dirs=True) 313 | 314 | # Obtain a "before" snapshot of the config from the server. 
315 | # This is different from `config` above because alertmanager adds in a bunch of details 316 | # such as: 317 | # 318 | # smtp_hello: localhost 319 | # smtp_require_tls: true 320 | # pagerduty_url: https://events.pagerduty.com/v2/enqueue 321 | # opsgenie_api_url: https://api.opsgenie.com/ 322 | # wechat_api_url: https://qyapi.weixin.qq.com/cgi-bin/ 323 | # victorops_api_url: https://alert.victorops.com/integrations/generic/20131114/alert/ 324 | # 325 | # The snapshot is needed to determine if reloading took place. 326 | try: 327 | config_from_server_before = self.api.config() 328 | except AlertmanagerBadResponse: 329 | config_from_server_before = None 330 | 331 | # Send an HTTP POST to alertmanager to hot-reload the config. 332 | # This reduces down-time compared to restarting the service. 333 | try: 334 | self.api.reload() 335 | except AlertmanagerBadResponse as e: 336 | logger.warning("config reload via HTTP POST failed: %s", str(e)) 337 | # hot-reload failed so attempting a service restart 338 | if not self._restart_service(): 339 | raise ConfigUpdateFailure( 340 | "Is config valid? hot reload and service restart failed." 341 | ) 342 | 343 | # Obtain an "after" snapshot of the config from the server. 344 | try: 345 | config_from_server_after = self.api.config() 346 | except AlertmanagerBadResponse: 347 | config_from_server_after = None 348 | 349 | if config_from_server_before is None or config_from_server_after is None: 350 | logger.warning("cannot determine if reload succeeded") 351 | elif config_from_server_before == config_from_server_after: 352 | logger.warning("config remained the same after a reload") 353 | 354 | @property 355 | def api_address(self): 356 | """Returns the API address (including scheme and port) of the alertmanager server.""" 357 | return f"http://{socket.getfqdn()}:{self.api_port}" 358 | 359 | def _common_exit_hook(self) -> None: 360 | """Event processing hook that is common to all events to ensure idempotency.""" 361 | if not self.container.can_connect(): 362 | self.unit.status = MaintenanceStatus("Waiting for pod startup to complete") 363 | return 364 | 365 | # In the case of a single unit deployment, no 'RelationJoined' event is emitted, so 366 | # setting IP here. 367 | # Store private address in unit's peer relation data bucket. This is still needed because 368 | # the "private-address" field in the data bag is being populated incorrectly. 369 | # Also, ip address may still be None even after RelationJoinedEvent, for which 370 | # "ops.model.RelationDataError: relation data values must be strings" would be emitted. 
371 | if self.peer_relation: 372 | self.peer_relation.data[self.unit]["private_address"] = socket.getfqdn() 373 | 374 | self.alertmanager_provider.update_relation_data() 375 | if karma_address := self.api_address: 376 | self.karma_provider.target = karma_address 377 | 378 | # Update pebble layer 379 | layer_changed = self._update_layer(restart=False) 380 | 381 | service_running = ( 382 | service := self.container.get_service(self._service_name) 383 | ) and service.is_running() 384 | 385 | num_peers = len(rel.units) if (rel := self.peer_relation) else 0 386 | 387 | if layer_changed and ( 388 | not service_running or (num_peers > 0 and not self._stored.launched_with_peers) 389 | ): 390 | self._restart_service() 391 | 392 | # Update config file 393 | try: 394 | self._update_config() 395 | except ConfigUpdateFailure as e: 396 | self.unit.status = BlockedStatus(str(e)) 397 | return 398 | 399 | self.unit.status = ActiveStatus() 400 | 401 | def _on_pebble_ready(self, _): 402 | """Event handler for PebbleReadyEvent.""" 403 | self._common_exit_hook() 404 | 405 | def _on_config_changed(self, _): 406 | """Event handler for ConfigChangedEvent.""" 407 | self._common_exit_hook() 408 | 409 | def _on_start(self, _): 410 | """Event handler for StartEvent. 411 | 412 | With Juju 2.9.5 encountered a scenario in which pebble_ready and config_changed fired, 413 | but IP address was not available and the status was stuck on "Waiting for IP address". 414 | Adding this hook reduce the likelihood of that scenario. 415 | """ 416 | self._common_exit_hook() 417 | 418 | def _on_peer_relation_joined(self, _): 419 | """Event handler for replica's RelationChangedEvent.""" 420 | self._common_exit_hook() 421 | 422 | def _on_peer_relation_changed(self, _): 423 | """Event handler for replica's RelationChangedEvent. 424 | 425 | `relation_changed` is needed in addition to `relation_joined` because when a second unit 426 | joins, the first unit must be restarted and provided with the second unit's IP address. 427 | when the first unit sees "joined", it is not guaranteed that the second unit already has 428 | an IP address. 429 | """ 430 | self._common_exit_hook() 431 | 432 | def _on_update_status(self, _): 433 | """Event handler for UpdateStatusEvent. 434 | 435 | Logs list of peers, uptime and version info. 436 | """ 437 | try: 438 | status = self.api.status() 439 | logger.info( 440 | "alertmanager %s is up and running (uptime: %s); " 441 | "cluster mode: %s, with %d peers", 442 | status["versionInfo"]["version"], 443 | status["uptime"], 444 | status["cluster"]["status"], 445 | len(status["cluster"]["peers"]), 446 | ) 447 | except AlertmanagerBadResponse as e: 448 | logger.error("Failed to obtain status: %s", str(e)) 449 | 450 | # Calling the common hook to make sure a single unit set its IP in case all events fired 451 | # before an IP address was ready, leaving UpdateStatue as the last resort. 452 | self._common_exit_hook() 453 | 454 | def _on_upgrade_charm(self, _): 455 | """Event handler for replica's UpgradeCharmEvent.""" 456 | # update config hash 457 | self._stored.config_hash = ( 458 | "" 459 | if not self.container.can_connect() 460 | else sha256(yaml.safe_dump(yaml.safe_load(self.container.pull(self._config_path)))) 461 | ) 462 | 463 | # After upgrade (refresh), the unit ip address is not guaranteed to remain the same, and 464 | # the config may need update. Calling the common hook to update. 
465 | self._common_exit_hook() 466 | 467 | def _get_peer_addresses(self) -> List[str]: 468 | """Create a list of HA addresses of all peer units (all units excluding current). 469 | 470 | The returned addresses include the HA port number but do not include scheme (http). 471 | If a unit does not have an address, it will be omitted from the list. 472 | """ 473 | addresses = [] 474 | if pr := self.peer_relation: 475 | addresses = [ 476 | f"{address}:{self._ha_port}" 477 | for unit in pr.units # pr.units only holds peers (self.unit is not included) 478 | if (address := pr.data[unit].get("private_address")) 479 | ] 480 | 481 | return addresses 482 | 483 | 484 | if __name__ == "__main__": 485 | main(AlertmanagerCharm, use_juju_for_storage=True) 486 | -------------------------------------------------------------------------------- /lib/charms/grafana_k8s/v0/grafana_source.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Canonical Ltd. 2 | # See LICENSE file for licensing details. 3 | 4 | """## Overview. 5 | 6 | This document explains how to integrate with the Grafana charm 7 | for the purpose of providing a datasource which can be used by 8 | Grafana dashboards. It also explains the structure of the data 9 | expected by the `grafana-source` interface, and may provide a 10 | mechanism or reference point for providing a compatible interface 11 | or library by providing a definitive reference guide to the 12 | structure of relation data which is shared between the Grafana 13 | charm and any charm providing datasource information. 14 | 15 | ## Provider Library Usage 16 | 17 | The Grafana charm interacts with its datasources using its charm 18 | library. The goal of this library is to be as simple to use as 19 | possible, and instantiation of the class with or without changing 20 | the default arguments provides a complete use case. For the simplest 21 | use case of a Prometheus (or Prometheus-compatible) datasource 22 | provider in a charm which `provides: grafana-source`, creation of a 23 | `GrafanaSourceProvider` object with the default arguments is sufficient. 24 | 25 | The default arguments are: 26 | 27 | `charm`: `self` from the charm instantiating this library 28 | `source_type`: None 29 | `source_port`: None 30 | `source_url`: None 31 | `relation_name`: grafana-source 32 | `refresh_event`: A `PebbleReady` event from `charm`, used to refresh 33 | the IP address sent to Grafana on a charm lifecycle event or 34 | pod restart 35 | 36 | The value of `source_url` should be a fully-resolvable URL for a valid Grafana 37 | source, e.g., `http://example.com/api` or similar. 38 | 39 | If your configuration requires any changes from these defaults, they 40 | may be set from the class constructor. It may be instantiated as 41 | follows: 42 | 43 | from charms.grafana_k8s.v0.grafana_source import GrafanaSourceProvider 44 | 45 | class FooCharm: 46 | def __init__(self, *args): 47 | super().__init__(*args, **kwargs) 48 | ... 49 | self.grafana_source_provider = GrafanaSourceProvider( 50 | self, source_type="prometheus", source_port="9090" 51 | ) 52 | ... 53 | 54 | The first argument (`self`) should be a reference to the parent (datasource) 55 | charm, as this charm's model will be used for relation data, IP addresses, 56 | and lifecycle events. 
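If the datasource exposes a complete URL rather than just a port, or if its address
should be refreshed on a workload restart, the remaining constructor arguments can be
used instead. A hedged sketch (the pebble-ready event name is illustrative):

    self.grafana_source_provider = GrafanaSourceProvider(
        self,
        source_type="alertmanager",
        source_url="http://some-host.example.com:9093",
        refresh_event=self.on.workload_pebble_ready,
    )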
57 | 58 | An instantiated `GrafanaSourceProvider` will ensure that each unit of its 59 | parent charm is added as a datasource in the Grafana configuration once a 60 | relation is established, using the [Grafana datasource provisioning]( 61 | https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources) 62 | specification via YAML files. 63 | 64 | This information is added to the relation data for the charms as serialized JSON 65 | from a dict, with a structure of: 66 | ``` 67 | { 68 | "application": { 69 | "model": charm.model.name, # from `charm` in the constructor 70 | "model_uuid": charm.model.uuid, 71 | "application": charm.model.app.name, 72 | "type": source_type, 73 | }, 74 | "unit/0": { 75 | "uri": {ip_address}:{port}{path} # `ip_address` is derived at runtime, `port` from the constructor, 76 | # and `path` from the constructor, if specified 77 | }, 78 | ``` 79 | 80 | This is ingested by :class:`GrafanaSourceConsumer`, and is sufficient for configuration. 81 | 82 | 83 | ## Consumer Library Usage 84 | 85 | The `GrafanaSourceConsumer` object may be used by Grafana 86 | charms to manage relations with available datasources. For this 87 | purpose, a charm consuming Grafana datasource information should do 88 | the following things: 89 | 90 | 1. Instantiate the `GrafanaSourceConsumer` object by providing it a 91 | reference to the parent (Grafana) charm and, optionally, the name of 92 | the relation that the Grafana charm uses to interact with datasources. 93 | This relation must confirm to the `grafana-source` interface. 94 | 95 | For example a Grafana charm may instantiate the 96 | `GrafanaSourceConsumer` in its constructor as follows 97 | 98 | from charms.grafana_k8s.v0.grafana_source import GrafanaSourceConsumer 99 | 100 | def __init__(self, *args): 101 | super().__init__(*args) 102 | ... 103 | self.grafana_source_consumer = GrafanaSourceConsumer(self) 104 | ... 105 | 106 | 2. A Grafana charm also needs to listen to the 107 | `GrafanaSourceEvents` events emitted by the `GrafanaSourceConsumer` 108 | by adding itself as an observer for these events: 109 | 110 | self.framework.observe( 111 | self.grafana_source_consumer.on.sources_changed, 112 | self._on_sources_changed, 113 | ) 114 | self.framework.observe( 115 | self.grafana_source_consumer.on.sources_to_delete_changed, 116 | self._on_sources_to_delete_change, 117 | ) 118 | 119 | The reason for two separate events is that Grafana keeps track of 120 | removed datasources in its [datasource provisioning]( 121 | https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources). 122 | 123 | If your charm is merely implementing a `grafana-source`-compatible API, 124 | and is does not follow exactly the same semantics as Grafana, observing these 125 | events may not be needed. 
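For completeness, a hedged sketch of what those handlers might do in the Grafana
charm; `_configure_sources` is a hypothetical helper that stands in for re-rendering
the datasource provisioning YAML and reloading the workload:

    def _on_sources_changed(self, event):
        # Newly related datasources should be (re)provisioned.
        self._configure_sources(event)

    def _on_sources_to_delete_change(self, event):
        # Removed datasources are recorded so Grafana can be told to forget them
        # (via a "deleteDatasources" provisioning entry).
        self._configure_sources(event)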
126 | """ 127 | 128 | import json 129 | import logging 130 | import re 131 | import socket 132 | from typing import Any, Dict, List, Optional, Union 133 | 134 | from ops.charm import ( 135 | CharmBase, 136 | CharmEvents, 137 | RelationChangedEvent, 138 | RelationDepartedEvent, 139 | RelationEvent, 140 | RelationJoinedEvent, 141 | RelationRole, 142 | ) 143 | from ops.framework import ( 144 | BoundEvent, 145 | EventBase, 146 | EventSource, 147 | Object, 148 | ObjectEvents, 149 | StoredDict, 150 | StoredList, 151 | StoredState, 152 | ) 153 | from ops.model import Relation 154 | 155 | # The unique Charmhub library identifier, never change it 156 | LIBID = "974705adb86f40228298156e34b460dc" 157 | 158 | # Increment this major API version when introducing breaking changes 159 | LIBAPI = 0 160 | 161 | # Increment this PATCH version before using `charmcraft publish-lib` or reset 162 | # to 0 if you are raising the major API version 163 | LIBPATCH = 11 164 | 165 | logger = logging.getLogger(__name__) 166 | 167 | DEFAULT_RELATION_NAME = "grafana-source" 168 | DEFAULT_PEER_NAME = "grafana" 169 | RELATION_INTERFACE_NAME = "grafana_datasource" 170 | 171 | 172 | def _type_convert_stored(obj): 173 | """Convert Stored* to their appropriate types, recursively.""" 174 | if isinstance(obj, StoredList): 175 | return list(map(_type_convert_stored, obj)) 176 | elif isinstance(obj, StoredDict): 177 | rdict = {} 178 | for k in obj.keys(): 179 | rdict[k] = _type_convert_stored(obj[k]) 180 | return rdict 181 | else: 182 | return obj 183 | 184 | 185 | class RelationNotFoundError(Exception): 186 | """Raised if there is no relation with the given name.""" 187 | 188 | def __init__(self, relation_name: str): 189 | self._relation_name = relation_name 190 | self.message = "No relation named '{}' found".format(relation_name) 191 | 192 | super().__init__(self.message) 193 | 194 | 195 | class RelationInterfaceMismatchError(Exception): 196 | """Raised if the relation with the given name has a different interface.""" 197 | 198 | def __init__( 199 | self, 200 | relation_name: str, 201 | expected_relation_interface: str, 202 | actual_relation_interface: str, 203 | ): 204 | self._relation_name = relation_name 205 | self.expected_relation_interface = expected_relation_interface 206 | self.actual_relation_interface = actual_relation_interface 207 | self.message = ( 208 | "The '{}' relation has '{}' as " 209 | "interface rather than the expected '{}'".format( 210 | relation_name, actual_relation_interface, expected_relation_interface 211 | ) 212 | ) 213 | 214 | super().__init__(self.message) 215 | 216 | 217 | class RelationRoleMismatchError(Exception): 218 | """Raised if the relation with the given name has a different direction.""" 219 | 220 | def __init__( 221 | self, 222 | relation_name: str, 223 | expected_relation_role: RelationRole, 224 | actual_relation_role: RelationRole, 225 | ): 226 | self._relation_name = relation_name 227 | self.expected_relation_interface = expected_relation_role 228 | self.actual_relation_role = actual_relation_role 229 | self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( 230 | relation_name, repr(actual_relation_role), repr(expected_relation_role) 231 | ) 232 | 233 | super().__init__(self.message) 234 | 235 | 236 | def _validate_relation_by_interface_and_direction( 237 | charm: CharmBase, 238 | relation_name: str, 239 | expected_relation_interface: str, 240 | expected_relation_role: RelationRole, 241 | ) -> None: 242 | """Verifies that a relation has the necessary 
characteristics.
243 | 
244 |     Verifies that the `relation_name` provided: (1) exists in metadata.yaml,
245 |     (2) declares as interface the interface name passed as `relation_interface`
246 |     and (3) has the right "direction", i.e., it is a relation that `charm`
247 |     provides or requires.
248 | 
249 |     Args:
250 |         charm: a `CharmBase` object to scan for the matching relation.
251 |         relation_name: the name of the relation to be verified.
252 |         expected_relation_interface: the interface name to be matched by the
253 |             relation named `relation_name`.
254 |         expected_relation_role: whether the `relation_name` must be either
255 |             provided or required by `charm`.
256 |     """
257 |     if relation_name not in charm.meta.relations:
258 |         raise RelationNotFoundError(relation_name)
259 | 
260 |     relation = charm.meta.relations[relation_name]
261 | 
262 |     actual_relation_interface = relation.interface_name
263 |     if actual_relation_interface != expected_relation_interface:
264 |         raise RelationInterfaceMismatchError(
265 |             relation_name, expected_relation_interface, actual_relation_interface
266 |         )
267 | 
268 |     if expected_relation_role == RelationRole.provides:
269 |         if relation_name not in charm.meta.provides:
270 |             raise RelationRoleMismatchError(
271 |                 relation_name, RelationRole.provides, RelationRole.requires
272 |             )
273 |     elif expected_relation_role == RelationRole.requires:
274 |         if relation_name not in charm.meta.requires:
275 |             raise RelationRoleMismatchError(
276 |                 relation_name, RelationRole.requires, RelationRole.provides
277 |             )
278 |     else:
279 |         raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role))
280 | 
281 | 
282 | class SourceFieldsMissingError(Exception):
283 |     """An exception to indicate that there are missing fields in a Grafana datasource definition."""
284 | 
285 |     pass
286 | 
287 | 
288 | class GrafanaSourcesChanged(EventBase):
289 |     """Event emitted when Grafana sources change."""
290 | 
291 |     def __init__(self, handle, data=None):
292 |         super().__init__(handle)
293 |         self.data = data
294 | 
295 |     def snapshot(self) -> Dict:
296 |         """Save grafana source information."""
297 |         return {"data": self.data}
298 | 
299 |     def restore(self, snapshot) -> None:
300 |         """Restore grafana source information."""
301 |         self.data = snapshot["data"]
302 | 
303 | 
304 | class GrafanaSourceEvents(ObjectEvents):
305 |     """Events raised by :class:`GrafanaSourceEvents`."""
306 | 
307 |     # We are emitting multiple events for the same thing due to the way Grafana provisions
308 |     # datasources. There is no "convenient" way to tell Grafana to remove them outside of
309 |     # setting a separate "deleteDatasources" key in the configuration file to tell Grafana
310 |     # to forget about them, and the reasons why sources_to_delete -> deleteDatasources
311 |     # would be emitted is intrinsically linked to the sources themselves.
312 |     sources_changed = EventSource(GrafanaSourcesChanged)
313 |     sources_to_delete_changed = EventSource(GrafanaSourcesChanged)
314 | 
315 | 
316 | class GrafanaSourceProvider(Object):
317 |     """A provider object for Grafana datasources."""
318 | 
319 |     def __init__(
320 |         self,
321 |         charm: CharmBase,
322 |         source_type: str,
323 |         source_port: Optional[str] = "",
324 |         source_url: Optional[str] = "",
325 |         refresh_event: Optional[BoundEvent] = None,
326 |         relation_name: str = DEFAULT_RELATION_NAME,
327 |         extra_fields: Optional[dict] = None,
328 |     ) -> None:
329 |         """Construct a Grafana charm client.
330 | 
331 |         The :class:`GrafanaSourceProvider` object provides an interface
332 |         to Grafana. This interface supports providing additional
333 |         sources for Grafana to monitor. For example, if a charm
334 |         exposes some metrics which are consumable by an ingestor
335 |         (such as Prometheus), then an additional source can be added
336 |         by instantiating a :class:`GrafanaSourceProvider` object
337 |         as follows:
338 | 
339 |             self.grafana_source_provider = GrafanaSourceProvider(
340 |                 self,
341 |                 source_type="prometheus",
342 |                 source_port="9090",
343 |             )
344 | 
345 |         Args:
346 |             charm: a :class:`CharmBase` object which manages this
347 |                 :class:`GrafanaSourceProvider` object. Generally this is
348 |                 `self` in the instantiating class.
349 |             source_type: an optional (default `prometheus`) source type
350 |                 required for Grafana configuration. The value must match
351 |                 the DataSource type from the Grafana perspective.
352 |             source_port: an optional (default `9090`) source port
353 |                 required for Grafana configuration.
354 |             source_url: an optional source URL which can be used, for example, if
355 |                 ingress for a source is enabled, or a URL path to the API consumed
356 |                 by the datasource must be specified for another reason. If set,
357 |                 'source_port' will not be used.
358 |             relation_name: string name of the relation that provides the
359 |                 Grafana source service. It is strongly advised not to change
360 |                 the default, so that people deploying your charm will have a
361 |                 consistent experience with all other charms that provide
362 |                 Grafana datasources.
363 |             refresh_event: a :class:`CharmEvents` event on which the IP
364 |                 address should be refreshed in case of pod or
365 |                 machine/VM restart.
366 |             extra_fields: a dict which is used for additional information required
367 |                 for some datasources in the `jsonData` field
368 |         """
369 |         _validate_relation_by_interface_and_direction(
370 |             charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides
371 |         )
372 | 
373 |         super().__init__(charm, relation_name)
374 |         self._charm = charm
375 |         self._relation_name = relation_name
376 |         events = self._charm.on[relation_name]
377 | 
378 |         self._source_type = source_type
379 |         if source_type == "alertmanager":
380 |             if not extra_fields:
381 |                 extra_fields = {"implementation": "prometheus"}
382 |             elif not extra_fields.get("implementation", None):
383 |                 extra_fields["implementation"] = "prometheus"
384 | 
385 |         self._extra_fields = extra_fields
386 | 
387 |         if not refresh_event:
388 |             if len(self._charm.meta.containers) == 1:
389 |                 container = list(self._charm.meta.containers.values())[0]
390 |                 refresh_event = self._charm.on[container.name.replace("-", "_")].pebble_ready
391 | 
392 |         if source_port and source_url:
393 |             logger.warning(
394 |                 "Both `source_port` and `source_url` were specified! Using "
395 |                 "`source_url` as the address."
396 |             )
397 | 
398 |         if source_url and not re.match(r"^\w+://", source_url):
399 |             logger.warning(
400 |                 "'source_url' should start with a scheme, such as "
401 |                 "'http://'. Assuming 'http://' since none is present."
402 |             )
403 |             source_url = "http://{}".format(source_url)
404 | 
405 |         self._source_port = source_port
406 |         self._source_url = source_url
407 | 
408 |         self.framework.observe(events.relation_joined, self._set_sources_from_event)
409 |         if refresh_event:
410 |             self.framework.observe(refresh_event, self._set_unit_details)
411 | 
412 |     def update_source(self, source_url: Optional[str] = ""):
413 |         """Trigger the update of relation data."""
414 |         if source_url:
415 |             self._source_url = source_url
416 | 
417 |         rel = self._charm.model.get_relation(self._relation_name)
418 | 
419 |         if not rel:
420 |             return
421 | 
422 |         self._set_sources(rel)
423 | 
424 |     def _set_sources_from_event(self, event: RelationJoinedEvent) -> None:
425 |         """Get a `Relation` object from the event to pass on."""
426 |         self._set_sources(event.relation)
427 | 
428 |     def _set_sources(self, rel: Relation):
429 |         """Inform the consumer about the source configuration."""
430 |         self._set_unit_details(rel)
431 | 
432 |         if not self._charm.unit.is_leader():
433 |             return
434 | 
435 |         logger.debug("Setting Grafana data sources: %s", self._scrape_data)
436 |         rel.data[self._charm.app]["grafana_source_data"] = json.dumps(self._scrape_data)
437 | 
438 |     @property
439 |     def _scrape_data(self) -> Dict:
440 |         """Generate source metadata.
441 | 
442 |         Returns:
443 |             Source configuration data for Grafana.
444 |         """
445 |         data = {
446 |             "model": str(self._charm.model.name),
447 |             "model_uuid": str(self._charm.model.uuid),
448 |             "application": str(self._charm.model.app.name),
449 |             "type": self._source_type,
450 |             "extra_fields": self._extra_fields,
451 |         }
452 |         return data
453 | 
454 |     def _set_unit_details(self, _: Union[BoundEvent, RelationEvent, Relation]):
455 |         """Set unit host details.
456 | 
457 |         Each time a provider charm container is restarted it updates its own host address in the
458 |         unit relation data for the Grafana consumer.
459 |         """
460 |         for relation in self._charm.model.relations[self._relation_name]:
461 |             url = self._source_url or "{}:{}".format(socket.getfqdn(), self._source_port)
462 |             relation.data[self._charm.unit]["grafana_source_host"] = url
463 | 
464 | 
465 | class GrafanaSourceConsumer(Object):
466 |     """A consumer object for working with Grafana datasources."""
467 | 
468 |     on = GrafanaSourceEvents()
469 |     _stored = StoredState()
470 | 
471 |     def __init__(
472 |         self,
473 |         charm: CharmBase,
474 |         relation_name: str = DEFAULT_RELATION_NAME,
475 |     ) -> None:
476 |         """A Grafana-based monitoring service consumer, i.e., the charm that uses a datasource.
477 | 
478 |         Args:
479 |             charm: a :class:`CharmBase` instance that manages this
480 |                 instance of the Grafana source service.
481 |             relation_name: string name of the relation that provides the
482 |                 Grafana source service. It is strongly advised not to change
483 |                 the default, so that people deploying your charm will have a
484 |                 consistent experience with all other charms that provide
485 |                 Grafana datasources.
486 |         """
487 |         _validate_relation_by_interface_and_direction(
488 |             charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires
489 |         )
490 | 
491 |         super().__init__(charm, relation_name)
492 |         self._relation_name = relation_name
493 |         self._charm = charm
494 |         events = self._charm.on[relation_name]
495 | 
496 |         # We're stuck with this forever now so upgrades work, or until such point as we can
497 |         # break compatibility
498 |         self._stored.set_default(
499 |             sources=dict(),
500 |             sources_to_delete=set(),
501 |         )
502 | 
503 |         self.framework.observe(events.relation_changed, self._on_grafana_source_relation_changed)
504 |         self.framework.observe(events.relation_departed, self._on_grafana_source_relation_departed)
505 |         self.framework.observe(
506 |             self._charm.on[DEFAULT_PEER_NAME].relation_changed,
507 |             self._on_grafana_peer_changed,
508 |         )
509 | 
510 |     def _on_grafana_source_relation_changed(self, event: CharmEvents) -> None:
511 |         """Handle relation changes in related providers.
512 | 
513 |         If there are changes in relations between Grafana source consumers
514 |         and providers, this event handler (if the unit is the leader) will
515 |         get data from the incoming grafana-source relation, make the relation
516 |         data available in the app's datastore object, and notify observers
517 |         through a :class:`GrafanaSourcesChanged` event. This data is set using
518 |         Juju application topology.
519 | 
520 |         The Grafana charm can then respond to the event to update its
521 |         configuration.
522 |         """
523 |         if self._charm.unit.is_leader():
524 |             sources = {}
525 | 
526 |             for rel in self._charm.model.relations[self._relation_name]:
527 |                 source = self._get_source_config(rel)
528 |                 if source:
529 |                     sources[rel.id] = source
530 | 
531 |             self.set_peer_data("sources", sources)
532 | 
533 |         self.on.sources_changed.emit()
534 | 
535 |     def _on_grafana_peer_changed(self, _: RelationChangedEvent) -> None:
536 |         """Emit source events on peer events so secondary charm data updates."""
537 |         if self._charm.unit.is_leader():
538 |             return
539 |         self.on.sources_changed.emit()
540 |         self.on.sources_to_delete_changed.emit()
541 | 
542 |     def _get_source_config(self, rel: Relation):
543 |         """Generate configuration from data stored in relation data by providers."""
544 |         source_data = json.loads(rel.data[rel.app].get("grafana_source_data", "{}"))  # type: ignore
545 |         if not source_data:
546 |             return
547 | 
548 |         data = []
549 | 
550 |         sources_to_delete = self.get_peer_data("sources_to_delete")
551 |         for unit_name, host_addr in self._relation_hosts(rel).items():
552 |             unique_source_name = "juju_{}_{}_{}_{}".format(
553 |                 source_data["model"],
554 |                 source_data["model_uuid"],
555 |                 source_data["application"],
556 |                 unit_name.split("/")[1],
557 |             )
558 | 
559 |             host = (
560 |                 "http://{}".format(host_addr) if not re.match(r"^\w+://", host_addr) else host_addr
561 |             )
562 | 
563 |             host_data = {
564 |                 "unit": unit_name,
565 |                 "source_name": unique_source_name,
566 |                 "source_type": source_data["type"],
567 |                 "url": host,
568 |             }
569 |             if source_data.get("extra_fields", None):
570 |                 host_data["extra_fields"] = source_data.get("extra_fields")
571 | 
572 |             if host_data["source_name"] in sources_to_delete:
573 |                 sources_to_delete.remove(host_data["source_name"])
574 | 
575 |             data.append(host_data)
576 |         self.set_peer_data("sources_to_delete", list(sources_to_delete))
577 |         return data
578 | 
579 |     def _relation_hosts(self, rel: Relation) -> Dict:
580 |         """Fetch host names and addresses of all provider units for a single relation.
581 | 582 | Args: 583 | rel: An `ops.model.Relation` object for which the host name to 584 | address mapping is required. 585 | 586 | Returns: 587 | A dictionary that maps unit names to unit addresses for 588 | the specified relation. 589 | """ 590 | hosts = {} 591 | for unit in rel.units: 592 | host_address = rel.data[unit].get("grafana_source_host") 593 | if not host_address: 594 | continue 595 | hosts[unit.name] = host_address 596 | return hosts 597 | 598 | def _on_grafana_source_relation_departed(self, event: RelationDepartedEvent) -> None: 599 | """Update job config when providers depart. 600 | 601 | When a Grafana source provider departs, the configuration 602 | for that provider is removed from the list of sources jobs, 603 | added to a list of sources to remove, and other providers 604 | are informed through a :class:`GrafanaSourcesChanged` event. 605 | """ 606 | removed_source = False 607 | if self._charm.unit.is_leader(): 608 | removed_source = self._remove_source_from_datastore(event) 609 | 610 | if removed_source: 611 | self.on.sources_to_delete_changed.emit() 612 | 613 | def _remove_source_from_datastore(self, event: RelationDepartedEvent) -> bool: 614 | """Remove the grafana-source from the datastore. 615 | 616 | Add the name to the list of sources to remove when a relation is broken. 617 | 618 | Returns a boolean indicating whether an event should be emitted. 619 | """ 620 | rel_id = event.relation.id 621 | logger.debug("Removing all data for relation: {}".format(rel_id)) 622 | 623 | stored_sources = self.get_peer_data("sources") 624 | 625 | removed_source = stored_sources.pop(str(rel_id), None) 626 | if removed_source: 627 | if event.unit: 628 | # Remove one unit only 629 | dead_unit = [s for s in removed_source if s["unit"] == event.unit.name][0] 630 | self._remove_source(dead_unit["source_name"]) 631 | 632 | # Re-update the list of stored sources 633 | stored_sources[rel_id] = [ 634 | dict(s) for s in removed_source if s["unit"] != event.unit.name 635 | ] 636 | else: 637 | for host in removed_source: 638 | self._remove_source(host["source_name"]) 639 | 640 | self.set_peer_data("sources", stored_sources) 641 | return True 642 | return False 643 | 644 | def _remove_source(self, source_name: str) -> None: 645 | """Remove a datasource by name.""" 646 | sources_to_delete = self.get_peer_data("sources_to_delete") 647 | if source_name not in sources_to_delete: 648 | sources_to_delete.append(source_name) 649 | self.set_peer_data("sources_to_delete", sources_to_delete) 650 | 651 | def upgrade_keys(self) -> None: 652 | """On upgrade, ensure stored data maintains compatibility.""" 653 | # self._stored.sources may have hyphens instead of underscores in key names. 654 | # Make sure they reconcile. 
655 |         self._set_default_data()
656 |         sources = _type_convert_stored(self._stored.sources)
657 |         for rel_id in sources.keys():
658 |             for i in range(len(sources[rel_id])):
659 |                 sources[rel_id][i].update(
660 |                     {k.replace("-", "_"): v for k, v in sources[rel_id][i].items()}
661 |                 )
662 | 
663 |         # If there's stored data, merge it and purge it
664 |         if self._stored.sources:
665 |             self._stored.sources = {}
666 |             peer_sources = self.get_peer_data("sources")
667 |             sources.update(peer_sources)
668 |             self.set_peer_data("sources", sources)
669 | 
670 |         if self._stored.sources_to_delete:
671 |             old_sources_to_delete = _type_convert_stored(self._stored.sources_to_delete)
672 |             self._stored.sources_to_delete = set()
673 |             peer_sources_to_delete = set(self.get_peer_data("sources_to_delete"))
674 |             sources_to_delete = set.union(old_sources_to_delete, peer_sources_to_delete)
675 |             self.set_peer_data("sources_to_delete", list(sources_to_delete))  # sets are not JSON-serializable
676 | 
677 |     @property
678 |     def sources(self) -> List[dict]:
679 |         """Returns an array of sources the source_consumer knows about."""
680 |         sources = []
681 |         stored_sources = self.get_peer_data("sources")
682 |         for source in stored_sources.values():
683 |             sources.extend([host for host in _type_convert_stored(source)])
684 | 
685 |         return sources
686 | 
687 |     @property
688 |     def sources_to_delete(self) -> List[str]:
689 |         """Returns an array of source names which have been removed."""
690 |         return self.get_peer_data("sources_to_delete")
691 | 
692 |     def _set_default_data(self) -> None:
693 |         """Set defaults if they are not in peer relation data."""
694 |         data = {"sources": {}, "sources_to_delete": []}  # type: ignore
695 |         for k, v in data.items():
696 |             if not self.get_peer_data(k):
697 |                 self.set_peer_data(k, v)
698 | 
699 |     def set_peer_data(self, key: str, data: Any) -> None:
700 |         """Put information into the peer data bucket instead of `StoredState`."""
701 |         self._charm.peers.data[self._charm.app][key] = json.dumps(data)  # type: ignore
702 | 
703 |     def get_peer_data(self, key: str) -> Any:
704 |         """Retrieve information from the peer data bucket instead of `StoredState`."""
705 |         data = self._charm.peers.data[self._charm.app].get(key, "")  # type: ignore
706 |         return json.loads(data) if data else {}
707 | 
--------------------------------------------------------------------------------
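To round out the picture from the provider side, the following is a minimal, hypothetical sketch of how a datasource charm could wire up `GrafanaSourceProvider` from this library; the charm class, port, source type, and `external-url` option are illustrative assumptions rather than the wiring used by this repository's `src/charm.py`:

```
# Illustrative only: assumes metadata.yaml declares
#   provides:
#     grafana-source:
#       interface: grafana_datasource
# The port, source type, and "external-url" option are assumptions.
from charms.grafana_k8s.v0.grafana_source import GrafanaSourceProvider
from ops.charm import CharmBase
from ops.main import main


class ExampleDatasourceCharm(CharmBase):
    def __init__(self, *args):
        super().__init__(*args)
        # With no refresh_event given, the library falls back to the single
        # container's pebble-ready event to re-advertise the unit address.
        self.grafana_source_provider = GrafanaSourceProvider(
            self,
            source_type="alertmanager",  # must match a Grafana datasource type
            source_port="9093",
        )
        self.framework.observe(self.on.config_changed, self._on_config_changed)

    def _on_config_changed(self, _event):
        # If an externally reachable URL is configured (e.g. behind ingress),
        # push it to Grafana instead of the unit's FQDN and port.
        external_url = self.config.get("external-url", "")
        if external_url:
            self.grafana_source_provider.update_source(source_url=external_url)


if __name__ == "__main__":
    main(ExampleDatasourceCharm)
```

Note that if both `source_port` and `source_url` end up set, the library logs a warning and prefers `source_url` as the advertised address.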