├── .gitattributes ├── .github └── workflows │ ├── lint.yaml │ ├── nightly-build.yaml │ └── release.yaml ├── .gitignore ├── CHANGELOG.md ├── CODEOWNERS ├── LICENSE ├── Makefile ├── PUBLISH.md ├── README.gotmpl ├── README.md ├── artifacthub-repo.yaml ├── coder-observability ├── .helmignore ├── Chart.lock ├── Chart.yaml ├── runbooks │ ├── coderd.md │ ├── postgres.md │ └── provisionerd.md ├── templates │ ├── _collector-config.tpl │ ├── _helpers.tpl │ ├── configmap-collector.yaml │ ├── configmap-prometheus-alerts.yaml │ ├── configmap-runbooks.yaml │ ├── configmap-sql-exporter.yaml │ ├── dashboards │ │ ├── _dashboards_coderd.json.tpl │ │ ├── _dashboards_prebuilds.json.tpl │ │ ├── _dashboards_provisionerd.json.tpl │ │ ├── _dashboards_status.json.tpl │ │ ├── _dashboards_workspace_detail.json.tpl │ │ ├── _dashboards_workspaces.json.tpl │ │ ├── configmap-dashboards-coderd.yaml │ │ ├── configmap-dashboards-prebuilds.yaml │ │ ├── configmap-dashboards-provisionerd.yaml │ │ ├── configmap-dashboards-status.yaml │ │ ├── configmap-dashboards-workspace_detail.yaml │ │ └── configmap-dashboards-workspaces.yaml │ ├── service-runbook-viewer.yaml │ ├── statefulset-postgres-exporter.yaml │ ├── statefulset-runbook-viewer.yaml │ └── statefulset-sql-exporter.yaml └── values.yaml ├── compiled └── resources.yaml └── scripts ├── check-unstaged.sh ├── compile.sh ├── lib.sh ├── lint-rules.sh ├── publish.sh └── version.sh /.gitattributes: -------------------------------------------------------------------------------- 1 | *.tpl linguist-language=go -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | lint: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout code 15 | uses: actions/checkout@v4 16 | 17 | - name: Setup Go 18 | uses: 
actions/setup-go@v2 19 | with: 20 | go-version: 1.23 21 | 22 | - name: Install Helm 23 | uses: azure/setup-helm@v4 24 | with: 25 | version: v3.17.1 26 | 27 | - name: Install yq 28 | run: | 29 | sudo wget https://github.com/mikefarah/yq/releases/download/v4.42.1/yq_linux_amd64 -O /usr/bin/yq &&\ 30 | sudo chmod +x /usr/bin/yq 31 | 32 | - name: Lint Helm chart and rules 33 | run: make lint -------------------------------------------------------------------------------- /.github/workflows/nightly-build.yaml: -------------------------------------------------------------------------------- 1 | name: Nightly build 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | workflow_dispatch: # Allows manual triggering of the workflow 7 | 8 | jobs: 9 | nightly-build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v4 15 | 16 | - name: Setup Go 17 | uses: actions/setup-go@v2 18 | with: 19 | go-version: 1.22 20 | 21 | - name: Install Helm 22 | uses: azure/setup-helm@v4 23 | with: 24 | version: v3.17.1 25 | 26 | - name: Install yq 27 | run: | 28 | sudo wget https://github.com/mikefarah/yq/releases/download/v4.42.1/yq_linux_amd64 -O /usr/bin/yq &&\ 29 | sudo chmod +x /usr/bin/yq 30 | 31 | - name: make build 32 | run: | 33 | make build > output.log 2>&1 34 | continue-on-error: false 35 | 36 | - name: Upload script output 37 | uses: actions/upload-artifact@v4 38 | with: 39 | name: script-output 40 | path: output.log 41 | 42 | - name: Create issue from file on failure 43 | if: failure() 44 | uses: peter-evans/create-issue-from-file@v5 45 | with: 46 | title: nightly build failure 47 | content-filepath: output.log 48 | assignees: dannykopping -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | # GitHub release workflow. 
2 | name: publish-helm 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | permissions: 9 | # Required to publish a release 10 | contents: write 11 | # Necessary to push docker images to ghcr.io. 12 | packages: write 13 | # Necessary for GCP authentication (https://github.com/google-github-actions/setup-gcloud#usage) 14 | id-token: write 15 | 16 | concurrency: ${{ github.workflow }}-${{ github.ref }} 17 | 18 | jobs: 19 | release: 20 | name: Build and publish 21 | runs-on: ubuntu-latest 22 | outputs: 23 | version: ${{ steps.version.outputs.version }} 24 | steps: 25 | - name: Checkout 26 | uses: actions/checkout@v4 27 | with: 28 | fetch-depth: 0 29 | 30 | # If the event that triggered the build was an annotated tag (which our 31 | # tags are supposed to be), actions/checkout has a bug where the tag in 32 | # question is only a lightweight tag and not a full annotated tag. This 33 | # command seems to fix it. 34 | # https://github.com/actions/checkout/issues/290 35 | - name: Fetch git tags 36 | run: git fetch --tags --force 37 | 38 | - name: Authenticate to Google Cloud 39 | uses: google-github-actions/auth@v2 40 | with: 41 | workload_identity_provider: projects/898976630798/locations/global/workloadIdentityPools/coder-ci/providers/github-actions 42 | service_account: coder-observability@coder-customer-releases.iam.gserviceaccount.com 43 | 44 | - name: Setup GCloud SDK 45 | uses: "google-github-actions/setup-gcloud@v2" 46 | 47 | - name: Install helm 48 | uses: azure/setup-helm@v4 49 | with: 50 | version: v3.9.2 51 | 52 | - name: Publish Helm Chart 53 | if: ${{ !inputs.dry_run }} 54 | run: | 55 | ./scripts/publish.sh 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | charts/ 2 | build/ 3 | scratch 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: 
-------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | ## v0.3.0 4 | 5 | - Adding prebuilt workspace dashboard & alerts 6 | 7 | ## v0.2.1 8 | 9 | - Upgraded subcharts 10 | - Loki: upgraded to v6.7.1 -> v6.7.3 11 | - FIX: `listen-address` duplicate removed in `prometheus-config-reloader` 12 | 13 | ## v0.2.0 14 | 15 | - Upgraded subcharts 16 | - Grafana: upgraded from v7.3.7 -> v7.3.12 17 | - Prometheus: upgraded to v25.18.0 -> v25.24.1 18 | - Loki: upgraded to v6.3.4 -> v6.7.1 19 | 20 | ## v0.1.0 21 | 22 | - Lint Helm chart in CI 23 | 24 | ## v0.0.2 -> v0.0.11 25 | 26 | - Several stability & configurability improvements 27 | 28 | ## v0.0.1 29 | 30 | - Initial release 31 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @dannykopping -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 
20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. 
rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. 
Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. 
Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Use a single bash shell for each job, and immediately exit on failure 2 | SHELL := bash 3 | .SHELLFLAGS = -ceu 4 | .ONESHELL: 5 | 6 | # This doesn't work on directories. 
7 | # See https://stackoverflow.com/questions/25752543/make-delete-on-error-for-directory-targets 8 | .DELETE_ON_ERROR: 9 | 10 | all: lint 11 | .PHONY: all 12 | 13 | lint: build lint/helm lint/rules readme 14 | ./scripts/check-unstaged.sh 15 | .PHONY: lint 16 | 17 | lint/helm: lint/helm/coder-observability 18 | .PHONY: lint/helm 19 | 20 | lint/helm/coder-observability: 21 | helm lint --strict --set coder.image.tag=v$(shell ./scripts/version.sh) coder-observability/ 22 | .PHONY: lint/helm/coder-observability 23 | 24 | build: 25 | ./scripts/compile.sh 26 | .PHONY: build 27 | 28 | lint/rules: lint/helm/prometheus-rules 29 | .PHONY: lint/rules 30 | 31 | lint/helm/prometheus-rules: 32 | @./scripts/lint-rules.sh 33 | 34 | .PHONY: lint/helm/prometheus-rules 35 | 36 | # Usage: publish-patch, publish-minor, publish-major 37 | # Publishing is handled by GitHub Actions, triggered by tag creation. 38 | publish-%: 39 | version=$(shell ./scripts/version.sh --bump $*) && \ 40 | git tag --sign "$$version" -m "Release: $$version" && \ 41 | git push origin tag "$$version" 42 | 43 | readme: 44 | go install github.com/norwoodj/helm-docs/cmd/helm-docs@latest 45 | helm-docs --output-file ../README.md \ 46 | --values-file=values.yaml --chart-search-root=coder-observability --template-files=../README.gotmpl -------------------------------------------------------------------------------- /PUBLISH.md: -------------------------------------------------------------------------------- 1 | # Publishing the Coder Observability Chart 2 | 3 | - make desired changes 4 | - run `make publish-{major|minor|patch}` which creates & pushes a new tag, which kicks off a GH Action to publish the chart -------------------------------------------------------------------------------- /README.gotmpl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Coder Observability Chart 5 | 6 | > [!NOTE] 7 | > This Helm chart is in BETA; use with caution 8 | 9 | ## Overview 10 
| 11 | This chart contains a highly opinionated set of integrations between Grafana, Loki, Prometheus, Alertmanager, and 12 | Grafana Agent. 13 | 14 | Dashboards, alerts, and runbooks are preconfigured for monitoring [Coder](https://coder.com/) installations. 15 | 16 | Out of the box: 17 | 18 | Metrics will be scraped from all pods which have a `prometheus.io/scrape=true` annotation.
19 | Logs will be scraped from all pods in the Kubernetes cluster. 20 | 21 | ## Installation 22 | 23 | 24 | 25 | ```bash 26 | helm repo add coder-observability https://helm.coder.com/observability 27 | helm upgrade --install coder-observability coder-observability/coder-observability --version 0.1.1 --namespace coder-observability --create-namespace 28 | ``` 29 | 30 | ## Requirements 31 | 32 | ### General 33 | 34 | - Helm 3.7+ 35 | 36 | ### Coder 37 | 38 |
39 | Kubernetes-based deployments 40 | If your installation is not in a namespace named `coder`, you will need to modify: 41 | 42 | ```yaml 43 | global: 44 | coder: 45 | controlPlaneNamespace: 46 | externalProvisionersNamespace: 47 | ``` 48 | 49 |
50 | 51 |
52 | Non-Kubernetes deployments (click to expand) 53 | Ensure your Coder installation is accessible to the resources created by this chart. 54 | 55 | Set `global.coder.scrapeMetrics` such that the metrics can be scraped from your installation, e.g.: 56 | 57 | ```yaml 58 | global: 59 | coder: 60 | scrapeMetrics: 61 | hostname: your.coder.host 62 | port: 2112 63 | scrapeInterval: 15s 64 | additionalLabels: 65 | job: coder 66 | ``` 67 | 68 | If you would like your logs scraped from a process outside Kubernetes, you need to mount the log file(s) in and 69 | configure Grafana Agent to scrape them; here's an example configuration: 70 | 71 | ```yaml 72 | grafana-agent: 73 | agent: 74 | mounts: 75 | extra: 76 | - mountPath: /var/log 77 | name: logs 78 | readOnly: true 79 | controller: 80 | volumes: 81 | extra: 82 | - hostPath: 83 | path: /var/log 84 | name: logs 85 | 86 | extraBlocks: |- 87 | loki.source.file "coder_log" { 88 | targets = [ 89 | {__path__ = "/var/log/coder.log", job="coder"}, 90 | ] 91 | forward_to = [loki.write.loki.receiver] 92 | } 93 | ``` 94 | 95 |
96 | 97 | Ensure these environment variables are set in your Coder deployment: 98 | 99 | - `CODER_PROMETHEUS_ENABLE=true` 100 | - `CODER_PROMETHEUS_COLLECT_AGENT_STATS=true` 101 | - `CODER_LOGGING_HUMAN=/dev/stderr` (only `human` log format is supported 102 | currently; [issue](https://github.com/coder/observability/issues/8)) 103 | 104 | Ensure these annotations exist on your Coder & provisioner pods: 105 | 106 | - `prometheus.io/scrape=true` 107 | - `prometheus.io/port=2112` (ensure this matches the port defined by `CODER_PROMETHEUS_ADDRESS`) 108 | 109 | If you use the [`coder/coder` helm chart](https://github.com/coder/coder/tree/main/helm), you can use the 110 | following: 111 | 112 | ```yaml 113 | coder: 114 | podAnnotations: 115 | prometheus.io/scrape: "true" 116 | prometheus.io/port: "2112" 117 | ``` 118 | 119 | For more details, see 120 | the [coder documentation on exposing Prometheus metrics](https://coder.com/docs/v2/latest/admin/prometheus). 121 | 122 | ### Postgres 123 | 124 | You may configure the Helm chart to monitor your Coder deployment's Postgres server. Ensure that the resources created 125 | by this Helm chart can access your Postgres server. 126 | 127 | Create a secret with your Postgres password and reference it as follows, along with the other connection details: 128 | 129 | ```yaml 130 | global: 131 | postgres: 132 | hostname: 133 | port: 134 | database: 135 | username: 136 | mountSecret: 137 | ``` 138 | 139 | The secret should be in the form of `PGPASSWORD=`, as this secret will be used to create an environment 140 | variable. 141 | 142 | ```yaml 143 | apiVersion: v1 144 | kind: Secret 145 | metadata: 146 | name: pg-secret 147 | namespace: coder-observability 148 | data: 149 | PGPASSWORD: 150 | ``` 151 | 152 |
153 | Postgres metrics (click to expand) 154 | 155 | A tool called [`postgres-exporter`](https://github.com/prometheus-community/postgres_exporter) is used to scrape metrics 156 | from your Postgres server, and you can see the metrics it is exposing as follows: 157 | 158 | ```bash 159 | kubectl -n coder-observability port-forward statefulset/postgres-exporter 9187 160 | 161 | curl http://localhost:9187/metrics 162 | ``` 163 | 164 |
165 | 166 | ### Grafana 167 | 168 | To access Grafana, run: 169 | 170 | ```bash 171 | kubectl -n coder-observability port-forward svc/grafana 3000:80 172 | ``` 173 | 174 | And open your web browser to http://localhost:3000/. 175 | 176 | By default, Grafana is configured to allow anonymous access; if you want password authentication, define this in 177 | your `values.yaml`: 178 | 179 | ```yaml 180 | grafana: 181 | admin: 182 | existingSecret: grafana-admin 183 | userKey: username 184 | passwordKey: password 185 | grafana.ini: 186 | auth.anonymous: 187 | enabled: false 188 | ``` 189 | 190 | You will also need to define a secret as follows: 191 | 192 | ```yaml 193 | apiVersion: v1 194 | kind: Secret 195 | metadata: 196 | name: grafana-admin # this matches the "existingSecret" field above 197 | stringData: 198 | username: "" # this matches the "userKey" field above 199 | password: "" # this matches the "passwordKey" field above 200 | ``` 201 | 202 | To add an Ingress for Grafana, define this in your `values.yaml`: 203 | 204 | ```yaml 205 | grafana: 206 | grafana.ini: 207 | server: 208 | domain: observability.example.com 209 | root_url: "%(protocol)s://%(domain)s/grafana" 210 | serve_from_sub_path: true 211 | ingress: 212 | enabled: true 213 | hosts: 214 | - "observability.example.com" 215 | path: "/" 216 | ``` 217 | 218 | ## Subcharts 219 | 220 | {{ template "chart.requirementsTable" . }} 221 | 222 | Each subchart can be disabled by setting the `enabled` field to `false`. 223 | 224 | | Subchart | Setting | 225 | |-----------------|-------------------------| 226 | | `grafana` | `grafana.enabled` | 227 | | `grafana-agent` | `grafana-agent.enabled` | 228 | | `loki` | `loki.enabled` | 229 | | `prometheus` | `prometheus.enabled` | 230 | 231 | ## Values 232 | 233 | The `global` values are the values which pertain to this chart, while the rest pertain to the subcharts. 234 | These values represent only the values _set_ in this chart. 
For the full list of available values, please see each 235 | subchart. 236 | 237 | For example, the `grafana.replicas` value is set by this chart by default, and is one of hundreds of available 238 | values which are defined [here](https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration). 239 | 240 | {{ template "chart.valuesTable" . }} 241 | 242 | {{ template "helm-docs.versionFooter" . }} 243 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Coder Observability Chart 5 | 6 | > [!NOTE] 7 | > This Helm chart is in BETA; use with caution 8 | 9 | ## Overview 10 | 11 | This chart contains a highly opinionated set of integrations between Grafana, Loki, Prometheus, Alertmanager, and 12 | Grafana Agent. 13 | 14 | Dashboards, alerts, and runbooks are preconfigured for monitoring [Coder](https://coder.com/) installations. 15 | 16 | Out of the box: 17 | 18 | Metrics will be scraped from all pods which have a `prometheus.io/scrape=true` annotation.
19 | Logs will be scraped from all pods in the Kubernetes cluster. 20 | 21 | ## Installation 22 | 23 | 24 | 25 | ```bash 26 | helm repo add coder-observability https://helm.coder.com/observability 27 | helm upgrade --install coder-observability coder-observability/coder-observability --version 0.1.1 --namespace coder-observability --create-namespace 28 | ``` 29 | 30 | ## Requirements 31 | 32 | ### General 33 | 34 | - Helm 3.7+ 35 | 36 | ### Coder 37 | 38 |
39 | Kubernetes-based deployments 40 | If your installation is not in a namespace named `coder`, you will need to modify: 41 | 42 | ```yaml 43 | global: 44 | coder: 45 | controlPlaneNamespace: 46 | externalProvisionersNamespace: 47 | ``` 48 | 49 |
50 | 51 |
52 | Non-Kubernetes deployments (click to expand) 53 | Ensure your Coder installation is accessible to the resources created by this chart. 54 | 55 | Set `global.coder.scrapeMetrics` such that the metrics can be scraped from your installation, e.g.: 56 | 57 | ```yaml 58 | global: 59 | coder: 60 | scrapeMetrics: 61 | hostname: your.coder.host 62 | port: 2112 63 | scrapeInterval: 15s 64 | additionalLabels: 65 | job: coder 66 | ``` 67 | 68 | If you would like your logs scraped from a process outside Kubernetes, you need to mount the log file(s) in and 69 | configure Grafana Agent to scrape them; here's an example configuration: 70 | 71 | ```yaml 72 | grafana-agent: 73 | agent: 74 | mounts: 75 | extra: 76 | - mountPath: /var/log 77 | name: logs 78 | readOnly: true 79 | controller: 80 | volumes: 81 | extra: 82 | - hostPath: 83 | path: /var/log 84 | name: logs 85 | 86 | extraBlocks: |- 87 | loki.source.file "coder_log" { 88 | targets = [ 89 | {__path__ = "/var/log/coder.log", job="coder"}, 90 | ] 91 | forward_to = [loki.write.loki.receiver] 92 | } 93 | ``` 94 | 95 |
96 | 97 | Ensure these environment variables are set in your Coder deployment: 98 | 99 | - `CODER_PROMETHEUS_ENABLE=true` 100 | - `CODER_PROMETHEUS_COLLECT_AGENT_STATS=true` 101 | - `CODER_LOGGING_HUMAN=/dev/stderr` (only `human` log format is supported 102 | currently; [issue](https://github.com/coder/observability/issues/8)) 103 | 104 | Ensure these annotations exist on your Coder & provisioner pods: 105 | 106 | - `prometheus.io/scrape=true` 107 | - `prometheus.io/port=2112` (ensure this matches the port defined by `CODER_PROMETHEUS_ADDRESS`) 108 | 109 | If you use the [`coder/coder` helm chart](https://github.com/coder/coder/tree/main/helm), you can use the 110 | following: 111 | 112 | ```yaml 113 | coder: 114 | podAnnotations: 115 | prometheus.io/scrape: "true" 116 | prometheus.io/port: "2112" 117 | ``` 118 | 119 | For more details, see 120 | the [coder documentation on exposing Prometheus metrics](https://coder.com/docs/v2/latest/admin/prometheus). 121 | 122 | ### Postgres 123 | 124 | You may configure the Helm chart to monitor your Coder deployment's Postgres server. Ensure that the resources created 125 | by this Helm chart can access your Postgres server. 126 | 127 | Create a secret with your Postgres password and reference it as follows, along with the other connection details: 128 | 129 | ```yaml 130 | global: 131 | postgres: 132 | hostname: 133 | port: 134 | database: 135 | username: 136 | mountSecret: 137 | ``` 138 | 139 | The secret should be in the form of `PGPASSWORD=`, as this secret will be used to create an environment 140 | variable. 141 | 142 | ```yaml 143 | apiVersion: v1 144 | kind: Secret 145 | metadata: 146 | name: pg-secret 147 | namespace: coder-observability 148 | data: 149 | PGPASSWORD: 150 | ``` 151 | 152 |
153 | Postgres metrics (click to expand) 154 | 155 | A tool called [`postgres-exporter`](https://github.com/prometheus-community/postgres_exporter) is used to scrape metrics 156 | from your Postgres server, and you can see the metrics it is exposing as follows: 157 | 158 | ```bash 159 | kubectl -n coder-observability port-forward statefulset/postgres-exporter 9187 160 | 161 | curl http://localhost:9187/metrics 162 | ``` 163 | 164 |
165 | 166 | ### Grafana 167 | 168 | To access Grafana, run: 169 | 170 | ```bash 171 | kubectl -n coder-observability port-forward svc/grafana 3000:80 172 | ``` 173 | 174 | And open your web browser to http://localhost:3000/. 175 | 176 | By default, Grafana is configured to allow anonymous access; if you want password authentication, define this in 177 | your `values.yaml`: 178 | 179 | ```yaml 180 | grafana: 181 | admin: 182 | existingSecret: grafana-admin 183 | userKey: username 184 | passwordKey: password 185 | grafana.ini: 186 | auth.anonymous: 187 | enabled: false 188 | ``` 189 | 190 | You will also need to define a secret as follows: 191 | 192 | ```yaml 193 | apiVersion: v1 194 | kind: Secret 195 | metadata: 196 | name: grafana-admin # this matches the "existingSecret" field above 197 | stringData: 198 | username: "" # this matches the "userKey" field above 199 | password: "" # this matches the "passwordKey" field above 200 | ``` 201 | 202 | To add an Ingress for Grafana, define this in your `values.yaml`: 203 | 204 | ```yaml 205 | grafana: 206 | grafana.ini: 207 | server: 208 | domain: observability.example.com 209 | root_url: "%(protocol)s://%(domain)s/grafana" 210 | serve_from_sub_path: true 211 | ingress: 212 | enabled: true 213 | hosts: 214 | - "observability.example.com" 215 | path: "/" 216 | ``` 217 | 218 | ## Subcharts 219 | 220 | | Repository | Name | Version | 221 | |------------|------|---------| 222 | | https://grafana.github.io/helm-charts | grafana | ~v7.3.7 | 223 | | https://grafana.github.io/helm-charts | grafana-agent(grafana-agent) | ~0.37.0 | 224 | | https://grafana.github.io/helm-charts | loki | ~v6.7.3 | 225 | | https://prometheus-community.github.io/helm-charts | prometheus | ~v25.24.1 | 226 | 227 | Each subchart can be disabled by setting the `enabled` field to `false`. 
228 | 229 | | Subchart | Setting | 230 | |-----------------|-------------------------| 231 | | `grafana` | `grafana.enabled` | 232 | | `grafana-agent` | `grafana-agent.enabled` | 233 | | `loki` | `loki.enabled` | 234 | | `prometheus` | `prometheus.enabled` | 235 | 236 | ## Values 237 | 238 | The `global` values are the values which pertain to this chart, while the rest pertain to the subcharts. 239 | These values represent only the values _set_ in this chart. For the full list of available values, please see each 240 | subchart. 241 | 242 | For example, the `grafana.replicas` value is set by this chart by default, and is one of hundreds of available 243 | values which are defined [here](https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration). 244 | 245 | | Key | Type | Default | Description | 246 | |-----|------|---------|-------------| 247 | | global.coder.alerts | object | `{"coderd":{"groups":{"CPU":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":0.9,"warning":0.8}},"IneligiblePrebuilds":{"delay":"10m","enabled":true,"thresholds":{"notify":1}},"Memory":{"delay":"10m","enabled":true,"thresholds":{"critical":0.9,"warning":0.8}},"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}},"Restarts":{"delay":"1m","enabled":true,"period":"10m","thresholds":{"critical":3,"notify":1,"warning":2}},"UnprovisionedPrebuiltWorkspaces":{"delay":"10m","enabled":true,"thresholds":{"warn":1}},"WorkspaceBuildFailures":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":10,"notify":2,"warning":5}}}},"enterprise":{"groups":{"Licences":{"delay":"1m","enabled":true,"thresholds":{"critical":1,"warning":0.9}}}},"provisionerd":{"groups":{"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}}}}}` | alerts for the various aspects of Coder | 248 | | global.coder.coderdSelector | string | `"pod=~`coder.*`, pod!~`.*provisioner.*`"` | series selector for 
Prometheus/Loki to locate coderd pods. ensure this uses backticks for quotes! | 249 | | global.coder.controlPlaneNamespace | string | `"coder"` | the namespace into which the control plane has been deployed. | 250 | | global.coder.externalProvisionersNamespace | string | `"coder"` | the namespace into which any external provisioners have been deployed. | 251 | | global.coder.logFormat | string | `"human"` | | 252 | | global.coder.provisionerdSelector | string | `"pod=~`coder-provisioner.*`"` | series selector for Prometheus/Loki to locate provisioner pods. https://coder.com/docs/v2/latest/admin/provisioners TODO: rename container label in provisioner helm chart to be "provisioner" not "coder" ensure this uses backticks for quotes! | 253 | | global.coder.scrapeMetrics | string | `nil` | use this to scrape metrics from a standalone (set of) coder deployment(s) if using kubernetes, rather add an annotation "prometheus.io/scrape=true" and coder will get automatically scraped; set this value to null and configure coderdSelector to target your coder pods | 254 | | global.coder.workspacesSelector | string | `"namespace=`coder-workspaces`"` | series selector for Prometheus/Loki to locate workspace pods. ensure this uses backticks for quotes! 
| 255 | | global.dashboards | object | `{"queryTimeout":900,"refresh":"30s","timerange":"12h"}` | settings for bundled dashboards | 256 | | global.dashboards.queryTimeout | int | `900` | how long until a query in Grafana will timeout after | 257 | | global.dashboards.refresh | string | `"30s"` | how often dashboards should refresh | 258 | | global.dashboards.timerange | string | `"12h"` | how far back dashboards should look | 259 | | global.externalScheme | string | `"http"` | | 260 | | global.externalZone | string | `"svc.cluster.local"` | | 261 | | global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","sslrootcert":"/home/coder/.postgresql/rootcert.pem","username":"coder","volumeMounts":[{"mountPath":"/home/coder/.postgresql","name":"pg-certs-mount","readOnly":true}],"volumes":[{"configMap":{"name":"pg-certs-mount-config-map"},"name":"pg-certs-mount"}]}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts | 262 | | global.postgres.alerts | object | `{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}}` | alerts for postgres | 263 | | global.telemetry | object | `{"metrics":{"scrape_interval":"15s","scrape_timeout":"12s"}}` | control telemetry collection | 264 | | global.telemetry.metrics | object | `{"scrape_interval":"15s","scrape_timeout":"12s"}` | control 
metric collection | 265 | | global.telemetry.metrics.scrape_interval | string | `"15s"` | how often the collector will scrape discovered pods | 266 | | global.telemetry.metrics.scrape_timeout | string | `"12s"` | how long a request will be allowed to wait before being canceled | 267 | | global.zone | string | `"svc"` | | 268 | | grafana-agent.agent.clustering.enabled | bool | `false` | | 269 | | grafana-agent.agent.configMap.create | bool | `false` | | 270 | | grafana-agent.agent.configMap.key | string | `"config.river"` | | 271 | | grafana-agent.agent.configMap.name | string | `"collector-config"` | | 272 | | grafana-agent.agent.extraArgs[0] | string | `"--disable-reporting=true"` | | 273 | | grafana-agent.agent.mode | string | `"flow"` | | 274 | | grafana-agent.agent.mounts.dockercontainers | bool | `true` | | 275 | | grafana-agent.agent.mounts.varlog | bool | `true` | | 276 | | grafana-agent.commonRelabellings | string | `"rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n}\nrule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n}\n// coalesce the following labels and pick the first value; we'll use this to define the \"job\" label\nrule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_component\", \"app\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n target_label = \"__meta_app\"\n action = \"replace\"\n regex = \"^/*([^/]+?)(?:/.*)?$\" // split by the delimiter if it exists, we only want the first one\n replacement = \"${1}\"\n}\nrule {\n source_labels = [\"__meta_kubernetes_namespace\", \"__meta_kubernetes_pod_label_app_kubernetes_io_name\", \"__meta_app\"]\n separator = \"/\"\n target_label = \"job\"\n}\nrule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n}\nrule {\n regex = \"__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)\"\n action = \"labeldrop\"\n}\nrule {\n 
regex = \"pod_template_generation\"\n action = \"labeldrop\"\n}\nrule {\n source_labels = [\"__meta_kubernetes_pod_phase\"]\n regex = \"Pending|Succeeded|Failed|Completed\"\n action = \"drop\"\n}\nrule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"node\"\n}\nrule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)\"\n replacement = \"__param_$1\"\n}"` | | 277 | | grafana-agent.controller.podAnnotations."prometheus.io/scrape" | string | `"true"` | | 278 | | grafana-agent.controller.type | string | `"daemonset"` | | 279 | | grafana-agent.crds.create | bool | `false` | | 280 | | grafana-agent.discovery | string | `"// Discover k8s nodes\ndiscovery.kubernetes \"nodes\" {\n role = \"node\"\n}\n\n// Discover k8s pods\ndiscovery.kubernetes \"pods\" {\n role = \"pod\"\n selectors {\n role = \"pod\"\n }\n}"` | | 281 | | grafana-agent.enabled | bool | `true` | | 282 | | grafana-agent.extraBlocks | string | `""` | | 283 | | grafana-agent.fullnameOverride | string | `"grafana-agent"` | | 284 | | grafana-agent.podLogsRelabelRules | string | `""` | | 285 | | grafana-agent.podMetricsRelabelRules | string | `""` | | 286 | | grafana-agent.withOTLPReceiver | bool | `false` | | 287 | | grafana."grafana.ini"."auth.anonymous".enabled | bool | `true` | | 288 | | grafana."grafana.ini"."auth.anonymous".org_name | string | `"Main Org."` | | 289 | | grafana."grafana.ini"."auth.anonymous".org_role | string | `"Admin"` | | 290 | | grafana."grafana.ini".analytics.reporting_enabled | bool | `false` | | 291 | | grafana."grafana.ini".dashboards.default_home_dashboard_path | string | `"/var/lib/grafana/dashboards/coder/0/status.json"` | | 292 | | grafana."grafana.ini".dataproxy.timeout | string | `"{{ $.Values.global.dashboards.queryTimeout }}"` | | 293 | | grafana."grafana.ini".feature_toggles.autoMigrateOldPanels | bool | `true` | | 294 | | grafana."grafana.ini".users.allow_sign_up | bool | `false` 
| | 295 | | grafana.admin.existingSecret | string | `""` | | 296 | | grafana.annotations."prometheus.io/scrape" | string | `"true"` | | 297 | | grafana.dashboardProviders."coder.yaml".apiVersion | int | `1` | | 298 | | grafana.dashboardProviders."coder.yaml".providers[0].disableDeletion | bool | `false` | | 299 | | grafana.dashboardProviders."coder.yaml".providers[0].editable | bool | `false` | | 300 | | grafana.dashboardProviders."coder.yaml".providers[0].folder | string | `"Coder"` | | 301 | | grafana.dashboardProviders."coder.yaml".providers[0].name | string | `"coder"` | | 302 | | grafana.dashboardProviders."coder.yaml".providers[0].options.path | string | `"/var/lib/grafana/dashboards/coder"` | | 303 | | grafana.dashboardProviders."coder.yaml".providers[0].orgId | int | `1` | | 304 | | grafana.dashboardProviders."coder.yaml".providers[0].type | string | `"file"` | | 305 | | grafana.dashboardProviders."coder.yaml".providers[0].updateIntervalSeconds | int | `5` | | 306 | | grafana.dashboardProviders."infra.yaml".apiVersion | int | `1` | | 307 | | grafana.dashboardProviders."infra.yaml".providers[0].disableDeletion | bool | `false` | | 308 | | grafana.dashboardProviders."infra.yaml".providers[0].editable | bool | `false` | | 309 | | grafana.dashboardProviders."infra.yaml".providers[0].folder | string | `"Infrastructure"` | | 310 | | grafana.dashboardProviders."infra.yaml".providers[0].name | string | `"infra"` | | 311 | | grafana.dashboardProviders."infra.yaml".providers[0].options.path | string | `"/var/lib/grafana/dashboards/infra"` | | 312 | | grafana.dashboardProviders."infra.yaml".providers[0].orgId | int | `1` | | 313 | | grafana.dashboardProviders."infra.yaml".providers[0].type | string | `"file"` | | 314 | | grafana.dashboardProviders."sidecar.yaml".apiVersion | int | `1` | | 315 | | grafana.dashboardProviders."sidecar.yaml".providers[0].disableDeletion | bool | `false` | | 316 | | grafana.dashboardProviders."sidecar.yaml".providers[0].editable | bool | 
`false` | | 317 | | grafana.dashboardProviders."sidecar.yaml".providers[0].folder | string | `"Other"` | | 318 | | grafana.dashboardProviders."sidecar.yaml".providers[0].name | string | `"sidecar"` | | 319 | | grafana.dashboardProviders."sidecar.yaml".providers[0].options.path | string | `"/tmp/dashboards"` | | 320 | | grafana.dashboardProviders."sidecar.yaml".providers[0].orgId | int | `1` | | 321 | | grafana.dashboardProviders."sidecar.yaml".providers[0].type | string | `"file"` | | 322 | | grafana.dashboardProviders."sidecar.yaml".providers[0].updateIntervalSeconds | int | `30` | | 323 | | grafana.dashboards.infra.node-exporter-full.datasource | string | `"metrics"` | | 324 | | grafana.dashboards.infra.node-exporter-full.gnetId | int | `1860` | | 325 | | grafana.dashboards.infra.node-exporter-full.revision | int | `36` | | 326 | | grafana.dashboards.infra.postgres-database.datasource | string | `"metrics"` | | 327 | | grafana.dashboards.infra.postgres-database.gnetId | int | `9628` | | 328 | | grafana.dashboards.infra.postgres-database.revision | int | `7` | | 329 | | grafana.datasources."datasources.yaml".apiVersion | int | `1` | | 330 | | grafana.datasources."datasources.yaml".datasources[0].access | string | `"proxy"` | | 331 | | grafana.datasources."datasources.yaml".datasources[0].editable | bool | `false` | | 332 | | grafana.datasources."datasources.yaml".datasources[0].isDefault | bool | `true` | | 333 | | grafana.datasources."datasources.yaml".datasources[0].name | string | `"metrics"` | | 334 | | grafana.datasources."datasources.yaml".datasources[0].timeout | string | `"{{ add $.Values.global.dashboards.queryTimeout 5 }}"` | | 335 | | grafana.datasources."datasources.yaml".datasources[0].type | string | `"prometheus"` | | 336 | | grafana.datasources."datasources.yaml".datasources[0].uid | string | `"prometheus"` | | 337 | | grafana.datasources."datasources.yaml".datasources[0].url | string | `"http://prometheus.{{ .Release.Namespace }}.{{ 
$.Values.global.zone }}"` | | 338 | | grafana.datasources."datasources.yaml".datasources[1].access | string | `"proxy"` | | 339 | | grafana.datasources."datasources.yaml".datasources[1].editable | bool | `false` | | 340 | | grafana.datasources."datasources.yaml".datasources[1].isDefault | bool | `false` | | 341 | | grafana.datasources."datasources.yaml".datasources[1].name | string | `"logs"` | | 342 | | grafana.datasources."datasources.yaml".datasources[1].timeout | string | `"{{ add $.Values.global.dashboards.queryTimeout 5 }}"` | | 343 | | grafana.datasources."datasources.yaml".datasources[1].type | string | `"loki"` | | 344 | | grafana.datasources."datasources.yaml".datasources[1].uid | string | `"loki"` | | 345 | | grafana.datasources."datasources.yaml".datasources[1].url | string | `"http://loki-gateway.{{ .Release.Namespace }}.{{ $.Values.global.zone }}"` | | 346 | | grafana.datasources."datasources.yaml".datasources[2].editable | bool | `false` | | 347 | | grafana.datasources."datasources.yaml".datasources[2].isDefault | bool | `false` | | 348 | | grafana.datasources."datasources.yaml".datasources[2].jsonData.sslmode | string | `"{{ .Values.global.postgres.sslmode }}"` | | 349 | | grafana.datasources."datasources.yaml".datasources[2].name | string | `"postgres"` | | 350 | | grafana.datasources."datasources.yaml".datasources[2].secureJsonData.password | string | `"{{ if .Values.global.postgres.password }}{{ .Values.global.postgres.password }}{{ else }}$PGPASSWORD{{ end }}"` | | 351 | | grafana.datasources."datasources.yaml".datasources[2].timeout | string | `"{{ add $.Values.global.dashboards.queryTimeout 5 }}"` | | 352 | | grafana.datasources."datasources.yaml".datasources[2].type | string | `"postgres"` | | 353 | | grafana.datasources."datasources.yaml".datasources[2].uid | string | `"postgres"` | | 354 | | grafana.datasources."datasources.yaml".datasources[2].url | string | `"{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}"` | | 
355 | | grafana.datasources."datasources.yaml".datasources[2].user | string | `"{{ .Values.global.postgres.username }}"` | | 356 | | grafana.deploymentStrategy.type | string | `"Recreate"` | | 357 | | grafana.enabled | bool | `true` | | 358 | | grafana.env.GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION | bool | `true` | | 359 | | grafana.extraConfigmapMounts[0].configMap | string | `"dashboards-status"` | | 360 | | grafana.extraConfigmapMounts[0].mountPath | string | `"/var/lib/grafana/dashboards/coder/0"` | | 361 | | grafana.extraConfigmapMounts[0].name | string | `"dashboards-status"` | | 362 | | grafana.extraConfigmapMounts[0].readOnly | bool | `false` | | 363 | | grafana.extraConfigmapMounts[1].configMap | string | `"dashboards-coderd"` | | 364 | | grafana.extraConfigmapMounts[1].mountPath | string | `"/var/lib/grafana/dashboards/coder/1"` | | 365 | | grafana.extraConfigmapMounts[1].name | string | `"dashboards-coderd"` | | 366 | | grafana.extraConfigmapMounts[1].readOnly | bool | `false` | | 367 | | grafana.extraConfigmapMounts[2].configMap | string | `"dashboards-provisionerd"` | | 368 | | grafana.extraConfigmapMounts[2].mountPath | string | `"/var/lib/grafana/dashboards/coder/2"` | | 369 | | grafana.extraConfigmapMounts[2].name | string | `"dashboards-provisionerd"` | | 370 | | grafana.extraConfigmapMounts[2].readOnly | bool | `false` | | 371 | | grafana.extraConfigmapMounts[3].configMap | string | `"dashboards-workspaces"` | | 372 | | grafana.extraConfigmapMounts[3].mountPath | string | `"/var/lib/grafana/dashboards/coder/3"` | | 373 | | grafana.extraConfigmapMounts[3].name | string | `"dashboards-workspaces"` | | 374 | | grafana.extraConfigmapMounts[3].readOnly | bool | `false` | | 375 | | grafana.extraConfigmapMounts[4].configMap | string | `"dashboards-workspace-detail"` | | 376 | | grafana.extraConfigmapMounts[4].mountPath | string | `"/var/lib/grafana/dashboards/coder/4"` | | 377 | | grafana.extraConfigmapMounts[4].name | string | 
`"dashboards-workspace-detail"` | | 378 | | grafana.extraConfigmapMounts[4].readOnly | bool | `false` | | 379 | | grafana.extraConfigmapMounts[5].configMap | string | `"dashboards-prebuilds"` | | 380 | | grafana.extraConfigmapMounts[5].mountPath | string | `"/var/lib/grafana/dashboards/coder/5"` | | 381 | | grafana.extraConfigmapMounts[5].name | string | `"dashboards-prebuilds"` | | 382 | | grafana.extraConfigmapMounts[5].readOnly | bool | `false` | | 383 | | grafana.fullnameOverride | string | `"grafana"` | | 384 | | grafana.image.tag | string | `"10.4.19"` | | 385 | | grafana.persistence.enabled | bool | `true` | | 386 | | grafana.persistence.size | string | `"10Gi"` | | 387 | | grafana.replicas | int | `1` | | 388 | | grafana.service.enabled | bool | `true` | | 389 | | grafana.sidecar.dashboards.enabled | bool | `false` | | 390 | | grafana.sidecar.dashboards.labelValue | string | `"1"` | | 391 | | grafana.sidecar.dashboards.provider.allowUiUpdates | bool | `true` | | 392 | | grafana.sidecar.dashboards.provider.disableDelete | bool | `true` | | 393 | | grafana.testFramework.enabled | bool | `false` | | 394 | | grafana.useStatefulSet | bool | `true` | | 395 | | loki.backend.extraArgs[0] | string | `"-log.level=debug"` | | 396 | | loki.backend.extraVolumeMounts[0].mountPath | string | `"/var/loki-ruler-wal"` | | 397 | | loki.backend.extraVolumeMounts[0].name | string | `"ruler-wal"` | | 398 | | loki.backend.extraVolumes[0].emptyDir | object | `{}` | | 399 | | loki.backend.extraVolumes[0].name | string | `"ruler-wal"` | | 400 | | loki.backend.podAnnotations."prometheus.io/scrape" | string | `"true"` | | 401 | | loki.backend.replicas | int | `1` | | 402 | | loki.chunksCache.allocatedMemory | int | `1024` | | 403 | | loki.enabled | bool | `true` | | 404 | | loki.enterprise.adminApi.enabled | bool | `false` | | 405 | | loki.enterprise.enabled | bool | `false` | | 406 | | loki.enterprise.useExternalLicense | bool | `false` | | 407 | | loki.fullnameOverride | string | 
`"loki"` | | 408 | | loki.gateway.replicas | int | `1` | | 409 | | loki.loki.auth_enabled | bool | `false` | | 410 | | loki.loki.commonConfig.path_prefix | string | `"/var/loki"` | | 411 | | loki.loki.commonConfig.replication_factor | int | `1` | | 412 | | loki.loki.rulerConfig.alertmanager_url | string | `"http://alertmanager.{{ .Release.Namespace }}.{{ .Values.global.zone}}"` | | 413 | | loki.loki.rulerConfig.enable_alertmanager_v2 | bool | `true` | | 414 | | loki.loki.rulerConfig.enable_api | bool | `true` | | 415 | | loki.loki.rulerConfig.remote_write.clients.fake.headers.Source | string | `"Loki"` | | 416 | | loki.loki.rulerConfig.remote_write.clients.fake.remote_timeout | string | `"30s"` | | 417 | | loki.loki.rulerConfig.remote_write.clients.fake.url | string | `"http://prometheus.{{ .Release.Namespace }}.{{ .Values.global.zone}}/api/v1/write"` | | 418 | | loki.loki.rulerConfig.remote_write.enabled | bool | `true` | | 419 | | loki.loki.rulerConfig.ring.kvstore.store | string | `"inmemory"` | | 420 | | loki.loki.rulerConfig.rule_path | string | `"/rules"` | | 421 | | loki.loki.rulerConfig.storage.local.directory | string | `"/rules"` | | 422 | | loki.loki.rulerConfig.storage.type | string | `"local"` | | 423 | | loki.loki.rulerConfig.wal.dir | string | `"/var/loki-ruler-wal"` | | 424 | | loki.loki.schemaConfig.configs[0].from | string | `"2024-04-01"` | | 425 | | loki.loki.schemaConfig.configs[0].index.period | string | `"24h"` | | 426 | | loki.loki.schemaConfig.configs[0].index.prefix | string | `"index_"` | | 427 | | loki.loki.schemaConfig.configs[0].object_store | string | `"s3"` | | 428 | | loki.loki.schemaConfig.configs[0].schema | string | `"v13"` | | 429 | | loki.loki.schemaConfig.configs[0].store | string | `"tsdb"` | | 430 | | loki.lokiCanary.annotations."prometheus.io/scrape" | string | `"true"` | | 431 | | loki.lokiCanary.enabled | bool | `true` | | 432 | | loki.minio.address | string | `"loki-storage.{{ .Release.Namespace }}.{{ 
.Values.global.zone}}:9000"` | | 433 | | loki.minio.enabled | bool | `true` | | 434 | | loki.minio.fullnameOverride | string | `"loki-storage"` | | 435 | | loki.minio.podAnnotations."prometheus.io/path" | string | `"/minio/v2/metrics/cluster"` | | 436 | | loki.minio.podAnnotations."prometheus.io/scrape" | string | `"true"` | | 437 | | loki.minio.podLabels."app.kubernetes.io/name" | string | `"loki-storage"` | | 438 | | loki.monitoring.dashboards.enabled | bool | `true` | | 439 | | loki.monitoring.selfMonitoring.enabled | bool | `false` | | 440 | | loki.monitoring.selfMonitoring.grafanaAgent.installOperator | bool | `false` | | 441 | | loki.nameOverride | string | `"loki"` | | 442 | | loki.read.podAnnotations."prometheus.io/scrape" | string | `"true"` | | 443 | | loki.read.replicas | int | `1` | | 444 | | loki.resultsCache.allocatedMemory | int | `1024` | | 445 | | loki.sidecar.rules.folder | string | `"/rules/fake"` | | 446 | | loki.sidecar.rules.logLevel | string | `"DEBUG"` | | 447 | | loki.test.canaryServiceAddress | string | `"http://loki-canary:3500/metrics"` | | 448 | | loki.test.enabled | bool | `true` | | 449 | | loki.write.extraArgs[0] | string | `"-log.level=debug"` | | 450 | | loki.write.podAnnotations."prometheus.io/scrape" | string | `"true"` | | 451 | | loki.write.replicas | int | `1` | | 452 | | prometheus.alertmanager.enabled | bool | `true` | | 453 | | prometheus.alertmanager.fullnameOverride | string | `"alertmanager"` | | 454 | | prometheus.alertmanager.podAnnotations."prometheus.io/scrape" | string | `"true"` | | 455 | | prometheus.alertmanager.service.port | int | `80` | | 456 | | prometheus.configmapReload.prometheus.containerPort | int | `9091` | | 457 | | prometheus.configmapReload.prometheus.extraArgs.log-level | string | `"all"` | | 458 | | prometheus.configmapReload.prometheus.extraArgs.watch-interval | string | `"15s"` | | 459 | | prometheus.configmapReload.prometheus.extraConfigmapMounts[0].configMap | string | `"metrics-alerts"` | | 
460 | | prometheus.configmapReload.prometheus.extraConfigmapMounts[0].mountPath | string | `"/etc/config/alerts"` | | 461 | | prometheus.configmapReload.prometheus.extraConfigmapMounts[0].name | string | `"alerts"` | | 462 | | prometheus.configmapReload.prometheus.extraConfigmapMounts[0].readonly | bool | `true` | | 463 | | prometheus.enabled | bool | `true` | | 464 | | prometheus.kube-state-metrics.enabled | bool | `true` | | 465 | | prometheus.kube-state-metrics.fullnameOverride | string | `"kube-state-metrics"` | | 466 | | prometheus.kube-state-metrics.podAnnotations."prometheus.io/scrape" | string | `"true"` | | 467 | | prometheus.prometheus-node-exporter.enabled | bool | `true` | | 468 | | prometheus.prometheus-node-exporter.fullnameOverride | string | `"node-exporter"` | | 469 | | prometheus.prometheus-node-exporter.podAnnotations."prometheus.io/scrape" | string | `"true"` | | 470 | | prometheus.prometheus-pushgateway.enabled | bool | `false` | | 471 | | prometheus.server.extraArgs."log.level" | string | `"debug"` | | 472 | | prometheus.server.extraConfigmapMounts[0].configMap | string | `"metrics-alerts"` | | 473 | | prometheus.server.extraConfigmapMounts[0].mountPath | string | `"/etc/config/alerts"` | | 474 | | prometheus.server.extraConfigmapMounts[0].name | string | `"alerts"` | | 475 | | prometheus.server.extraConfigmapMounts[0].readonly | bool | `true` | | 476 | | prometheus.server.extraFlags[0] | string | `"web.enable-lifecycle"` | | 477 | | prometheus.server.extraFlags[1] | string | `"enable-feature=remote-write-receiver"` | | 478 | | prometheus.server.fullnameOverride | string | `"prometheus"` | | 479 | | prometheus.server.global.evaluation_interval | string | `"30s"` | | 480 | | prometheus.server.persistentVolume.enabled | bool | `true` | | 481 | | prometheus.server.persistentVolume.size | string | `"12Gi"` | | 482 | | prometheus.server.podAnnotations."prometheus.io/scrape" | string | `"true"` | | 483 | | prometheus.server.replicaCount | int | `1` 
| | 484 | | prometheus.server.retentionSize | string | `"10GB"` | | 485 | | prometheus.server.service.type | string | `"ClusterIP"` | | 486 | | prometheus.server.statefulSet.enabled | bool | `true` | | 487 | | prometheus.serverFiles."prometheus.yml".rule_files[0] | string | `"/etc/config/alerts/*.yaml"` | | 488 | | prometheus.serverFiles."prometheus.yml".scrape_configs | list | `[]` | | 489 | | prometheus.testFramework.enabled | bool | `false` | | 490 | | runbookViewer.image | string | `"dannyben/madness"` | | 491 | | sqlExporter.image | string | `"burningalchemist/sql_exporter"` | | 492 | 493 | -------------------------------------------------------------------------------- /artifacthub-repo.yaml: -------------------------------------------------------------------------------- 1 | # This file is uploaded to GCS at helm.coder.com/observability/artifacthub-repo.yml 2 | # and used by ArtifactHub to verify the repository. 3 | repositoryID: 167a0393-cb7e-4f42-af79-02f8a91915f5 4 | owners: 5 | - name: colin 6 | email: colin@coder.com 7 | - name: Danny Kopping 8 | email: danny@coder.com -------------------------------------------------------------------------------- /coder-observability/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /coder-observability/Chart.lock: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - name: grafana 3 | repository: https://grafana.github.io/helm-charts 4 | version: 7.3.12 5 | - name: prometheus 6 | repository: https://prometheus-community.github.io/helm-charts 7 | version: 25.24.2 8 | - name: loki 9 | repository: https://grafana.github.io/helm-charts 10 | version: 6.7.4 11 | - name: grafana-agent 12 | repository: https://grafana.github.io/helm-charts 13 | version: 0.37.0 14 | digest: sha256:05e0dae0200cabf5cb9e2cfb18a4e166fcaceefaf39827addff4299b18c31d4e 15 | generated: "2025-01-16T07:54:38.036598102Z" 16 | -------------------------------------------------------------------------------- /coder-observability/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: coder-observability 3 | description: Gain insights into your Coder deployment 4 | 5 | type: application 6 | version: 0.1.0 7 | dependencies: 8 | - name: grafana 9 | condition: grafana.enabled 10 | repository: https://grafana.github.io/helm-charts 11 | version: '~v7.3.7' 12 | - name: prometheus 13 | condition: prometheus.enabled 14 | repository: https://prometheus-community.github.io/helm-charts 15 | version: '~v25.24.1' 16 | - name: loki 17 | condition: loki.enabled 18 | repository: https://grafana.github.io/helm-charts 19 | version: '~v6.7.3' 20 | - name: grafana-agent 21 | alias: grafana-agent 22 | condition: grafana-agent.enabled 23 | repository: https://grafana.github.io/helm-charts 24 | version: '~0.37.0' 25 | maintainers: 26 | 
- name: Coder Technologies, Inc. 27 | url: https://github.com/coder/observability/issues 28 | keywords: 29 | - observability 30 | - coder 31 | - coder.com 32 | - cloud development environment 33 | - cde 34 | sources: 35 | - https://github.com/coder/observability 36 | icon: https://helm.coder.com/coder_logo_black.png 37 | annotations: 38 | artifacthub.io/category: monitoring-logging -------------------------------------------------------------------------------- /coder-observability/runbooks/coderd.md: -------------------------------------------------------------------------------- 1 | # Coderd Runbooks 2 | 3 | ## CoderdCPUUsage 4 | 5 | The CPU usage of one or more Coder pods has been close to the limit defined for 6 | the deployment. This can cause slowness in the application, workspaces becoming 7 | unavailable, and may lead to the application failing its liveness probes and 8 | being restarted. 9 | 10 | To resolve this issue, increase the CPU limits of the Coder deployment. 11 | 12 | If you find this occurring frequently, you may wish to check your Coder 13 | deployment against [Coder's Reference Architectures](https://coder.com/docs/v2/latest/admin/architectures). 14 | 15 | ## CoderdMemoryUsage 16 | 17 | The memory usage of one or more Coder pods has been close to the limit defined 18 | for the deployment. When the memory usage exceeds the limit, the pod(s) will be 19 | restarted by Kubernetes. This will interrupt all connections to workspaces being 20 | handled by the affected pod(s). 21 | 22 | To resolve this issue, increase the memory limits of the Coder deployment. 23 | 24 | If you find this occurring frequently, check the memory usage over a longer 25 | period of time. If it appears to be increasing monotonically, this is likely a 26 | memory leak and should be considered a bug. 27 | 28 | ## CoderdRestarts 29 | 30 | One or more Coder pods have been restarting multiple times in the last 10 31 | minutes. 
This may be due to a number of issues, including: 32 | 33 | - Failure to connect to the configured database: Coder requires a reachable 34 | PostgreSQL database to function. If it fails to connect, you will see an error 35 | similar to the following: 36 | 37 | ```console 38 | [warn] ping postgres: retrying error="dial tcp 10.43.94.60:5432: connect: connection refused" try=3 39 | ``` 40 | 41 | - Out-Of-Memory (OOM) kills due to memory usage (see [above](#coderdmemoryusage)), 42 | - An unexpected bug causing the application to exit with an error. 43 | 44 | If Coder is not restarting due to excessive memory usage, check the logs: 45 | 46 | 1. Check the logs of the deployment for any errors, 47 | 48 | ```console 49 | kubectl -n <coder-namespace> logs deployment/coder --previous 50 | ``` 51 | 52 | 2. Check any Kubernetes events related to the deployment, 53 | 54 | ```console 55 | kubectl -n <coder-namespace> events --watch 56 | ``` 57 | 58 | ## CoderdReplicas 59 | 60 | One or more Coderd replicas are down. This may cause availability problems and elevated 61 | response times for user and agent API calls. 62 | 63 | To resolve this issue, review the Coder deployment for possible `CrashLoopBackOff` 64 | instances or re-adjust alarm levels based on the actual number of replicas. 65 | 66 | ## CoderdWorkspaceBuildFailures 67 | 68 | A few workspace build errors have been recently observed. 69 | 70 | Review Prometheus metrics to identify failed jobs. Check the workspace build logs 71 | to determine if there is a relationship with a new template version or a buggy 72 | Terraform plugin. 73 | 74 | ## CoderdLicenseSeats 75 | 76 | Your Enterprise license is approaching or has exceeded the number of seats purchased. 77 | 78 | Please contact your Coder sales contact, or visit https://coder.com/contact/sales. 79 | 80 | ## CoderdIneligiblePrebuilds 81 | 82 | Prebuilds only become eligible to be claimed by users once the workspace's agent is a) running and b) all of its startup 83 | scripts have completed. 
84 | 85 | If a prebuilt workspace is not eligible, view its agent logs to diagnose the problem. 86 | 87 | ## CoderdUnprovisionedPrebuiltWorkspaces 88 | 89 | The number of running prebuilt workspaces is lower than the desired number of instances. This could be for several reasons, 90 | ordered by likelihood: 91 | 92 | ### Experiment/License 93 | 94 | The prebuilds feature is currently gated behind an experiment *and* a premium license. 95 | 96 | Ensure that the prebuilds experiment is enabled with `CODER_EXPERIMENTS=workspace-prebuilds`, and that you have a premium 97 | license added. 98 | 99 | ### Preset Validation Issue 100 | 101 | Templates which have prebuilds configured require a preset to be defined, with ALL of the required parameters 102 | set in the preset. If any of these are missing, or any of the parameters - as defined - fail validation, then the prebuilds 103 | subsystem will refuse to attempt a workspace build. 104 | 105 | Consult the coderd logs for more information; look out for errors or warnings from the prebuilds subsystem. 106 | 107 | ### Template Misconfiguration or Error 108 | 109 | Prebuilt workspaces cannot be provisioned due to some issue at `terraform apply`-time. This could be due to misconfigured 110 | cloud resources, improper authorization, or any number of other issues. 111 | 112 | Visit the Workspaces page, change the search term to `owner:prebuilds`, and view the previously failed builds. The 113 | error will likely be quite obvious. 114 | 115 | ### Provisioner Latency 116 | 117 | If your provisioners are overloaded and cannot process provisioner jobs quickly enough, prebuilt workspaces may be affected. 118 | There is no prioritization at present for prebuilt workspace jobs. 119 | 120 | Ensure your provisioners are appropriately resourced (i.e. you have enough instances) to handle the concurrent build demand. 
121 | 122 | ### Use of Workspace Tags 123 | 124 | If you are using `coder_workspace_tags` ([docs](https://coder.com/docs/admin/templates/extending-templates/workspace-tags)) 125 | in your template, chances are you do not have any provisioners running or they are under-resourced (see **Provisioner Latency**). 126 | 127 | Ensure your running provisioners are configured with your desired tags. 128 | 129 | ### Reconciliation Loop Issue 130 | 131 | The prebuilds subsystem runs a _reconciliation loop_ which monitors the state of prebuilt workspaces to ensure the desired 132 | number of instances are present at all times. Workspace Prebuilds is currently a BETA feature and so there could be a bug 133 | in this _reconciliation loop_, which should be reported to Coder. 134 | 135 | Examine your coderd logs for any errors or warnings relating to prebuilds. -------------------------------------------------------------------------------- /coder-observability/runbooks/postgres.md: -------------------------------------------------------------------------------- 1 | # Postgres Runbooks 2 | 3 | ## PostgresNotificationQueueFillingUp 4 | 5 | Postgres offers asynchronous notification via the `LISTEN` and `NOTIFY` 6 | commands. Coder depends heavily on this async notification mechanism for routine 7 | functionality. 8 | 9 | This may be due to a session executing `LISTEN()` and entering a long 10 | transaction. 
To verify: 11 | 12 | - Check active sessions with `SELECT * FROM pg_stat_activity;`, 13 | - Check the database log for the PID of the session that is preventing cleanup, 14 | - Kill the query: `SELECT pg_terminate_backend(<pid>);` 15 | 16 | For more information, see the PostgreSQL documentation available here: 17 | 18 | - [PostgreSQL documentation on `LISTEN`](https://www.postgresql.org/docs/current/sql-listen.html) 19 | - [PostgreSQL documentation on `NOTIFY`](https://www.postgresql.org/docs/current/sql-notify.html) 20 | 21 | ## PostgresDown 22 | 23 | Postgres is not currently running, which means the Coder control plane will not be able to read or write any state. 24 | Workspaces may continue to work normally but it is recommended to get Postgres back up as quickly as possible. 25 | 26 | ## PostgresConnectionsRunningLow 27 | 28 | PostgreSQL has a `max_connections` setting that determines the maximum number of 29 | concurrent connections. Once this connection limit is reached, no new 30 | connections will be possible. 31 | 32 | To increase the maximum number of concurrent connections, update the `max_connections` 33 | configuration option for your PostgreSQL instance. See the PostgreSQL 34 | documentation for more details. 35 | 36 | **Note:** You may also need to adjust `shared_buffers` after increasing 37 | `max_connections`. Additionally, you may also need to adjust the kernel 38 | configuration value `kernel.shmmax` in `/etc/sysctl.conf` / 39 | `/etc/sysctl.d`.
40 | 41 | For more information, see: 42 | 43 | - [PostgreSQL Documentation: Server Configuration](https://www.postgresql.org/docs/16/runtime-config-file-locations.html) 44 | - [Tuning your PostgreSQL Server](https://wiki.postgresql.org/wiki/Tuning_Your_PostgreSQL_Server) 45 | -------------------------------------------------------------------------------- /coder-observability/runbooks/provisionerd.md: -------------------------------------------------------------------------------- 1 | # Provisionerd Runbooks 2 | 3 | ## ProvisionerdReplicas 4 | 5 | One or more Provisioner replicas are down. Workspace builds may be queued and processed more slowly. 6 | 7 | To resolve this issue, review the Coder deployment (Coder provisioner pods) 8 | for possible `CrashLoopBackOff` instances or re-adjust alarm levels based on the actual 9 | number of replicas. 10 | -------------------------------------------------------------------------------- /coder-observability/templates/_collector-config.tpl: -------------------------------------------------------------------------------- 1 | {{- define "collector-config" -}} 2 | {{ $agent := (index .Values "grafana-agent") }} 3 | 4 | {{ $agent.logging }} 5 | {{ $agent.discovery }} 6 | 7 | discovery.relabel "pod_logs" { 8 | targets = discovery.kubernetes.pods.targets 9 | {{ $agent.commonRelabellings | nindent 2 }} 10 | rule { 11 | source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"] 12 | separator = "/" 13 | action = "replace" 14 | replacement = "/var/log/pods/*$1/*.log" 15 | target_label = "__path__" 16 | } 17 | rule { 18 | action = "replace" 19 | source_labels = ["__meta_kubernetes_pod_container_id"] 20 | regex = "^(\\w+):\\/\\/.+$" 21 | replacement = "$1" 22 | target_label = "tmp_container_runtime" 23 | } 24 | {{- if $agent.podLogsRelabelRules -}} 25 | {{ $agent.podLogsRelabelRules | trim | nindent 2 }} 26 | {{- end }} 27 | } 28 | 29 | discovery.relabel "pod_metrics" { 30 | targets = 
discovery.kubernetes.pods.targets 31 | {{ $agent.commonRelabellings | nindent 6 }} 32 | // drop ports that do not expose Prometheus metrics, but might otherwise be exposed by a container which *also* 33 | // exposes an HTTP port which exposes metrics 34 | rule { 35 | source_labels = ["__meta_kubernetes_pod_container_port_name"] 36 | regex = "grpc|http-(memberlist|console)" 37 | action = "drop" 38 | } 39 | // adapted from the Prometheus helm chart 40 | // https://github.com/prometheus-community/helm-charts/blob/862870fc3c847e32479b509e511584d5283126a3/charts/prometheus/values.yaml#L1070 41 | rule { 42 | source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_scrape"] 43 | action = "keep" 44 | regex = "true" 45 | } 46 | rule { 47 | source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_scheme"] 48 | action = "replace" 49 | regex = "(https?)" 50 | target_label = "__scheme__" 51 | } 52 | rule { 53 | source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_path"] 54 | action = "replace" 55 | target_label = "__metrics_path__" 56 | regex = "(.+)" 57 | } 58 | rule { 59 | source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_port", "__meta_kubernetes_pod_ip"] 60 | action = "replace" 61 | regex = "(\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})" 62 | replacement = "[$2]:$1" 63 | target_label = "__address__" 64 | } 65 | rule { 66 | source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_port", "__meta_kubernetes_pod_ip"] 67 | action = "replace" 68 | regex = "(\\d+);((([0-9]+?)(\\.|$)){4})" 69 | replacement = "$2:$1" 70 | target_label = "__address__" 71 | } 72 | {{- if $agent.podMetricsRelabelRules -}} 73 | {{ $agent.podMetricsRelabelRules | trim | nindent 2 }} 74 | {{- end }} 75 | } 76 | 77 | local.file_match "pod_logs" { 78 | path_targets = discovery.relabel.pod_logs.output 79 | } 80 | 81 | loki.source.file "pod_logs" { 82 | targets = local.file_match.pod_logs.targets 83 | forward_to = [loki.process.pod_logs.receiver] 84 | } 
85 | 86 | loki.process "pod_logs" { 87 | stage.match { 88 | selector = "{tmp_container_runtime=\"containerd\"}" 89 | // the cri processing stage extracts the following k/v pairs: log, stream, time, flags 90 | stage.cri {} 91 | // Set the extract flags and stream values as labels 92 | stage.labels { 93 | values = { 94 | flags = "", 95 | stream = "", 96 | } 97 | } 98 | } 99 | 100 | // if the label tmp_container_runtime from above is docker parse using docker 101 | stage.match { 102 | selector = "{tmp_container_runtime=\"docker\"}" 103 | // the docker processing stage extracts the following k/v pairs: log, stream, time 104 | stage.docker {} 105 | 106 | // Set the extract stream value as a label 107 | stage.labels { 108 | values = { 109 | stream = "", 110 | } 111 | } 112 | } 113 | 114 | // drop the temporary container runtime label as it is no longer needed 115 | stage.label_drop { 116 | values = ["tmp_container_runtime"] 117 | } 118 | 119 | // parse Coder logs and extract level & logger for efficient filtering 120 | stage.match { 121 | selector = "{pod=~\"coder.*\"}" // TODO: make configurable 122 | 123 | stage.multiline { 124 | firstline = {{ printf `^(?P\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}\.\d{3})` | quote }} 125 | max_wait_time = "10s" 126 | } 127 | 128 | stage.regex { 129 | expression = {{ printf `^(?P\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}\.\d{3})\s\[(?P\w+)\]\s\s(?P[^:]+):\s(?P.+)` | quote }} 130 | } 131 | 132 | stage.timestamp { 133 | source = "ts" 134 | format = "2006-01-02 15:04:05.000" 135 | action_on_failure = "fudge" // rather have inaccurate time than drop the log line 136 | } 137 | 138 | stage.labels { 139 | values = { 140 | level = "", 141 | logger = "", 142 | } 143 | } 144 | } 145 | 146 | forward_to = [loki.write.loki.receiver] 147 | } 148 | {{ if $agent.extraBlocks -}} 149 | {{ $agent.extraBlocks }} 150 | {{- end }} 151 | loki.write "loki" { 152 | endpoint { 153 | url = "http://{{ include "loki.fullname" .Subcharts.loki }}-gateway.{{ .Release.Namespace 
}}.{{ .Values.global.zone }}/loki/api/v1/push" 154 | } 155 | } 156 | 157 | prometheus.scrape "pods" { 158 | targets = discovery.relabel.pod_metrics.output 159 | forward_to = [prometheus.relabel.pods.receiver] 160 | 161 | scrape_interval = "{{ .Values.global.telemetry.metrics.scrape_interval }}" 162 | scrape_timeout = "{{ .Values.global.telemetry.metrics.scrape_timeout }}" 163 | } 164 | 165 | // These are metric_relabel_configs while discovery.relabel are relabel_configs. 166 | // See https://github.com/grafana/agent/blob/main/internal/converter/internal/prometheusconvert/prometheusconvert.go#L95-L106 167 | prometheus.relabel "pods" { 168 | forward_to = [prometheus.remote_write.default.receiver] 169 | 170 | // Drop kube-state-metrics' labels which clash with ours 171 | rule { 172 | source_labels = ["__name__", "container"] 173 | regex = "kube_pod.+;(.+)" 174 | target_label = "container" 175 | replacement = "" 176 | } 177 | rule { 178 | source_labels = ["__name__", "pod"] 179 | regex = "kube_pod.+;(.+)" 180 | target_label = "pod" 181 | replacement = "" 182 | } 183 | rule { 184 | source_labels = ["__name__", "namespace"] 185 | regex = "kube_pod.+;(.+)" 186 | target_label = "namespace" 187 | replacement = "" 188 | } 189 | rule { 190 | source_labels = ["__name__", "exported_container"] 191 | // don't replace an empty label 192 | regex = "^kube_pod.+;(.+)$" 193 | target_label = "container" 194 | replacement = "$1" 195 | } 196 | rule { 197 | source_labels = ["__name__", "exported_pod"] 198 | // don't replace an empty label 199 | regex = "^kube_pod.+;(.+)$" 200 | target_label = "pod" 201 | replacement = "$1" 202 | } 203 | rule { 204 | source_labels = ["__name__", "exported_namespace"] 205 | // don't replace an empty label 206 | regex = "^kube_pod.+;(.+)$" 207 | target_label = "namespace" 208 | replacement = "$1" 209 | } 210 | rule { 211 | regex = "^(exported_.*|image_.*|container_id|id|uid)$" 212 | action = "labeldrop" 213 | } 214 | } 215 | 216 | discovery.relabel 
"cadvisor" { 217 | targets = discovery.kubernetes.nodes.targets 218 | rule { 219 | replacement = "/metrics/cadvisor" 220 | target_label = "__metrics_path__" 221 | } 222 | } 223 | 224 | prometheus.scrape "cadvisor" { 225 | targets = discovery.relabel.cadvisor.output 226 | forward_to = [ prometheus.relabel.cadvisor.receiver ] 227 | scheme = "https" 228 | tls_config { 229 | insecure_skip_verify = true 230 | } 231 | bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" 232 | scrape_interval = "{{ .Values.global.telemetry.metrics.scrape_interval }}" 233 | scrape_timeout = "{{ .Values.global.telemetry.metrics.scrape_timeout }}" 234 | } 235 | 236 | prometheus.relabel "cadvisor" { 237 | forward_to = [ prometheus.remote_write.default.receiver ] 238 | 239 | // Drop empty container labels, addressing https://github.com/google/cadvisor/issues/2688 240 | rule { 241 | source_labels = ["__name__","container"] 242 | separator = "@" 243 | regex = "(container_cpu_.*|container_fs_.*|container_memory_.*)@" 244 | action = "drop" 245 | } 246 | // Drop empty image labels, addressing https://github.com/google/cadvisor/issues/2688 247 | rule { 248 | source_labels = ["__name__","image"] 249 | separator = "@" 250 | regex = "(container_cpu_.*|container_fs_.*|container_memory_.*|container_network_.*)@" 251 | action = "drop" 252 | } 253 | // Drop irrelevant series 254 | rule { 255 | source_labels = ["container"] 256 | regex = "^POD$" 257 | action = "drop" 258 | } 259 | // Drop unnecessary labels 260 | rule { 261 | source_labels = ["id"] 262 | target_label = "id" 263 | replacement = "" 264 | } 265 | rule { 266 | source_labels = ["job"] 267 | target_label = "job" 268 | replacement = "" 269 | } 270 | rule { 271 | source_labels = ["name"] 272 | target_label = "name" 273 | replacement = "" 274 | } 275 | } 276 | 277 | prometheus.remote_write "default" { 278 | endpoint { 279 | url ="http://{{ include "prometheus.server.fullname" .Subcharts.prometheus }}.{{ .Release.Namespace }}.{{ 
.Values.global.zone }}/api/v1/write" 280 | 281 | // drop instance label which unnecessarily adds new series when pods are restarted, since pod IPs are dynamically assigned 282 | // NOTE: "__address__" is mapped to "instance", so will contain : 283 | write_relabel_config { 284 | regex = "instance" 285 | action = "labeldrop" 286 | } 287 | } 288 | } 289 | 290 | {{- if $agent.withOTLPReceiver -}} 291 | otelcol.receiver.otlp "otlp_receiver" { 292 | grpc { 293 | endpoint = "0.0.0.0:4317" 294 | } 295 | http { 296 | endpoint = "0.0.0.0:4318" 297 | } 298 | output { 299 | metrics = [otelcol.processor.batch.default.input] 300 | logs = [otelcol.processor.batch.default.input] 301 | } 302 | } 303 | otelcol.exporter.prometheus "to_prometheus" { 304 | forward_to = [ 305 | prometheus.remote_write.default.receiver, 306 | ] 307 | } 308 | otelcol.exporter.loki "to_loki" { 309 | forward_to = [ 310 | loki.write.loki.receiver, 311 | ] 312 | } 313 | otelcol.processor.batch "default" { 314 | output { 315 | metrics = [otelcol.exporter.prometheus.to_prometheus.input] 316 | logs = [otelcol.exporter.loki.to_loki.input] 317 | } 318 | } 319 | {{- end -}} 320 | 321 | {{ with .Values.global.coder.scrapeMetrics }} 322 | prometheus.scrape "coder_metrics" { 323 | targets = [ 324 | {"__address__" = "{{ .hostname }}:{{ .port }}", {{ include "collector-labels" .additionalLabels | trimSuffix "," }}}, 325 | ] 326 | 327 | forward_to = [prometheus.remote_write.default.receiver] 328 | scrape_interval = "{{ .scrapeInterval }}" 329 | } 330 | {{- end }} 331 | {{- end }} 332 | 333 | {{- define "collector-labels" -}} 334 | {{- range $key, $val := . -}} 335 | {{ $key }} = "{{ $val }}", 336 | {{- end -}} 337 | {{ end }} -------------------------------------------------------------------------------- /coder-observability/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 
3 | */}} 4 | {{- define "coder-observability.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "coder-observability.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "coder-observability.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "coder-observability.labels" -}} 37 | helm.sh/chart: {{ include "coder-observability.chart" . }} 38 | {{ include "coder-observability.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "coder-observability.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "coder-observability.name" . 
}} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "coder-observability.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "coder-observability.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | 64 | {{/* Postgres connector string */}} 65 | {{- define "postgres-connector-string" -}} 66 | {{- if and .Values.global.postgres.password (eq .Values.global.postgres.sslmode "disable") -}} 67 | postgresql://{{ .Values.global.postgres.username }}:{{ urlquery .Values.global.postgres.password }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }} 68 | {{- else if and .Values.global.postgres.password (ne .Values.global.postgres.sslmode "disable") -}} 69 | postgresql://{{ .Values.global.postgres.username }}:{{ urlquery .Values.global.postgres.password }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }}&sslrootcert={{ .Values.global.postgres.sslrootcert }} 70 | {{- else if and .Values.global.postgres.mountSecret (eq .Values.global.postgres.sslmode "disable") -}} 71 | postgresql://{{ .Values.global.postgres.username }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }} 72 | {{- else if and .Values.global.postgres.mountSecret (ne .Values.global.postgres.sslmode "disable") -}} 73 | postgresql://{{ .Values.global.postgres.username }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }}&sslrootcert={{ 
.Values.global.postgres.sslrootcert }} 74 | {{- else -}} 75 | {{ fail "either postgres.password or postgres.mountSecret must be defined" }} 76 | {{- end -}} 77 | {{- end }} 78 | 79 | {{/* Postgres connector string */}} 80 | {{- define "postgres-secret-mount" -}} 81 | {{ if .Values.global.postgres.mountSecret }} 82 | envFrom: 83 | - secretRef: 84 | name: {{ .Values.global.postgres.mountSecret }} 85 | {{ end }} 86 | {{- end }} 87 | 88 | {{/* Postgres Exporter does not export a pubsub usage metric by default, so we add one */}} 89 | {{- define "postgres-pubsub-queue-usage-metric-name" -}}pg_pubsub_usage{{- end }} 90 | 91 | {{/* Build a runbook URL */}} 92 | {{- define "runbook-url" -}} 93 | {{ $outer := . }} 94 | {{- with .Values.global -}} 95 | {{- .externalScheme }}://runbook-viewer.{{ $outer.Release.Namespace }}.{{ .externalZone }}/{{- $outer.service }}#{{- $outer.alert | lower }} 96 | {{- end }} 97 | {{- end }} 98 | 99 | {{- define "coderd-selector" -}} {{- printf "%s, namespace=`%s`" .Values.global.coder.coderdSelector .Values.global.coder.controlPlaneNamespace -}} {{- end }} 100 | {{- define "provisionerd-selector" -}} {{- printf "%s, namespace=`%s`" .Values.global.coder.provisionerdSelector .Values.global.coder.externalProvisionersNamespace -}} {{- end }} 101 | {{- define "workspaces-selector" -}} {{- .Values.global.coder.workspacesSelector -}} {{- end }} 102 | {{- define "non-workspace-selector" -}} {{- printf "namespace=~`(%s|%s)`" (include "control-plane-namespace" .) (include "external-provisioners-namespace" .) 
-}} {{- end }} 103 | {{- define "control-plane-namespace" -}} {{- .Values.global.coder.controlPlaneNamespace -}} {{- end }} 104 | {{- define "external-provisioners-namespace" -}} {{- .Values.global.coder.externalProvisionersNamespace -}} {{- end }} 105 | 106 | {{/* The collector creates "job" labels in the form // */}} 107 | 108 | {{/* Prometheus job label */}} 109 | {{- define "prometheus-job" -}} {{- printf "%s/%s/%s" .Release.Namespace .Values.prometheus.server.fullnameOverride .Values.prometheus.server.name -}} {{- end }} 110 | {{/* Loki job label */}} 111 | {{- define "loki-job" -}} {{- printf "%s/%s" .Release.Namespace .Values.loki.fullnameOverride -}} {{- end }} 112 | {{/* Grafana Agent job label */}} 113 | {{- define "grafana-agent-job" -}} {{- printf "%s/%s/%s" .Release.Namespace (index .Values "grafana-agent").fullnameOverride "grafana-agent" -}} {{- end }} 114 | 115 | {{- define "dashboard-range" -}} {{ .Values.global.dashboards.timerange }} {{- end }} 116 | {{- define "dashboard-refresh" -}} {{ .Values.global.dashboards.refresh }} {{- end }} -------------------------------------------------------------------------------- /coder-observability/templates/configmap-collector.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: ConfigMap 3 | apiVersion: v1 4 | metadata: 5 | name: {{ (index .Values "grafana-agent").agent.configMap.name }} 6 | namespace: {{ .Release.Namespace }} 7 | data: 8 | config.river: |- {{- include "collector-config" . 
| trim | nindent 4 }} -------------------------------------------------------------------------------- /coder-observability/templates/configmap-prometheus-alerts.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: metrics-alerts 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | {{- $service := dict "service" "coderd" -}} 8 | 9 | {{- with .Values.global.coder.alerts.coderd }} {{/* start-section */}} 10 | coderd.yaml: |- 11 | groups: 12 | {{- with .groups.CPU }} 13 | {{- $group := . }} 14 | {{- if .enabled }} 15 | - name: CPU Usage 16 | rules: 17 | {{ $alert := "CoderdCPUUsage" }} 18 | {{- range $severity, $threshold := .thresholds }} 19 | - alert: {{ $alert }} 20 | expr: max by (pod) (rate(container_cpu_usage_seconds_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) / max by(pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="cpu"}) > {{ $threshold }} 21 | for: {{ $group.delay }} 22 | annotations: 23 | summary: The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of CPU, which may impact application performance. 24 | labels: 25 | severity: {{ $severity }} 26 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 27 | {{- end }} 28 | {{- end }} 29 | {{- end }} 30 | 31 | {{- with .groups.Memory }} 32 | {{- $group := . 
}} 33 | {{- if .enabled }} 34 | - name: Memory Usage 35 | rules: 36 | {{ $alert := "CoderdMemoryUsage" }} 37 | {{- range $severity, $threshold := .thresholds }} 38 | - alert: {{ $alert }} 39 | expr: max by (pod) (container_memory_working_set_bytes{ {{- include "coderd-selector" $ -}} }) / max by (pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="memory"}) > {{ $threshold }} 40 | for: {{ $group.delay }} 41 | annotations: 42 | summary: The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error. 43 | labels: 44 | severity: {{ $severity }} 45 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 46 | {{- end }} 47 | {{- end }} 48 | {{- end }} 49 | 50 | {{- with .groups.Restarts }} 51 | {{- $group := . }} 52 | {{- if .enabled }} 53 | - name: Pod Restarts 54 | rules: 55 | {{ $alert := "CoderdRestarts" }} 56 | {{- range $severity, $threshold := .thresholds }} 57 | - alert: {{ $alert }} 58 | expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) > {{ $threshold }} 59 | for: {{ $group.delay }} 60 | annotations: 61 | summary: The Coder instance {{ `{{ $labels.pod }}` }} has restarted multiple times in the last {{ $group.period -}}, which may indicate a CrashLoop. 62 | labels: 63 | severity: {{ $severity }} 64 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 65 | {{- end }} 66 | {{- end }} 67 | {{- end }} 68 | 69 | {{- with .groups.Replicas }} 70 | {{- $group := . 
}} 71 | {{- if .enabled }} 72 | - name: Coderd Replicas 73 | rules: 74 | {{ $alert := "CoderdReplicas" }} 75 | {{- range $severity, $threshold := .thresholds }} 76 | - alert: {{ $alert }} 77 | expr: sum(up{ {{- include "coderd-selector" $ -}} }) < {{ $threshold }} 78 | for: {{ $group.delay }} 79 | annotations: 80 | summary: Number of alive coderd replicas is below the threshold = {{ $threshold -}}. 81 | labels: 82 | severity: {{ $severity }} 83 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 84 | {{- end }} 85 | {{- end }} 86 | {{- end }} 87 | 88 | {{- with .groups.WorkspaceBuildFailures }} 89 | {{- $group := . }} 90 | {{- if .enabled }} 91 | - name: Coderd Workspace Build Failures 92 | rules: 93 | {{ $alert := "CoderdWorkspaceBuildFailures" }} 94 | {{- range $severity, $threshold := .thresholds }} 95 | - alert: {{ $alert }} 96 | expr: sum(increase(coderd_workspace_builds_total{ {{- include "coderd-selector" $ -}} , status="failed" }[{{- $group.period -}}])) > {{ $threshold }} 97 | for: {{ $group.delay }} 98 | annotations: 99 | summary: Workspace builds have failed multiple times in the last {{ $group.period -}}, which may indicate a broken Coder template. 100 | labels: 101 | severity: {{ $severity }} 102 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 103 | {{- end }} 104 | {{- end }} 105 | {{- end }} 106 | 107 | {{- with .groups.IneligiblePrebuilds }} 108 | {{- $group := . 
}} 109 | {{- if .enabled }} 110 | - name: Coderd Ineligible Prebuilds 111 | rules: 112 | {{ $alert := "CoderdIneligiblePrebuilds" }} 113 | {{- range $severity, $threshold := .thresholds }} 114 | - alert: {{ $alert }} 115 | expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_running - coderd_prebuilt_workspaces_eligible) > 0 116 | for: {{ $group.delay }} 117 | annotations: 118 | summary: > 119 | {{ `{{ $value }}` }} prebuilt workspace(s) are currently ineligible for claiming for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset. 120 | This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time. 121 | labels: 122 | severity: {{ $severity }} 123 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 124 | {{- end }} 125 | {{- end }} 126 | {{- end }} 127 | 128 | {{- with .groups.UnprovisionedPrebuiltWorkspaces }} 129 | {{- $group := . }} 130 | {{- if .enabled }} 131 | - name: Coderd Unprovisioned Prebuilt Workspaces 132 | rules: 133 | {{ $alert := "CoderdUnprovisionedPrebuiltWorkspaces" }} 134 | {{- range $severity, $threshold := .thresholds }} 135 | - alert: {{ $alert }} 136 | expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_desired - coderd_prebuilt_workspaces_running) > 0 137 | for: {{ $group.delay }} 138 | annotations: 139 | summary: > 140 | {{ `{{ $value }}` }} prebuilt workspace(s) not yet been provisioned for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset. 
141 | labels: 142 | severity: {{ $severity }} 143 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 144 | {{- end }} 145 | {{- end }} 146 | {{- end }} 147 | 148 | {{- end }} {{/* end-section */}} 149 | {{/* ProvisionerdReplicas is documented in runbooks/provisionerd.md, so runbook_url must use the provisionerd service rather than the coderd one still held in $service */}} 150 | {{- $service = dict "service" "provisionerd" -}} 151 | {{- with .Values.global.coder.alerts.provisionerd }} {{/* start-section */}} 152 | provisionerd.yaml: |- 153 | groups: 154 | {{- with .groups.Replicas }} 155 | {{- $group := . }} 156 | {{- if .enabled }} 157 | - name: Provisionerd Replicas 158 | rules: 159 | {{ $alert := "ProvisionerdReplicas" }} 160 | {{- range $severity, $threshold := .thresholds }} 161 | - alert: {{ $alert }} 162 | expr: sum(coderd_provisionerd_num_daemons{ {{- include "coderd-selector" $ -}} }) < {{ $threshold }} 163 | for: {{ $group.delay }} 164 | annotations: 165 | summary: Number of alive provisionerd replicas is below the threshold = {{ $threshold -}}. 166 | labels: 167 | severity: {{ $severity }} 168 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 169 | {{- end }} 170 | {{- end }} 171 | {{- end }} 172 | 173 | {{- end }} {{/* end-section */}} 174 | 175 | 176 | {{- $service = dict "service" "enterprise" -}} 177 | 178 | {{- with .Values.global.coder.alerts.enterprise }} {{/* start-section */}} 179 | enterprise.yaml: |- 180 | groups: 181 | {{- with .groups.Licences }} 182 | {{- $group := . }} 183 | {{- if .enabled }} 184 | - name: Licences 185 | rules: 186 | {{ $alert := "CoderLicenseSeats" }} 187 | {{- range $severity, $threshold := .thresholds }} 188 | - alert: {{ $alert }} 189 | expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >= {{- $threshold }}' 190 | for: {{ $group.delay }} 191 | annotations: 192 | summary: Your Coder enterprise licence usage is now at {{ `{{ $value | humanizePercentage }}` }} capacity.
193 | labels: 194 | severity: {{ $severity }} 195 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 196 | {{- end }} 197 | {{- end }} 198 | {{- end }} 199 | {{- end }} {{/* end-section */}} 200 | 201 | {{- $service = dict "service" "postgres" -}} 202 | {{- with .Values.global.postgres }} 203 | postgres.yaml: |- 204 | groups: 205 | {{- with .alerts.groups.Notifications }} 206 | {{- $group := . -}} 207 | {{- if .enabled }} 208 | - name: Notifications 209 | rules: 210 | {{ $alert := "PostgresNotificationQueueFillingUp" }} 211 | {{- range $severity, $threshold := .thresholds }} 212 | - alert: {{ $alert }} 213 | expr: {{ include "postgres-pubsub-queue-usage-metric-name" . }} > {{ $threshold }} 214 | for: {{ $group.delay }} 215 | annotations: 216 | summary: The postgres instance {{ `{{ $labels.instance }}` }} has a notification that is filling up, which may impact application performance. 217 | labels: 218 | severity: {{ $severity }} 219 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 220 | {{- end }} 221 | {{- end -}} 222 | {{- end -}} 223 | {{- with .alerts.groups.Basic }} 224 | {{ $group := . -}} 225 | {{- if .enabled }} 226 | - name: Liveness 227 | rules: 228 | {{ $alert := "PostgresDown" }} 229 | - alert: {{ $alert }} 230 | expr: pg_up == 0 231 | for: {{ $group.delay }} 232 | annotations: 233 | summary: The postgres instance {{ `{{ $labels.instance }}` }} is down! 234 | labels: 235 | severity: critical 236 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 237 | {{- end }} 238 | {{ end }} 239 | {{- with .alerts.groups.Connections }} 240 | {{ $group := . 
-}} 241 | {{- if .enabled }} 242 | - name: Connections 243 | rules: 244 | {{ $alert := "PostgresConnectionsRunningLow" }} 245 | {{- range $severity, $threshold := .thresholds }} 246 | - alert: {{ $alert }} 247 | expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * {{ $threshold }}) 248 | for: {{ $group.delay }} 249 | annotations: 250 | summary: The postgres instance {{ `{{ $labels.instance }}` }} is running low on connections which may impact application performance. 251 | labels: 252 | severity: {{ $severity }} 253 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} 254 | {{- end }} 255 | {{- end -}} 256 | {{- end -}} 257 | {{ end }} 258 | -------------------------------------------------------------------------------- /coder-observability/templates/configmap-runbooks.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: ConfigMap 3 | apiVersion: v1 4 | metadata: 5 | name: runbooks 6 | namespace: {{ .Release.Namespace }} 7 | annotations: 8 | checksum/config: {{ (.Files.Glob "runbooks/**").AsConfig | indent 2 | sha256sum }} 9 | data: 10 | {{ (.Files.Glob "runbooks/**").AsConfig | indent 2 }} -------------------------------------------------------------------------------- /coder-observability/templates/configmap-sql-exporter.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: sql-exporter-config 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | config.yaml: |- 8 | global: 9 | target: 10 | name: postgres 11 | data_source_name: '{{ include "postgres-connector-string" . 
}}' 12 | collectors: 13 | - notify 14 | collectors: 15 | - collector_name: notify 16 | metrics: 17 | # Add a metric to show the current usage of the Postgres "pub/sub" mechanism 18 | # See https://www.postgresql.org/docs/current/functions-info.html 19 | - metric_name: {{ include "postgres-pubsub-queue-usage-metric-name" . }} 20 | type: gauge 21 | help: "The fraction (0–1) of the asynchronous notification queue's maximum size that is currently occupied by notifications that are waiting to be processed" 22 | static_labels: 23 | hostname: {{ .Values.global.postgres.hostname }} 24 | database: {{ .Values.global.postgres.database }} 25 | values: [ usage ] 26 | query: | 27 | SELECT pg_notification_queue_usage() AS usage; -------------------------------------------------------------------------------- /coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl: -------------------------------------------------------------------------------- 1 | {{ define "prebuilds-dashboard.json" }} 2 | { 3 | "annotations": { 4 | "list": [ 5 | { 6 | "builtIn": 1, 7 | "datasource": { 8 | "type": "grafana", 9 | "uid": "-- Grafana --" 10 | }, 11 | "enable": true, 12 | "hide": true, 13 | "iconColor": "rgba(0, 211, 255, 1)", 14 | "name": "Annotations & Alerts", 15 | "type": "dashboard" 16 | } 17 | ] 18 | }, 19 | "editable": true, 20 | "fiscalYearStartMonth": 0, 21 | "graphTooltip": 0, 22 | "id": 10, 23 | "links": [], 24 | "panels": [ 25 | { 26 | "datasource": { 27 | "type": "prometheus", 28 | "uid": "prometheus" 29 | }, 30 | "fieldConfig": { 31 | "defaults": { 32 | "color": { 33 | "mode": "thresholds" 34 | }, 35 | "mappings": [ 36 | { 37 | "options": { 38 | "0": { 39 | "color": "orange", 40 | "index": 2, 41 | "text": "Not enabled" 42 | }, 43 | "1": { 44 | "color": "green", 45 | "index": 0, 46 | "text": "Enabled" 47 | } 48 | }, 49 | "type": "value" 50 | }, 51 | { 52 | "options": { 53 | "match": "null", 54 | "result": { 55 | "color": "orange", 56 | "index": 1, 57 | "text": "Not 
enabled" 58 | } 59 | }, 60 | "type": "special" 61 | } 62 | ], 63 | "thresholds": { 64 | "mode": "absolute", 65 | "steps": [ 66 | { 67 | "color": "green", 68 | "value": null 69 | }, 70 | { 71 | "color": "red", 72 | "value": 80 73 | } 74 | ] 75 | } 76 | }, 77 | "overrides": [] 78 | }, 79 | "gridPos": { 80 | "h": 4, 81 | "w": 4, 82 | "x": 0, 83 | "y": 0 84 | }, 85 | "id": 15, 86 | "options": { 87 | "colorMode": "value", 88 | "graphMode": "none", 89 | "justifyMode": "center", 90 | "orientation": "auto", 91 | "reduceOptions": { 92 | "calcs": [ 93 | "lastNotNull" 94 | ], 95 | "fields": "", 96 | "values": false 97 | }, 98 | "showPercentChange": false, 99 | "text": { 100 | "valueSize": 15 101 | }, 102 | "textMode": "auto", 103 | "wideLayout": true 104 | }, 105 | "pluginVersion": "10.4.3", 106 | "targets": [ 107 | { 108 | "datasource": { 109 | "type": "prometheus", 110 | "uid": "prometheus" 111 | }, 112 | "editorMode": "code", 113 | "expr": "min(coderd_experiments{experiment=\"workspace-prebuilds\"})", 114 | "instant": true, 115 | "legendFormat": "__auto", 116 | "range": false, 117 | "refId": "A" 118 | } 119 | ], 120 | "title": "Experiment enabled?", 121 | "type": "stat" 122 | }, 123 | { 124 | "datasource": { 125 | "type": "prometheus", 126 | "uid": "prometheus" 127 | }, 128 | "fieldConfig": { 129 | "defaults": { 130 | "color": { 131 | "fixedColor": "text", 132 | "mode": "fixed" 133 | }, 134 | "mappings": [], 135 | "thresholds": { 136 | "mode": "absolute", 137 | "steps": [ 138 | { 139 | "color": "green", 140 | "value": null 141 | }, 142 | { 143 | "color": "red", 144 | "value": 80 145 | } 146 | ] 147 | } 148 | }, 149 | "overrides": [] 150 | }, 151 | "gridPos": { 152 | "h": 4, 153 | "w": 4, 154 | "x": 4, 155 | "y": 0 156 | }, 157 | "id": 49, 158 | "interval": "30s", 159 | "options": { 160 | "colorMode": "value", 161 | "graphMode": "area", 162 | "justifyMode": "center", 163 | "orientation": "vertical", 164 | "reduceOptions": { 165 | "calcs": [ 166 | "lastNotNull" 167 | ], 168 
| "fields": "", 169 | "values": false 170 | }, 171 | "showPercentChange": false, 172 | "textMode": "auto", 173 | "wideLayout": true 174 | }, 175 | "pluginVersion": "10.4.3", 176 | "repeatDirection": "v", 177 | "targets": [ 178 | { 179 | "datasource": { 180 | "type": "prometheus", 181 | "uid": "prometheus" 182 | }, 183 | "editorMode": "code", 184 | "exemplar": false, 185 | "expr": "sum(max(coderd_prebuilt_workspaces_desired) by (template_name, preset_name)) or vector(0)", 186 | "instant": true, 187 | "interval": "", 188 | "legendFormat": "Desired", 189 | "range": false, 190 | "refId": "A" 191 | }, 192 | { 193 | "datasource": { 194 | "type": "prometheus", 195 | "uid": "prometheus" 196 | }, 197 | "editorMode": "code", 198 | "exemplar": false, 199 | "expr": "sum(max(coderd_prebuilt_workspaces_running) by (template_name, preset_name)) or vector(0)", 200 | "hide": false, 201 | "instant": true, 202 | "interval": "", 203 | "legendFormat": "Running", 204 | "range": false, 205 | "refId": "D" 206 | }, 207 | { 208 | "datasource": { 209 | "type": "prometheus", 210 | "uid": "prometheus" 211 | }, 212 | "editorMode": "code", 213 | "exemplar": false, 214 | "expr": "sum(max(coderd_prebuilt_workspaces_eligible) by (template_name, preset_name)) or vector(0)", 215 | "hide": false, 216 | "instant": true, 217 | "interval": "", 218 | "legendFormat": "Eligible", 219 | "range": false, 220 | "refId": "E" 221 | } 222 | ], 223 | "title": "Current: Global", 224 | "type": "stat" 225 | }, 226 | { 227 | "datasource": { 228 | "type": "prometheus", 229 | "uid": "prometheus" 230 | }, 231 | "description": "", 232 | "fieldConfig": { 233 | "defaults": { 234 | "color": { 235 | "fixedColor": "text", 236 | "mode": "fixed" 237 | }, 238 | "mappings": [], 239 | "thresholds": { 240 | "mode": "absolute", 241 | "steps": [ 242 | { 243 | "color": "green", 244 | "value": null 245 | }, 246 | { 247 | "color": "red", 248 | "value": 80 249 | } 250 | ] 251 | } 252 | }, 253 | "overrides": [] 254 | }, 255 | "gridPos": { 
256 | "h": 4, 257 | "w": 4, 258 | "x": 8, 259 | "y": 0 260 | }, 261 | "id": 48, 262 | "interval": "30s", 263 | "options": { 264 | "colorMode": "value", 265 | "graphMode": "area", 266 | "justifyMode": "center", 267 | "orientation": "vertical", 268 | "reduceOptions": { 269 | "calcs": [ 270 | "lastNotNull" 271 | ], 272 | "fields": "", 273 | "values": false 274 | }, 275 | "showPercentChange": false, 276 | "textMode": "auto", 277 | "wideLayout": true 278 | }, 279 | "pluginVersion": "10.4.3", 280 | "repeatDirection": "v", 281 | "targets": [ 282 | { 283 | "datasource": { 284 | "type": "prometheus", 285 | "uid": "prometheus" 286 | }, 287 | "editorMode": "code", 288 | "exemplar": false, 289 | "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_created_total)) or vector(0)", 290 | "hide": false, 291 | "instant": true, 292 | "interval": "", 293 | "legendFormat": "Created", 294 | "range": false, 295 | "refId": "B" 296 | }, 297 | { 298 | "datasource": { 299 | "type": "prometheus", 300 | "uid": "prometheus" 301 | }, 302 | "editorMode": "code", 303 | "exemplar": false, 304 | "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_failed_total)) or vector(0)", 305 | "hide": false, 306 | "instant": true, 307 | "interval": "", 308 | "legendFormat": "Failed", 309 | "range": false, 310 | "refId": "C" 311 | }, 312 | { 313 | "datasource": { 314 | "type": "prometheus", 315 | "uid": "prometheus" 316 | }, 317 | "editorMode": "code", 318 | "exemplar": false, 319 | "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_claimed_total)) or vector(0)", 320 | "hide": false, 321 | "instant": true, 322 | "interval": "", 323 | "legendFormat": "Claimed", 324 | "range": false, 325 | "refId": "A" 326 | } 327 | ], 328 | "title": "All Time: Global", 329 | "type": "stat" 330 | }, 331 | { 332 | "gridPos": { 333 | "h": 1, 334 | "w": 24, 335 | "x": 0, 336 | "y": 4 337 | }, 338 | "id": 2, 339 | "panels": [], 340 | "repeat": "template", 341 | 
"repeatDirection": "h", 342 | "title": "$template", 343 | "type": "row" 344 | }, 345 | { 346 | "datasource": { 347 | "type": "prometheus", 348 | "uid": "prometheus" 349 | }, 350 | "fieldConfig": { 351 | "defaults": { 352 | "color": { 353 | "fixedColor": "text", 354 | "mode": "fixed" 355 | }, 356 | "mappings": [], 357 | "thresholds": { 358 | "mode": "absolute", 359 | "steps": [ 360 | { 361 | "color": "green", 362 | "value": null 363 | }, 364 | { 365 | "color": "red", 366 | "value": 80 367 | } 368 | ] 369 | } 370 | }, 371 | "overrides": [] 372 | }, 373 | "gridPos": { 374 | "h": 7, 375 | "w": 4, 376 | "x": 0, 377 | "y": 5 378 | }, 379 | "id": 31, 380 | "interval": "30s", 381 | "options": { 382 | "colorMode": "value", 383 | "graphMode": "area", 384 | "justifyMode": "center", 385 | "orientation": "vertical", 386 | "reduceOptions": { 387 | "calcs": [ 388 | "lastNotNull" 389 | ], 390 | "fields": "", 391 | "values": false 392 | }, 393 | "showPercentChange": false, 394 | "textMode": "auto", 395 | "wideLayout": true 396 | }, 397 | "pluginVersion": "10.4.3", 398 | "repeat": "preset", 399 | "repeatDirection": "v", 400 | "targets": [ 401 | { 402 | "datasource": { 403 | "type": "prometheus", 404 | "uid": "prometheus" 405 | }, 406 | "editorMode": "code", 407 | "exemplar": false, 408 | "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", 409 | "instant": true, 410 | "interval": "", 411 | "legendFormat": "Desired", 412 | "range": false, 413 | "refId": "A" 414 | }, 415 | { 416 | "datasource": { 417 | "type": "prometheus", 418 | "uid": "prometheus" 419 | }, 420 | "editorMode": "code", 421 | "exemplar": false, 422 | "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", 423 | "hide": false, 424 | "instant": true, 425 | "interval": "", 426 | "legendFormat": "Running", 427 | "range": false, 428 | "refId": "D" 429 | }, 430 | { 431 | "datasource": { 432 | 
"type": "prometheus", 433 | "uid": "prometheus" 434 | }, 435 | "editorMode": "code", 436 | "exemplar": false, 437 | "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", 438 | "hide": false, 439 | "instant": true, 440 | "interval": "", 441 | "legendFormat": "Eligible", 442 | "range": false, 443 | "refId": "E" 444 | } 445 | ], 446 | "title": "Current: $preset", 447 | "type": "stat" 448 | }, 449 | { 450 | "datasource": { 451 | "type": "prometheus", 452 | "uid": "prometheus" 453 | }, 454 | "fieldConfig": { 455 | "defaults": { 456 | "color": { 457 | "mode": "palette-classic" 458 | }, 459 | "custom": { 460 | "axisBorderShow": false, 461 | "axisCenteredZero": false, 462 | "axisColorMode": "text", 463 | "axisLabel": "", 464 | "axisPlacement": "auto", 465 | "axisSoftMax": 10, 466 | "axisSoftMin": 0, 467 | "barAlignment": 0, 468 | "drawStyle": "line", 469 | "fillOpacity": 18, 470 | "gradientMode": "none", 471 | "hideFrom": { 472 | "legend": false, 473 | "tooltip": false, 474 | "viz": false 475 | }, 476 | "insertNulls": false, 477 | "lineInterpolation": "smooth", 478 | "lineStyle": { 479 | "fill": "solid" 480 | }, 481 | "lineWidth": 2, 482 | "pointSize": 5, 483 | "scaleDistribution": { 484 | "type": "linear" 485 | }, 486 | "showPoints": "never", 487 | "spanNulls": false, 488 | "stacking": { 489 | "group": "A", 490 | "mode": "none" 491 | }, 492 | "thresholdsStyle": { 493 | "mode": "off" 494 | } 495 | }, 496 | "decimals": 0, 497 | "fieldMinMax": false, 498 | "mappings": [], 499 | "thresholds": { 500 | "mode": "absolute", 501 | "steps": [ 502 | { 503 | "color": "green", 504 | "value": null 505 | }, 506 | { 507 | "color": "red", 508 | "value": 80 509 | } 510 | ] 511 | } 512 | }, 513 | "overrides": [ 514 | { 515 | "matcher": { 516 | "id": "byName", 517 | "options": "Desired" 518 | }, 519 | "properties": [ 520 | { 521 | "id": "color", 522 | "value": { 523 | "fixedColor": "purple", 524 | "mode": "fixed" 525 | } 526 
| }, 527 | { 528 | "id": "custom.lineStyle", 529 | "value": { 530 | "dash": [ 531 | 10, 532 | 10 533 | ], 534 | "fill": "dash" 535 | } 536 | }, 537 | { 538 | "id": "custom.fillOpacity", 539 | "value": 85 540 | }, 541 | { 542 | "id": "custom.fillBelowTo", 543 | "value": "Running" 544 | } 545 | ] 546 | }, 547 | { 548 | "matcher": { 549 | "id": "byName", 550 | "options": "Running" 551 | }, 552 | "properties": [ 553 | { 554 | "id": "color", 555 | "value": { 556 | "fixedColor": "yellow", 557 | "mode": "fixed" 558 | } 559 | }, 560 | { 561 | "id": "custom.fillBelowTo", 562 | "value": "Eligible" 563 | } 564 | ] 565 | }, 566 | { 567 | "matcher": { 568 | "id": "byName", 569 | "options": "Eligible" 570 | }, 571 | "properties": [ 572 | { 573 | "id": "color", 574 | "value": { 575 | "fixedColor": "green", 576 | "mode": "fixed" 577 | } 578 | } 579 | ] 580 | } 581 | ] 582 | }, 583 | "gridPos": { 584 | "h": 7, 585 | "w": 8, 586 | "x": 4, 587 | "y": 5 588 | }, 589 | "id": 5, 590 | "options": { 591 | "legend": { 592 | "calcs": [], 593 | "displayMode": "list", 594 | "placement": "bottom", 595 | "showLegend": true 596 | }, 597 | "tooltip": { 598 | "mode": "single", 599 | "sort": "none" 600 | } 601 | }, 602 | "pluginVersion": "10.4.3", 603 | "repeat": "preset", 604 | "repeatDirection": "v", 605 | "targets": [ 606 | { 607 | "datasource": { 608 | "type": "prometheus", 609 | "uid": "prometheus" 610 | }, 611 | "editorMode": "code", 612 | "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", 613 | "instant": false, 614 | "interval": "", 615 | "legendFormat": "Desired", 616 | "range": true, 617 | "refId": "A" 618 | }, 619 | { 620 | "datasource": { 621 | "type": "prometheus", 622 | "uid": "prometheus" 623 | }, 624 | "editorMode": "code", 625 | "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", 626 | "hide": false, 627 | "instant": false, 628 | "interval": "", 
629 | "legendFormat": "Running", 630 | "range": true, 631 | "refId": "D" 632 | }, 633 | { 634 | "datasource": { 635 | "type": "prometheus", 636 | "uid": "prometheus" 637 | }, 638 | "editorMode": "code", 639 | "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", 640 | "hide": false, 641 | "instant": false, 642 | "interval": "", 643 | "legendFormat": "Eligible", 644 | "range": true, 645 | "refId": "E" 646 | } 647 | ], 648 | "title": "Pool Capacity: $preset", 649 | "type": "timeseries" 650 | }, 651 | { 652 | "datasource": { 653 | "type": "prometheus", 654 | "uid": "prometheus" 655 | }, 656 | "fieldConfig": { 657 | "defaults": { 658 | "color": { 659 | "mode": "palette-classic" 660 | }, 661 | "custom": { 662 | "axisBorderShow": false, 663 | "axisCenteredZero": false, 664 | "axisColorMode": "text", 665 | "axisLabel": "", 666 | "axisPlacement": "auto", 667 | "axisSoftMax": 10, 668 | "axisSoftMin": 0, 669 | "barAlignment": 0, 670 | "drawStyle": "line", 671 | "fillOpacity": 13, 672 | "gradientMode": "none", 673 | "hideFrom": { 674 | "legend": false, 675 | "tooltip": false, 676 | "viz": false 677 | }, 678 | "insertNulls": false, 679 | "lineInterpolation": "smooth", 680 | "lineStyle": { 681 | "fill": "solid" 682 | }, 683 | "lineWidth": 2, 684 | "pointSize": 5, 685 | "scaleDistribution": { 686 | "type": "linear" 687 | }, 688 | "showPoints": "never", 689 | "spanNulls": false, 690 | "stacking": { 691 | "group": "A", 692 | "mode": "none" 693 | }, 694 | "thresholdsStyle": { 695 | "mode": "off" 696 | } 697 | }, 698 | "decimals": 0, 699 | "fieldMinMax": false, 700 | "mappings": [], 701 | "thresholds": { 702 | "mode": "absolute", 703 | "steps": [ 704 | { 705 | "color": "green", 706 | "value": null 707 | }, 708 | { 709 | "color": "red", 710 | "value": 80 711 | } 712 | ] 713 | } 714 | }, 715 | "overrides": [ 716 | { 717 | "matcher": { 718 | "id": "byName", 719 | "options": "Failed" 720 | }, 721 | "properties": [ 722 | 
{ 723 | "id": "color", 724 | "value": { 725 | "fixedColor": "red", 726 | "mode": "fixed" 727 | } 728 | } 729 | ] 730 | }, 731 | { 732 | "matcher": { 733 | "id": "byName", 734 | "options": "Created" 735 | }, 736 | "properties": [ 737 | { 738 | "id": "color", 739 | "value": { 740 | "fixedColor": "blue", 741 | "mode": "fixed" 742 | } 743 | } 744 | ] 745 | }, 746 | { 747 | "matcher": { 748 | "id": "byName", 749 | "options": "Desired" 750 | }, 751 | "properties": [ 752 | { 753 | "id": "color", 754 | "value": { 755 | "fixedColor": "purple", 756 | "mode": "fixed" 757 | } 758 | } 759 | ] 760 | }, 761 | { 762 | "matcher": { 763 | "id": "byName", 764 | "options": "Running" 765 | }, 766 | "properties": [ 767 | { 768 | "id": "color", 769 | "value": { 770 | "fixedColor": "yellow", 771 | "mode": "fixed" 772 | } 773 | } 774 | ] 775 | }, 776 | { 777 | "matcher": { 778 | "id": "byName", 779 | "options": "Eligible" 780 | }, 781 | "properties": [ 782 | { 783 | "id": "color", 784 | "value": { 785 | "fixedColor": "green", 786 | "mode": "fixed" 787 | } 788 | } 789 | ] 790 | }, 791 | { 792 | "matcher": { 793 | "id": "byName", 794 | "options": "Claimed" 795 | }, 796 | "properties": [ 797 | { 798 | "id": "color", 799 | "value": { 800 | "fixedColor": "dark-green", 801 | "mode": "fixed" 802 | } 803 | } 804 | ] 805 | } 806 | ] 807 | }, 808 | "gridPos": { 809 | "h": 7, 810 | "w": 8, 811 | "x": 12, 812 | "y": 5 813 | }, 814 | "id": 38, 815 | "options": { 816 | "legend": { 817 | "calcs": [], 818 | "displayMode": "list", 819 | "placement": "bottom", 820 | "showLegend": true 821 | }, 822 | "tooltip": { 823 | "mode": "single", 824 | "sort": "none" 825 | } 826 | }, 827 | "pluginVersion": "10.4.3", 828 | "repeat": "preset", 829 | "repeatDirection": "v", 830 | "targets": [ 831 | { 832 | "datasource": { 833 | "type": "prometheus", 834 | "uid": "prometheus" 835 | }, 836 | "editorMode": "code", 837 | "expr": "floor(max(increase(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", 
preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", 838 | "hide": false, 839 | "instant": false, 840 | "interval": "", 841 | "legendFormat": "Created", 842 | "range": true, 843 | "refId": "B" 844 | }, 845 | { 846 | "datasource": { 847 | "type": "prometheus", 848 | "uid": "prometheus" 849 | }, 850 | "editorMode": "code", 851 | "expr": "floor(max(increase(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", 852 | "hide": false, 853 | "instant": false, 854 | "interval": "", 855 | "legendFormat": "Failed", 856 | "range": true, 857 | "refId": "C" 858 | }, 859 | { 860 | "datasource": { 861 | "type": "prometheus", 862 | "uid": "prometheus" 863 | }, 864 | "editorMode": "code", 865 | "expr": "floor(max(increase(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", 866 | "hide": false, 867 | "instant": false, 868 | "interval": "", 869 | "legendFormat": "Claimed", 870 | "range": true, 871 | "refId": "F" 872 | } 873 | ], 874 | "title": "Pool Operations: $preset", 875 | "type": "timeseries" 876 | }, 877 | { 878 | "datasource": { 879 | "type": "prometheus", 880 | "uid": "prometheus" 881 | }, 882 | "description": "", 883 | "fieldConfig": { 884 | "defaults": { 885 | "color": { 886 | "fixedColor": "text", 887 | "mode": "fixed" 888 | }, 889 | "mappings": [], 890 | "thresholds": { 891 | "mode": "absolute", 892 | "steps": [ 893 | { 894 | "color": "green", 895 | "value": null 896 | }, 897 | { 898 | "color": "red", 899 | "value": 80 900 | } 901 | ] 902 | } 903 | }, 904 | "overrides": [] 905 | }, 906 | "gridPos": { 907 | "h": 7, 908 | "w": 4, 909 | "x": 20, 910 | "y": 5 911 | }, 912 | "id": 1, 913 | "interval": "30s", 914 | "options": { 915 | "colorMode": "value", 916 | "graphMode": "area", 917 | "justifyMode": "center", 918 | "orientation": "vertical", 919 | "reduceOptions": { 920 | "calcs": [ 921 | "lastNotNull" 922 | ], 
923 | "fields": "", 924 | "values": false 925 | }, 926 | "showPercentChange": false, 927 | "textMode": "auto", 928 | "wideLayout": true 929 | }, 930 | "pluginVersion": "10.4.3", 931 | "repeat": "preset", 932 | "repeatDirection": "v", 933 | "targets": [ 934 | { 935 | "datasource": { 936 | "type": "prometheus", 937 | "uid": "prometheus" 938 | }, 939 | "editorMode": "code", 940 | "exemplar": false, 941 | "expr": "max(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", 942 | "hide": false, 943 | "instant": true, 944 | "interval": "", 945 | "legendFormat": "Created", 946 | "range": false, 947 | "refId": "B" 948 | }, 949 | { 950 | "datasource": { 951 | "type": "prometheus", 952 | "uid": "prometheus" 953 | }, 954 | "editorMode": "code", 955 | "exemplar": false, 956 | "expr": "max(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", 957 | "hide": false, 958 | "instant": true, 959 | "interval": "", 960 | "legendFormat": "Failed", 961 | "range": false, 962 | "refId": "C" 963 | }, 964 | { 965 | "datasource": { 966 | "type": "prometheus", 967 | "uid": "prometheus" 968 | }, 969 | "editorMode": "code", 970 | "exemplar": false, 971 | "expr": "max(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", 972 | "hide": false, 973 | "instant": true, 974 | "interval": "", 975 | "legendFormat": "Claimed", 976 | "range": false, 977 | "refId": "A" 978 | } 979 | ], 980 | "title": "All Time: $preset", 981 | "type": "stat" 982 | } 983 | ], 984 | "refresh": "{{- include "dashboard-refresh" . 
-}}", 985 | "schemaVersion": 39, 986 | "tags": [], 987 | "templating": { 988 | "list": [ 989 | { 990 | "allValue": "", 991 | "datasource": { 992 | "type": "prometheus", 993 | "uid": "prometheus" 994 | }, 995 | "definition": "label_values(coderd_prebuilt_workspaces_desired,template_name)", 996 | "hide": 0, 997 | "includeAll": false, 998 | "label": "Template", 999 | "multi": false, 1000 | "name": "template", 1001 | "options": [], 1002 | "query": { 1003 | "qryType": 1, 1004 | "query": "label_values(coderd_prebuilt_workspaces_desired,template_name)", 1005 | "refId": "PrometheusVariableQueryEditor-VariableQuery" 1006 | }, 1007 | "refresh": 1, 1008 | "regex": "", 1009 | "skipUrlSync": false, 1010 | "sort": 0, 1011 | "type": "query" 1012 | }, 1013 | { 1014 | "allValue": "", 1015 | "datasource": { 1016 | "type": "prometheus", 1017 | "uid": "prometheus" 1018 | }, 1019 | "definition": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)", 1020 | "hide": 0, 1021 | "includeAll": true, 1022 | "label": "Preset", 1023 | "multi": true, 1024 | "name": "preset", 1025 | "options": [], 1026 | "query": { 1027 | "qryType": 1, 1028 | "query": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)", 1029 | "refId": "PrometheusVariableQueryEditor-VariableQuery" 1030 | }, 1031 | "refresh": 1, 1032 | "regex": "", 1033 | "skipUrlSync": false, 1034 | "sort": 0, 1035 | "type": "query" 1036 | } 1037 | ] 1038 | }, 1039 | "time": { 1040 | "from": "now-{{- include "dashboard-range" . 
-}}", 1041 | "to": "now" 1042 | }, 1043 | "timepicker": {}, 1044 | "timezone": "browser", 1045 | "title": "Prebuilds", 1046 | "uid": "cej6jysyme22oa", 1047 | "version": 13, 1048 | "weekStart": "" 1049 | } 1050 | {{ end }} -------------------------------------------------------------------------------- /coder-observability/templates/dashboards/_dashboards_provisionerd.json.tpl: -------------------------------------------------------------------------------- 1 | {{ define "provisionerd-dashboard.json" }} 2 | { 3 | "annotations": { 4 | "list": [ 5 | { 6 | "builtIn": 1, 7 | "datasource": { 8 | "type": "grafana", 9 | "uid": "-- Grafana --" 10 | }, 11 | "enable": true, 12 | "hide": true, 13 | "iconColor": "rgba(0, 211, 255, 1)", 14 | "name": "Annotations & Alerts", 15 | "target": { 16 | "limit": 100, 17 | "matchAny": false, 18 | "tags": [], 19 | "type": "dashboard" 20 | }, 21 | "type": "dashboard" 22 | } 23 | ] 24 | }, 25 | "editable": true, 26 | "fiscalYearStartMonth": 0, 27 | "graphTooltip": 0, 28 | "links": [], 29 | "panels": [ 30 | { 31 | "datasource": { 32 | "type": "prometheus", 33 | "uid": "prometheus" 34 | }, 35 | "description": "", 36 | "fieldConfig": { 37 | "defaults": { 38 | "color": { 39 | "mode": "thresholds" 40 | }, 41 | "mappings": [], 42 | "thresholds": { 43 | "mode": "absolute", 44 | "steps": [ 45 | { 46 | "color": "text", 47 | "value": null 48 | }, 49 | { 50 | "color": "green", 51 | "value": 1 52 | } 53 | ] 54 | } 55 | }, 56 | "overrides": [] 57 | }, 58 | "gridPos": { 59 | "h": 7, 60 | "w": 6, 61 | "x": 0, 62 | "y": 0 63 | }, 64 | "id": 17, 65 | "options": { 66 | "colorMode": "value", 67 | "graphMode": "area", 68 | "justifyMode": "center", 69 | "orientation": "auto", 70 | "reduceOptions": { 71 | "calcs": [ 72 | "lastNotNull" 73 | ], 74 | "fields": "", 75 | "values": false 76 | }, 77 | "showPercentChange": false, 78 | "textMode": "value_and_name", 79 | "wideLayout": false 80 | }, 81 | "pluginVersion": "10.4.0", 82 | "targets": [ 83 | { 84 | "datasource": 
{ 85 | "type": "prometheus", 86 | "uid": "prometheus" 87 | }, 88 | "editorMode": "code", 89 | "exemplar": false, 90 | "expr": "sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`})", 91 | "instant": true, 92 | "legendFormat": "Built-in", 93 | "range": false, 94 | "refId": "A" 95 | }, 96 | { 97 | "datasource": { 98 | "type": "prometheus", 99 | "uid": "prometheus" 100 | }, 101 | "editorMode": "code", 102 | "exemplar": false, 103 | "expr": "sum(coderd_provisionerd_num_daemons{ {{- include "provisionerd-selector" . -}} })", 104 | "hide": false, 105 | "instant": true, 106 | "legendFormat": "External", 107 | "range": false, 108 | "refId": "B" 109 | } 110 | ], 111 | "title": "Provisioners", 112 | "type": "stat" 113 | }, 114 | { 115 | "datasource": { 116 | "type": "prometheus", 117 | "uid": "prometheus" 118 | }, 119 | "description": "", 120 | "gridPos": { 121 | "h": 7, 122 | "w": 6, 123 | "x": 6, 124 | "y": 0 125 | }, 126 | "id": 20, 127 | "options": { 128 | "code": { 129 | "language": "plaintext", 130 | "showLineNumbers": false, 131 | "showMiniMap": false 132 | }, 133 | "content": "Provisioners are responsible for building workspaces.\n\n`coderd` runs built-in provisioners by default. Control this with the `CODER_PROVISIONER_DAEMONS` environment variable or `--provisioner-daemons` flag.\n\nYou can also consider [External Provisioners](https://coder.com/docs/v2/latest/admin/provisioners). 
Running both built-in and external provisioners is perfectly valid,\nalthough dedicated (external) provisioners will generally give the best build performance.", 134 | "mode": "markdown" 135 | }, 136 | "pluginVersion": "10.4.0", 137 | "transparent": true, 138 | "type": "text" 139 | }, 140 | { 141 | "datasource": { 142 | "type": "prometheus", 143 | "uid": "prometheus" 144 | }, 145 | "description": "", 146 | "fieldConfig": { 147 | "defaults": { 148 | "color": { 149 | "mode": "thresholds" 150 | }, 151 | "mappings": [], 152 | "thresholds": { 153 | "mode": "absolute", 154 | "steps": [ 155 | { 156 | "color": "text", 157 | "value": null 158 | }, 159 | { 160 | "color": "green", 161 | "value": 1 162 | } 163 | ] 164 | } 165 | }, 166 | "overrides": [] 167 | }, 168 | "gridPos": { 169 | "h": 7, 170 | "w": 6, 171 | "x": 12, 172 | "y": 0 173 | }, 174 | "id": 21, 175 | "options": { 176 | "colorMode": "value", 177 | "graphMode": "area", 178 | "justifyMode": "center", 179 | "orientation": "auto", 180 | "reduceOptions": { 181 | "calcs": [ 182 | "last" 183 | ], 184 | "fields": "", 185 | "values": false 186 | }, 187 | "showPercentChange": false, 188 | "textMode": "auto", 189 | "wideLayout": true 190 | }, 191 | "pluginVersion": "10.4.0", 192 | "targets": [ 193 | { 194 | "datasource": { 195 | "type": "prometheus", 196 | "uid": "prometheus" 197 | }, 198 | "editorMode": "code", 199 | "exemplar": false, 200 | "expr": "(sum(coderd_provisionerd_jobs_current) > 0) or vector(0)", 201 | "instant": false, 202 | "legendFormat": "Current", 203 | "range": true, 204 | "refId": "A" 205 | }, 206 | { 207 | "datasource": { 208 | "type": "prometheus", 209 | "uid": "prometheus" 210 | }, 211 | "editorMode": "code", 212 | "exemplar": false, 213 | "expr": "sum(coderd_provisionerd_num_daemons)", 214 | "hide": false, 215 | "instant": true, 216 | "legendFormat": "Capacity", 217 | "range": false, 218 | "refId": "B" 219 | } 220 | ], 221 | "title": "Builds", 222 | "type": "stat" 223 | }, 224 | { 225 | "datasource": 
{ 226 | "type": "prometheus", 227 | "uid": "prometheus" 228 | }, 229 | "description": "", 230 | "gridPos": { 231 | "h": 7, 232 | "w": 6, 233 | "x": 18, 234 | "y": 0 235 | }, 236 | "id": 22, 237 | "options": { 238 | "code": { 239 | "language": "plaintext", 240 | "showLineNumbers": false, 241 | "showMiniMap": false 242 | }, 243 | "content": "The maximum number of simultaneous builds is equivalent to the number of `provisionerd` daemons running.\n\nThe \"Capacity\" panel shows how many simultaneous builds are possible.", 244 | "mode": "markdown" 245 | }, 246 | "pluginVersion": "10.4.0", 247 | "transparent": true, 248 | "type": "text" 249 | }, 250 | { 251 | "datasource": { 252 | "type": "prometheus", 253 | "uid": "prometheus" 254 | }, 255 | "description": "", 256 | "fieldConfig": { 257 | "defaults": { 258 | "color": { 259 | "mode": "thresholds" 260 | }, 261 | "fieldMinMax": false, 262 | "mappings": [], 263 | "thresholds": { 264 | "mode": "absolute", 265 | "steps": [ 266 | { 267 | "color": "text", 268 | "value": null 269 | } 270 | ] 271 | }, 272 | "unit": "s" 273 | }, 274 | "overrides": [] 275 | }, 276 | "gridPos": { 277 | "h": 7, 278 | "w": 6, 279 | "x": 0, 280 | "y": 7 281 | }, 282 | "id": 23, 283 | "options": { 284 | "colorMode": "value", 285 | "graphMode": "none", 286 | "justifyMode": "center", 287 | "orientation": "auto", 288 | "reduceOptions": { 289 | "calcs": [ 290 | "lastNotNull" 291 | ], 292 | "fields": "", 293 | "values": false 294 | }, 295 | "showPercentChange": false, 296 | "textMode": "auto", 297 | "wideLayout": true 298 | }, 299 | "pluginVersion": "10.4.0", 300 | "targets": [ 301 | { 302 | "datasource": { 303 | "type": "prometheus", 304 | "uid": "prometheus" 305 | }, 306 | "editorMode": "code", 307 | "exemplar": false, 308 | "expr": "histogram_quantile(0.5, sum by(le) (rate(coderd_provisionerd_job_timings_seconds_bucket[$__range])))", 309 | "hide": false, 310 | "instant": true, 311 | "legendFormat": "Median", 312 | "range": false, 313 | "refId": "B"
314 | }, 315 | { 316 | "datasource": { 317 | "type": "prometheus", 318 | "uid": "prometheus" 319 | }, 320 | "editorMode": "code", 321 | "exemplar": false, 322 | "expr": "histogram_quantile(0.9, sum by(le) (rate(coderd_provisionerd_job_timings_seconds_bucket[$__range])))", 323 | "hide": false, 324 | "instant": true, 325 | "legendFormat": "90th Percentile", 326 | "range": false, 327 | "refId": "A" 328 | } 329 | ], 330 | "title": "Build Times", 331 | "type": "stat" 332 | }, 333 | { 334 | "datasource": { 335 | "type": "prometheus", 336 | "uid": "prometheus" 337 | }, 338 | "description": "", 339 | "gridPos": { 340 | "h": 7, 341 | "w": 6, 342 | "x": 6, 343 | "y": 7 344 | }, 345 | "id": 24, 346 | "options": { 347 | "code": { 348 | "language": "plaintext", 349 | "showLineNumbers": false, 350 | "showMiniMap": false 351 | }, 352 | "content": "This shows the median and 90th percentile workspace build times.\n\nLong build times can impede developers' productivity while they wait for workspaces to start or be created.", 353 | "mode": "markdown" 354 | }, 355 | "pluginVersion": "10.4.0", 356 | "transparent": true, 357 | "type": "text" 358 | }, 359 | { 360 | "datasource": { 361 | "type": "prometheus", 362 | "uid": "prometheus" 363 | }, 364 | "description": "", 365 | "fieldConfig": { 366 | "defaults": { 367 | "color": { 368 | "mode": "palette-classic" 369 | }, 370 | "custom": { 371 | "axisBorderShow": false, 372 | "axisCenteredZero": false, 373 | "axisColorMode": "text", 374 | "axisLabel": "", 375 | "axisPlacement": "auto", 376 | "barAlignment": 0, 377 | "drawStyle": "bars", 378 | "fillOpacity": 100, 379 | "gradientMode": "none", 380 | "hideFrom": { 381 | "legend": false, 382 | "tooltip": false, 383 | "viz": false 384 | }, 385 | "insertNulls": false, 386 | "lineInterpolation": "linear", 387 | "lineWidth": 1, 388 | "pointSize": 5, 389 | "scaleDistribution": { 390 | "type": "linear" 391 | }, 392 | "showPoints": "auto", 393 | "spanNulls": false, 394 | "stacking": { 395 | "group": "A", 
396 | "mode": "normal" 397 | }, 398 | "thresholdsStyle": { 399 | "mode": "off" 400 | } 401 | }, 402 | "decimals": 0, 403 | "fieldMinMax": false, 404 | "mappings": [], 405 | "thresholds": { 406 | "mode": "absolute", 407 | "steps": [ 408 | { 409 | "color": "text", 410 | "value": null 411 | } 412 | ] 413 | }, 414 | "unit": "short" 415 | }, 416 | "overrides": [ 417 | { 418 | "matcher": { 419 | "id": "byName", 420 | "options": "failed" 421 | }, 422 | "properties": [ 423 | { 424 | "id": "color", 425 | "value": { 426 | "fixedColor": "orange", 427 | "mode": "fixed" 428 | } 429 | }, 430 | { 431 | "id": "displayName", 432 | "value": "Failure" 433 | } 434 | ] 435 | }, 436 | { 437 | "matcher": { 438 | "id": "byName", 439 | "options": "success" 440 | }, 441 | "properties": [ 442 | { 443 | "id": "color", 444 | "value": { 445 | "fixedColor": "green", 446 | "mode": "fixed" 447 | } 448 | }, 449 | { 450 | "id": "displayName", 451 | "value": "Success" 452 | } 453 | ] 454 | } 455 | ] 456 | }, 457 | "gridPos": { 458 | "h": 7, 459 | "w": 6, 460 | "x": 12, 461 | "y": 7 462 | }, 463 | "id": 25, 464 | "interval": "1h", 465 | "options": { 466 | "legend": { 467 | "calcs": [], 468 | "displayMode": "list", 469 | "placement": "bottom", 470 | "showLegend": true 471 | }, 472 | "tooltip": { 473 | "mode": "multi", 474 | "sort": "none" 475 | } 476 | }, 477 | "pluginVersion": "10.4.0", 478 | "targets": [ 479 | { 480 | "datasource": { 481 | "type": "prometheus", 482 | "uid": "prometheus" 483 | }, 484 | "editorMode": "code", 485 | "exemplar": false, 486 | "expr": "sum by (status) (increase(coderd_provisionerd_job_timings_seconds_count[$__interval]))", 487 | "hide": false, 488 | "instant": false, 489 | "interval": "1h", 490 | "legendFormat": "__auto", 491 | "range": true, 492 | "refId": "A" 493 | } 494 | ], 495 | "title": "Build Count Per Hour", 496 | "type": "timeseries" 497 | }, 498 | { 499 | "datasource": { 500 | "type": "prometheus", 501 | "uid": "prometheus" 502 | }, 503 | "description": "", 504 | 
"gridPos": { 505 | "h": 7, 506 | "w": 6, 507 | "x": 18, 508 | "y": 7 509 | }, 510 | "id": 26, 511 | "options": { 512 | "code": { 513 | "language": "plaintext", 514 | "showLineNumbers": false, 515 | "showMiniMap": false 516 | }, 517 | "content": "_NOTE: this will not show the current hour._", 518 | "mode": "markdown" 519 | }, 520 | "pluginVersion": "10.4.0", 521 | "transparent": true, 522 | "type": "text" 523 | }, 524 | { 525 | "datasource": { 526 | "type": "prometheus", 527 | "uid": "prometheus" 528 | }, 529 | "description": "", 530 | "fieldConfig": { 531 | "defaults": { 532 | "color": { 533 | "mode": "palette-classic" 534 | }, 535 | "custom": { 536 | "axisBorderShow": false, 537 | "axisCenteredZero": false, 538 | "axisColorMode": "text", 539 | "axisLabel": "", 540 | "axisPlacement": "auto", 541 | "barAlignment": 0, 542 | "drawStyle": "bars", 543 | "fillOpacity": 100, 544 | "gradientMode": "none", 545 | "hideFrom": { 546 | "legend": false, 547 | "tooltip": false, 548 | "viz": false 549 | }, 550 | "insertNulls": false, 551 | "lineInterpolation": "linear", 552 | "lineWidth": 1, 553 | "pointSize": 5, 554 | "scaleDistribution": { 555 | "type": "linear" 556 | }, 557 | "showPoints": "never", 558 | "spanNulls": false, 559 | "stacking": { 560 | "group": "A", 561 | "mode": "none" 562 | }, 563 | "thresholdsStyle": { 564 | "mode": "off" 565 | } 566 | }, 567 | "fieldMinMax": false, 568 | "mappings": [], 569 | "thresholds": { 570 | "mode": "absolute", 571 | "steps": [ 572 | { 573 | "color": "text", 574 | "value": null 575 | } 576 | ] 577 | }, 578 | "unit": "s" 579 | }, 580 | "overrides": [ 581 | { 582 | "matcher": { 583 | "id": "byRegexp", 584 | "options": "/(Limit|Requested)/" 585 | }, 586 | "properties": [ 587 | { 588 | "id": "custom.drawStyle", 589 | "value": "line" 590 | }, 591 | { 592 | "id": "custom.fillOpacity", 593 | "value": 5 594 | }, 595 | { 596 | "id": "custom.lineStyle", 597 | "value": { 598 | "dash": [ 599 | 0, 600 | 10 601 | ], 602 | "fill": "dot" 603 | } 604 | } 
605 | ] 606 | }, 607 | { 608 | "matcher": { 609 | "id": "byName", 610 | "options": "Limit" 611 | }, 612 | "properties": [ 613 | { 614 | "id": "color", 615 | "value": { 616 | "fixedColor": "orange", 617 | "mode": "fixed" 618 | } 619 | } 620 | ] 621 | }, 622 | { 623 | "matcher": { 624 | "id": "byName", 625 | "options": "Requested" 626 | }, 627 | "properties": [ 628 | { 629 | "id": "color", 630 | "value": { 631 | "fixedColor": "green", 632 | "mode": "fixed" 633 | } 634 | } 635 | ] 636 | } 637 | ] 638 | }, 639 | "gridPos": { 640 | "h": 7, 641 | "w": 6, 642 | "x": 0, 643 | "y": 14 644 | }, 645 | "id": 28, 646 | "options": { 647 | "legend": { 648 | "calcs": [], 649 | "displayMode": "list", 650 | "placement": "bottom", 651 | "showLegend": true 652 | }, 653 | "tooltip": { 654 | "mode": "single", 655 | "sort": "none" 656 | } 657 | }, 658 | "pluginVersion": "10.4.0", 659 | "targets": [ 660 | { 661 | "datasource": { 662 | "type": "prometheus", 663 | "uid": "prometheus" 664 | }, 665 | "editorMode": "code", 666 | "exemplar": false, 667 | "expr": "sum by (pod) (rate(container_cpu_usage_seconds_total{ {{- include "provisionerd-selector" . -}} }[$__rate_interval]))", 668 | "hide": false, 669 | "instant": false, 670 | "legendFormat": "__auto", 671 | "range": true, 672 | "refId": "A" 673 | }, 674 | { 675 | "datasource": { 676 | "type": "prometheus", 677 | "uid": "prometheus" 678 | }, 679 | "editorMode": "code", 680 | "exemplar": false, 681 | "expr": "max(kube_pod_container_resource_limits{ {{- include "provisionerd-selector" . -}} , resource=\"cpu\"})", 682 | "hide": false, 683 | "instant": false, 684 | "legendFormat": "Limit", 685 | "range": true, 686 | "refId": "B" 687 | }, 688 | { 689 | "datasource": { 690 | "type": "prometheus", 691 | "uid": "prometheus" 692 | }, 693 | "editorMode": "code", 694 | "exemplar": false, 695 | "expr": "max(kube_pod_container_resource_requests{ {{- include "provisionerd-selector" . 
-}} , resource=\"cpu\"})", 696 | "hide": false, 697 | "instant": false, 698 | "legendFormat": "Requested", 699 | "range": true, 700 | "refId": "C" 701 | } 702 | ], 703 | "title": "CPU Usage Seconds", 704 | "type": "timeseries" 705 | }, 706 | { 707 | "datasource": { 708 | "type": "prometheus", 709 | "uid": "prometheus" 710 | }, 711 | "description": "", 712 | "gridPos": { 713 | "h": 7, 714 | "w": 6, 715 | "x": 6, 716 | "y": 14 717 | }, 718 | "id": 30, 719 | "options": { 720 | "code": { 721 | "language": "plaintext", 722 | "showLineNumbers": false, 723 | "showMiniMap": false 724 | }, 725 | "content": "The cumulative CPU used per core-second. If the process was using a full CPU core, that would be represented as 1 second.\n\nRequests & limits are shown if set.", 726 | "mode": "markdown" 727 | }, 728 | "pluginVersion": "10.4.0", 729 | "transparent": true, 730 | "type": "text" 731 | }, 732 | { 733 | "datasource": { 734 | "type": "prometheus", 735 | "uid": "prometheus" 736 | }, 737 | "description": "", 738 | "fieldConfig": { 739 | "defaults": { 740 | "color": { 741 | "mode": "palette-classic" 742 | }, 743 | "custom": { 744 | "axisBorderShow": false, 745 | "axisCenteredZero": false, 746 | "axisColorMode": "text", 747 | "axisLabel": "", 748 | "axisPlacement": "auto", 749 | "barAlignment": 0, 750 | "drawStyle": "bars", 751 | "fillOpacity": 100, 752 | "gradientMode": "none", 753 | "hideFrom": { 754 | "legend": false, 755 | "tooltip": false, 756 | "viz": false 757 | }, 758 | "insertNulls": false, 759 | "lineInterpolation": "linear", 760 | "lineWidth": 1, 761 | "pointSize": 5, 762 | "scaleDistribution": { 763 | "type": "linear" 764 | }, 765 | "showPoints": "never", 766 | "spanNulls": false, 767 | "stacking": { 768 | "group": "A", 769 | "mode": "none" 770 | }, 771 | "thresholdsStyle": { 772 | "mode": "off" 773 | } 774 | }, 775 | "fieldMinMax": false, 776 | "mappings": [], 777 | "thresholds": { 778 | "mode": "absolute", 779 | "steps": [ 780 | { 781 | "color": "text", 782 | 
"value": null 783 | } 784 | ] 785 | }, 786 | "unit": "bytes" 787 | }, 788 | "overrides": [ 789 | { 790 | "matcher": { 791 | "id": "byRegexp", 792 | "options": "/(Limit|Requested)/" 793 | }, 794 | "properties": [ 795 | { 796 | "id": "custom.drawStyle", 797 | "value": "line" 798 | }, 799 | { 800 | "id": "custom.fillOpacity", 801 | "value": 5 802 | }, 803 | { 804 | "id": "custom.lineStyle", 805 | "value": { 806 | "dash": [ 807 | 0, 808 | 10 809 | ], 810 | "fill": "dot" 811 | } 812 | } 813 | ] 814 | }, 815 | { 816 | "matcher": { 817 | "id": "byName", 818 | "options": "Limit" 819 | }, 820 | "properties": [ 821 | { 822 | "id": "color", 823 | "value": { 824 | "fixedColor": "orange", 825 | "mode": "fixed" 826 | } 827 | } 828 | ] 829 | }, 830 | { 831 | "matcher": { 832 | "id": "byName", 833 | "options": "Requested" 834 | }, 835 | "properties": [ 836 | { 837 | "id": "color", 838 | "value": { 839 | "fixedColor": "green", 840 | "mode": "fixed" 841 | } 842 | } 843 | ] 844 | } 845 | ] 846 | }, 847 | "gridPos": { 848 | "h": 7, 849 | "w": 6, 850 | "x": 12, 851 | "y": 14 852 | }, 853 | "id": 29, 854 | "options": { 855 | "legend": { 856 | "calcs": [], 857 | "displayMode": "list", 858 | "placement": "bottom", 859 | "showLegend": true 860 | }, 861 | "tooltip": { 862 | "mode": "single", 863 | "sort": "none" 864 | } 865 | }, 866 | "pluginVersion": "10.4.0", 867 | "targets": [ 868 | { 869 | "datasource": { 870 | "type": "prometheus", 871 | "uid": "prometheus" 872 | }, 873 | "editorMode": "code", 874 | "exemplar": false, 875 | "expr": "max by (pod) (container_memory_working_set_bytes{ {{- include "provisionerd-selector" . -}} })", 876 | "hide": false, 877 | "instant": false, 878 | "legendFormat": "__auto", 879 | "range": true, 880 | "refId": "A" 881 | }, 882 | { 883 | "datasource": { 884 | "type": "prometheus", 885 | "uid": "prometheus" 886 | }, 887 | "editorMode": "code", 888 | "exemplar": false, 889 | "expr": "max(kube_pod_container_resource_limits{ {{- include "provisionerd-selector" . 
-}} , resource=\"memory\"})", 890 | "hide": false, 891 | "instant": false, 892 | "legendFormat": "Limit", 893 | "range": true, 894 | "refId": "B" 895 | }, 896 | { 897 | "datasource": { 898 | "type": "prometheus", 899 | "uid": "prometheus" 900 | }, 901 | "editorMode": "code", 902 | "exemplar": false, 903 | "expr": "max(kube_pod_container_resource_requests{ {{- include "provisionerd-selector" . -}} , resource=\"memory\"})", 904 | "hide": false, 905 | "instant": false, 906 | "legendFormat": "Requested", 907 | "range": true, 908 | "refId": "C" 909 | } 910 | ], 911 | "title": "RAM Usage", 912 | "type": "timeseries" 913 | }, 914 | { 915 | "datasource": { 916 | "type": "prometheus", 917 | "uid": "prometheus" 918 | }, 919 | "description": "", 920 | "gridPos": { 921 | "h": 7, 922 | "w": 6, 923 | "x": 18, 924 | "y": 14 925 | }, 926 | "id": 31, 927 | "options": { 928 | "code": { 929 | "language": "plaintext", 930 | "showLineNumbers": false, 931 | "showMiniMap": false 932 | }, 933 | "content": "This shows the total memory used by each container; it is the same metric which the [OOM killer](https://www.kernel.org/doc/gorman/html/understand/understand016.html) uses.\n\nRequests & limits are shown if set.", 934 | "mode": "markdown" 935 | }, 936 | "pluginVersion": "10.4.0", 937 | "transparent": true, 938 | "type": "text" 939 | }, 940 | { 941 | "datasource": { 942 | "type": "loki", 943 | "uid": "loki" 944 | }, 945 | "gridPos": { 946 | "h": 18, 947 | "w": 18, 948 | "x": 0, 949 | "y": 21 950 | }, 951 | "id": 27, 952 | "options": { 953 | "dedupStrategy": "exact", 954 | "enableLogDetails": true, 955 | "prettifyLogMessage": false, 956 | "showCommonLabels": false, 957 | "showLabels": false, 958 | "showTime": true, 959 | "sortOrder": "Descending", 960 | "wrapLogMessage": false 961 | }, 962 | "targets": [ 963 | { 964 | "datasource": { 965 | "type": "loki", 966 | "uid": "loki" 967 | }, 968 | "editorMode": "code", 969 | "expr": "{ {{- include "non-workspace-selector" . 
-}}, logger=~\"(.*runner|terraform|provisioner.*)\"}", 970 | "queryType": "range", 971 | "refId": "A" 972 | } 973 | ], 974 | "title": "Logs", 975 | "type": "logs" 976 | }, 977 | { 978 | "datasource": { 979 | "type": "prometheus", 980 | "uid": "prometheus" 981 | }, 982 | "description": "", 983 | "gridPos": { 984 | "h": 7, 985 | "w": 6, 986 | "x": 18, 987 | "y": 21 988 | }, 989 | "id": 32, 990 | "options": { 991 | "code": { 992 | "language": "plaintext", 993 | "showLineNumbers": false, 994 | "showMiniMap": false 995 | }, 996 | "content": "This panel shows all logs across built-in and [external provisioners](https://coder.com/docs/v2/latest/admin/provisioners).", 997 | "mode": "markdown" 998 | }, 999 | "pluginVersion": "10.4.0", 1000 | "transparent": true, 1001 | "type": "text" 1002 | } 1003 | ], 1004 | "refresh": "{{- include "dashboard-refresh" . -}}", 1005 | "schemaVersion": 39, 1006 | "tags": [], 1007 | "templating": { 1008 | "list": [] 1009 | }, 1010 | "time": { 1011 | "from": "now-{{- include "dashboard-range" . -}}", 1012 | "to": "now" 1013 | }, 1014 | "timepicker": {}, 1015 | "timezone": "browser", 1016 | "title": "Provisioners", 1017 | "uid": "provisionerd", 1018 | "version": 10, 1019 | "weekStart": "" 1020 | } 1021 | {{ end }} -------------------------------------------------------------------------------- /coder-observability/templates/dashboards/configmap-dashboards-coderd.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: dashboards-coderd 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | coderd.json: |- {{- include "coderd-dashboard.json" . 
| trim | nindent 4 }} -------------------------------------------------------------------------------- /coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: dashboards-prebuilds 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | prebuilds.json: |- {{- include "prebuilds-dashboard.json" . | trim | nindent 4 }} -------------------------------------------------------------------------------- /coder-observability/templates/dashboards/configmap-dashboards-provisionerd.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: dashboards-provisionerd 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | provisionerd.json: |- {{- include "provisionerd-dashboard.json" . | trim | nindent 4 }} -------------------------------------------------------------------------------- /coder-observability/templates/dashboards/configmap-dashboards-status.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: dashboards-status 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | status.json: |- {{- include "status-dashboard.json" . | trim | nindent 4 }} -------------------------------------------------------------------------------- /coder-observability/templates/dashboards/configmap-dashboards-workspace_detail.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: dashboards-workspace-detail 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | workspaces-detail.json: |- {{- include "workspace-detail-dashboard.json" . 
| trim | nindent 4 }} -------------------------------------------------------------------------------- /coder-observability/templates/dashboards/configmap-dashboards-workspaces.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: dashboards-workspaces 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | workspaces.json: |- {{- include "workspaces-dashboard.json" . | trim | nindent 4 }} -------------------------------------------------------------------------------- /coder-observability/templates/service-runbook-viewer.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: runbook-viewer 6 | spec: 7 | ports: 8 | - port: 80 9 | targetPort: 3000 10 | protocol: TCP 11 | selector: 12 | app: runbook-viewer 13 | -------------------------------------------------------------------------------- /coder-observability/templates/statefulset-postgres-exporter.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: StatefulSet 4 | metadata: 5 | name: postgres-exporter 6 | namespace: {{ .Release.Namespace }} 7 | spec: 8 | selector: 9 | matchLabels: 10 | app: postgres-exporter 11 | serviceName: postgres-exporter 12 | replicas: 1 13 | template: 14 | metadata: 15 | annotations: 16 | prometheus.io/scrape: 'true' 17 | labels: 18 | app: postgres-exporter 19 | app.kubernetes.io/name: "database-stats" 20 | spec: 21 | containers: 22 | - name: postgres-exporter 23 | image: {{ .Values.global.postgres.exporter.image }} 24 | args: 25 | - --collector.long_running_transactions 26 | ports: 27 | - containerPort: 9187 28 | name: exporter 29 | env: 30 | - name: DATA_SOURCE_NAME 31 | value: '{{ include "postgres-connector-string" . }}' 32 | {{ include "postgres-secret-mount" . 
| nindent 10 }} 33 | 34 | volumeMounts: 35 | {{ toYaml .Values.global.postgres.volumeMounts | nindent 12 }} 36 | 37 | volumes: 38 | {{ toYaml .Values.global.postgres.volumes | nindent 8 }} -------------------------------------------------------------------------------- /coder-observability/templates/statefulset-runbook-viewer.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: StatefulSet 4 | metadata: 5 | name: runbook-viewer 6 | namespace: {{ .Release.Namespace }} 7 | spec: 8 | selector: 9 | matchLabels: 10 | app: runbook-viewer 11 | serviceName: runbook-viewer 12 | replicas: 1 13 | template: 14 | metadata: 15 | annotations: 16 | checksum/config: {{ (.Files.Glob "runbooks/**").AsConfig | indent 2 | sha256sum }} 17 | labels: 18 | app: runbook-viewer 19 | spec: 20 | containers: 21 | - name: madness 22 | image: {{ .Values.runbookViewer.image }} 23 | ports: 24 | - containerPort: 3000 25 | name: madness 26 | args: 27 | - server 28 | volumeMounts: 29 | - mountPath: /docs/ 30 | name: runbooks 31 | volumes: 32 | - name: runbooks 33 | configMap: 34 | name: runbooks 35 | -------------------------------------------------------------------------------- /coder-observability/templates/statefulset-sql-exporter.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: StatefulSet 4 | metadata: 5 | name: sql-exporter 6 | namespace: {{ .Release.Namespace }} 7 | spec: 8 | selector: 9 | matchLabels: 10 | app: sql-exporter 11 | serviceName: sql-exporter 12 | replicas: 1 13 | template: 14 | metadata: 15 | annotations: 16 | prometheus.io/scrape: 'true' 17 | checksum/config: {{ include (print $.Template.BasePath "/configmap-sql-exporter.yaml") . 
| sha256sum }} 18 | labels: 19 | app: sql-exporter 20 | app.kubernetes.io/name: "database-stats" 21 | spec: 22 | containers: 23 | - name: sql-exporter 24 | image: {{ .Values.sqlExporter.image }} 25 | args: 26 | - -config.file=/cfg/config.yaml 27 | ports: 28 | - containerPort: 9399 29 | name: exporter 30 | volumeMounts: 31 | - mountPath: /cfg/ 32 | name: config 33 | {{ include "postgres-secret-mount" . | nindent 10 }} 34 | volumes: 35 | - name: config 36 | configMap: 37 | name: sql-exporter-config 38 | -------------------------------------------------------------------------------- /coder-observability/values.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | coder: 3 | # global.coder.scrapeMetrics -- use this to scrape metrics from a standalone (set of) coder deployment(s) 4 | # if using kubernetes, rather add an annotation "prometheus.io/scrape=true" and coder will get automatically scraped; 5 | # set this value to null and configure coderdSelector to target your coder pods 6 | scrapeMetrics: null 7 | # hostname: localhost 8 | # port: 2112 9 | # scrapeInterval: 15s 10 | # additionalLabels: 11 | # job: coder 12 | # global.coder.coderdSelector -- series selector for Prometheus/Loki to locate coderd pods. 13 | # ensure this uses backticks for quotes! 14 | coderdSelector: 'pod=~`coder.*`, pod!~`.*provisioner.*`' 15 | # global.coder.provisionerdSelector -- series selector for Prometheus/Loki to locate provisioner pods. 16 | # https://coder.com/docs/v2/latest/admin/provisioners 17 | # TODO: rename container label in provisioner helm chart to be "provisioner" not "coder" 18 | # ensure this uses backticks for quotes! 19 | provisionerdSelector: 'pod=~`coder-provisioner.*`' 20 | # global.coder.workspacesSelector -- series selector for Prometheus/Loki to locate workspace pods.
21 | workspacesSelector: 'namespace=`coder-workspaces`' 22 | # global.coder.controlPlaneNamespace -- the namespace into which the control plane has been deployed. 23 | controlPlaneNamespace: coder 24 | # global.coder.externalProvisionersNamespace -- the namespace into which any external provisioners have been deployed. 25 | externalProvisionersNamespace: coder 26 | # See https://coder.com/docs/v2/latest/cli/server#--log-human 27 | # "Human" format is the default, which is a combination of plaintext and logfmt but it's quite tricky to parse reliably 28 | # with regex matchers. 29 | # TODO: support "json" format 30 | logFormat: human 31 | # global.coder.alerts -- alerts for the various aspects of Coder 32 | alerts: 33 | enterprise: 34 | groups: 35 | Licences: 36 | enabled: true 37 | delay: 1m 38 | thresholds: 39 | warning: 0.9 40 | critical: 1 41 | coderd: 42 | groups: 43 | CPU: 44 | enabled: true 45 | delay: 10m 46 | period: 10m 47 | thresholds: 48 | warning: 0.8 49 | critical: 0.9 50 | Memory: 51 | enabled: true 52 | delay: 10m 53 | thresholds: 54 | warning: 0.8 55 | critical: 0.9 56 | Restarts: 57 | enabled: true 58 | delay: 1m 59 | period: 10m 60 | thresholds: 61 | notify: 1 62 | warning: 2 63 | critical: 3 64 | Replicas: 65 | enabled: true 66 | delay: 5m 67 | thresholds: 68 | notify: 3 # 2/3 replicas are alive 69 | warning: 2 # 1/3 replicas are alive 70 | critical: 1 # 0/3 replicas are alive 71 | WorkspaceBuildFailures: 72 | enabled: true 73 | delay: 10m 74 | period: 10m 75 | thresholds: 76 | notify: 2 77 | warning: 5 78 | critical: 10 79 | IneligiblePrebuilds: 80 | enabled: true 81 | delay: 10m 82 | thresholds: 83 | notify: 1 84 | UnprovisionedPrebuiltWorkspaces: 85 | enabled: true 86 | delay: 10m 87 | thresholds: 88 | warn: 1 89 | provisionerd: 90 | groups: 91 | Replicas: 92 | enabled: true 93 | delay: 5m 94 | thresholds: 95 | notify: 3 # 2/3 replicas are alive 96 | warning: 2 # 1/3 replicas are alive 97 | critical: 1 # 0/3 replicas are alive 98 | 99 | zone:
svc 100 | 101 | externalScheme: http 102 | # The external hostname from which k8s services can be accessed in the form of: 103 | # <externalScheme>://<service>.<namespace>.<externalZone> 104 | # e.g. 105 | # http://dashboards.coder-observability.svc.cluster.local 106 | externalZone: svc.cluster.local 107 | 108 | # global.telemetry -- control telemetry collection 109 | telemetry: 110 | # global.telemetry.metrics -- control metric collection 111 | metrics: 112 | # global.telemetry.metrics.scrape_interval -- how often the collector will scrape discovered pods 113 | scrape_interval: 15s 114 | # global.telemetry.metrics.scrape_timeout -- how long a request will be allowed to wait before being canceled 115 | scrape_timeout: 12s 116 | 117 | # global.postgres -- postgres connection information 118 | # NOTE: these settings are global so we can parameterise some values which get rendered by subcharts 119 | postgres: 120 | hostname: localhost 121 | port: 5432 122 | username: coder 123 | password: 124 | database: coder 125 | sslmode: disable 126 | # add root cert path if using SSL 127 | sslrootcert: /home/coder/.postgresql/rootcert.pem 128 | 129 | # ensure that your secret has a field named `PGPASSWORD` 130 | mountSecret: "secret-postgres" 131 | exporter: 132 | image: "quay.io/prometheuscommunity/postgres-exporter" 133 | 134 | volumes: 135 | - name: "pg-certs-mount" 136 | configMap: 137 | name: "pg-certs-mount-config-map" 138 | 139 | volumeMounts: 140 | - name: "pg-certs-mount" 141 | mountPath: "/home/coder/.postgresql" 142 | readOnly: true 143 | 144 | # global.postgres.alerts -- alerts for postgres 145 | alerts: 146 | groups: 147 | Basic: 148 | enabled: true 149 | delay: 1m 150 | Notifications: 151 | enabled: true 152 | delay: 15m 153 | thresholds: 154 | notify: 0.5 155 | warning: 0.8 156 | critical: 0.9 157 | Connections: 158 | enabled: true 159 | delay: 5m 160 | thresholds: 161 | notify: 0.5 162 | warning: 0.8 163 | critical: 0.9 164 | 165 | # global.dashboards -- settings for bundled dashboards 166 | dashboards: 167 | #
global.dashboards.timerange -- how far back dashboards should look 168 | timerange: 12h 169 | # global.dashboards.refresh -- how often dashboards should refresh 170 | refresh: 30s 171 | # global.dashboards.queryTimeout -- how long (in seconds) a Grafana query may run before it times out 172 | queryTimeout: 900 173 | 174 | runbookViewer: 175 | image: "dannyben/madness" 176 | 177 | sqlExporter: 178 | image: "burningalchemist/sql_exporter" 179 | 180 | grafana-agent: 181 | enabled: true 182 | fullnameOverride: grafana-agent 183 | agent: 184 | mode: flow 185 | configMap: 186 | name: collector-config 187 | key: config.river 188 | create: false 189 | clustering: 190 | enabled: false 191 | extraArgs: 192 | - --disable-reporting=true 193 | mounts: 194 | varlog: true 195 | dockercontainers: true 196 | controller: 197 | type: daemonset 198 | podAnnotations: 199 | prometheus.io/scrape: "true" 200 | crds: 201 | create: false 202 | 203 | withOTLPReceiver: false 204 | 205 | # Configuration blocks 206 | # 207 | # Enable debug logging (warning: produces a large amount of logs!)
208 | #logging: |- 209 | # logging { 210 | # level = "debug" 211 | # format = "logfmt" 212 | # } 213 | discovery: |- 214 | // Discover k8s nodes 215 | discovery.kubernetes "nodes" { 216 | role = "node" 217 | } 218 | 219 | // Discover k8s pods 220 | discovery.kubernetes "pods" { 221 | role = "pod" 222 | selectors { 223 | role = "pod" 224 | } 225 | } 226 | commonRelabellings: |- 227 | rule { 228 | source_labels = ["__meta_kubernetes_namespace"] 229 | target_label = "namespace" 230 | } 231 | rule { 232 | source_labels = ["__meta_kubernetes_pod_name"] 233 | target_label = "pod" 234 | } 235 | // coalesce the following labels and pick the first value; we'll use this to define the "job" label 236 | rule { 237 | source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_component", "app", "__meta_kubernetes_pod_container_name"] 238 | separator = "/" 239 | target_label = "__meta_app" 240 | action = "replace" 241 | regex = "^/*([^/]+?)(?:/.*)?$" // split by the delimiter if it exists, we only want the first one 242 | replacement = "${1}" 243 | } 244 | rule { 245 | source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_app"] 246 | separator = "/" 247 | target_label = "job" 248 | } 249 | rule { 250 | source_labels = ["__meta_kubernetes_pod_container_name"] 251 | target_label = "container" 252 | } 253 | rule { 254 | regex = "__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)" 255 | action = "labeldrop" 256 | } 257 | rule { 258 | regex = "pod_template_generation" 259 | action = "labeldrop" 260 | } 261 | rule { 262 | source_labels = ["__meta_kubernetes_pod_phase"] 263 | regex = "Pending|Succeeded|Failed|Completed" 264 | action = "drop" 265 | } 266 | rule { 267 | source_labels = ["__meta_kubernetes_pod_node_name"] 268 | action = "replace" 269 | target_label = "node" 270 | } 271 | rule { 272 | action = "labelmap" 273 | regex = 
"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)" 274 | replacement = "__param_$1" 275 | } 276 | extraBlocks: "" 277 | # Examples: 278 | # loki.source.file "tmpfiles" { 279 | # targets = [ 280 | # {__path__ = "/tmp/foo.txt", "color" = "pink"}, 281 | # {__path__ = "/tmp/bar.txt", "color" = "blue"}, 282 | # {__path__ = "/tmp/baz.txt", "color" = "grey"}, 283 | # ] 284 | # forward_to = [loki.write.loki.receiver] 285 | # } 286 | podMetricsRelabelRules: "" 287 | podLogsRelabelRules: "" 288 | 289 | grafana: 290 | enabled: true 291 | image: 292 | tag: 10.4.19 293 | fullnameOverride: grafana 294 | useStatefulSet: true 295 | replicas: 1 296 | deploymentStrategy: 297 | type: Recreate # avoid MultiAttachError for standard-rwo sc 298 | service: 299 | enabled: true 300 | persistence: 301 | enabled: true 302 | size: 10Gi 303 | testFramework: 304 | enabled: false 305 | annotations: 306 | # TODO: this adds annotations to _all_ resources; can we be more specific? 307 | prometheus.io/scrape: "true" 308 | dashboardProviders: 309 | infra.yaml: 310 | apiVersion: 1 311 | providers: 312 | - name: infra 313 | orgId: 1 314 | folder: 'Infrastructure' 315 | type: file 316 | disableDeletion: false 317 | editable: false 318 | options: 319 | path: /var/lib/grafana/dashboards/infra 320 | coder.yaml: 321 | apiVersion: 1 322 | providers: 323 | - name: coder 324 | orgId: 1 325 | folder: 'Coder' 326 | type: file 327 | updateIntervalSeconds: 5 328 | disableDeletion: false 329 | editable: false 330 | options: 331 | path: /var/lib/grafana/dashboards/coder 332 | sidecar.yaml: 333 | apiVersion: 1 334 | providers: 335 | - name: sidecar 336 | orgId: 1 337 | type: file 338 | folder: 'Other' 339 | disableDeletion: false 340 | updateIntervalSeconds: 30 341 | editable: false 342 | options: 343 | path: /tmp/dashboards 344 | dashboards: 345 | # TODO: import dashboards from coder/coder 346 | infra: 347 | node-exporter-full: 348 | gnetId: 1860 349 | revision: 36 350 | datasource: metrics 351 | 
postgres-database: 352 | gnetId: 9628 353 | revision: 7 354 | datasource: metrics 355 | datasources: 356 | datasources.yaml: 357 | apiVersion: 1 358 | datasources: 359 | - name: metrics 360 | type: prometheus 361 | url: http://prometheus.{{ .Release.Namespace }}.{{ $.Values.global.zone }} 362 | access: proxy 363 | isDefault: true 364 | editable: false 365 | # add 5s on global timeout to distinguish between Grafana timeout & datasource timeout 366 | timeout: '{{ add $.Values.global.dashboards.queryTimeout 5 }}' 367 | uid: prometheus 368 | - name: logs 369 | type: loki 370 | url: http://loki-gateway.{{ .Release.Namespace }}.{{ $.Values.global.zone }} 371 | access: proxy 372 | isDefault: false 373 | editable: false 374 | # add 5s on global timeout to distinguish between Grafana timeout & datasource timeout 375 | timeout: '{{ add $.Values.global.dashboards.queryTimeout 5 }}' 376 | uid: loki 377 | - name: postgres 378 | type: postgres 379 | url: '{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}' 380 | user: '{{ .Values.global.postgres.username }}' 381 | secureJsonData: 382 | password: '{{ if .Values.global.postgres.password }}{{ .Values.global.postgres.password }}{{ else }}$PGPASSWORD{{ end }}' 383 | jsonData: 384 | sslmode: '{{ .Values.global.postgres.sslmode }}' 385 | isDefault: false 386 | editable: false 387 | # add 5s on global timeout to distinguish between Grafana timeout & datasource timeout 388 | timeout: '{{ add $.Values.global.dashboards.queryTimeout 5 }}' 389 | uid: postgres 390 | admin: 391 | existingSecret: "" 392 | env: 393 | GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION: true 394 | grafana.ini: 395 | auth.anonymous: 396 | enabled: true 397 | org_name: Main Org. 
398 | org_role: Admin 399 | analytics: 400 | reporting_enabled: false 401 | users: 402 | allow_sign_up: false 403 | feature_toggles: 404 | # migrate Angular panels to React 405 | # see https://grafana.com/docs/grafana/latest/developers/angular_deprecation/angular-plugins/#automatic-migration-of-plugins 406 | autoMigrateOldPanels: true 407 | dashboards: 408 | # mounted configmap will be synced with sidecar 409 | default_home_dashboard_path: /var/lib/grafana/dashboards/coder/0/status.json 410 | dataproxy: 411 | timeout: '{{ $.Values.global.dashboards.queryTimeout }}' 412 | sidecar: 413 | dashboards: 414 | provider: 415 | disableDelete: true 416 | allowUiUpdates: true 417 | enabled: false 418 | labelValue: "1" 419 | extraConfigmapMounts: 420 | # we can't combine configmaps because of the 1MiB size limit, but Grafana will scan 421 | # the /var/lib/grafana/dashboards/coder directory deeply to find dashboards 422 | - name: dashboards-status 423 | mountPath: /var/lib/grafana/dashboards/coder/0 424 | configMap: dashboards-status 425 | readOnly: false 426 | - name: dashboards-coderd 427 | mountPath: /var/lib/grafana/dashboards/coder/1 428 | configMap: dashboards-coderd 429 | readOnly: false 430 | - name: dashboards-provisionerd 431 | mountPath: /var/lib/grafana/dashboards/coder/2 432 | configMap: dashboards-provisionerd 433 | readOnly: false 434 | - name: dashboards-workspaces 435 | mountPath: /var/lib/grafana/dashboards/coder/3 436 | configMap: dashboards-workspaces 437 | readOnly: false 438 | - name: dashboards-workspace-detail 439 | mountPath: /var/lib/grafana/dashboards/coder/4 440 | configMap: dashboards-workspace-detail 441 | readOnly: false 442 | - name: dashboards-prebuilds 443 | mountPath: /var/lib/grafana/dashboards/coder/5 444 | configMap: dashboards-prebuilds 445 | readOnly: false 446 | 447 | prometheus: 448 | enabled: true 449 | server: 450 | fullnameOverride: prometheus 451 | podAnnotations: 452 | prometheus.io/scrape: "true" 453 | 454 | global: 455 | # 
prometheus.server.evaluation_interval -- how often to evaluate recording & alerting rule groups 456 | evaluation_interval: 30s 457 | 458 | extraArgs: 459 | log.level: debug 460 | 461 | replicaCount: 1 462 | statefulSet: 463 | enabled: true 464 | 465 | retentionSize: 10GB 466 | persistentVolume: 467 | enabled: true 468 | # Note: allowing +2GB breathing room above storage.tsdb.retention.size 469 | size: 12Gi 470 | service: 471 | type: ClusterIP 472 | extraFlags: 473 | - web.enable-lifecycle 474 | - enable-feature=remote-write-receiver 475 | extraConfigmapMounts: 476 | - name: alerts 477 | mountPath: /etc/config/alerts 478 | configMap: metrics-alerts 479 | readonly: true 480 | 481 | serverFiles: 482 | prometheus.yml: 483 | # disables scraping of metrics by the Prometheus helm chart since this is managed by the collector 484 | scrape_configs: [] 485 | # use custom rule files to be able to render templates (can't do that in values.yaml, unless that value is evaluated by a tpl call) 486 | rule_files: 487 | - /etc/config/alerts/*.yaml 488 | 489 | testFramework: 490 | enabled: false 491 | 492 | # enable metric collection from configmap reloader 493 | configmapReload: 494 | prometheus: 495 | extraArgs: 496 | log-level: all 497 | watch-interval: 15s 498 | containerPort: 9091 499 | extraConfigmapMounts: 500 | - name: alerts 501 | mountPath: /etc/config/alerts 502 | configMap: metrics-alerts 503 | readonly: true 504 | 505 | alertmanager: 506 | fullnameOverride: alertmanager 507 | enabled: true 508 | service: 509 | port: 80 510 | podAnnotations: 511 | prometheus.io/scrape: "true" 512 | kube-state-metrics: 513 | fullnameOverride: kube-state-metrics 514 | enabled: true 515 | podAnnotations: 516 | prometheus.io/scrape: "true" 517 | prometheus-node-exporter: 518 | fullnameOverride: node-exporter 519 | enabled: true 520 | podAnnotations: 521 | prometheus.io/scrape: "true" 522 | 523 | # Disable push gateway 524 | prometheus-pushgateway: 525 | enabled: false 526 | 527 | loki: 528 | 
enabled: true 529 | nameOverride: loki 530 | fullnameOverride: loki 531 | 532 | enterprise: 533 | enabled: false 534 | adminApi: 535 | enabled: false 536 | useExternalLicense: false 537 | 538 | test: 539 | canaryServiceAddress: "http://loki-canary:3500/metrics" 540 | enabled: true 541 | 542 | minio: 543 | enabled: true 544 | fullnameOverride: loki-storage 545 | address: loki-storage.{{ .Release.Namespace }}.{{ .Values.global.zone}}:9000 546 | podAnnotations: 547 | prometheus.io/scrape: "true" 548 | prometheus.io/path: "/minio/v2/metrics/cluster" 549 | podLabels: 550 | app.kubernetes.io/name: "loki-storage" 551 | 552 | loki: 553 | auth_enabled: false 554 | commonConfig: 555 | path_prefix: /var/loki 556 | replication_factor: 1 557 | schemaConfig: 558 | configs: 559 | - from: 2024-04-01 560 | store: tsdb 561 | object_store: s3 562 | schema: v13 563 | index: 564 | prefix: index_ 565 | period: 24h 566 | 567 | rulerConfig: 568 | remote_write: 569 | enabled: true 570 | clients: 571 | # "fake" is the default username when auth is disabled (unfortunate, I know) 572 | fake: 573 | url: http://prometheus.{{ .Release.Namespace }}.{{ .Values.global.zone}}/api/v1/write 574 | headers: 575 | Source: Loki 576 | remote_timeout: 30s 577 | wal: 578 | dir: /var/loki-ruler-wal 579 | alertmanager_url: http://alertmanager.{{ .Release.Namespace }}.{{ .Values.global.zone}} 580 | enable_api: true 581 | ring: 582 | kvstore: 583 | store: inmemory 584 | enable_alertmanager_v2: true 585 | storage: 586 | type: local 587 | local: 588 | directory: /rules 589 | rule_path: /rules 590 | 591 | lokiCanary: 592 | enabled: true 593 | annotations: 594 | prometheus.io/scrape: "true" 595 | 596 | chunksCache: 597 | allocatedMemory: 1024 598 | resultsCache: 599 | allocatedMemory: 1024 600 | 601 | # disabled scraping of logs by the Loki helm chart since this is managed by the collector 602 | monitoring: 603 | selfMonitoring: 604 | enabled: false 605 | grafanaAgent: 606 | installOperator: false 607 | # creates 
ConfigMaps of dashboards which are discovered via labels 608 | dashboards: 609 | enabled: true 610 | 611 | sidecar: 612 | rules: 613 | logLevel: DEBUG 614 | folder: /rules/fake 615 | 616 | gateway: 617 | replicas: 1 618 | write: 619 | podAnnotations: 620 | prometheus.io/scrape: "true" 621 | replicas: 1 622 | extraArgs: 623 | - -log.level=debug 624 | read: 625 | podAnnotations: 626 | prometheus.io/scrape: "true" 627 | replicas: 1 628 | backend: 629 | podAnnotations: 630 | prometheus.io/scrape: "true" 631 | replicas: 1 632 | extraVolumes: 633 | - name: ruler-wal 634 | emptyDir: { } 635 | extraVolumeMounts: 636 | - name: ruler-wal 637 | mountPath: /var/loki-ruler-wal 638 | extraArgs: 639 | - -log.level=debug 640 | -------------------------------------------------------------------------------- /scripts/check-unstaged.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | source "$(dirname "${BASH_SOURCE[0]}")/lib.sh" 4 | 5 | check_unstaged -------------------------------------------------------------------------------- /scripts/compile.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # check versions 5 | HELM_VERSION=3.17 6 | YQ_VERSION=4.42 7 | [[ "$(helm version)" == *v${HELM_VERSION}* ]] || { echo "Expected helm version v${HELM_VERSION} but got $(helm version)" >&2; exit 1; } 8 | [[ "$(yq --version)" == *v${YQ_VERSION}* ]] || { echo "Expected yq version v${YQ_VERSION} but got $(yq --version)" >&2; exit 1; } 9 | 10 | source "$(dirname "${BASH_SOURCE[0]}")/lib.sh" 11 | 12 | helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 13 | helm repo add grafana https://grafana.github.io/helm-charts 14 | helm --repository-cache /tmp/cache repo update 15 | # Check for unexpected changes. 16 | # Helm dependencies are versioned using ^ which accepts minor & patch changes: 17 | # e.g. 
^1.2.3 is equivalent to >= 1.2.3 < 2.0.0 18 | helm dependency update coder-observability/ 19 | # We *expect* that the versions will change in the rendered template output, so we ignore those, but 20 | # if there are changes to the manifests themselves then we need to fail the build to force manual review. 21 | helm template --namespace coder-observability -f coder-observability/values.yaml coder-observability coder-observability/ | \ 22 | yq e 'del(.spec.template.spec.containers[].image, .metadata.labels."helm.sh/chart", .metadata.labels."app.kubernetes.io/version")' - \ 23 | > compiled/resources.yaml 24 | 25 | check_unstaged "compiled" -------------------------------------------------------------------------------- /scripts/lib.sh: -------------------------------------------------------------------------------- 1 | function check_unstaged() { 2 | FILES="$(git ls-files --other --modified --exclude-standard -- ${1:-.})" 3 | if [[ "$FILES" != "" ]]; then 4 | mapfile -t files <<<"$FILES" 5 | 6 | echo 7 | echo "The following files contain unstaged changes:" 8 | echo 9 | for file in "${files[@]}"; do 10 | echo " - $file" 11 | done 12 | 13 | echo 14 | echo "These are the changes:" 15 | echo 16 | for file in "${files[@]}"; do 17 | git --no-pager diff "$file" 1>&2 18 | done 19 | 20 | echo 21 | echo >&2 "Unstaged changes, see above for details." 
22 | exit 1 23 | fi 24 | } -------------------------------------------------------------------------------- /scripts/lint-rules.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | temp_dir="$(mktemp -d)" 5 | rules_file="${temp_dir}/rules.yaml" 6 | helm template coder-o11y coder-observability -f coder-observability/values.yaml --show-only templates/configmap-prometheus-alerts.yaml > ${rules_file} 7 | 8 | for key in $(yq e '.data | keys' -o csv ${rules_file} | tr ',' "\n"); do 9 | file="${temp_dir}/${key}" 10 | echo "=========================== [${file}] ===========================" 11 | 12 | yq e ".data[\"${key}\"]" ${rules_file} > ${file} 13 | go run github.com/cloudflare/pint/cmd/pint@latest -l DEBUG lint ${file} 14 | done -------------------------------------------------------------------------------- /scripts/publish.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euox pipefail 3 | 4 | version=$("$(dirname "${BASH_SOURCE[0]}")/version.sh") 5 | mkdir -p build/helm 6 | helm package coder-observability --version=${version} --dependency-update --destination build/helm 7 | gsutil cp gs://helm.coder.com/observability/index.yaml build/helm/index.yaml 8 | helm repo index build/helm --url https://helm.coder.com/observability --merge build/helm/index.yaml 9 | gsutil -h "Cache-Control:no-cache,max-age=0" cp build/helm/index.yaml gs://helm.coder.com/observability/ 10 | gsutil -h "Cache-Control:no-cache,max-age=0" cp build/helm/coder-observability-${version}.tgz gs://helm.coder.com/observability/ 11 | gsutil -h "Cache-Control:no-cache,max-age=0" cp artifacthub-repo.yaml gs://helm.coder.com/observability/ 12 | 13 | echo $version -------------------------------------------------------------------------------- /scripts/version.sh: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env bash

# Prints the version derived from the most recent Git tag.
#
# With no arguments (or -c/--current) the latest tag is printed WITHOUT the "v"
# prefix that is included in the Git tag.
# With -b/--bump <major|minor|patch> the next version is printed; it keeps the
# tag's "v" prefix (if the tag has one).

set -euo pipefail

# Highest tag according to version ordering; empty when the repository has no tags.
current_version="$(git tag -l | sort --version-sort | tail -n1)"

# Prints usage information and exits successfully.
function help() {
  echo "$0 [options] [arguments]"
  echo " "
  echo "options:"
  echo "-h, --help show brief help"
  echo "-c, --current show the current version"
  echo "-b, --bump bump the version based on the given argument"
  exit 0
}

# Prints the version that follows $current_version.
#   $1 - which component to increment: "major", "minor" or "patch".
# Exits with status 1 on any other argument.
function bump_version() {
  local part=$1
  local prefix=""
  local awk_prog

  # The numeric work must happen on the bare version: awk converts a field
  # such as "v1" to 0, which made every major bump come out as "1.0.0".
  if [[ $current_version == v* ]]; then
    prefix="v"
  fi

  # %d coerces missing or non-numeric components to 0, so an empty tag list
  # yields e.g. "0.1.0" instead of ".1.0".
  case "$part" in
    major) awk_prog='{ printf "%d.0.0\n", $1 + 1 }' ;;
    minor) awk_prog='{ printf "%d.%d.0\n", $1, $2 + 1 }' ;;
    patch) awk_prog='{ printf "%d.%d.%d\n", $1, $2, $3 + 1 }' ;;
    *)
      echo >&2 "Error: Unknown argument $part"
      exit 1
      ;;
  esac

  echo "${prefix}$(echo "${current_version#v}" | awk -F. "$awk_prog")"
}

# Prints the latest tag with a leading "v" removed.
function show_current() {
  echo "${current_version#v}"
}

# No arguments: behave like --current.
if [[ $# -eq 0 ]]; then
  show_current
fi

while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      help
      ;;
    -c|--current)
      show_current
      shift
      ;;
    -b|--bump)
      # --bump consumes the following argument as the component name.
      if [[ $# -lt 2 ]]; then
        echo >&2 "Error: Missing argument for bump"
        exit 1
      fi
      shift
      bump_version "$1"
      shift
      ;;
    *)
      echo >&2 "Error: Unknown argument $1"
      exit 1
      ;;
  esac
done