├── .gitattributes
├── .github
└── workflows
│ ├── lint.yaml
│ ├── nightly-build.yaml
│ └── release.yaml
├── .gitignore
├── CHANGELOG.md
├── CODEOWNERS
├── LICENSE
├── Makefile
├── PUBLISH.md
├── README.gotmpl
├── README.md
├── artifacthub-repo.yaml
├── coder-observability
├── .helmignore
├── Chart.lock
├── Chart.yaml
├── runbooks
│ ├── coderd.md
│ ├── postgres.md
│ └── provisionerd.md
├── templates
│ ├── _collector-config.tpl
│ ├── _helpers.tpl
│ ├── configmap-collector.yaml
│ ├── configmap-prometheus-alerts.yaml
│ ├── configmap-runbooks.yaml
│ ├── configmap-sql-exporter.yaml
│ ├── dashboards
│ │ ├── _dashboards_coderd.json.tpl
│ │ ├── _dashboards_prebuilds.json.tpl
│ │ ├── _dashboards_provisionerd.json.tpl
│ │ ├── _dashboards_status.json.tpl
│ │ ├── _dashboards_workspace_detail.json.tpl
│ │ ├── _dashboards_workspaces.json.tpl
│ │ ├── configmap-dashboards-coderd.yaml
│ │ ├── configmap-dashboards-prebuilds.yaml
│ │ ├── configmap-dashboards-provisionerd.yaml
│ │ ├── configmap-dashboards-status.yaml
│ │ ├── configmap-dashboards-workspace_detail.yaml
│ │ └── configmap-dashboards-workspaces.yaml
│ ├── service-runbook-viewer.yaml
│ ├── statefulset-postgres-exporter.yaml
│ ├── statefulset-runbook-viewer.yaml
│ └── statefulset-sql-exporter.yaml
└── values.yaml
├── compiled
└── resources.yaml
└── scripts
├── check-unstaged.sh
├── compile.sh
├── lib.sh
├── lint-rules.sh
├── publish.sh
└── version.sh
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.tpl linguist-language=go
--------------------------------------------------------------------------------
/.github/workflows/lint.yaml:
--------------------------------------------------------------------------------
1 | # Lints the Helm chart and the Prometheus alert rules (`make lint`) on every
2 | # push to main and on every pull request targeting main.
3 | name: Lint
4 | on:
5 |   push:
6 |     branches:
7 |       - main
8 |   pull_request:
9 |     branches:
10 |       - main
11 |
12 | jobs:
13 |   lint:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - name: Checkout code
17 |         uses: actions/checkout@v4
18 |
19 |       # Go is needed because the Makefile's `readme` target (a dependency of
20 |       # `lint`) installs helm-docs with `go install`.
21 |       - name: Setup Go
22 |         uses: actions/setup-go@v5
23 |         with:
24 |           # Quoted so YAML keeps the version a string (a bare 1.20 loads as 1.2).
25 |           go-version: '1.23'
26 |           # The repository tree shows no go.sum, so there is nothing to key a
27 |           # dependency cache on.
28 |           cache: false
29 |
30 |       - name: Install Helm
31 |         uses: azure/setup-helm@v4
32 |         with:
33 |           version: v3.17.1
34 |
35 |       - name: Install yq
36 |         run: |
37 |           sudo wget https://github.com/mikefarah/yq/releases/download/v4.42.1/yq_linux_amd64 -O /usr/bin/yq &&\
38 |           sudo chmod +x /usr/bin/yq
39 |
40 |       - name: Lint Helm chart and rules
41 |         run: make lint
--------------------------------------------------------------------------------
/.github/workflows/nightly-build.yaml:
--------------------------------------------------------------------------------
1 | # Builds the chart every night (and on manual dispatch). The build log is
2 | # uploaded as an artifact and, when the build fails, filed as a GitHub issue.
3 | name: Nightly build
4 |
5 | on:
6 |   schedule:
7 |     - cron: '0 0 * * *'
8 |   workflow_dispatch: # Allows manual triggering of the workflow
9 |
10 | # The issue-creation step opens issues with the workflow token, so grant that
11 | # explicitly instead of relying on the repository's default token permissions.
12 | permissions:
13 |   contents: read
14 |   issues: write
15 |
16 | jobs:
17 |   nightly-build:
18 |     runs-on: ubuntu-latest
19 |
20 |     steps:
21 |       - name: Checkout code
22 |         uses: actions/checkout@v4
23 |
24 |       - name: Setup Go
25 |         uses: actions/setup-go@v5
26 |         with:
27 |           # Quoted so YAML keeps the version a string (a bare 1.20 loads as 1.2);
28 |           # same Go version as the lint workflow.
29 |           go-version: '1.23'
30 |           # The repository tree shows no go.sum, so there is nothing to key a
31 |           # dependency cache on.
32 |           cache: false
33 |
34 |       - name: Install Helm
35 |         uses: azure/setup-helm@v4
36 |         with:
37 |           version: v3.17.1
38 |
39 |       - name: Install yq
40 |         run: |
41 |           sudo wget https://github.com/mikefarah/yq/releases/download/v4.42.1/yq_linux_amd64 -O /usr/bin/yq &&\
42 |           sudo chmod +x /usr/bin/yq
43 |
44 |       # stdout and stderr both go to output.log so the later steps can publish it.
45 |       - name: make build
46 |         run: |
47 |           make build > output.log 2>&1
48 |         continue-on-error: false
49 |
50 |       # always(): without it this step is skipped when the build fails, which
51 |       # is exactly when the log is needed.
52 |       - name: Upload script output
53 |         if: always()
54 |         uses: actions/upload-artifact@v4
55 |         with:
56 |           name: script-output
57 |           path: output.log
58 |
59 |       - name: Create issue from file on failure
60 |         if: failure()
61 |         uses: peter-evans/create-issue-from-file@v5
62 |         with:
63 |           title: nightly build failure
64 |           content-filepath: output.log
65 |           assignees: dannykopping
--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
1 | # GitHub release workflow: publishes the Helm chart whenever a `v*` tag is pushed.
2 | name: publish-helm
3 | on:
4 |   push:
5 |     tags:
6 |       - 'v*'
7 |
8 | permissions:
9 |   # Required to publish a release
10 |   contents: write
11 |   # Necessary to push docker images to ghcr.io.
12 |   packages: write
13 |   # Necessary for GCP authentication (https://github.com/google-github-actions/setup-gcloud#usage)
14 |   id-token: write
15 |
16 | concurrency: ${{ github.workflow }}-${{ github.ref }}
17 |
18 | jobs:
19 |   release:
20 |     name: Build and publish
21 |     runs-on: ubuntu-latest
22 |     outputs:
23 |       # NOTE(review): no step in this job declares `id: version`, so this
24 |       # output is always empty; confirm whether anything still relies on it.
25 |       version: ${{ steps.version.outputs.version }}
26 |     steps:
27 |       - name: Checkout
28 |         uses: actions/checkout@v4
29 |         with:
30 |           fetch-depth: 0
31 |
32 |       # If the event that triggered the build was an annotated tag (which our
33 |       # tags are supposed to be), actions/checkout has a bug where the tag in
34 |       # question is only a lightweight tag and not a full annotated tag. This
35 |       # command seems to fix it.
36 |       # https://github.com/actions/checkout/issues/290
37 |       - name: Fetch git tags
38 |         run: git fetch --tags --force
39 |
40 |       - name: Authenticate to Google Cloud
41 |         uses: google-github-actions/auth@v2
42 |         with:
43 |           workload_identity_provider: projects/898976630798/locations/global/workloadIdentityPools/coder-ci/providers/github-actions
44 |           service_account: coder-observability@coder-customer-releases.iam.gserviceaccount.com
45 |
46 |       - name: Setup GCloud SDK
47 |         uses: "google-github-actions/setup-gcloud@v2"
48 |
49 |       # Same Helm version the lint and nightly-build workflows validate with.
50 |       - name: Install helm
51 |         uses: azure/setup-helm@v4
52 |         with:
53 |           version: v3.17.1
54 |
55 |       # Unconditional: this workflow is only triggered by tag pushes and
56 |       # declares no inputs that a dry-run guard could read.
57 |       - name: Publish Helm Chart
58 |         run: |
59 |           ./scripts/publish.sh
60 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | charts/
2 | build/
3 | scratch
4 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # CHANGELOG
2 |
3 | ## v0.3.0
4 |
5 | - Adding prebuilt workspace dashboard & alerts
6 |
7 | ## v0.2.1
8 |
9 | - Upgraded subcharts
10 | - Loki: upgraded from v6.7.1 -> v6.7.3
11 | - FIX: `listen-address` duplicate removed in `prometheus-config-reloader`
12 |
13 | ## v0.2.0
14 |
15 | - Upgraded subcharts
16 | - Grafana: upgraded from v7.3.7 -> v7.3.12
17 | - Prometheus: upgraded from v25.18.0 -> v25.24.1
18 | - Loki: upgraded from v6.3.4 -> v6.7.1
19 |
20 | ## v0.1.0
21 |
22 | - Lint Helm chart in CI
23 |
24 | ## v0.0.2 -> v0.0.11
25 |
26 | - Several stability & configurability improvements
27 |
28 | ## v0.0.1
29 |
30 | - Initial release
31 |
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @dannykopping
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12 | HEREUNDER.
13 |
14 | Statement of Purpose
15 |
16 | The laws of most jurisdictions throughout the world automatically confer
17 | exclusive Copyright and Related Rights (defined below) upon the creator
18 | and subsequent owner(s) (each and all, an "owner") of an original work of
19 | authorship and/or a database (each, a "Work").
20 |
21 | Certain owners wish to permanently relinquish those rights to a Work for
22 | the purpose of contributing to a commons of creative, cultural and
23 | scientific works ("Commons") that the public can reliably and without fear
24 | of later claims of infringement build upon, modify, incorporate in other
25 | works, reuse and redistribute as freely as possible in any form whatsoever
26 | and for any purposes, including without limitation commercial purposes.
27 | These owners may contribute to the Commons to promote the ideal of a free
28 | culture and the further production of creative, cultural and scientific
29 | works, or to gain reputation or greater distribution for their Work in
30 | part through the use and efforts of others.
31 |
32 | For these and/or other purposes and motivations, and without any
33 | expectation of additional consideration or compensation, the person
34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35 | is an owner of Copyright and Related Rights in the Work, voluntarily
36 | elects to apply CC0 to the Work and publicly distribute the Work under its
37 | terms, with knowledge of his or her Copyright and Related Rights in the
38 | Work and the meaning and intended legal effect of CC0 on those rights.
39 |
40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
41 | protected by copyright and related or neighboring rights ("Copyright and
42 | Related Rights"). Copyright and Related Rights include, but are not
43 | limited to, the following:
44 |
45 | i. the right to reproduce, adapt, distribute, perform, display,
46 | communicate, and translate a Work;
47 | ii. moral rights retained by the original author(s) and/or performer(s);
48 | iii. publicity and privacy rights pertaining to a person's image or
49 | likeness depicted in a Work;
50 | iv. rights protecting against unfair competition in regards to a Work,
51 | subject to the limitations in paragraph 4(a), below;
52 | v. rights protecting the extraction, dissemination, use and reuse of data
53 | in a Work;
54 | vi. database rights (such as those arising under Directive 96/9/EC of the
55 | European Parliament and of the Council of 11 March 1996 on the legal
56 | protection of databases, and under any national implementation
57 | thereof, including any amended or successor version of such
58 | directive); and
59 | vii. other similar, equivalent or corresponding rights throughout the
60 | world based on applicable law or treaty, and any national
61 | implementations thereof.
62 |
63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
65 | irrevocably and unconditionally waives, abandons, and surrenders all of
66 | Affirmer's Copyright and Related Rights and associated claims and causes
67 | of action, whether now known or unknown (including existing as well as
68 | future claims and causes of action), in the Work (i) in all territories
69 | worldwide, (ii) for the maximum duration provided by applicable law or
70 | treaty (including future time extensions), (iii) in any current or future
71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
72 | including without limitation commercial, advertising or promotional
73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74 | member of the public at large and to the detriment of Affirmer's heirs and
75 | successors, fully intending that such Waiver shall not be subject to
76 | revocation, rescission, cancellation, termination, or any other legal or
77 | equitable action to disrupt the quiet enjoyment of the Work by the public
78 | as contemplated by Affirmer's express Statement of Purpose.
79 |
80 | 3. Public License Fallback. Should any part of the Waiver for any reason
81 | be judged legally invalid or ineffective under applicable law, then the
82 | Waiver shall be preserved to the maximum extent permitted taking into
83 | account Affirmer's express Statement of Purpose. In addition, to the
84 | extent the Waiver is so judged Affirmer hereby grants to each affected
85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
88 | maximum duration provided by applicable law or treaty (including future
89 | time extensions), (iii) in any current or future medium and for any number
90 | of copies, and (iv) for any purpose whatsoever, including without
91 | limitation commercial, advertising or promotional purposes (the
92 | "License"). The License shall be deemed effective as of the date CC0 was
93 | applied by Affirmer to the Work. Should any part of the License for any
94 | reason be judged legally invalid or ineffective under applicable law, such
95 | partial invalidity or ineffectiveness shall not invalidate the remainder
96 | of the License, and in such case Affirmer hereby affirms that he or she
97 | will not (i) exercise any of his or her remaining Copyright and Related
98 | Rights in the Work or (ii) assert any associated claims and causes of
99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 |
102 | 4. Limitations and Disclaimers.
103 |
104 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 | surrendered, licensed or otherwise affected by this document.
106 | b. Affirmer offers the Work as-is and makes no representations or
107 | warranties of any kind concerning the Work, express, implied,
108 | statutory or otherwise, including without limitation warranties of
109 | title, merchantability, fitness for a particular purpose, non
110 | infringement, or the absence of latent or other defects, accuracy, or
111 | the present or absence of errors, whether or not discoverable, all to
112 | the greatest extent permissible under applicable law.
113 | c. Affirmer disclaims responsibility for clearing rights of other persons
114 | that may apply to the Work or any use thereof, including without
115 | limitation any person's Copyright and Related Rights in the Work.
116 | Further, Affirmer disclaims responsibility for obtaining any necessary
117 | consents, permissions or other rights required for any use of the
118 | Work.
119 | d. Affirmer understands and acknowledges that Creative Commons is not a
120 | party to this document and has no duty or obligation with respect to
121 | this CC0 or use of the Work.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Use a single bash shell for each job, and immediately exit on failure
2 | SHELL := bash
3 | .SHELLFLAGS = -ceu
4 | .ONESHELL:
5 |
6 | # This doesn't work on directories.
7 | # See https://stackoverflow.com/questions/25752543/make-delete-on-error-for-directory-targets
8 | .DELETE_ON_ERROR:
9 |
10 | # Default target.
11 | all: lint
12 | .PHONY: all
13 |
14 | # Full check: build, lint the chart and the rules, regenerate the README, then
15 | # run scripts/check-unstaged.sh.
16 | lint: build lint/helm lint/rules readme
17 | 	./scripts/check-unstaged.sh
18 | .PHONY: lint
19 |
20 | lint/helm: lint/helm/coder-observability
21 | .PHONY: lint/helm
22 |
23 | # `helm lint --strict` on the chart, with coder.image.tag set to the version
24 | # printed by scripts/version.sh.
25 | lint/helm/coder-observability:
26 | 	helm lint --strict --set coder.image.tag=v$(shell ./scripts/version.sh) coder-observability/
27 | .PHONY: lint/helm/coder-observability
28 |
29 | build:
30 | 	./scripts/compile.sh
31 | .PHONY: build
32 |
33 | lint/rules: lint/helm/prometheus-rules
34 | .PHONY: lint/rules
35 |
36 | lint/helm/prometheus-rules:
37 | 	@./scripts/lint-rules.sh
38 |
39 | .PHONY: lint/helm/prometheus-rules
40 |
41 | # Usage: publish-patch, publish-minor, publish-major
42 | # Publishing is handled by GitHub Actions, triggered by tag creation.
43 | publish-%:
44 | 	version=$(shell ./scripts/version.sh --bump $*) && \
45 | 	git tag --sign "$$version" -m "Release: $$version" && \
46 | 	git push origin tag "$$version"
47 |
48 | # Regenerate README.md from README.gotmpl and the chart's values.yaml with helm-docs.
49 | readme:
50 | 	go install github.com/norwoodj/helm-docs/cmd/helm-docs@latest
51 | 	helm-docs --output-file ../README.md \
52 | 		--values-file=values.yaml --chart-search-root=coder-observability --template-files=../README.gotmpl
53 | # Declared phony like every other target here, so a file named `readme` can never mask it.
54 | .PHONY: readme
--------------------------------------------------------------------------------
/PUBLISH.md:
--------------------------------------------------------------------------------
1 | # Publishing the Coder Observability Chart
2 |
3 | - make desired changes
4 | - run `make publish-{major|minor|patch}` which creates & pushes a new tag, which kicks off a GH Action to publish the chart
--------------------------------------------------------------------------------
/README.gotmpl:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # Coder Observability Chart
5 |
6 | > [!NOTE]
7 | > This Helm chart is in BETA; use with caution
8 |
9 | ## Overview
10 |
11 | This chart contains a highly opinionated set of integrations between Grafana, Loki, Prometheus, Alertmanager, and
12 | Grafana Agent.
13 |
14 | Dashboards, alerts, and runbooks are preconfigured for monitoring [Coder](https://coder.com/) installations.
15 |
16 | Out of the box:
17 |
18 | Metrics will be scraped from all pods which have a `prometheus.io/scrape=true` annotation.
19 | Logs will be scraped from all pods in the Kubernetes cluster.
20 |
21 | ## Installation
22 |
23 |
24 |
25 | ```bash
26 | helm repo add coder-observability https://helm.coder.com/observability
27 | helm upgrade --install coder-observability coder-observability/coder-observability --version 0.1.1 --namespace coder-observability --create-namespace
28 | ```
29 |
30 | ## Requirements
31 |
32 | ### General
33 |
34 | - Helm 3.7+
35 |
36 | ### Coder
37 |
38 |
39 | Kubernetes-based deployments
40 | If your installation is not in a namespace named `coder`, you will need to modify:
41 |
42 | ```yaml
43 | global:
44 | coder:
45 | controlPlaneNamespace:
46 | externalProvisionersNamespace:
47 | ```
48 |
49 |
50 |
51 |
52 | Non-Kubernetes deployments (click to expand)
53 | Ensure your Coder installation is accessible to the resources created by this chart.
54 |
55 | Set `global.coder.scrapeMetrics` such that the metrics can be scraped from your installation, e.g.:
56 |
57 | ```yaml
58 | global:
59 | coder:
60 | scrapeMetrics:
61 | hostname: your.coder.host
62 | port: 2112
63 | scrapeInterval: 15s
64 | additionalLabels:
65 | job: coder
66 | ```
67 |
68 | If you would like your logs scraped from a process outside Kubernetes, you need to mount the log file(s) in and
69 | configure Grafana Agent to scrape them; here's an example configuration:
70 |
71 | ```yaml
72 | grafana-agent:
73 | agent:
74 | mounts:
75 | extra:
76 | - mountPath: /var/log
77 | name: logs
78 | readOnly: true
79 | controller:
80 | volumes:
81 | extra:
82 | - hostPath:
83 | path: /var/log
84 | name: logs
85 |
86 | extraBlocks: |-
87 | loki.source.file "coder_log" {
88 | targets = [
89 | {__path__ = "/var/log/coder.log", job="coder"},
90 | ]
91 | forward_to = [loki.write.loki.receiver]
92 | }
93 | ```
94 |
95 |
96 |
97 | Ensure these environment variables are set in your Coder deployment:
98 |
99 | - `CODER_PROMETHEUS_ENABLE=true`
100 | - `CODER_PROMETHEUS_COLLECT_AGENT_STATS=true`
101 | - `CODER_LOGGING_HUMAN=/dev/stderr` (only `human` log format is supported
102 | currently; [issue](https://github.com/coder/observability/issues/8))
103 |
104 | Ensure these annotations exist on your Coder & provisioner deployments:
105 |
106 | - `prometheus.io/scrape=true`
107 | - `prometheus.io/port=2112` (ensure this matches the port defined by `CODER_PROMETHEUS_ADDRESS`)
108 |
109 | If you use the [`coder/coder` helm chart](https://github.com/coder/coder/tree/main/helm), you can use the
110 | following:
111 |
112 | ```yaml
113 | coder:
114 | podAnnotations:
115 | prometheus.io/scrape: "true"
116 | prometheus.io/port: "2112"
117 | ```
118 |
119 | For more details, see
120 | the [coder documentation on exposing Prometheus metrics](https://coder.com/docs/v2/latest/admin/prometheus).
121 |
122 | ### Postgres
123 |
124 | You may configure the Helm chart to monitor your Coder deployment's Postgres server. Ensure that the resources created
125 | by this Helm chart can access your Postgres server.
126 |
127 | Create a secret with your Postgres password and reference it as follows, along with the other connection details:
128 |
129 | ```yaml
130 | global:
131 | postgres:
132 | hostname:
133 | port:
134 | database:
135 | username:
136 | mountSecret:
137 | ```
138 |
139 | The secret should be in the form of `PGPASSWORD=<your password>`, as this secret will be used to create an environment
140 | variable.
141 |
142 | ```yaml
143 | apiVersion: v1
144 | kind: Secret
145 | metadata:
146 | name: pg-secret
147 | namespace: coder-observability
148 | data:
149 | PGPASSWORD:
150 | ```
151 |
152 |
153 | Postgres metrics (click to expand)
154 |
155 | A tool called [`postgres-exporter`](https://github.com/prometheus-community/postgres_exporter) is used to scrape metrics
156 | from your Postgres server, and you can see the metrics it is exposing as follows:
157 |
158 | ```bash
159 | kubectl -n coder-observability port-forward statefulset/postgres-exporter 9187
160 |
161 | curl http://localhost:9187/metrics
162 | ```
163 |
164 |
165 |
166 | ### Grafana
167 |
168 | To access Grafana, run:
169 |
170 | ```bash
171 | kubectl -n coder-observability port-forward svc/grafana 3000:80
172 | ```
173 |
174 | And open your web browser to http://localhost:3000/.
175 |
176 | By default, Grafana is configured to allow anonymous access; if you want password authentication, define this in
177 | your `values.yaml`:
178 |
179 | ```yaml
180 | grafana:
181 | admin:
182 | existingSecret: grafana-admin
183 | userKey: username
184 | passwordKey: password
185 | grafana.ini:
186 | auth.anonymous:
187 | enabled: false
188 | ```
189 |
190 | You will also need to define a secret as follows:
191 |
192 | ```yaml
193 | apiVersion: v1
194 | kind: Secret
195 | metadata:
196 | name: grafana-admin # this matches the "existingSecret" field above
197 | stringData:
198 | username: "" # this matches the "userKey" field above
199 | password: "" # this matches the "passwordKey" field above
200 | ```
201 |
202 | To add an Ingress for Grafana, define this in your `values.yaml`:
203 |
204 | ```yaml
205 | grafana:
206 | grafana.ini:
207 | server:
208 | domain: observability.example.com
209 | root_url: "%(protocol)s://%(domain)s/grafana"
210 | serve_from_sub_path: true
211 | ingress:
212 | enabled: true
213 | hosts:
214 | - "observability.example.com"
215 | path: "/"
216 | ```
217 |
218 | ## Subcharts
219 |
220 | {{ template "chart.requirementsTable" . }}
221 |
222 | Each subchart can be disabled by setting the `enabled` field to `false`.
223 |
224 | | Subchart | Setting |
225 | |-----------------|-------------------------|
226 | | `grafana` | `grafana.enabled` |
227 | | `grafana-agent` | `grafana-agent.enabled` |
228 | | `loki` | `loki.enabled` |
229 | | `prometheus` | `prometheus.enabled` |
230 |
231 | ## Values
232 |
233 | The `global` values are the values which pertain to this chart, while the rest pertain to the subcharts.
234 | These values represent only the values _set_ in this chart. For the full list of available values, please see each
235 | subchart.
236 |
237 | For example, the `grafana.replicas` value is set by this chart by default, and is one of hundreds of available
238 | values which are defined [here](https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration).
239 |
240 | {{ template "chart.valuesTable" . }}
241 |
242 | {{ template "helm-docs.versionFooter" . }}
243 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # Coder Observability Chart
5 |
6 | > [!NOTE]
7 | > This Helm chart is in BETA; use with caution
8 |
9 | ## Overview
10 |
11 | This chart contains a highly opinionated set of integrations between Grafana, Loki, Prometheus, Alertmanager, and
12 | Grafana Agent.
13 |
14 | Dashboards, alerts, and runbooks are preconfigured for monitoring [Coder](https://coder.com/) installations.
15 |
16 | Out of the box:
17 |
18 | Metrics will be scraped from all pods which have a `prometheus.io/scrape=true` annotation.
19 | Logs will be scraped from all pods in the Kubernetes cluster.
20 |
21 | ## Installation
22 |
23 |
24 |
25 | ```bash
26 | helm repo add coder-observability https://helm.coder.com/observability
27 | helm upgrade --install coder-observability coder-observability/coder-observability --version 0.1.1 --namespace coder-observability --create-namespace
28 | ```
29 |
30 | ## Requirements
31 |
32 | ### General
33 |
34 | - Helm 3.7+
35 |
36 | ### Coder
37 |
38 |
39 | Kubernetes-based deployments
40 | If your installation is not in a namespace named `coder`, you will need to modify:
41 |
42 | ```yaml
43 | global:
44 | coder:
45 | controlPlaneNamespace:
46 | externalProvisionersNamespace:
47 | ```
48 |
49 |
50 |
51 |
52 | Non-Kubernetes deployments (click to expand)
53 | Ensure your Coder installation is accessible to the resources created by this chart.
54 |
55 | Set `global.coder.scrapeMetrics` such that the metrics can be scraped from your installation, e.g.:
56 |
57 | ```yaml
58 | global:
59 | coder:
60 | scrapeMetrics:
61 | hostname: your.coder.host
62 | port: 2112
63 | scrapeInterval: 15s
64 | additionalLabels:
65 | job: coder
66 | ```
67 |
68 | If you would like your logs scraped from a process outside Kubernetes, you need to mount the log file(s) in and
69 | configure Grafana Agent to scrape them; here's an example configuration:
70 |
71 | ```yaml
72 | grafana-agent:
73 | agent:
74 | mounts:
75 | extra:
76 | - mountPath: /var/log
77 | name: logs
78 | readOnly: true
79 | controller:
80 | volumes:
81 | extra:
82 | - hostPath:
83 | path: /var/log
84 | name: logs
85 |
86 | extraBlocks: |-
87 | loki.source.file "coder_log" {
88 | targets = [
89 | {__path__ = "/var/log/coder.log", job="coder"},
90 | ]
91 | forward_to = [loki.write.loki.receiver]
92 | }
93 | ```
94 |
95 |
96 |
97 | Ensure these environment variables are set in your Coder deployment:
98 |
99 | - `CODER_PROMETHEUS_ENABLE=true`
100 | - `CODER_PROMETHEUS_COLLECT_AGENT_STATS=true`
101 | - `CODER_LOGGING_HUMAN=/dev/stderr` (only `human` log format is supported
102 | currently; [issue](https://github.com/coder/observability/issues/8))
103 |
104 | Ensure these annotations exist on your Coder & provisioner deployments:
105 |
106 | - `prometheus.io/scrape=true`
107 | - `prometheus.io/port=2112` (ensure this matches the port defined by `CODER_PROMETHEUS_ADDRESS`)
108 |
109 | If you use the [`coder/coder` helm chart](https://github.com/coder/coder/tree/main/helm), you can use the
110 | following:
111 |
112 | ```yaml
113 | coder:
114 | podAnnotations:
115 | prometheus.io/scrape: "true"
116 | prometheus.io/port: "2112"
117 | ```
118 |
119 | For more details, see
120 | the [coder documentation on exposing Prometheus metrics](https://coder.com/docs/v2/latest/admin/prometheus).
121 |
122 | ### Postgres
123 |
124 | You may configure the Helm chart to monitor your Coder deployment's Postgres server. Ensure that the resources created
125 | by this Helm chart can access your Postgres server.
126 |
127 | Create a secret with your Postgres password and reference it as follows, along with the other connection details:
128 |
129 | ```yaml
130 | global:
131 | postgres:
132 | hostname:
133 | port:
134 | database:
135 | username:
136 | mountSecret:
137 | ```
138 |
139 | The secret should be in the form of `PGPASSWORD=<your password>`, as this secret will be used to create an environment
140 | variable.
141 |
142 | ```yaml
143 | apiVersion: v1
144 | kind: Secret
145 | metadata:
146 | name: pg-secret
147 | namespace: coder-observability
148 | data:
149 | PGPASSWORD:
150 | ```
151 |
152 |
153 | Postgres metrics (click to expand)
154 |
155 | A tool called [`postgres-exporter`](https://github.com/prometheus-community/postgres_exporter) is used to scrape metrics
156 | from your Postgres server, and you can see the metrics it is exposing as follows:
157 |
158 | ```bash
159 | kubectl -n coder-observability port-forward statefulset/postgres-exporter 9187
160 |
161 | curl http://localhost:9187/metrics
162 | ```
163 |
164 |
165 |
166 | ### Grafana
167 |
168 | To access Grafana, run:
169 |
170 | ```bash
171 | kubectl -n coder-observability port-forward svc/grafana 3000:80
172 | ```
173 |
174 | And open your web browser to http://localhost:3000/.
175 |
176 | By default, Grafana is configured to allow anonymous access; if you want password authentication, define this in
177 | your `values.yaml`:
178 |
179 | ```yaml
180 | grafana:
181 | admin:
182 | existingSecret: grafana-admin
183 | userKey: username
184 | passwordKey: password
185 | grafana.ini:
186 | auth.anonymous:
187 | enabled: false
188 | ```
189 |
190 | You will also need to define a secret as follows:
191 |
192 | ```yaml
193 | apiVersion: v1
194 | kind: Secret
195 | metadata:
196 | name: grafana-admin # this matches the "existingSecret" field above
197 | stringData:
198 | username: "" # this matches the "userKey" field above
199 | password: "" # this matches the "passwordKey" field above
200 | ```
201 |
202 | To add an Ingress for Grafana, define this in your `values.yaml`:
203 |
204 | ```yaml
205 | grafana:
206 | grafana.ini:
207 | server:
208 | domain: observability.example.com
209 | root_url: "%(protocol)s://%(domain)s/grafana"
210 | serve_from_sub_path: true
211 | ingress:
212 | enabled: true
213 | hosts:
214 | - "observability.example.com"
215 | path: "/"
216 | ```
217 |
218 | ## Subcharts
219 |
220 | | Repository | Name | Version |
221 | |------------|------|---------|
222 | | https://grafana.github.io/helm-charts | grafana | ~v7.3.7 |
223 | | https://grafana.github.io/helm-charts | grafana-agent(grafana-agent) | ~0.37.0 |
224 | | https://grafana.github.io/helm-charts | loki | ~v6.7.3 |
225 | | https://prometheus-community.github.io/helm-charts | prometheus | ~v25.24.1 |
226 |
227 | Each subchart can be disabled by setting the `enabled` field to `false`.
228 |
229 | | Subchart | Setting |
230 | |-----------------|-------------------------|
231 | | `grafana` | `grafana.enabled` |
232 | | `grafana-agent` | `grafana-agent.enabled` |
233 | | `loki` | `loki.enabled` |
234 | | `prometheus` | `prometheus.enabled` |
235 |
236 | ## Values
237 |
238 | The `global` values are the values which pertain to this chart, while the rest pertain to the subcharts.
239 | These values represent only the values _set_ in this chart. For the full list of available values, please see each
240 | subchart.
241 |
242 | For example, the `grafana.replicas` value is set by this chart by default, and is one of hundreds of available
243 | values which are defined [here](https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration).
244 |
245 | | Key | Type | Default | Description |
246 | |-----|------|---------|-------------|
247 | | global.coder.alerts | object | `{"coderd":{"groups":{"CPU":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":0.9,"warning":0.8}},"IneligiblePrebuilds":{"delay":"10m","enabled":true,"thresholds":{"notify":1}},"Memory":{"delay":"10m","enabled":true,"thresholds":{"critical":0.9,"warning":0.8}},"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}},"Restarts":{"delay":"1m","enabled":true,"period":"10m","thresholds":{"critical":3,"notify":1,"warning":2}},"UnprovisionedPrebuiltWorkspaces":{"delay":"10m","enabled":true,"thresholds":{"warn":1}},"WorkspaceBuildFailures":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":10,"notify":2,"warning":5}}}},"enterprise":{"groups":{"Licences":{"delay":"1m","enabled":true,"thresholds":{"critical":1,"warning":0.9}}}},"provisionerd":{"groups":{"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}}}}}` | alerts for the various aspects of Coder |
248 | | global.coder.coderdSelector | string | `"pod=~`coder.*`, pod!~`.*provisioner.*`"` | series selector for Prometheus/Loki to locate coderd pods. ensure this uses backticks for quotes! |
249 | | global.coder.controlPlaneNamespace | string | `"coder"` | the namespace into which the control plane has been deployed. |
250 | | global.coder.externalProvisionersNamespace | string | `"coder"` | the namespace into which any external provisioners have been deployed. |
251 | | global.coder.logFormat | string | `"human"` | |
252 | | global.coder.provisionerdSelector | string | `"pod=~`coder-provisioner.*`"` | series selector for Prometheus/Loki to locate provisioner pods. https://coder.com/docs/v2/latest/admin/provisioners TODO: rename container label in provisioner helm chart to be "provisioner" not "coder" ensure this uses backticks for quotes! |
253 | | global.coder.scrapeMetrics | string | `nil` | use this to scrape metrics from a standalone (set of) coder deployment(s). if using kubernetes, rather add an annotation "prometheus.io/scrape=true" so that coder gets scraped automatically; in that case, set this value to null and configure coderdSelector to target your coder pods |
254 | | global.coder.workspacesSelector | string | `"namespace=`coder-workspaces`"` | series selector for Prometheus/Loki to locate workspace pods. ensure this uses backticks for quotes! |
255 | | global.dashboards | object | `{"queryTimeout":900,"refresh":"30s","timerange":"12h"}` | settings for bundled dashboards |
256 | | global.dashboards.queryTimeout | int | `900` | how long a query in Grafana may run before it times out |
257 | | global.dashboards.refresh | string | `"30s"` | how often dashboards should refresh |
258 | | global.dashboards.timerange | string | `"12h"` | how far back dashboards should look |
259 | | global.externalScheme | string | `"http"` | |
260 | | global.externalZone | string | `"svc.cluster.local"` | |
261 | | global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","sslrootcert":"/home/coder/.postgresql/rootcert.pem","username":"coder","volumeMounts":[{"mountPath":"/home/coder/.postgresql","name":"pg-certs-mount","readOnly":true}],"volumes":[{"configMap":{"name":"pg-certs-mount-config-map"},"name":"pg-certs-mount"}]}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts |
262 | | global.postgres.alerts | object | `{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}}` | alerts for postgres |
263 | | global.telemetry | object | `{"metrics":{"scrape_interval":"15s","scrape_timeout":"12s"}}` | control telemetry collection |
264 | | global.telemetry.metrics | object | `{"scrape_interval":"15s","scrape_timeout":"12s"}` | control metric collection |
265 | | global.telemetry.metrics.scrape_interval | string | `"15s"` | how often the collector will scrape discovered pods |
266 | | global.telemetry.metrics.scrape_timeout | string | `"12s"` | how long a request will be allowed to wait before being canceled |
267 | | global.zone | string | `"svc"` | |
268 | | grafana-agent.agent.clustering.enabled | bool | `false` | |
269 | | grafana-agent.agent.configMap.create | bool | `false` | |
270 | | grafana-agent.agent.configMap.key | string | `"config.river"` | |
271 | | grafana-agent.agent.configMap.name | string | `"collector-config"` | |
272 | | grafana-agent.agent.extraArgs[0] | string | `"--disable-reporting=true"` | |
273 | | grafana-agent.agent.mode | string | `"flow"` | |
274 | | grafana-agent.agent.mounts.dockercontainers | bool | `true` | |
275 | | grafana-agent.agent.mounts.varlog | bool | `true` | |
276 | | grafana-agent.commonRelabellings | string | `"rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n}\nrule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n}\n// coalesce the following labels and pick the first value; we'll use this to define the \"job\" label\nrule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_component\", \"app\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n target_label = \"__meta_app\"\n action = \"replace\"\n regex = \"^/*([^/]+?)(?:/.*)?$\" // split by the delimiter if it exists, we only want the first one\n replacement = \"${1}\"\n}\nrule {\n source_labels = [\"__meta_kubernetes_namespace\", \"__meta_kubernetes_pod_label_app_kubernetes_io_name\", \"__meta_app\"]\n separator = \"/\"\n target_label = \"job\"\n}\nrule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n}\nrule {\n regex = \"__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)\"\n action = \"labeldrop\"\n}\nrule {\n regex = \"pod_template_generation\"\n action = \"labeldrop\"\n}\nrule {\n source_labels = [\"__meta_kubernetes_pod_phase\"]\n regex = \"Pending|Succeeded|Failed|Completed\"\n action = \"drop\"\n}\nrule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"node\"\n}\nrule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)\"\n replacement = \"__param_$1\"\n}"` | |
277 | | grafana-agent.controller.podAnnotations."prometheus.io/scrape" | string | `"true"` | |
278 | | grafana-agent.controller.type | string | `"daemonset"` | |
279 | | grafana-agent.crds.create | bool | `false` | |
280 | | grafana-agent.discovery | string | `"// Discover k8s nodes\ndiscovery.kubernetes \"nodes\" {\n role = \"node\"\n}\n\n// Discover k8s pods\ndiscovery.kubernetes \"pods\" {\n role = \"pod\"\n selectors {\n role = \"pod\"\n }\n}"` | |
281 | | grafana-agent.enabled | bool | `true` | |
282 | | grafana-agent.extraBlocks | string | `""` | |
283 | | grafana-agent.fullnameOverride | string | `"grafana-agent"` | |
284 | | grafana-agent.podLogsRelabelRules | string | `""` | |
285 | | grafana-agent.podMetricsRelabelRules | string | `""` | |
286 | | grafana-agent.withOTLPReceiver | bool | `false` | |
287 | | grafana."grafana.ini"."auth.anonymous".enabled | bool | `true` | |
288 | | grafana."grafana.ini"."auth.anonymous".org_name | string | `"Main Org."` | |
289 | | grafana."grafana.ini"."auth.anonymous".org_role | string | `"Admin"` | |
290 | | grafana."grafana.ini".analytics.reporting_enabled | bool | `false` | |
291 | | grafana."grafana.ini".dashboards.default_home_dashboard_path | string | `"/var/lib/grafana/dashboards/coder/0/status.json"` | |
292 | | grafana."grafana.ini".dataproxy.timeout | string | `"{{ $.Values.global.dashboards.queryTimeout }}"` | |
293 | | grafana."grafana.ini".feature_toggles.autoMigrateOldPanels | bool | `true` | |
294 | | grafana."grafana.ini".users.allow_sign_up | bool | `false` | |
295 | | grafana.admin.existingSecret | string | `""` | |
296 | | grafana.annotations."prometheus.io/scrape" | string | `"true"` | |
297 | | grafana.dashboardProviders."coder.yaml".apiVersion | int | `1` | |
298 | | grafana.dashboardProviders."coder.yaml".providers[0].disableDeletion | bool | `false` | |
299 | | grafana.dashboardProviders."coder.yaml".providers[0].editable | bool | `false` | |
300 | | grafana.dashboardProviders."coder.yaml".providers[0].folder | string | `"Coder"` | |
301 | | grafana.dashboardProviders."coder.yaml".providers[0].name | string | `"coder"` | |
302 | | grafana.dashboardProviders."coder.yaml".providers[0].options.path | string | `"/var/lib/grafana/dashboards/coder"` | |
303 | | grafana.dashboardProviders."coder.yaml".providers[0].orgId | int | `1` | |
304 | | grafana.dashboardProviders."coder.yaml".providers[0].type | string | `"file"` | |
305 | | grafana.dashboardProviders."coder.yaml".providers[0].updateIntervalSeconds | int | `5` | |
306 | | grafana.dashboardProviders."infra.yaml".apiVersion | int | `1` | |
307 | | grafana.dashboardProviders."infra.yaml".providers[0].disableDeletion | bool | `false` | |
308 | | grafana.dashboardProviders."infra.yaml".providers[0].editable | bool | `false` | |
309 | | grafana.dashboardProviders."infra.yaml".providers[0].folder | string | `"Infrastructure"` | |
310 | | grafana.dashboardProviders."infra.yaml".providers[0].name | string | `"infra"` | |
311 | | grafana.dashboardProviders."infra.yaml".providers[0].options.path | string | `"/var/lib/grafana/dashboards/infra"` | |
312 | | grafana.dashboardProviders."infra.yaml".providers[0].orgId | int | `1` | |
313 | | grafana.dashboardProviders."infra.yaml".providers[0].type | string | `"file"` | |
314 | | grafana.dashboardProviders."sidecar.yaml".apiVersion | int | `1` | |
315 | | grafana.dashboardProviders."sidecar.yaml".providers[0].disableDeletion | bool | `false` | |
316 | | grafana.dashboardProviders."sidecar.yaml".providers[0].editable | bool | `false` | |
317 | | grafana.dashboardProviders."sidecar.yaml".providers[0].folder | string | `"Other"` | |
318 | | grafana.dashboardProviders."sidecar.yaml".providers[0].name | string | `"sidecar"` | |
319 | | grafana.dashboardProviders."sidecar.yaml".providers[0].options.path | string | `"/tmp/dashboards"` | |
320 | | grafana.dashboardProviders."sidecar.yaml".providers[0].orgId | int | `1` | |
321 | | grafana.dashboardProviders."sidecar.yaml".providers[0].type | string | `"file"` | |
322 | | grafana.dashboardProviders."sidecar.yaml".providers[0].updateIntervalSeconds | int | `30` | |
323 | | grafana.dashboards.infra.node-exporter-full.datasource | string | `"metrics"` | |
324 | | grafana.dashboards.infra.node-exporter-full.gnetId | int | `1860` | |
325 | | grafana.dashboards.infra.node-exporter-full.revision | int | `36` | |
326 | | grafana.dashboards.infra.postgres-database.datasource | string | `"metrics"` | |
327 | | grafana.dashboards.infra.postgres-database.gnetId | int | `9628` | |
328 | | grafana.dashboards.infra.postgres-database.revision | int | `7` | |
329 | | grafana.datasources."datasources.yaml".apiVersion | int | `1` | |
330 | | grafana.datasources."datasources.yaml".datasources[0].access | string | `"proxy"` | |
331 | | grafana.datasources."datasources.yaml".datasources[0].editable | bool | `false` | |
332 | | grafana.datasources."datasources.yaml".datasources[0].isDefault | bool | `true` | |
333 | | grafana.datasources."datasources.yaml".datasources[0].name | string | `"metrics"` | |
334 | | grafana.datasources."datasources.yaml".datasources[0].timeout | string | `"{{ add $.Values.global.dashboards.queryTimeout 5 }}"` | |
335 | | grafana.datasources."datasources.yaml".datasources[0].type | string | `"prometheus"` | |
336 | | grafana.datasources."datasources.yaml".datasources[0].uid | string | `"prometheus"` | |
337 | | grafana.datasources."datasources.yaml".datasources[0].url | string | `"http://prometheus.{{ .Release.Namespace }}.{{ $.Values.global.zone }}"` | |
338 | | grafana.datasources."datasources.yaml".datasources[1].access | string | `"proxy"` | |
339 | | grafana.datasources."datasources.yaml".datasources[1].editable | bool | `false` | |
340 | | grafana.datasources."datasources.yaml".datasources[1].isDefault | bool | `false` | |
341 | | grafana.datasources."datasources.yaml".datasources[1].name | string | `"logs"` | |
342 | | grafana.datasources."datasources.yaml".datasources[1].timeout | string | `"{{ add $.Values.global.dashboards.queryTimeout 5 }}"` | |
343 | | grafana.datasources."datasources.yaml".datasources[1].type | string | `"loki"` | |
344 | | grafana.datasources."datasources.yaml".datasources[1].uid | string | `"loki"` | |
345 | | grafana.datasources."datasources.yaml".datasources[1].url | string | `"http://loki-gateway.{{ .Release.Namespace }}.{{ $.Values.global.zone }}"` | |
346 | | grafana.datasources."datasources.yaml".datasources[2].editable | bool | `false` | |
347 | | grafana.datasources."datasources.yaml".datasources[2].isDefault | bool | `false` | |
348 | | grafana.datasources."datasources.yaml".datasources[2].jsonData.sslmode | string | `"{{ .Values.global.postgres.sslmode }}"` | |
349 | | grafana.datasources."datasources.yaml".datasources[2].name | string | `"postgres"` | |
350 | | grafana.datasources."datasources.yaml".datasources[2].secureJsonData.password | string | `"{{ if .Values.global.postgres.password }}{{ .Values.global.postgres.password }}{{ else }}$PGPASSWORD{{ end }}"` | |
351 | | grafana.datasources."datasources.yaml".datasources[2].timeout | string | `"{{ add $.Values.global.dashboards.queryTimeout 5 }}"` | |
352 | | grafana.datasources."datasources.yaml".datasources[2].type | string | `"postgres"` | |
353 | | grafana.datasources."datasources.yaml".datasources[2].uid | string | `"postgres"` | |
354 | | grafana.datasources."datasources.yaml".datasources[2].url | string | `"{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}"` | |
355 | | grafana.datasources."datasources.yaml".datasources[2].user | string | `"{{ .Values.global.postgres.username }}"` | |
356 | | grafana.deploymentStrategy.type | string | `"Recreate"` | |
357 | | grafana.enabled | bool | `true` | |
358 | | grafana.env.GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION | bool | `true` | |
359 | | grafana.extraConfigmapMounts[0].configMap | string | `"dashboards-status"` | |
360 | | grafana.extraConfigmapMounts[0].mountPath | string | `"/var/lib/grafana/dashboards/coder/0"` | |
361 | | grafana.extraConfigmapMounts[0].name | string | `"dashboards-status"` | |
362 | | grafana.extraConfigmapMounts[0].readOnly | bool | `false` | |
363 | | grafana.extraConfigmapMounts[1].configMap | string | `"dashboards-coderd"` | |
364 | | grafana.extraConfigmapMounts[1].mountPath | string | `"/var/lib/grafana/dashboards/coder/1"` | |
365 | | grafana.extraConfigmapMounts[1].name | string | `"dashboards-coderd"` | |
366 | | grafana.extraConfigmapMounts[1].readOnly | bool | `false` | |
367 | | grafana.extraConfigmapMounts[2].configMap | string | `"dashboards-provisionerd"` | |
368 | | grafana.extraConfigmapMounts[2].mountPath | string | `"/var/lib/grafana/dashboards/coder/2"` | |
369 | | grafana.extraConfigmapMounts[2].name | string | `"dashboards-provisionerd"` | |
370 | | grafana.extraConfigmapMounts[2].readOnly | bool | `false` | |
371 | | grafana.extraConfigmapMounts[3].configMap | string | `"dashboards-workspaces"` | |
372 | | grafana.extraConfigmapMounts[3].mountPath | string | `"/var/lib/grafana/dashboards/coder/3"` | |
373 | | grafana.extraConfigmapMounts[3].name | string | `"dashboards-workspaces"` | |
374 | | grafana.extraConfigmapMounts[3].readOnly | bool | `false` | |
375 | | grafana.extraConfigmapMounts[4].configMap | string | `"dashboards-workspace-detail"` | |
376 | | grafana.extraConfigmapMounts[4].mountPath | string | `"/var/lib/grafana/dashboards/coder/4"` | |
377 | | grafana.extraConfigmapMounts[4].name | string | `"dashboards-workspace-detail"` | |
378 | | grafana.extraConfigmapMounts[4].readOnly | bool | `false` | |
379 | | grafana.extraConfigmapMounts[5].configMap | string | `"dashboards-prebuilds"` | |
380 | | grafana.extraConfigmapMounts[5].mountPath | string | `"/var/lib/grafana/dashboards/coder/5"` | |
381 | | grafana.extraConfigmapMounts[5].name | string | `"dashboards-prebuilds"` | |
382 | | grafana.extraConfigmapMounts[5].readOnly | bool | `false` | |
383 | | grafana.fullnameOverride | string | `"grafana"` | |
384 | | grafana.image.tag | string | `"10.4.19"` | |
385 | | grafana.persistence.enabled | bool | `true` | |
386 | | grafana.persistence.size | string | `"10Gi"` | |
387 | | grafana.replicas | int | `1` | |
388 | | grafana.service.enabled | bool | `true` | |
389 | | grafana.sidecar.dashboards.enabled | bool | `false` | |
390 | | grafana.sidecar.dashboards.labelValue | string | `"1"` | |
391 | | grafana.sidecar.dashboards.provider.allowUiUpdates | bool | `true` | |
392 | | grafana.sidecar.dashboards.provider.disableDelete | bool | `true` | |
393 | | grafana.testFramework.enabled | bool | `false` | |
394 | | grafana.useStatefulSet | bool | `true` | |
395 | | loki.backend.extraArgs[0] | string | `"-log.level=debug"` | |
396 | | loki.backend.extraVolumeMounts[0].mountPath | string | `"/var/loki-ruler-wal"` | |
397 | | loki.backend.extraVolumeMounts[0].name | string | `"ruler-wal"` | |
398 | | loki.backend.extraVolumes[0].emptyDir | object | `{}` | |
399 | | loki.backend.extraVolumes[0].name | string | `"ruler-wal"` | |
400 | | loki.backend.podAnnotations."prometheus.io/scrape" | string | `"true"` | |
401 | | loki.backend.replicas | int | `1` | |
402 | | loki.chunksCache.allocatedMemory | int | `1024` | |
403 | | loki.enabled | bool | `true` | |
404 | | loki.enterprise.adminApi.enabled | bool | `false` | |
405 | | loki.enterprise.enabled | bool | `false` | |
406 | | loki.enterprise.useExternalLicense | bool | `false` | |
407 | | loki.fullnameOverride | string | `"loki"` | |
408 | | loki.gateway.replicas | int | `1` | |
409 | | loki.loki.auth_enabled | bool | `false` | |
410 | | loki.loki.commonConfig.path_prefix | string | `"/var/loki"` | |
411 | | loki.loki.commonConfig.replication_factor | int | `1` | |
412 | | loki.loki.rulerConfig.alertmanager_url | string | `"http://alertmanager.{{ .Release.Namespace }}.{{ .Values.global.zone}}"` | |
413 | | loki.loki.rulerConfig.enable_alertmanager_v2 | bool | `true` | |
414 | | loki.loki.rulerConfig.enable_api | bool | `true` | |
415 | | loki.loki.rulerConfig.remote_write.clients.fake.headers.Source | string | `"Loki"` | |
416 | | loki.loki.rulerConfig.remote_write.clients.fake.remote_timeout | string | `"30s"` | |
417 | | loki.loki.rulerConfig.remote_write.clients.fake.url | string | `"http://prometheus.{{ .Release.Namespace }}.{{ .Values.global.zone}}/api/v1/write"` | |
418 | | loki.loki.rulerConfig.remote_write.enabled | bool | `true` | |
419 | | loki.loki.rulerConfig.ring.kvstore.store | string | `"inmemory"` | |
420 | | loki.loki.rulerConfig.rule_path | string | `"/rules"` | |
421 | | loki.loki.rulerConfig.storage.local.directory | string | `"/rules"` | |
422 | | loki.loki.rulerConfig.storage.type | string | `"local"` | |
423 | | loki.loki.rulerConfig.wal.dir | string | `"/var/loki-ruler-wal"` | |
424 | | loki.loki.schemaConfig.configs[0].from | string | `"2024-04-01"` | |
425 | | loki.loki.schemaConfig.configs[0].index.period | string | `"24h"` | |
426 | | loki.loki.schemaConfig.configs[0].index.prefix | string | `"index_"` | |
427 | | loki.loki.schemaConfig.configs[0].object_store | string | `"s3"` | |
428 | | loki.loki.schemaConfig.configs[0].schema | string | `"v13"` | |
429 | | loki.loki.schemaConfig.configs[0].store | string | `"tsdb"` | |
430 | | loki.lokiCanary.annotations."prometheus.io/scrape" | string | `"true"` | |
431 | | loki.lokiCanary.enabled | bool | `true` | |
432 | | loki.minio.address | string | `"loki-storage.{{ .Release.Namespace }}.{{ .Values.global.zone}}:9000"` | |
433 | | loki.minio.enabled | bool | `true` | |
434 | | loki.minio.fullnameOverride | string | `"loki-storage"` | |
435 | | loki.minio.podAnnotations."prometheus.io/path" | string | `"/minio/v2/metrics/cluster"` | |
436 | | loki.minio.podAnnotations."prometheus.io/scrape" | string | `"true"` | |
437 | | loki.minio.podLabels."app.kubernetes.io/name" | string | `"loki-storage"` | |
438 | | loki.monitoring.dashboards.enabled | bool | `true` | |
439 | | loki.monitoring.selfMonitoring.enabled | bool | `false` | |
440 | | loki.monitoring.selfMonitoring.grafanaAgent.installOperator | bool | `false` | |
441 | | loki.nameOverride | string | `"loki"` | |
442 | | loki.read.podAnnotations."prometheus.io/scrape" | string | `"true"` | |
443 | | loki.read.replicas | int | `1` | |
444 | | loki.resultsCache.allocatedMemory | int | `1024` | |
445 | | loki.sidecar.rules.folder | string | `"/rules/fake"` | |
446 | | loki.sidecar.rules.logLevel | string | `"DEBUG"` | |
447 | | loki.test.canaryServiceAddress | string | `"http://loki-canary:3500/metrics"` | |
448 | | loki.test.enabled | bool | `true` | |
449 | | loki.write.extraArgs[0] | string | `"-log.level=debug"` | |
450 | | loki.write.podAnnotations."prometheus.io/scrape" | string | `"true"` | |
451 | | loki.write.replicas | int | `1` | |
452 | | prometheus.alertmanager.enabled | bool | `true` | |
453 | | prometheus.alertmanager.fullnameOverride | string | `"alertmanager"` | |
454 | | prometheus.alertmanager.podAnnotations."prometheus.io/scrape" | string | `"true"` | |
455 | | prometheus.alertmanager.service.port | int | `80` | |
456 | | prometheus.configmapReload.prometheus.containerPort | int | `9091` | |
457 | | prometheus.configmapReload.prometheus.extraArgs.log-level | string | `"all"` | |
458 | | prometheus.configmapReload.prometheus.extraArgs.watch-interval | string | `"15s"` | |
459 | | prometheus.configmapReload.prometheus.extraConfigmapMounts[0].configMap | string | `"metrics-alerts"` | |
460 | | prometheus.configmapReload.prometheus.extraConfigmapMounts[0].mountPath | string | `"/etc/config/alerts"` | |
461 | | prometheus.configmapReload.prometheus.extraConfigmapMounts[0].name | string | `"alerts"` | |
462 | | prometheus.configmapReload.prometheus.extraConfigmapMounts[0].readonly | bool | `true` | |
463 | | prometheus.enabled | bool | `true` | |
464 | | prometheus.kube-state-metrics.enabled | bool | `true` | |
465 | | prometheus.kube-state-metrics.fullnameOverride | string | `"kube-state-metrics"` | |
466 | | prometheus.kube-state-metrics.podAnnotations."prometheus.io/scrape" | string | `"true"` | |
467 | | prometheus.prometheus-node-exporter.enabled | bool | `true` | |
468 | | prometheus.prometheus-node-exporter.fullnameOverride | string | `"node-exporter"` | |
469 | | prometheus.prometheus-node-exporter.podAnnotations."prometheus.io/scrape" | string | `"true"` | |
470 | | prometheus.prometheus-pushgateway.enabled | bool | `false` | |
471 | | prometheus.server.extraArgs."log.level" | string | `"debug"` | |
472 | | prometheus.server.extraConfigmapMounts[0].configMap | string | `"metrics-alerts"` | |
473 | | prometheus.server.extraConfigmapMounts[0].mountPath | string | `"/etc/config/alerts"` | |
474 | | prometheus.server.extraConfigmapMounts[0].name | string | `"alerts"` | |
475 | | prometheus.server.extraConfigmapMounts[0].readonly | bool | `true` | |
476 | | prometheus.server.extraFlags[0] | string | `"web.enable-lifecycle"` | |
477 | | prometheus.server.extraFlags[1] | string | `"enable-feature=remote-write-receiver"` | |
478 | | prometheus.server.fullnameOverride | string | `"prometheus"` | |
479 | | prometheus.server.global.evaluation_interval | string | `"30s"` | |
480 | | prometheus.server.persistentVolume.enabled | bool | `true` | |
481 | | prometheus.server.persistentVolume.size | string | `"12Gi"` | |
482 | | prometheus.server.podAnnotations."prometheus.io/scrape" | string | `"true"` | |
483 | | prometheus.server.replicaCount | int | `1` | |
484 | | prometheus.server.retentionSize | string | `"10GB"` | |
485 | | prometheus.server.service.type | string | `"ClusterIP"` | |
486 | | prometheus.server.statefulSet.enabled | bool | `true` | |
487 | | prometheus.serverFiles."prometheus.yml".rule_files[0] | string | `"/etc/config/alerts/*.yaml"` | |
488 | | prometheus.serverFiles."prometheus.yml".scrape_configs | list | `[]` | |
489 | | prometheus.testFramework.enabled | bool | `false` | |
490 | | runbookViewer.image | string | `"dannyben/madness"` | |
491 | | sqlExporter.image | string | `"burningalchemist/sql_exporter"` | |
492 |
493 |
--------------------------------------------------------------------------------
/artifacthub-repo.yaml:
--------------------------------------------------------------------------------
1 | # This file is uploaded to GCS at helm.coder.com/observability/artifacthub-repo.yml
2 | # and used by ArtifactHub to verify the repository.
3 | repositoryID: 167a0393-cb7e-4f42-af79-02f8a91915f5
4 | owners:
5 | - name: colin
6 | email: colin@coder.com
7 | - name: Danny Kopping
8 | email: danny@coder.com
--------------------------------------------------------------------------------
/coder-observability/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 |
--------------------------------------------------------------------------------
/coder-observability/Chart.lock:
--------------------------------------------------------------------------------
1 | dependencies:
2 | - name: grafana
3 | repository: https://grafana.github.io/helm-charts
4 | version: 7.3.12
5 | - name: prometheus
6 | repository: https://prometheus-community.github.io/helm-charts
7 | version: 25.24.2
8 | - name: loki
9 | repository: https://grafana.github.io/helm-charts
10 | version: 6.7.4
11 | - name: grafana-agent
12 | repository: https://grafana.github.io/helm-charts
13 | version: 0.37.0
14 | digest: sha256:05e0dae0200cabf5cb9e2cfb18a4e166fcaceefaf39827addff4299b18c31d4e
15 | generated: "2025-01-16T07:54:38.036598102Z"
16 |
--------------------------------------------------------------------------------
/coder-observability/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: coder-observability
3 | description: Gain insights into your Coder deployment
4 |
5 | type: application
6 | version: 0.1.0
7 | dependencies:
8 | - name: grafana
9 | condition: grafana.enabled
10 | repository: https://grafana.github.io/helm-charts
11 | version: '~v7.3.7'
12 | - name: prometheus
13 | condition: prometheus.enabled
14 | repository: https://prometheus-community.github.io/helm-charts
15 | version: '~v25.24.1'
16 | - name: loki
17 | condition: loki.enabled
18 | repository: https://grafana.github.io/helm-charts
19 | version: '~v6.7.3'
20 | - name: grafana-agent
21 | alias: grafana-agent
22 | condition: grafana-agent.enabled
23 | repository: https://grafana.github.io/helm-charts
24 | version: '~0.37.0'
25 | maintainers:
26 | - name: Coder Technologies, Inc.
27 | url: https://github.com/coder/observability/issues
28 | keywords:
29 | - observability
30 | - coder
31 | - coder.com
32 | - cloud development environment
33 | - cde
34 | sources:
35 | - https://github.com/coder/observability
36 | icon: https://helm.coder.com/coder_logo_black.png
37 | annotations:
38 | artifacthub.io/category: monitoring-logging
--------------------------------------------------------------------------------
/coder-observability/runbooks/coderd.md:
--------------------------------------------------------------------------------
1 | # Coderd Runbooks
2 |
3 | ## CoderdCPUUsage
4 |
5 | The CPU usage of one or more Coder pods has been close to the limit defined for
6 | the deployment. This can cause slowness in the application, workspaces becoming
7 | unavailable, and may lead to the application failing its liveness probes and
8 | being restarted.
9 |
10 | To resolve this issue, increase the CPU limits of the Coder deployment.
11 |
12 | If you find this occurring frequently, you may wish to check your Coder
13 | deployment against [Coder's Reference Architectures](https://coder.com/docs/v2/latest/admin/architectures).
14 |
15 | ## CoderdMemoryUsage
16 |
17 | The memory usage of one or more Coder pods has been close to the limit defined
18 | for the deployment. When the memory usage exceeds the limit, the pod(s) will be
19 | restarted by Kubernetes. This will interrupt all connections to workspaces being
20 | handled by the affected pod(s).
21 |
22 | To resolve this issue, increase the memory limits of the Coder deployment.
23 |
24 | If you find this occurring frequently, check the memory usage over a longer
25 | period of time. If it appears to be increasing monotonically, this is likely a
26 | memory leak and should be considered a bug.
27 |
28 | ## CoderdRestarts
29 |
30 | One or more Coder pods have been restarting multiple times in the last 10
31 | minutes. This may be due to a number of issues, including:
32 |
33 | - Failure to connect to the configured database: Coder requires a reachable
34 | PostgreSQL database to function. If it fails to connect, you will see an error
35 | similar to the following:
36 |
37 | ```console
38 | [warn] ping postgres: retrying error="dial tcp 10.43.94.60:5432: connect: connection refused" try=3
39 | ```
40 |
41 | - Out-Of-Memory (OOM) kills due to memory usage (see [above](#coderdmemoryusage)),
42 | - An unexpected bug causing the application to exit with an error.
43 |
44 | If Coder is not restarting due to excessive memory usage, check the logs:
45 |
46 | 1. Check the logs of the deployment for any errors,
47 |
48 | ```console
49 | kubectl -n <coder-namespace> logs deployment/coder --previous
50 | ```
51 |
52 | 2. Check any Kubernetes events related to the deployment,
53 |
54 | ```console
55 | kubectl -n <coder-namespace> events --watch
56 | ```
57 |
58 | ## CoderdReplicas
59 |
60 | One or more Coderd replicas are down. This may cause availability problems and elevated
61 | response times for user and agent API calls.
62 |
63 | To resolve this issue, review the Coder deployment for possible `CrashLoopBackOff`
64 | instances or re-adjust alarm levels based on the actual number of replicas.
65 |
66 | ## CoderdWorkspaceBuildFailures
67 |
68 | A few workspace build errors have been recently observed.
69 |
70 | Review Prometheus metrics to identify failed jobs. Check the workspace build logs
71 | to determine if there is a relationship with a new template version or a buggy
72 | Terraform plugin.
73 |
74 | ## CoderdLicenseSeats
75 |
76 | Your Enterprise license is approaching or has exceeded the number of seats purchased.
77 |
78 | Please contact your Coder sales contact, or visit https://coder.com/contact/sales.
79 |
80 | ## CoderdIneligiblePrebuilds
81 |
82 | Prebuilds only become eligible to be claimed by users once the workspace's agent is a) running and b) all of its startup
83 | scripts have completed.
84 |
85 | If a prebuilt workspace is not eligible, view its agent logs to diagnose the problem.
86 |
87 | ## CoderdUnprovisionedPrebuiltWorkspaces
88 |
89 | The number of running prebuilt workspaces is lower than the desired instances. This could be for several reasons,
90 | ordered by likelihood:
91 |
92 | ### Experiment/License
93 |
94 | The prebuilds feature is currently gated behind an experiment *and* a premium license.
95 |
96 | Ensure that the prebuilds experiment is enabled with `CODER_EXPERIMENTS=workspace-prebuilds`, and that you have a premium
97 | license added.
98 |
99 | ### Preset Validation Issue
100 |
101 | Templates which have prebuilds configured require a preset to be defined, with ALL of the required parameters
102 | set in the preset. If any of these are missing, or any of the parameters - as defined - fail validation, then the prebuilds
103 | subsystem will refuse to attempt a workspace build.
104 |
105 | Consult the coderd logs for more information; look out for errors or warnings from the prebuilds subsystem.
106 |
107 | ### Template Misconfiguration or Error
108 |
109 | Prebuilt workspaces cannot be provisioned due to some issue at `terraform apply`-time. This could be due to misconfigured
110 | cloud resources, improper authorization, or any number of other issues.
111 |
112 | Visit the Workspaces page, change the search term to `owner:prebuilds`, and view the previously failed builds. The
113 | error will likely be quite obvious.
114 |
115 | ### Provisioner Latency
116 |
117 | If your provisioners are overloaded and cannot process provisioner jobs quickly enough, prebuilt workspaces may be affected.
118 | There is no prioritization at present for prebuilt workspace jobs.
119 |
120 | Ensure your provisioners are appropriately resourced (i.e. you have enough instances) to handle the concurrent build demand.
121 |
122 | ### Use of Workspace Tags
123 |
124 | If you are using `coder_workspace_tags` ([docs](https://coder.com/docs/admin/templates/extending-templates/workspace-tags))
125 | in your template, chances are you do not have any provisioners running or they are under-resourced (see **Provisioner Latency**).
126 |
127 | Ensure your running provisioners are configured with your desired tags.
128 |
129 | ### Reconciliation Loop Issue
130 |
131 | The prebuilds subsystem runs a _reconciliation loop_ which monitors the state of prebuilt workspaces to ensure the desired
132 | number of instances are present at all times. Workspace Prebuilds is currently a BETA feature and so there could be a bug
133 | in this _reconciliation loop_, which should be reported to Coder.
134 |
135 | Examine your coderd logs for any errors or warnings relating to prebuilds.
--------------------------------------------------------------------------------
/coder-observability/runbooks/postgres.md:
--------------------------------------------------------------------------------
1 | # Postgres Runbooks
2 |
3 | ## PostgresNotificationQueueFillingUp
4 |
5 | Postgres offers asynchronous notification via the `LISTEN` and `NOTIFY`
6 | commands. Coder depends heavily on this async notification mechanism for routine
7 | functionality.
8 |
9 | The notification queue may fill up when a session executes `LISTEN` and then enters a long
10 | transaction, which prevents the queue from being cleaned up. To verify:
11 |
12 | - Check active sessions with `SELECT * FROM pg_stat_activity;`,
13 | - Check the database log for the PID of the session that is preventing cleanup,
14 | - Terminate that session: `SELECT pg_terminate_backend(<pid>);`
15 |
16 | For more information, see the PostgreSQL documentation available here:
17 |
18 | - [PostgreSQL documentation on `LISTEN`](https://www.postgresql.org/docs/current/sql-listen.html)
19 | - [PostgreSQL documentation on `NOTIFY`](https://www.postgresql.org/docs/current/sql-notify.html)
20 |
21 | ## PostgresDown
22 |
23 | Postgres is not currently running, which means the Coder control plane will not be able to read or write any state.
24 | Workspaces may continue to work normally but it is recommended to get Postgres back up as quickly as possible.
25 |
26 | ## PostgresConnectionsRunningLow
27 |
28 | PostgreSQL has a `max_connections` setting that determines the maximum number of
29 | concurrent connections. Once this connection limit is reached, no new
30 | connections will be possible.
31 |
32 | To increase the maximum number of concurrent connections, update the `max_connections`
33 | configuration option for your PostgreSQL instance. See the PostgreSQL
34 | documentation for more details.
35 |
36 | **Note:** You may also need to adjust `shared_buffers` after increasing
37 | `max_connections`. Additionally, you may also need to adjust the kernel
38 | configuration value `kernel.shmmax` in `/etc/sysctl.conf` /
39 | `/etc/sysctl.conf.d`.
40 |
41 | For more information, see:
42 |
43 | - [PostgreSQL Documentation: Server Configuration](https://www.postgresql.org/docs/16/runtime-config-file-locations.html)
44 | - [Tuning your PostgreSQL Server](https://wiki.postgresql.org/wiki/Tuning_Your_PostgreSQL_Server)
45 |
--------------------------------------------------------------------------------
/coder-observability/runbooks/provisionerd.md:
--------------------------------------------------------------------------------
1 | # Provisionerd Runbooks
2 |
3 | ## ProvisionerdReplicas
4 |
5 | One or more provisioner replicas are down. Workspace builds may be queued and processed more slowly.
6 |
7 | To resolve this issue, review the Coder deployment (Coder provisioner pods)
8 | for possible `CrashLoopBackOff` instances or re-adjust alarm levels based on the actual
9 | number of replicas.
10 |
--------------------------------------------------------------------------------
/coder-observability/templates/_collector-config.tpl:
--------------------------------------------------------------------------------
1 | {{- define "collector-config" -}}
2 | {{ $agent := (index .Values "grafana-agent") }}
3 |
4 | {{ $agent.logging }}
5 | {{ $agent.discovery }}
6 |
7 | discovery.relabel "pod_logs" {
8 | targets = discovery.kubernetes.pods.targets
9 | {{ $agent.commonRelabellings | nindent 2 }}
10 | rule {
11 | source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
12 | separator = "/"
13 | action = "replace"
14 | replacement = "/var/log/pods/*$1/*.log" // glob over the pod's log files, built from the pod UID and container name joined by "/"
15 | target_label = "__path__"
16 | }
17 | rule {
18 | action = "replace"
19 | source_labels = ["__meta_kubernetes_pod_container_id"]
20 | regex = "^(\\w+):\\/\\/.+$" // capture the runtime prefix of the container ID (the part before "://")
21 | replacement = "$1"
22 | target_label = "tmp_container_runtime" // temporary label; loki.process "pod_logs" matches on it and then drops it
23 | }
24 | {{- if $agent.podLogsRelabelRules -}}
25 | {{ $agent.podLogsRelabelRules | trim | nindent 2 }}
26 | {{- end }}
27 | }
28 |
29 | discovery.relabel "pod_metrics" {
30 | targets = discovery.kubernetes.pods.targets
31 | {{ $agent.commonRelabellings | nindent 6 }}
32 | // drop ports that do not expose Prometheus metrics, but might otherwise be exposed by a container which *also*
33 | // exposes an HTTP port which exposes metrics
34 | rule {
35 | source_labels = ["__meta_kubernetes_pod_container_port_name"]
36 | regex = "grpc|http-(memberlist|console)"
37 | action = "drop"
38 | }
39 | // adapted from the Prometheus helm chart
40 | // https://github.com/prometheus-community/helm-charts/blob/862870fc3c847e32479b509e511584d5283126a3/charts/prometheus/values.yaml#L1070
41 | rule {
42 | source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_scrape"]
43 | action = "keep"
44 | regex = "true" // only keep targets whose scrape annotation label is exactly "true"
45 | }
46 | rule {
47 | source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_scheme"]
48 | action = "replace"
49 | regex = "(https?)" // only an http/https annotation value overrides the scheme
50 | target_label = "__scheme__"
51 | }
52 | rule {
53 | source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_path"]
54 | action = "replace"
55 | target_label = "__metrics_path__"
56 | regex = "(.+)" // ignore an empty path annotation
57 | }
58 | rule {
59 | source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_port", "__meta_kubernetes_pod_ip"]
60 | action = "replace"
61 | regex = "(\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})"
62 | replacement = "[$2]:$1" // hex-and-colon (IPv6) pod IP: bracket the IP before appending the annotated port
63 | target_label = "__address__"
64 | }
65 | rule {
66 | source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_port", "__meta_kubernetes_pod_ip"]
67 | action = "replace"
68 | regex = "(\\d+);((([0-9]+?)(\\.|$)){4})"
69 | replacement = "$2:$1" // dotted-decimal (IPv4) pod IP: plain ip:port
70 | target_label = "__address__"
71 | }
72 | {{- if $agent.podMetricsRelabelRules -}}
73 | {{ $agent.podMetricsRelabelRules | trim | nindent 2 }}
74 | {{- end }}
75 | }
76 |
77 | local.file_match "pod_logs" {
78 | path_targets = discovery.relabel.pod_logs.output
79 | }
80 |
81 | loki.source.file "pod_logs" {
82 | targets = local.file_match.pod_logs.targets
83 | forward_to = [loki.process.pod_logs.receiver]
84 | }
85 |
86 | loki.process "pod_logs" {
87 |   stage.match {
88 |     selector = "{tmp_container_runtime=\"containerd\"}"
89 |     // containerd logs: the cri processing stage extracts the following k/v pairs: log, stream, time, flags
90 |     stage.cri {}
91 |     // Set the extracted flags and stream values as labels
92 |     stage.labels {
93 |       values = {
94 |         flags = "",
95 |         stream = "",
96 |       }
97 |     }
98 |   }
99 |
100 |   // if the label tmp_container_runtime from above is docker, parse using the docker stage
101 |   stage.match {
102 |     selector = "{tmp_container_runtime=\"docker\"}"
103 |     // the docker processing stage extracts the following k/v pairs: log, stream, time
104 |     stage.docker {}
105 |
106 |     // Set the extracted stream value as a label
107 |     stage.labels {
108 |       values = {
109 |         stream = "",
110 |       }
111 |     }
112 |   }
113 |
114 |   // drop the temporary container runtime label as it is no longer needed
115 |   stage.label_drop {
116 |     values = ["tmp_container_runtime"]
117 |   }
118 |
119 |   // parse Coder logs and extract level & logger for efficient filtering; the named groups (ts, level, logger) are read by the stages below
120 |   stage.match {
121 |     selector = "{pod=~\"coder.*\"}" // TODO: make configurable
122 |
123 |     stage.multiline {
124 |       firstline = {{ printf `^(?P<ts>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}\.\d{3})` | quote }}
125 |       max_wait_time = "10s"
126 |     }
127 |
128 |     stage.regex {
129 |       expression = {{ printf `^(?P<ts>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}\.\d{3})\s\[(?P<level>\w+)\]\s\s(?P<logger>[^:]+):\s(?P<line>.+)` | quote }}
130 |     }
131 |
132 |     stage.timestamp {
133 |       source = "ts"
134 |       format = "2006-01-02 15:04:05.000"
135 |       action_on_failure = "fudge" // rather have inaccurate time than drop the log line
136 |     }
137 |
138 |     stage.labels {
139 |       values = {
140 |         level = "",
141 |         logger = "",
142 |       }
143 |     }
144 |   }
145 |
146 |   forward_to = [loki.write.loki.receiver]
147 | }
148 | {{ if $agent.extraBlocks -}}
149 | {{ $agent.extraBlocks }}
150 | {{- end }}
151 | loki.write "loki" {
152 | endpoint {
153 | url = "http://{{ include "loki.fullname" .Subcharts.loki }}-gateway.{{ .Release.Namespace }}.{{ .Values.global.zone }}/loki/api/v1/push"
154 | }
155 | }
156 |
157 | prometheus.scrape "pods" {
158 | targets = discovery.relabel.pod_metrics.output
159 | forward_to = [prometheus.relabel.pods.receiver]
160 |
161 | scrape_interval = "{{ .Values.global.telemetry.metrics.scrape_interval }}"
162 | scrape_timeout = "{{ .Values.global.telemetry.metrics.scrape_timeout }}"
163 | }
164 |
165 | // These are metric_relabel_configs while discovery.relabel are relabel_configs.
166 | // See https://github.com/grafana/agent/blob/main/internal/converter/internal/prometheusconvert/prometheusconvert.go#L95-L106
167 | prometheus.relabel "pods" {
168 | forward_to = [prometheus.remote_write.default.receiver]
169 |
170 | // Drop kube-state-metrics' labels which clash with ours
171 | rule {
172 | source_labels = ["__name__", "container"]
173 | regex = "kube_pod.+;(.+)"
174 | target_label = "container"
175 | replacement = ""
176 | }
177 | rule {
178 | source_labels = ["__name__", "pod"]
179 | regex = "kube_pod.+;(.+)"
180 | target_label = "pod"
181 | replacement = ""
182 | }
183 | rule {
184 | source_labels = ["__name__", "namespace"]
185 | regex = "kube_pod.+;(.+)"
186 | target_label = "namespace"
187 | replacement = ""
188 | }
189 | rule {
190 | source_labels = ["__name__", "exported_container"]
191 | // don't replace an empty label
192 | regex = "^kube_pod.+;(.+)$"
193 | target_label = "container"
194 | replacement = "$1"
195 | }
196 | rule {
197 | source_labels = ["__name__", "exported_pod"]
198 | // don't replace an empty label
199 | regex = "^kube_pod.+;(.+)$"
200 | target_label = "pod"
201 | replacement = "$1"
202 | }
203 | rule {
204 | source_labels = ["__name__", "exported_namespace"]
205 | // don't replace an empty label
206 | regex = "^kube_pod.+;(.+)$"
207 | target_label = "namespace"
208 | replacement = "$1"
209 | }
210 | rule {
211 | regex = "^(exported_.*|image_.*|container_id|id|uid)$"
212 | action = "labeldrop"
213 | }
214 | }
215 |
216 | discovery.relabel "cadvisor" {
217 | targets = discovery.kubernetes.nodes.targets
218 | rule {
219 | replacement = "/metrics/cadvisor"
220 | target_label = "__metrics_path__"
221 | }
222 | }
223 |
224 | prometheus.scrape "cadvisor" {
225 | targets = discovery.relabel.cadvisor.output
226 | forward_to = [ prometheus.relabel.cadvisor.receiver ]
227 | scheme = "https"
228 | tls_config {
229 | insecure_skip_verify = true // NOTE(review): TLS certificate verification is disabled for this scrape; confirm this is acceptable
230 | }
231 | bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" // authenticate with the token file at this path
232 | scrape_interval = "{{ .Values.global.telemetry.metrics.scrape_interval }}"
233 | scrape_timeout = "{{ .Values.global.telemetry.metrics.scrape_timeout }}"
234 | }
235 |
236 | prometheus.relabel "cadvisor" {
237 | forward_to = [ prometheus.remote_write.default.receiver ]
238 |
239 | // Drop empty container labels, addressing https://github.com/google/cadvisor/issues/2688
240 | rule {
241 | source_labels = ["__name__","container"]
242 | separator = "@"
243 | regex = "(container_cpu_.*|container_fs_.*|container_memory_.*)@"
244 | action = "drop"
245 | }
246 | // Drop empty image labels, addressing https://github.com/google/cadvisor/issues/2688
247 | rule {
248 | source_labels = ["__name__","image"]
249 | separator = "@"
250 | regex = "(container_cpu_.*|container_fs_.*|container_memory_.*|container_network_.*)@"
251 | action = "drop"
252 | }
253 | // Drop irrelevant series
254 | rule {
255 | source_labels = ["container"]
256 | regex = "^POD$"
257 | action = "drop"
258 | }
259 | // Drop unnecessary labels
260 | rule {
261 | source_labels = ["id"]
262 | target_label = "id"
263 | replacement = ""
264 | }
265 | rule {
266 | source_labels = ["job"]
267 | target_label = "job"
268 | replacement = ""
269 | }
270 | rule {
271 | source_labels = ["name"]
272 | target_label = "name"
273 | replacement = ""
274 | }
275 | }
276 |
277 | prometheus.remote_write "default" {
278 | endpoint {
279 | url ="http://{{ include "prometheus.server.fullname" .Subcharts.prometheus }}.{{ .Release.Namespace }}.{{ .Values.global.zone }}/api/v1/write"
280 |
281 | // drop instance label which unnecessarily adds new series when pods are restarted, since pod IPs are dynamically assigned
282 | // NOTE: "__address__" is mapped to "instance", so will contain <ip>:<port>
283 | write_relabel_config {
284 | regex = "instance"
285 | action = "labeldrop"
286 | }
287 | }
288 | }
289 |
290 | {{- if $agent.withOTLPReceiver }}{{/* no right-trim here: the first component must start on its own line, not directly after the previous "}" */}}
291 | otelcol.receiver.otlp "otlp_receiver" {
292 |   grpc {
293 |     endpoint = "0.0.0.0:4317"
294 |   }
295 |   http {
296 |     endpoint = "0.0.0.0:4318"
297 |   }
298 |   output {
299 |     metrics = [otelcol.processor.batch.default.input]
300 |     logs = [otelcol.processor.batch.default.input]
301 |   }
302 | }
303 | otelcol.exporter.prometheus "to_prometheus" {
304 |   forward_to = [
305 |     prometheus.remote_write.default.receiver,
306 |   ]
307 | }
308 | otelcol.exporter.loki "to_loki" {
309 |   forward_to = [
310 |     loki.write.loki.receiver,
311 |   ]
312 | }
313 | otelcol.processor.batch "default" {
314 |   output {
315 |     metrics = [otelcol.exporter.prometheus.to_prometheus.input]
316 |     logs = [otelcol.exporter.loki.to_loki.input]
317 |   }
318 | }
319 | {{- end -}}
320 |
321 | {{ with .Values.global.coder.scrapeMetrics }}
322 | prometheus.scrape "coder_metrics" {
323 | targets = [
324 | {"__address__" = "{{ .hostname }}:{{ .port }}", {{ include "collector-labels" .additionalLabels | trimSuffix "," }}},
325 | ]
326 |
327 | forward_to = [prometheus.remote_write.default.receiver]
328 | scrape_interval = "{{ .scrapeInterval }}"
329 | }
330 | {{- end }}
331 | {{- end }}
332 |
333 | {{- define "collector-labels" -}}{{- /* Renders each entry of the given map as `key = "value",`; the caller strips the final trailing comma. */ -}}
334 | {{- range $key, $val := . -}}
335 | {{ $key }} = "{{ $val }}",
336 | {{- end -}}
337 | {{ end }}
--------------------------------------------------------------------------------
/coder-observability/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/*
2 | Expand the name of the chart.
3 | */}}
4 | {{- define "coder-observability.name" -}}
5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
6 | {{- end }}
7 |
8 | {{/*
9 | Create a default fully qualified app name.
10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
11 | If release name contains chart name it will be used as a full name.
12 | */}}
13 | {{- define "coder-observability.fullname" -}}
14 | {{- if .Values.fullnameOverride }}
15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16 | {{- else }}
17 | {{- $name := default .Chart.Name .Values.nameOverride }}
18 | {{- if contains $name .Release.Name }}
19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }}
20 | {{- else }}
21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22 | {{- end }}
23 | {{- end }}
24 | {{- end }}
25 |
26 | {{/*
27 | Create chart name and version as used by the chart label.
28 | */}}
29 | {{- define "coder-observability.chart" -}}
30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
31 | {{- end }}
32 |
33 | {{/*
34 | Common labels
35 | */}}
36 | {{- define "coder-observability.labels" -}}
37 | helm.sh/chart: {{ include "coder-observability.chart" . }}
38 | {{ include "coder-observability.selectorLabels" . }}
39 | {{- if .Chart.AppVersion }}
40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
41 | {{- end }}
42 | app.kubernetes.io/managed-by: {{ .Release.Service }}
43 | {{- end }}
44 |
45 | {{/*
46 | Selector labels
47 | */}}
48 | {{- define "coder-observability.selectorLabels" -}}
49 | app.kubernetes.io/name: {{ include "coder-observability.name" . }}
50 | app.kubernetes.io/instance: {{ .Release.Name }}
51 | {{- end }}
52 |
53 | {{/*
54 | Create the name of the service account to use
55 | */}}
56 | {{- define "coder-observability.serviceAccountName" -}}
57 | {{- if .Values.serviceAccount.create }}
58 | {{- default (include "coder-observability.fullname" .) .Values.serviceAccount.name }}
59 | {{- else }}
60 | {{- default "default" .Values.serviceAccount.name }}
61 | {{- end }}
62 | {{- end }}
63 |
64 | {{/* Postgres connector string: postgresql://user[:password]@host:port/database?sslmode=...[&sslrootcert=...] */}}
65 | {{- define "postgres-connector-string" -}}
66 | {{- $pg := .Values.global.postgres -}}
67 | {{- if not (or $pg.password $pg.mountSecret) -}}
68 | {{ fail "either postgres.password or postgres.mountSecret must be defined" }}
69 | {{- end -}}
70 | {{- /* the password is embedded only when set inline; with mountSecret alone it is left out of the URL */ -}}
71 | postgresql://{{ $pg.username }}
72 | {{- if $pg.password }}:{{ urlquery $pg.password }}{{ end -}}
73 | @{{ $pg.hostname }}:{{ $pg.port }}/{{ $pg.database }}?sslmode={{ $pg.sslmode }}
74 | {{- if ne $pg.sslmode "disable" -}}
75 | &sslrootcert={{ $pg.sslrootcert }}
76 | {{- end -}}
77 | {{- end }}
78 |
79 | {{/* Postgres secret mount: an envFrom entry referencing the secret named by global.postgres.mountSecret; no entry is rendered when it is unset */}}
80 | {{- define "postgres-secret-mount" -}}
81 | {{ if .Values.global.postgres.mountSecret }}
82 | envFrom:
83 | - secretRef:
84 | name: {{ .Values.global.postgres.mountSecret }}
85 | {{ end }}
86 | {{- end }}
87 |
88 | {{/* Postgres Exporter does not export a pubsub usage metric by default, so we add one */}}
89 | {{- define "postgres-pubsub-queue-usage-metric-name" -}}pg_pubsub_usage{{- end }}
90 |
91 | {{/* Build a runbook URL */}}
92 | {{- define "runbook-url" -}}
93 | {{ $outer := . }}
94 | {{- with .Values.global -}}
95 | {{- .externalScheme }}://runbook-viewer.{{ $outer.Release.Namespace }}.{{ .externalZone }}/{{- $outer.service }}#{{- $outer.alert | lower }}
96 | {{- end }}
97 | {{- end }}
98 |
99 | {{- define "coderd-selector" -}} {{- printf "%s, namespace=`%s`" .Values.global.coder.coderdSelector .Values.global.coder.controlPlaneNamespace -}} {{- end }}
100 | {{- define "provisionerd-selector" -}} {{- printf "%s, namespace=`%s`" .Values.global.coder.provisionerdSelector .Values.global.coder.externalProvisionersNamespace -}} {{- end }}
101 | {{- define "workspaces-selector" -}} {{- .Values.global.coder.workspacesSelector -}} {{- end }}
102 | {{- define "non-workspace-selector" -}} {{- printf "namespace=~`(%s|%s)`" (include "control-plane-namespace" .) (include "external-provisioners-namespace" .) -}} {{- end }}
103 | {{- define "control-plane-namespace" -}} {{- .Values.global.coder.controlPlaneNamespace -}} {{- end }}
104 | {{- define "external-provisioners-namespace" -}} {{- .Values.global.coder.externalProvisionersNamespace -}} {{- end }}
105 |
106 | {{/* The collector creates "job" labels in the form <namespace>/<name>/<component> */}}
107 |
108 | {{/* Prometheus job label */}}
109 | {{- define "prometheus-job" -}} {{- printf "%s/%s/%s" .Release.Namespace .Values.prometheus.server.fullnameOverride .Values.prometheus.server.name -}} {{- end }}
110 | {{/* Loki job label */}}
111 | {{- define "loki-job" -}} {{- printf "%s/%s" .Release.Namespace .Values.loki.fullnameOverride -}} {{- end }}
112 | {{/* Grafana Agent job label */}}
113 | {{- define "grafana-agent-job" -}} {{- printf "%s/%s/%s" .Release.Namespace (index .Values "grafana-agent").fullnameOverride "grafana-agent" -}} {{- end }}
114 |
115 | {{- define "dashboard-range" -}} {{ .Values.global.dashboards.timerange }} {{- end }}
116 | {{- define "dashboard-refresh" -}} {{ .Values.global.dashboards.refresh }} {{- end }}
--------------------------------------------------------------------------------
/coder-observability/templates/configmap-collector.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | kind: ConfigMap
3 | apiVersion: v1
4 | metadata:
5 | name: {{ (index .Values "grafana-agent").agent.configMap.name }}
6 | namespace: {{ .Release.Namespace }}
7 | data:
8 | config.river: |- {{- include "collector-config" . | trim | nindent 4 }}
--------------------------------------------------------------------------------
/coder-observability/templates/configmap-prometheus-alerts.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: metrics-alerts
5 | namespace: {{ .Release.Namespace }}
6 | data:
7 | {{- $service := dict "service" "coderd" -}}
8 |
9 | {{- with .Values.global.coder.alerts.coderd }} {{/* start-section */}}
10 | coderd.yaml: |-
11 | groups:
12 | {{- with .groups.CPU }}
13 | {{- $group := . }}
14 | {{- if .enabled }}
15 | - name: CPU Usage
16 | rules:
17 | {{ $alert := "CoderdCPUUsage" }}
18 | {{- range $severity, $threshold := .thresholds }}
19 | - alert: {{ $alert }}
20 | expr: max by (pod) (rate(container_cpu_usage_seconds_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) / max by(pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="cpu"}) > {{ $threshold }}
21 | for: {{ $group.delay }}
22 | annotations:
23 | summary: The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of CPU, which may impact application performance.
24 | labels:
25 | severity: {{ $severity }}
26 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
27 | {{- end }}
28 | {{- end }}
29 | {{- end }}
30 |
31 | {{- with .groups.Memory }}
32 | {{- $group := . }}
33 | {{- if .enabled }}
34 | - name: Memory Usage
35 | rules:
36 | {{ $alert := "CoderdMemoryUsage" }}
37 | {{- range $severity, $threshold := .thresholds }}
38 | - alert: {{ $alert }}
39 | expr: max by (pod) (container_memory_working_set_bytes{ {{- include "coderd-selector" $ -}} }) / max by (pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="memory"}) > {{ $threshold }}
40 | for: {{ $group.delay }}
41 | annotations:
42 | summary: The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.
43 | labels:
44 | severity: {{ $severity }}
45 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
46 | {{- end }}
47 | {{- end }}
48 | {{- end }}
49 |
50 | {{- with .groups.Restarts }}
51 | {{- $group := . }}
52 | {{- if .enabled }}
53 | - name: Pod Restarts
54 | rules:
55 | {{ $alert := "CoderdRestarts" }}
56 | {{- range $severity, $threshold := .thresholds }}
57 | - alert: {{ $alert }}
58 | expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) > {{ $threshold }}
59 | for: {{ $group.delay }}
60 | annotations:
61 | summary: The Coder instance {{ `{{ $labels.pod }}` }} has restarted multiple times in the last {{ $group.period -}}, which may indicate a CrashLoop.
62 | labels:
63 | severity: {{ $severity }}
64 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
65 | {{- end }}
66 | {{- end }}
67 | {{- end }}
68 |
69 | {{- with .groups.Replicas }}
70 | {{- $group := . }}
71 | {{- if .enabled }}
72 | - name: Coderd Replicas
73 | rules:
74 | {{ $alert := "CoderdReplicas" }}
75 | {{- range $severity, $threshold := .thresholds }}
76 | - alert: {{ $alert }}
77 | expr: sum(up{ {{- include "coderd-selector" $ -}} }) < {{ $threshold }}
78 | for: {{ $group.delay }}
79 | annotations:
80 | summary: Number of alive coderd replicas is below the threshold = {{ $threshold -}}.
81 | labels:
82 | severity: {{ $severity }}
83 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
84 | {{- end }}
85 | {{- end }}
86 | {{- end }}
87 |
88 | {{- with .groups.WorkspaceBuildFailures }}
89 | {{- $group := . }}
90 | {{- if .enabled }}
91 | - name: Coderd Workspace Build Failures
92 | rules:
93 | {{ $alert := "CoderdWorkspaceBuildFailures" }}
94 | {{- range $severity, $threshold := .thresholds }}
95 | - alert: {{ $alert }}
96 | expr: sum(increase(coderd_workspace_builds_total{ {{- include "coderd-selector" $ -}} , status="failed" }[{{- $group.period -}}])) > {{ $threshold }}
97 | for: {{ $group.delay }}
98 | annotations:
99 | summary: Workspace builds have failed multiple times in the last {{ $group.period -}}, which may indicate a broken Coder template.
100 | labels:
101 | severity: {{ $severity }}
102 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
103 | {{- end }}
104 | {{- end }}
105 | {{- end }}
106 |
107 | {{- with .groups.IneligiblePrebuilds }}
108 | {{- $group := . }}
109 | {{- if .enabled }}
110 | - name: Coderd Ineligible Prebuilds
111 | rules:
112 | {{ $alert := "CoderdIneligiblePrebuilds" }}
113 | {{- range $severity, $threshold := .thresholds }}
114 | - alert: {{ $alert }}
115 | expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_running - coderd_prebuilt_workspaces_eligible) > 0
116 | for: {{ $group.delay }}
117 | annotations:
118 | summary: >
119 | {{ `{{ $value }}` }} prebuilt workspace(s) are currently ineligible for claiming for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset.
120 | This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time.
121 | labels:
122 | severity: {{ $severity }}
123 | runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
124 | {{- end }}
125 | {{- end }}
126 | {{- end }}
127 |
128 | {{- with .groups.UnprovisionedPrebuiltWorkspaces }}{{/* fires while fewer prebuilt workspaces are running than desired, per template and preset */}}
129 | {{- $group := . }}
130 | {{- if .enabled }}
131 | - name: Coderd Unprovisioned Prebuilt Workspaces
132 |   rules:
133 |   {{ $alert := "CoderdUnprovisionedPrebuiltWorkspaces" }}
134 |   {{- range $severity, $threshold := .thresholds }}
135 |   - alert: {{ $alert }}
136 |     expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_desired - coderd_prebuilt_workspaces_running) > 0
137 |     for: {{ $group.delay }}
138 |     annotations:
139 |       summary: >
140 |         {{ `{{ $value }}` }} prebuilt workspace(s) have not yet been provisioned for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset.
141 |     labels:
142 |       severity: {{ $severity }}
143 |       runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
144 |   {{- end }}
145 | {{- end }}
146 | {{- end }}
147 |
148 | {{- end }} {{/* end-section */}}
149 |
150 | {{- $service = dict "service" "provisionerd" -}}{{- /* runbook links in this section must point at the provisionerd runbook, not coderd's */ -}}
151 | {{- with .Values.global.coder.alerts.provisionerd }} {{/* start-section */}}
152 |   provisionerd.yaml: |-
153 |     groups:
154 |     {{- with .groups.Replicas }}
155 |     {{- $group := . }}
156 |     {{- if .enabled }}
157 |     - name: Provisionerd Replicas
158 |       rules:
159 |       {{ $alert := "ProvisionerdReplicas" }}
160 |       {{- range $severity, $threshold := .thresholds }}
161 |       - alert: {{ $alert }}
162 |         expr: sum(coderd_provisionerd_num_daemons{ {{- include "coderd-selector" $ -}} }) < {{ $threshold }}
163 |         for: {{ $group.delay }}
164 |         annotations:
165 |           summary: Number of alive provisionerd replicas is below the threshold = {{ $threshold -}}.
166 |         labels:
167 |           severity: {{ $severity }}
168 |           runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
169 |       {{- end }}
170 |     {{- end }}
171 |     {{- end }}
172 |
173 | {{- end }} {{/* end-section */}}
174 |
175 |
176 | {{- /* No "enterprise" runbook is shipped: the licence runbook is the "CoderdLicenseSeats" section of the coderd runbook, so link there. */ -}}
177 | {{- $service = dict "service" "coderd" -}}
178 | {{- with .Values.global.coder.alerts.enterprise }} {{/* start-section */}}
179 |   enterprise.yaml: |-
180 |     groups:
181 |     {{- with .groups.Licences }}
182 |     {{- $group := . }}
183 |     {{- if .enabled }}
184 |     - name: Licences
185 |       rules:
186 |       {{ $alert := "CoderLicenseSeats" }}
187 |       {{- range $severity, $threshold := .thresholds }}
188 |       - alert: {{ $alert }}
189 |         expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >= {{- $threshold }}'
190 |         for: {{ $group.delay }}
191 |         annotations:
192 |           summary: Your Coder enterprise licence usage is now at {{ `{{ $value | humanizePercentage }}` }} capacity.
193 |         labels:
194 |           severity: {{ $severity }}
195 |           runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" "CoderdLicenseSeats") $service) }}
196 |       {{- end }}
197 |     {{- end }}
198 |     {{- end }}
199 | {{- end }} {{/* end-section */}}
200 |
201 | {{- $service = dict "service" "postgres" -}}{{- /* Postgres alert rules; runbook links in this section point at the postgres runbook */ -}}
202 | {{- with .Values.global.postgres }}
203 |   postgres.yaml: |-
204 |     groups:
205 |     {{- with .alerts.groups.Notifications }}
206 |     {{- $group := . -}}
207 |     {{- if .enabled }}
208 |     - name: Notifications
209 |       rules:
210 |       {{ $alert := "PostgresNotificationQueueFillingUp" }}
211 |       {{- range $severity, $threshold := .thresholds }}
212 |       - alert: {{ $alert }}
213 |         expr: {{ include "postgres-pubsub-queue-usage-metric-name" . }} > {{ $threshold }}
214 |         for: {{ $group.delay }}
215 |         annotations:
216 |           summary: The postgres instance {{ `{{ $labels.instance }}` }} has a notification queue that is filling up, which may impact application performance.
217 |         labels:
218 |           severity: {{ $severity }}
219 |           runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
220 |       {{- end }}
221 |     {{- end -}}
222 |     {{- end -}}
223 |     {{- with .alerts.groups.Basic }}
224 |     {{ $group := . -}}
225 |     {{- if .enabled }}
226 |     - name: Liveness
227 |       rules:
228 |       {{ $alert := "PostgresDown" }}
229 |       - alert: {{ $alert }}
230 |         expr: pg_up == 0
231 |         for: {{ $group.delay }}
232 |         annotations:
233 |           summary: The postgres instance {{ `{{ $labels.instance }}` }} is down!
234 |         labels:
235 |           severity: critical
236 |           runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
237 |     {{- end }}
238 |     {{ end }}
239 |     {{- with .alerts.groups.Connections }}
240 |     {{ $group := . -}}
241 |     {{- if .enabled }}
242 |     - name: Connections
243 |       rules:
244 |       {{ $alert := "PostgresConnectionsRunningLow" }}
245 |       {{- range $severity, $threshold := .thresholds }}
246 |       - alert: {{ $alert }}
247 |         expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * {{ $threshold }})
248 |         for: {{ $group.delay }}
249 |         annotations:
250 |           summary: The postgres instance {{ `{{ $labels.instance }}` }} is running low on connections which may impact application performance.
251 |         labels:
252 |           severity: {{ $severity }}
253 |           runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
254 |       {{- end }}
255 |     {{- end -}}
256 |     {{- end -}}
257 | {{ end }}
258 |
--------------------------------------------------------------------------------
/coder-observability/templates/configmap-runbooks.yaml:
--------------------------------------------------------------------------------
1 | {{- /* ConfigMap carrying every file matched by runbooks/** as its data. */ -}}
2 | {{- /* The glob is rendered once and reused for both the data section and its checksum. */ -}}
3 | {{- $runbooks := (.Files.Glob "runbooks/**").AsConfig | indent 2 -}}
4 | ---
5 | apiVersion: v1
6 | kind: ConfigMap
7 | metadata:
8 |   name: runbooks
9 |   namespace: {{ .Release.Namespace }}
10 |   annotations:
11 |     checksum/config: {{ $runbooks | sha256sum }}
12 | data:
13 | {{ $runbooks }}
--------------------------------------------------------------------------------
/coder-observability/templates/configmap-sql-exporter.yaml:
--------------------------------------------------------------------------------
1 | {{- /* ConfigMap with the SQL exporter configuration: one target named postgres and one collector named notify. */ -}}
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 |   name: sql-exporter-config
6 |   namespace: {{ .Release.Namespace }}
7 | data:
8 |   config.yaml: |-
9 |     global:
10 |     target:
11 |       name: postgres
12 |       data_source_name: '{{ include "postgres-connector-string" . }}'
13 |       collectors:
14 |         - notify
15 |     collectors:
16 |       - collector_name: notify
17 |         metrics:
18 |           # Add a metric to show the current usage of the Postgres "pub/sub" mechanism
19 |           # See https://www.postgresql.org/docs/current/functions-info.html
20 |           - metric_name: {{ include "postgres-pubsub-queue-usage-metric-name" . }}
21 |             type: gauge
22 |             help: "The fraction (0–1) of the asynchronous notification queue's maximum size that is currently occupied by notifications that are waiting to be processed"
23 |             static_labels:
24 |               {{- /* Quoted so that numeric or boolean-looking values are still emitted as YAML strings. */}}
25 |               hostname: {{ .Values.global.postgres.hostname | quote }}
26 |               database: {{ .Values.global.postgres.database | quote }}
27 |             values: [ usage ]
28 |             query: |
29 |               SELECT pg_notification_queue_usage() AS usage;
--------------------------------------------------------------------------------
/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl:
--------------------------------------------------------------------------------
1 | {{ define "prebuilds-dashboard.json" }}
2 | {
3 | "annotations": {
4 | "list": [
5 | {
6 | "builtIn": 1,
7 | "datasource": {
8 | "type": "grafana",
9 | "uid": "-- Grafana --"
10 | },
11 | "enable": true,
12 | "hide": true,
13 | "iconColor": "rgba(0, 211, 255, 1)",
14 | "name": "Annotations & Alerts",
15 | "type": "dashboard"
16 | }
17 | ]
18 | },
19 | "editable": true,
20 | "fiscalYearStartMonth": 0,
21 | "graphTooltip": 0,
22 | "id": 10,
23 | "links": [],
24 | "panels": [
25 | {
26 | "datasource": {
27 | "type": "prometheus",
28 | "uid": "prometheus"
29 | },
30 | "fieldConfig": {
31 | "defaults": {
32 | "color": {
33 | "mode": "thresholds"
34 | },
35 | "mappings": [
36 | {
37 | "options": {
38 | "0": {
39 | "color": "orange",
40 | "index": 2,
41 | "text": "Not enabled"
42 | },
43 | "1": {
44 | "color": "green",
45 | "index": 0,
46 | "text": "Enabled"
47 | }
48 | },
49 | "type": "value"
50 | },
51 | {
52 | "options": {
53 | "match": "null",
54 | "result": {
55 | "color": "orange",
56 | "index": 1,
57 | "text": "Not enabled"
58 | }
59 | },
60 | "type": "special"
61 | }
62 | ],
63 | "thresholds": {
64 | "mode": "absolute",
65 | "steps": [
66 | {
67 | "color": "green",
68 | "value": null
69 | },
70 | {
71 | "color": "red",
72 | "value": 80
73 | }
74 | ]
75 | }
76 | },
77 | "overrides": []
78 | },
79 | "gridPos": {
80 | "h": 4,
81 | "w": 4,
82 | "x": 0,
83 | "y": 0
84 | },
85 | "id": 15,
86 | "options": {
87 | "colorMode": "value",
88 | "graphMode": "none",
89 | "justifyMode": "center",
90 | "orientation": "auto",
91 | "reduceOptions": {
92 | "calcs": [
93 | "lastNotNull"
94 | ],
95 | "fields": "",
96 | "values": false
97 | },
98 | "showPercentChange": false,
99 | "text": {
100 | "valueSize": 15
101 | },
102 | "textMode": "auto",
103 | "wideLayout": true
104 | },
105 | "pluginVersion": "10.4.3",
106 | "targets": [
107 | {
108 | "datasource": {
109 | "type": "prometheus",
110 | "uid": "prometheus"
111 | },
112 | "editorMode": "code",
113 | "expr": "min(coderd_experiments{experiment=\"workspace-prebuilds\"})",
114 | "instant": true,
115 | "legendFormat": "__auto",
116 | "range": false,
117 | "refId": "A"
118 | }
119 | ],
120 | "title": "Experiment enabled?",
121 | "type": "stat"
122 | },
123 | {
124 | "datasource": {
125 | "type": "prometheus",
126 | "uid": "prometheus"
127 | },
128 | "fieldConfig": {
129 | "defaults": {
130 | "color": {
131 | "fixedColor": "text",
132 | "mode": "fixed"
133 | },
134 | "mappings": [],
135 | "thresholds": {
136 | "mode": "absolute",
137 | "steps": [
138 | {
139 | "color": "green",
140 | "value": null
141 | },
142 | {
143 | "color": "red",
144 | "value": 80
145 | }
146 | ]
147 | }
148 | },
149 | "overrides": []
150 | },
151 | "gridPos": {
152 | "h": 4,
153 | "w": 4,
154 | "x": 4,
155 | "y": 0
156 | },
157 | "id": 49,
158 | "interval": "30s",
159 | "options": {
160 | "colorMode": "value",
161 | "graphMode": "area",
162 | "justifyMode": "center",
163 | "orientation": "vertical",
164 | "reduceOptions": {
165 | "calcs": [
166 | "lastNotNull"
167 | ],
168 | "fields": "",
169 | "values": false
170 | },
171 | "showPercentChange": false,
172 | "textMode": "auto",
173 | "wideLayout": true
174 | },
175 | "pluginVersion": "10.4.3",
176 | "repeatDirection": "v",
177 | "targets": [
178 | {
179 | "datasource": {
180 | "type": "prometheus",
181 | "uid": "prometheus"
182 | },
183 | "editorMode": "code",
184 | "exemplar": false,
185 | "expr": "sum(max(coderd_prebuilt_workspaces_desired) by (template_name, preset_name)) or vector(0)",
186 | "instant": true,
187 | "interval": "",
188 | "legendFormat": "Desired",
189 | "range": false,
190 | "refId": "A"
191 | },
192 | {
193 | "datasource": {
194 | "type": "prometheus",
195 | "uid": "prometheus"
196 | },
197 | "editorMode": "code",
198 | "exemplar": false,
199 | "expr": "sum(max(coderd_prebuilt_workspaces_running) by (template_name, preset_name)) or vector(0)",
200 | "hide": false,
201 | "instant": true,
202 | "interval": "",
203 | "legendFormat": "Running",
204 | "range": false,
205 | "refId": "D"
206 | },
207 | {
208 | "datasource": {
209 | "type": "prometheus",
210 | "uid": "prometheus"
211 | },
212 | "editorMode": "code",
213 | "exemplar": false,
214 | "expr": "sum(max(coderd_prebuilt_workspaces_eligible) by (template_name, preset_name)) or vector(0)",
215 | "hide": false,
216 | "instant": true,
217 | "interval": "",
218 | "legendFormat": "Eligible",
219 | "range": false,
220 | "refId": "E"
221 | }
222 | ],
223 | "title": "Current: Global",
224 | "type": "stat"
225 | },
226 | {
227 | "datasource": {
228 | "type": "prometheus",
229 | "uid": "prometheus"
230 | },
231 | "description": "",
232 | "fieldConfig": {
233 | "defaults": {
234 | "color": {
235 | "fixedColor": "text",
236 | "mode": "fixed"
237 | },
238 | "mappings": [],
239 | "thresholds": {
240 | "mode": "absolute",
241 | "steps": [
242 | {
243 | "color": "green",
244 | "value": null
245 | },
246 | {
247 | "color": "red",
248 | "value": 80
249 | }
250 | ]
251 | }
252 | },
253 | "overrides": []
254 | },
255 | "gridPos": {
256 | "h": 4,
257 | "w": 4,
258 | "x": 8,
259 | "y": 0
260 | },
261 | "id": 48,
262 | "interval": "30s",
263 | "options": {
264 | "colorMode": "value",
265 | "graphMode": "area",
266 | "justifyMode": "center",
267 | "orientation": "vertical",
268 | "reduceOptions": {
269 | "calcs": [
270 | "lastNotNull"
271 | ],
272 | "fields": "",
273 | "values": false
274 | },
275 | "showPercentChange": false,
276 | "textMode": "auto",
277 | "wideLayout": true
278 | },
279 | "pluginVersion": "10.4.3",
280 | "repeatDirection": "v",
281 | "targets": [
282 | {
283 | "datasource": {
284 | "type": "prometheus",
285 | "uid": "prometheus"
286 | },
287 | "editorMode": "code",
288 | "exemplar": false,
289 | "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_created_total)) or vector(0)",
290 | "hide": false,
291 | "instant": true,
292 | "interval": "",
293 | "legendFormat": "Created",
294 | "range": false,
295 | "refId": "B"
296 | },
297 | {
298 | "datasource": {
299 | "type": "prometheus",
300 | "uid": "prometheus"
301 | },
302 | "editorMode": "code",
303 | "exemplar": false,
304 | "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_failed_total)) or vector(0)",
305 | "hide": false,
306 | "instant": true,
307 | "interval": "",
308 | "legendFormat": "Failed",
309 | "range": false,
310 | "refId": "C"
311 | },
312 | {
313 | "datasource": {
314 | "type": "prometheus",
315 | "uid": "prometheus"
316 | },
317 | "editorMode": "code",
318 | "exemplar": false,
319 | "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_claimed_total)) or vector(0)",
320 | "hide": false,
321 | "instant": true,
322 | "interval": "",
323 | "legendFormat": "Claimed",
324 | "range": false,
325 | "refId": "A"
326 | }
327 | ],
328 | "title": "All Time: Global",
329 | "type": "stat"
330 | },
331 | {
332 | "gridPos": {
333 | "h": 1,
334 | "w": 24,
335 | "x": 0,
336 | "y": 4
337 | },
338 | "id": 2,
339 | "panels": [],
340 | "repeat": "template",
341 | "repeatDirection": "h",
342 | "title": "$template",
343 | "type": "row"
344 | },
345 | {
346 | "datasource": {
347 | "type": "prometheus",
348 | "uid": "prometheus"
349 | },
350 | "fieldConfig": {
351 | "defaults": {
352 | "color": {
353 | "fixedColor": "text",
354 | "mode": "fixed"
355 | },
356 | "mappings": [],
357 | "thresholds": {
358 | "mode": "absolute",
359 | "steps": [
360 | {
361 | "color": "green",
362 | "value": null
363 | },
364 | {
365 | "color": "red",
366 | "value": 80
367 | }
368 | ]
369 | }
370 | },
371 | "overrides": []
372 | },
373 | "gridPos": {
374 | "h": 7,
375 | "w": 4,
376 | "x": 0,
377 | "y": 5
378 | },
379 | "id": 31,
380 | "interval": "30s",
381 | "options": {
382 | "colorMode": "value",
383 | "graphMode": "area",
384 | "justifyMode": "center",
385 | "orientation": "vertical",
386 | "reduceOptions": {
387 | "calcs": [
388 | "lastNotNull"
389 | ],
390 | "fields": "",
391 | "values": false
392 | },
393 | "showPercentChange": false,
394 | "textMode": "auto",
395 | "wideLayout": true
396 | },
397 | "pluginVersion": "10.4.3",
398 | "repeat": "preset",
399 | "repeatDirection": "v",
400 | "targets": [
401 | {
402 | "datasource": {
403 | "type": "prometheus",
404 | "uid": "prometheus"
405 | },
406 | "editorMode": "code",
407 | "exemplar": false,
408 | "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)",
409 | "instant": true,
410 | "interval": "",
411 | "legendFormat": "Desired",
412 | "range": false,
413 | "refId": "A"
414 | },
415 | {
416 | "datasource": {
417 | "type": "prometheus",
418 | "uid": "prometheus"
419 | },
420 | "editorMode": "code",
421 | "exemplar": false,
422 | "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)",
423 | "hide": false,
424 | "instant": true,
425 | "interval": "",
426 | "legendFormat": "Running",
427 | "range": false,
428 | "refId": "D"
429 | },
430 | {
431 | "datasource": {
432 | "type": "prometheus",
433 | "uid": "prometheus"
434 | },
435 | "editorMode": "code",
436 | "exemplar": false,
437 | "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)",
438 | "hide": false,
439 | "instant": true,
440 | "interval": "",
441 | "legendFormat": "Eligible",
442 | "range": false,
443 | "refId": "E"
444 | }
445 | ],
446 | "title": "Current: $preset",
447 | "type": "stat"
448 | },
449 | {
450 | "datasource": {
451 | "type": "prometheus",
452 | "uid": "prometheus"
453 | },
454 | "fieldConfig": {
455 | "defaults": {
456 | "color": {
457 | "mode": "palette-classic"
458 | },
459 | "custom": {
460 | "axisBorderShow": false,
461 | "axisCenteredZero": false,
462 | "axisColorMode": "text",
463 | "axisLabel": "",
464 | "axisPlacement": "auto",
465 | "axisSoftMax": 10,
466 | "axisSoftMin": 0,
467 | "barAlignment": 0,
468 | "drawStyle": "line",
469 | "fillOpacity": 18,
470 | "gradientMode": "none",
471 | "hideFrom": {
472 | "legend": false,
473 | "tooltip": false,
474 | "viz": false
475 | },
476 | "insertNulls": false,
477 | "lineInterpolation": "smooth",
478 | "lineStyle": {
479 | "fill": "solid"
480 | },
481 | "lineWidth": 2,
482 | "pointSize": 5,
483 | "scaleDistribution": {
484 | "type": "linear"
485 | },
486 | "showPoints": "never",
487 | "spanNulls": false,
488 | "stacking": {
489 | "group": "A",
490 | "mode": "none"
491 | },
492 | "thresholdsStyle": {
493 | "mode": "off"
494 | }
495 | },
496 | "decimals": 0,
497 | "fieldMinMax": false,
498 | "mappings": [],
499 | "thresholds": {
500 | "mode": "absolute",
501 | "steps": [
502 | {
503 | "color": "green",
504 | "value": null
505 | },
506 | {
507 | "color": "red",
508 | "value": 80
509 | }
510 | ]
511 | }
512 | },
513 | "overrides": [
514 | {
515 | "matcher": {
516 | "id": "byName",
517 | "options": "Desired"
518 | },
519 | "properties": [
520 | {
521 | "id": "color",
522 | "value": {
523 | "fixedColor": "purple",
524 | "mode": "fixed"
525 | }
526 | },
527 | {
528 | "id": "custom.lineStyle",
529 | "value": {
530 | "dash": [
531 | 10,
532 | 10
533 | ],
534 | "fill": "dash"
535 | }
536 | },
537 | {
538 | "id": "custom.fillOpacity",
539 | "value": 85
540 | },
541 | {
542 | "id": "custom.fillBelowTo",
543 | "value": "Running"
544 | }
545 | ]
546 | },
547 | {
548 | "matcher": {
549 | "id": "byName",
550 | "options": "Running"
551 | },
552 | "properties": [
553 | {
554 | "id": "color",
555 | "value": {
556 | "fixedColor": "yellow",
557 | "mode": "fixed"
558 | }
559 | },
560 | {
561 | "id": "custom.fillBelowTo",
562 | "value": "Eligible"
563 | }
564 | ]
565 | },
566 | {
567 | "matcher": {
568 | "id": "byName",
569 | "options": "Eligible"
570 | },
571 | "properties": [
572 | {
573 | "id": "color",
574 | "value": {
575 | "fixedColor": "green",
576 | "mode": "fixed"
577 | }
578 | }
579 | ]
580 | }
581 | ]
582 | },
583 | "gridPos": {
584 | "h": 7,
585 | "w": 8,
586 | "x": 4,
587 | "y": 5
588 | },
589 | "id": 5,
590 | "options": {
591 | "legend": {
592 | "calcs": [],
593 | "displayMode": "list",
594 | "placement": "bottom",
595 | "showLegend": true
596 | },
597 | "tooltip": {
598 | "mode": "single",
599 | "sort": "none"
600 | }
601 | },
602 | "pluginVersion": "10.4.3",
603 | "repeat": "preset",
604 | "repeatDirection": "v",
605 | "targets": [
606 | {
607 | "datasource": {
608 | "type": "prometheus",
609 | "uid": "prometheus"
610 | },
611 | "editorMode": "code",
612 | "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)",
613 | "instant": false,
614 | "interval": "",
615 | "legendFormat": "Desired",
616 | "range": true,
617 | "refId": "A"
618 | },
619 | {
620 | "datasource": {
621 | "type": "prometheus",
622 | "uid": "prometheus"
623 | },
624 | "editorMode": "code",
625 | "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)",
626 | "hide": false,
627 | "instant": false,
628 | "interval": "",
629 | "legendFormat": "Running",
630 | "range": true,
631 | "refId": "D"
632 | },
633 | {
634 | "datasource": {
635 | "type": "prometheus",
636 | "uid": "prometheus"
637 | },
638 | "editorMode": "code",
639 | "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)",
640 | "hide": false,
641 | "instant": false,
642 | "interval": "",
643 | "legendFormat": "Eligible",
644 | "range": true,
645 | "refId": "E"
646 | }
647 | ],
648 | "title": "Pool Capacity: $preset",
649 | "type": "timeseries"
650 | },
651 | {
652 | "datasource": {
653 | "type": "prometheus",
654 | "uid": "prometheus"
655 | },
656 | "fieldConfig": {
657 | "defaults": {
658 | "color": {
659 | "mode": "palette-classic"
660 | },
661 | "custom": {
662 | "axisBorderShow": false,
663 | "axisCenteredZero": false,
664 | "axisColorMode": "text",
665 | "axisLabel": "",
666 | "axisPlacement": "auto",
667 | "axisSoftMax": 10,
668 | "axisSoftMin": 0,
669 | "barAlignment": 0,
670 | "drawStyle": "line",
671 | "fillOpacity": 13,
672 | "gradientMode": "none",
673 | "hideFrom": {
674 | "legend": false,
675 | "tooltip": false,
676 | "viz": false
677 | },
678 | "insertNulls": false,
679 | "lineInterpolation": "smooth",
680 | "lineStyle": {
681 | "fill": "solid"
682 | },
683 | "lineWidth": 2,
684 | "pointSize": 5,
685 | "scaleDistribution": {
686 | "type": "linear"
687 | },
688 | "showPoints": "never",
689 | "spanNulls": false,
690 | "stacking": {
691 | "group": "A",
692 | "mode": "none"
693 | },
694 | "thresholdsStyle": {
695 | "mode": "off"
696 | }
697 | },
698 | "decimals": 0,
699 | "fieldMinMax": false,
700 | "mappings": [],
701 | "thresholds": {
702 | "mode": "absolute",
703 | "steps": [
704 | {
705 | "color": "green",
706 | "value": null
707 | },
708 | {
709 | "color": "red",
710 | "value": 80
711 | }
712 | ]
713 | }
714 | },
715 | "overrides": [
716 | {
717 | "matcher": {
718 | "id": "byName",
719 | "options": "Failed"
720 | },
721 | "properties": [
722 | {
723 | "id": "color",
724 | "value": {
725 | "fixedColor": "red",
726 | "mode": "fixed"
727 | }
728 | }
729 | ]
730 | },
731 | {
732 | "matcher": {
733 | "id": "byName",
734 | "options": "Created"
735 | },
736 | "properties": [
737 | {
738 | "id": "color",
739 | "value": {
740 | "fixedColor": "blue",
741 | "mode": "fixed"
742 | }
743 | }
744 | ]
745 | },
746 | {
747 | "matcher": {
748 | "id": "byName",
749 | "options": "Desired"
750 | },
751 | "properties": [
752 | {
753 | "id": "color",
754 | "value": {
755 | "fixedColor": "purple",
756 | "mode": "fixed"
757 | }
758 | }
759 | ]
760 | },
761 | {
762 | "matcher": {
763 | "id": "byName",
764 | "options": "Running"
765 | },
766 | "properties": [
767 | {
768 | "id": "color",
769 | "value": {
770 | "fixedColor": "yellow",
771 | "mode": "fixed"
772 | }
773 | }
774 | ]
775 | },
776 | {
777 | "matcher": {
778 | "id": "byName",
779 | "options": "Eligible"
780 | },
781 | "properties": [
782 | {
783 | "id": "color",
784 | "value": {
785 | "fixedColor": "green",
786 | "mode": "fixed"
787 | }
788 | }
789 | ]
790 | },
791 | {
792 | "matcher": {
793 | "id": "byName",
794 | "options": "Claimed"
795 | },
796 | "properties": [
797 | {
798 | "id": "color",
799 | "value": {
800 | "fixedColor": "dark-green",
801 | "mode": "fixed"
802 | }
803 | }
804 | ]
805 | }
806 | ]
807 | },
808 | "gridPos": {
809 | "h": 7,
810 | "w": 8,
811 | "x": 12,
812 | "y": 5
813 | },
814 | "id": 38,
815 | "options": {
816 | "legend": {
817 | "calcs": [],
818 | "displayMode": "list",
819 | "placement": "bottom",
820 | "showLegend": true
821 | },
822 | "tooltip": {
823 | "mode": "single",
824 | "sort": "none"
825 | }
826 | },
827 | "pluginVersion": "10.4.3",
828 | "repeat": "preset",
829 | "repeatDirection": "v",
830 | "targets": [
831 | {
832 | "datasource": {
833 | "type": "prometheus",
834 | "uid": "prometheus"
835 | },
836 | "editorMode": "code",
837 | "expr": "floor(max(increase(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)",
838 | "hide": false,
839 | "instant": false,
840 | "interval": "",
841 | "legendFormat": "Created",
842 | "range": true,
843 | "refId": "B"
844 | },
845 | {
846 | "datasource": {
847 | "type": "prometheus",
848 | "uid": "prometheus"
849 | },
850 | "editorMode": "code",
851 | "expr": "floor(max(increase(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)",
852 | "hide": false,
853 | "instant": false,
854 | "interval": "",
855 | "legendFormat": "Failed",
856 | "range": true,
857 | "refId": "C"
858 | },
859 | {
860 | "datasource": {
861 | "type": "prometheus",
862 | "uid": "prometheus"
863 | },
864 | "editorMode": "code",
865 | "expr": "floor(max(increase(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)",
866 | "hide": false,
867 | "instant": false,
868 | "interval": "",
869 | "legendFormat": "Claimed",
870 | "range": true,
871 | "refId": "F"
872 | }
873 | ],
874 | "title": "Pool Operations: $preset",
875 | "type": "timeseries"
876 | },
877 | {
878 | "datasource": {
879 | "type": "prometheus",
880 | "uid": "prometheus"
881 | },
882 | "description": "",
883 | "fieldConfig": {
884 | "defaults": {
885 | "color": {
886 | "fixedColor": "text",
887 | "mode": "fixed"
888 | },
889 | "mappings": [],
890 | "thresholds": {
891 | "mode": "absolute",
892 | "steps": [
893 | {
894 | "color": "green",
895 | "value": null
896 | },
897 | {
898 | "color": "red",
899 | "value": 80
900 | }
901 | ]
902 | }
903 | },
904 | "overrides": []
905 | },
906 | "gridPos": {
907 | "h": 7,
908 | "w": 4,
909 | "x": 20,
910 | "y": 5
911 | },
912 | "id": 1,
913 | "interval": "30s",
914 | "options": {
915 | "colorMode": "value",
916 | "graphMode": "area",
917 | "justifyMode": "center",
918 | "orientation": "vertical",
919 | "reduceOptions": {
920 | "calcs": [
921 | "lastNotNull"
922 | ],
923 | "fields": "",
924 | "values": false
925 | },
926 | "showPercentChange": false,
927 | "textMode": "auto",
928 | "wideLayout": true
929 | },
930 | "pluginVersion": "10.4.3",
931 | "repeat": "preset",
932 | "repeatDirection": "v",
933 | "targets": [
934 | {
935 | "datasource": {
936 | "type": "prometheus",
937 | "uid": "prometheus"
938 | },
939 | "editorMode": "code",
940 | "exemplar": false,
941 | "expr": "max(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)",
942 | "hide": false,
943 | "instant": true,
944 | "interval": "",
945 | "legendFormat": "Created",
946 | "range": false,
947 | "refId": "B"
948 | },
949 | {
950 | "datasource": {
951 | "type": "prometheus",
952 | "uid": "prometheus"
953 | },
954 | "editorMode": "code",
955 | "exemplar": false,
956 | "expr": "max(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)",
957 | "hide": false,
958 | "instant": true,
959 | "interval": "",
960 | "legendFormat": "Failed",
961 | "range": false,
962 | "refId": "C"
963 | },
964 | {
965 | "datasource": {
966 | "type": "prometheus",
967 | "uid": "prometheus"
968 | },
969 | "editorMode": "code",
970 | "exemplar": false,
971 | "expr": "max(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)",
972 | "hide": false,
973 | "instant": true,
974 | "interval": "",
975 | "legendFormat": "Claimed",
976 | "range": false,
977 | "refId": "A"
978 | }
979 | ],
980 | "title": "All Time: $preset",
981 | "type": "stat"
982 | }
983 | ],
984 | "refresh": "{{- include "dashboard-refresh" . -}}",
985 | "schemaVersion": 39,
986 | "tags": [],
987 | "templating": {
988 | "list": [
989 | {
990 | "allValue": "",
991 | "datasource": {
992 | "type": "prometheus",
993 | "uid": "prometheus"
994 | },
995 | "definition": "label_values(coderd_prebuilt_workspaces_desired,template_name)",
996 | "hide": 0,
997 | "includeAll": false,
998 | "label": "Template",
999 | "multi": false,
1000 | "name": "template",
1001 | "options": [],
1002 | "query": {
1003 | "qryType": 1,
1004 | "query": "label_values(coderd_prebuilt_workspaces_desired,template_name)",
1005 | "refId": "PrometheusVariableQueryEditor-VariableQuery"
1006 | },
1007 | "refresh": 1,
1008 | "regex": "",
1009 | "skipUrlSync": false,
1010 | "sort": 0,
1011 | "type": "query"
1012 | },
1013 | {
1014 | "allValue": "",
1015 | "datasource": {
1016 | "type": "prometheus",
1017 | "uid": "prometheus"
1018 | },
1019 | "definition": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)",
1020 | "hide": 0,
1021 | "includeAll": true,
1022 | "label": "Preset",
1023 | "multi": true,
1024 | "name": "preset",
1025 | "options": [],
1026 | "query": {
1027 | "qryType": 1,
1028 | "query": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)",
1029 | "refId": "PrometheusVariableQueryEditor-VariableQuery"
1030 | },
1031 | "refresh": 1,
1032 | "regex": "",
1033 | "skipUrlSync": false,
1034 | "sort": 0,
1035 | "type": "query"
1036 | }
1037 | ]
1038 | },
1039 | "time": {
1040 | "from": "now-{{- include "dashboard-range" . -}}",
1041 | "to": "now"
1042 | },
1043 | "timepicker": {},
1044 | "timezone": "browser",
1045 | "title": "Prebuilds",
1046 | "uid": "cej6jysyme22oa",
1047 | "version": 13,
1048 | "weekStart": ""
1049 | }
1050 | {{ end }}
--------------------------------------------------------------------------------
/coder-observability/templates/dashboards/_dashboards_provisionerd.json.tpl:
--------------------------------------------------------------------------------
1 | {{ define "provisionerd-dashboard.json" }}
2 | {
3 | "annotations": {
4 | "list": [
5 | {
6 | "builtIn": 1,
7 | "datasource": {
8 | "type": "grafana",
9 | "uid": "-- Grafana --"
10 | },
11 | "enable": true,
12 | "hide": true,
13 | "iconColor": "rgba(0, 211, 255, 1)",
14 | "name": "Annotations & Alerts",
15 | "target": {
16 | "limit": 100,
17 | "matchAny": false,
18 | "tags": [],
19 | "type": "dashboard"
20 | },
21 | "type": "dashboard"
22 | }
23 | ]
24 | },
25 | "editable": true,
26 | "fiscalYearStartMonth": 0,
27 | "graphTooltip": 0,
28 | "links": [],
29 | "panels": [
30 | {
31 | "datasource": {
32 | "type": "prometheus",
33 | "uid": "prometheus"
34 | },
35 | "description": "",
36 | "fieldConfig": {
37 | "defaults": {
38 | "color": {
39 | "mode": "thresholds"
40 | },
41 | "mappings": [],
42 | "thresholds": {
43 | "mode": "absolute",
44 | "steps": [
45 | {
46 | "color": "text",
47 | "value": null
48 | },
49 | {
50 | "color": "green",
51 | "value": 1
52 | }
53 | ]
54 | }
55 | },
56 | "overrides": []
57 | },
58 | "gridPos": {
59 | "h": 7,
60 | "w": 6,
61 | "x": 0,
62 | "y": 0
63 | },
64 | "id": 17,
65 | "options": {
66 | "colorMode": "value",
67 | "graphMode": "area",
68 | "justifyMode": "center",
69 | "orientation": "auto",
70 | "reduceOptions": {
71 | "calcs": [
72 | "lastNotNull"
73 | ],
74 | "fields": "",
75 | "values": false
76 | },
77 | "showPercentChange": false,
78 | "textMode": "value_and_name",
79 | "wideLayout": false
80 | },
81 | "pluginVersion": "10.4.0",
82 | "targets": [
83 | {
84 | "datasource": {
85 | "type": "prometheus",
86 | "uid": "prometheus"
87 | },
88 | "editorMode": "code",
89 | "exemplar": false,
90 | "expr": "sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`})",
91 | "instant": true,
92 | "legendFormat": "Built-in",
93 | "range": false,
94 | "refId": "A"
95 | },
96 | {
97 | "datasource": {
98 | "type": "prometheus",
99 | "uid": "prometheus"
100 | },
101 | "editorMode": "code",
102 | "exemplar": false,
103 | "expr": "sum(coderd_provisionerd_num_daemons{ {{- include "provisionerd-selector" . -}} })",
104 | "hide": false,
105 | "instant": true,
106 | "legendFormat": "External",
107 | "range": false,
108 | "refId": "B"
109 | }
110 | ],
111 | "title": "Provisioners",
112 | "type": "stat"
113 | },
114 | {
115 | "datasource": {
116 | "type": "prometheus",
117 | "uid": "prometheus"
118 | },
119 | "description": "",
120 | "gridPos": {
121 | "h": 7,
122 | "w": 6,
123 | "x": 6,
124 | "y": 0
125 | },
126 | "id": 20,
127 | "options": {
128 | "code": {
129 | "language": "plaintext",
130 | "showLineNumbers": false,
131 | "showMiniMap": false
132 | },
133 | "content": "Provisioners are responsible for building workspaces.\n\n`coderd` runs built-in provisioners by default. Control this with the `CODER_PROVISIONER_DAEMONS` environment variable or `--provisioner-daemons` flag.\n\nYou can also consider [External Provisioners](https://coder.com/docs/v2/latest/admin/provisioners). Running both built-in and external provisioners is perfectly valid,\nalthough dedicated (external) provisioners will generally give the best build performance.",
134 | "mode": "markdown"
135 | },
136 | "pluginVersion": "10.4.0",
137 | "transparent": true,
138 | "type": "text"
139 | },
140 | {
141 | "datasource": {
142 | "type": "prometheus",
143 | "uid": "prometheus"
144 | },
145 | "description": "",
146 | "fieldConfig": {
147 | "defaults": {
148 | "color": {
149 | "mode": "thresholds"
150 | },
151 | "mappings": [],
152 | "thresholds": {
153 | "mode": "absolute",
154 | "steps": [
155 | {
156 | "color": "text",
157 | "value": null
158 | },
159 | {
160 | "color": "green",
161 | "value": 1
162 | }
163 | ]
164 | }
165 | },
166 | "overrides": []
167 | },
168 | "gridPos": {
169 | "h": 7,
170 | "w": 6,
171 | "x": 12,
172 | "y": 0
173 | },
174 | "id": 21,
175 | "options": {
176 | "colorMode": "value",
177 | "graphMode": "area",
178 | "justifyMode": "center",
179 | "orientation": "auto",
180 | "reduceOptions": {
181 | "calcs": [
182 | "last"
183 | ],
184 | "fields": "",
185 | "values": false
186 | },
187 | "showPercentChange": false,
188 | "textMode": "auto",
189 | "wideLayout": true
190 | },
191 | "pluginVersion": "10.4.0",
192 | "targets": [
193 | {
194 | "datasource": {
195 | "type": "prometheus",
196 | "uid": "prometheus"
197 | },
198 | "editorMode": "code",
199 | "exemplar": false,
200 | "expr": "(sum(coderd_provisionerd_jobs_current) > 0) or vector(0)",
201 | "instant": false,
202 | "legendFormat": "Current",
203 | "range": true,
204 | "refId": "A"
205 | },
206 | {
207 | "datasource": {
208 | "type": "prometheus",
209 | "uid": "prometheus"
210 | },
211 | "editorMode": "code",
212 | "exemplar": false,
213 | "expr": "sum(coderd_provisionerd_num_daemons)",
214 | "hide": false,
215 | "instant": true,
216 | "legendFormat": "Capacity",
217 | "range": false,
218 | "refId": "B"
219 | }
220 | ],
221 | "title": "Builds",
222 | "type": "stat"
223 | },
224 | {
225 | "datasource": {
226 | "type": "prometheus",
227 | "uid": "prometheus"
228 | },
229 | "description": "",
230 | "gridPos": {
231 | "h": 7,
232 | "w": 6,
233 | "x": 18,
234 | "y": 0
235 | },
236 | "id": 22,
237 | "options": {
238 | "code": {
239 | "language": "plaintext",
240 | "showLineNumbers": false,
241 | "showMiniMap": false
242 | },
243 | "content": "The maximum number of simultaneous builds is equivalent to the number of `provisionerd` daemons running.\n\nThe \"Capacity\" panel shows how many simultaneous builds are possible.",
244 | "mode": "markdown"
245 | },
246 | "pluginVersion": "10.4.0",
247 | "transparent": true,
248 | "type": "text"
249 | },
250 | {
251 | "datasource": {
252 | "type": "prometheus",
253 | "uid": "prometheus"
254 | },
255 | "description": "",
256 | "fieldConfig": {
257 | "defaults": {
258 | "color": {
259 | "mode": "thresholds"
260 | },
261 | "fieldMinMax": false,
262 | "mappings": [],
263 | "thresholds": {
264 | "mode": "absolute",
265 | "steps": [
266 | {
267 | "color": "text",
268 | "value": null
269 | }
270 | ]
271 | },
272 | "unit": "s"
273 | },
274 | "overrides": []
275 | },
276 | "gridPos": {
277 | "h": 7,
278 | "w": 6,
279 | "x": 0,
280 | "y": 7
281 | },
282 | "id": 23,
283 | "options": {
284 | "colorMode": "value",
285 | "graphMode": "none",
286 | "justifyMode": "center",
287 | "orientation": "auto",
288 | "reduceOptions": {
289 | "calcs": [
290 | "lastNotNull"
291 | ],
292 | "fields": "",
293 | "values": false
294 | },
295 | "showPercentChange": false,
296 | "textMode": "auto",
297 | "wideLayout": true
298 | },
299 | "pluginVersion": "10.4.0",
300 | "targets": [
301 | {
302 | "datasource": {
303 | "type": "prometheus",
304 | "uid": "prometheus"
305 | },
306 | "editorMode": "code",
307 | "exemplar": false,
308 | "expr": "histogram_quantile(0.5, sum by(le) (rate(coderd_provisionerd_job_timings_seconds_bucket[$__range])))",
309 | "hide": false,
310 | "instant": true,
311 | "legendFormat": "Median",
312 | "range": false,
313 | "refId": "B"
314 | },
315 | {
316 | "datasource": {
317 | "type": "prometheus",
318 | "uid": "prometheus"
319 | },
320 | "editorMode": "code",
321 | "exemplar": false,
322 | "expr": "histogram_quantile(0.9, sum by(le) (rate(coderd_provisionerd_job_timings_seconds_bucket[$__range])))",
323 | "hide": false,
324 | "instant": true,
325 | "legendFormat": "90th Percentile",
326 | "range": false,
327 | "refId": "A"
328 | }
329 | ],
330 | "title": "Build Times",
331 | "type": "stat"
332 | },
333 | {
334 | "datasource": {
335 | "type": "prometheus",
336 | "uid": "prometheus"
337 | },
338 | "description": "",
339 | "gridPos": {
340 | "h": 7,
341 | "w": 6,
342 | "x": 6,
343 | "y": 7
344 | },
345 | "id": 24,
346 | "options": {
347 | "code": {
348 | "language": "plaintext",
349 | "showLineNumbers": false,
350 | "showMiniMap": false
351 | },
352 | "content": "This shows the median and 90th percentile workspace build times.\n\nLong build times can impede developers' productivity while they wait for workspaces to start or be created.",
353 | "mode": "markdown"
354 | },
355 | "pluginVersion": "10.4.0",
356 | "transparent": true,
357 | "type": "text"
358 | },
359 | {
360 | "datasource": {
361 | "type": "prometheus",
362 | "uid": "prometheus"
363 | },
364 | "description": "",
365 | "fieldConfig": {
366 | "defaults": {
367 | "color": {
368 | "mode": "palette-classic"
369 | },
370 | "custom": {
371 | "axisBorderShow": false,
372 | "axisCenteredZero": false,
373 | "axisColorMode": "text",
374 | "axisLabel": "",
375 | "axisPlacement": "auto",
376 | "barAlignment": 0,
377 | "drawStyle": "bars",
378 | "fillOpacity": 100,
379 | "gradientMode": "none",
380 | "hideFrom": {
381 | "legend": false,
382 | "tooltip": false,
383 | "viz": false
384 | },
385 | "insertNulls": false,
386 | "lineInterpolation": "linear",
387 | "lineWidth": 1,
388 | "pointSize": 5,
389 | "scaleDistribution": {
390 | "type": "linear"
391 | },
392 | "showPoints": "auto",
393 | "spanNulls": false,
394 | "stacking": {
395 | "group": "A",
396 | "mode": "normal"
397 | },
398 | "thresholdsStyle": {
399 | "mode": "off"
400 | }
401 | },
402 | "decimals": 0,
403 | "fieldMinMax": false,
404 | "mappings": [],
405 | "thresholds": {
406 | "mode": "absolute",
407 | "steps": [
408 | {
409 | "color": "text",
410 | "value": null
411 | }
412 | ]
413 | },
414 | "unit": "short"
415 | },
416 | "overrides": [
417 | {
418 | "matcher": {
419 | "id": "byName",
420 | "options": "failed"
421 | },
422 | "properties": [
423 | {
424 | "id": "color",
425 | "value": {
426 | "fixedColor": "orange",
427 | "mode": "fixed"
428 | }
429 | },
430 | {
431 | "id": "displayName",
432 | "value": "Failure"
433 | }
434 | ]
435 | },
436 | {
437 | "matcher": {
438 | "id": "byName",
439 | "options": "success"
440 | },
441 | "properties": [
442 | {
443 | "id": "color",
444 | "value": {
445 | "fixedColor": "green",
446 | "mode": "fixed"
447 | }
448 | },
449 | {
450 | "id": "displayName",
451 | "value": "Success"
452 | }
453 | ]
454 | }
455 | ]
456 | },
457 | "gridPos": {
458 | "h": 7,
459 | "w": 6,
460 | "x": 12,
461 | "y": 7
462 | },
463 | "id": 25,
464 | "interval": "1h",
465 | "options": {
466 | "legend": {
467 | "calcs": [],
468 | "displayMode": "list",
469 | "placement": "bottom",
470 | "showLegend": true
471 | },
472 | "tooltip": {
473 | "mode": "multi",
474 | "sort": "none"
475 | }
476 | },
477 | "pluginVersion": "10.4.0",
478 | "targets": [
479 | {
480 | "datasource": {
481 | "type": "prometheus",
482 | "uid": "prometheus"
483 | },
484 | "editorMode": "code",
485 | "exemplar": false,
486 | "expr": "sum by (status) (increase(coderd_provisionerd_job_timings_seconds_count[$__interval]))",
487 | "hide": false,
488 | "instant": false,
489 | "interval": "1h",
490 | "legendFormat": "__auto",
491 | "range": true,
492 | "refId": "A"
493 | }
494 | ],
495 | "title": "Build Count Per Hour",
496 | "type": "timeseries"
497 | },
498 | {
499 | "datasource": {
500 | "type": "prometheus",
501 | "uid": "prometheus"
502 | },
503 | "description": "",
504 | "gridPos": {
505 | "h": 7,
506 | "w": 6,
507 | "x": 18,
508 | "y": 7
509 | },
510 | "id": 26,
511 | "options": {
512 | "code": {
513 | "language": "plaintext",
514 | "showLineNumbers": false,
515 | "showMiniMap": false
516 | },
517 | "content": "_NOTE: this will not show the current hour._",
518 | "mode": "markdown"
519 | },
520 | "pluginVersion": "10.4.0",
521 | "transparent": true,
522 | "type": "text"
523 | },
524 | {
525 | "datasource": {
526 | "type": "prometheus",
527 | "uid": "prometheus"
528 | },
529 | "description": "",
530 | "fieldConfig": {
531 | "defaults": {
532 | "color": {
533 | "mode": "palette-classic"
534 | },
535 | "custom": {
536 | "axisBorderShow": false,
537 | "axisCenteredZero": false,
538 | "axisColorMode": "text",
539 | "axisLabel": "",
540 | "axisPlacement": "auto",
541 | "barAlignment": 0,
542 | "drawStyle": "bars",
543 | "fillOpacity": 100,
544 | "gradientMode": "none",
545 | "hideFrom": {
546 | "legend": false,
547 | "tooltip": false,
548 | "viz": false
549 | },
550 | "insertNulls": false,
551 | "lineInterpolation": "linear",
552 | "lineWidth": 1,
553 | "pointSize": 5,
554 | "scaleDistribution": {
555 | "type": "linear"
556 | },
557 | "showPoints": "never",
558 | "spanNulls": false,
559 | "stacking": {
560 | "group": "A",
561 | "mode": "none"
562 | },
563 | "thresholdsStyle": {
564 | "mode": "off"
565 | }
566 | },
567 | "fieldMinMax": false,
568 | "mappings": [],
569 | "thresholds": {
570 | "mode": "absolute",
571 | "steps": [
572 | {
573 | "color": "text",
574 | "value": null
575 | }
576 | ]
577 | },
578 | "unit": "s"
579 | },
580 | "overrides": [
581 | {
582 | "matcher": {
583 | "id": "byRegexp",
584 | "options": "/(Limit|Requested)/"
585 | },
586 | "properties": [
587 | {
588 | "id": "custom.drawStyle",
589 | "value": "line"
590 | },
591 | {
592 | "id": "custom.fillOpacity",
593 | "value": 5
594 | },
595 | {
596 | "id": "custom.lineStyle",
597 | "value": {
598 | "dash": [
599 | 0,
600 | 10
601 | ],
602 | "fill": "dot"
603 | }
604 | }
605 | ]
606 | },
607 | {
608 | "matcher": {
609 | "id": "byName",
610 | "options": "Limit"
611 | },
612 | "properties": [
613 | {
614 | "id": "color",
615 | "value": {
616 | "fixedColor": "orange",
617 | "mode": "fixed"
618 | }
619 | }
620 | ]
621 | },
622 | {
623 | "matcher": {
624 | "id": "byName",
625 | "options": "Requested"
626 | },
627 | "properties": [
628 | {
629 | "id": "color",
630 | "value": {
631 | "fixedColor": "green",
632 | "mode": "fixed"
633 | }
634 | }
635 | ]
636 | }
637 | ]
638 | },
639 | "gridPos": {
640 | "h": 7,
641 | "w": 6,
642 | "x": 0,
643 | "y": 14
644 | },
645 | "id": 28,
646 | "options": {
647 | "legend": {
648 | "calcs": [],
649 | "displayMode": "list",
650 | "placement": "bottom",
651 | "showLegend": true
652 | },
653 | "tooltip": {
654 | "mode": "single",
655 | "sort": "none"
656 | }
657 | },
658 | "pluginVersion": "10.4.0",
659 | "targets": [
660 | {
661 | "datasource": {
662 | "type": "prometheus",
663 | "uid": "prometheus"
664 | },
665 | "editorMode": "code",
666 | "exemplar": false,
667 | "expr": "sum by (pod) (rate(container_cpu_usage_seconds_total{ {{- include "provisionerd-selector" . -}} }[$__rate_interval]))",
668 | "hide": false,
669 | "instant": false,
670 | "legendFormat": "__auto",
671 | "range": true,
672 | "refId": "A"
673 | },
674 | {
675 | "datasource": {
676 | "type": "prometheus",
677 | "uid": "prometheus"
678 | },
679 | "editorMode": "code",
680 | "exemplar": false,
681 | "expr": "max(kube_pod_container_resource_limits{ {{- include "provisionerd-selector" . -}} , resource=\"cpu\"})",
682 | "hide": false,
683 | "instant": false,
684 | "legendFormat": "Limit",
685 | "range": true,
686 | "refId": "B"
687 | },
688 | {
689 | "datasource": {
690 | "type": "prometheus",
691 | "uid": "prometheus"
692 | },
693 | "editorMode": "code",
694 | "exemplar": false,
695 | "expr": "max(kube_pod_container_resource_requests{ {{- include "provisionerd-selector" . -}} , resource=\"cpu\"})",
696 | "hide": false,
697 | "instant": false,
698 | "legendFormat": "Requested",
699 | "range": true,
700 | "refId": "C"
701 | }
702 | ],
703 | "title": "CPU Usage Seconds",
704 | "type": "timeseries"
705 | },
706 | {
707 | "datasource": {
708 | "type": "prometheus",
709 | "uid": "prometheus"
710 | },
711 | "description": "",
712 | "gridPos": {
713 | "h": 7,
714 | "w": 6,
715 | "x": 6,
716 | "y": 14
717 | },
718 | "id": 30,
719 | "options": {
720 | "code": {
721 | "language": "plaintext",
722 | "showLineNumbers": false,
723 | "showMiniMap": false
724 | },
725 | "content": "The cumulative CPU used per core-second. If the process was using a full CPU core, that would be represented as 1 second.\n\nRequests & limits are shown if set.",
726 | "mode": "markdown"
727 | },
728 | "pluginVersion": "10.4.0",
729 | "transparent": true,
730 | "type": "text"
731 | },
732 | {
733 | "datasource": {
734 | "type": "prometheus",
735 | "uid": "prometheus"
736 | },
737 | "description": "",
738 | "fieldConfig": {
739 | "defaults": {
740 | "color": {
741 | "mode": "palette-classic"
742 | },
743 | "custom": {
744 | "axisBorderShow": false,
745 | "axisCenteredZero": false,
746 | "axisColorMode": "text",
747 | "axisLabel": "",
748 | "axisPlacement": "auto",
749 | "barAlignment": 0,
750 | "drawStyle": "bars",
751 | "fillOpacity": 100,
752 | "gradientMode": "none",
753 | "hideFrom": {
754 | "legend": false,
755 | "tooltip": false,
756 | "viz": false
757 | },
758 | "insertNulls": false,
759 | "lineInterpolation": "linear",
760 | "lineWidth": 1,
761 | "pointSize": 5,
762 | "scaleDistribution": {
763 | "type": "linear"
764 | },
765 | "showPoints": "never",
766 | "spanNulls": false,
767 | "stacking": {
768 | "group": "A",
769 | "mode": "none"
770 | },
771 | "thresholdsStyle": {
772 | "mode": "off"
773 | }
774 | },
775 | "fieldMinMax": false,
776 | "mappings": [],
777 | "thresholds": {
778 | "mode": "absolute",
779 | "steps": [
780 | {
781 | "color": "text",
782 | "value": null
783 | }
784 | ]
785 | },
786 | "unit": "bytes"
787 | },
788 | "overrides": [
789 | {
790 | "matcher": {
791 | "id": "byRegexp",
792 | "options": "/(Limit|Requested)/"
793 | },
794 | "properties": [
795 | {
796 | "id": "custom.drawStyle",
797 | "value": "line"
798 | },
799 | {
800 | "id": "custom.fillOpacity",
801 | "value": 5
802 | },
803 | {
804 | "id": "custom.lineStyle",
805 | "value": {
806 | "dash": [
807 | 0,
808 | 10
809 | ],
810 | "fill": "dot"
811 | }
812 | }
813 | ]
814 | },
815 | {
816 | "matcher": {
817 | "id": "byName",
818 | "options": "Limit"
819 | },
820 | "properties": [
821 | {
822 | "id": "color",
823 | "value": {
824 | "fixedColor": "orange",
825 | "mode": "fixed"
826 | }
827 | }
828 | ]
829 | },
830 | {
831 | "matcher": {
832 | "id": "byName",
833 | "options": "Requested"
834 | },
835 | "properties": [
836 | {
837 | "id": "color",
838 | "value": {
839 | "fixedColor": "green",
840 | "mode": "fixed"
841 | }
842 | }
843 | ]
844 | }
845 | ]
846 | },
847 | "gridPos": {
848 | "h": 7,
849 | "w": 6,
850 | "x": 12,
851 | "y": 14
852 | },
853 | "id": 29,
854 | "options": {
855 | "legend": {
856 | "calcs": [],
857 | "displayMode": "list",
858 | "placement": "bottom",
859 | "showLegend": true
860 | },
861 | "tooltip": {
862 | "mode": "single",
863 | "sort": "none"
864 | }
865 | },
866 | "pluginVersion": "10.4.0",
867 | "targets": [
868 | {
869 | "datasource": {
870 | "type": "prometheus",
871 | "uid": "prometheus"
872 | },
873 | "editorMode": "code",
874 | "exemplar": false,
875 | "expr": "max by (pod) (container_memory_working_set_bytes{ {{- include "provisionerd-selector" . -}} })",
876 | "hide": false,
877 | "instant": false,
878 | "legendFormat": "__auto",
879 | "range": true,
880 | "refId": "A"
881 | },
882 | {
883 | "datasource": {
884 | "type": "prometheus",
885 | "uid": "prometheus"
886 | },
887 | "editorMode": "code",
888 | "exemplar": false,
889 | "expr": "max(kube_pod_container_resource_limits{ {{- include "provisionerd-selector" . -}} , resource=\"memory\"})",
890 | "hide": false,
891 | "instant": false,
892 | "legendFormat": "Limit",
893 | "range": true,
894 | "refId": "B"
895 | },
896 | {
897 | "datasource": {
898 | "type": "prometheus",
899 | "uid": "prometheus"
900 | },
901 | "editorMode": "code",
902 | "exemplar": false,
903 | "expr": "max(kube_pod_container_resource_requests{ {{- include "provisionerd-selector" . -}} , resource=\"memory\"})",
904 | "hide": false,
905 | "instant": false,
906 | "legendFormat": "Requested",
907 | "range": true,
908 | "refId": "C"
909 | }
910 | ],
911 | "title": "RAM Usage",
912 | "type": "timeseries"
913 | },
914 | {
915 | "datasource": {
916 | "type": "prometheus",
917 | "uid": "prometheus"
918 | },
919 | "description": "",
920 | "gridPos": {
921 | "h": 7,
922 | "w": 6,
923 | "x": 18,
924 | "y": 14
925 | },
926 | "id": 31,
927 | "options": {
928 | "code": {
929 | "language": "plaintext",
930 | "showLineNumbers": false,
931 | "showMiniMap": false
932 | },
933 | "content": "This shows the total memory used by each container; it is the same metric which the [OOM killer](https://www.kernel.org/doc/gorman/html/understand/understand016.html) uses.\n\nRequests & limits are shown if set.",
934 | "mode": "markdown"
935 | },
936 | "pluginVersion": "10.4.0",
937 | "transparent": true,
938 | "type": "text"
939 | },
940 | {
941 | "datasource": {
942 | "type": "loki",
943 | "uid": "loki"
944 | },
945 | "gridPos": {
946 | "h": 18,
947 | "w": 18,
948 | "x": 0,
949 | "y": 21
950 | },
951 | "id": 27,
952 | "options": {
953 | "dedupStrategy": "exact",
954 | "enableLogDetails": true,
955 | "prettifyLogMessage": false,
956 | "showCommonLabels": false,
957 | "showLabels": false,
958 | "showTime": true,
959 | "sortOrder": "Descending",
960 | "wrapLogMessage": false
961 | },
962 | "targets": [
963 | {
964 | "datasource": {
965 | "type": "loki",
966 | "uid": "loki"
967 | },
968 | "editorMode": "code",
969 | "expr": "{ {{- include "non-workspace-selector" . -}}, logger=~\"(.*runner|terraform|provisioner.*)\"}",
970 | "queryType": "range",
971 | "refId": "A"
972 | }
973 | ],
974 | "title": "Logs",
975 | "type": "logs"
976 | },
977 | {
978 | "datasource": {
979 | "type": "prometheus",
980 | "uid": "prometheus"
981 | },
982 | "description": "",
983 | "gridPos": {
984 | "h": 7,
985 | "w": 6,
986 | "x": 18,
987 | "y": 21
988 | },
989 | "id": 32,
990 | "options": {
991 | "code": {
992 | "language": "plaintext",
993 | "showLineNumbers": false,
994 | "showMiniMap": false
995 | },
996 | "content": "This panel shows all logs across built-in and [external provisioners](https://coder.com/docs/v2/latest/admin/provisioners).",
997 | "mode": "markdown"
998 | },
999 | "pluginVersion": "10.4.0",
1000 | "transparent": true,
1001 | "type": "text"
1002 | }
1003 | ],
1004 | "refresh": "{{- include "dashboard-refresh" . -}}",
1005 | "schemaVersion": 39,
1006 | "tags": [],
1007 | "templating": {
1008 | "list": []
1009 | },
1010 | "time": {
1011 | "from": "now-{{- include "dashboard-range" . -}}",
1012 | "to": "now"
1013 | },
1014 | "timepicker": {},
1015 | "timezone": "browser",
1016 | "title": "Provisioners",
1017 | "uid": "provisionerd",
1018 | "version": 10,
1019 | "weekStart": ""
1020 | }
1021 | {{ end }}
--------------------------------------------------------------------------------
/coder-observability/templates/dashboards/configmap-dashboards-coderd.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap  # carries the coderd dashboard JSON as a single data entry
3 | metadata:
4 | name: dashboards-coderd
5 | namespace: {{ .Release.Namespace }}  # created in the Helm release namespace
6 | data:  # the entry below is the "coderd-dashboard.json" named template, trimmed and re-indented by 4 to sit inside the |- block scalar
7 | coderd.json: |- {{- include "coderd-dashboard.json" . | trim | nindent 4 }}
--------------------------------------------------------------------------------
/coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap  # carries the prebuilds dashboard JSON as a single data entry
3 | metadata:
4 | name: dashboards-prebuilds
5 | namespace: {{ .Release.Namespace }}  # created in the Helm release namespace
6 | data:  # the entry below is the "prebuilds-dashboard.json" named template, trimmed and re-indented by 4 to sit inside the |- block scalar
7 | prebuilds.json: |- {{- include "prebuilds-dashboard.json" . | trim | nindent 4 }}
--------------------------------------------------------------------------------
/coder-observability/templates/dashboards/configmap-dashboards-provisionerd.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap  # carries the provisionerd dashboard JSON as a single data entry
3 | metadata:
4 | name: dashboards-provisionerd
5 | namespace: {{ .Release.Namespace }}  # created in the Helm release namespace
6 | data:  # the entry below is the "provisionerd-dashboard.json" named template, trimmed and re-indented by 4 to sit inside the |- block scalar
7 | provisionerd.json: |- {{- include "provisionerd-dashboard.json" . | trim | nindent 4 }}
--------------------------------------------------------------------------------
/coder-observability/templates/dashboards/configmap-dashboards-status.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap  # carries the status dashboard JSON as a single data entry
3 | metadata:
4 | name: dashboards-status
5 | namespace: {{ .Release.Namespace }}  # created in the Helm release namespace
6 | data:  # the entry below is the "status-dashboard.json" named template, trimmed and re-indented by 4 to sit inside the |- block scalar
7 | status.json: |- {{- include "status-dashboard.json" . | trim | nindent 4 }}
--------------------------------------------------------------------------------
/coder-observability/templates/dashboards/configmap-dashboards-workspace_detail.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap  # carries the workspace-detail dashboard JSON as a single data entry
3 | metadata:
4 | name: dashboards-workspace-detail
5 | namespace: {{ .Release.Namespace }}  # created in the Helm release namespace
6 | data:  # the entry below is the "workspace-detail-dashboard.json" named template, trimmed and re-indented by 4; note the data key is spelled "workspaces-detail.json"
7 | workspaces-detail.json: |- {{- include "workspace-detail-dashboard.json" . | trim | nindent 4 }}
--------------------------------------------------------------------------------
/coder-observability/templates/dashboards/configmap-dashboards-workspaces.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap  # carries the workspaces dashboard JSON as a single data entry
3 | metadata:
4 | name: dashboards-workspaces
5 | namespace: {{ .Release.Namespace }}  # created in the Helm release namespace
6 | data:  # the entry below is the "workspaces-dashboard.json" named template, trimmed and re-indented by 4 to sit inside the |- block scalar
7 | workspaces.json: |- {{- include "workspaces-dashboard.json" . | trim | nindent 4 }}
--------------------------------------------------------------------------------
/coder-observability/templates/service-runbook-viewer.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | # Service in front of the runbook-viewer pods (selected by app=runbook-viewer):
3 | # listens on port 80 and forwards to container port 3000.
4 | apiVersion: v1
5 | kind: Service
6 | metadata:
7 |   name: runbook-viewer
8 |   # Set the namespace explicitly, as the other templates in this chart do, so
9 |   # plainly rendered manifests place the Service next to its StatefulSet.
10 |   namespace: {{ .Release.Namespace }}
11 | spec:
12 |   ports:
13 |     - port: 80
14 |       targetPort: 3000
15 |       protocol: TCP
16 |   selector:
17 |     app: runbook-viewer
18 |
--------------------------------------------------------------------------------
/coder-observability/templates/statefulset-postgres-exporter.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: StatefulSet  # single-replica workload running the postgres-exporter image
4 | metadata:
5 | name: postgres-exporter
6 | namespace: {{ .Release.Namespace }}
7 | spec:
8 | selector:
9 | matchLabels:
10 | app: postgres-exporter
11 | serviceName: postgres-exporter
12 | replicas: 1
13 | template:
14 | metadata:
15 | annotations:
16 | prometheus.io/scrape: 'true'  # quoted so the annotation value is a string; values.yaml describes this annotation as the opt-in for automatic scraping
17 | labels:
18 | app: postgres-exporter
19 | app.kubernetes.io/name: "database-stats"  # same label value as the sql-exporter pods
20 | spec:
21 | containers:
22 | - name: postgres-exporter
23 | image: {{ .Values.global.postgres.exporter.image }}
24 | args:
25 | - --collector.long_running_transactions
26 | ports:
27 | - containerPort: 9187
28 | name: exporter
29 | env:  # NOTE(review): the "postgres-secret-mount" helper spliced in after this list is defined elsewhere; presumably it wires in the secret named by global.postgres.mountSecret - confirm in _helpers.tpl
30 | - name: DATA_SOURCE_NAME
31 | value: '{{ include "postgres-connector-string" . }}'  # connection string built by a helper defined outside this file
32 | {{ include "postgres-secret-mount" . | nindent 10 }}
33 |
34 | volumeMounts:  # rendered verbatim from global.postgres.volumeMounts
35 | {{ toYaml .Values.global.postgres.volumeMounts | nindent 12 }}
36 |
37 | volumes:  # rendered verbatim from global.postgres.volumes
38 | {{ toYaml .Values.global.postgres.volumes | nindent 8 }}
--------------------------------------------------------------------------------
/coder-observability/templates/statefulset-runbook-viewer.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: StatefulSet  # single-replica workload serving the chart's runbooks from a mounted ConfigMap
4 | metadata:
5 | name: runbook-viewer
6 | namespace: {{ .Release.Namespace }}
7 | spec:
8 | selector:
9 | matchLabels:
10 | app: runbook-viewer
11 | serviceName: runbook-viewer
12 | replicas: 1
13 | template:
14 | metadata:
15 | annotations:
16 | checksum/config: {{ (.Files.Glob "runbooks/**").AsConfig | indent 2 | sha256sum }}  # hash of every file under runbooks/, so editing a runbook changes the pod template
17 | labels:
18 | app: runbook-viewer
19 | spec:
20 | containers:
21 | - name: madness
22 | image: {{ .Values.runbookViewer.image }}
23 | ports:
24 | - containerPort: 3000  # the runbook-viewer Service targets this port
25 | name: madness
26 | args:
27 | - server
28 | volumeMounts:
29 | - mountPath: /docs/  # the "runbooks" ConfigMap volume is mounted here
30 | name: runbooks
31 | volumes:
32 | - name: runbooks
33 | configMap:
34 | name: runbooks
35 |
--------------------------------------------------------------------------------
/coder-observability/templates/statefulset-sql-exporter.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: StatefulSet  # single-replica workload running the sql-exporter image
4 | metadata:
5 | name: sql-exporter
6 | namespace: {{ .Release.Namespace }}
7 | spec:
8 | selector:
9 | matchLabels:
10 | app: sql-exporter
11 | serviceName: sql-exporter
12 | replicas: 1
13 | template:
14 | metadata:
15 | annotations:
16 | prometheus.io/scrape: 'true'  # quoted so the annotation value is a string; values.yaml describes this annotation as the opt-in for automatic scraping
17 | checksum/config: {{ include (print $.Template.BasePath "/configmap-sql-exporter.yaml") . | sha256sum }}  # hash of the rendered sql-exporter ConfigMap template, so config changes alter the pod template
18 | labels:
19 | app: sql-exporter
20 | app.kubernetes.io/name: "database-stats"  # same label value as the postgres-exporter pods
21 | spec:
22 | containers:
23 | - name: sql-exporter
24 | image: {{ .Values.sqlExporter.image }}
25 | args:
26 | - -config.file=/cfg/config.yaml  # NOTE(review): expects the mounted ConfigMap to provide a config.yaml key - confirm in configmap-sql-exporter.yaml
27 | ports:
28 | - containerPort: 9399
29 | name: exporter
30 | volumeMounts:  # NOTE(review): the "postgres-secret-mount" helper spliced in after this list is defined elsewhere; presumably it wires in the secret named by global.postgres.mountSecret - confirm in _helpers.tpl
31 | - mountPath: /cfg/
32 | name: config
33 | {{ include "postgres-secret-mount" . | nindent 10 }}
34 | volumes:
35 | - name: config
36 | configMap:
37 | name: sql-exporter-config
38 |
--------------------------------------------------------------------------------
/coder-observability/values.yaml:
--------------------------------------------------------------------------------
1 | global:
2 | coder:
3 | # global.coder.scrapeMetrics -- use this to scrape metrics from a standalone (set of) coder deployment(s)
4 | # if using kubernetes, rather add an annotation "prometheus.io/scrape=true" and coder will get automatically scraped;
5 | # set this value to null and configure coderdSelector to target your coder pods
6 | scrapeMetrics: null
7 | # hostname: localhost
8 | # port: 2112
9 | # scrapeInterval: 15s
10 | # additionalLabels:
11 | # job: coder
12 | # global.coder.coderdSelector -- series selector for Prometheus/Loki to locate coderd pods.
13 | # ensure this uses backticks for quotes!
14 | coderdSelector: 'pod=~`coder.*`, pod!~`.*provisioner.*`'
15 | # global.coder.provisionerdSelector -- series selector for Prometheus/Loki to locate provisioner pods.
16 | # https://coder.com/docs/v2/latest/admin/provisioners
17 | # TODO: rename container label in provisioner helm chart to be "provisioner" not "coder"
18 | # ensure this uses backticks for quotes!
19 | provisionerdSelector: 'pod=~`coder-provisioner.*`'
20 | # global.coder.workspacesSelector -- series selector for Prometheus/Loki to locate workspace pods.
21 | workspacesSelector: 'namespace=`coder-workspaces`'
22 | # global.coder.controlPlaneNamespace -- the namespace into which the control plane has been deployed.
23 | controlPlaneNamespace: coder
24 | # global.coder.externalProvisionersNamespace -- the namespace into which any external provisioners have been deployed.
25 | externalProvisionersNamespace: coder
26 | # See https://coder.com/docs/v2/latest/cli/server#--log-human
27 | # "Human" format is the default, which is a combination of plaintext and logfmt, but it's quite tricky to parse reliably
28 | # with regex matchers.
29 | # TODO: support "json" format
30 | logFormat: human
31 | # global.coder.alerts -- alerts for the various aspects of Coder
32 | alerts:
33 | enterprise:
34 | groups:
35 | Licences:
36 | enabled: true
37 | delay: 1m
38 | thresholds:
39 | warning: 0.9
40 | critical: 1
41 | coderd:
42 | groups:
43 | CPU:
44 | enabled: true
45 | delay: 10m
46 | period: 10m
47 | thresholds:
48 | warning: 0.8
49 | critical: 0.9
50 | Memory:
51 | enabled: true
52 | delay: 10m
53 | thresholds:
54 | warning: 0.8
55 | critical: 0.9
56 | Restarts:
57 | enabled: true
58 | delay: 1m
59 | period: 10m
60 | thresholds:
61 | notify: 1
62 | warning: 2
63 | critical: 3
64 | Replicas:
65 | enabled: true
66 | delay: 5m
67 | thresholds:
68 | notify: 3 # 2/3 replicas are alive
69 | warning: 2 # 1/3 replicas are alive
70 | critical: 1 # 0/3 replicas are alive
71 | WorkspaceBuildFailures:
72 | enabled: true
73 | delay: 10m
74 | period: 10m
75 | thresholds:
76 | notify: 2
77 | warning: 5
78 | critical: 10
79 | IneligiblePrebuilds:
80 | enabled: true
81 | delay: 10m
82 | thresholds:
83 | notify: 1
84 | UnprovisionedPrebuiltWorkspaces:
85 | enabled: true
86 | delay: 10m
87 | thresholds:
88 | warn: 1
89 | provisionerd:
90 | groups:
91 | Replicas:
92 | enabled: true
93 | delay: 5m
94 | thresholds:
95 | notify: 3 # 2/3 replicas are alive
96 | warning: 2 # 1/3 replicas are alive
97 | critical: 1 # 0/3 replicas are alive
98 |
99 | zone: svc
100 |
101 | externalScheme: http
102 | # The external hostname from which k8s services can be accessed in the form of:
103 | # <externalScheme>://<service>.<namespace>.<externalZone>
104 | # e.g.
105 | # http://dashboards.coder-observability.svc.cluster.local
106 | externalZone: svc.cluster.local
107 |
108 | # global.telemetry -- control telemetry collection
109 | telemetry:
110 | # global.telemetry.metrics -- control metric collection
111 | metrics:
112 | # global.telemetry.metrics.scrape_interval -- how often the collector will scrape discovered pods
113 | scrape_interval: 15s
114 | # global.telemetry.metrics.scrape_timeout -- how long a request will be allowed to wait before being canceled
115 | scrape_timeout: 12s
116 |
117 | # global.postgres -- postgres connection information
118 | # NOTE: these settings are global so we can parameterise some values which get rendered by subcharts
119 | postgres:
120 | hostname: localhost
121 | port: 5432
122 | username: coder
123 | password:
124 | database: coder
125 | sslmode: disable
126 | # add root cert path if using SSL
127 | sslrootcert: /home/coder/.postgresql/rootcert.pem
128 |
129 | # ensure that your secret has a field named `PGPASSWORD`
130 | mountSecret: "secret-postgres"
131 | exporter:
132 | image: "quay.io/prometheuscommunity/postgres-exporter"
133 |
134 | volumes:
135 | - name: "pg-certs-mount"
136 | configMap:
137 | name: "pg-certs-mount-config-map"
138 |
139 | volumeMounts:
140 | - name: "pg-certs-mount"
141 | mountPath: "/home/coder/.postgresql"
142 | readOnly: true
143 |
144 | # global.postgres.alerts -- alerts for postgres
145 | alerts:
146 | groups:
147 | Basic:
148 | enabled: true
149 | delay: 1m
150 | Notifications:
151 | enabled: true
152 | delay: 15m
153 | thresholds:
154 | notify: 0.5
155 | warning: 0.8
156 | critical: 0.9
157 | Connections:
158 | enabled: true
159 | delay: 5m
160 | thresholds:
161 | notify: 0.5
162 | warning: 0.8
163 | critical: 0.9
164 |
165 | # global.dashboards -- settings for bundled dashboards
166 | dashboards:
167 | # global.dashboards.timerange -- how far back dashboards should look
168 | timerange: 12h
169 | # global.dashboards.refresh -- how often dashboards should refresh
170 | refresh: 30s
171 | # global.dashboards.queryTimeout -- how long (in seconds) a Grafana query may run before it times out
172 | queryTimeout: 900
173 |
174 | runbookViewer:
175 | image: "dannyben/madness"
176 |
177 | sqlExporter:
178 | image: "burningalchemist/sql_exporter"
179 |
180 | grafana-agent:
181 | enabled: true
182 | fullnameOverride: grafana-agent
183 | agent:
184 | mode: flow
185 | configMap:
186 | name: collector-config
187 | key: config.river
188 | create: false
189 | clustering:
190 | enabled: false
191 | extraArgs:
192 | - --disable-reporting=true
193 | mounts:
194 | varlog: true
195 | dockercontainers: true
196 | controller:
197 | type: daemonset
198 | podAnnotations:
199 | prometheus.io/scrape: "true"
200 | crds:
201 | create: false
202 |
203 | withOTLPReceiver: false
204 |
205 | # Configuration blocks
206 | #
207 | # Enable debug logging (warning: produces large amount of logs!)
208 | #logging: |-
209 | # logging {
210 | # level = "debug"
211 | # format = "logfmt"
212 | # }
213 | discovery: |-
214 | // Discover k8s nodes
215 | discovery.kubernetes "nodes" {
216 | role = "node"
217 | }
218 |
219 | // Discover k8s pods
220 | discovery.kubernetes "pods" {
221 | role = "pod"
222 | selectors {
223 | role = "pod"
224 | }
225 | }
226 | commonRelabellings: |-
227 | rule {
228 | source_labels = ["__meta_kubernetes_namespace"]
229 | target_label = "namespace"
230 | }
231 | rule {
232 | source_labels = ["__meta_kubernetes_pod_name"]
233 | target_label = "pod"
234 | }
235 | // coalesce the following labels and pick the first value; we'll use this to define the "job" label
236 | rule {
237 | source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_component", "app", "__meta_kubernetes_pod_container_name"]
238 | separator = "/"
239 | target_label = "__meta_app"
240 | action = "replace"
241 | regex = "^/*([^/]+?)(?:/.*)?$" // split by the delimiter if it exists, we only want the first one
242 | replacement = "${1}"
243 | }
244 | rule {
245 | source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_app"]
246 | separator = "/"
247 | target_label = "job"
248 | }
249 | rule {
250 | source_labels = ["__meta_kubernetes_pod_container_name"]
251 | target_label = "container"
252 | }
253 | rule {
254 | regex = "__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)"
255 | action = "labeldrop"
256 | }
257 | rule {
258 | regex = "pod_template_generation"
259 | action = "labeldrop"
260 | }
261 | rule {
262 | source_labels = ["__meta_kubernetes_pod_phase"]
263 | regex = "Pending|Succeeded|Failed|Completed"
264 | action = "drop"
265 | }
266 | rule {
267 | source_labels = ["__meta_kubernetes_pod_node_name"]
268 | action = "replace"
269 | target_label = "node"
270 | }
271 | rule {
272 | action = "labelmap"
273 | regex = "__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)"
274 | replacement = "__param_$1"
275 | }
276 | extraBlocks: ""
277 | # Examples:
278 | # loki.source.file "tmpfiles" {
279 | # targets = [
280 | # {__path__ = "/tmp/foo.txt", "color" = "pink"},
281 | # {__path__ = "/tmp/bar.txt", "color" = "blue"},
282 | # {__path__ = "/tmp/baz.txt", "color" = "grey"},
283 | # ]
284 | # forward_to = [loki.write.loki.receiver]
285 | # }
286 | podMetricsRelabelRules: ""
287 | podLogsRelabelRules: ""
288 |
# Grafana settings: persistence, dashboard providers, gnet dashboards,
# provisioned datasources, grafana.ini overrides and the ConfigMap-backed
# Coder dashboards.
# NOTE(review): presumably consumed by the Grafana Helm subchart - confirm
# against Chart.yaml.
grafana:
  enabled: true
  image:
    # quoted so the version is always read as a string
    tag: "10.4.19"
  fullnameOverride: grafana
  useStatefulSet: true
  replicas: 1
  deploymentStrategy:
    type: Recreate  # avoid MultiAttachError for standard-rwo sc
  service:
    enabled: true
  persistence:
    enabled: true
    size: 10Gi
  testFramework:
    enabled: false
  annotations:
    # TODO: this adds annotations to _all_ resources; can we be more specific?
    prometheus.io/scrape: "true"
  # One file-based provider per dashboard folder; each provider reads the
  # directory given in options.path.
  dashboardProviders:
    infra.yaml:
      apiVersion: 1
      providers:
        - name: infra
          orgId: 1
          folder: 'Infrastructure'
          type: file
          disableDeletion: false
          editable: false
          options:
            path: /var/lib/grafana/dashboards/infra
    coder.yaml:
      apiVersion: 1
      providers:
        - name: coder
          orgId: 1
          folder: 'Coder'
          type: file
          updateIntervalSeconds: 5
          disableDeletion: false
          editable: false
          options:
            path: /var/lib/grafana/dashboards/coder
    sidecar.yaml:
      apiVersion: 1
      providers:
        - name: sidecar
          orgId: 1
          type: file
          folder: 'Other'
          disableDeletion: false
          updateIntervalSeconds: 30
          editable: false
          options:
            path: /tmp/dashboards
  dashboards:
    # TODO: import dashboards from coder/coder
    infra:
      node-exporter-full:
        gnetId: 1860
        revision: 36
        datasource: metrics
      postgres-database:
        gnetId: 9628
        revision: 7
        datasource: metrics
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        - name: metrics
          type: prometheus
          url: http://prometheus.{{ .Release.Namespace }}.{{ $.Values.global.zone }}
          access: proxy
          isDefault: true
          editable: false
          # add 5s on global timeout to distinguish between Grafana timeout & datasource timeout
          timeout: '{{ add $.Values.global.dashboards.queryTimeout 5 }}'
          uid: prometheus
        - name: logs
          type: loki
          url: http://loki-gateway.{{ .Release.Namespace }}.{{ $.Values.global.zone }}
          access: proxy
          isDefault: false
          editable: false
          # add 5s on global timeout to distinguish between Grafana timeout & datasource timeout
          timeout: '{{ add $.Values.global.dashboards.queryTimeout 5 }}'
          uid: loki
        - name: postgres
          type: postgres
          url: '{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}'
          user: '{{ .Values.global.postgres.username }}'
          secureJsonData:
            # uses the configured password when set, otherwise the literal $PGPASSWORD placeholder
            password: '{{ if .Values.global.postgres.password }}{{ .Values.global.postgres.password }}{{ else }}$PGPASSWORD{{ end }}'
          jsonData:
            sslmode: '{{ .Values.global.postgres.sslmode }}'
          isDefault: false
          editable: false
          # add 5s on global timeout to distinguish between Grafana timeout & datasource timeout
          timeout: '{{ add $.Values.global.dashboards.queryTimeout 5 }}'
          uid: postgres
  admin:
    existingSecret: ""
  env:
    # environment variable values are strings; quoted to avoid implicit boolean typing
    GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION: "true"
  grafana.ini:
    # NOTE(review): anonymous visitors are granted the Admin role of "Main Org." -
    # confirm this is acceptable for the environments this chart targets.
    auth.anonymous:
      enabled: true
      org_name: Main Org.
      org_role: Admin
    analytics:
      reporting_enabled: false
    users:
      allow_sign_up: false
    feature_toggles:
      # migrate Angular panels to React
      # see https://grafana.com/docs/grafana/latest/developers/angular_deprecation/angular-plugins/#automatic-migration-of-plugins
      autoMigrateOldPanels: true
    dashboards:
      # mounted configmap will be synced with sidecar
      default_home_dashboard_path: /var/lib/grafana/dashboards/coder/0/status.json
    dataproxy:
      timeout: '{{ $.Values.global.dashboards.queryTimeout }}'
  sidecar:
    dashboards:
      provider:
        disableDelete: true
        allowUiUpdates: true
      enabled: false
      labelValue: "1"
  extraConfigmapMounts:
    # we can't combine configmaps because of the 1MiB size limit, but Grafana will scan
    # the /var/lib/grafana/dashboards/coder directory deeply to find dashboards
    - name: dashboards-status
      mountPath: /var/lib/grafana/dashboards/coder/0
      configMap: dashboards-status
      readOnly: false
    - name: dashboards-coderd
      mountPath: /var/lib/grafana/dashboards/coder/1
      configMap: dashboards-coderd
      readOnly: false
    - name: dashboards-provisionerd
      mountPath: /var/lib/grafana/dashboards/coder/2
      configMap: dashboards-provisionerd
      readOnly: false
    - name: dashboards-workspaces
      mountPath: /var/lib/grafana/dashboards/coder/3
      configMap: dashboards-workspaces
      readOnly: false
    - name: dashboards-workspace-detail
      mountPath: /var/lib/grafana/dashboards/coder/4
      configMap: dashboards-workspace-detail
      readOnly: false
    - name: dashboards-prebuilds
      mountPath: /var/lib/grafana/dashboards/coder/5
      configMap: dashboards-prebuilds
      readOnly: false
446 |
# Prometheus settings: a single-replica StatefulSet server with persistent
# storage, alert rules mounted from the metrics-alerts ConfigMap, plus the
# bundled alertmanager, kube-state-metrics and node-exporter components.
# NOTE(review): presumably consumed by the prometheus-community Helm subchart -
# confirm against Chart.yaml.
prometheus:
  enabled: true
  server:
    fullnameOverride: prometheus
    podAnnotations:
      prometheus.io/scrape: "true"

    global:
      # prometheus.server.evaluation_interval -- how often to evaluate recording & alerting rule groups
      evaluation_interval: 30s

    extraArgs:
      log.level: debug

    replicaCount: 1
    statefulSet:
      enabled: true

    retentionSize: 10GB
    persistentVolume:
      enabled: true
      # Note: allowing +2GB breathing room above storage.tsdb.retention.size
      size: 12Gi
    service:
      type: ClusterIP
    extraFlags:
      - web.enable-lifecycle
      - enable-feature=remote-write-receiver
    extraConfigmapMounts:
      - name: alerts
        mountPath: /etc/config/alerts
        configMap: metrics-alerts
        # camelCase `readOnly`, matching the Kubernetes volumeMount field name
        readOnly: true

  serverFiles:
    prometheus.yml:
      # disables scraping of metrics by the Prometheus helm chart since this is managed by the collector
      scrape_configs: []
      # use custom rule files to be able to render templates (can't do that in values.yaml, unless that value is evaluated by a tpl call)
      rule_files:
        - /etc/config/alerts/*.yaml

  testFramework:
    enabled: false

  # enable metric collection from configmap reloader
  configmapReload:
    prometheus:
      extraArgs:
        log-level: all
        watch-interval: 15s
      containerPort: 9091
      extraConfigmapMounts:
        - name: alerts
          mountPath: /etc/config/alerts
          configMap: metrics-alerts
          # camelCase `readOnly`, matching the Kubernetes volumeMount field name
          readOnly: true

  alertmanager:
    fullnameOverride: alertmanager
    enabled: true
    service:
      port: 80
    podAnnotations:
      prometheus.io/scrape: "true"
  kube-state-metrics:
    fullnameOverride: kube-state-metrics
    enabled: true
    podAnnotations:
      prometheus.io/scrape: "true"
  prometheus-node-exporter:
    fullnameOverride: node-exporter
    enabled: true
    podAnnotations:
      prometheus.io/scrape: "true"

  # Disable push gateway
  prometheus-pushgateway:
    enabled: false
526 |
# Loki settings: single-replica read/write/backend components, MinIO-backed
# object storage, and a ruler that remote-writes to Prometheus and sends alerts
# to Alertmanager.
# NOTE(review): presumably consumed by the Grafana Loki Helm subchart - confirm
# against Chart.yaml.
loki:
  enabled: true
  nameOverride: loki
  fullnameOverride: loki

  enterprise:
    enabled: false
    adminApi:
      enabled: false
    useExternalLicense: false

  test:
    canaryServiceAddress: "http://loki-canary:3500/metrics"
    enabled: true

  minio:
    enabled: true
    fullnameOverride: loki-storage
    address: loki-storage.{{ .Release.Namespace }}.{{ .Values.global.zone}}:9000
    podAnnotations:
      prometheus.io/scrape: "true"
      prometheus.io/path: "/minio/v2/metrics/cluster"
    podLabels:
      app.kubernetes.io/name: "loki-storage"

  loki:
    auth_enabled: false
    commonConfig:
      path_prefix: /var/loki
      replication_factor: 1
    schemaConfig:
      configs:
        # quoted so the date stays a string instead of being typed as a timestamp
        - from: "2024-04-01"
          store: tsdb
          object_store: s3
          schema: v13
          index:
            prefix: index_
            period: 24h

    rulerConfig:
      remote_write:
        enabled: true
        clients:
          # "fake" is the default username when auth is disabled (unfortunate, I know)
          fake:
            url: http://prometheus.{{ .Release.Namespace }}.{{ .Values.global.zone}}/api/v1/write
            headers:
              Source: Loki
            remote_timeout: 30s
      wal:
        # same path as the ruler-wal volume mounted on the backend below
        dir: /var/loki-ruler-wal
      alertmanager_url: http://alertmanager.{{ .Release.Namespace }}.{{ .Values.global.zone}}
      enable_api: true
      ring:
        kvstore:
          store: inmemory
      enable_alertmanager_v2: true
      storage:
        type: local
        local:
          directory: /rules
      rule_path: /rules

  lokiCanary:
    enabled: true
    annotations:
      prometheus.io/scrape: "true"

  chunksCache:
    allocatedMemory: 1024
  resultsCache:
    allocatedMemory: 1024

  # disables scraping of logs by the Loki helm chart since this is managed by the collector
  monitoring:
    selfMonitoring:
      enabled: false
      grafanaAgent:
        installOperator: false
    # creates ConfigMaps of dashboards which are discovered via labels
    dashboards:
      enabled: true

  sidecar:
    rules:
      logLevel: DEBUG
      folder: /rules/fake

  gateway:
    replicas: 1
  write:
    podAnnotations:
      prometheus.io/scrape: "true"
    replicas: 1
    extraArgs:
      - -log.level=debug
  read:
    podAnnotations:
      prometheus.io/scrape: "true"
    replicas: 1
  backend:
    podAnnotations:
      prometheus.io/scrape: "true"
    replicas: 1
    extraVolumes:
      - name: ruler-wal
        emptyDir: {}
    extraVolumeMounts:
      - name: ruler-wal
        mountPath: /var/loki-ruler-wal
    extraArgs:
      - -log.level=debug
--------------------------------------------------------------------------------
/scripts/check-unstaged.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Exits non-zero when the working tree has modified or untracked files.
# Strict mode, as in the other scripts in this directory, so a failed `source`
# stops the script at the real cause.
set -euo pipefail

# check_unstaged is provided by lib.sh, located next to this script.
source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"

check_unstaged
--------------------------------------------------------------------------------
/scripts/compile.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
set -euo pipefail

# Tool versions this script expects; abort early on any mismatch.
HELM_VERSION=3.17
YQ_VERSION=4.42

if [[ "$(helm version)" != *v${HELM_VERSION}* ]]; then
  echo "Expected helm version v${HELM_VERSION} but got $(helm version)" >&2
  exit 1
fi

if [[ "$(yq --version)" != *v${YQ_VERSION}* ]]; then
  echo "Expected yq version v${YQ_VERSION} but got $(yq --version)" >&2
  exit 1
fi

# Provides check_unstaged.
source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"

# Register the upstream chart repositories and refresh their indexes.
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add grafana https://grafana.github.io/helm-charts
helm --repository-cache /tmp/cache repo update

# Check for unexpected changes.
# Helm dependencies are versioned using ^ which accepts minor & patch changes:
#   e.g. ^1.2.3 is equivalent to >= 1.2.3 < 2.0.0
helm dependency update coder-observability/

# Version-related fields are *expected* to change in the rendered output, so they
# are deleted before writing the result; any other change to the manifests must
# fail the build to force a manual review.
helm template \
  --namespace coder-observability \
  -f coder-observability/values.yaml \
  coder-observability coder-observability/ |
  yq e 'del(.spec.template.spec.containers[].image, .metadata.labels."helm.sh/chart", .metadata.labels."app.kubernetes.io/version")' - \
    > compiled/resources.yaml

check_unstaged "compiled"
--------------------------------------------------------------------------------
/scripts/lib.sh:
--------------------------------------------------------------------------------
# check_unstaged [PATH...]
#
# Lists files under the given paths (default: the current directory) that git
# reports as modified or untracked, ignoring files covered by the standard
# exclude rules. When there are none the function returns normally; otherwise
# it prints the file list to stdout, each file's diff to stderr, and exits the
# shell with status 1.
function check_unstaged() {
  local listing file
  local -a files=()

  # "${@:-.}" keeps each path as a single word and falls back to "." when no
  # path is given; `--` stops git from parsing a path as an option.
  listing="$(git ls-files --other --modified --exclude-standard -- "${@:-.}")"
  if [[ -z "$listing" ]]; then
    return 0
  fi

  mapfile -t files <<<"$listing"

  echo
  echo "The following files contain unstaged changes:"
  echo
  for file in "${files[@]}"; do
    echo " - $file"
  done

  echo
  echo "These are the changes:"
  echo
  for file in "${files[@]}"; do
    git --no-pager diff -- "$file" 1>&2
  done

  echo
  echo >&2 "Unstaged changes, see above for details."
  exit 1
}
--------------------------------------------------------------------------------
/scripts/lint-rules.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Renders the Prometheus alerts ConfigMap with helm, writes every entry of its
# `.data` map to its own file and lints each file with pint.
set -euo pipefail

temp_dir="$(mktemp -d)"
# Remove the scratch directory on exit, whether linting succeeded or failed.
trap 'rm -rf "${temp_dir}"' EXIT

rules_file="${temp_dir}/rules.yaml"
helm template coder-o11y coder-observability -f coder-observability/values.yaml --show-only templates/configmap-prometheus-alerts.yaml > "${rules_file}"

# The keys come back as a single CSV row; split it into one key per line and
# read the lines into an array so keys are never word-split or glob-expanded.
mapfile -t keys < <(yq e '.data | keys' -o csv "${rules_file}" | tr ',' "\n")

for key in "${keys[@]}"; do
  # skip the empty entry produced when the ConfigMap has no data keys
  [[ -n "${key}" ]] || continue

  file="${temp_dir}/${key}"
  echo "=========================== [${file}] ==========================="

  yq e ".data[\"${key}\"]" "${rules_file}" > "${file}"
  go run github.com/cloudflare/pint/cmd/pint@latest -l DEBUG lint "${file}"
done
--------------------------------------------------------------------------------
/scripts/publish.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Packages the coder-observability chart at the version reported by version.sh,
# merges it into the repository index fetched from the bucket, and uploads the
# results.
set -euox pipefail

version="$("$(dirname "${BASH_SOURCE[0]}")/version.sh")"
if [[ -z "${version}" ]]; then
  echo "Error: version.sh returned an empty version" >&2
  exit 1
fi

mkdir -p build/helm
helm package coder-observability --version="${version}" --dependency-update --destination build/helm
gsutil cp gs://helm.coder.com/observability/index.yaml build/helm/index.yaml
helm repo index build/helm --url https://helm.coder.com/observability --merge build/helm/index.yaml

# Upload the chart archive before the index so the published index never
# references an archive that is not in the bucket yet.
gsutil -h "Cache-Control:no-cache,max-age=0" cp "build/helm/coder-observability-${version}.tgz" gs://helm.coder.com/observability/
gsutil -h "Cache-Control:no-cache,max-age=0" cp build/helm/index.yaml gs://helm.coder.com/observability/
gsutil -h "Cache-Control:no-cache,max-age=0" cp artifacthub-repo.yaml gs://helm.coder.com/observability/

echo "${version}"
--------------------------------------------------------------------------------
/scripts/version.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # This script generates the version string used by the helm chart, including for
4 | # dev versions. Note: the version returned by this script will NOT include the "v"
5 | # prefix that is included in the Git tag.
6 | # The script can also bump the version based on the given argument (major, minor, patch).
7 |
set -euo pipefail

# Highest tag according to version ordering; still carries the tag's "v" prefix.
# Empty when the repository has no tags.
current_version="$(git tag -l | sort --version-sort | tail -n1)"
12 |
# Prints the usage summary to stdout and exits with status 0.
function help() {
  printf '%s\n' \
    "$0 [options] [arguments]" \
    " " \
    "options:" \
    "-h, --help show brief help" \
    "-c, --current show the current version" \
    "-b, --bump bump the version based on the given argument"
  exit 0
}
22 |
# bump_version <major|minor|patch>
#
# Prints $current_version with the requested component incremented and the
# lower components reset to 0. An optional leading "v" on $current_version is
# preserved in the output. Anything after the third numeric component is
# dropped.
#
# Exits with status 1 (message on stderr) when the argument is not one of
# major/minor/patch, or when $current_version does not start with
# [v]MAJOR.MINOR.PATCH.
function bump_version() {
  local version=$1
  local new_version
  local prefix major minor patch

  case "$version" in
    major | minor | patch) ;;
    *)
      echo "Error: Unknown argument $version" >&2
      exit 1
      ;;
  esac

  if [[ ! $current_version =~ ^(v?)([0-9]+)\.([0-9]+)\.([0-9]+) ]]; then
    echo "Error: Cannot parse current version '${current_version}'" >&2
    exit 1
  fi

  prefix="${BASH_REMATCH[1]}"
  # 10# forces base 10 so a component such as "08" is not read as octal.
  major=$((10#${BASH_REMATCH[2]}))
  minor=$((10#${BASH_REMATCH[3]}))
  patch=$((10#${BASH_REMATCH[4]}))

  case "$version" in
    major) new_version="${prefix}$((major + 1)).0.0" ;;
    minor) new_version="${prefix}${major}.$((minor + 1)).0" ;;
    patch) new_version="${prefix}${major}.${minor}.$((patch + 1))" ;;
  esac

  echo "$new_version"
}
40 |
# Prints $current_version with one leading "v" removed, if present.
function show_current() {
  local without_prefix="${current_version#v}"
  echo "${without_prefix}"
}
45 |
# With no arguments, print the current version.
if [[ $# -eq 0 ]]; then
  show_current
fi

# Process options left to right. Error messages go to stderr so that callers
# capturing stdout only ever receive version strings.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h | --help)
      help
      ;;
    -c | --current)
      show_current
      shift
      ;;
    -b | --bump)
      if [[ $# -lt 2 ]]; then
        echo "Error: Missing argument for bump" >&2
        exit 1
      fi
      shift
      bump_version "$1"
      shift
      ;;
    *)
      echo "Error: Unknown argument $1" >&2
      exit 1
      ;;
  esac
done
74 |
--------------------------------------------------------------------------------