├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── cloudbuild.yaml
├── cloudprober
│   ├── Dockerfile
│   └── probe.cfg
├── grafana
│   ├── Dockerfile
│   ├── dashboard.yml
│   ├── datasource.yml
│   ├── grafana.ini
│   └── slo_dashboard.json
├── prometheus
│   ├── Dockerfile
│   ├── oc.rules.yml
│   ├── prometheus.yml
│   └── slos.rules.yml
├── server
│   ├── Dockerfile
│   ├── go.mod
│   ├── go.sum
│   └── server.go
└── terraform
    ├── .gitignore
    ├── k8s
    │   ├── 0-system-settings.yml
    │   ├── 1-prometheus.yml
    │   ├── 2-server.yml
    │   ├── 3-grafana.yml
    │   └── 4-cloudprober.yml
    ├── main.tf
    ├── vars.tf
    └── versions.tf
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
25 | ## Community Guidelines
26 |
27 | This project follows [Google's Open Source Community
28 | Guidelines](https://opensource.google.com/conduct/).
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Prometheus SLO Burn
2 |
3 | This repository contains example code for exposing SLIs with open-source tooling and Prometheus.
4 |
5 | ## Build Images
6 |
7 | - `$ gcloud builds submit --project $GOOGLE_PROJECT` in the root directory.
8 | - These images are currently published and publicly available from the project
9 | `cre-prometheus-slo-alerting`.
10 |
11 | ## Terraform setup
12 |
13 | - Install Terraform
14 | - Set up terraform env (assumes you have a working gcloud install and a google project):
15 |
16 | ```
17 | $ [[ $CLOUD_SHELL ]] || gcloud auth application-default login
18 | $ export GOOGLE_PROJECT=$(gcloud config get-value project)
19 | $ export REGION=europe-west2
20 | ```
21 |
22 | - `$ cd terraform`
23 | - `$ terraform init` - installs terraform deps
24 | - `$ terraform apply -var "gcp_region=$REGION"` - Will ask you before it does
25 | anything. Will take ~10m to actually run. You can also run `terraform plan`
26 | to just get a dry run output.
27 | - `$ gcloud container clusters get-credentials example --region $REGION
28 | --project $GOOGLE_PROJECT` - Configures `kubectl` to work with the cluster
29 | you just created.
30 | - `$ kubectl create clusterrolebinding $USER-cluster-admin-binding
31 | --clusterrole=cluster-admin --user=$(gcloud config get-value account
32 | --project $GOOGLE_PROJECT)` - Gives your user permissions to create cluster
33 | role bindings that prometheus needs.
34 | - `$ kubectl apply -f ./k8s`
35 |
36 | ## Teardown
37 |
38 | - `$ cd terraform; terraform destroy -var "gcp_region=$REGION"`
39 |
40 | ## Running Locally
41 |
42 | - Start kubernetes (see
43 | https://kubernetes.io/docs/setup/pick-right-solution/#local-machine-solutions
44 | ).
45 | - Run `$ kubectl config current-context` to make sure you are in the correct
46 | context.
47 | - `$ cd terraform`
48 | - `$ kubectl apply -f ./k8s`
49 | - Run `$ kubectl get services --namespace=monitoring`; you will see something like:
50 |
51 | ```
52 | $ kubectl get services --namespace=monitoring
53 | NAME            TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)          AGE
54 | cloudprober     NodePort    10.104.187.119   <none>        8080:31589/TCP   21m
55 | grafana         NodePort    10.104.206.150   <none>        8080:30431/TCP   21m
56 | node-exporter   ClusterIP   None             <none>        9100/TCP         21m
57 | prometheus      NodePort    10.101.58.210    <none>        9090:31517/TCP   21m
58 | server          NodePort    10.111.115.243   <none>        8080:31796/TCP   21m
59 | ```
60 |
61 | In the `PORT(S)` column the number after the colon is the node port, so in this
62 | example you can visit http://localhost:30431 to see the Grafana dashboard.
63 |
--------------------------------------------------------------------------------
/cloudbuild.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | steps:  # one docker build per component directory, each tagged under the building project
15 | - name: 'gcr.io/cloud-builders/docker'
16 |   args: ['build', '-t', 'gcr.io/$PROJECT_ID/prometheus', './prometheus']
17 |   timeout: 500s
18 | - name: 'gcr.io/cloud-builders/docker'
19 |   args: ['build', '-t', 'gcr.io/$PROJECT_ID/grafana', './grafana']
20 |   timeout: 500s
21 | - name: 'gcr.io/cloud-builders/docker'
22 |   args: ['build', '-t', 'gcr.io/$PROJECT_ID/cloudprober', './cloudprober']
23 |   timeout: 500s
24 | - name: 'gcr.io/cloud-builders/docker'
25 |   args: ['build', '-t', 'gcr.io/$PROJECT_ID/example-server', './server']
26 |   timeout: 500s
27 | images:  # the four tags built above, listed so Cloud Build pushes them when the build succeeds
28 | - 'gcr.io/$PROJECT_ID/example-server'
29 | - 'gcr.io/$PROJECT_ID/grafana'
30 | - 'gcr.io/$PROJECT_ID/cloudprober'
31 | - 'gcr.io/$PROJECT_ID/prometheus'
32 |
--------------------------------------------------------------------------------
/cloudprober/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM cloudprober/cloudprober:latest
2 | # PORT is defined once and reused below for CLOUDPROBER_PORT and EXPOSE.
3 | ENV PORT=8080
4 | ENV CLOUDPROBER_PORT=$PORT
5 | EXPOSE $PORT
6 | # Ship the probe definition and make it world-readable. NOTE(review): /etc/cloudprober.cfg is presumably the path the base image reads by default - confirm.
7 | COPY probe.cfg /etc/cloudprober.cfg
8 | RUN chmod a+r /etc/cloudprober.cfg
9 |
--------------------------------------------------------------------------------
/cloudprober/probe.cfg:
--------------------------------------------------------------------------------
1 | probe {
2 |   name: "server"
3 |   type: HTTP
4 |   targets {
5 |     host_names: "server.monitoring.svc.cluster.local:8080"
6 |   }
7 |
8 |   interval_msec: 100  # one probe every 0.1 s
9 |   timeout_msec: 50    # per-probe timeout of 0.05 s
10 | }
11 |
--------------------------------------------------------------------------------
/grafana/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM grafana/grafana:6.0.1
2 |
3 | # Install the sqlite3 CLI as root; the last RUN in this file uses it to edit Grafana's database.
4 | USER root
5 | RUN apt-get update && apt-get install -y --no-install-recommends sqlite3 && rm -rf /var/lib/apt/lists/*
6 |
7 | USER grafana
8 | RUN mkdir -p /var/log/grafana /var/lib/grafana
9 | COPY --chown=grafana:grafana grafana.ini /etc/grafana/
10 | COPY --chown=grafana:grafana datasource.yml /etc/grafana/provisioning/datasources/
11 | COPY --chown=grafana:grafana dashboard.yml /etc/grafana/provisioning/dashboards/
12 | COPY --chown=grafana:grafana *.json /var/lib/grafana/dashboards/
13 |
14 | # Resetting the admin password (to a random throwaway value) is run here to make
15 | # sure the database exists. We then modify the database to set the default
16 | # dashboard, because user accounts and the API are turned off in our settings.
17 | RUN grafana-cli admin reset-admin-password $(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)
18 |
19 | # Set the anonymous user's home dashboard to dashboard id 1 (the "id" set in
20 | # slo_dashboard.json). The preferences table is undocumented; its layout is in
21 | # https://github.com/grafana/grafana/blob/v6.0.0/pkg/services/sqlstore/preferences.go#L29
22 | RUN echo "insert into preferences (org_id, user_id, version, home_dashboard_id, team_id, timezone, theme, created, updated) VALUES (1,0,0,1,0,'','', datetime('now'), datetime('now'));" | sqlite3 /var/lib/grafana/grafana.db
23 |
--------------------------------------------------------------------------------
/grafana/dashboard.yml:
--------------------------------------------------------------------------------
1 | apiVersion: 1
2 |
3 | providers:
4 | - name: 'default'
5 |   orgId: 1
6 |   folder: ''
7 |   type: file
8 |   disableDeletion: true
9 |   updateIntervalSeconds: 10 # how often Grafana will scan for changed dashboards
10 |   options:
11 |     path: /var/lib/grafana/dashboards  # directory the grafana Dockerfile copies *.json into
12 |
--------------------------------------------------------------------------------
/grafana/datasource.yml:
--------------------------------------------------------------------------------
1 | apiVersion: 1
2 | datasources:
3 | - name: Prometheus
4 |   type: prometheus
5 |   access: proxy  # queries are sent by the Grafana server rather than by the browser
6 |   url: http://prometheus.monitoring.svc.cluster.local:9090
7 |   editable: false
8 |   isDefault: true
9 |
--------------------------------------------------------------------------------
/grafana/grafana.ini:
--------------------------------------------------------------------------------
1 | instance_name = "Prometheus Example"
2 | # Anonymous visitors are allowed in and get the Viewer role.
3 | [auth.anonymous]
4 | enabled = true
5 | org_role = Viewer
6 | # Basic auth and the login form are both turned off below.
7 | [auth.basic]
8 | enabled = false
9 |
10 | [auth]
11 | disable_login_form = true
12 | # 8080 is the grafana service port shown in the README's example service listing.
13 | [server]
14 | http_port = 8080
15 | enable_gzip = true
16 |
17 | [log]
18 | mode = console
19 | # Sign-up, org creation, external snapshots and alerting are all disabled below.
20 | [users]
21 | allow_sign_up = false
22 | allow_org_create = false
23 |
24 | [snapshots]
25 | external_enabled = false
26 |
27 | [alerting]
28 | enabled = false
29 |
--------------------------------------------------------------------------------
/grafana/slo_dashboard.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": [
4 | {
5 | "builtIn": 1,
6 | "datasource": "-- Grafana --",
7 | "enable": true,
8 | "hide": true,
9 | "iconColor": "rgba(0, 211, 255, 1)",
10 | "name": "Annotations & Alerts",
11 | "type": "dashboard"
12 | }
13 | ]
14 | },
15 | "editable": false,
16 | "gnetId": null,
17 | "graphTooltip": 0,
18 | "links": [],
19 | "panels": [
20 | {
21 | "cacheTimeout": null,
22 | "columns": [],
23 | "fontSize": "100%",
24 | "gridPos": {
25 | "h": 5,
26 | "w": 24,
27 | "x": 0,
28 | "y": 0
29 | },
30 | "id": 18,
31 | "links": [],
32 | "pageSize": null,
33 | "scroll": true,
34 | "showHeader": true,
35 | "sort": {
36 | "col": 0,
37 | "desc": true
38 | },
39 | "styles": [],
40 | "targets": [
41 | {
42 | "expr": "ALERTS{alertstate=\"firing\"}",
43 | "format": "time_series",
44 | "intervalFactor": 1,
45 | "refId": "A"
46 | }
47 | ],
48 | "timeFrom": null,
49 | "timeShift": null,
50 | "title": "Alerts Firing",
51 | "transform": "timeseries_aggregations",
52 | "type": "table"
53 | },
54 | {
55 | "aliasColors": {},
56 | "bars": false,
57 | "dashLength": 10,
58 | "dashes": false,
59 | "fill": 0,
60 | "gridPos": {
61 | "h": 11,
62 | "w": 24,
63 | "x": 0,
64 | "y": 5
65 | },
66 | "id": 22,
67 | "legend": {
68 | "alignAsTable": true,
69 | "avg": false,
70 | "current": true,
71 | "max": false,
72 | "min": false,
73 | "rightSide": false,
74 | "show": true,
75 | "total": false,
76 | "values": true
77 | },
78 | "lines": true,
79 | "linewidth": 3,
80 | "links": [],
81 | "nullPointMode": "null",
82 | "paceLength": 10,
83 | "percentage": false,
84 | "pointradius": 2,
85 | "points": false,
86 | "renderer": "flot",
87 | "seriesOverrides": [],
88 | "stack": false,
89 | "steppedLine": false,
90 | "targets": [
91 | {
92 | "expr": "job:error_budget:slo",
93 | "format": "time_series",
94 | "intervalFactor": 1,
95 | "refId": "B"
96 | },
97 | {
98 | "expr": "job:error_budget:remaining",
99 | "format": "time_series",
100 | "intervalFactor": 1,
101 | "refId": "A"
102 | }
103 | ],
104 | "thresholds": [],
105 | "timeFrom": null,
106 | "timeRegions": [],
107 | "timeShift": null,
108 | "title": "28 Day Error Budget",
109 | "tooltip": {
110 | "shared": true,
111 | "sort": 0,
112 | "value_type": "individual"
113 | },
114 | "type": "graph",
115 | "xaxis": {
116 | "buckets": null,
117 | "mode": "time",
118 | "name": null,
119 | "show": true,
120 | "values": []
121 | },
122 | "yaxes": [
123 | {
124 | "format": "percent",
125 | "label": null,
126 | "logBase": 1,
127 | "max": "100",
128 | "min": null,
129 | "show": true
130 | },
131 | {
132 | "format": "short",
133 | "label": null,
134 | "logBase": 1,
135 | "max": null,
136 | "min": null,
137 | "show": false
138 | }
139 | ],
140 | "yaxis": {
141 | "align": false,
142 | "alignLevel": null
143 | }
144 | },
145 | {
146 | "aliasColors": {},
147 | "bars": false,
148 | "dashLength": 10,
149 | "dashes": false,
150 | "fill": 1,
151 | "gridPos": {
152 | "h": 8,
153 | "w": 12,
154 | "x": 0,
155 | "y": 16
156 | },
157 | "id": 20,
158 | "legend": {
159 | "avg": false,
160 | "current": false,
161 | "max": false,
162 | "min": false,
163 | "show": true,
164 | "total": false,
165 | "values": false
166 | },
167 | "lines": true,
168 | "linewidth": 1,
169 | "links": [],
170 | "nullPointMode": "null",
171 | "paceLength": 10,
172 | "percentage": false,
173 | "pointradius": 2,
174 | "points": false,
175 | "renderer": "flot",
176 | "seriesOverrides": [],
177 | "stack": false,
178 | "steppedLine": false,
179 | "targets": [
180 | {
181 | "expr": "avg(example_configured_error_ratio) by (k8s_service)",
182 | "format": "time_series",
183 | "intervalFactor": 1,
184 | "refId": "A"
185 | }
186 | ],
187 | "thresholds": [],
188 | "timeFrom": null,
189 | "timeRegions": [],
190 | "timeShift": null,
191 | "title": "Average Configured Error Ratio",
192 | "tooltip": {
193 | "shared": true,
194 | "sort": 0,
195 | "value_type": "individual"
196 | },
197 | "type": "graph",
198 | "xaxis": {
199 | "buckets": null,
200 | "mode": "time",
201 | "name": null,
202 | "show": true,
203 | "values": []
204 | },
205 | "yaxes": [
206 | {
207 | "format": "short",
208 | "label": null,
209 | "logBase": 1,
210 | "max": null,
211 | "min": "0",
212 | "show": true
213 | },
214 | {
215 | "format": "short",
216 | "label": null,
217 | "logBase": 1,
218 | "max": null,
219 | "min": null,
220 | "show": true
221 | }
222 | ],
223 | "yaxis": {
224 | "align": false,
225 | "alignLevel": null
226 | }
227 | },
228 | {
229 | "aliasColors": {},
230 | "bars": false,
231 | "dashLength": 10,
232 | "dashes": false,
233 | "description": "",
234 | "fill": 1,
235 | "gridPos": {
236 | "h": 8,
237 | "w": 12,
238 | "x": 12,
239 | "y": 16
240 | },
241 | "id": 16,
242 | "legend": {
243 | "avg": false,
244 | "current": false,
245 | "max": false,
246 | "min": false,
247 | "show": true,
248 | "total": false,
249 | "values": false
250 | },
251 | "lines": true,
252 | "linewidth": 1,
253 | "links": [],
254 | "nullPointMode": "null",
255 | "paceLength": 10,
256 | "percentage": false,
257 | "pointradius": 2,
258 | "points": false,
259 | "renderer": "flot",
260 | "seriesOverrides": [],
261 | "stack": false,
262 | "steppedLine": false,
263 | "targets": [
264 | {
265 | "expr": "avg by (probe) ((rate(total[5m]) - rate(success[5m])) / rate(total[5m]))\n",
266 | "format": "time_series",
267 | "intervalFactor": 1,
268 | "refId": "A"
269 | }
270 | ],
271 | "thresholds": [],
272 | "timeFrom": null,
273 | "timeRegions": [],
274 | "timeShift": null,
275 | "title": "Prober Error Ratio",
276 | "tooltip": {
277 | "shared": true,
278 | "sort": 0,
279 | "value_type": "individual"
280 | },
281 | "type": "graph",
282 | "xaxis": {
283 | "buckets": null,
284 | "mode": "time",
285 | "name": null,
286 | "show": true,
287 | "values": []
288 | },
289 | "yaxes": [
290 | {
291 | "format": "percentunit",
292 | "label": null,
293 | "logBase": 1,
294 | "max": "1.1",
295 | "min": "0",
296 | "show": true
297 | },
298 | {
299 | "format": "short",
300 | "label": null,
301 | "logBase": 1,
302 | "max": null,
303 | "min": null,
304 | "show": true
305 | }
306 | ],
307 | "yaxis": {
308 | "align": false,
309 | "alignLevel": null
310 | }
311 | },
312 | {
313 | "aliasColors": {},
314 | "bars": false,
315 | "dashLength": 10,
316 | "dashes": false,
317 | "fill": 1,
318 | "gridPos": {
319 | "h": 8,
320 | "w": 12,
321 | "x": 0,
322 | "y": 24
323 | },
324 | "id": 24,
325 | "legend": {
326 | "avg": false,
327 | "current": false,
328 | "max": false,
329 | "min": false,
330 | "show": true,
331 | "total": false,
332 | "values": false
333 | },
334 | "lines": true,
335 | "linewidth": 1,
336 | "links": [],
337 | "nullPointMode": "null",
338 | "paceLength": 10,
339 | "percentage": false,
340 | "pointradius": 2,
341 | "points": false,
342 | "renderer": "flot",
343 | "seriesOverrides": [],
344 | "stack": false,
345 | "steppedLine": false,
346 | "targets": [
347 | {
348 | "expr": "sum by (job,k8s_service) (rate(task:http_response_total_count[1m]))",
349 | "format": "time_series",
350 | "intervalFactor": 1,
351 | "refId": "A"
352 | }
353 | ],
354 | "thresholds": [],
355 | "timeFrom": null,
356 | "timeRegions": [],
357 | "timeShift": null,
358 | "title": "Example Server QPS",
359 | "tooltip": {
360 | "shared": true,
361 | "sort": 0,
362 | "value_type": "individual"
363 | },
364 | "type": "graph",
365 | "xaxis": {
366 | "buckets": null,
367 | "mode": "time",
368 | "name": null,
369 | "show": true,
370 | "values": []
371 | },
372 | "yaxes": [
373 | {
374 | "format": "reqps",
375 | "label": null,
376 | "logBase": 1,
377 | "max": null,
378 | "min": 0,
379 | "show": true
380 | },
381 | {
382 | "format": "short",
383 | "label": null,
384 | "logBase": 1,
385 | "max": null,
386 | "min": null,
387 | "show": true
388 | }
389 | ],
390 | "yaxis": {
391 | "align": false,
392 | "alignLevel": null
393 | }
394 | },
395 | {
396 | "cards": {
397 | "cardPadding": null,
398 | "cardRound": null
399 | },
400 | "color": {
401 | "cardColor": "#b4ff00",
402 | "colorScale": "sqrt",
403 | "colorScheme": "interpolateOranges",
404 | "exponent": 0.5,
405 | "mode": "spectrum"
406 | },
407 | "dataFormat": "timeseries",
408 | "gridPos": {
409 | "h": 8,
410 | "w": 12,
411 | "x": 12,
412 | "y": 24
413 | },
414 | "heatmap": {},
415 | "highlightCards": true,
416 | "id": 26,
417 | "legend": {
418 | "show": false
419 | },
420 | "links": [],
421 | "targets": [
422 | {
423 | "expr": "sum(increase(opencensus_io_http_server_latency_bucket{k8s_service=\"server\"}[1m])) by (le)",
424 | "format": "time_series",
425 | "intervalFactor": 1,
426 | "refId": "A"
427 | }
428 | ],
429 | "timeFrom": null,
430 | "timeShift": null,
431 | "title": "Example Server Request Latencies",
432 | "tooltip": {
433 | "show": true,
434 | "showHistogram": false
435 | },
436 | "type": "heatmap",
437 | "xAxis": {
438 | "show": true
439 | },
440 | "xBucketNumber": null,
441 | "xBucketSize": null,
442 | "yAxis": {
443 | "decimals": null,
444 | "format": "ms",
445 | "logBase": 1,
446 | "max": null,
447 | "min": null,
448 | "show": true,
449 | "splitFactor": null
450 | },
451 | "yBucketBound": "auto",
452 | "yBucketNumber": null,
453 | "yBucketSize": null
454 | }
455 | ],
456 | "refresh": "5m",
457 | "schemaVersion": 18,
458 | "style": "dark",
459 | "tags": [],
460 | "templating": {
461 | "list": []
462 | },
463 | "time": {
464 | "from": "now-6h",
465 | "to": "now"
466 | },
467 | "timepicker": {
468 | "refresh_intervals": [
469 | "5s",
470 | "10s",
471 | "30s",
472 | "1m",
473 | "5m",
474 | "15m",
475 | "30m",
476 | "1h",
477 | "2h",
478 | "1d"
479 | ],
480 | "time_options": [
481 | "5m",
482 | "15m",
483 | "1h",
484 | "6h",
485 | "12h",
486 | "24h",
487 | "2d",
488 | "7d",
489 | "30d"
490 | ]
491 | },
492 | "timezone": "",
493 | "title": "SLO Dashboard",
494 | "uid": "slosloslo",
495 | "id": 1,
496 | "version": 7
497 | }
498 |
--------------------------------------------------------------------------------
/prometheus/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM prom/prometheus
2 | # Copy every *.yml in this directory (scrape config and rule files) into the image.
3 | COPY --chown=nobody:nogroup *.yml /etc/prometheus/
4 |
5 | # Validate the rule files while the image is built, so a broken rule fails the build.
6 | RUN promtool check rules /etc/prometheus/*.rules.yml
7 |
--------------------------------------------------------------------------------
/prometheus/oc.rules.yml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: opencensus
3 |   rules:
4 |   - record: task:http_response_error_count  # responses whose status code is 5xx
5 |     expr: opencensus_io_http_server_response_count_by_status_code{http_status=~"5[0-9]{2}"}
6 |
7 |   - record: task:http_response_total_count  # responses with any three-digit status code
8 |     expr: opencensus_io_http_server_response_count_by_status_code{http_status=~"[0-9]{3}"}
9 |
--------------------------------------------------------------------------------
/prometheus/prometheus.yml:
--------------------------------------------------------------------------------
1 | global:
2 | scrape_interval: 30s
3 | scrape_timeout: 5s
4 | evaluation_interval: 30s
5 | scrape_configs:
6 | - job_name: kubernetes-apiservers
7 | kubernetes_sd_configs:
8 | - role: endpoints
9 | relabel_configs:
10 | - action: keep
11 | regex: default;kubernetes;https
12 | source_labels:
13 | - __meta_kubernetes_namespace
14 | - __meta_kubernetes_service_name
15 | - __meta_kubernetes_endpoint_port_name
16 | scheme: https
17 | tls_config:
18 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
19 | insecure_skip_verify: true
20 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
21 |
22 | - job_name: kubernetes-nodes-kubelet
23 | kubernetes_sd_configs:
24 | - role: node
25 | relabel_configs:
26 | - target_label: __address__
27 | regex: "(.+):10250"
28 | source_labels: [__address__]
29 | replacement: "${1}:10255"
30 |
31 | - job_name: kubernetes-nodes-cadvisor
32 | kubernetes_sd_configs:
33 | - role: node
34 | relabel_configs:
35 | - target_label: __metrics_path__
36 | replacement: /metrics/cadvisor
37 | - target_label: __address__
38 | regex: "(.+):10250"
39 | source_labels: [__address__]
40 | replacement: "${1}:10255"
41 |
42 | # Configuration for the port (prometheus.io/port) that service endpoints are
43 | # annotated with.
44 | - job_name: kubernetes-service-endpoints
45 | kubernetes_sd_configs:
46 | - role: endpoints
47 | relabel_configs:
48 | - action: keep
49 | regex: true
50 | source_labels:
51 | - __meta_kubernetes_service_annotation_prometheus_io_scrape
52 | - action: replace
53 | regex: (https?)
54 | source_labels:
55 | - __meta_kubernetes_service_annotation_prometheus_io_scheme
56 | target_label: __scheme__
57 | - action: replace
58 | regex: (.+)
59 | source_labels:
60 | - __meta_kubernetes_service_annotation_prometheus_io_path
61 | target_label: __metrics_path__
62 | - action: replace
63 | regex: ([^:]+)(?::\d+)?;(\d+)
64 | replacement: $1:$2
65 | source_labels:
66 | - __address__
67 | - __meta_kubernetes_service_annotation_prometheus_io_port
68 | target_label: __address__
69 | - action: replace
70 | source_labels:
71 | - __meta_kubernetes_namespace
72 | target_label: k8s_namespace
73 | - action: replace
74 | source_labels:
75 | - __meta_kubernetes_service_name
76 | target_label: k8s_service
77 | - action: replace
78 | source_labels:
79 | - __meta_kubernetes_pod_name
80 | target_label: k8s_pod
81 |
82 | rule_files:
83 | - "oc.rules.yml"
84 | - "slos.rules.yml"
85 |
--------------------------------------------------------------------------------
/prometheus/slos.rules.yml:
--------------------------------------------------------------------------------
1 | # This file is based on the work from
2 | # https://landing.google.com/sre/workbook/chapters/alerting-on-slos/
3 | groups:
4 | - name: slo_metrics
5 | rules:
6 | - record: job:slo_errors_per_request:ratio_rate5m
7 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[5m])) / sum by (job,k8s_service) (rate(task:http_response_total_count[5m]))
8 |
9 | - record: job:slo_errors_per_request:ratio_rate10m
10 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[10m])) / sum by (job,k8s_service) (rate(task:http_response_total_count[10m]))
11 |
12 | - record: job:slo_errors_per_request:ratio_rate30m
13 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[30m])) / sum by (job,k8s_service) (rate(task:http_response_total_count[30m]))
14 |
15 | - record: job:slo_errors_per_request:ratio_rate1h
16 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[1h])) / sum by (job,k8s_service) (rate(task:http_response_total_count[1h]))
17 |
18 | - record: job:slo_errors_per_request:ratio_rate2h
19 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[2h])) / sum by (job,k8s_service) (rate(task:http_response_total_count[2h]))
20 |
21 | - record: job:slo_errors_per_request:ratio_rate6h
22 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[6h])) / sum by (job,k8s_service) (rate(task:http_response_total_count[6h]))
23 |
24 | - record: job:slo_errors_per_request:ratio_rate24h
25 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[24h])) / sum by (job,k8s_service) (rate(task:http_response_total_count[24h]))
26 |
27 | - record: job:slo_errors_per_request:ratio_rate3d
28 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[3d])) / sum by (job,k8s_service) (rate(task:http_response_total_count[3d]))
29 |
30 | - record: job:slo_errors_per_request:ratio_rate28d
31 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[28d])) / sum by (job,k8s_service) (rate(task:http_response_total_count[28d]))
32 |
33 | - record: job:error_budget:remaining
34 | expr: (1 - job:slo_errors_per_request:ratio_rate28d) * 100
35 |
36 | - record: job:error_budget:slo
37 | expr: (1 - 0.05) * 100
38 |
39 | - name: slo_alerts
40 | rules:
41 | - alert: slo_page
42 | expr: |
43 | (job:slo_errors_per_request:ratio_rate1h > (14.4*0.05) and job:slo_errors_per_request:ratio_rate5m > (14.4*0.05))
44 | or
45 | (job:slo_errors_per_request:ratio_rate6h > (6*0.05) and job:slo_errors_per_request:ratio_rate30m > (6*0.05))
46 | labels:
47 | severity: page
48 |
49 | - alert: slo_ticket
50 | expr: |
51 | (job:slo_errors_per_request:ratio_rate24h > (3*0.05) and job:slo_errors_per_request:ratio_rate2h > (3*0.05))
52 | or
53 | (job:slo_errors_per_request:ratio_rate3d > 0.05 and job:slo_errors_per_request:ratio_rate6h > 0.05)
54 | labels:
55 | severity: ticket
56 |
--------------------------------------------------------------------------------
/server/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM golang:1.12-alpine as builder
2 | ENV GO111MODULE=on
3 | WORKDIR /go/src/example
4 | RUN apk add --no-cache git
5 | COPY . .
6 |
7 | RUN go get ./...
8 | RUN CGO_ENABLED=0 GOOS=linux go build -v -o /go/bin/helloworld
9 |
10 | ENV PORT 8080
11 | EXPOSE $PORT
12 |
13 | CMD ["/go/bin/helloworld"]
14 |
--------------------------------------------------------------------------------
/server/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/googlecloud/prometheus-example
2 |
3 | require (
4 | github.com/go-chi/chi v4.0.2+incompatible
5 | go.opencensus.io v0.19.0
6 | )
7 |
--------------------------------------------------------------------------------
/server/go.sum:
--------------------------------------------------------------------------------
1 | cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
2 | cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
3 | git.apache.org/thrift.git v0.0.0-20181218151757-9b75e4fe745a/go.mod h1:fPE2ZNJGynbRyZ4dJvy6G277gSllfV2HJqblrnkyeyg=
4 | github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973 h1:xJ4a3vCFaGF/jqvzLMYoU8P317H5OQ+Via4RmuPwCS0=
5 | github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
6 | github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
7 | github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
8 | github.com/go-chi/chi v4.0.2+incompatible h1:maB6vn6FqCxrpz4FqWdh4+lwpyZIQS7YEAUcHlgXVRs=
9 | github.com/go-chi/chi v4.0.2+incompatible/go.mod h1:eB3wogJHnLi3x/kFX2A+IbTBlXxmMeXJVKy9tTv1XzQ=
10 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
11 | github.com/golang/lint v0.0.0-20180702182130-06c8688daad7/go.mod h1:tluoj9z5200jBnyusfRPU2LqT6J+DAorxEvtC7LHB+E=
12 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
13 | github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
14 | github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=
15 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
16 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
17 | github.com/grpc-ecosystem/grpc-gateway v1.6.2/go.mod h1:RSKVYQBd5MCa4OVpNdGskqpgL2+G+NZTnrVHpWWfpdw=
18 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
19 | github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU=
20 | github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
21 | github.com/openzipkin/zipkin-go v0.1.3/go.mod h1:NtoC/o8u3JlF1lSlyPNswIbeQH9bJTmOf0Erfk+hxe8=
22 | github.com/prometheus/client_golang v0.9.2 h1:awm861/B8OKDd2I/6o1dy3ra4BamzKhYOiGItCeZ740=
23 | github.com/prometheus/client_golang v0.9.2/go.mod h1:OsXs2jCmiKlQ1lTBmv21f2mNfw4xf/QclQDMrYNZzcM=
24 | github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910 h1:idejC8f05m9MGOsuEi1ATq9shN03HrxNkD/luQvxCv8=
25 | github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
26 | github.com/prometheus/common v0.0.0-20181126121408-4724e9255275/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
27 | github.com/prometheus/common v0.0.0-20181218105931-67670fe90761 h1:z6tvbDJ5OLJ48FFmnksv04a78maSTRBUIhkdHYV5Y98=
28 | github.com/prometheus/common v0.0.0-20181218105931-67670fe90761/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
29 | github.com/prometheus/procfs v0.0.0-20181204211112-1dc9a6cbc91a h1:9a8MnZMP0X2nLJdBg+pBmGgkJlSaKC2KaQmTCk1XDtE=
30 | github.com/prometheus/procfs v0.0.0-20181204211112-1dc9a6cbc91a/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
31 | go.opencensus.io v0.19.0 h1:+jrnNy8MR4GZXvwF9PEuSyHxA4NaTf6601oNRwCSXq0=
32 | go.opencensus.io v0.19.0/go.mod h1:AYeH0+ZxYyghG8diqaaIq/9P3VgCCt5GF2ldCY4dkFg=
33 | golang.org/x/lint v0.0.0-20180702182130-06c8688daad7/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
34 | golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
35 | golang.org/x/lint v0.0.0-20181217174547-8f45f776aaf1/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
36 | golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
37 | golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
38 | golang.org/x/net v0.0.0-20181106065722-10aee1819953/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
39 | golang.org/x/net v0.0.0-20181201002055-351d144fa1fc/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
40 | golang.org/x/net v0.0.0-20181217023233-e147a9138326/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
41 | golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
42 | golang.org/x/oauth2 v0.0.0-20181203162652-d668ce993890/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
43 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
44 | golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
45 | golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
46 | golang.org/x/sys v0.0.0-20181218192612-074acd46bca6/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
47 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
48 | golang.org/x/tools v0.0.0-20180828015842-6cd1fcedba52/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
49 | golang.org/x/tools v0.0.0-20181219222714-6e267b5cc78e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
50 | google.golang.org/api v0.0.0-20181220000619-583d854617af/go.mod h1:4mhQ8q/RsB7i+udVvVy5NUi08OU8ZlA0gRVgrF7VFY0=
51 | google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
52 | google.golang.org/appengine v1.3.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
53 | google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
54 | google.golang.org/genproto v0.0.0-20181219182458-5a97ab628bfb/go.mod h1:7Ep/1NZk928CDR8SjdVbjWNpdIf6nzjE3BTgJDr2Atg=
55 | google.golang.org/grpc v1.16.0/go.mod h1:0JHn/cJsOMiMfNA9+DeHDlAU7KAAB5GDlYFpa9MZMio=
56 | google.golang.org/grpc v1.17.0/go.mod h1:6QZJwpn2B+Zp71q/5VxRsJ6NXXVCE5NRUHRo+f3cWCs=
57 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
58 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
59 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
60 | honnef.co/go/tools v0.0.0-20180728063816-88497007e858/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
61 | honnef.co/go/tools v0.0.0-20180920025451-e3ad64cb4ed3/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
62 |
--------------------------------------------------------------------------------
/server/server.go:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | //
16 | // This is a simple HTTP server which randomly responds with a 500 for a
17 | // configurable fraction of requests.
18 | package main
19 |
20 | import (
21 | "context"
22 | "encoding/json"
23 | "io/ioutil"
24 | "log"
25 | "math/rand"
26 | "net/http"
27 | "os"
28 | "path/filepath"
29 | "strconv"
30 |
31 | "github.com/go-chi/chi"
32 | "github.com/go-chi/chi/middleware"
33 | "go.opencensus.io/exporter/prometheus"
34 | "go.opencensus.io/plugin/ochttp"
35 | "go.opencensus.io/stats"
36 | "go.opencensus.io/stats/view"
37 | )
38 |
39 | var (
40 | errorRatio = stats.Float64("configured_error_ratio", "configured error ratio", stats.UnitDimensionless)
41 | errorRatioView = &view.View{
42 | Name: "example/configured_error_ratio",
43 | Measure: errorRatio,
44 | Description: "The current configured error ratio.",
45 | Aggregation: view.LastValue(),
46 | }
47 | )
48 |
49 | func init() {
50 | // Set a default error rate
51 | err := SetErrorRate(context.Background(), 0.001)
52 | if err != nil {
53 | log.Fatal(err.Error())
54 | }
55 | }
56 |
57 | func main() {
58 | port := "8080"
59 | if fromEnv := os.Getenv("PORT"); fromEnv != "" {
60 | port = fromEnv
61 | }
62 | log.Printf("Starting up on http://localhost:%s", port)
63 |
64 | pe, err := prometheus.NewExporter(prometheus.Options{})
65 | if err != nil {
66 | log.Fatalf("Failed to create Prometheus exporter: %v", err)
67 | }
68 | view.RegisterExporter(pe)
69 |
70 | err = view.Register(errorRatioView)
71 | if err != nil {
72 | log.Fatalf("Failed to register views: %v", err)
73 | }
74 |
75 | r := chi.NewRouter()
76 | r.Use(middleware.RequestID)
77 | r.Use(middleware.RealIP)
78 | r.Use(middleware.Recoverer)
79 |
80 | r.NotFound(func(w http.ResponseWriter, r *http.Request) {
81 | JSON(w, http.StatusNotFound, map[string]string{
82 | "error": "404: This page could not be found",
83 | })
84 | })
85 |
86 | r.Handle("/metrics", pe)
87 |
88 | r.Get("/healthz", func(w http.ResponseWriter, r *http.Request) {
89 | JSON(w, http.StatusOK, map[string]string{
90 | "healthy": "true",
91 | })
92 | })
93 |
94 | r.Get("/quitquitquit", func(w http.ResponseWriter, r *http.Request) {
95 | log.Printf("/quitquitquit called, exiting")
96 | os.Exit(1)
97 | })
98 |
99 | r.Get("/", func(w http.ResponseWriter, r *http.Request) {
100 | rate, err := GetErrorRate(r.Context())
101 | if err != nil {
102 | log.Printf(err.Error())
103 | w.WriteHeader(500)
104 | return
105 | }
106 |
107 | if float64(rand.Intn(1000)) <= rate*1000 {
108 | w.WriteHeader(500)
109 | return
110 | }
111 |
112 | JSON(w, http.StatusOK, map[string]string{
113 | "Hello": "World",
114 | })
115 | })
116 |
117 | r.Get("/errors", func(w http.ResponseWriter, r *http.Request) {
118 | rate, err := GetErrorRate(r.Context())
119 | if err != nil {
120 | log.Printf(err.Error())
121 | w.WriteHeader(500)
122 | return
123 | }
124 |
125 | JSON(w, http.StatusOK, map[string]float64{
126 | "rate": rate,
127 | })
128 | })
129 |
130 | r.Get("/errors/{percent}", func(w http.ResponseWriter, r *http.Request) {
131 | rate, err := strconv.ParseFloat(chi.URLParam(r, "percent"), 64)
132 | if err != nil {
133 | log.Printf(err.Error())
134 | w.WriteHeader(500)
135 | return
136 | }
137 |
138 | if rate < 0 || rate > 100 {
139 | log.Printf("rate out of range")
140 | w.WriteHeader(500)
141 | return
142 | }
143 |
144 | err = SetErrorRate(r.Context(), rate)
145 | if err != nil {
146 | log.Printf(err.Error())
147 | w.WriteHeader(500)
148 | return
149 | }
150 |
151 | JSON(w, http.StatusOK, map[string]string{
152 | "status": "success",
153 | })
154 | })
155 |
156 | h := &ochttp.Handler{Handler: r}
157 | if err := view.Register(ochttp.DefaultServerViews...); err != nil {
158 | log.Fatal("Failed to register ochttp.DefaultServerViews")
159 | }
160 |
161 | log.Fatal(http.ListenAndServe(":"+port, h))
162 | }
163 |
164 | // JSON takes a piece of data and turns it into json and writes it out to the
165 | // response with the correct headers.
166 | func JSON(w http.ResponseWriter, statusCode int, data interface{}) error {
167 | result, err := json.Marshal(data)
168 | if err != nil {
169 | return err
170 | }
171 |
172 | w.Header().Set("Content-Type", "application/json")
173 | w.Write(result)
174 | return nil
175 | }
176 |
177 | func SetErrorRate(ctx context.Context, rate float64) error {
178 | fp := filepath.Join(os.TempDir(), "rate.txt")
179 | content := []byte(strconv.FormatFloat(rate, 'E', -1, 64))
180 |
181 | err := ioutil.WriteFile(fp, content, 0644)
182 | if err != nil {
183 | return err
184 | }
185 |
186 | stats.Record(ctx, errorRatio.M(rate))
187 |
188 | return nil
189 | }
190 |
191 | func GetErrorRate(ctx context.Context) (float64, error) {
192 | fp := filepath.Join(os.TempDir(), "rate.txt")
193 | rateString, err := ioutil.ReadFile(fp)
194 | if err != nil {
195 | return 0, err
196 | }
197 |
198 | rate, err := strconv.ParseFloat(string(rateString), 64)
199 | if err != nil {
200 | return 0, err
201 | }
202 |
203 | stats.Record(ctx, errorRatio.M(rate))
204 |
205 | return rate, nil
206 | }
207 |
--------------------------------------------------------------------------------
/terraform/.gitignore:
--------------------------------------------------------------------------------
1 | .terraform
2 | *.tfstate*
3 |
--------------------------------------------------------------------------------
/terraform/k8s/0-system-settings.yml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace
3 | metadata:
4 | name: monitoring
5 | ---
6 | apiVersion: rbac.authorization.k8s.io/v1beta1
7 | kind: ClusterRoleBinding
8 | metadata:
9 | name: prometheus
10 | roleRef:
11 | apiGroup: rbac.authorization.k8s.io
12 | kind: ClusterRole
13 | name: prometheus
14 | subjects:
15 | - kind: ServiceAccount
16 | name: prometheus
17 | namespace: monitoring
18 | ---
19 | apiVersion: rbac.authorization.k8s.io/v1beta1
20 | kind: ClusterRole
21 | metadata:
22 | name: prometheus
23 | rules:
24 | - apiGroups: [""]
25 | resources:
26 | - endpoints
27 | - nodes
28 | - nodes/proxy
29 | - pods
30 | - services
31 | verbs: ["get", "list", "watch"]
32 | - apiGroups:
33 | - extensions
34 | resources:
35 | - ingresses
36 | verbs: ["get", "list", "watch"]
37 | - apiGroups: [""]
38 | resources:
39 | - configmaps
40 | verbs: ["get"]
41 | - nonResourceURLs: ["/metrics"]
42 | verbs: ["get"]
43 | ---
44 | apiVersion: v1
45 | kind: ServiceAccount
46 | metadata:
47 | name: prometheus
48 | namespace: monitoring
49 | ---
50 | apiVersion: extensions/v1beta1
51 | kind: DaemonSet
52 | metadata:
53 | name: node-exporter
54 | namespace: monitoring
55 | labels:
56 | app: node-exporter
57 | spec:
58 | template:
59 | metadata:
60 | name: node-exporter
61 | labels:
62 | app: node-exporter
63 | spec:
64 | containers:
65 | - image: prom/node-exporter
66 | name: node-exporter
67 | ports:
68 | - name: node-exporter
69 | #^ must be an IANA_SVC_NAME (at most 15 characters, ..)
70 | containerPort: 9100
71 | hostPort: 9100
72 | hostNetwork: true
73 | hostPID: true
74 | ---
75 | apiVersion: v1
76 | kind: Service
77 | metadata:
78 | annotations:
79 | prometheus.io/scrape: 'true'
80 | name: node-exporter
81 | namespace: monitoring
82 | labels:
83 | app: prometheus
84 | component: node-exporter
85 | spec:
86 | clusterIP: None
87 | ports:
88 | - name: node-exporter
89 | port: 9100
90 | protocol: TCP
91 | selector:
92 | app: node-exporter
93 | type: ClusterIP
94 |
--------------------------------------------------------------------------------
/terraform/k8s/1-prometheus.yml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: prometheus-disk
5 | namespace: monitoring
6 | spec:
7 | accessModes:
8 | - ReadWriteOnce
9 | resources:
10 | requests:
11 | storage: 100Gi
12 | ---
13 | apiVersion: extensions/v1beta1
14 | kind: Deployment
15 | metadata:
16 | name: prometheus
17 | namespace: monitoring
18 | labels:
19 | app: prometheus
20 | spec:
21 | replicas: 1
22 | template:
23 | metadata:
24 | name: prometheus
25 | labels:
26 | app: prometheus
27 | spec:
28 | serviceAccountName: prometheus
29 | containers:
30 | - name: prometheus
31 | image: gcr.io/cre-prometheus-slo-alerting/prometheus:latest
32 | imagePullPolicy: Always
33 | ports:
34 | - name: webui
35 | containerPort: 9090
36 | volumeMounts:
37 | - mountPath: "/prometheus"
38 | name: pd
39 | securityContext:
40 | fsGroup: 65534
41 | volumes:
42 | - name: pd
43 | persistentVolumeClaim:
44 | claimName: prometheus-disk
45 | ---
46 | apiVersion: v1
47 | kind: Service
48 | metadata:
49 | name: prometheus
50 | namespace: monitoring
51 | labels:
52 | app: prometheus
53 | annotations:
54 | prometheus.io/scrape: 'true'
55 | spec:
56 | type: NodePort
57 | ports:
58 | - port: 9090
59 | protocol: TCP
60 | name: webui
61 | selector:
62 | app: prometheus
63 |
--------------------------------------------------------------------------------
/terraform/k8s/2-server.yml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1beta1
2 | kind: Deployment
3 | metadata:
4 | name: server
5 | namespace: monitoring
6 | labels:
7 | app: server
8 | spec:
9 | replicas: 10
10 | template:
11 | metadata:
12 | labels:
13 | app: server
14 | spec:
15 | containers:
16 | - name: server
17 | image: gcr.io/cre-prometheus-slo-alerting/example-server:latest
18 | ports:
19 | - name: appport
20 | containerPort: 8080
21 | livenessProbe:
22 | httpGet:
23 | path: /healthz
24 | port: appport
25 | readinessProbe:
26 | httpGet:
27 | path: /healthz
28 | port: appport
29 | ---
30 | apiVersion: v1
31 | kind: Service
32 | metadata:
33 | name: server
34 | namespace: monitoring
35 | labels:
36 | app: server
37 | annotations:
38 | prometheus.io/scrape: 'true'
39 | spec:
40 | type: NodePort
41 | selector:
42 | app: server
43 | ports:
44 | - port: 8080
45 | targetPort: 8080
46 | ---
47 | apiVersion: extensions/v1beta1
48 | kind: Ingress
49 | metadata:
50 | namespace: monitoring
51 | name: server
52 | annotations:
53 | kubernetes.io/ingress.class: "gce"
54 | kubernetes.io/ingress.global-static-ip-name: server-ip
55 | labels:
56 | app: server
57 | spec:
58 | backend:
59 | serviceName: server
60 | servicePort: 8080
61 |
--------------------------------------------------------------------------------
/terraform/k8s/3-grafana.yml:
--------------------------------------------------------------------------------
1 | apiVersion: extensions/v1beta1
2 | kind: Deployment
3 | metadata:
4 | name: grafana
5 | namespace: monitoring
6 | labels:
7 | app: grafana
8 | spec:
9 | replicas: 1
10 | template:
11 | metadata:
12 | labels:
13 | app: grafana
14 | spec:
15 | containers:
16 | - image: gcr.io/cre-prometheus-slo-alerting/grafana:latest
17 | name: grafana
18 | imagePullPolicy: Always
19 | ports:
20 | - name: appport
21 | containerPort: 8080
22 | ---
23 | apiVersion: v1
24 | kind: Service
25 | metadata:
26 | name: grafana
27 | namespace: monitoring
28 | labels:
29 | app: grafana
30 | annotations:
31 | prometheus.io/scrape: 'true'
32 | spec:
33 | type: NodePort
34 | ports:
35 | - port: 8080
36 | targetPort: 8080
37 | selector:
38 | app: grafana
39 | ---
40 | apiVersion: extensions/v1beta1
41 | kind: Ingress
42 | metadata:
43 | namespace: monitoring
44 | name: grafana
45 | annotations:
46 | kubernetes.io/ingress.class: "gce"
47 | kubernetes.io/ingress.global-static-ip-name: grafana-ip
48 | labels:
49 | app: grafana
50 | spec:
51 | backend:
52 | serviceName: grafana
53 | servicePort: 8080
54 |
--------------------------------------------------------------------------------
/terraform/k8s/4-cloudprober.yml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1beta1
2 | kind: Deployment
3 | metadata:
4 | namespace: monitoring
5 | name: cloudprober
6 | labels:
7 | app: cloudprober
8 | spec:
9 | replicas: 1
10 | template:
11 | metadata:
12 | labels:
13 | app: cloudprober
14 | spec:
15 | containers:
16 | - name: cloudprober
17 | image: gcr.io/cre-prometheus-slo-alerting/cloudprober:latest
18 | env:
19 | - name: PORT
20 | value: "8080"
21 | - name: CLOUDPROBER_PORT
22 | value: "8080"
23 | ports:
24 | - name: appport
25 | containerPort: 8080
26 | livenessProbe:
27 | httpGet:
28 | path: /metrics
29 | port: appport
30 | readinessProbe:
31 | httpGet:
32 | path: /metrics
33 | port: appport
34 | ---
35 | apiVersion: v1
36 | kind: Service
37 | metadata:
38 | name: cloudprober
39 | namespace: monitoring
40 | labels:
41 | app: cloudprober
42 | annotations:
43 | prometheus.io/scrape: 'true'
44 | spec:
45 | type: NodePort
46 | selector:
47 | app: cloudprober
48 | ports:
49 | - port: 8080
50 | targetPort: 8080
51 |
--------------------------------------------------------------------------------
/terraform/main.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 0.11.1"
3 | }
4 |
5 | provider "google" {
6 | region = var.gcp_region
7 | }
8 |
9 | data "google_client_config" "current" {
10 | }
11 |
12 | resource "google_container_cluster" "example-cluster" {
13 | name = var.cluster_name
14 | description = "prometheus example k8s cluster"
15 | region = var.gcp_region
16 | initial_node_count = "1"
17 |
18 | logging_service = "logging.googleapis.com/kubernetes"
19 | monitoring_service = "monitoring.googleapis.com/kubernetes"
20 |
21 | // Use legacy ABAC until these issues are resolved:
22 | // https://github.com/mcuadros/terraform-provider-helm/issues/56
23 | // https://github.com/terraform-providers/terraform-provider-kubernetes/pull/73
24 | enable_legacy_abac = true
25 |
26 | remove_default_node_pool = true
27 | }
28 |
29 | resource "google_container_node_pool" "pool0" {
30 | name = "pool-0"
31 | cluster = google_container_cluster.example-cluster.name
32 | node_count = 1
33 | region = var.gcp_region
34 |
35 | autoscaling {
36 | min_node_count = 1
37 | max_node_count = 5
38 | }
39 |
40 | management {
41 | auto_repair = "true"
42 | auto_upgrade = "true"
43 | }
44 |
45 | node_config {
46 | machine_type = var.machine_type
47 | preemptible = "true"
48 |
49 | metadata = {
50 | disable-legacy-endpoints = "true"
51 | }
52 |
53 | oauth_scopes = [
54 | "https://www.googleapis.com/auth/cloud-platform",
55 | "https://www.googleapis.com/auth/cloud_debugger",
56 | "https://www.googleapis.com/auth/compute",
57 | "https://www.googleapis.com/auth/devstorage.read_only",
58 | "https://www.googleapis.com/auth/logging.write",
59 | "https://www.googleapis.com/auth/monitoring",
60 | "https://www.googleapis.com/auth/service.management",
61 | "https://www.googleapis.com/auth/servicecontrol",
62 | "https://www.googleapis.com/auth/source.read_only",
63 | "https://www.googleapis.com/auth/taskqueue",
64 | "https://www.googleapis.com/auth/trace.append",
65 | ]
66 | }
67 | }
68 |
69 | resource "google_compute_global_address" "grafana-ip" {
70 | name = "grafana-ip"
71 | }
72 |
73 | resource "google_compute_global_address" "server-ip" {
74 | name = "server-ip"
75 | }
76 |
77 |
--------------------------------------------------------------------------------
/terraform/vars.tf:
--------------------------------------------------------------------------------
1 | variable "cluster_name" {
2 | description = "GKE Cluster Name"
3 | default = "example"
4 | }
5 |
6 | variable "gcp_region" {
7 | description = "GCP region, e.g. us-east1"
8 | default = "europe-west2"
9 | }
10 |
11 | variable "gcp_zone" {
12 | description = "GCP zone, e.g. us-east1-a"
13 | default = "europe-west2-a"
14 | }
15 |
16 | variable "machine_type" {
17 | description = "GCP machine type"
18 | default = "n1-standard-2"
19 | }
20 |
21 |
--------------------------------------------------------------------------------
/terraform/versions.tf:
--------------------------------------------------------------------------------
1 |
2 | terraform {
3 | required_version = ">= 0.12"
4 | }
5 |
--------------------------------------------------------------------------------