├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── cloudbuild.yaml ├── cloudprober ├── Dockerfile └── probe.cfg ├── grafana ├── Dockerfile ├── dashboard.yml ├── datasource.yml ├── grafana.ini └── slo_dashboard.json ├── prometheus ├── Dockerfile ├── oc.rules.yml ├── prometheus.yml └── slos.rules.yml ├── server ├── Dockerfile ├── go.mod ├── go.sum └── server.go └── terraform ├── .gitignore ├── k8s ├── 0-system-settings.yml ├── 1-prometheus.yml ├── 2-server.yml ├── 3-grafana.yml └── 4-cloudprober.yml ├── main.tf ├── vars.tf └── versions.tf /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google.com/conduct/). 
29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prometheus SLO Burn 2 | 3 | This is home to example code for exposing SLIs using open source code in prometheus. 4 | 5 | ## Build Images 6 | 7 | - `$ gcloud builds submit --project $GOOGLE_PROJECT` in the root directory. 8 | - These images are currently published and publicly available from the project 9 | `cre-prometheus-slo-alerting`. 10 | 11 | ## Terraform setup 12 | 13 | - Install Terraform 14 | - Set up terraform env (assumes you have a working gcloud install and a google project): 15 | 16 | ``` 17 | $ [[ $CLOUD_SHELL ]] || gcloud auth application-default login 18 | $ export GOOGLE_PROJECT=$(gcloud config get-value project) 19 | $ export REGION=europe-west2 20 | ``` 21 | 22 | - `$ cd terraform` 23 | - `$ terraform init` - installs terraform deps 24 | - `$ terraform apply -var "gcp_region=$REGION"` - Will ask you before it does 25 | anything. Will take ~10m to actually run. You can also run `terraform plan` 26 | to just get a dry run output. 
27 | - `$ gcloud container clusters get-credentials example --region $REGION 28 | --project $GOOGLE_PROJECT` - Configures `kubectl` to work with the cluster 29 | you just created. 30 | - `$ kubectl create clusterrolebinding $USER-cluster-admin-binding 31 | --clusterrole=cluster-admin --user=$(gcloud config get-value account 32 | --project $GOOGLE_PROJECT)` - Gives your user permissions to create cluster 33 | role bindings that prometheus needs. 34 | - `$ kubectl apply -f ./k8s` 35 | 36 | ## Teardown 37 | 38 | - `$ cd terraform; terraform destroy -var "gcp_region=$REGION"` 39 | 40 | ## Running Locally 41 | 42 | - Start kubernetes (see 43 | https://kubernetes.io/docs/setup/pick-right-solution/#local-machine-solutions 44 | ). 45 | - Run `$ kubectl config current-context` to make sure you are in the correct 46 | context. 47 | - `$ cd terraform` 48 | - `$ kubectl apply -f ./k8s` 49 | - `$ kubectl get services --namespace=monitoring` you will see something like: 50 | 51 | ``` 52 | $ kubectl get services --namespace=monitoring 53 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 54 | cloudprober NodePort 10.104.187.119 <none> 8080:31589/TCP 21m 55 | grafana NodePort 10.104.206.150 <none> 8080:30431/TCP 21m 56 | node-exporter ClusterIP None <none> 9100/TCP 21m 57 | prometheus NodePort 10.101.58.210 <none> 9090:31517/TCP 21m 58 | server NodePort 10.111.115.243 <none> 8080:31796/TCP 21m 59 | ``` 60 | 61 | This means that now you can visit http://localhost:30431 and see the grafana 62 | dashboard. 63 | -------------------------------------------------------------------------------- /cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | steps: 15 | - name: 'gcr.io/cloud-builders/docker' 16 | args: ['build', '-t', 'gcr.io/$PROJECT_ID/prometheus', './prometheus'] 17 | timeout: 500s 18 | - name: 'gcr.io/cloud-builders/docker' 19 | args: ['build', '-t', 'gcr.io/$PROJECT_ID/grafana', './grafana'] 20 | timeout: 500s 21 | - name: 'gcr.io/cloud-builders/docker' 22 | args: ['build', '-t', 'gcr.io/$PROJECT_ID/cloudprober', './cloudprober'] 23 | timeout: 500s 24 | - name: 'gcr.io/cloud-builders/docker' 25 | args: ['build', '-t', 'gcr.io/$PROJECT_ID/example-server', './server'] 26 | timeout: 500s 27 | images: 28 | - 'gcr.io/$PROJECT_ID/example-server' 29 | - 'gcr.io/$PROJECT_ID/grafana' 30 | - 'gcr.io/$PROJECT_ID/cloudprober' 31 | - 'gcr.io/$PROJECT_ID/prometheus' 32 | -------------------------------------------------------------------------------- /cloudprober/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM cloudprober/cloudprober:latest 2 | 3 | ENV PORT 8080 4 | ENV CLOUDPROBER_PORT $PORT 5 | EXPOSE $PORT 6 | 7 | COPY ./probe.cfg /etc/cloudprober.cfg 8 | RUN chmod a+r /etc/cloudprober.cfg 9 | -------------------------------------------------------------------------------- /cloudprober/probe.cfg: -------------------------------------------------------------------------------- 1 | probe { 2 | name: "server" 3 | type: HTTP 4 | targets { 5 | host_names: "server.monitoring.svc.cluster.local:8080" 6 | } 7 | 8 | interval_msec: 100 # .1s 9 | timeout_msec: 50 # .05s 10 | } 11 | 
-------------------------------------------------------------------------------- /grafana/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM grafana/grafana:6.0.1 2 | 3 | # We do this so we can have sqlite later 4 | USER root 5 | RUN apt update && apt install -y sqlite && rm -rf /var/lib/apt/lists/* 6 | 7 | USER grafana 8 | RUN mkdir -p /var/log/grafana /var/lib/grafana 9 | COPY --chown=grafana:grafana grafana.ini /etc/grafana/ 10 | COPY --chown=grafana:grafana datasource.yml /etc/grafana/provisioning/datasources/ 11 | COPY --chown=grafana:grafana dashboard.yml /etc/grafana/provisioning/dashboards/ 12 | COPY --chown=grafana:grafana *.json /var/lib/grafana/dashboards/ 13 | 14 | # We need to run this command to make sure the database exists. We then modify 15 | # the database to set the default dashboard. We have to do this because we turn 16 | # off user accounts and the API in our settings. 17 | RUN grafana-cli admin reset-admin-password $(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1) 18 | 19 | # This magic incantation to set the preferences for an anonymous user is 20 | # undocumented, but can be found in code at 21 | # https://github.com/grafana/grafana/blob/v6.0.0/pkg/services/sqlstore/preferences.go#L29 22 | RUN echo "insert into preferences (org_id, user_id, version, home_dashboard_id, team_id, timezone, theme, created, updated) VALUES (1,0,0,1,0,'','', datetime('now'), datetime('now'));" | sqlite3 /var/lib/grafana/grafana.db 23 | -------------------------------------------------------------------------------- /grafana/dashboard.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'default' 5 | orgId: 1 6 | folder: '' 7 | type: file 8 | disableDeletion: true 9 | updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards 10 | options: 11 | path: /var/lib/grafana/dashboards 12 | 
-------------------------------------------------------------------------------- /grafana/datasource.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | datasources: 3 | - name: Prometheus 4 | type: prometheus 5 | access: proxy 6 | url: http://prometheus.monitoring.svc.cluster.local:9090 7 | editable: false 8 | isDefault: true 9 | -------------------------------------------------------------------------------- /grafana/grafana.ini: -------------------------------------------------------------------------------- 1 | instance_name = "Prometheus Example" 2 | 3 | [auth.anonymous] 4 | enabled = true 5 | org_role = Viewer 6 | 7 | [auth.basic] 8 | enabled = false 9 | 10 | [auth] 11 | disable_login_form = true 12 | 13 | [server] 14 | http_port = 8080 15 | enable_gzip = true 16 | 17 | [log] 18 | mode = console 19 | 20 | [users] 21 | allow_sign_up = false 22 | allow_org_create = false 23 | 24 | [snapshots] 25 | external_enabled = false 26 | 27 | [alerting] 28 | enabled = false 29 | -------------------------------------------------------------------------------- /grafana/slo_dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": false, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "links": [], 19 | "panels": [ 20 | { 21 | "cacheTimeout": null, 22 | "columns": [], 23 | "fontSize": "100%", 24 | "gridPos": { 25 | "h": 5, 26 | "w": 24, 27 | "x": 0, 28 | "y": 0 29 | }, 30 | "id": 18, 31 | "links": [], 32 | "pageSize": null, 33 | "scroll": true, 34 | "showHeader": true, 35 | "sort": { 36 | "col": 0, 37 | "desc": true 38 | }, 39 | "styles": [], 40 | "targets": [ 41 | { 42 | "expr": 
"ALERTS{alertstate=\"firing\"}", 43 | "format": "time_series", 44 | "intervalFactor": 1, 45 | "refId": "A" 46 | } 47 | ], 48 | "timeFrom": null, 49 | "timeShift": null, 50 | "title": "Alerts Firing", 51 | "transform": "timeseries_aggregations", 52 | "type": "table" 53 | }, 54 | { 55 | "aliasColors": {}, 56 | "bars": false, 57 | "dashLength": 10, 58 | "dashes": false, 59 | "fill": 0, 60 | "gridPos": { 61 | "h": 11, 62 | "w": 24, 63 | "x": 0, 64 | "y": 5 65 | }, 66 | "id": 22, 67 | "legend": { 68 | "alignAsTable": true, 69 | "avg": false, 70 | "current": true, 71 | "max": false, 72 | "min": false, 73 | "rightSide": false, 74 | "show": true, 75 | "total": false, 76 | "values": true 77 | }, 78 | "lines": true, 79 | "linewidth": 3, 80 | "links": [], 81 | "nullPointMode": "null", 82 | "paceLength": 10, 83 | "percentage": false, 84 | "pointradius": 2, 85 | "points": false, 86 | "renderer": "flot", 87 | "seriesOverrides": [], 88 | "stack": false, 89 | "steppedLine": false, 90 | "targets": [ 91 | { 92 | "expr": "job:error_budget:slo", 93 | "format": "time_series", 94 | "intervalFactor": 1, 95 | "refId": "B" 96 | }, 97 | { 98 | "expr": "job:error_budget:remaining", 99 | "format": "time_series", 100 | "intervalFactor": 1, 101 | "refId": "A" 102 | } 103 | ], 104 | "thresholds": [], 105 | "timeFrom": null, 106 | "timeRegions": [], 107 | "timeShift": null, 108 | "title": "28 Day Error Budget", 109 | "tooltip": { 110 | "shared": true, 111 | "sort": 0, 112 | "value_type": "individual" 113 | }, 114 | "type": "graph", 115 | "xaxis": { 116 | "buckets": null, 117 | "mode": "time", 118 | "name": null, 119 | "show": true, 120 | "values": [] 121 | }, 122 | "yaxes": [ 123 | { 124 | "format": "percent", 125 | "label": null, 126 | "logBase": 1, 127 | "max": "100", 128 | "min": null, 129 | "show": true 130 | }, 131 | { 132 | "format": "short", 133 | "label": null, 134 | "logBase": 1, 135 | "max": null, 136 | "min": null, 137 | "show": false 138 | } 139 | ], 140 | "yaxis": { 141 | "align": 
false, 142 | "alignLevel": null 143 | } 144 | }, 145 | { 146 | "aliasColors": {}, 147 | "bars": false, 148 | "dashLength": 10, 149 | "dashes": false, 150 | "fill": 1, 151 | "gridPos": { 152 | "h": 8, 153 | "w": 12, 154 | "x": 0, 155 | "y": 16 156 | }, 157 | "id": 20, 158 | "legend": { 159 | "avg": false, 160 | "current": false, 161 | "max": false, 162 | "min": false, 163 | "show": true, 164 | "total": false, 165 | "values": false 166 | }, 167 | "lines": true, 168 | "linewidth": 1, 169 | "links": [], 170 | "nullPointMode": "null", 171 | "paceLength": 10, 172 | "percentage": false, 173 | "pointradius": 2, 174 | "points": false, 175 | "renderer": "flot", 176 | "seriesOverrides": [], 177 | "stack": false, 178 | "steppedLine": false, 179 | "targets": [ 180 | { 181 | "expr": "avg(example_configured_error_ratio) by (k8s_service)", 182 | "format": "time_series", 183 | "intervalFactor": 1, 184 | "refId": "A" 185 | } 186 | ], 187 | "thresholds": [], 188 | "timeFrom": null, 189 | "timeRegions": [], 190 | "timeShift": null, 191 | "title": "Average Configured Error Ratio", 192 | "tooltip": { 193 | "shared": true, 194 | "sort": 0, 195 | "value_type": "individual" 196 | }, 197 | "type": "graph", 198 | "xaxis": { 199 | "buckets": null, 200 | "mode": "time", 201 | "name": null, 202 | "show": true, 203 | "values": [] 204 | }, 205 | "yaxes": [ 206 | { 207 | "format": "short", 208 | "label": null, 209 | "logBase": 1, 210 | "max": null, 211 | "min": "0", 212 | "show": true 213 | }, 214 | { 215 | "format": "short", 216 | "label": null, 217 | "logBase": 1, 218 | "max": null, 219 | "min": null, 220 | "show": true 221 | } 222 | ], 223 | "yaxis": { 224 | "align": false, 225 | "alignLevel": null 226 | } 227 | }, 228 | { 229 | "aliasColors": {}, 230 | "bars": false, 231 | "dashLength": 10, 232 | "dashes": false, 233 | "description": "", 234 | "fill": 1, 235 | "gridPos": { 236 | "h": 8, 237 | "w": 12, 238 | "x": 12, 239 | "y": 16 240 | }, 241 | "id": 16, 242 | "legend": { 243 | "avg": false, 
244 | "current": false, 245 | "max": false, 246 | "min": false, 247 | "show": true, 248 | "total": false, 249 | "values": false 250 | }, 251 | "lines": true, 252 | "linewidth": 1, 253 | "links": [], 254 | "nullPointMode": "null", 255 | "paceLength": 10, 256 | "percentage": false, 257 | "pointradius": 2, 258 | "points": false, 259 | "renderer": "flot", 260 | "seriesOverrides": [], 261 | "stack": false, 262 | "steppedLine": false, 263 | "targets": [ 264 | { 265 | "expr": "avg by (probe) ((rate(total[5m]) - rate(success[5m])) / rate(total[5m]))\n", 266 | "format": "time_series", 267 | "intervalFactor": 1, 268 | "refId": "A" 269 | } 270 | ], 271 | "thresholds": [], 272 | "timeFrom": null, 273 | "timeRegions": [], 274 | "timeShift": null, 275 | "title": "Prober Error Ratio", 276 | "tooltip": { 277 | "shared": true, 278 | "sort": 0, 279 | "value_type": "individual" 280 | }, 281 | "type": "graph", 282 | "xaxis": { 283 | "buckets": null, 284 | "mode": "time", 285 | "name": null, 286 | "show": true, 287 | "values": [] 288 | }, 289 | "yaxes": [ 290 | { 291 | "format": "percentunit", 292 | "label": null, 293 | "logBase": 1, 294 | "max": "1.1", 295 | "min": "0", 296 | "show": true 297 | }, 298 | { 299 | "format": "short", 300 | "label": null, 301 | "logBase": 1, 302 | "max": null, 303 | "min": null, 304 | "show": true 305 | } 306 | ], 307 | "yaxis": { 308 | "align": false, 309 | "alignLevel": null 310 | } 311 | }, 312 | { 313 | "aliasColors": {}, 314 | "bars": false, 315 | "dashLength": 10, 316 | "dashes": false, 317 | "fill": 1, 318 | "gridPos": { 319 | "h": 8, 320 | "w": 12, 321 | "x": 0, 322 | "y": 24 323 | }, 324 | "id": 24, 325 | "legend": { 326 | "avg": false, 327 | "current": false, 328 | "max": false, 329 | "min": false, 330 | "show": true, 331 | "total": false, 332 | "values": false 333 | }, 334 | "lines": true, 335 | "linewidth": 1, 336 | "links": [], 337 | "nullPointMode": "null", 338 | "paceLength": 10, 339 | "percentage": false, 340 | "pointradius": 2, 341 | 
"points": false, 342 | "renderer": "flot", 343 | "seriesOverrides": [], 344 | "stack": false, 345 | "steppedLine": false, 346 | "targets": [ 347 | { 348 | "expr": "sum by (job,k8s_service) (rate(task:http_response_total_count[1m]))", 349 | "format": "time_series", 350 | "intervalFactor": 1, 351 | "refId": "A" 352 | } 353 | ], 354 | "thresholds": [], 355 | "timeFrom": null, 356 | "timeRegions": [], 357 | "timeShift": null, 358 | "title": "Example Server QPS", 359 | "tooltip": { 360 | "shared": true, 361 | "sort": 0, 362 | "value_type": "individual" 363 | }, 364 | "type": "graph", 365 | "xaxis": { 366 | "buckets": null, 367 | "mode": "time", 368 | "name": null, 369 | "show": true, 370 | "values": [] 371 | }, 372 | "yaxes": [ 373 | { 374 | "format": "reqps", 375 | "label": null, 376 | "logBase": 1, 377 | "max": null, 378 | "min": 0, 379 | "show": true 380 | }, 381 | { 382 | "format": "short", 383 | "label": null, 384 | "logBase": 1, 385 | "max": null, 386 | "min": null, 387 | "show": true 388 | } 389 | ], 390 | "yaxis": { 391 | "align": false, 392 | "alignLevel": null 393 | } 394 | }, 395 | { 396 | "cards": { 397 | "cardPadding": null, 398 | "cardRound": null 399 | }, 400 | "color": { 401 | "cardColor": "#b4ff00", 402 | "colorScale": "sqrt", 403 | "colorScheme": "interpolateOranges", 404 | "exponent": 0.5, 405 | "mode": "spectrum" 406 | }, 407 | "dataFormat": "timeseries", 408 | "gridPos": { 409 | "h": 8, 410 | "w": 12, 411 | "x": 12, 412 | "y": 24 413 | }, 414 | "heatmap": {}, 415 | "highlightCards": true, 416 | "id": 26, 417 | "legend": { 418 | "show": false 419 | }, 420 | "links": [], 421 | "targets": [ 422 | { 423 | "expr": "sum(increase(opencensus_io_http_server_latency_bucket{k8s_service=\"server\"}[1m])) by (le)", 424 | "format": "time_series", 425 | "intervalFactor": 1, 426 | "refId": "A" 427 | } 428 | ], 429 | "timeFrom": null, 430 | "timeShift": null, 431 | "title": "Example Server Request Latencies", 432 | "tooltip": { 433 | "show": true, 434 | 
"showHistogram": false 435 | }, 436 | "type": "heatmap", 437 | "xAxis": { 438 | "show": true 439 | }, 440 | "xBucketNumber": null, 441 | "xBucketSize": null, 442 | "yAxis": { 443 | "decimals": null, 444 | "format": "ms", 445 | "logBase": 1, 446 | "max": null, 447 | "min": null, 448 | "show": true, 449 | "splitFactor": null 450 | }, 451 | "yBucketBound": "auto", 452 | "yBucketNumber": null, 453 | "yBucketSize": null 454 | } 455 | ], 456 | "refresh": "5m", 457 | "schemaVersion": 18, 458 | "style": "dark", 459 | "tags": [], 460 | "templating": { 461 | "list": [] 462 | }, 463 | "time": { 464 | "from": "now-6h", 465 | "to": "now" 466 | }, 467 | "timepicker": { 468 | "refresh_intervals": [ 469 | "5s", 470 | "10s", 471 | "30s", 472 | "1m", 473 | "5m", 474 | "15m", 475 | "30m", 476 | "1h", 477 | "2h", 478 | "1d" 479 | ], 480 | "time_options": [ 481 | "5m", 482 | "15m", 483 | "1h", 484 | "6h", 485 | "12h", 486 | "24h", 487 | "2d", 488 | "7d", 489 | "30d" 490 | ] 491 | }, 492 | "timezone": "", 493 | "title": "SLO Dashboard", 494 | "uid": "slosloslo", 495 | "id": 1, 496 | "version": 7 497 | } 498 | -------------------------------------------------------------------------------- /prometheus/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prom/prometheus 2 | 3 | COPY --chown=nobody:nogroup *.yml /etc/prometheus/ 4 | 5 | # Tests your rules before starting 6 | RUN promtool check rules /etc/prometheus/*.rules.yml 7 | -------------------------------------------------------------------------------- /prometheus/oc.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: opencensus 3 | rules: 4 | - record: task:http_response_error_count 5 | expr: opencensus_io_http_server_response_count_by_status_code{http_status=~"5[0-9]{2}"} 6 | 7 | - record: task:http_response_total_count 8 | expr: opencensus_io_http_server_response_count_by_status_code{http_status=~"[0-9]{3}"} 9 | 
-------------------------------------------------------------------------------- /prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 30s 3 | scrape_timeout: 5s 4 | evaluation_interval: 30s 5 | scrape_configs: 6 | - job_name: kubernetes-apiservers 7 | kubernetes_sd_configs: 8 | - role: endpoints 9 | relabel_configs: 10 | - action: keep 11 | regex: default;kubernetes;https 12 | source_labels: 13 | - __meta_kubernetes_namespace 14 | - __meta_kubernetes_service_name 15 | - __meta_kubernetes_endpoint_port_name 16 | scheme: https 17 | tls_config: 18 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 19 | insecure_skip_verify: true 20 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 21 | 22 | - job_name: kubernetes-nodes-kubelet 23 | kubernetes_sd_configs: 24 | - role: node 25 | relabel_configs: 26 | - target_label: __address__ 27 | regex: "(.+):10250" 28 | source_labels: [__address__] 29 | replacement: "${1}:10255" 30 | 31 | - job_name: kubernetes-nodes-cadvisor 32 | kubernetes_sd_configs: 33 | - role: node 34 | relabel_configs: 35 | - target_label: __metrics_path__ 36 | replacement: /metrics/cadvisor 37 | - target_label: __address__ 38 | regex: "(.+):10250" 39 | source_labels: [__address__] 40 | replacement: "${1}:10255" 41 | 42 | # Configuration for the port (prometheus.io/port) that service endpoints are 43 | # annotated with. 44 | - job_name: kubernetes-service-endpoints 45 | kubernetes_sd_configs: 46 | - role: endpoints 47 | relabel_configs: 48 | - action: keep 49 | regex: true 50 | source_labels: 51 | - __meta_kubernetes_service_annotation_prometheus_io_scrape 52 | - action: replace 53 | regex: (https?) 
54 | source_labels: 55 | - __meta_kubernetes_service_annotation_prometheus_io_scheme 56 | target_label: __scheme__ 57 | - action: replace 58 | regex: (.+) 59 | source_labels: 60 | - __meta_kubernetes_service_annotation_prometheus_io_path 61 | target_label: __metrics_path__ 62 | - action: replace 63 | regex: ([^:]+)(?::\d+)?;(\d+) 64 | replacement: $1:$2 65 | source_labels: 66 | - __address__ 67 | - __meta_kubernetes_service_annotation_prometheus_io_port 68 | target_label: __address__ 69 | - action: replace 70 | source_labels: 71 | - __meta_kubernetes_namespace 72 | target_label: k8s_namespace 73 | - action: replace 74 | source_labels: 75 | - __meta_kubernetes_service_name 76 | target_label: k8s_service 77 | - action: replace 78 | source_labels: 79 | - __meta_kubernetes_pod_name 80 | target_label: k8s_pod 81 | 82 | rule_files: 83 | - "oc.rules.yml" 84 | - "slos.rules.yml" 85 | -------------------------------------------------------------------------------- /prometheus/slos.rules.yml: -------------------------------------------------------------------------------- 1 | # This file based on the work from 2 | # https://landing.google.com/sre/workbook/chapters/alerting-on-slos/ 3 | groups: 4 | - name: slo_metrics 5 | rules: 6 | - record: job:slo_errors_per_request:ratio_rate5m 7 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[5m])) / sum by (job,k8s_service) (rate(task:http_response_total_count[5m])) 8 | 9 | - record: job:slo_errors_per_request:ratio_rate10m 10 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[10m])) / sum by (job,k8s_service) (rate(task:http_response_total_count[10m])) 11 | 12 | - record: job:slo_errors_per_request:ratio_rate30m 13 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[30m])) / sum by (job,k8s_service) (rate(task:http_response_total_count[30m])) 14 | 15 | - record: job:slo_errors_per_request:ratio_rate1h 16 | expr: sum by (job,k8s_service) 
(rate(task:http_response_error_count[1h])) / sum by (job,k8s_service) (rate(task:http_response_total_count[1h])) 17 | 18 | - record: job:slo_errors_per_request:ratio_rate2h 19 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[2h])) / sum by (job,k8s_service) (rate(task:http_response_total_count[2h])) 20 | 21 | - record: job:slo_errors_per_request:ratio_rate6h 22 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[6h])) / sum by (job,k8s_service) (rate(task:http_response_total_count[6h])) 23 | 24 | - record: job:slo_errors_per_request:ratio_rate24h 25 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[24h])) / sum by (job,k8s_service) (rate(task:http_response_total_count[24h])) 26 | 27 | - record: job:slo_errors_per_request:ratio_rate3d 28 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[3d])) / sum by (job,k8s_service) (rate(task:http_response_total_count[3d])) 29 | 30 | - record: job:slo_errors_per_request:ratio_rate28d 31 | expr: sum by (job,k8s_service) (rate(task:http_response_error_count[28d])) / sum by (job,k8s_service) (rate(task:http_response_total_count[28d])) 32 | 33 | - record: job:error_budget:remaining 34 | expr: (1 - job:slo_errors_per_request:ratio_rate28d) * 100 35 | 36 | - record: job:error_budget:slo 37 | expr: (1 - 0.05) * 100 38 | 39 | - name: slo_alerts 40 | rules: 41 | - alert: slo_page 42 | expr: | 43 | (job:slo_errors_per_request:ratio_rate1h > (14.4*0.05) and job:slo_errors_per_request:ratio_rate5m > (14.4*0.05)) 44 | or 45 | (job:slo_errors_per_request:ratio_rate6h > (6*0.05) and job:slo_errors_per_request:ratio_rate30m > (6*0.05)) 46 | labels: 47 | severity: page 48 | 49 | - alert: slo_ticket 50 | expr: | 51 | (job:slo_errors_per_request:ratio_rate24h > (3*0.05) and job:slo_errors_per_request:ratio_rate2h > (3*0.05)) 52 | or 53 | (job:slo_errors_per_request:ratio_rate3d > 0.05 and job:slo_errors_per_request:ratio_rate6h > 0.05) 54 | labels: 55 | severity: ticket 
56 | -------------------------------------------------------------------------------- /server/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.12-alpine as builder 2 | ENV GO111MODULE=on 3 | WORKDIR /go/src/example 4 | RUN apk add --no-cache git 5 | COPY . . 6 | 7 | RUN go get ./... 8 | RUN CGO_ENABLED=0 GOOS=linux go build -v -o /go/bin/helloworld 9 | 10 | ENV PORT 8080 11 | EXPOSE $PORT 12 | 13 | CMD ["/go/bin/helloworld"] 14 | -------------------------------------------------------------------------------- /server/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/googlecloud/prometheus-example 2 | 3 | require ( 4 | github.com/go-chi/chi v4.0.2+incompatible 5 | go.opencensus.io v0.19.0 6 | ) 7 | -------------------------------------------------------------------------------- /server/go.sum: -------------------------------------------------------------------------------- 1 | cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= 2 | cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= 3 | git.apache.org/thrift.git v0.0.0-20181218151757-9b75e4fe745a/go.mod h1:fPE2ZNJGynbRyZ4dJvy6G277gSllfV2HJqblrnkyeyg= 4 | github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973 h1:xJ4a3vCFaGF/jqvzLMYoU8P317H5OQ+Via4RmuPwCS0= 5 | github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= 6 | github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= 7 | github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= 8 | github.com/go-chi/chi v4.0.2+incompatible h1:maB6vn6FqCxrpz4FqWdh4+lwpyZIQS7YEAUcHlgXVRs= 9 | github.com/go-chi/chi v4.0.2+incompatible/go.mod h1:eB3wogJHnLi3x/kFX2A+IbTBlXxmMeXJVKy9tTv1XzQ= 10 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod 
h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= 11 | github.com/golang/lint v0.0.0-20180702182130-06c8688daad7/go.mod h1:tluoj9z5200jBnyusfRPU2LqT6J+DAorxEvtC7LHB+E= 12 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= 13 | github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= 14 | github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM= 15 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 16 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= 17 | github.com/grpc-ecosystem/grpc-gateway v1.6.2/go.mod h1:RSKVYQBd5MCa4OVpNdGskqpgL2+G+NZTnrVHpWWfpdw= 18 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= 19 | github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU= 20 | github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= 21 | github.com/openzipkin/zipkin-go v0.1.3/go.mod h1:NtoC/o8u3JlF1lSlyPNswIbeQH9bJTmOf0Erfk+hxe8= 22 | github.com/prometheus/client_golang v0.9.2 h1:awm861/B8OKDd2I/6o1dy3ra4BamzKhYOiGItCeZ740= 23 | github.com/prometheus/client_golang v0.9.2/go.mod h1:OsXs2jCmiKlQ1lTBmv21f2mNfw4xf/QclQDMrYNZzcM= 24 | github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910 h1:idejC8f05m9MGOsuEi1ATq9shN03HrxNkD/luQvxCv8= 25 | github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= 26 | github.com/prometheus/common v0.0.0-20181126121408-4724e9255275/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= 27 | github.com/prometheus/common v0.0.0-20181218105931-67670fe90761 h1:z6tvbDJ5OLJ48FFmnksv04a78maSTRBUIhkdHYV5Y98= 28 | github.com/prometheus/common v0.0.0-20181218105931-67670fe90761/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= 29 | github.com/prometheus/procfs 
v0.0.0-20181204211112-1dc9a6cbc91a h1:9a8MnZMP0X2nLJdBg+pBmGgkJlSaKC2KaQmTCk1XDtE= 30 | github.com/prometheus/procfs v0.0.0-20181204211112-1dc9a6cbc91a/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= 31 | go.opencensus.io v0.19.0 h1:+jrnNy8MR4GZXvwF9PEuSyHxA4NaTf6601oNRwCSXq0= 32 | go.opencensus.io v0.19.0/go.mod h1:AYeH0+ZxYyghG8diqaaIq/9P3VgCCt5GF2ldCY4dkFg= 33 | golang.org/x/lint v0.0.0-20180702182130-06c8688daad7/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= 34 | golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= 35 | golang.org/x/lint v0.0.0-20181217174547-8f45f776aaf1/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= 36 | golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 37 | golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 38 | golang.org/x/net v0.0.0-20181106065722-10aee1819953/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 39 | golang.org/x/net v0.0.0-20181201002055-351d144fa1fc/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 40 | golang.org/x/net v0.0.0-20181217023233-e147a9138326/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 41 | golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= 42 | golang.org/x/oauth2 v0.0.0-20181203162652-d668ce993890/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= 43 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 44 | golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 45 | golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 46 | golang.org/x/sys v0.0.0-20181218192612-074acd46bca6/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 47 | golang.org/x/text v0.3.0/go.mod 
h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 48 | golang.org/x/tools v0.0.0-20180828015842-6cd1fcedba52/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 49 | golang.org/x/tools v0.0.0-20181219222714-6e267b5cc78e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 50 | google.golang.org/api v0.0.0-20181220000619-583d854617af/go.mod h1:4mhQ8q/RsB7i+udVvVy5NUi08OU8ZlA0gRVgrF7VFY0= 51 | google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= 52 | google.golang.org/appengine v1.3.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= 53 | google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= 54 | google.golang.org/genproto v0.0.0-20181219182458-5a97ab628bfb/go.mod h1:7Ep/1NZk928CDR8SjdVbjWNpdIf6nzjE3BTgJDr2Atg= 55 | google.golang.org/grpc v1.16.0/go.mod h1:0JHn/cJsOMiMfNA9+DeHDlAU7KAAB5GDlYFpa9MZMio= 56 | google.golang.org/grpc v1.17.0/go.mod h1:6QZJwpn2B+Zp71q/5VxRsJ6NXXVCE5NRUHRo+f3cWCs= 57 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 58 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 59 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 60 | honnef.co/go/tools v0.0.0-20180728063816-88497007e858/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 61 | honnef.co/go/tools v0.0.0-20180920025451-e3ad64cb4ed3/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 62 | -------------------------------------------------------------------------------- /server/server.go: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | // 16 | // This is a simple http server which generates 500s randomly a percentage of 17 | // the time. 18 | package main 19 | 20 | import ( 21 | "context" 22 | "encoding/json" 23 | "io/ioutil" 24 | "log" 25 | "math/rand" 26 | "net/http" 27 | "os" 28 | "path/filepath" 29 | "strconv" 30 | 31 | "github.com/go-chi/chi" 32 | "github.com/go-chi/chi/middleware" 33 | "go.opencensus.io/exporter/prometheus" 34 | "go.opencensus.io/plugin/ochttp" 35 | "go.opencensus.io/stats" 36 | "go.opencensus.io/stats/view" 37 | ) 38 | 39 | var ( 40 | errorRatio = stats.Float64("configured_error_ratio", "configured error ratio", stats.UnitDimensionless) 41 | errorRatioView = &view.View{ 42 | Name: "example/configured_error_ratio", 43 | Measure: errorRatio, 44 | Description: "The current configured error ratio.", 45 | Aggregation: view.LastValue(), 46 | } 47 | ) 48 | 49 | func init() { 50 | // Set a default error rate 51 | err := SetErrorRate(context.Background(), 0.001) 52 | if err != nil { 53 | log.Fatal(err.Error()) 54 | } 55 | } 56 | 57 | func main() { 58 | port := "8080" 59 | if fromEnv := os.Getenv("PORT"); fromEnv != "" { 60 | port = fromEnv 61 | } 62 | log.Printf("Starting up on http://localhost:%s", port) 63 | 64 | pe, err := prometheus.NewExporter(prometheus.Options{}) 65 | if err != nil { 66 | log.Fatalf("Failed to create Prometheus exporter: %v", err) 67 | } 68 | view.RegisterExporter(pe) 69 | 70 | err = view.Register(errorRatioView) 71 | if err != nil { 72 | log.Fatalf("Failed to register views: %v", err) 73 | } 74 | 
75 | r := chi.NewRouter() 76 | r.Use(middleware.RequestID) 77 | r.Use(middleware.RealIP) 78 | r.Use(middleware.Recoverer) 79 | 80 | r.NotFound(func(w http.ResponseWriter, r *http.Request) { 81 | JSON(w, http.StatusNotFound, map[string]string{ 82 | "error": "404: This page could not be found", 83 | }) 84 | }) 85 | 86 | r.Handle("/metrics", pe) 87 | 88 | r.Get("/healthz", func(w http.ResponseWriter, r *http.Request) { 89 | JSON(w, http.StatusOK, map[string]string{ 90 | "healthy": "true", 91 | }) 92 | }) 93 | 94 | r.Get("/quitquitquit", func(w http.ResponseWriter, r *http.Request) { 95 | log.Printf("/quitquitquit called, exiting") 96 | os.Exit(1) 97 | }) 98 | 99 | r.Get("/", func(w http.ResponseWriter, r *http.Request) { 100 | rate, err := GetErrorRate(r.Context()) 101 | if err != nil { 102 | log.Printf(err.Error()) 103 | w.WriteHeader(500) 104 | return 105 | } 106 | 107 | if float64(rand.Intn(1000)) <= rate*1000 { 108 | w.WriteHeader(500) 109 | return 110 | } 111 | 112 | JSON(w, http.StatusOK, map[string]string{ 113 | "Hello": "World", 114 | }) 115 | }) 116 | 117 | r.Get("/errors", func(w http.ResponseWriter, r *http.Request) { 118 | rate, err := GetErrorRate(r.Context()) 119 | if err != nil { 120 | log.Printf(err.Error()) 121 | w.WriteHeader(500) 122 | return 123 | } 124 | 125 | JSON(w, http.StatusOK, map[string]float64{ 126 | "rate": rate, 127 | }) 128 | }) 129 | 130 | r.Get("/errors/{percent}", func(w http.ResponseWriter, r *http.Request) { 131 | rate, err := strconv.ParseFloat(chi.URLParam(r, "percent"), 64) 132 | if err != nil { 133 | log.Printf(err.Error()) 134 | w.WriteHeader(500) 135 | return 136 | } 137 | 138 | if rate < 0 || rate > 100 { 139 | log.Printf("rate out of range") 140 | w.WriteHeader(500) 141 | return 142 | } 143 | 144 | err = SetErrorRate(r.Context(), rate) 145 | if err != nil { 146 | log.Printf(err.Error()) 147 | w.WriteHeader(500) 148 | return 149 | } 150 | 151 | JSON(w, http.StatusOK, map[string]string{ 152 | "status": "success", 153 | }) 154 
| }) 155 | 156 | h := &ochttp.Handler{Handler: r} 157 | if err := view.Register(ochttp.DefaultServerViews...); err != nil { 158 | log.Fatal("Failed to register ochttp.DefaultServerViews") 159 | } 160 | 161 | log.Fatal(http.ListenAndServe(":"+port, h)) 162 | } 163 | 164 | // JSON takes a piece of data and turns it into json and writes it out to the 165 | // response with the correct headers. 166 | func JSON(w http.ResponseWriter, statusCode int, data interface{}) error { 167 | result, err := json.Marshal(data) 168 | if err != nil { 169 | return err 170 | } 171 | 172 | w.Header().Set("Content-Type", "application/json") 173 | w.Write(result) 174 | return nil 175 | } 176 | 177 | func SetErrorRate(ctx context.Context, rate float64) error { 178 | fp := filepath.Join(os.TempDir(), "rate.txt") 179 | content := []byte(strconv.FormatFloat(rate, 'E', -1, 64)) 180 | 181 | err := ioutil.WriteFile(fp, content, 0644) 182 | if err != nil { 183 | return err 184 | } 185 | 186 | stats.Record(ctx, errorRatio.M(rate)) 187 | 188 | return nil 189 | } 190 | 191 | func GetErrorRate(ctx context.Context) (float64, error) { 192 | fp := filepath.Join(os.TempDir(), "rate.txt") 193 | rateString, err := ioutil.ReadFile(fp) 194 | if err != nil { 195 | return 0, err 196 | } 197 | 198 | rate, err := strconv.ParseFloat(string(rateString), 64) 199 | if err != nil { 200 | return 0, err 201 | } 202 | 203 | stats.Record(ctx, errorRatio.M(rate)) 204 | 205 | return rate, nil 206 | } 207 | -------------------------------------------------------------------------------- /terraform/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform 2 | *.tfstate* 3 | -------------------------------------------------------------------------------- /terraform/k8s/0-system-settings.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: monitoring 5 | --- 6 | apiVersion: 
rbac.authorization.k8s.io/v1beta1 7 | kind: ClusterRoleBinding 8 | metadata: 9 | name: prometheus 10 | roleRef: 11 | apiGroup: rbac.authorization.k8s.io 12 | kind: ClusterRole 13 | name: prometheus 14 | subjects: 15 | - kind: ServiceAccount 16 | name: prometheus 17 | namespace: monitoring 18 | --- 19 | apiVersion: rbac.authorization.k8s.io/v1beta1 20 | kind: ClusterRole 21 | metadata: 22 | name: prometheus 23 | rules: 24 | - apiGroups: [""] 25 | resources: 26 | - endpoints 27 | - nodes 28 | - nodes/proxy 29 | - pods 30 | - services 31 | verbs: ["get", "list", "watch"] 32 | - apiGroups: 33 | - extensions 34 | resources: 35 | - ingresses 36 | verbs: ["get", "list", "watch"] 37 | - apiGroups: [""] 38 | resources: 39 | - configmaps 40 | verbs: ["get"] 41 | - nonResourceURLs: ["/metrics"] 42 | verbs: ["get"] 43 | --- 44 | apiVersion: v1 45 | kind: ServiceAccount 46 | metadata: 47 | name: prometheus 48 | namespace: monitoring 49 | --- 50 | apiVersion: extensions/v1beta1 51 | kind: DaemonSet 52 | metadata: 53 | name: node-exporter 54 | namespace: monitoring 55 | labels: 56 | app: node-exporter 57 | spec: 58 | template: 59 | metadata: 60 | name: node-exporter 61 | labels: 62 | app: node-exporter 63 | spec: 64 | containers: 65 | - image: prom/node-exporter 66 | name: node-exporter 67 | ports: 68 | - name: node-exporter 69 | #^ must be an IANA_SVC_NAME (at most 15 characters, ..) 
70 | containerPort: 9100 71 | hostPort: 9100 72 | hostNetwork: true 73 | hostPID: true 74 | --- 75 | apiVersion: v1 76 | kind: Service 77 | metadata: 78 | annotations: 79 | prometheus.io/scrape: 'true' 80 | name: node-exporter 81 | namespace: monitoring 82 | labels: 83 | app: prometheus 84 | component: node-exporter 85 | spec: 86 | clusterIP: None 87 | ports: 88 | - name: node-exporter 89 | port: 9100 90 | protocol: TCP 91 | selector: 92 | app: node-exporter 93 | type: ClusterIP 94 | -------------------------------------------------------------------------------- /terraform/k8s/1-prometheus.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: prometheus-disk 5 | namespace: monitoring 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | resources: 10 | requests: 11 | storage: 100Gi 12 | --- 13 | apiVersion: extensions/v1beta1 14 | kind: Deployment 15 | metadata: 16 | name: prometheus 17 | namespace: monitoring 18 | labels: 19 | app: prometheus 20 | spec: 21 | replicas: 1 22 | template: 23 | metadata: 24 | name: prometheus 25 | labels: 26 | app: prometheus 27 | spec: 28 | serviceAccountName: prometheus 29 | containers: 30 | - name: prometheus 31 | image: gcr.io/cre-prometheus-slo-alerting/prometheus:latest 32 | imagePullPolicy: Always 33 | ports: 34 | - name: webui 35 | containerPort: 9090 36 | volumeMounts: 37 | - mountPath: "/prometheus" 38 | name: pd 39 | securityContext: 40 | fsGroup: 65534 41 | volumes: 42 | - name: pd 43 | persistentVolumeClaim: 44 | claimName: prometheus-disk 45 | --- 46 | apiVersion: v1 47 | kind: Service 48 | metadata: 49 | name: prometheus 50 | namespace: monitoring 51 | labels: 52 | app: prometheus 53 | annotations: 54 | prometheus.io/scrape: 'true' 55 | spec: 56 | type: NodePort 57 | ports: 58 | - port: 9090 59 | protocol: TCP 60 | name: webui 61 | selector: 62 | app: prometheus 63 | 
-------------------------------------------------------------------------------- /terraform/k8s/2-server.yml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: server 5 | namespace: monitoring 6 | labels: 7 | app: server 8 | spec: 9 | replicas: 10 10 | template: 11 | metadata: 12 | labels: 13 | app: server 14 | spec: 15 | containers: 16 | - name: server 17 | image: gcr.io/cre-prometheus-slo-alerting/example-server:latest 18 | ports: 19 | - name: appport 20 | containerPort: 8080 21 | livenessProbe: 22 | httpGet: 23 | path: /healthz 24 | port: appport 25 | readinessProbe: 26 | httpGet: 27 | path: /healthz 28 | port: appport 29 | --- 30 | apiVersion: v1 31 | kind: Service 32 | metadata: 33 | name: server 34 | namespace: monitoring 35 | labels: 36 | app: server 37 | annotations: 38 | prometheus.io/scrape: 'true' 39 | spec: 40 | type: NodePort 41 | selector: 42 | app: server 43 | ports: 44 | - port: 8080 45 | targetPort: 8080 46 | --- 47 | apiVersion: extensions/v1beta1 48 | kind: Ingress 49 | metadata: 50 | namespace: monitoring 51 | name: server 52 | annotations: 53 | kubernetes.io/ingress.class: "gce" 54 | kubernetes.io/ingress.global-static-ip-name: server-ip 55 | labels: 56 | app: server 57 | spec: 58 | backend: 59 | serviceName: server 60 | servicePort: 8080 61 | -------------------------------------------------------------------------------- /terraform/k8s/3-grafana.yml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: grafana 5 | namespace: monitoring 6 | labels: 7 | app: grafana 8 | spec: 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: grafana 14 | spec: 15 | containers: 16 | - image: gcr.io/cre-prometheus-slo-alerting/grafana:latest 17 | name: grafana 18 | imagePullPolicy: Always 19 | ports: 20 | - name: appport 21 | 
containerPort: 8080 22 | --- 23 | apiVersion: v1 24 | kind: Service 25 | metadata: 26 | name: grafana 27 | namespace: monitoring 28 | labels: 29 | app: grafana 30 | annotations: 31 | prometheus.io/scrape: 'true' 32 | spec: 33 | type: NodePort 34 | ports: 35 | - port: 8080 36 | targetPort: 8080 37 | selector: 38 | app: grafana 39 | --- 40 | apiVersion: extensions/v1beta1 41 | kind: Ingress 42 | metadata: 43 | namespace: monitoring 44 | name: grafana 45 | annotations: 46 | kubernetes.io/ingress.class: "gce" 47 | kubernetes.io/ingress.global-static-ip-name: grafana-ip 48 | labels: 49 | app: grafana 50 | spec: 51 | backend: 52 | serviceName: grafana 53 | servicePort: 8080 54 | -------------------------------------------------------------------------------- /terraform/k8s/4-cloudprober.yml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta1 2 | kind: Deployment 3 | metadata: 4 | namespace: monitoring 5 | name: cloudprober 6 | labels: 7 | app: cloudprober 8 | spec: 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: cloudprober 14 | spec: 15 | containers: 16 | - name: cloudprober 17 | image: gcr.io/cre-prometheus-slo-alerting/cloudprober:latest 18 | env: 19 | - name: PORT 20 | value: "8080" 21 | - name: CLOUDPROBER_PORT 22 | value: "8080" 23 | ports: 24 | - name: appport 25 | containerPort: 8080 26 | livenessProbe: 27 | httpGet: 28 | path: /metrics 29 | port: appport 30 | readinessProbe: 31 | httpGet: 32 | path: /metrics 33 | port: appport 34 | --- 35 | apiVersion: v1 36 | kind: Service 37 | metadata: 38 | name: cloudprober 39 | namespace: monitoring 40 | labels: 41 | app: cloudprober 42 | annotations: 43 | prometheus.io/scrape: 'true' 44 | spec: 45 | type: NodePort 46 | selector: 47 | app: cloudprober 48 | ports: 49 | - port: 8080 50 | targetPort: 8080 51 | -------------------------------------------------------------------------------- /terraform/main.tf: 
--------------------------------------------------------------------------------
// Root module: a regional GKE cluster with one autoscaled, preemptible node
// pool, plus the two global static IP addresses named "grafana-ip" and
// "server-ip".

terraform {
  // Kept in step with versions.tf: this file uses first-class expressions
  // (e.g. `var.gcp_region` without "${...}"), which need Terraform 0.12+.
  required_version = ">= 0.12"
}

provider "google" {
  region = var.gcp_region
}

data "google_client_config" "current" {
}

resource "google_container_cluster" "example-cluster" {
  name               = var.cluster_name
  description        = "prometheus example k8s cluster"
  region             = var.gcp_region
  initial_node_count = "1"

  logging_service    = "logging.googleapis.com/kubernetes"
  monitoring_service = "monitoring.googleapis.com/kubernetes"

  // Use legacy ABAC until these issues are resolved:
  //   https://github.com/mcuadros/terraform-provider-helm/issues/56
  //   https://github.com/terraform-providers/terraform-provider-kubernetes/pull/73
  enable_legacy_abac = true

  // The default pool is removed; nodes come from the "pool0" node pool below.
  remove_default_node_pool = true
}

resource "google_container_node_pool" "pool0" {
  name       = "pool-0"
  cluster    = google_container_cluster.example-cluster.name
  node_count = 1
  region     = var.gcp_region

  autoscaling {
    min_node_count = 1
    max_node_count = 5
  }

  management {
    auto_repair  = "true"
    auto_upgrade = "true"
  }

  node_config {
    machine_type = var.machine_type
    preemptible  = "true"

    metadata = {
      disable-legacy-endpoints = "true"
    }

    oauth_scopes = [
      "https://www.googleapis.com/auth/cloud-platform",
      "https://www.googleapis.com/auth/cloud_debugger",
      "https://www.googleapis.com/auth/compute",
      "https://www.googleapis.com/auth/devstorage.read_only",
      "https://www.googleapis.com/auth/logging.write",
      "https://www.googleapis.com/auth/monitoring",
      "https://www.googleapis.com/auth/service.management",
      "https://www.googleapis.com/auth/servicecontrol",
      "https://www.googleapis.com/auth/source.read_only",
      "https://www.googleapis.com/auth/taskqueue",
      "https://www.googleapis.com/auth/trace.append",
    ]
  }
}

// Static address whose name matches the
// kubernetes.io/ingress.global-static-ip-name annotation of the grafana Ingress.
resource "google_compute_global_address" "grafana-ip" {
  name = "grafana-ip"
}

// Static address whose name matches the
// kubernetes.io/ingress.global-static-ip-name annotation of the server Ingress.
resource "google_compute_global_address" "server-ip" {
  name = "server-ip"
}

-------------------------------------------------------------------------------- /terraform/vars.tf: --------------------------------------------------------------------------------
variable "cluster_name" {
  description = "GKE Cluster Name"
  default     = "example"
}

variable "gcp_region" {
  description = "GCP region, e.g. us-east1"
  default     = "europe-west2"
}

// NOTE(review): gcp_zone is not referenced by main.tf as shown here.
variable "gcp_zone" {
  description = "GCP zone, e.g. us-east1-a"
  default     = "europe-west2-a"
}

variable "machine_type" {
  description = "GCP machine type"
  default     = "n1-standard-2"
}

-------------------------------------------------------------------------------- /terraform/versions.tf: --------------------------------------------------------------------------------

terraform {
  required_version = ">= 0.12"
}
--------------------------------------------------------------------------------