├── .gitignore
├── .ruby-version
├── .terraform-version
├── .travis.yml
├── Brewfile
├── CODEOWNERS
├── LICENCE
├── README.md
├── ci
├── deploy.vars.default.yml
├── deploy.yml
├── images
│   └── task
│   │   ├── Dockerfile
│   │   └── assume-role
└── tasks
│   ├── deploy-project.yml
│   ├── generate-prometheus-test-jq.yml
│   ├── http-ping.yml
│   └── wait-ecs-services-stable.yml
├── logstash
└── prometheus-for-paas-production.conf
├── terraform
├── modules
│   ├── alertmanager
│   │   ├── alb.tf
│   │   ├── alertmanager-service.tf
│   │   ├── certificate.tf
│   │   ├── main.tf
│   │   ├── security-group.tf
│   │   ├── service_discovery.tf
│   │   ├── task-definitions
│   │   │   └── alertmanager.json
│   │   ├── templates
│   │   │   ├── alertmanager.tpl
│   │   │   └── default.tmpl
│   │   └── versions.tf
│   ├── app-ecs-albs
│   │   ├── main.tf
│   │   └── versions.tf
│   ├── common
│   │   └── ami
│   │   │   ├── main.tf
│   │   │   └── versions.tf
│   ├── infra-networking
│   │   ├── main.tf
│   │   └── versions.tf
│   ├── infra-security-groups
│   │   ├── main.tf
│   │   └── versions.tf
│   └── prom-ec2
│   │   ├── README.md
│   │   ├── alerts-config
│   │   └── alerts
│   │   │   ├── README.md
│   │   │   ├── data-gov-uk-alerts.yml
│   │   │   ├── doc-checking-alerts.yml
│   │   │   ├── notify-alerts.yml
│   │   │   └── observe-alerts.yml
│   │   ├── paas-config
│   │   ├── main.tf
│   │   ├── outputs.tf
│   │   ├── prometheus.conf.tpl
│   │   ├── variables.tf
│   │   └── versions.tf
│   │   └── prometheus
│   │   ├── .ruby-version
│   │   ├── cloud.conf
│   │   ├── filebeat.yml.tpl
│   │   ├── iam.tf
│   │   ├── main.tf
│   │   ├── output.tf
│   │   ├── targets.tf
│   │   ├── variables.tf
│   │   └── versions.tf
└── projects
│   ├── alertmanager-production
│   ├── main.tf
│   └── versions.tf
│   ├── alertmanager-staging
│   ├── main.tf
│   └── versions.tf
│   ├── app-ecs-albs-production
│   ├── main.tf
│   └── versions.tf
│   ├── app-ecs-albs-staging
│   ├── main.tf
│   └── versions.tf
│   ├── infra-networking-production
│   ├── main.tf
│   └── versions.tf
│   ├── infra-networking-staging
│   ├── main.tf
│   └── versions.tf
│   ├── infra-security-groups-production
│   ├── main.tf
│   └── versions.tf
│   ├── infra-security-groups-staging
│   ├── main.tf
│   └── versions.tf
│   └── prom-ec2
│   ├── paas-production
│   ├── extra-prometheus-scrape-configs.yml.tpl
│   ├── main.tf
│   └── versions.tf
│   └── paas-staging
│   ├── main.tf
│   └── versions.tf
└── tools
├── check-alerting-rules.sh
├── grafana_info
├── .python-version
├── README.md
├── bearer_auth.py
├── find_missing_metrics.py
├── requirements.txt
└── show_queries.py
└── terraform-format.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | # terraform state files
2 | .terraform/
3 | *.tfst*
4 | 
5 | # editor config stuff
6 | .idea
7 | .idea/*/**
8 | .vscode
9 | .*.swp
10 | 
11 | # os files
12 | .DS_Store
13 | 
14 | *.plan
15 | 
16 | /tools/prometheus-configs/**/data
17 | /tools/prometheus-configs/log-cache-adapter/token
18 | 
--------------------------------------------------------------------------------
/.ruby-version:
--------------------------------------------------------------------------------
1 | 2.6.1
2 | 
--------------------------------------------------------------------------------
/.terraform-version:
--------------------------------------------------------------------------------
1 | 0.13.3
2 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | env:
2 |   global:
3 |     - TERRAFORM_VERSION=0.13.3
4 |     - TERRAFORM_FILE_NAME=terraform_${TERRAFORM_VERSION}_linux_amd64.zip
5 |     - TERRAFORM_DOWNLOAD_URL=https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/${TERRAFORM_FILE_NAME}
6 |     - PROMETHEUS_VERSION=2.3.2
7 |     - PROMETHEUS_FILE_NAME=prometheus-${PROMETHEUS_VERSION}.linux-amd64
8 |     - PROMETHEUS_TAR_FILE_NAME=${PROMETHEUS_FILE_NAME}.tar.gz
9 |     - PROMETHEUS_DOWNLOAD_URL=https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/${PROMETHEUS_TAR_FILE_NAME}
10 | install:
11 |   - wget ${TERRAFORM_DOWNLOAD_URL}
12 |   - unzip -o ${TERRAFORM_FILE_NAME} -d /tmp
13 |   - export PATH=/tmp:${PATH}
14 |   - wget ${PROMETHEUS_DOWNLOAD_URL}
15 |   - tar -xvzf ${PROMETHEUS_TAR_FILE_NAME} -C /tmp
16 |   - export PATH=/tmp/${PROMETHEUS_FILE_NAME}:${PATH}
17 | 
18 | script:
19 |   - find . -name '*.tf' | xargs tools/terraform-format.sh
20 |   - tools/check-alerting-rules.sh
21 | notifications:
22 |   email: false
23 | 
--------------------------------------------------------------------------------
/Brewfile:
--------------------------------------------------------------------------------
1 | tap "alphagov/gds"
2 | 
3 | brew "jq"
4 | brew "tfenv"
5 | brew "gds-cli"
6 | 
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/en/articles/about-code-owners
2 | * @alphagov/re-autom8
3 | 
--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2018 Crown Copyright (Government Digital Service)
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **This repo is no longer in use and has been archived**
2 | 
3 | # Prometheus configuration on AWS #
4 | 
5 | Terraform configuration to manage a Prometheus server running on AWS.
6 | 
7 | ## Setup ##
8 | 
9 | ### Install dependencies
10 | 
11 |     brew bundle
12 |     tfenv install # this will pick up the version from .terraform-version
13 | 
14 | ### Allow access to secrets
15 | 
16 | You will need to clone the re-secrets repo into `~/.password-store/re-secrets`:
17 | 
18 |     git clone git@github.com:alphagov/re-secrets.git ~/.password-store/re-secrets
19 | 
20 | ## Deploying Terraform
21 | 
22 | ```shell
23 | cd terraform/projects/PROJECT-ENV/
24 | gds aws re-prom- -- terraform init
25 | gds aws re-prom- -- terraform plan
26 | gds aws re-prom- -- terraform apply
27 | ```
28 | 
29 | eg
30 | 
31 | ```shell
32 | cd terraform/projects/app-ecs-albs-staging
33 | gds aws re-prom-staging -- terraform plan
34 | ```
35 | 
36 | ### Deploy EC2 Prometheus with zero downtime
37 | 
38 | To avoid all three instances being respun at the same time you can do one instance at a time using:
39 | 
40 | ```
41 | gds aws re-prom- -- terraform apply -target=module.paas-config.aws_route53_record.prom_ec2_a_record[i] -target=module.prometheus.aws_volume_attachment.attach-prometheus-disk[i] -target=module.prometheus.aws_instance.prometheus[i] -target=module.prometheus.aws_lb_target_group_attachment.prom_target_group_attachment[i]
42 | ```
43 | 
44 | where `i` is `0`, `1` or `2`.
45 | 
46 | ## EC2 Prometheus
47 | 
48 | Prometheis are not deployed on Amazon ECS and are instead deployed using the prom-ec2 modules onto EC2 instances. For details of how to develop and deploy them see the [terraform/modules/prom-ec2 README](terraform/modules/prom-ec2).
49 | 
50 | ## ECS
51 | 
52 | Alertmanager and NGINX are deployed on Amazon ECS Fargate.
53 | 
54 | ## License
55 | [MIT License](LICENCE)
56 | 
--------------------------------------------------------------------------------
/ci/deploy.vars.default.yml:
--------------------------------------------------------------------------------
1 | background-image: ""
2 | prometheus-aws-configuration-beta-branch: master
3 | 
--------------------------------------------------------------------------------
/ci/deploy.yml:
--------------------------------------------------------------------------------
1 | display:
2 |   background_image: ((background-image))
3 | 
4 | resource_types:
5 |   - name: cf
6 |     type: docker-image
7 |     source:
8 |       repository: concourse/cf-resource
9 |       tag: "1.1"
10 |   - name: git
11 |     type: docker-image
12 |     source:
13 |       repository: concourse/git-resource
14 |       tag: "1.6"
15 | 
16 | resources:
17 |   - name: task-image
18 |     type: docker-image
19 |     icon: layers
20 |     source:
21 |       repository: ((readonly_private_ecr_repo_url))
22 |       tag: prometheus-task-image
23 |   - name: prometheus-aws-configuration-beta
24 |     type: git
25 |     icon: git
26 |     source:
27 |       uri: https://github.com/alphagov/prometheus-aws-configuration-beta.git
28 |       branch: ((prometheus-aws-configuration-beta-branch))
29 |   # image building is expensive even when nothing has changed, hence dedicated resource
30 |   - name: prometheus-aws-configuration-beta-images
31 |     type: git
32 |     icon: git
33 |     source:
34 |       uri: https://github.com/alphagov/prometheus-aws-configuration-beta.git
35 |       branch: ((prometheus-aws-configuration-beta-branch))
36 |       paths:
37 |         - ci/images
38 |   - name: cf-app-discovery
39 |     type: git
40 |     icon: git
41 |     source:
42 |       uri: https://github.com/alphagov/cf_app_discovery.git
43 |       branch: master
44 |   - name: re-secrets
45 |     type: git
46 |     icon: git
47 |     source:
48 |       private_key: |
49 |         ((re-secrets-ssh-key))
50 |       uri: git@github.com:alphagov/re-secrets.git
51 |       branch: master
52 |       paths:
53 |         - observe
54 
| - name: service-broker-ireland-staging 55 | type: cf 56 | icon: anvil 57 | source: 58 | api: https://api.cloud.service.gov.uk 59 | username: ((cf_user)) 60 | password: ((cf_password)) 61 | organization: gds-tech-ops 62 | space: prometheus-staging 63 | - name: service-broker-ireland-production 64 | type: cf 65 | icon: anvil 66 | source: 67 | api: https://api.cloud.service.gov.uk 68 | username: ((cf_user)) 69 | password: ((cf_password)) 70 | organization: gds-tech-ops 71 | space: prometheus-production 72 | - name: service-broker-london-production 73 | type: cf 74 | icon: anvil 75 | source: 76 | api: https://api.london.cloud.service.gov.uk 77 | username: ((cf_london_user)) 78 | password: ((cf_london_password)) 79 | organization: gds-tech-ops 80 | space: prometheus-production 81 | 82 | jobs: 83 | 84 | - name: configure-pipeline 85 | serial: true 86 | plan: 87 | - get: prometheus-aws-configuration-beta 88 | trigger: true 89 | - set_pipeline: self 90 | file: prometheus-aws-configuration-beta/ci/deploy.yml 91 | vars: 92 | prometheus-aws-configuration-beta-branch: ((prometheus-aws-configuration-beta-branch)) 93 | background-image: ((background-image)) 94 | 95 | - name: build-task-image 96 | serial: true 97 | plan: 98 | - get: prometheus-aws-configuration-beta-images 99 | trigger: true 100 | - put: task-image 101 | params: {build: prometheus-aws-configuration-beta-images/ci/images/task} 102 | get_params: {skip_download: true} 103 | 104 | - name: deploy-common-staging 105 | serial: true 106 | plan: 107 | - in_parallel: 108 | - get: prometheus-aws-configuration-beta 109 | passed: [configure-pipeline] 110 | trigger: true 111 | - get: task-image 112 | passed: [build-task-image] 113 | trigger: true 114 | - get: re-secrets 115 | trigger: true 116 | - task: apply-infra-networking-terraform 117 | image: task-image 118 | timeout: 15m 119 | file: prometheus-aws-configuration-beta/ci/tasks/deploy-project.yml 120 | input_mapping: {src: prometheus-aws-configuration-beta} 121 | params: 122 | PROJECT: infra-networking-staging 123 | DEPLOYER_ARN: arn:aws:iam::027317422673:role/autom8-deployer 124 | GPG_PRIVATE_KEY: ((gpg_private_key)) 125 | - task: apply-infra-security-groups-terraform 126 | image: task-image 127 | timeout: 15m 128 | file: prometheus-aws-configuration-beta/ci/tasks/deploy-project.yml 129 | input_mapping: {src: prometheus-aws-configuration-beta} 130 | params: 131 | PROJECT: infra-security-groups-staging 132 | DEPLOYER_ARN: arn:aws:iam::027317422673:role/autom8-deployer 133 | GPG_PRIVATE_KEY: ((gpg_private_key)) 134 | - task: apply-app-ecs-elbs-terraform 135 | image: task-image 136 | timeout: 15m 137 | file: prometheus-aws-configuration-beta/ci/tasks/deploy-project.yml 138 | input_mapping: {src: prometheus-aws-configuration-beta} 139 | params: 140 | PROJECT: app-ecs-albs-staging 141 | DEPLOYER_ARN: arn:aws:iam::027317422673:role/autom8-deployer 142 | GPG_PRIVATE_KEY: ((gpg_private_key)) 143 | 144 | - name: deploy-common-production 145 | serial: true 146 | plan: 147 | - in_parallel: 148 | - get: prometheus-aws-configuration-beta 149 | passed: [deploy-prometheus-staging, deploy-alertmanager-staging] 150 | trigger: true 151 | - get: task-image 152 | passed: [deploy-prometheus-staging, deploy-alertmanager-staging] 153 | trigger: true 154 | - get: re-secrets 155 | passed: [deploy-prometheus-staging, deploy-alertmanager-staging] 156 | trigger: true 157 | - task: apply-infra-networking-terraform 158 | image: task-image 159 | timeout: 15m 160 | file: 
prometheus-aws-configuration-beta/ci/tasks/deploy-project.yml 161 | input_mapping: {src: prometheus-aws-configuration-beta} 162 | params: 163 | PROJECT: infra-networking-production 164 | DEPLOYER_ARN: arn:aws:iam::455214962221:role/autom8-deployer 165 | GPG_PRIVATE_KEY: ((gpg_private_key)) 166 | - task: apply-infra-security-groups-terraform 167 | image: task-image 168 | timeout: 15m 169 | file: prometheus-aws-configuration-beta/ci/tasks/deploy-project.yml 170 | input_mapping: {src: prometheus-aws-configuration-beta} 171 | params: 172 | PROJECT: infra-security-groups-production 173 | DEPLOYER_ARN: arn:aws:iam::455214962221:role/autom8-deployer 174 | GPG_PRIVATE_KEY: ((gpg_private_key)) 175 | - task: apply-app-ecs-elbs-terraform 176 | image: task-image 177 | timeout: 15m 178 | file: prometheus-aws-configuration-beta/ci/tasks/deploy-project.yml 179 | input_mapping: {src: prometheus-aws-configuration-beta} 180 | params: 181 | PROJECT: app-ecs-albs-production 182 | DEPLOYER_ARN: arn:aws:iam::455214962221:role/autom8-deployer 183 | GPG_PRIVATE_KEY: ((gpg_private_key)) 184 | 185 | - name: deploy-prometheus-staging 186 | serial: true 187 | plan: 188 | - in_parallel: 189 | - get: prometheus-aws-configuration-beta 190 | passed: [deploy-common-staging] 191 | trigger: true 192 | - get: task-image 193 | passed: [deploy-common-staging] 194 | trigger: true 195 | - get: re-secrets 196 | passed: [deploy-common-staging] 197 | trigger: true 198 | - task: apply-terraform 199 | image: task-image 200 | timeout: 15m 201 | file: prometheus-aws-configuration-beta/ci/tasks/deploy-project.yml 202 | input_mapping: {src: prometheus-aws-configuration-beta} 203 | output_mapping: {outputs: terraform-outputs} 204 | params: 205 | PROJECT: prom-ec2/paas-staging 206 | DEPLOYER_ARN: arn:aws:iam::027317422673:role/autom8-deployer 207 | GPG_PRIVATE_KEY: ((gpg_private_key)) 208 | - task: generate-prometheus-test-jq 209 | image: task-image 210 | file: prometheus-aws-configuration-beta/ci/tasks/generate-prometheus-test-jq.yml 211 | input_mapping: {input: terraform-outputs} 212 | output_mapping: {output: prometheus-test-jq} 213 | - in_parallel: 214 | - do: 215 | - task: conf-test-prom-1 216 | attempts: 8 217 | timeout: 2m 218 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 219 | input_mapping: {response-jq-test: prometheus-test-jq} 220 | params: 221 | URL: https://prom-1.monitoring-staging.gds-reliability.engineering/last-config 222 | - task: smoke-test-prom-1 223 | attempts: 8 224 | timeout: 2m 225 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 226 | params: 227 | URL: https://prom-1.monitoring-staging.gds-reliability.engineering/-/ready 228 | - do: 229 | - task: conf-test-prom-2 230 | attempts: 8 231 | timeout: 2m 232 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 233 | input_mapping: {response-jq-test: prometheus-test-jq} 234 | params: 235 | URL: https://prom-2.monitoring-staging.gds-reliability.engineering/last-config 236 | - task: smoke-test-prom-2 237 | attempts: 8 238 | timeout: 2m 239 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 240 | params: 241 | URL: https://prom-2.monitoring-staging.gds-reliability.engineering/-/ready 242 | - do: 243 | - task: conf-test-prom-3 244 | attempts: 8 245 | timeout: 2m 246 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 247 | input_mapping: {response-jq-test: prometheus-test-jq} 248 | params: 249 | URL: https://prom-3.monitoring-staging.gds-reliability.engineering/last-config 250 | - task: 
smoke-test-prom-3 251 | attempts: 8 252 | timeout: 2m 253 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 254 | params: 255 | URL: https://prom-3.monitoring-staging.gds-reliability.engineering/-/ready 256 | 257 | - name: deploy-prometheus-production 258 | serial: true 259 | plan: 260 | - in_parallel: 261 | - get: prometheus-aws-configuration-beta 262 | passed: [deploy-common-production] 263 | trigger: true 264 | - get: task-image 265 | passed: [deploy-common-production] 266 | trigger: true 267 | - get: re-secrets 268 | passed: [deploy-common-production] 269 | trigger: true 270 | - task: apply-terraform 271 | image: task-image 272 | timeout: 15m 273 | file: prometheus-aws-configuration-beta/ci/tasks/deploy-project.yml 274 | input_mapping: {src: prometheus-aws-configuration-beta} 275 | output_mapping: {outputs: terraform-outputs} 276 | params: 277 | PROJECT: prom-ec2/paas-production 278 | DEPLOYER_ARN: arn:aws:iam::455214962221:role/autom8-deployer 279 | GPG_PRIVATE_KEY: ((gpg_private_key)) 280 | - task: generate-prometheus-test-jq 281 | image: task-image 282 | file: prometheus-aws-configuration-beta/ci/tasks/generate-prometheus-test-jq.yml 283 | input_mapping: {input: terraform-outputs} 284 | output_mapping: {output: prometheus-test-jq} 285 | - in_parallel: 286 | - do: 287 | - task: conf-test-prom-1 288 | attempts: 8 289 | timeout: 2m 290 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 291 | input_mapping: {response-jq-test: prometheus-test-jq} 292 | params: 293 | URL: https://prom-1.monitoring.gds-reliability.engineering/last-config 294 | - task: smoke-test-prom-1 295 | attempts: 8 296 | timeout: 2m 297 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 298 | params: 299 | URL: https://prom-1.monitoring.gds-reliability.engineering/-/ready 300 | - do: 301 | - task: conf-test-prom-2 302 | attempts: 8 303 | timeout: 2m 304 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 305 | input_mapping: {response-jq-test: prometheus-test-jq} 306 | params: 307 | URL: https://prom-2.monitoring.gds-reliability.engineering/last-config 308 | - task: smoke-test-prom-2 309 | attempts: 8 310 | timeout: 2m 311 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 312 | params: 313 | URL: https://prom-2.monitoring.gds-reliability.engineering/-/ready 314 | - do: 315 | - task: conf-test-prom-3 316 | attempts: 8 317 | timeout: 2m 318 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 319 | input_mapping: {response-jq-test: prometheus-test-jq} 320 | params: 321 | URL: https://prom-3.monitoring.gds-reliability.engineering/last-config 322 | - task: smoke-test-prom-3 323 | attempts: 8 324 | timeout: 2m 325 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 326 | params: 327 | URL: https://prom-3.monitoring.gds-reliability.engineering/-/ready 328 | 329 | - name: deploy-alertmanager-staging 330 | serial: true 331 | plan: 332 | - in_parallel: 333 | - get: prometheus-aws-configuration-beta 334 | passed: [deploy-common-staging] 335 | trigger: true 336 | - get: task-image 337 | passed: [deploy-common-staging] 338 | trigger: true 339 | - get: re-secrets 340 | passed: [deploy-common-staging] 341 | trigger: true 342 | - task: apply-terraform 343 | image: task-image 344 | timeout: 15m 345 | file: prometheus-aws-configuration-beta/ci/tasks/deploy-project.yml 346 | input_mapping: {src: prometheus-aws-configuration-beta} 347 | output_mapping: {outputs: terraform-outputs} 348 | params: 349 | PROJECT: alertmanager-staging 350 | 
DEPLOYER_ARN: arn:aws:iam::027317422673:role/autom8-deployer 351 | GPG_PRIVATE_KEY: ((gpg_private_key)) 352 | - task: wait-ecs-services-stable 353 | image: task-image 354 | file: prometheus-aws-configuration-beta/ci/tasks/wait-ecs-services-stable.yml 355 | params: 356 | DEPLOYER_ARN: arn:aws:iam::027317422673:role/autom8-deployer 357 | TERRAFORM_VAR: alertmanager_ecs_clusters_services 358 | - in_parallel: 359 | - task: smoke-test-alertmanager 360 | attempts: 6 361 | timeout: 2m 362 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 363 | params: 364 | URL: https://alerts.monitoring-staging.gds-reliability.engineering/-/healthy 365 | - task: smoke-test-alertmanager-eu-west-1a 366 | attempts: 6 367 | timeout: 2m 368 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 369 | params: 370 | URL: https://alerts-eu-west-1a.monitoring-staging.gds-reliability.engineering/-/healthy 371 | - task: smoke-test-alertmanager-eu-west-1b 372 | attempts: 6 373 | timeout: 2m 374 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 375 | params: 376 | URL: https://alerts-eu-west-1b.monitoring-staging.gds-reliability.engineering/-/healthy 377 | - task: smoke-test-alertmanager-eu-west-1c 378 | attempts: 6 379 | timeout: 2m 380 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 381 | params: 382 | URL: https://alerts-eu-west-1c.monitoring-staging.gds-reliability.engineering/-/healthy 383 | 384 | - name: deploy-alertmanager-production 385 | serial: true 386 | plan: 387 | - in_parallel: 388 | - get: prometheus-aws-configuration-beta 389 | passed: [deploy-common-production] 390 | trigger: true 391 | - get: task-image 392 | passed: [deploy-common-production] 393 | trigger: true 394 | - get: re-secrets 395 | passed: [deploy-common-production] 396 | trigger: true 397 | - task: apply-terraform 398 | image: task-image 399 | timeout: 15m 400 | file: prometheus-aws-configuration-beta/ci/tasks/deploy-project.yml 401 | input_mapping: {src: prometheus-aws-configuration-beta} 402 | output_mapping: {outputs: terraform-outputs} 403 | params: 404 | PROJECT: alertmanager-production 405 | DEPLOYER_ARN: arn:aws:iam::455214962221:role/autom8-deployer 406 | GPG_PRIVATE_KEY: ((gpg_private_key)) 407 | - task: wait-ecs-services-stable 408 | image: task-image 409 | file: prometheus-aws-configuration-beta/ci/tasks/wait-ecs-services-stable.yml 410 | params: 411 | DEPLOYER_ARN: arn:aws:iam::455214962221:role/autom8-deployer 412 | TERRAFORM_VAR: alertmanager_ecs_clusters_services 413 | - in_parallel: 414 | - task: smoke-test-alertmanager 415 | attempts: 6 416 | timeout: 2m 417 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 418 | params: 419 | URL: https://alerts.monitoring.gds-reliability.engineering/-/healthy 420 | - task: smoke-test-alertmanager-eu-west-1a 421 | attempts: 6 422 | timeout: 2m 423 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 424 | params: 425 | URL: https://alerts-eu-west-1a.monitoring.gds-reliability.engineering/-/healthy 426 | - task: smoke-test-alertmanager-eu-west-1b 427 | attempts: 6 428 | timeout: 2m 429 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 430 | params: 431 | URL: https://alerts-eu-west-1b.monitoring.gds-reliability.engineering/-/healthy 432 | - task: smoke-test-alertmanager-eu-west-1c 433 | attempts: 6 434 | timeout: 2m 435 | file: prometheus-aws-configuration-beta/ci/tasks/http-ping.yml 436 | params: 437 | URL: https://alerts-eu-west-1c.monitoring.gds-reliability.engineering/-/healthy 438 | 439 | - 
name: run-service-broker-tests 440 | plan: 441 | - get: cf-app-discovery 442 | trigger: true 443 | - task: run-tests 444 | timeout: 15m 445 | config: 446 | platform: linux 447 | image_resource: 448 | type: docker-image 449 | source: 450 | repository: ruby 451 | tag: 2.6.6 452 | inputs: 453 | - name: cf-app-discovery 454 | path: repo 455 | run: 456 | path: sh 457 | dir: repo 458 | args: 459 | - -c 460 | - | 461 | apt-get update 462 | gem install bundler -v 2.0.1 463 | bundle install --without development 464 | bundle exec rake 465 | - name: deploy-service-broker-ireland-staging 466 | plan: 467 | - get: cf-app-discovery 468 | trigger: true 469 | passed: [ run-service-broker-tests ] 470 | - put: service-broker-ireland-staging 471 | params: 472 | manifest: cf-app-discovery/manifest-ireland-staging.yml 473 | show_app_log: true 474 | - name: deploy-service-broker-ireland-production 475 | plan: 476 | - get: cf-app-discovery 477 | trigger: true 478 | passed: [ deploy-service-broker-ireland-staging ] 479 | - put: service-broker-ireland-production 480 | params: 481 | manifest: cf-app-discovery/manifest-ireland-production.yml 482 | show_app_log: true 483 | - name: deploy-service-broker-london-production 484 | plan: 485 | - get: cf-app-discovery 486 | trigger: true 487 | passed: [ deploy-service-broker-ireland-staging ] 488 | - put: service-broker-london-production 489 | params: 490 | manifest: cf-app-discovery/manifest-london-production.yml 491 | show_app_log: true 492 | -------------------------------------------------------------------------------- /ci/images/task/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | ENV TF_VERSION 0.13.3 4 | ENV TF_ZIP_SHA256 35c662be9d32d38815cde5fa4c9fa61a3b7f39952ecd50ebf92fd1b2ddd6109b 5 | 6 | LABEL ubuntu="20.04" 7 | LABEL terraform="$TF_VERSION" 8 | 9 | ENV TZ=Europe/London 10 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 11 | 12 | RUN apt-get update --yes && \ 13 | apt-get install --yes --no-install-recommends \ 14 | ca-certificates \ 15 | awscli \ 16 | jq \ 17 | curl \ 18 | dnsutils \ 19 | unzip \ 20 | gpg \ 21 | gpg-agent \ 22 | golang \ 23 | git 24 | 25 | WORKDIR /tmp 26 | 27 | RUN curl https://releases.hashicorp.com/terraform/${TF_VERSION}/terraform_${TF_VERSION}_linux_amd64.zip > terraform.zip && \ 28 | echo "${TF_ZIP_SHA256} terraform.zip" > terraform.sha && \ 29 | sha256sum -c terraform.sha && unzip terraform.zip && mv terraform /usr/bin/terraform && \ 30 | rm terraform.zip && rm terraform.sha 31 | 32 | RUN GO111MODULE=on go get -v github.com/camptocamp/terraform-provider-pass && \ 33 | mkdir -p ~/.terraform.d/plugins/linux_amd64 && \ 34 | mv ~/go/bin/terraform-provider-pass ~/.terraform.d/plugins/linux_amd64/ 35 | 36 | # prom-ec2 terraform expects a pub ssh key even if it doesn't use it 37 | RUN mkdir -p $HOME/.ssh/ && touch $HOME/.ssh/id_rsa.pub 38 | 39 | COPY assume-role /usr/bin/assume-role 40 | 41 | ENTRYPOINT ["bash"] 42 | -------------------------------------------------------------------------------- /ci/images/task/assume-role: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu 4 | 5 | arn="$1" 6 | creds="$(aws \ 7 | sts assume-role \ 8 | --role-arn="$arn" \ 9 | --role-session-name="deploy-concourse-$(date +%s)" \ 10 | --duration 1800 \ 11 | )" 12 | 13 | access_key="$(echo "$creds" | jq -r ".Credentials.AccessKeyId")" 14 | secret_key="$(echo "$creds" | jq -r 
".Credentials.SecretAccessKey")" 15 | session_token="$(echo "$creds" | jq -r ".Credentials.SessionToken")" 16 | 17 | echo "export AWS_ACCESS_KEY_ID=\"$access_key\"" 18 | echo "export AWS_SECRET_ACCESS_KEY=\"$secret_key\"" 19 | echo "export AWS_SESSION_TOKEN=\"$session_token\"" 20 | echo "export AWS_DEFAULT_REGION=\"eu-west-1\"" 21 | 22 | -------------------------------------------------------------------------------- /ci/tasks/deploy-project.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | inputs: 3 | - name: src 4 | - name: re-secrets 5 | outputs: 6 | - name: outputs 7 | params: 8 | PROJECT: 9 | DEPLOYER_ARN: 10 | GPG_PRIVATE_KEY: 11 | AWS_REGION: 'eu-west-1' 12 | AWS_DEFAULT_REGION: 'eu-west-1' 13 | PASSWORD_STORE_DIR: "re-secrets/observe" 14 | run: 15 | path: bash 16 | args: 17 | - -eu 18 | - -c 19 | - | 20 | BUILD_DIR=$(pwd) 21 | 22 | echo "configuring aws client..." 23 | eval $(assume-role "${DEPLOYER_ARN}") 24 | 25 | echo "configuring re-secrets store..." 26 | echo "${GPG_PRIVATE_KEY}" | gpg --import 27 | mkdir -p $HOME/.password-store 28 | cp -R re-secrets $HOME/.password-store 29 | 30 | echo "terraforming..." 31 | pushd "src/terraform/projects/${PROJECT}" 32 | terraform init 33 | terraform apply -auto-approve 34 | terraform output -json > $BUILD_DIR/outputs/terraform-outputs.json 35 | popd 36 | -------------------------------------------------------------------------------- /ci/tasks/generate-prometheus-test-jq.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | inputs: 3 | - name: input 4 | outputs: 5 | - name: output 6 | run: 7 | path: sh 8 | args: 9 | - -euxc 10 | - | 11 | echo ".last_successful_config == $(jq '.prometheus_config_etag.value' input/terraform-outputs.json)" > output/test.jq 12 | -------------------------------------------------------------------------------- /ci/tasks/http-ping.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | image_resource: 3 | type: docker-image 4 | source: 5 | repository: governmentpaas/curl-ssl 6 | tag: fe3e384e81ccb50842509d7237e3828b293de694 7 | inputs: 8 | - name: response-jq-test 9 | optional: true 10 | params: 11 | URL: 12 | run: 13 | path: sh 14 | args: 15 | - -euxc 16 | - | 17 | DOMAIN=$(echo "${URL}" | awk -F/ '{print $3}') 18 | getent ahosts ${DOMAIN} | cut -d ' ' -f1 | sort | uniq | tee /dev/stderr | while read TARGET_IP ; do 19 | curl \ 20 | --resolve ${DOMAIN}:443:${TARGET_IP} \ 21 | --silent \ 22 | --fail \ 23 | --write-out "${TARGET_IP} %{http_code} %{time_total}s"$'\n' \ 24 | --output curl_output \ 25 | --max-time 5 "${URL}" 26 | 27 | if [[ -e response-jq-test/test.jq ]] ; then 28 | if ! jq -e -f response-jq-test/test.jq curl_output ; then 29 | echo 'Response:' 30 | cat curl_output 31 | echo 'Failed jq test:' 32 | cat response-jq-test/test.jq 33 | # don't spin through attempts too fast 34 | sleep 5 35 | exit 9 36 | fi 37 | fi 38 | done 39 | 40 | -------------------------------------------------------------------------------- /ci/tasks/wait-ecs-services-stable.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | inputs: 3 | - name: terraform-outputs 4 | params: 5 | DEPLOYER_ARN: 6 | TERRAFORM_VAR: 7 | AWS_REGION: 'eu-west-1' 8 | AWS_DEFAULT_REGION: 'eu-west-1' 9 | run: 10 | path: bash 11 | args: 12 | - -eu 13 | - -c 14 | - | 15 | echo "configuring aws client..." 
16 | eval $(assume-role "${DEPLOYER_ARN}") 17 | 18 | jq -c '.[env.TERRAFORM_VAR].value | to_entries | .[]' terraform-outputs/terraform-outputs.json | while read entry ; do 19 | CLUSTER="$(echo ${entry} | jq -r '.key')" 20 | SERVICES="$(echo ${entry} | jq -r '.value | join(" ")')" 21 | 22 | echo "Waiting for services ${SERVICES} of cluster ${CLUSTER} to be stable..." 23 | 24 | aws ecs wait services-stable \ 25 | --cluster "${CLUSTER}" \ 26 | --services ${SERVICES} 27 | done 28 | -------------------------------------------------------------------------------- /logstash/prometheus-for-paas-production.conf: -------------------------------------------------------------------------------- 1 | filter { 2 | if !("beats_input_codec_plain_applied" in [tags]) { 3 | grok { 4 | # attempt to parse syslog lines 5 | match => { "message" => "%{SYSLOG5424PRI}%{NONNEGINT:syslog_ver} +(?:%{TIMESTAMP_ISO8601:syslog_timestamp}|-) +(?:%{HOSTNAME:syslog_host}|-) +(?:%{NOTSPACE:syslog_app}|-) +(?:%{NOTSPACE:syslog_proc}|-) +(?:%{WORD:syslog_msgid}|-) +(?:%{SYSLOG5424SD:syslog_sd}|-|) +%{GREEDYDATA:syslog_msg}" } 6 | # if successful, save original `@timestamp` and `host` fields created by logstash 7 | add_field => [ "received_at", "%{@timestamp}" ] 8 | add_field => [ "received_from", "%{host}" ] 9 | add_tag => ["cf"] 10 | tag_on_failure => ["_syslogparsefailure"] 11 | } 12 | } 13 | 14 | if "cf" in [tags] { 15 | # parse the syslog pri field into severity/facility 16 | if [syslog5424_pri] { 17 | syslog_pri { syslog_pri_field_name => 'syslog5424_pri' } 18 | } 19 | 20 | # replace @timestamp field with the one from syslog 21 | date { match => [ "syslog_timestamp", "ISO8601" ] } 22 | 23 | # if we successfully parsed cf syslog, replace the message and source_host fields 24 | mutate { 25 | replace => [ "source_host", "%{syslog_host}" ] 26 | replace => [ "message", "%{syslog_msg}" ] 27 | } 28 | 29 | # Cloud Foundry passes the app name, space and organisation in the syslog_host 30 | # Filtering them into separate fields makes it easier to query multiple apps in a single Kibana instance 31 | dissect { 32 | mapping => { "syslog_host" => "%{[cf][org]}.%{[cf][space]}.%{[cf][app]}" } 33 | tag_on_failure => ["_sysloghostdissectfailure"] 34 | } 35 | 36 | # Cloud Foundry gorouter logs 37 | if [syslog_proc] =~ "RTR" { 38 | mutate { replace => { "type" => "gorouter" } } 39 | grok { 40 | match => { "syslog_msg" => "%{HOSTNAME:[access][host]} - \[%{TIMESTAMP_ISO8601:router_timestamp}\] \"%{WORD:[access][method]} %{NOTSPACE:[access][url]} HTTP/%{NUMBER:[access][http_version]}\" %{NONNEGINT:[access][response_code]:int} %{NONNEGINT:[access][body_received][bytes]:int} %{NONNEGINT:[access][body_sent][bytes]:int} %{QUOTEDSTRING:[access][referrer]} %{QUOTEDSTRING:[access][agent]} \"%{HOSTPORT:[access][remote_ip_and_port]}\" \"%{HOSTPORT:[access][upstream_ip_and_port]}\" %{GREEDYDATA:router_keys}" } 41 | tag_on_failure => ["_routerparsefailure"] 42 | add_tag => ["gorouter"] 43 | } 44 | # replace @timestamp field with the one from router access log 45 | date { 46 | match => [ "router_timestamp", "ISO8601" ] 47 | } 48 | kv { 49 | source => "router_keys" 50 | target => "router" 51 | value_split => ":" 52 | remove_field => "router_keys" 53 | } 54 | } 55 | 56 | # Application logs 57 | if [syslog_proc] =~ "APP" { 58 | json { 59 | source => "syslog_msg" 60 | add_tag => ["app"] 61 | } 62 | } 63 | 64 | # User agent parsing 65 | if [access][agent] { 66 | useragent { 67 | source => "[access][agent]" 68 | target => "[access][user_agent]" 69 | } 70 | } 71 | 
} 72 | } 73 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/alb.tf: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | # ----- alertmanager public ALB ------- 3 | ###################################################################### 4 | # 5 | # 6 | # The ALB serves one main purpose: so we can use ACM certs instead of 7 | # managing our own. We don't actually want it to load-balance; each 8 | # public domain name associated with alertmanager should route to 9 | # exactly one internal alertmanager instance. We achieve this by 10 | # using listener rules, so that requests with a particular host: 11 | # header must go to a particular AZ, and running one alertmanager per 12 | # AZ. 13 | 14 | 15 | resource "aws_lb" "alertmanager_alb" { 16 | name = "${var.environment}-alertmanager-alb" 17 | internal = false 18 | load_balancer_type = "application" 19 | 20 | security_groups = [aws_security_group.alertmanager_alb.id] 21 | 22 | subnets = data.terraform_remote_state.infra_networking.outputs.public_subnets 23 | 24 | tags = merge( 25 | local.default_tags, 26 | { 27 | Name = "${var.environment}-alertmanager-alb" 28 | }, 29 | ) 30 | } 31 | 32 | resource "aws_lb_listener" "alertmanager_listener_alb_http" { 33 | load_balancer_arn = aws_lb.alertmanager_alb.arn 34 | port = "80" 35 | protocol = "HTTP" 36 | 37 | default_action { 38 | type = "redirect" 39 | 40 | redirect { 41 | port = "443" 42 | protocol = "HTTPS" 43 | status_code = "HTTP_301" 44 | } 45 | } 46 | } 47 | 48 | resource "aws_lb_listener" "alertmanager_listener_alb_https" { 49 | load_balancer_arn = aws_lb.alertmanager_alb.arn 50 | port = "443" 51 | protocol = "HTTPS" 52 | ssl_policy = "ELBSecurityPolicy-TLS-1-2-2017-01" 53 | certificate_arn = aws_acm_certificate_validation.alertmanager_cert.certificate_arn 54 | 55 | default_action { 56 | type = "forward" 57 | target_group_arn = aws_lb_target_group.alertmanager_all.arn 58 | } 59 | } 60 | 61 | resource "aws_lb_listener_rule" "alertmanager_listener_rule_per_az" { 62 | for_each = toset(local.availability_zones) 63 | 64 | listener_arn = aws_lb_listener.alertmanager_listener_alb_https.arn 65 | 66 | action { 67 | type = "forward" 68 | target_group_arn = aws_lb_target_group.alertmanager_per_az[each.key].arn 69 | } 70 | 71 | condition { 72 | host_header { 73 | values = ["alerts-${each.key}.*"] 74 | } 75 | } 76 | } 77 | 78 | resource "aws_lb_target_group" "alertmanager_per_az" { 79 | for_each = toset(local.availability_zones) 80 | name = "${var.environment}-alerts-${each.key}" 81 | port = 9093 82 | protocol = "HTTP" 83 | vpc_id = local.vpc_id 84 | deregistration_delay = 30 85 | target_type = "ip" 86 | 87 | health_check { 88 | interval = 10 89 | path = "/" 90 | matcher = "200" 91 | protocol = "HTTP" 92 | healthy_threshold = 2 93 | unhealthy_threshold = 2 94 | timeout = "5" 95 | } 96 | 97 | tags = merge( 98 | local.default_tags, 99 | { 100 | Name = "${var.environment}-alertmanager-${each.key}" 101 | }, 102 | ) 103 | } 104 | 105 | resource "aws_lb_target_group" "alertmanager_all" { 106 | name = "${var.environment}-alerts-all" 107 | port = 9093 108 | protocol = "HTTP" 109 | vpc_id = local.vpc_id 110 | deregistration_delay = 30 111 | target_type = "ip" 112 | 113 | health_check { 114 | interval = 10 115 | path = "/" 116 | matcher = "200" 117 | protocol = "HTTP" 118 | healthy_threshold = 2 119 | unhealthy_threshold = 2 120 | timeout = "5" 121 
| } 122 | 123 | tags = merge( 124 | local.default_tags, 125 | { 126 | Name = "${var.environment}-alertmanager-all" 127 | }, 128 | ) 129 | } 130 | 131 | resource "aws_route53_record" "alerts_alias" { 132 | zone_id = local.zone_id 133 | name = "alerts" 134 | type = "A" 135 | 136 | alias { 137 | name = aws_lb.alertmanager_alb.dns_name 138 | zone_id = aws_lb.alertmanager_alb.zone_id 139 | evaluate_target_health = false 140 | } 141 | } 142 | 143 | resource "aws_route53_record" "alerts_az_alias" { 144 | for_each = toset(local.availability_zones) 145 | 146 | zone_id = local.zone_id 147 | name = "alerts-${each.key}" 148 | type = "A" 149 | 150 | alias { 151 | name = aws_lb.alertmanager_alb.dns_name 152 | zone_id = aws_lb.alertmanager_alb.zone_id 153 | evaluate_target_health = false 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/alertmanager-service.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ECS service that runs alertmanager 3 | * 4 | */ 5 | 6 | ### container, task, service definitions 7 | 8 | resource "aws_ecs_cluster" "prometheus_cluster" { 9 | name = "${var.environment}-ecs-monitoring" 10 | 11 | tags = merge(local.default_tags, { 12 | Name = "${var.environment}-alertmanager" 13 | }) 14 | } 15 | 16 | resource "aws_iam_role" "execution" { 17 | name = "${var.environment}-alertmanager-execution" 18 | 19 | assume_role_policy = <<-EOF 20 | { 21 | "Version": "2012-10-17", 22 | "Statement": [ 23 | { 24 | "Effect": "Allow", 25 | "Principal": { 26 | "Service": "ecs-tasks.amazonaws.com" 27 | }, 28 | "Action": "sts:AssumeRole" 29 | } 30 | ] 31 | } 32 | EOF 33 | 34 | tags = merge(local.default_tags, { 35 | Name = "${var.environment}-alertmanager-execution" 36 | }) 37 | } 38 | 39 | resource "aws_iam_policy" "execution" { 40 | name = "${var.environment}-alertmanager-execution" 41 | 42 | policy = <<-EOF 43 | { 44 | "Version": "2012-10-17", 45 | "Statement": [ 46 | { 47 | "Effect": "Allow", 48 | "Action": [ 49 | "logs:CreateLogStream", 50 | "logs:PutLogEvents" 51 | ], 52 | "Resource": "*" 53 | } 54 | ] 55 | } 56 | EOF 57 | 58 | } 59 | 60 | resource "aws_iam_role_policy_attachment" "execution_execution" { 61 | role = aws_iam_role.execution.name 62 | policy_arn = aws_iam_policy.execution.arn 63 | } 64 | 65 | data "template_file" "alertmanager_nlb_container_defn" { 66 | template = file("${path.module}/task-definitions/alertmanager.json") 67 | 68 | vars = { 69 | alertmanager_config_base64 = base64encode(data.template_file.alertmanager_config_file.rendered) 70 | templates_base64 = base64encode(file("${path.module}/templates/default.tmpl")) 71 | alertmanager_url = "--web.external-url=https://${aws_route53_record.alerts_alias.fqdn}" 72 | log_group = aws_cloudwatch_log_group.task_logs.name 73 | region = var.aws_region 74 | } 75 | 76 | depends_on = [ 77 | module.assertion_alertmanager_config_file_valid_yaml.checked, 78 | ] 79 | } 80 | 81 | module "assertion_alertmanager_nlb_container_defn_valid_json" { 82 | source = "github.com/Invicton-Labs/terraform-null-assertion?ref=47d7354cc5521853fbe8df96b7bb0223bea732cd" 83 | 84 | condition = can(jsondecode(data.template_file.alertmanager_nlb_container_defn.rendered)) 85 | 86 | error_message = "Alertmanager NLB container definition failed JSON parsing" 87 | } 88 | 89 | resource "aws_ecs_task_definition" "alertmanager_nlb" { 90 | family = "${var.environment}-alertmanager" 91 | container_definitions = 
data.template_file.alertmanager_nlb_container_defn.rendered 92 | network_mode = "awsvpc" 93 | execution_role_arn = aws_iam_role.execution.arn 94 | requires_compatibilities = ["FARGATE"] 95 | cpu = 256 96 | memory = 512 97 | 98 | tags = merge(local.default_tags, { 99 | Name = "${var.environment}-alertmanager" 100 | }) 101 | 102 | depends_on = [ 103 | module.assertion_alertmanager_nlb_container_defn_valid_json.checked, 104 | ] 105 | } 106 | 107 | resource "aws_ecs_service" "alertmanager_alb" { 108 | for_each = { 109 | for _, subnet in data.aws_subnet.private_subnets : 110 | subnet.id => subnet.availability_zone 111 | } 112 | name = "${var.environment}-alertmanager-alb-${each.value}" 113 | cluster = "${var.environment}-ecs-monitoring" 114 | task_definition = aws_ecs_task_definition.alertmanager_nlb.arn 115 | desired_count = 1 116 | launch_type = "FARGATE" 117 | 118 | wait_for_steady_state = true 119 | 120 | load_balancer { 121 | target_group_arn = aws_lb_target_group.alertmanager_all.arn 122 | container_name = "alertmanager" 123 | container_port = 9093 124 | } 125 | 126 | load_balancer { 127 | target_group_arn = aws_lb_target_group.alertmanager_per_az[each.value].arn 128 | container_name = "alertmanager" 129 | container_port = 9093 130 | } 131 | 132 | network_configuration { 133 | subnets = [each.key] 134 | security_groups = [aws_security_group.alertmanager_task.id] 135 | } 136 | 137 | service_registries { 138 | registry_arn = aws_service_discovery_service.alertmanager.arn 139 | } 140 | } 141 | 142 | #### alertmanager 143 | 144 | data "pass_password" "observe_pagerduty_key" { 145 | path = "pagerduty/integration-keys/production" 146 | } 147 | 148 | data "pass_password" "dgu_pagerduty_key" { 149 | path = "pagerduty/integration-keys/dgu" 150 | } 151 | 152 | data "pass_password" "govuk_pagerduty_key" { 153 | path = "pagerduty/integration-keys/govuk" 154 | } 155 | 156 | data "pass_password" "verify_p1_pagerduty_key" { 157 | path = "pagerduty/integration-keys/verify-p1" 158 | } 159 | 160 | data "pass_password" "verify_p2_pagerduty_key" { 161 | path = "pagerduty/integration-keys/verify-p2" 162 | } 163 | 164 | data "pass_password" "dcs_p2_pagerduty_key" { 165 | path = "pagerduty/integration-keys/dcs-p2" 166 | } 167 | 168 | data "pass_password" "slack_api_url" { 169 | path = "slack-api-url" 170 | } 171 | 172 | data "pass_password" "notify_zendesk" { 173 | path = "receivers/notify/zendesk" 174 | } 175 | 176 | data "pass_password" "notify_p2_pagerduty_key" { 177 | path = "receivers/notify/p2_pagerduty" 178 | } 179 | 180 | data "pass_password" "autom8_email" { 181 | path = "receivers/autom8/email" 182 | 183 | } 184 | 185 | data "pass_password" "verify_staging_cronitor" { 186 | path = "cronitor/verify-staging-url" 187 | } 188 | 189 | data "pass_password" "verify_integration_cronitor" { 190 | path = "cronitor/verify-integration-url" 191 | } 192 | 193 | data "pass_password" "verify_prod_cronitor" { 194 | path = "cronitor/verify-prod-url" 195 | } 196 | 197 | data "template_file" "alertmanager_config_file" { 198 | template = file("${path.module}/templates/alertmanager.tpl") 199 | 200 | vars = { 201 | observe_pagerduty_key = data.pass_password.observe_pagerduty_key.password 202 | dgu_pagerduty_key = data.pass_password.dgu_pagerduty_key.password 203 | govuk_pagerduty_key = data.pass_password.govuk_pagerduty_key.password 204 | verify_p1_pagerduty_key = data.pass_password.verify_p1_pagerduty_key.password 205 | verify_p2_pagerduty_key = data.pass_password.verify_p2_pagerduty_key.password 206 | dcs_p2_pagerduty_key 
= data.pass_password.dcs_p2_pagerduty_key.password 207 | slack_api_url = data.pass_password.slack_api_url.password 208 | notify_zendesk = data.pass_password.notify_zendesk.password 209 | notify_p2_pagerduty_key = data.pass_password.notify_p2_pagerduty_key.password 210 | smtp_from = "alerts@${data.terraform_remote_state.infra_networking.outputs.public_subdomain}" 211 | # Port as requested by https://docs.aws.amazon.com/ses/latest/DeveloperGuide/smtp-connect.html 212 | smtp_smarthost = "email-smtp.${var.aws_region}.amazonaws.com:587" 213 | smtp_username = aws_iam_access_key.smtp.id 214 | smtp_password = aws_iam_access_key.smtp.ses_smtp_password_v4 215 | autom8_recipient_email = data.pass_password.autom8_email.password 216 | observe_cronitor = var.observe_cronitor 217 | verify_staging_cronitor = data.pass_password.verify_staging_cronitor.password 218 | verify_integration_cronitor = data.pass_password.verify_integration_cronitor.password 219 | verify_prod_cronitor = data.pass_password.verify_prod_cronitor.password 220 | } 221 | } 222 | 223 | module "assertion_alertmanager_config_file_valid_yaml" { 224 | source = "github.com/Invicton-Labs/terraform-null-assertion?ref=47d7354cc5521853fbe8df96b7bb0223bea732cd" 225 | 226 | condition = can(yamldecode(data.template_file.alertmanager_config_file.rendered)) 227 | 228 | error_message = "Alertmanager config failed YAML parsing" 229 | } 230 | 231 | ## AWS SES 232 | 233 | resource "aws_ses_domain_identity" "main" { 234 | domain = data.terraform_remote_state.infra_networking.outputs.public_subdomain 235 | } 236 | 237 | resource "aws_route53_record" "txt_amazonses_verification_record" { 238 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 239 | name = "_amazonses.${data.terraform_remote_state.infra_networking.outputs.public_subdomain}" 240 | type = "TXT" 241 | ttl = "600" 242 | records = [aws_ses_domain_identity.main.verification_token] 243 | } 244 | 245 | resource "aws_ses_domain_dkim" "main" { 246 | domain = aws_ses_domain_identity.main.domain 247 | } 248 | 249 | resource "aws_route53_record" "dkim_amazonses_verification_record" { 250 | count = 3 251 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 252 | name = "${element(aws_ses_domain_dkim.main.dkim_tokens, count.index)}._domainkey.${data.terraform_remote_state.infra_networking.outputs.public_subdomain}" 253 | type = "CNAME" 254 | ttl = "600" 255 | records = ["${element(aws_ses_domain_dkim.main.dkim_tokens, count.index)}.dkim.amazonses.com"] 256 | } 257 | 258 | resource "aws_ses_domain_mail_from" "alerts" { 259 | domain = aws_ses_domain_identity.main.domain 260 | mail_from_domain = "mail.${aws_ses_domain_identity.main.domain}" 261 | } 262 | 263 | resource "aws_route53_record" "alerts_ses_domain_mail_from_mx" { 264 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 265 | name = aws_ses_domain_mail_from.alerts.mail_from_domain 266 | type = "MX" 267 | ttl = "600" 268 | records = ["10 feedback-smtp.${var.aws_region}.amazonses.com"] 269 | } 270 | 271 | resource "aws_route53_record" "alerts_ses_domain_mail_from_txt" { 272 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 273 | name = aws_ses_domain_mail_from.alerts.mail_from_domain 274 | type = "TXT" 275 | ttl = "600" 276 | records = ["v=spf1 include:amazonses.com -all"] 277 | } 278 | 279 | # IAM for SMTP 280 | 281 | resource "aws_iam_user" "smtp" { 282 | name = "${var.environment}.smtp" 283 | path = "/system/" 284 | 285 | tags = 
merge(local.default_tags, { 286 | Name = "${var.environment}-alertmanager-smtp" 287 | }) 288 | } 289 | 290 | resource "aws_iam_access_key" "smtp" { 291 | user = aws_iam_user.smtp.name 292 | } 293 | 294 | resource "aws_iam_user_policy" "smtp_ro" { 295 | name = "${var.environment}.smtp" 296 | user = aws_iam_user.smtp.name 297 | 298 | policy = < { 21 | name = dvo.resource_record_name 22 | record = dvo.resource_record_value 23 | type = dvo.resource_record_type 24 | } 25 | } 26 | 27 | name = each.value.name 28 | records = [each.value.record] 29 | type = each.value.type 30 | zone_id = local.zone_id 31 | ttl = 60 32 | 33 | allow_overwrite = true 34 | 35 | depends_on = [aws_acm_certificate.alertmanager_cert] 36 | } 37 | 38 | resource "aws_acm_certificate_validation" "alertmanager_cert" { 39 | certificate_arn = aws_acm_certificate.alertmanager_cert.arn 40 | validation_record_fqdns = [for record in aws_route53_record.alertmanager_cert_validation : record.fqdn] 41 | } 42 | 43 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## Module: alertmanager 3 | * 4 | * Create services and task definitions for the ECS cluster 5 | * 6 | */ 7 | 8 | variable "aws_region" { 9 | type = string 10 | description = "AWS region" 11 | default = "eu-west-1" 12 | } 13 | 14 | variable "remote_state_bucket" { 15 | type = string 16 | description = "S3 bucket we store our terraform state in" 17 | default = "ecs-monitoring" 18 | } 19 | 20 | variable "environment" { 21 | type = string 22 | description = "Unique name for this collection of resources" 23 | default = "ecs-monitoring" 24 | } 25 | 26 | variable "observe_cronitor" { 27 | type = string 28 | description = "URL to send Observe heartbeats to" 29 | default = "" 30 | } 31 | 32 | variable "allowed_cidrs" { 33 | type = list(string) 34 | description = "List of CIDRs which are able to access alertmanager, default are GDS ips and concourse egress" 35 | 36 | default = [ 37 | "213.86.153.211/32", 38 | "213.86.153.212/32", 39 | "213.86.153.213/32", 40 | "213.86.153.214/32", 41 | "213.86.153.231/32", 42 | "213.86.153.235/32", 43 | "213.86.153.236/32", 44 | "213.86.153.237/32", 45 | "85.133.67.244/32", 46 | "35.177.37.128/32", 47 | "35.176.252.164/32", 48 | "51.149.8.0/25", 49 | "51.149.8.128/29", # CO 50 | "51.149.9.112/29", # CO 51 | "51.149.9.240/29", # CO 52 | ] 53 | } 54 | 55 | locals { 56 | default_tags = { 57 | Terraform = "true" 58 | Project = "alertmanager" 59 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 60 | Environment = var.environment 61 | Service = "alertmanager" 62 | } 63 | vpc_id = data.terraform_remote_state.infra_networking.outputs.vpc_id 64 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 65 | availability_zones = data.aws_subnet.public_subnets.*.availability_zone 66 | } 67 | 68 | # Resources 69 | # -------------------------------------------------------------- 70 | 71 | ## Data sources 72 | data "terraform_remote_state" "infra_networking" { 73 | backend = "s3" 74 | 75 | config = { 76 | bucket = var.remote_state_bucket 77 | key = "infra-networking-modular.tfstate" 78 | region = var.aws_region 79 | } 80 | } 81 | 82 | data "terraform_remote_state" "infra_security_groups" { 83 | backend = "s3" 84 | 85 | config = { 86 | bucket = var.remote_state_bucket 87 | key = "infra-security-groups-modular.tfstate" 88 | region = var.aws_region 89 | } 90 | } 91 | 
92 | data "aws_availability_zones" "available" {} 93 | 94 | data "aws_subnet" "public_subnets" { 95 | count = length(data.terraform_remote_state.infra_networking.outputs.public_subnets) 96 | id = data.terraform_remote_state.infra_networking.outputs.public_subnets[count.index] 97 | } 98 | 99 | data "aws_subnet" "private_subnets" { 100 | count = length(data.terraform_remote_state.infra_networking.outputs.private_subnets) 101 | id = data.terraform_remote_state.infra_networking.outputs.private_subnets[count.index] 102 | } 103 | 104 | ## Resources 105 | 106 | resource "aws_cloudwatch_log_group" "task_logs" { 107 | name = var.environment 108 | retention_in_days = 7 109 | 110 | tags = merge(local.default_tags, { 111 | Name = "${var.environment}-alertmanager-task-logs" 112 | }) 113 | } 114 | 115 | ## Outputs 116 | 117 | output "ecs_clusters_services" { 118 | description = "Names of ECS services created, listed by ECS cluster name" 119 | value = transpose({ 120 | for _, service in aws_ecs_service.alertmanager_alb: 121 | service.name => [ service.cluster ] 122 | }) 123 | } 124 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/security-group.tf: -------------------------------------------------------------------------------- 1 | resource "aws_security_group" "alertmanager_alb" { 2 | name = "${var.environment}-alertmanager-alb" 3 | vpc_id = local.vpc_id 4 | description = "Alertmanager ALB" 5 | 6 | tags = merge( 7 | local.default_tags, 8 | { 9 | Name = "alertmanager-alb", 10 | }, 11 | ) 12 | } 13 | 14 | resource "aws_security_group" "alertmanager_task" { 15 | name = "${var.environment}-alertmanager-task" 16 | vpc_id = local.vpc_id 17 | description = "Controls ingress and egress for the alertmanager task" 18 | 19 | tags = merge( 20 | local.default_tags, 21 | { 22 | Name = "alertmanager-task", 23 | }, 24 | ) 25 | } 26 | 27 | # Alertmanager is behind an NLB, so it needs to allow ingress from the 28 | # allowed public internet cidrs directly 29 | resource "aws_security_group_rule" "ingress_from_allowed_cidrs_to_alertmanager_9093" { 30 | security_group_id = aws_security_group.alertmanager_task.id 31 | type = "ingress" 32 | from_port = 9093 33 | to_port = 9093 34 | protocol = "tcp" 35 | cidr_blocks = var.allowed_cidrs 36 | } 37 | 38 | # Alertmanager ALB needs to allow ingress from the allowed public 39 | # internet cidrs 40 | resource "aws_security_group_rule" "ingress_from_allowed_cidrs_to_alertmanager_alb_http" { 41 | security_group_id = aws_security_group.alertmanager_alb.id 42 | type = "ingress" 43 | from_port = 80 44 | to_port = 80 45 | protocol = "tcp" 46 | cidr_blocks = var.allowed_cidrs 47 | } 48 | 49 | resource "aws_security_group_rule" "ingress_from_allowed_cidrs_to_alertmanager_alb_https" { 50 | security_group_id = aws_security_group.alertmanager_alb.id 51 | type = "ingress" 52 | from_port = 443 53 | to_port = 443 54 | protocol = "tcp" 55 | cidr_blocks = var.allowed_cidrs 56 | } 57 | 58 | # NLB health checks come from the public subnet IP range 59 | resource "aws_security_group_rule" "ingress_from_public_subnets_to_alertmanager_9093" { 60 | security_group_id = aws_security_group.alertmanager_task.id 61 | type = "ingress" 62 | from_port = 9093 63 | to_port = 9093 64 | protocol = "tcp" 65 | cidr_blocks = data.aws_subnet.public_subnets.*.cidr_block 66 | } 67 | 68 | resource "aws_security_group_rule" "ingress_from_alertmanager_alb_to_alertmanager_9093" { 69 | security_group_id = aws_security_group.alertmanager_task.id 70 | 
source_security_group_id = aws_security_group.alertmanager_alb.id 71 | type = "ingress" 72 | from_port = 9093 73 | to_port = 9093 74 | protocol = "tcp" 75 | } 76 | 77 | resource "aws_security_group_rule" "egress_from_alertmanager_alb_to_alertmanager_9093" { 78 | security_group_id = aws_security_group.alertmanager_alb.id 79 | # source_security_group_id means destination for egress rules 80 | source_security_group_id = aws_security_group.alertmanager_task.id 81 | type = "egress" 82 | from_port = 9093 83 | to_port = 9093 84 | protocol = "tcp" 85 | } 86 | 87 | # TODO: could we make observe prometheus more consistent with external 88 | # prometheis and go via public NLB IPs? 89 | resource "aws_security_group_rule" "ingress_from_prometheus_ec2_to_alertmanager_task" { 90 | security_group_id = aws_security_group.alertmanager_task.id 91 | type = "ingress" 92 | from_port = 9093 93 | to_port = 9093 94 | protocol = "tcp" 95 | source_security_group_id = data.terraform_remote_state.infra_security_groups.outputs.prometheus_ec2_sg_id 96 | } 97 | 98 | 99 | resource "aws_security_group_rule" "ingress_alertmanager_task_meshing" { 100 | security_group_id = aws_security_group.alertmanager_task.id 101 | type = "ingress" 102 | from_port = 9094 103 | to_port = 9094 104 | protocol = "tcp" 105 | source_security_group_id = aws_security_group.alertmanager_task.id 106 | } 107 | 108 | # This rule allows all egress out of alertmanager_task. This is for the following purposes: 109 | # - raising alerts with receivers such as pagerduty and cronitor 110 | # - sending emails via AWS API 111 | # - communicate with other alertmanagers to mesh 112 | resource "aws_security_group_rule" "egress_from_alertmanager_task_to_all" { 113 | security_group_id = aws_security_group.alertmanager_task.id 114 | type = "egress" 115 | from_port = 0 116 | to_port = 0 117 | protocol = "-1" 118 | cidr_blocks = ["0.0.0.0/0"] 119 | } 120 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/service_discovery.tf: -------------------------------------------------------------------------------- 1 | resource "aws_service_discovery_private_dns_namespace" "observe" { 2 | name = "local.gds-reliability.engineering" 3 | description = "Observe instances" 4 | vpc = local.vpc_id 5 | } 6 | 7 | resource "aws_service_discovery_service" "alertmanager" { 8 | name = "alertmanager" 9 | 10 | description = "A service to allow alertmanager peers to discover each other" 11 | 12 | dns_config { 13 | namespace_id = aws_service_discovery_private_dns_namespace.observe.id 14 | 15 | dns_records { 16 | ttl = 10 17 | type = "A" 18 | } 19 | 20 | routing_policy = "MULTIVALUE" 21 | } 22 | 23 | health_check_custom_config { 24 | failure_threshold = 2 25 | } 26 | } 27 | 28 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/task-definitions/alertmanager.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "alertmanager", 4 | "image": "prom/alertmanager", 5 | "memoryReservation": 512, 6 | "essential": true, 7 | "portMappings": [ 8 | { 9 | "containerPort": 9093, 10 | "hostPort": 9093 11 | }, 12 | { 13 | "containerPort": 9094, 14 | "hostPort": 9094 15 | } 16 | ], 17 | "environment": [ 18 | { 19 | "Name": "ALERTMANAGER_CONFIG", 20 | "Value": "${alertmanager_config_base64}" 21 | }, 22 | { 23 | "Name": "TEMPLATES", 24 | "Value": "${templates_base64}" 25 | } 26 | ], 27 | "entryPoint": [ 28 | "/bin/sh", 29 | "-c", 30 
| "echo \"$ALERTMANAGER_CONFIG\" | base64 -d > /etc/alertmanager/alertmanager.yml; echo \"$TEMPLATES\" | base64 -d > /etc/alertmanager/default.tmpl; /bin/alertmanager --config.file=/etc/alertmanager/alertmanager.yml --cluster.peer=alertmanager.local.gds-reliability.engineering:9094 ${alertmanager_url}" 31 | ], 32 | "logConfiguration": { 33 | "logDriver": "awslogs", 34 | "options": { 35 | "awslogs-group": "${log_group}", 36 | "awslogs-region": "${region}", 37 | "awslogs-stream-prefix": "alertmanager" 38 | } 39 | } 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/templates/alertmanager.tpl: -------------------------------------------------------------------------------- 1 | global: 2 | resolve_timeout: 5m 3 | 4 | smtp_from: "${smtp_from}" 5 | smtp_smarthost: "${smtp_smarthost}" 6 | smtp_auth_username: "${smtp_username}" 7 | smtp_auth_password: "${smtp_password}" 8 | slack_api_url: "${slack_api_url}" 9 | 10 | templates: 11 | - '/etc/alertmanager/default.tmpl' 12 | 13 | route: 14 | receiver: "re-observe-pagerduty" 15 | group_by: 16 | - alertname 17 | - product 18 | - deployment 19 | routes: 20 | - receiver: "autom8-tickets" 21 | repeat_interval: 7d 22 | match: 23 | product: "prometheus" 24 | severity: "ticket" 25 | - receiver: "notify-tickets" 26 | repeat_interval: 7d 27 | match: 28 | product: "notify" 29 | severity: "ticket" 30 | - receiver: "notify-p2" 31 | repeat_interval: 7d 32 | match: 33 | product: "notify" 34 | severity: "p2" 35 | - receiver: "dgu-pagerduty" 36 | match: 37 | product: "data-gov-uk" 38 | - receiver: "govuk-pagerduty" 39 | match: 40 | product: "govuk-accounts" 41 | - receiver: "re-observe-pagerduty" 42 | match: 43 | product: "prometheus" 44 | severity: "page" 45 | - receiver: "observe-cronitor" 46 | group_interval: 1m 47 | repeat_interval: 1m 48 | match: 49 | product: "prometheus" 50 | severity: "constant" 51 | - receiver: "dev-null" 52 | match: 53 | product: "doc-checking" 54 | routes: 55 | - match_re: 56 | space: production|integration 57 | receiver: dcs-slack 58 | routes: 59 | - match: 60 | space: production 61 | severity: p2 62 | receiver: "dcs-p2" 63 | # Verify hub ECS 64 | - receiver: "verify-2ndline-slack" 65 | match: 66 | product: "verify" 67 | routes: 68 | - receiver: "verify-p1" 69 | match: 70 | deployment: prod 71 | severity: p1 72 | - receiver: "verify-p2" 73 | match: 74 | deployment: integration 75 | severity: p1 76 | - receiver: "verify-p3" 77 | match: 78 | severity: ticket 79 | - match: 80 | severity: constant 81 | group_interval: 1m 82 | repeat_interval: 1m 83 | routes: 84 | - match: 85 | deployment: prod 86 | receiver: "verify-prod-cronitor" 87 | - match: 88 | deployment: integration 89 | receiver: "verify-integration-cronitor" 90 | - match: 91 | deployment: staging 92 | receiver: "verify-staging-cronitor" 93 | 94 | receivers: 95 | - name: "re-observe-pagerduty" 96 | pagerduty_configs: 97 | - service_key: "${observe_pagerduty_key}" 98 | - name: "dgu-pagerduty" 99 | pagerduty_configs: 100 | - service_key: "${dgu_pagerduty_key}" 101 | - name: "govuk-pagerduty" 102 | pagerduty_configs: 103 | - service_key: "${govuk_pagerduty_key}" 104 | - name: "notify-tickets" 105 | email_configs: 106 | - to: "${notify_zendesk}" 107 | - name: "notify-p2" 108 | pagerduty_configs: 109 | - service_key: "${notify_p2_pagerduty_key}" 110 | - name: "observe-cronitor" 111 | webhook_configs: 112 | - send_resolved: false 113 | url: "${observe_cronitor}" 114 | - name: "verify-prod-cronitor" 115 | 
webhook_configs: 116 | - send_resolved: false 117 | url: "${verify_prod_cronitor}" 118 | - name: "verify-integration-cronitor" 119 | webhook_configs: 120 | - send_resolved: false 121 | url: "${verify_integration_cronitor}" 122 | - name: "verify-staging-cronitor" 123 | webhook_configs: 124 | - send_resolved: false 125 | url: "${verify_staging_cronitor}" 126 | - name: "verify-2ndline-slack" 127 | slack_configs: &verify-2ndline-slack-configs 128 | - send_resolved: true 129 | channel: '#verify-2ndline' 130 | icon_emoji: ':verify-shield:' 131 | username: alertmanager 132 | - name: "autom8-tickets" 133 | email_configs: 134 | - to: "${autom8_recipient_email}" 135 | slack_configs: 136 | - send_resolved: true 137 | channel: '#re-autom8-alerts' 138 | icon_emoji: ':verify-shield:' 139 | username: alertmanager 140 | color: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}warning{{ else }}danger{{ end }}{{ else }}good{{ end }}' 141 | pretext: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}:warning:{{ else }}:rotating_light:{{ end }}{{ else }}:green_tick:{{ end }} {{ .CommonLabels.alertname }}:{{ .CommonAnnotations.summary }}' 142 | text: |- 143 | *Description:* {{ .CommonAnnotations.message }} 144 | {{ range .Alerts }} 145 | *Details:* 146 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 147 | {{ end }} 148 | {{ end }} 149 | short_fields: true 150 | fields: 151 | - title: Product 152 | value: '{{ .CommonLabels.product }}' 153 | - title: Deployment 154 | value: '{{ .CommonLabels.deployment }}' 155 | actions: 156 | - type: button 157 | text: Runbook 158 | url: '{{ .CommonAnnotations.runbook_url }}' 159 | - name: "dcs-slack" 160 | slack_configs: 161 | - send_resolved: true 162 | channel: '#di-dcs-2ndline' 163 | icon_emoji: ':gsp:' 164 | username: alertmanager 165 | color: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}warning{{ else }}danger{{ end }}{{ else }}good{{ end }}' 166 | pretext: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}:warning:{{ else }}:rotating_light:{{ end }}{{ else }}:green_tick:{{ end }} {{ .CommonLabels.alertname }}:{{ .CommonAnnotations.summary }}' 167 | text: |- 168 | *Description:* {{ .CommonAnnotations.message }} 169 | {{ range .Alerts }} 170 | *Details:* 171 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 172 | {{ end }} 173 | {{ end }} 174 | short_fields: true 175 | fields: 176 | - title: Product 177 | value: '{{ .CommonLabels.product }}' 178 | - title: Namespace 179 | value: '{{ .CommonLabels.namespace }}' 180 | - title: | 181 | {{- if .CommonLabels.job_name -}} 182 | Job 183 | {{- else if .CommonLabels.deployment -}} 184 | Deployment 185 | {{- else if match "^KubePod" .CommonLabels.alertname -}} 186 | Pod 187 | {{- end -}} 188 | value: | 189 | {{- if .CommonLabels.job_name -}} 190 | {{ .CommonLabels.job_name }} 191 | {{- else if .CommonLabels.deployment -}} 192 | {{ .CommonLabels.deployment }} 193 | {{- else if match "^KubePod" .CommonLabels.alertname -}} 194 | {{ .CommonLabels.pod }} 195 | {{- end -}} 196 | actions: 197 | - type: button 198 | text: Runbook 199 | url: '{{ .CommonAnnotations.runbook_url }}' 200 | - name: "dcs-p2" 201 | pagerduty_configs: 202 | - service_key: "${dcs_p2_pagerduty_key}" 203 | - name: "verify-p1" 204 | pagerduty_configs: 205 | - service_key: "${verify_p1_pagerduty_key}" 206 | slack_configs: *verify-2ndline-slack-configs 207 | - name: "verify-p2" 208 | pagerduty_configs: 209 | - service_key: 
"${verify_p2_pagerduty_key}" 210 | slack_configs: *verify-2ndline-slack-configs 211 | - name: "verify-p3" 212 | slack_configs: *verify-2ndline-slack-configs 213 | - name: "dev-null" 214 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/templates/default.tmpl: -------------------------------------------------------------------------------- 1 | {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} 2 | 3 | {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} 4 | 5 | {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} 6 | {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} 7 | {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }} 8 | {{ define "slack.default.footer" }}{{ end }} 9 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | pass = { 9 | source = "camptocamp/pass" 10 | } 11 | template = { 12 | source = "hashicorp/template" 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /terraform/modules/app-ecs-albs/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## Module: app-ecs-albs 3 | * 4 | * Load balancer for Prometheus 5 | * 6 | */ 7 | 8 | variable "aws_region" { 9 | type = string 10 | description = "AWS region" 11 | } 12 | 13 | variable "remote_state_bucket" { 14 | type = string 15 | description = "S3 bucket we store our terraform state in" 16 | } 17 | 18 | variable "environment" { 19 | type = string 20 | description = "Unique name for this collection of resources" 21 | } 22 | 23 | variable "zone_id" { 24 | type = string 25 | description = "Route 53 zone ID for registering public DNS records" 26 | } 27 | 28 | variable "subnets" { 29 | type = list(string) 30 | description = "Subnets to attach load balancers to" 31 | } 32 | 33 | variable "prometheus_count" { 34 | type = string 35 | description = "Number of prometheus instances to create listener rules and target groups for" 36 | default = "3" 37 | } 38 | 39 | # locals 40 | # -------------------------------------------------------------- 41 | 42 | locals { 43 | default_tags = { 44 | Terraform = "true" 45 | Project = "app-ecs-albs" 46 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 47 | Environment = var.environment 48 | } 49 | 50 | prom_records_count = var.prometheus_count 51 | 52 | # data.aws_route_53.XXX.name has a trailing dot which we remove with replace() to make ACM happy 53 | subdomain = replace(data.aws_route53_zone.public_zone.name, "/\\.$/", "") 54 | vpc_id = data.aws_subnet.first_subnet.vpc_id 55 | } 56 | 57 | ## Data sources 58 | 59 | data "terraform_remote_state" "infra_networking" { 60 | backend = "s3" 61 | 62 | config = { 63 | bucket = var.remote_state_bucket 64 | key = "infra-networking-modular.tfstate" 65 | region = var.aws_region 
66 | } 67 | } 68 | 69 | data "terraform_remote_state" "infra_security_groups" { 70 | backend = "s3" 71 | 72 | config = { 73 | bucket = var.remote_state_bucket 74 | key = "infra-security-groups-modular.tfstate" 75 | region = var.aws_region 76 | } 77 | } 78 | 79 | data "aws_route53_zone" "public_zone" { 80 | zone_id = var.zone_id 81 | } 82 | 83 | data "aws_subnet" "first_subnet" { 84 | id = var.subnets[0] 85 | } 86 | 87 | ###################################################################### 88 | # ----- prometheus public ALB ------- 89 | ###################################################################### 90 | 91 | # AWS should manage the certificate renewal automatically 92 | # https://docs.aws.amazon.com/acm/latest/userguide/managed-renewal.html 93 | # If this fails, AWS will email the addresses associated with the AWS account 94 | resource "aws_acm_certificate" "prometheus_cert" { 95 | domain_name = "prom.${local.subdomain}" 96 | validation_method = "DNS" 97 | 98 | subject_alternative_names = aws_route53_record.prom_alias.*.fqdn 99 | 100 | lifecycle { 101 | # We can't destroy a certificate that's in use, and we can't stop 102 | # using it until the new one is ready. Hence 103 | # create_before_destroy here. 104 | create_before_destroy = true 105 | } 106 | } 107 | 108 | resource "aws_route53_record" "prometheus_cert_validation" { 109 | for_each = { 110 | for dvo in aws_acm_certificate.prometheus_cert.domain_validation_options : dvo.domain_name => { 111 | name = dvo.resource_record_name 112 | record = dvo.resource_record_value 113 | type = dvo.resource_record_type 114 | } 115 | } 116 | 117 | name = each.value.name 118 | records = [each.value.record] 119 | type = each.value.type 120 | zone_id = var.zone_id 121 | ttl = 60 122 | 123 | allow_overwrite = true 124 | 125 | depends_on = [aws_acm_certificate.prometheus_cert] 126 | } 127 | 128 | resource "aws_acm_certificate_validation" "prometheus_cert" { 129 | certificate_arn = aws_acm_certificate.prometheus_cert.arn 130 | validation_record_fqdns = [for record in aws_route53_record.prometheus_cert_validation : record.fqdn] 131 | } 132 | 133 | resource "aws_route53_record" "prom_alias" { 134 | count = local.prom_records_count 135 | 136 | zone_id = var.zone_id 137 | name = "prom-${count.index + 1}" 138 | type = "A" 139 | 140 | alias { 141 | name = aws_lb.prometheus_alb.dns_name 142 | zone_id = aws_lb.prometheus_alb.zone_id 143 | evaluate_target_health = false 144 | } 145 | } 146 | 147 | resource "aws_lb" "prometheus_alb" { 148 | name = "${var.environment}-prometheus-alb" 149 | internal = false 150 | load_balancer_type = "application" 151 | 152 | security_groups = [data.terraform_remote_state.infra_security_groups.outputs.prometheus_alb_sg_id] 153 | 154 | subnets = var.subnets 155 | 156 | tags = merge( 157 | local.default_tags, 158 | { 159 | Name = "${var.environment}-prometheus-alb" 160 | Service = "observe-prometheus" 161 | }, 162 | ) 163 | } 164 | 165 | resource "aws_lb_listener" "prometheus_listener_http" { 166 | load_balancer_arn = aws_lb.prometheus_alb.arn 167 | port = "80" 168 | protocol = "HTTP" 169 | 170 | default_action { 171 | type = "redirect" 172 | 173 | redirect { 174 | port = "443" 175 | protocol = "HTTPS" 176 | status_code = "HTTP_301" 177 | } 178 | } 179 | } 180 | 181 | resource "aws_lb_listener" "prometheus_listener_https" { 182 | load_balancer_arn = aws_lb.prometheus_alb.arn 183 | port = "443" 184 | protocol = "HTTPS" 185 | ssl_policy = "ELBSecurityPolicy-TLS-1-2-2017-01" 186 | certificate_arn =
aws_acm_certificate_validation.prometheus_cert.certificate_arn 187 | 188 | default_action { 189 | type = "fixed-response" 190 | 191 | fixed_response { 192 | content_type = "text/plain" 193 | message_body = "Not found" 194 | status_code = "404" 195 | } 196 | } 197 | } 198 | 199 | resource "aws_lb_listener_rule" "prom_listener_https" { 200 | count = var.prometheus_count 201 | 202 | listener_arn = aws_lb_listener.prometheus_listener_https.arn 203 | priority = 100 + count.index 204 | 205 | action { 206 | type = "forward" 207 | target_group_arn = element(aws_lb_target_group.prometheus_tg.*.arn, count.index) 208 | } 209 | 210 | condition { 211 | host_header { 212 | values = ["prom-${count.index + 1}.*"] 213 | } 214 | } 215 | } 216 | 217 | resource "aws_lb_target_group" "prometheus_tg" { 218 | count = var.prometheus_count 219 | 220 | name = "${var.environment}-prom-${count.index + 1}-tg" 221 | port = 80 222 | protocol = "HTTP" 223 | vpc_id = local.vpc_id 224 | deregistration_delay = 30 225 | 226 | health_check { 227 | interval = "10" 228 | path = "/health" # static health check on nginx auth proxy 229 | matcher = "200" 230 | protocol = "HTTP" 231 | healthy_threshold = 2 232 | unhealthy_threshold = 2 233 | timeout = "5" 234 | } 235 | } 236 | 237 | ## Outputs 238 | 239 | output "prom_public_record_fqdns" { 240 | value = aws_route53_record.prom_alias.*.fqdn 241 | description = "Prometheus public DNS FQDNs" 242 | } 243 | 244 | output "prometheus_target_group_ids" { 245 | value = aws_lb_target_group.prometheus_tg.*.arn 246 | description = "Prometheus target group IDs" 247 | } 248 | -------------------------------------------------------------------------------- /terraform/modules/app-ecs-albs/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/common/ami/main.tf: -------------------------------------------------------------------------------- 1 | ## Variables 2 | 3 | locals { 4 | canonical_account_id = "099720109477" 5 | } 6 | 7 | ## Data sources 8 | 9 | data "aws_ami" "ubuntu_focal" { 10 | most_recent = true 11 | 12 | filter { 13 | name = "name" 14 | values = ["ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*"] 15 | } 16 | 17 | filter { 18 | name = "architecture" 19 | values = ["x86_64"] 20 | } 21 | 22 | filter { 23 | name = "virtualization-type" 24 | values = ["hvm"] 25 | } 26 | 27 | owners = [local.canonical_account_id] 28 | } 29 | 30 | ## Outputs 31 | 32 | output "ubuntu_focal_ami_id" { 33 | value = data.aws_ami.ubuntu_focal.id 34 | } 35 | 36 | -------------------------------------------------------------------------------- /terraform/modules/common/ami/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/infra-networking/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## module: infra-networking 3 | * 4 | * Terraform module to deploy the networking required for a VPC and 5 | * related services. 
You will often have multiple VPCs in an account 6 | * 7 | */ 8 | 9 | variable "aws_region" { 10 | type = string 11 | description = "AWS region" 12 | default = "eu-west-1" 13 | } 14 | 15 | variable "environment" { 16 | type = string 17 | description = "Unique name for this collection of resources" 18 | } 19 | 20 | variable "prometheus_subdomain" { 21 | type = string 22 | description = "Subdomain for prometheus" 23 | default = "monitoring" 24 | } 25 | 26 | # locals 27 | # -------------------------------------------------------------- 28 | 29 | locals { 30 | default_tags = { 31 | Terraform = "true" 32 | Project = "infra-networking" 33 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 34 | Environment = var.environment 35 | } 36 | 37 | subdomain_name = "${var.prometheus_subdomain}.gds-reliability.engineering" 38 | private_subdomain_name = "${var.environment}.monitoring.private" 39 | } 40 | 41 | ## Data sources 42 | 43 | data "aws_availability_zones" "available" {} 44 | 45 | ## Resources 46 | 47 | module "vpc" { 48 | source = "terraform-aws-modules/vpc/aws" 49 | version = "3.5.0" 50 | 51 | name = "observe-${var.environment}" 52 | cidr = "10.0.0.0/16" 53 | 54 | # subnets assumes 3 AZs although 3AZs are not implemented elsewhere 55 | azs = data.aws_availability_zones.available.names 56 | private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] 57 | public_subnets = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"] 58 | 59 | create_database_subnet_group = false 60 | 61 | enable_nat_gateway = true 62 | single_nat_gateway = false 63 | 64 | enable_dns_hostnames = true 65 | enable_dns_support = true 66 | 67 | enable_dhcp_options = true 68 | dhcp_options_domain_name = local.private_subdomain_name 69 | 70 | # no `Name` tag unlike other resources as this is taken care of by the vpc module `name` property 71 | tags = local.default_tags 72 | } 73 | 74 | resource "aws_route53_zone" "subdomain" { 75 | name = local.subdomain_name 76 | } 77 | 78 | resource "aws_route53_zone" "private" { 79 | name = local.private_subdomain_name 80 | force_destroy = true 81 | vpc { 82 | vpc_id = module.vpc.vpc_id 83 | } 84 | } 85 | 86 | ## Outputs 87 | 88 | output "vpc_id" { 89 | value = module.vpc.vpc_id 90 | description = "VPC ID where the stack resources are created" 91 | } 92 | 93 | output "private_subnets" { 94 | value = module.vpc.private_subnets 95 | description = "List of private subnet IDs" 96 | } 97 | 98 | output "public_subnets" { 99 | value = module.vpc.public_subnets 100 | description = "List of public subnet IDs" 101 | } 102 | 103 | output "public_zone_id" { 104 | value = aws_route53_zone.subdomain.zone_id 105 | description = "Route 53 Zone ID for publicly visible zone" 106 | } 107 | 108 | output "public_subdomain" { 109 | value = aws_route53_zone.subdomain.name 110 | description = "This is the subdomain for root zone" 111 | } 112 | 113 | output "private_zone_id" { 114 | value = aws_route53_zone.private.zone_id 115 | description = "Route 53 Zone ID for the internal zone" 116 | } 117 | 118 | output "private_zone_name" { 119 | value = aws_route53_zone.private.name 120 | description = "Route 53 Zone name for the internal zone" 121 | } 122 | 123 | output "private_subnets_ips" { 124 | value = module.vpc.private_subnets_cidr_blocks 125 | description = "List of private subnet IPs" 126 | } 127 | 128 | output "nat_gateway" { 129 | value = module.vpc.nat_public_ips 130 | description = "List of nat gateway IP" 131 | } 132 | 133 | output "private_subdomain" { 134 | value = 
aws_route53_zone.private.name 135 | description = "This is the subdomain for private zone" 136 | } 137 | 138 | output "subnets_by_az" { 139 | value = zipmap( 140 | data.aws_availability_zones.available.names, 141 | module.vpc.private_subnets_cidr_blocks, 142 | ) 143 | 144 | description = "Map of availability zones to private subnets" 145 | } 146 | 147 | -------------------------------------------------------------------------------- /terraform/modules/infra-networking/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/infra-security-groups/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## module: infra-security-groups 3 | * 4 | * Central module to manage all security groups. 5 | * 6 | * This is done in a single module to reduce conflicts 7 | * and cascade issues. 8 | * 9 | */ 10 | 11 | variable "aws_region" { 12 | type = string 13 | description = "The AWS region to use." 14 | } 15 | 16 | variable "remote_state_bucket" { 17 | type = string 18 | description = "S3 bucket we store our terraform state in" 19 | } 20 | 21 | variable "environment" { 22 | type = string 23 | description = "Unique name for this collection of resources" 24 | } 25 | 26 | # locals 27 | # -------------------------------------------------------------- 28 | 29 | locals { 30 | default_tags = { 31 | Terraform = "true" 32 | Project = "infra-security-groups" 33 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 34 | Environment = var.environment 35 | } 36 | } 37 | 38 | # Resources 39 | # -------------------------------------------------------------- 40 | 41 | ## Data sources 42 | 43 | data "terraform_remote_state" "infra_networking" { 44 | backend = "s3" 45 | 46 | config = { 47 | bucket = var.remote_state_bucket 48 | key = "infra-networking-modular.tfstate" 49 | region = var.aws_region 50 | } 51 | } 52 | 53 | resource "aws_security_group" "prometheus_alb" { 54 | name = "${var.environment}-prometheus-alb" 55 | vpc_id = data.terraform_remote_state.infra_networking.outputs.vpc_id 56 | description = "Controls ingress and egress for prometheus ALB" 57 | 58 | tags = merge( 59 | local.default_tags, 60 | { 61 | Name = "prometheus-alb", 62 | Service = "observe-prometheus", 63 | }, 64 | ) 65 | } 66 | 67 | # We allow all IPs to access the ALB as Prometheus is fronted by an nginx which controls access to either approved IP 68 | # addresses, or users with basic auth creds 69 | resource "aws_security_group_rule" "ingress_from_public_http_to_prometheus_alb" { 70 | security_group_id = aws_security_group.prometheus_alb.id 71 | type = "ingress" 72 | from_port = 80 73 | to_port = 80 74 | protocol = "tcp" 75 | cidr_blocks = ["0.0.0.0/0"] 76 | } 77 | 78 | resource "aws_security_group_rule" "ingress_from_public_https_to_prometheus_alb" { 79 | security_group_id = aws_security_group.prometheus_alb.id 80 | type = "ingress" 81 | from_port = 443 82 | to_port = 443 83 | protocol = "tcp" 84 | cidr_blocks = ["0.0.0.0/0"] 85 | } 86 | 87 | resource "aws_security_group_rule" "egress_from_prometheus_alb_to_prometheus_ec2" { 88 | security_group_id = aws_security_group.prometheus_alb.id 89 | type = "egress" 90 | to_port = 80 91 | from_port = 80 92 | protocol = "tcp" 93 | source_security_group_id = 
aws_security_group.prometheus_ec2.id 94 | } 95 | 96 | resource "aws_security_group" "prometheus_ec2" { 97 | name = "${var.environment}-prometheus-ec2" 98 | vpc_id = data.terraform_remote_state.infra_networking.outputs.vpc_id 99 | description = "Controls ingress and egress for prometheus EC2 instances" 100 | 101 | tags = merge( 102 | local.default_tags, 103 | { 104 | Name = "prometheus-ec2", 105 | Service = "observe-prometheus", 106 | }, 107 | ) 108 | } 109 | 110 | resource "aws_security_group_rule" "ingress_from_prometheus_alb_to_prometheus_ec2" { 111 | security_group_id = aws_security_group.prometheus_ec2.id 112 | type = "ingress" 113 | to_port = 80 114 | from_port = 80 115 | protocol = "tcp" 116 | source_security_group_id = aws_security_group.prometheus_alb.id 117 | } 118 | 119 | resource "aws_security_group_rule" "ingress_from_prometheus_ec2_to_prometheus_ec2" { 120 | security_group_id = aws_security_group.prometheus_ec2.id 121 | type = "ingress" 122 | to_port = 9090 123 | from_port = 9090 124 | protocol = "tcp" 125 | source_security_group_id = aws_security_group.prometheus_ec2.id 126 | } 127 | 128 | resource "aws_security_group_rule" "ingress_from_prometheus_to_prometheus_node_exporter" { 129 | security_group_id = aws_security_group.prometheus_ec2.id 130 | type = "ingress" 131 | to_port = 9100 132 | from_port = 9100 133 | protocol = "tcp" 134 | source_security_group_id = aws_security_group.prometheus_ec2.id 135 | } 136 | 137 | # This rule allows all egress out of prometheus_ec2. This is for the following purposes: 138 | # - downloading packages from package repos 139 | # - calling AWS APIs such as SSM, S3 and EC2 140 | # - scraping alertmanager on port 9093 141 | # - sending alerts to alertmanager on port 9093 142 | # - scraping external targets that run on the PaaS 143 | # - scraping itself and other prometheis on port 9090 144 | # - scraping node exporters on port 9100 145 | resource "aws_security_group_rule" "egress_from_prometheus_ec2_to_all" { 146 | security_group_id = aws_security_group.prometheus_ec2.id 147 | type = "egress" 148 | to_port = 0 149 | from_port = 0 150 | protocol = "-1" 151 | cidr_blocks = ["0.0.0.0/0"] 152 | } 153 | 154 | ## Outputs 155 | 156 | output "prometheus_ec2_sg_id" { 157 | value = aws_security_group.prometheus_ec2.id 158 | description = "security group prometheus_ec2 ID" 159 | } 160 | 161 | output "prometheus_alb_sg_id" { 162 | value = aws_security_group.prometheus_alb.id 163 | description = "security group prometheus_alb ID" 164 | } 165 | -------------------------------------------------------------------------------- /terraform/modules/infra-security-groups/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Prometheus EC2 module 2 | 3 | There are two modules: 4 | 5 | - `prometheus`, which deploys prometheus to the target network. 6 | - `paas-config`, which contains configuration specific to our 7 | prometheus-for-paas deployment. 8 | 9 | We deploy using raw Terraform commands, scoped per environment.
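For orientation, the sketch below shows roughly how an environment project might wire the two modules together. It is illustrative only: the `paas-config` variable names are taken from that module's `variables.tf` (shown later in this dump), while the module source paths, bucket name, zone ID and the `private_ips` output reference are assumptions rather than the actual contents of the project files under `terraform/projects/prom-ec2/`.

```hcl
# Illustrative sketch only -- see terraform/projects/prom-ec2/* for the real wiring.
module "prometheus" {
  source      = "../../../modules/prom-ec2/prometheus" # illustrative relative path
  environment = "staging"
  # remaining module variables (AMI ID, subnets, instance size, ...) omitted here
}

module "paas_config" {
  source = "../../../modules/prom-ec2/paas-config" # illustrative relative path

  environment              = "staging"
  prometheus_config_bucket = "example-prometheus-config"                      # assumed bucket name
  alerts_path              = "../../../modules/prom-ec2/alerts-config/alerts/" # trailing slash expected by the module
  private_zone_id          = "Z0000000000EXAMPLE"                             # assumed Route 53 zone ID
  prom_private_ips         = module.prometheus.private_ips                    # assumed output name
}
```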
10 | 11 | ## Deploying 12 | 13 | To deploy (for example to staging): 14 | 15 | ```shell 16 | cd terraform/projects/prom-ec2/paas-staging/prometheus 17 | gds aws re-prom-staging -- terraform plan 18 | ``` 19 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/README.md: -------------------------------------------------------------------------------- 1 | # Example Alert 2 | 3 | Below is an example alert that you can copy and rewrite to create your 4 | own alert. [View the RE 5 | docs](https://reliability-engineering.cloudapps.digital/monitoring-alerts.html#create-and-edit-alerts-using-prometheus) 6 | for more information on what to consider when writing alerts. 7 | 8 | It alerts if the number of 5xx status codes exceeds 25% of total 9 | requests for 120 seconds (2 minutes) or more. 10 | 11 | It is broken down into: 12 | 13 | - `alert`: The alert name, in the format `TeamName_Problem`. 14 | - `expr`: The PromQL query that queries for the data, followed by `>= 15 | 0.25` defining the threshold of values. 16 | - `for`: Optional: The alert fires if the query is over threshold for 17 | this amount of time. 18 | - `labels`: 19 | - `product`: The team name or product for the team that this alert 20 | refers to. For example, "Observe" or "Prometheus". 21 | - `annotations`: 22 | - `summary`: Required: A summary of what the alert shows. 23 | - `description`: Required: A more detailed description of what the alert shows. 24 | - `dashboard_url`: Optional: A link to your team's dashboard (ie Grafana) to see 25 | trends for the alert. 26 | - `runbook`: Optional: A link to your team manual describing what to do about 27 | the alert. 28 | - `logs`: Optional: A link to your logs (ie Kibana URL). 29 | 30 | In the `annotations` section, `{{ $labels.app }}` refers to your team 31 | name, and `{{ $labels.job }}` refers to your app name. 
32 | 33 | ``` 34 | - alert: Example_AppRequestsExcess5xx 35 | expr: sum by(app) (rate(requests{org="example-paas-org", space="example-paas-space", status_range="5xx"}[5m])) / sum by(app) (rate(requests{org="example-paas-org", space="example-paas-space"}[5m])) >= 0.25 36 | for: 120s 37 | labels: 38 | product: "example-team-name" 39 | annotations: 40 | summary: "App {{ $labels.app }} has too many 5xx errors" 41 | description: "App {{ $labels.app }} has 5xx errors in excess of 25% of total requests" 42 | dashboard_url: https://grafana-paas.cloudapps.digital/d//?refresh=1m&orgId=1 43 | runbook: "https://re-team-manual.cloudapps.digital/" 44 | logs: "https://kibana.logit.io/s//app/kibana#/discover" 45 | ``` 46 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/data-gov-uk-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: DataGovUk 3 | rules: 4 | - alert: DataGovUk_HighCpuUsage 5 | expr: avg(cpu{job="metric-exporter"}) without (exported_instance) >= 80 6 | for: 5m 7 | labels: 8 | product: "data-gov-uk" 9 | annotations: 10 | summary: "App {{ $labels.app }} has high CPU usage" 11 | message: "Application {{ $labels.app }} has been using over 80% CPU (averaged over all instances) for 5 minutes or more" 12 | - alert: DataGovUk_HighDiskUsage 13 | expr: max(disk_utilization{job="metric-exporter"}) without (exported_instance) >= 80 14 | labels: 15 | product: "data-gov-uk" 16 | annotations: 17 | summary: "App {{ $labels.app }} has high disk usage" 18 | message: "Application {{ $labels.app }} has an instance which is using over 80% disk." 19 | - alert: DataGovUk_ElasticSearchIndexSizeIncrease 20 | expr: max without(instance, host, name, es_client_node, es_data_node, es_ingest_node, es_master_node) (delta(elasticsearch_indices_docs{space="data-gov-uk"}[30m])) >= 300 21 | for: 1m 22 | labels: 23 | product: "data-gov-uk" 24 | annotations: 25 | summary: "Index size of Elasticsearch for {{ $labels.job }} has increased significantly" 26 | message: "The index size of Elasticsearch for {{ $labels.job }} has increased by more than 300 documents in the last 30 minutes" 27 | runbook: https://docs.publishing.service.gov.uk/manual/data-gov-uk-troubleshooting.html#different-number-of-datasets-in-ckan-to-find 28 | - alert: DataGovUk_ElasticSearchIndexSizeDecrease 29 | expr: max without(instance, host, name, es_client_node, es_data_node, es_ingest_node, es_master_node) (delta(elasticsearch_indices_docs{space="data-gov-uk"}[30m])) <= -300 30 | for: 1m 31 | labels: 32 | product: "data-gov-uk" 33 | annotations: 34 | summary: "Index size of Elasticsearch for {{ $labels.job }} has decreased significantly" 35 | message: "The index size of Elasticsearch for {{ $labels.job }} has decreased by more than 300 documents in the last 30 minutes" 36 | runbook: https://docs.publishing.service.gov.uk/manual/data-gov-uk-troubleshooting.html#different-number-of-datasets-in-ckan-to-find 37 | - alert: DataGovUk_HighSidekiqEnqueuedJobs 38 | expr: sidekiq_enqueued_jobs{org="gds-data-gov-uk",job="publish-data-production-queue-monitor"} > 800 39 | for: 5m 40 | labels: 41 | product: "data-gov-uk" 42 | annotations: 43 | summary: "Sidekiq's enqueued jobs do not seem to be clearing for Publish Data on production" 44 | message: "Sidekiq has had more than 800 enqueued jobs for Publish Data on production for at least 5 minutes" 45 | runbook: 
https://docs.publishing.service.gov.uk/manual/data-gov-uk-monitoring.html#sidekiq-publish 46 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/doc-checking-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: DocChecking 3 | rules: 4 | - alert: AuditEventsNotProcessing 5 | annotations: 6 | message: >- 7 | The audit consumer should be writing audit events to the 8 | database. This hasn't happened in a while. 9 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/AuditEventsNotProcessing/ 10 | expr: | 11 | sum without(instance) (rate(audit_consumer_events_processing_attempts_total[5m])) 12 | - 13 | sum without(instance) (rate(audit_consumer_events_processing_failures_total[5m])) 14 | == 0 15 | for: 10m 16 | labels: 17 | product: doc-checking 18 | severity: p4 19 | - alert: AuditEventsFailedProcessing 20 | annotations: 21 | message: >- 22 | The audit consumer has a high error rate when attempting to 23 | write audit events to the database. Those events may have 24 | ended up on the dead letter queue. 25 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/AuditEventsFailedProcessing/ 26 | expr: | 27 | sum without(instance) (rate(audit_consumer_events_processing_failures_total[2m])) > 3 28 | for: 5m 29 | labels: 30 | product: doc-checking 31 | severity: p4 32 | - alert: AuditEventsOnTheDeadLetterQueue 33 | annotations: 34 | message: | 35 | There are unprocessed audit events on the dead letter queue. 36 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/AuditEventsOnTheDeadLetterQueue/ 37 | expr: | 38 | max without(instance) (audit_consumer_dead_letter_queue_approximate_messages) > 0 39 | for: 5m 40 | labels: 41 | product: doc-checking 42 | severity: p4 43 | - alert: RedisNotAvailable 44 | annotations: 45 | message: | 46 | Redis is not available for rate limiting and quota. 47 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/RedisNotAvailable/ 48 | expr: | 49 | (avg by (job) (dcs_dmz_proxy_using_redis_for_rate_limiting) != 1) or (avg by (job) (dcs_agents_using_redis_for_rate_limiting) != 1) 50 | for: 5m 51 | labels: 52 | product: doc-checking 53 | severity: p4 54 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/notify-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: GOVUK_Notify 3 | rules: 4 | - alert: GOVUK_Notify_Disk_75_percent_full 5 | expr: max(disk_utilization{space="production", organisation="govuk-notify"}) by (app, space) > 75 6 | for: 5m 7 | labels: 8 | product: "notify" 9 | severity: "ticket" 10 | annotations: 11 | message: "{{ $labels.space }}: disk usage for {{ $labels.app }} is over 75% full. You should redeploy the app to avoid running out of disk space" 12 | grafana: "https://grafana-paas.cloudapps.digital/d/_GlGBNbmk/notify-apps?orgId=2&var-space=production&var-app={{ $labels.app }}" 13 | - alert: GOVUK_Notify_Disk_95_percent_full 14 | expr: max(disk_utilization{space="production", organisation="govuk-notify", app!~"(.*conduit.*)|(.*exporter)"}) by (app, space) > 95 15 | for: 5m 16 | labels: 17 | product: "notify" 18 | severity: "p2" 19 | annotations: 20 | summary: "{{ $labels.space }}: disk usage for {{ $labels.app }} is over 95% full. 
You should redeploy the app to avoid running out of disk space" 21 | grafana: "https://grafana-paas.cloudapps.digital/d/_GlGBNbmk/notify-apps?orgId=2&var-space=production&var-app={{ $labels.app }}" 22 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/observe-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: RE_Observe 3 | rules: 4 | - alert: RE_Observe_Grafana_Down 5 | expr: up{job="grafana-paas"} == 0 6 | for: 5m 7 | labels: 8 | product: "prometheus" 9 | severity: "page" 10 | annotations: 11 | summary: "Prometheus is not able to scrape Grafana" 12 | message: "Prometheus has not successfully scraped {{ $labels.job }} in the last 5 minutes. https://grafana-paas.cloudapps.digital/ may be down." 13 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=()&_a=(columns:!(_source),index:'*-*',interval:h,query:(query_string:(query:'grafana-paas.cloudapps.digital%20AND%20NOT%20access.response_code:200')),sort:!('@timestamp',desc))" 14 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-grafana-down" 15 | 16 | - alert: RE_Observe_AlertManager_Below_Threshold 17 | expr: sum(up{job="alertmanager"}) <= 1 18 | for: 10s 19 | labels: 20 | product: "prometheus" 21 | severity: "page" 22 | annotations: 23 | summary: "There is one or fewer Alertmanagers that can be scraped" 24 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-alertmanager-below-threshold" 25 | 26 | - alert: RE_Observe_Prometheus_Below_Threshold 27 | expr: sum(up{job="prometheus"}) <= 1 28 | for: 10s 29 | labels: 30 | product: "prometheus" 31 | severity: "page" 32 | annotations: 33 | summary: "There is one or fewer Prometheis that can be scraped" 34 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 35 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-below-threshold" 36 | 37 | - alert: RE_Observe_Prometheus_AtLeastOneMissing 38 | expr: sum(up{job="prometheus"}) < 3 39 | for: 3m 40 | labels: 41 | product: "prometheus" 42 | severity: "ticket" 43 | annotations: 44 | summary: "At least one Prometheus can't be scraped" 45 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 46 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-at-least-one-missing" 47 | 48 | - alert: RE_Observe_PrometheusDiskPredictedToFill 49 | expr: | 50 | predict_linear( 51 | node_filesystem_avail{job="prometheus_node", mountpoint="/mnt"}[12h], 3 * 24 * 60 * 60 52 | ) <= 0 53 | and on(instance) 54 | (time() - node_creation_time > 12 * 60 * 60) 55 | labels: 56 | product: "prometheus" 57 | severity: "ticket" 58 | annotations: 59 | summary: "Instance {{ $labels.instance }} disk {{ $labels.mountpoint }} is predicted to fill in 72h" 60 | logs: 
"https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 61 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-disk-predicted-to-fill" 62 | 63 | - alert: RE_Observe_No_Paas_Targets 64 | expr: prometheus_sd_discovered_targets{config=~"paas-(london|ireland)-targets"} == 0 65 | for: 10m 66 | labels: 67 | product: "prometheus" 68 | severity: "page" 69 | annotations: 70 | summary: "No PaaS targets detected" 71 | message: "No PaaS file_sd targets were detected from the service broker. Is there a problem accessing the targets bucket?" 72 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 73 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-no-filesd-targets" 74 | 75 | - alert: RE_Observe_Prometheus_Over_Capacity 76 | expr: sum without(slice)(rate(prometheus_engine_query_duration_seconds_sum{job="prometheus"}[5m])) > 8 77 | for: 10s 78 | labels: 79 | product: "prometheus" 80 | severity: "page" 81 | annotations: 82 | summary: "Service is over capacity." 83 | message: "The service name is {{ $labels.job }}. The URL experiencing the issue is {{ $labels.instance }}." 84 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 85 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-over-capacity" 86 | 87 | - alert: RE_Observe_Prometheus_High_Load 88 | expr: sum without(slice)(rate(prometheus_engine_query_duration_seconds_sum{job="prometheus"}[2h])) > 4 89 | labels: 90 | product: "prometheus" 91 | severity: "ticket" 92 | annotations: 93 | summary: "Service is approaching capacity." 94 | message: "The service name is {{ $labels.job }}. The URL experiencing the issue is {{ $labels.instance }}." 
95 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 96 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-high-load" 97 | 98 | - alert: RE_Observe_Target_Down 99 | expr: up{} == 0 100 | for: 24h 101 | labels: 102 | product: "prometheus" 103 | severity: "ticket" 104 | annotations: 105 | summary: "{{ $labels.job }} target is down" 106 | message: "One of the {{ $labels.job }} targets has been down for 24 hours" 107 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-target-down" 108 | 109 | - alert: RE_Observe_No_Successful_Updates 110 | expr: sum(increase(observe_broker_http_requests_total{code="200", path="/update-targets", method="post"}[30m])) by (region) == 0 111 | for: 12h 112 | labels: 113 | product: "prometheus" 114 | severity: "ticket" 115 | annotations: 116 | summary: "No recent target updates in region '{{ $labels.region }}'" 117 | message: "Target update in region '{{ $labels.region }}' hasn't completed successfully in at least 12h" 118 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-no-successful-updates" 119 | 120 | - alert: AlwaysAlert 121 | annotations: 122 | message: | 123 | This is an alert meant to ensure that the entire alerting pipeline is functional. 124 | This alert is always firing, therefore it should always be firing in Alertmanager 125 | and always fire against a receiver. We use cronitor to alert us if this ever 126 | *doesn't* fire, because this indicates a problem with our alerting pipeline 127 | expr: vector(1) 128 | labels: 129 | product: "prometheus" 130 | severity: "constant" 131 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/main.tf: -------------------------------------------------------------------------------- 1 | data "template_file" "prometheus_config_template" { 2 | template = file("${path.module}/prometheus.conf.tpl") 3 | 4 | vars = { 5 | environment = var.environment 6 | } 7 | } 8 | 9 | locals { 10 | prometheus_config = yamldecode(data.template_file.prometheus_config_template.rendered) 11 | final_scrape_configs = concat(local.prometheus_config["scrape_configs"], var.extra_scrape_configs) 12 | final_prometheus_config = merge(local.prometheus_config, { "scrape_configs" = local.final_scrape_configs }) 13 | final_prometheus_config_yaml = yamlencode(local.final_prometheus_config) 14 | } 15 | 16 | resource "aws_route53_record" "prom_ec2_a_record" { 17 | count = 3 18 | 19 | zone_id = var.private_zone_id 20 | name = "prom-ec2-${count.index + 1}" 21 | type = "A" 22 | ttl = 300 23 | 24 | records = [var.prom_private_ips[count.index]] 25 | } 26 | 27 | resource "aws_s3_bucket_object" "prometheus_config" { 28 | bucket = var.prometheus_config_bucket 29 | key = "prometheus/prometheus.yml" 30 | content = local.final_prometheus_config_yaml 31 | etag = md5(local.final_prometheus_config_yaml) 32 | } 33 | 34 | resource "aws_s3_bucket_object" "alerts-config" { 35 | bucket = var.prometheus_config_bucket 36 | key = "prometheus/alerts/observe-alerts.yml" 37 | source = "${var.alerts_path}observe-alerts.yml" 38 | etag = filemd5("${var.alerts_path}observe-alerts.yml") 39 | } 40 | 41 | resource 
"aws_s3_bucket_object" "alerts-data-gov-uk-config" { 42 | bucket = var.prometheus_config_bucket 43 | key = "prometheus/alerts/data-gov-uk-alerts.yml" 44 | source = "${var.alerts_path}data-gov-uk-alerts.yml" 45 | etag = filemd5("${var.alerts_path}data-gov-uk-alerts.yml") 46 | } 47 | 48 | resource "aws_s3_bucket_object" "alerts-doc-checking-config" { 49 | bucket = var.prometheus_config_bucket 50 | key = "prometheus/alerts/doc-checking-alerts.yml" 51 | source = "${var.alerts_path}doc-checking-alerts.yml" 52 | etag = filemd5("${var.alerts_path}doc-checking-alerts.yml") 53 | } 54 | 55 | resource "aws_s3_bucket_object" "alerts-notify-config" { 56 | bucket = var.prometheus_config_bucket 57 | key = "prometheus/alerts/notify-alerts.yml" 58 | source = "${var.alerts_path}notify-alerts.yml" 59 | etag = filemd5("${var.alerts_path}notify-alerts.yml") 60 | } 61 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/outputs.tf: -------------------------------------------------------------------------------- 1 | output "prometheus_config_etag" { 2 | value = aws_s3_bucket_object.prometheus_config.etag 3 | } 4 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/prometheus.conf.tpl: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 30s 3 | evaluation_interval: 30s 4 | alerting: 5 | alertmanagers: 6 | - scheme: http 7 | dns_sd_configs: 8 | - names: 9 | - 'alertmanager.local.gds-reliability.engineering' 10 | type: 'A' 11 | port: 9093 12 | rule_files: 13 | - "/etc/prometheus/alerts/*" 14 | scrape_configs: 15 | - job_name: prometheus 16 | ec2_sd_configs: 17 | - region: eu-west-1 18 | port: 9090 19 | relabel_configs: 20 | - source_labels: ['__meta_ec2_tag_Environment'] 21 | regex: '${environment}' 22 | action: keep 23 | - source_labels: ['__meta_ec2_tag_Service'] 24 | regex: 'observe-prometheus' 25 | action: keep 26 | - source_labels: ['__meta_ec2_availability_zone'] 27 | target_label: availability_zone 28 | - source_labels: ['__meta_ec2_instance_id'] 29 | replacement: '$1:9090' 30 | target_label: instance 31 | - job_name: paas-ireland-targets 32 | scheme: http 33 | proxy_url: 'http://localhost:8080' 34 | file_sd_configs: 35 | - files: ['/etc/prometheus/ireland-targets/*.json'] 36 | refresh_interval: 30s 37 | relabel_configs: 38 | - target_label: region 39 | replacement: ireland 40 | - job_name: paas-london-targets 41 | scheme: http 42 | proxy_url: 'http://localhost:8080' 43 | file_sd_configs: 44 | - files: ['/etc/prometheus/london-targets/*.json'] 45 | refresh_interval: 30s 46 | relabel_configs: 47 | - target_label: region 48 | replacement: london 49 | - job_name: alertmanager 50 | dns_sd_configs: 51 | - names: 52 | - 'alertmanager.local.gds-reliability.engineering' 53 | type: 'A' 54 | port: 9093 55 | - job_name: prometheus_node 56 | ec2_sd_configs: 57 | - region: eu-west-1 58 | port: 9100 59 | relabel_configs: 60 | - source_labels: ['__meta_ec2_tag_Environment'] 61 | regex: '${environment}' 62 | action: keep 63 | - source_labels: ['__meta_ec2_tag_Service'] 64 | regex: 'observe-prometheus' 65 | action: keep 66 | - source_labels: ['__meta_ec2_availability_zone'] 67 | target_label: availability_zone 68 | - source_labels: ['__meta_ec2_instance_id'] 69 | replacement: '$1:9100' 70 | target_label: instance 71 | -------------------------------------------------------------------------------- 
/terraform/modules/prom-ec2/paas-config/variables.tf: -------------------------------------------------------------------------------- 1 | variable "environment" {} 2 | variable "prometheus_config_bucket" {} 3 | variable "alerts_path" {} 4 | variable "private_zone_id" {} 5 | 6 | variable "prom_private_ips" { 7 | type = list(string) 8 | } 9 | 10 | variable "extra_scrape_configs" { 11 | default = [] 12 | description = "List of scrape configs to append to the Prometheus config" 13 | } 14 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.13" 3 | required_providers { 4 | aws = { 5 | source = "hashicorp/aws" 6 | } 7 | template = { 8 | source = "hashicorp/template" 9 | } 10 | } 11 | } 12 | 13 | provider "template" { 14 | version = ">= 2" 15 | } 16 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/.ruby-version: -------------------------------------------------------------------------------- 1 | 2.6.1 2 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/cloud.conf: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | package_update: true 3 | package_upgrade: true 4 | packages: ['prometheus', 'prometheus-node-exporter', 'awscli', 'inotify-tools', 'nginx', 'jq'] 5 | 6 | write_files: 7 | - owner: root:root 8 | path: /etc/default/prometheus 9 | permissions: 0444 10 | content: 'ARGS="--storage.tsdb.path=\"/mnt/\" --web.external-url=${prom_external_url} --storage.tsdb.retention=60d --query.timeout=30s"' 11 | - owner: root:root 12 | path: /etc/cron.d/config_pull 13 | permissions: 0755 14 | content: | 15 | * * * * * root flock -w 30 /run/lock/prometheus-config-updates aws s3 sync s3://${config_bucket}/prometheus/ /etc/prometheus/ --region=${region} 16 | @reboot root /root/watch_prometheus_dir 17 | - owner: root:root 18 | path: /etc/cron.d/ireland_targets_pull 19 | permissions: 0755 20 | content: | 21 | # if targets bucket exists then sync it, otherwise this cron runs but has no effect 22 | * * * * * root [ "${ireland_targets_bucket}" != "" ] && aws s3 sync s3://${ireland_targets_bucket}/active/ /etc/prometheus/ireland-targets --region=${region} --delete 23 | - owner: root:root 24 | path: /etc/cron.d/london_targets_pull 25 | permissions: 0755 26 | content: | 27 | # if targets bucket exists then sync it, otherwise this cron runs but has no effect 28 | * * * * * root [ "${london_targets_bucket}" != "" ] && aws s3 sync s3://${london_targets_bucket}/active/ /etc/prometheus/london-targets --region=${region} --delete 29 | - owner: root:root 30 | path: /etc/cron.d/alerts_pull 31 | permissions: 0755 32 | content: | 33 | # if alerts bucket exists then sync it, otherwise this cron runs but has no effect 34 | * * * * * root [ "${alerts_bucket}" != "" ] && aws s3 sync s3://${alerts_bucket}/prometheus/alerts/ /etc/prometheus/alerts --region=${region} --delete 35 | - content: | 36 | echo 'Configuring prometheus EBS' 37 | vol="" 38 | while [ -z "$vol" ]; do 39 | # adapted from 40 | # https://medium.com/@moonape1226/mount-aws-ebs-on-ec2-automatically-with-cloud-init-e5e837e5438a 41 | # [Last accessed on 2020-04-02] 42 | vol=$(lsblk | grep -e disk | awk '{sub("G","",$4)} {if ($4+0 == ${data_volume_size}) print $1}') 43 | echo 
"still waiting for data volume ; sleeping 5" 44 | sleep 5 45 | done 46 | echo "found volume /dev/$vol" 47 | if [ -z "$(lsblk | grep "$vol" | awk '{print $7}')" ] ; then 48 | if [ -z "$(blkid /dev/$vol | grep ext4)" ] ; then 49 | echo "volume /dev/$vol is not formatted ; formatting" 50 | mkfs -F -t ext4 -L 'prometheus_disk' "/dev/$vol" 51 | else 52 | echo "volume /dev/$vol is already formatted" 53 | fi 54 | 55 | echo "volume /dev/$vol is not mounted ; mounting" 56 | mount "/dev/$vol" /mnt 57 | UUID=$(blkid /dev/$vol -s UUID -o value) 58 | if [ -z "$(grep $UUID /etc/fstab)" ] ; then 59 | echo "writing fstab entry" 60 | 61 | echo "UUID=$UUID /mnt ext4 defaults,nofail 0 2" >> /etc/fstab 62 | fi 63 | fi 64 | echo "ensuring fs block size matches volume block size" 65 | resize2fs "/dev/$vol" 66 | path: /root/manage_data_volume.sh 67 | permissions: 0755 68 | - content: | 69 | #!/bin/bash 70 | STATUS_JSON='/srv/prometheus-last-config.json' 71 | 72 | attempt_reload() { 73 | ( 74 | # take out lock to ensure updater doesn't switch the config between the time we 75 | # calculate NEW_HASH and prometheus reads it 76 | flock 321 77 | 78 | # why md5? because it should be the same as the s3 etag and so easy to check 79 | export NEW_HASH=$(md5sum /etc/prometheus/prometheus.yml | cut -d ' ' -f 1) 80 | if systemctl reload prometheus ; then 81 | jq -n '{last_successful_config: env.NEW_HASH, last_reload_successful: true}' > $STATUS_JSON 82 | else 83 | touch $STATUS_JSON 84 | jq '{last_successful_config: .last_successful_config, last_reload_successful: false, failed_config: env.NEW_HASH}' $STATUS_JSON > $STATUS_JSON 85 | fi 86 | 87 | ) 321>/run/lock/prometheus-config-updates 88 | } 89 | 90 | systemctl start prometheus # ensure prometheus is started before initial attempt_reload 91 | attempt_reload 92 | 93 | inotifywait -e modify,create,delete,move -m /etc/prometheus | 94 | while read -r directory events; do 95 | attempt_reload 96 | done 97 | path: /root/watch_prometheus_dir 98 | permissions: 0755 99 | - content: | 100 | #!/bin/bash 101 | curl -L -O https://artifacts.elastic.co/downloads/beats/filebeat/filebeat-6.4.2-amd64.deb && sudo dpkg -i filebeat-6.4.2-amd64.deb 102 | aws s3 sync s3://${config_bucket}/filebeat/ /etc/filebeat/ --region=${region} 103 | update-rc.d filebeat defaults 104 | update-rc.d filebeat enable 5 105 | path: /root/setup_filebeat.sh 106 | permissions: 0755 107 | - content: | 108 | server { 109 | listen 8080; 110 | 111 | location / { 112 | set $cleaned_header $arg_cf_app_instance; 113 | if ($arg_cf_app_instance ~* "^(.*)%3A(.*)$") { 114 | set $cleaned_header $1:$2; 115 | } 116 | proxy_http_version 1.1; 117 | proxy_pass https://$host$uri; 118 | proxy_ssl_server_name on; 119 | proxy_set_header Connection ""; 120 | proxy_set_header X-CF-APP-INSTANCE $cleaned_header; 121 | proxy_set_header XX-CF-APP-INSTANCE $cleaned_header; 122 | proxy_set_header Authorization "Bearer $arg_cf_app_guid"; 123 | } 124 | 125 | location /health { 126 | return 200 "Static health check"; 127 | } 128 | 129 | resolver 10.0.0.2 valid=10s; 130 | } 131 | path: /etc/nginx/sites-enabled/paas-proxy 132 | permissions: 0644 133 | - content: | 134 | ${prometheus_htpasswd} 135 | path: /etc/nginx/conf.d/.htpasswd 136 | owner: www-data:www-data 137 | permissions: 0600 138 | # the package-provided default server conflicts with auth-proxy 139 | # below and causes package installation to fail because of a 140 | # duplicate default_server on port 80. 
So we wipe the default 141 | # server (and then remove it in runcmd at the bottom) 142 | - content: "" 143 | path: /etc/nginx/sites-enabled/default 144 | - content: | 145 | server { 146 | listen 80 default_server; 147 | 148 | location /health { 149 | # This location is not protected by basic auth because of 150 | # https://stackoverflow.com/questions/40447376/auth-basic-within-location-block-doesnt-work-when-return-is-specified 151 | return 200 "Static health check"; 152 | } 153 | 154 | location = /last-config { 155 | default_type application/json; 156 | alias /srv/prometheus-last-config.json; 157 | } 158 | 159 | location / { 160 | proxy_pass http://localhost:9090; 161 | proxy_set_header X-Real-IP $remote_addr; 162 | proxy_set_header Host $host; 163 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 164 | } 165 | 166 | satisfy any; 167 | auth_basic "Prometheus"; 168 | auth_basic_user_file /etc/nginx/conf.d/.htpasswd; 169 | 170 | real_ip_header X-Forwarded-For; 171 | set_real_ip_from 10.0.0.0/8; 172 | set_real_ip_from 127.0.0.1/32; 173 | ${allowed_cidrs} 174 | deny all; 175 | } 176 | path: /etc/nginx/sites-enabled/auth-proxy 177 | 178 | runcmd: 179 | - rm /etc/nginx/sites-enabled/default 180 | - "if [ -n '${logstash_host}' ]; then /root/setup_filebeat.sh; fi" 181 | - [bash, -c, "/root/manage_data_volume.sh"] 182 | - [bash, -c, "chown -R prometheus /mnt/"] 183 | - [bash, -c, "echo \"node_creation_time `date +%s`\" > /var/lib/prometheus/node-exporter/node-creation-time.prom"] 184 | - [bash, -c, "rm /etc/resolv.conf && sed -e 's/ trust-ad//' < /run/systemd/resolve/stub-resolv.conf > /etc/resolv.conf"] 185 | - [reboot] 186 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/filebeat.yml.tpl: -------------------------------------------------------------------------------- 1 | filebeat.inputs: 2 | - type: log 3 | enabled: true 4 | paths: 5 | - /var/log/syslog 6 | 7 | output.logstash: 8 | hosts: ["${logstash_host}"] 9 | loadbalance: true 10 | ssl.enabled: true 11 | 12 | tags: ["prometheus", "${environment}"] 13 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/iam.tf: -------------------------------------------------------------------------------- 1 | #Prepare to attach role to instance 2 | resource "aws_iam_instance_profile" "prometheus_instance_profile" { 3 | name = "prometheus_${var.environment}_config_reader_profile" 4 | role = aws_iam_role.prometheus_role.name 5 | } 6 | 7 | #Create role 8 | resource "aws_iam_role" "prometheus_role" { 9 | name = "prometheus_profile_${var.environment}" 10 | 11 | assume_role_policy = data.aws_iam_policy_document.prometheus_assume_role_policy.json 12 | 13 | tags = merge(local.default_tags, { 14 | Name = "${var.environment}-prometheus" 15 | }) 16 | } 17 | 18 | #Create permission to assume role 19 | data "aws_iam_policy_document" "prometheus_assume_role_policy" { 20 | statement { 21 | actions = ["sts:AssumeRole"] 22 | 23 | principals { 24 | type = "Service" 25 | identifiers = ["ec2.amazonaws.com"] 26 | } 27 | } 28 | } 29 | 30 | #Define the policy to attach to the role 31 | resource "aws_iam_policy" "prometheus_instance_profile" { 32 | name = "prometheus_instance_profile_${var.environment}" 33 | path = "/" 34 | description = "This is the main policy; it grants read access to the config bucket and EC2 describe permissions" 35 | 36 | policy = data.aws_iam_policy_document.instance_role_policy.json 37 | } 38 | 39 | #define IAM policy 
documentation 40 | data "aws_iam_policy_document" "instance_role_policy" { 41 | statement { 42 | sid = "ec2Policy" 43 | actions = ["ec2:Describe*"] 44 | resources = ["*"] 45 | } 46 | 47 | statement { 48 | sid = "s3Bucket" 49 | 50 | actions = [ 51 | "s3:Get*", 52 | "s3:ListBucket", 53 | ] 54 | 55 | resources = [ 56 | "arn:aws:s3:::${aws_s3_bucket.prometheus_config.id}/*", 57 | "arn:aws:s3:::${aws_s3_bucket.prometheus_config.id}", 58 | ] 59 | } 60 | } 61 | 62 | #Attach policy to role 63 | resource "aws_iam_role_policy_attachment" "iam_policy" { 64 | role = aws_iam_role.prometheus_role.name 65 | policy_arn = aws_iam_policy.prometheus_instance_profile.arn 66 | } 67 | 68 | resource "aws_iam_role_policy_attachment" "session_manager_access" { 69 | role = aws_iam_role.prometheus_role.name 70 | policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM" 71 | } 72 | 73 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/main.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | filebeat_count = var.logstash_host != "" ? 1 : 0 3 | default_tags = { 4 | ManagedBy = "terraform" 5 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 6 | Environment = var.environment 7 | Service = "observe-prometheus" 8 | } 9 | } 10 | 11 | resource "aws_key_pair" "ssh_key" { 12 | count = var.enable_ssh == true ? 1 : 0 13 | key_name = "${var.environment}-prom-key" 14 | public_key = file("~/.ssh/id_rsa.pub") 15 | } 16 | 17 | resource "aws_instance" "prometheus" { 18 | count = length(keys(var.availability_zones)) 19 | 20 | ami = var.ami_id 21 | instance_type = var.instance_size 22 | user_data = data.template_file.user_data_script[count.index].rendered 23 | iam_instance_profile = aws_iam_instance_profile.prometheus_instance_profile.id 24 | subnet_id = var.subnet_ids[count.index] 25 | 26 | associate_public_ip_address = var.enable_ssh 27 | 28 | key_name = var.enable_ssh ? 
format("%s-prom-key", var.environment) : "" 29 | 30 | vpc_security_group_ids = var.vpc_security_groups 31 | 32 | tags = merge(local.default_tags, { 33 | Name = "paas-${var.environment}-prometheus-${element(keys(var.availability_zones), count.index)}" 34 | }) 35 | } 36 | 37 | resource "aws_volume_attachment" "attach-prometheus-disk" { 38 | count = length(keys(var.availability_zones)) 39 | 40 | device_name = var.device_mount_path 41 | volume_id = aws_ebs_volume.prometheus-disk[count.index].id 42 | instance_id = aws_instance.prometheus[count.index].id 43 | 44 | # Required to work around a bug in terraform https://github.com/hashicorp/terraform/issues/2957 45 | # terraform tries to destroy the attachment before stopping/destroying the instance 46 | skip_destroy = true 47 | } 48 | 49 | resource "aws_ebs_volume" "prometheus-disk" { 50 | count = length(keys(var.availability_zones)) 51 | 52 | availability_zone = element(keys(var.availability_zones), count.index) 53 | size = var.data_volume_size 54 | 55 | tags = merge(local.default_tags, { 56 | Name = "prometheus-disk" 57 | }) 58 | } 59 | 60 | data "template_file" "user_data_script" { 61 | count = length(keys(var.availability_zones)) 62 | 63 | template = file("${path.module}/cloud.conf") 64 | 65 | vars = { 66 | config_bucket = aws_s3_bucket.prometheus_config.id 67 | region = var.region 68 | ireland_targets_bucket = aws_s3_bucket.prometheus_targets.id 69 | london_targets_bucket = aws_s3_bucket.prometheus_london_targets.id 70 | alerts_bucket = aws_s3_bucket.prometheus_config.id 71 | prom_external_url = "https://${var.prometheus_public_fqdns[count.index]}" 72 | logstash_host = var.logstash_host 73 | prometheus_htpasswd = var.prometheus_htpasswd 74 | allowed_cidrs = join("\n ", formatlist("allow %s;", var.allowed_cidrs)) 75 | data_volume_size = var.data_volume_size 76 | } 77 | } 78 | 79 | resource "aws_s3_bucket" "prometheus_config" { 80 | bucket = var.config_bucket 81 | acl = "private" 82 | force_destroy = true 83 | 84 | versioning { 85 | enabled = true 86 | } 87 | 88 | tags = merge(local.default_tags, { 89 | Name = "${var.environment}-prometheus-config" 90 | }) 91 | } 92 | 93 | data "template_file" "filebeat_conf" { 94 | count = local.filebeat_count 95 | template = file("${path.module}/filebeat.yml.tpl") 96 | 97 | vars = { 98 | logstash_host = var.logstash_host 99 | environment = var.environment 100 | } 101 | } 102 | 103 | resource "aws_s3_bucket_object" "filebeat" { 104 | count = local.filebeat_count 105 | bucket = var.config_bucket 106 | key = "filebeat/filebeat.yml" 107 | content = data.template_file.filebeat_conf[0].rendered 108 | } 109 | 110 | resource "aws_lb_target_group_attachment" "prom_target_group_attachment" { 111 | count = length(var.prometheus_target_group_arns) 112 | target_group_arn = var.prometheus_target_group_arns[count.index] 113 | target_id = aws_instance.prometheus[count.index].id 114 | port = 80 115 | } 116 | 117 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/output.tf: -------------------------------------------------------------------------------- 1 | output "public_ip_address" { 2 | value = aws_instance.prometheus.*.public_ip 3 | } 4 | 5 | output "private_ip_addresses" { 6 | value = aws_instance.prometheus.*.private_ip 7 | } 8 | 9 | output "prometheus_instance_id" { 10 | value = aws_instance.prometheus.*.id 11 | } 12 | 13 | output "prometheus_private_dns" { 14 | value = aws_instance.prometheus.*.private_dns 15 | } 16 | 17 | output 
"prometheus_public_dns" { 18 | value = aws_instance.prometheus.*.public_dns 19 | } 20 | 21 | output "s3_config_bucket" { 22 | value = aws_s3_bucket.prometheus_config.id 23 | } 24 | 25 | output "ec2_instance_profile_name" { 26 | value = aws_iam_instance_profile.prometheus_instance_profile.name 27 | } 28 | 29 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/targets.tf: -------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "prometheus_targets" { 2 | bucket = "govukobserve-targets-${var.environment}" 3 | acl = "private" 4 | force_destroy = true 5 | 6 | versioning { 7 | enabled = true 8 | } 9 | 10 | tags = merge(local.default_tags, { 11 | Name = "${var.environment}-ireland-targets" 12 | }) 13 | } 14 | 15 | resource "aws_iam_user" "targets_writer" { 16 | name = "targets-writer" 17 | path = "/${var.environment}/" 18 | 19 | tags = merge(local.default_tags, { 20 | Name = "${var.environment}-ireland-targets-writer" 21 | }) 22 | } 23 | 24 | resource "aws_iam_user_policy" "writer_has_full_access_to_targets_bucket" { 25 | name = "targets_bucket_full_access" 26 | user = aws_iam_user.targets_writer.name 27 | 28 | policy = < 0: 88 | print(index, expr) 89 | 90 | expr = rationalise_expr(expr, r'\{([^}]+)', "{%s}") # filters 91 | expr = rationalise_expr(expr, r'\[([^]]+)', "[%s]") # time ranges 92 | expr = rationalise_expr(expr, r'\$[_\w]+') # grafana vars 93 | expr = rationalise_expr(expr, r'\([a-z]+\)') # labels 94 | 95 | matched_words = re.findall(r'[^\d\W]+', expr) 96 | words.extend(matched_words) 97 | 98 | index += 1 99 | 100 | return words 101 | 102 | 103 | def check_metric_exists_for_word(words): 104 | index = 0 105 | missing_metric = [] 106 | print('**** Metrics evaluation:') 107 | for w in set(words).difference(IGNORE_WORDS): 108 | r_old = requests.get("{}/api/v1/query?query={}".format(os.environ.get("OLD_PROM_SERVER"), w)) 109 | resp_old = json.loads(r_old.content) 110 | 111 | if resp_old['status'] == 'success': 112 | print('{}: {}, {}'.format(index, len(resp_old['data']['result']) > 0, w)) 113 | 114 | # if old prometheus server doesn't have the metric then check if new prometheus server has the metric 115 | if not len(resp_old['data']['result']): 116 | r_new = requests.get("{}/api/v1/query?query={}".format(os.environ.get("NEW_PROM_SERVER"), w)) 117 | resp_new = json.loads(r_new.content) 118 | # only report it as missing if metrics are found on the new prometheus server 119 | if len(resp_new['data']['result']) > 0: 120 | missing_metric.append(w) 121 | else: 122 | print("{}: *** {} - {}".format(index, w, resp_old)) 123 | 124 | index += 1 125 | 126 | return missing_metric 127 | 128 | 129 | if __name__ == "__main__": 130 | try: 131 | token = os.environ['GRAFANA_TOKEN'] 132 | g = GrafanaAPI(BearerAuth(token), 'grafana-paas.cloudapps.digital', protocol='https') 133 | dashboards = g.get('/search?type=dash-db') 134 | exprs = [expr for dashboard in dashboards for expr in exprs_for_dashboard(dashboard)] 135 | exprs.sort(key=lambda e: e['dashboard_title'] + e['panel_title']) 136 | 137 | exprs.extend(exprs_for_alerts()) 138 | 139 | words = extract_words_from_expressions(exprs) 140 | 141 | missing_metric = check_metric_exists_for_word(words) 142 | 143 | print('**** Missing metrics:' if missing_metric else '**** No missing metrics') 144 | for m in missing_metric: 145 | print(m) 146 | 147 | except KeyError as e: 148 | print('Please set the %s environment variable' % e.args[0], 
file=sys.stderr) 149 | exit(1) 150 | -------------------------------------------------------------------------------- /tools/grafana_info/requirements.txt: -------------------------------------------------------------------------------- 1 | grafana-api==0.2.4 2 | simplejson==3.16.0 3 | pyyaml>=4.2b1 4 | -------------------------------------------------------------------------------- /tools/grafana_info/show_queries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from grafana_api.grafana_api import GrafanaAPI 4 | from bearer_auth import BearerAuth 5 | import os, sys 6 | 7 | 8 | def exprs_for_dashboard(dashboard): 9 | d = g.get('/dashboards/uid/%s' % dashboard['uid']) 10 | if 'panels' in d['dashboard']: 11 | panels = d['dashboard']['panels'] 12 | for panel in panels: 13 | targets = panel.get('targets',[]) 14 | for target in targets: 15 | if 'expr' in target: 16 | yield (target['expr'], dashboard['title'], panel['title']) 17 | else: 18 | print('***** no panels {}'.format(dashboard['title'])) 19 | 20 | 21 | if __name__ == "__main__": 22 | try: 23 | token = os.environ['GRAFANA_TOKEN'] 24 | g = GrafanaAPI(BearerAuth(token), 'grafana-paas.cloudapps.digital', protocol='https') 25 | dashboards = g.get('/search?type=dash-db') 26 | exprs = [expr for dashboard in dashboards for expr in exprs_for_dashboard(dashboard)] 27 | exprs.sort() 28 | for expr in exprs: 29 | print(expr) 30 | except KeyError as e: 31 | print('Please set the %s environment variable' % e.args[0], file=sys.stderr) 32 | exit(1) 33 | -------------------------------------------------------------------------------- /tools/terraform-format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eu 3 | 4 | for file in "$@"; do 5 | lint=$(terraform fmt -write=false -diff=true -list=true "${file}") 6 | failed="" 7 | 8 | if [ ! -z "${lint}" ]; then 9 | failed="yes" 10 | echo -e "Your code is not in a canonical format:\n" 11 | echo "${lint}" 12 | echo -e "To apply these changes do 'terraform fmt ${file}'\n" 13 | fi 14 | 15 | if [ "$failed" == "yes" ];then 16 | exit 1 17 | fi 18 | done 19 | --------------------------------------------------------------------------------
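A minimal local usage sketch for the tools above, assuming terraform is on the PATH and the Python dependencies from tools/grafana_info/requirements.txt are installed; the GRAFANA_TOKEN value and the two Prometheus URLs below are placeholders, not values from this repository:

# check that a Terraform file is in canonical format (prints a diff and exits non-zero if it is not)
tools/terraform-format.sh terraform/modules/prom-ec2/prometheus/main.tf

# list every Grafana panel expression, then report metrics referenced by dashboards and alerts
# that are absent from the old Prometheus server but present on the new one
pip install -r tools/grafana_info/requirements.txt
export GRAFANA_TOKEN=placeholder-grafana-api-token
export OLD_PROM_SERVER=https://prom-old.example.com
export NEW_PROM_SERVER=https://prom-new.example.com
python tools/grafana_info/show_queries.py
python tools/grafana_info/find_missing_metrics.py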