├── .ruby-version ├── .terraform-version ├── tools ├── grafana_info │ ├── .python-version │ ├── requirements.txt │ ├── bearer_auth.py │ ├── show_queries.py │ ├── README.md │ └── find_missing_metrics.py ├── check-alerting-rules.sh └── terraform-format.sh ├── terraform ├── modules │ ├── prom-ec2 │ │ ├── prometheus │ │ │ ├── .ruby-version │ │ │ ├── filebeat.yml.tpl │ │ │ ├── versions.tf │ │ │ ├── output.tf │ │ │ ├── variables.tf │ │ │ ├── iam.tf │ │ │ ├── targets.tf │ │ │ ├── main.tf │ │ │ └── cloud.conf │ │ ├── paas-config │ │ │ ├── outputs.tf │ │ │ ├── versions.tf │ │ │ ├── variables.tf │ │ │ ├── main.tf │ │ │ └── prometheus.conf.tpl │ │ ├── README.md │ │ └── alerts-config │ │ │ └── alerts │ │ │ ├── notify-alerts.yml │ │ │ ├── README.md │ │ │ ├── doc-checking-alerts.yml │ │ │ ├── data-gov-uk-alerts.yml │ │ │ └── observe-alerts.yml │ ├── common │ │ └── ami │ │ │ ├── versions.tf │ │ │ └── main.tf │ ├── app-ecs-albs │ │ ├── versions.tf │ │ └── main.tf │ ├── infra-networking │ │ ├── versions.tf │ │ └── main.tf │ ├── infra-security-groups │ │ ├── versions.tf │ │ └── main.tf │ └── alertmanager │ │ ├── versions.tf │ │ ├── service_discovery.tf │ │ ├── templates │ │ ├── default.tmpl │ │ └── alertmanager.tpl │ │ ├── task-definitions │ │ └── alertmanager.json │ │ ├── certificate.tf │ │ ├── main.tf │ │ ├── security-group.tf │ │ ├── alb.tf │ │ └── alertmanager-service.tf └── projects │ ├── app-ecs-albs-staging │ ├── versions.tf │ └── main.tf │ ├── app-ecs-albs-production │ ├── versions.tf │ └── main.tf │ ├── infra-networking-staging │ ├── versions.tf │ └── main.tf │ ├── infra-networking-production │ ├── versions.tf │ └── main.tf │ ├── infra-security-groups-staging │ ├── versions.tf │ └── main.tf │ ├── infra-security-groups-production │ ├── versions.tf │ └── main.tf │ ├── prom-ec2 │ ├── paas-production │ │ ├── versions.tf │ │ ├── main.tf │ │ └── extra-prometheus-scrape-configs.yml.tpl │ └── paas-staging │ │ ├── versions.tf │ │ └── main.tf │ ├── alertmanager-staging │ ├── versions.tf │ └── main.tf │ └── alertmanager-production │ ├── versions.tf │ └── main.tf ├── Brewfile ├── CODEOWNERS ├── ci ├── deploy.vars.default.yml ├── tasks │ ├── generate-prometheus-test-jq.yml │ ├── wait-ecs-services-stable.yml │ ├── deploy-project.yml │ └── http-ping.yml ├── images │ └── task │ │ ├── assume-role │ │ └── Dockerfile └── deploy.yml ├── .gitignore ├── .travis.yml ├── LICENCE ├── README.md └── logstash └── prometheus-for-paas-production.conf /.ruby-version: -------------------------------------------------------------------------------- 1 | 2.6.1 2 | -------------------------------------------------------------------------------- /.terraform-version: -------------------------------------------------------------------------------- 1 | 0.13.3 2 | -------------------------------------------------------------------------------- /tools/grafana_info/.python-version: -------------------------------------------------------------------------------- 1 | 3.6.6 2 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/.ruby-version: -------------------------------------------------------------------------------- 1 | 2.6.1 2 | -------------------------------------------------------------------------------- /Brewfile: -------------------------------------------------------------------------------- 1 | tap "alphagov/gds" 2 | 3 | brew "jq" 4 | brew "tfenv" 5 | brew "gds-cli" 6 | -------------------------------------------------------------------------------- 
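The pinned tool versions above are consumed by the usual version managers, and the Brewfile drives dependency installation. A minimal setup sketch, assuming Homebrew plus the tfenv it installs (pyenv is not listed in the Brewfile, so treat that step as optional):

```shell
# Install jq, tfenv and gds-cli as declared in the Brewfile
brew bundle

# tfenv picks up 0.13.3 from the .terraform-version file in the repo root
tfenv install

# Optional: pyenv reads tools/grafana_info/.python-version (3.6.6)
# when working on the Grafana scripts
pyenv install 3.6.6
```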
/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/en/articles/about-code-owners 2 | * @alphagov/re-autom8 3 | -------------------------------------------------------------------------------- /tools/grafana_info/requirements.txt: -------------------------------------------------------------------------------- 1 | grafana-api==0.2.4 2 | simplejson==3.16.0 3 | pyyaml>=4.2b1 4 | -------------------------------------------------------------------------------- /ci/deploy.vars.default.yml: -------------------------------------------------------------------------------- 1 | background-image: "" 2 | prometheus-aws-configuration-beta-branch: master 3 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/outputs.tf: -------------------------------------------------------------------------------- 1 | output "prometheus_config_etag" { 2 | value = aws_s3_bucket_object.prometheus_config.etag 3 | } 4 | -------------------------------------------------------------------------------- /terraform/modules/common/ami/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /tools/check-alerting-rules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Check prometheus alerting rules using promtool 4 | # 5 | set -e 6 | 7 | promtool check rules ./terraform/modules/prom-ec2/alerts-config/alerts/*.yml 8 | -------------------------------------------------------------------------------- /terraform/modules/app-ecs-albs/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/infra-networking/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/infra-security-groups/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/app-ecs-albs-staging/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/app-ecs-albs-production/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | 
required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/infra-networking-staging/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/infra-networking-production/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/infra-security-groups-staging/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/infra-security-groups-production/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /tools/grafana_info/bearer_auth.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | class BearerAuth(requests.auth.AuthBase): 4 | def __init__(self, token): 5 | self.token = token 6 | 7 | def __call__(self, r): 8 | r.headers['Authorization'] = 'Bearer %s' % self.token 9 | return r 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # terraform state files 2 | .terraform/ 3 | *.tfst* 4 | 5 | # editor config stuff 6 | .idea 7 | .idea/*/** 8 | .vscode 9 | .*.swp 10 | 11 | # os files 12 | .DS_Store 13 | 14 | *.plan 15 | 16 | /tools/prometheus-configs/**/data 17 | /tools/prometheus-configs/log-cache-adapter/token 18 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/filebeat.yml.tpl: -------------------------------------------------------------------------------- 1 | filebeat.inputs: 2 | - type: log 3 | enabled: true 4 | paths: 5 | - /var/log/syslog 6 | 7 | output.logstash: 8 | hosts: ["${logstash_host}"] 9 | loadbalance: true 10 | ssl.enabled: true 11 | 12 | tags: ["prometheus", "${environment}"] 13 | -------------------------------------------------------------------------------- /ci/tasks/generate-prometheus-test-jq.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | inputs: 3 | - name: input 4 | outputs: 5 | - name: output 6 | run: 7 | path: sh 8 | args: 9 | - -euxc 10 | - | 11 | echo ".last_successful_config == $(jq '.prometheus_config_etag.value' input/terraform-outputs.json)" > output/test.jq 12 | -------------------------------------------------------------------------------- /terraform/projects/prom-ec2/paas-production/versions.tf: 
-------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | pass = { 8 | source = "camptocamp/pass" 9 | version = "1.4.0" 10 | } 11 | } 12 | required_version = ">= 0.13" 13 | } 14 | -------------------------------------------------------------------------------- /terraform/projects/prom-ec2/paas-staging/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | pass = { 8 | source = "camptocamp/pass" 9 | version = "1.4.0" 10 | } 11 | } 12 | required_version = ">= 0.13" 13 | } 14 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | pass = { 9 | source = "camptocamp/pass" 10 | } 11 | template = { 12 | source = "hashicorp/template" 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.13" 3 | required_providers { 4 | aws = { 5 | source = "hashicorp/aws" 6 | } 7 | template = { 8 | source = "hashicorp/template" 9 | } 10 | } 11 | } 12 | 13 | provider "template" { 14 | version = ">= 2" 15 | } 16 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.13" 3 | required_providers { 4 | aws = { 5 | source = "hashicorp/aws" 6 | } 7 | template = { 8 | source = "hashicorp/template" 9 | } 10 | } 11 | } 12 | 13 | provider "template" { 14 | version = ">= 2" 15 | } 16 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/variables.tf: -------------------------------------------------------------------------------- 1 | variable "environment" {} 2 | variable "prometheus_config_bucket" {} 3 | variable "alerts_path" {} 4 | variable "private_zone_id" {} 5 | 6 | variable "prom_private_ips" { 7 | type = list(string) 8 | } 9 | 10 | variable "extra_scrape_configs" { 11 | default = [] 12 | description = "List of scrape configs to append to the Prometheus config" 13 | } 14 | -------------------------------------------------------------------------------- /terraform/projects/alertmanager-staging/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | pass = { 8 | source = "camptocamp/pass" 9 | version = "1.4.0" 10 | } 11 | template = { 12 | source = "hashicorp/template" 13 | version = "2.2.0" 14 | } 15 | } 16 | required_version = ">= 0.13" 17 | } 18 | -------------------------------------------------------------------------------- /terraform/projects/alertmanager-production/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 
4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | pass = { 8 | source = "camptocamp/pass" 9 | version = "1.4.0" 10 | } 11 | template = { 12 | source = "hashicorp/template" 13 | version = "2.2.0" 14 | } 15 | } 16 | required_version = ">= 0.13" 17 | } 18 | -------------------------------------------------------------------------------- /tools/terraform-format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eu 3 | 4 | for file in "$@"; do 5 | lint=$(terraform fmt -write=false -diff=true -list=true "${file}") 6 | failed="" 7 | 8 | if [ ! -z "${lint}" ]; then 9 | failed="yes" 10 | echo -e "Your code is not in a canonical format:\n" 11 | echo "${lint}" 12 | echo -e "To apply these changes do 'terraform fmt ${file}'\n" 13 | fi 14 | 15 | if [ "$failed" == "yes" ];then 16 | exit 1 17 | fi 18 | done 19 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Prometheus EC2 module 2 | 3 | There are two modules 4 | 5 | - `prometheus`, which deploys prometheus to the target network. 6 | - `paas-config`, which contains configuration specific to our 7 | prometheus-for-paas deployment 8 | 9 | We deploy using raw Terraform commands, scoped per environment. 10 | 11 | ## Deploying 12 | 13 | To deploy (for example to staging): 14 | 15 | ```shell 16 | cd terraform/projects/prom-ec2/paas-staging/prometheus 17 | gds aws re-prom-staging -- terraform plan 18 | ``` 19 | -------------------------------------------------------------------------------- /ci/images/task/assume-role: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu 4 | 5 | arn="$1" 6 | creds="$(aws \ 7 | sts assume-role \ 8 | --role-arn="$arn" \ 9 | --role-session-name="deploy-concourse-$(date +%s)" \ 10 | --duration 1800 \ 11 | )" 12 | 13 | access_key="$(echo "$creds" | jq -r ".Credentials.AccessKeyId")" 14 | secret_key="$(echo "$creds" | jq -r ".Credentials.SecretAccessKey")" 15 | session_token="$(echo "$creds" | jq -r ".Credentials.SessionToken")" 16 | 17 | echo "export AWS_ACCESS_KEY_ID=\"$access_key\"" 18 | echo "export AWS_SECRET_ACCESS_KEY=\"$secret_key\"" 19 | echo "export AWS_SESSION_TOKEN=\"$session_token\"" 20 | echo "export AWS_DEFAULT_REGION=\"eu-west-1\"" 21 | 22 | -------------------------------------------------------------------------------- /terraform/modules/common/ami/main.tf: -------------------------------------------------------------------------------- 1 | ## Variables 2 | 3 | locals { 4 | canonical_account_id = "099720109477" 5 | } 6 | 7 | ## Data sources 8 | 9 | data "aws_ami" "ubuntu_focal" { 10 | most_recent = true 11 | 12 | filter { 13 | name = "name" 14 | values = ["ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*"] 15 | } 16 | 17 | filter { 18 | name = "architecture" 19 | values = ["x86_64"] 20 | } 21 | 22 | filter { 23 | name = "virtualization-type" 24 | values = ["hvm"] 25 | } 26 | 27 | owners = [local.canonical_account_id] 28 | } 29 | 30 | ## Outputs 31 | 32 | output "ubuntu_focal_ami_id" { 33 | value = data.aws_ami.ubuntu_focal.id 34 | } 35 | 36 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/output.tf: -------------------------------------------------------------------------------- 1 | output "public_ip_address" { 2 | value = 
aws_instance.prometheus.*.public_ip 3 | } 4 | 5 | output "private_ip_addresses" { 6 | value = aws_instance.prometheus.*.private_ip 7 | } 8 | 9 | output "prometheus_instance_id" { 10 | value = aws_instance.prometheus.*.id 11 | } 12 | 13 | output "prometheus_private_dns" { 14 | value = aws_instance.prometheus.*.private_dns 15 | } 16 | 17 | output "prometheus_public_dns" { 18 | value = aws_instance.prometheus.*.public_dns 19 | } 20 | 21 | output "s3_config_bucket" { 22 | value = aws_s3_bucket.prometheus_config.id 23 | } 24 | 25 | output "ec2_instance_profile_name" { 26 | value = aws_iam_instance_profile.prometheus_instance_profile.name 27 | } 28 | 29 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/service_discovery.tf: -------------------------------------------------------------------------------- 1 | resource "aws_service_discovery_private_dns_namespace" "observe" { 2 | name = "local.gds-reliability.engineering" 3 | description = "Observe instances" 4 | vpc = local.vpc_id 5 | } 6 | 7 | resource "aws_service_discovery_service" "alertmanager" { 8 | name = "alertmanager" 9 | 10 | description = "A service to allow alertmanager peers to discover each other" 11 | 12 | dns_config { 13 | namespace_id = aws_service_discovery_private_dns_namespace.observe.id 14 | 15 | dns_records { 16 | ttl = 10 17 | type = "A" 18 | } 19 | 20 | routing_policy = "MULTIVALUE" 21 | } 22 | 23 | health_check_custom_config { 24 | failure_threshold = 2 25 | } 26 | } 27 | 28 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/templates/default.tmpl: -------------------------------------------------------------------------------- 1 | {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} 2 | 3 | {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} 4 | 5 | {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} 6 | {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} 7 | {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }} 8 | {{ define "slack.default.footer" }}{{ end }} 9 | -------------------------------------------------------------------------------- /ci/tasks/wait-ecs-services-stable.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | inputs: 3 | - name: terraform-outputs 4 | params: 5 | DEPLOYER_ARN: 6 | TERRAFORM_VAR: 7 | AWS_REGION: 'eu-west-1' 8 | AWS_DEFAULT_REGION: 'eu-west-1' 9 | run: 10 | path: bash 11 | args: 12 | - -eu 13 | - -c 14 | - | 15 | echo "configuring aws client..." 16 | eval $(assume-role "${DEPLOYER_ARN}") 17 | 18 | jq -c '.[env.TERRAFORM_VAR].value | to_entries | .[]' terraform-outputs/terraform-outputs.json | while read entry ; do 19 | CLUSTER="$(echo ${entry} | jq -r '.key')" 20 | SERVICES="$(echo ${entry} | jq -r '.value | join(" ")')" 21 | 22 | echo "Waiting for services ${SERVICES} of cluster ${CLUSTER} to be stable..." 
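# 'aws ecs wait services-stable' polls ECS until each listed service reaches a
# steady state (running count matches desired count and deployments have settled),
# and exits non-zero if that does not happen before the wait times out.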
23 | 24 | aws ecs wait services-stable \ 25 | --cluster "${CLUSTER}" \ 26 | --services ${SERVICES} 27 | done 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | env: 2 | global: 3 | - TERRAFORM_VERSION=0.13.3 4 | - TERRAFORM_FILE_NAME=terraform_${TERRAFORM_VERSION}_linux_amd64.zip 5 | - TERRAFORM_DOWNLOAD_URL=https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/${TERRAFORM_FILE_NAME} 6 | - PROMETHEUS_VERSION=2.3.2 7 | - PROMETHEUS_FILE_NAME=prometheus-${PROMETHEUS_VERSION}.linux-amd64 8 | - PROMETHEUS_TAR_FILE_NAME=${PROMETHEUS_FILE_NAME}.tar.gz 9 | - PROMETHEUS_DOWNLOAD_URL=https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/${PROMETHEUS_TAR_FILE_NAME} 10 | install: 11 | - wget ${TERRAFORM_DOWNLOAD_URL} 12 | - unzip -o ${TERRAFORM_FILE_NAME} -d /tmp 13 | - export PATH=/tmp:${PATH} 14 | - wget ${PROMETHEUS_DOWNLOAD_URL} 15 | - tar -xvzf ${PROMETHEUS_TAR_FILE_NAME} -C /tmp 16 | - export PATH=/tmp/${PROMETHEUS_FILE_NAME}:${PATH} 17 | 18 | script: 19 | - find . -name '*.tf' | xargs tools/terraform-format.sh 20 | - tools/check-alerting-rules.sh 21 | notifications: 22 | email: false 23 | -------------------------------------------------------------------------------- /ci/tasks/deploy-project.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | inputs: 3 | - name: src 4 | - name: re-secrets 5 | outputs: 6 | - name: outputs 7 | params: 8 | PROJECT: 9 | DEPLOYER_ARN: 10 | GPG_PRIVATE_KEY: 11 | AWS_REGION: 'eu-west-1' 12 | AWS_DEFAULT_REGION: 'eu-west-1' 13 | PASSWORD_STORE_DIR: "re-secrets/observe" 14 | run: 15 | path: bash 16 | args: 17 | - -eu 18 | - -c 19 | - | 20 | BUILD_DIR=$(pwd) 21 | 22 | echo "configuring aws client..." 23 | eval $(assume-role "${DEPLOYER_ARN}") 24 | 25 | echo "configuring re-secrets store..." 26 | echo "${GPG_PRIVATE_KEY}" | gpg --import 27 | mkdir -p $HOME/.password-store 28 | cp -R re-secrets $HOME/.password-store 29 | 30 | echo "terraforming..." 
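# Each project directory configures its own S3 remote state backend, so init
# wires up state and providers, apply runs non-interactively, and the outputs
# are captured as JSON for later pipeline tasks to consume.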
31 | pushd "src/terraform/projects/${PROJECT}" 32 | terraform init 33 | terraform apply -auto-approve 34 | terraform output -json > $BUILD_DIR/outputs/terraform-outputs.json 35 | popd 36 | -------------------------------------------------------------------------------- /terraform/projects/infra-security-groups-staging/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 0.13.3" 3 | 4 | backend "s3" { 5 | bucket = "prometheus-staging" 6 | key = "infra-security-groups-modular.tfstate" 7 | region = "eu-west-1" 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = var.aws_region 13 | } 14 | 15 | variable "aws_region" { 16 | type = string 17 | description = "AWS region" 18 | default = "eu-west-1" 19 | } 20 | 21 | module "infra-security-groups" { 22 | source = "../../modules/infra-security-groups/" 23 | 24 | aws_region = var.aws_region 25 | environment = "staging" 26 | remote_state_bucket = "prometheus-staging" 27 | } 28 | 29 | ## Outputs 30 | 31 | output "prometheus_ec2_sg_id" { 32 | value = module.infra-security-groups.prometheus_ec2_sg_id 33 | description = "security group prometheus_ec2 ID" 34 | } 35 | 36 | output "prometheus_alb_sg_id" { 37 | value = module.infra-security-groups.prometheus_alb_sg_id 38 | description = "security group prometheus_alb ID" 39 | } 40 | -------------------------------------------------------------------------------- /terraform/projects/infra-security-groups-production/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 0.13.3" 3 | 4 | backend "s3" { 5 | bucket = "prometheus-production" 6 | key = "infra-security-groups-modular.tfstate" 7 | region = "eu-west-1" 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = var.aws_region 13 | } 14 | 15 | variable "aws_region" { 16 | type = string 17 | description = "AWS region" 18 | default = "eu-west-1" 19 | } 20 | 21 | module "infra-security-groups" { 22 | source = "../../modules/infra-security-groups/" 23 | 24 | aws_region = var.aws_region 25 | environment = "production" 26 | remote_state_bucket = "prometheus-production" 27 | } 28 | 29 | ## Outputs 30 | 31 | output "prometheus_ec2_sg_id" { 32 | value = module.infra-security-groups.prometheus_ec2_sg_id 33 | description = "security group prometheus_ec2 ID" 34 | } 35 | 36 | output "prometheus_alb_sg_id" { 37 | value = module.infra-security-groups.prometheus_alb_sg_id 38 | description = "security group prometheus_alb ID" 39 | } 40 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Crown Copyright (Government Digital Service) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /ci/tasks/http-ping.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | image_resource: 3 | type: docker-image 4 | source: 5 | repository: governmentpaas/curl-ssl 6 | tag: fe3e384e81ccb50842509d7237e3828b293de694 7 | inputs: 8 | - name: response-jq-test 9 | optional: true 10 | params: 11 | URL: 12 | run: 13 | path: sh 14 | args: 15 | - -euxc 16 | - | 17 | DOMAIN=$(echo "${URL}" | awk -F/ '{print $3}') 18 | getent ahosts ${DOMAIN} | cut -d ' ' -f1 | sort | uniq | tee /dev/stderr | while read TARGET_IP ; do 19 | curl \ 20 | --resolve ${DOMAIN}:443:${TARGET_IP} \ 21 | --silent \ 22 | --fail \ 23 | --write-out "${TARGET_IP} %{http_code} %{time_total}s"$'\n' \ 24 | --output curl_output \ 25 | --max-time 5 "${URL}" 26 | 27 | if [[ -e response-jq-test/test.jq ]] ; then 28 | if ! jq -e -f response-jq-test/test.jq curl_output ; then 29 | echo 'Response:' 30 | cat curl_output 31 | echo 'Failed jq test:' 32 | cat response-jq-test/test.jq 33 | # don't spin through attempts too fast 34 | sleep 5 35 | exit 9 36 | fi 37 | fi 38 | done 39 | 40 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/notify-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: GOVUK_Notify 3 | rules: 4 | - alert: GOVUK_Notify_Disk_75_percent_full 5 | expr: max(disk_utilization{space="production", organisation="govuk-notify"}) by (app, space) > 75 6 | for: 5m 7 | labels: 8 | product: "notify" 9 | severity: "ticket" 10 | annotations: 11 | message: "{{ $labels.space }}: disk usage for {{ $labels.app }} is over 75% full. You should redeploy the app to avoid running out of disk space" 12 | grafana: "https://grafana-paas.cloudapps.digital/d/_GlGBNbmk/notify-apps?orgId=2&var-space=production&var-app={{ $labels.app }}" 13 | - alert: GOVUK_Notify_Disk_95_percent_full 14 | expr: max(disk_utilization{space="production", organisation="govuk-notify", app!~"(.*conduit.*)|(.*exporter)"}) by (app, space) > 95 15 | for: 5m 16 | labels: 17 | product: "notify" 18 | severity: "p2" 19 | annotations: 20 | summary: "{{ $labels.space }}: disk usage for {{ $labels.app }} is over 95% full. 
You should redeploy the app to avoid running out of disk space" 21 | grafana: "https://grafana-paas.cloudapps.digital/d/_GlGBNbmk/notify-apps?orgId=2&var-space=production&var-app={{ $labels.app }}" 22 | -------------------------------------------------------------------------------- /tools/grafana_info/show_queries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from grafana_api.grafana_api import GrafanaAPI 4 | from bearer_auth import BearerAuth 5 | import os, sys 6 | 7 | 8 | def exprs_for_dashboard(dashboard): 9 | d = g.get('/dashboards/uid/%s' % dashboard['uid']) 10 | if 'panels' in d['dashboard']: 11 | panels = d['dashboard']['panels'] 12 | for panel in panels: 13 | targets = panel.get('targets',[]) 14 | for target in targets: 15 | if 'expr' in target: 16 | yield (target['expr'], dashboard['title'], panel['title']) 17 | else: 18 | print('***** no panels {}'.format(dashboard['title'])) 19 | 20 | 21 | if __name__ == "__main__": 22 | try: 23 | token = os.environ['GRAFANA_TOKEN'] 24 | g = GrafanaAPI(BearerAuth(token), 'grafana-paas.cloudapps.digital', protocol='https') 25 | dashboards = g.get('/search?type=dash-db') 26 | exprs = [expr for dashboard in dashboards for expr in exprs_for_dashboard(dashboard)] 27 | exprs.sort() 28 | for expr in exprs: 29 | print(expr) 30 | except KeyError as e: 31 | print('Please set the %s environment variable' % e.args[0], file=sys.stderr) 32 | exit(1) 33 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/task-definitions/alertmanager.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "alertmanager", 4 | "image": "prom/alertmanager", 5 | "memoryReservation": 512, 6 | "essential": true, 7 | "portMappings": [ 8 | { 9 | "containerPort": 9093, 10 | "hostPort": 9093 11 | }, 12 | { 13 | "containerPort": 9094, 14 | "hostPort": 9094 15 | } 16 | ], 17 | "environment": [ 18 | { 19 | "Name": "ALERTMANAGER_CONFIG", 20 | "Value": "${alertmanager_config_base64}" 21 | }, 22 | { 23 | "Name": "TEMPLATES", 24 | "Value": "${templates_base64}" 25 | } 26 | ], 27 | "entryPoint": [ 28 | "/bin/sh", 29 | "-c", 30 | "echo \"$ALERTMANAGER_CONFIG\" | base64 -d > /etc/alertmanager/alertmanager.yml; echo \"$TEMPLATES\" | base64 -d > /etc/alertmanager/default.tmpl; /bin/alertmanager --config.file=/etc/alertmanager/alertmanager.yml --cluster.peer=alertmanager.local.gds-reliability.engineering:9094 ${alertmanager_url}" 31 | ], 32 | "logConfiguration": { 33 | "logDriver": "awslogs", 34 | "options": { 35 | "awslogs-group": "${log_group}", 36 | "awslogs-region": "${region}", 37 | "awslogs-stream-prefix": "alertmanager" 38 | } 39 | } 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /ci/images/task/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | ENV TF_VERSION 0.13.3 4 | ENV TF_ZIP_SHA256 35c662be9d32d38815cde5fa4c9fa61a3b7f39952ecd50ebf92fd1b2ddd6109b 5 | 6 | LABEL ubuntu="20.04" 7 | LABEL terraform="$TF_VERSION" 8 | 9 | ENV TZ=Europe/London 10 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 11 | 12 | RUN apt-get update --yes && \ 13 | apt-get install --yes --no-install-recommends \ 14 | ca-certificates \ 15 | awscli \ 16 | jq \ 17 | curl \ 18 | dnsutils \ 19 | unzip \ 20 | gpg \ 21 | gpg-agent \ 22 | golang \ 23 | git 24 | 25 | WORKDIR /tmp 26 | 
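# Download the pinned Terraform release and verify it against the expected
# SHA-256 checksum before installing the binary into /usr/bin.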
27 | RUN curl https://releases.hashicorp.com/terraform/${TF_VERSION}/terraform_${TF_VERSION}_linux_amd64.zip > terraform.zip && \ 28 | echo "${TF_ZIP_SHA256} terraform.zip" > terraform.sha && \ 29 | sha256sum -c terraform.sha && unzip terraform.zip && mv terraform /usr/bin/terraform && \ 30 | rm terraform.zip && rm terraform.sha 31 | 32 | RUN GO111MODULE=on go get -v github.com/camptocamp/terraform-provider-pass && \ 33 | mkdir -p ~/.terraform.d/plugins/linux_amd64 && \ 34 | mv ~/go/bin/terraform-provider-pass ~/.terraform.d/plugins/linux_amd64/ 35 | 36 | # prom-ec2 terraform expects a pub ssh key even if it doesn't use it 37 | RUN mkdir -p $HOME/.ssh/ && touch $HOME/.ssh/id_rsa.pub 38 | 39 | COPY assume-role /usr/bin/assume-role 40 | 41 | ENTRYPOINT ["bash"] 42 | -------------------------------------------------------------------------------- /terraform/projects/alertmanager-staging/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## Project: alertmanager 3 | * 4 | * Create services and task definitions for the ECS cluster 5 | * 6 | */ 7 | 8 | variable "aws_region" { 9 | type = string 10 | description = "AWS region" 11 | default = "eu-west-1" 12 | } 13 | 14 | data "pass_password" "cronitor_staging_url" { 15 | path = "cronitor/cronitor-staging-url" 16 | } 17 | 18 | # Resources 19 | # -------------------------------------------------------------- 20 | 21 | ## Providers 22 | 23 | terraform { 24 | required_version = "~> 0.13.3" 25 | 26 | backend "s3" { 27 | bucket = "prometheus-staging" 28 | key = "app-ecs-services-modular.tfstate" 29 | region = "eu-west-1" 30 | } 31 | } 32 | 33 | provider "aws" { 34 | region = var.aws_region 35 | } 36 | 37 | provider "pass" { 38 | store_dir = "~/.password-store/re-secrets/observe" 39 | refresh_store = true 40 | } 41 | 42 | variable "remote_state_bucket" { 43 | type = string 44 | description = "S3 bucket we store our terraform state in" 45 | default = "prometheus-staging" 46 | } 47 | 48 | module "alertmanager" { 49 | source = "../../modules/alertmanager" 50 | 51 | remote_state_bucket = var.remote_state_bucket 52 | environment = "staging" 53 | observe_cronitor = data.pass_password.cronitor_staging_url.password 54 | } 55 | 56 | output "alertmanager_ecs_clusters_services" { 57 | value = module.alertmanager.ecs_clusters_services 58 | } 59 | -------------------------------------------------------------------------------- /terraform/projects/app-ecs-albs-staging/main.tf: -------------------------------------------------------------------------------- 1 | ## Providers 2 | 3 | terraform { 4 | required_version = "~> 0.13.3" 5 | 6 | backend "s3" { 7 | bucket = "prometheus-staging" 8 | key = "app-ecs-albs-modular.tfstate" 9 | region = "eu-west-1" 10 | } 11 | } 12 | 13 | provider "aws" { 14 | region = var.aws_region 15 | } 16 | 17 | variable "aws_region" { 18 | type = string 19 | description = "AWS region" 20 | default = "eu-west-1" 21 | } 22 | 23 | variable "remote_state_bucket" { 24 | type = string 25 | description = "S3 bucket we store our terraform state in" 26 | default = "prometheus-staging" 27 | } 28 | 29 | data "terraform_remote_state" "infra_networking" { 30 | backend = "s3" 31 | 32 | config = { 33 | bucket = var.remote_state_bucket 34 | key = "infra-networking-modular.tfstate" 35 | region = var.aws_region 36 | } 37 | } 38 | 39 | module "app-ecs-albs" { 40 | source = "../../modules/app-ecs-albs/" 41 | 42 | aws_region = var.aws_region 43 | environment = "staging" 44 | remote_state_bucket = 
var.remote_state_bucket 45 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 46 | subnets = data.terraform_remote_state.infra_networking.outputs.public_subnets 47 | } 48 | 49 | output "prom_public_record_fqdns" { 50 | value = module.app-ecs-albs.prom_public_record_fqdns 51 | description = "Prometheus public DNS FQDNs" 52 | } 53 | 54 | output "prometheus_target_group_arns" { 55 | value = module.app-ecs-albs.prometheus_target_group_ids 56 | } 57 | -------------------------------------------------------------------------------- /terraform/projects/app-ecs-albs-production/main.tf: -------------------------------------------------------------------------------- 1 | ## Providers 2 | 3 | terraform { 4 | required_version = "~> 0.13.3" 5 | 6 | backend "s3" { 7 | bucket = "prometheus-production" 8 | key = "app-ecs-albs-modular.tfstate" 9 | region = "eu-west-1" 10 | } 11 | } 12 | 13 | provider "aws" { 14 | region = var.aws_region 15 | } 16 | 17 | variable "aws_region" { 18 | type = string 19 | description = "AWS region" 20 | default = "eu-west-1" 21 | } 22 | 23 | variable "remote_state_bucket" { 24 | type = string 25 | description = "S3 bucket we store our terraform state in" 26 | default = "prometheus-production" 27 | } 28 | 29 | data "terraform_remote_state" "infra_networking" { 30 | backend = "s3" 31 | 32 | config = { 33 | bucket = var.remote_state_bucket 34 | key = "infra-networking-modular.tfstate" 35 | region = var.aws_region 36 | } 37 | } 38 | 39 | module "app-ecs-albs" { 40 | source = "../../modules/app-ecs-albs/" 41 | 42 | aws_region = var.aws_region 43 | environment = "production" 44 | remote_state_bucket = var.remote_state_bucket 45 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 46 | subnets = data.terraform_remote_state.infra_networking.outputs.public_subnets 47 | } 48 | 49 | output "prom_public_record_fqdns" { 50 | value = module.app-ecs-albs.prom_public_record_fqdns 51 | description = "Prometheus public DNS FQDNs" 52 | } 53 | 54 | output "prometheus_target_group_arns" { 55 | value = module.app-ecs-albs.prometheus_target_group_ids 56 | } 57 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/certificate.tf: -------------------------------------------------------------------------------- 1 | # AWS should manage the certificate renewal automatically 2 | # https://docs.aws.amazon.com/acm/latest/userguide/managed-renewal.html 3 | # If this fails, AWS will email associated with the AWS account 4 | resource "aws_acm_certificate" "alertmanager_cert" { 5 | domain_name = "alerts.${data.terraform_remote_state.infra_networking.outputs.public_subdomain}" 6 | validation_method = "DNS" 7 | 8 | subject_alternative_names = formatlist("alerts-%s.${data.terraform_remote_state.infra_networking.outputs.public_subdomain}", data.aws_availability_zones.available.names) 9 | 10 | lifecycle { 11 | # We can't destroy a certificate that's in use, and we can't stop 12 | # using it until the new one is ready. Hence 13 | # create_before_destroy here. 
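# With create_before_destroy, Terraform provisions the replacement certificate
# and its validation records before removing the old one, so the listener is
# never left without a valid certificate.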
14 | create_before_destroy = true 15 | } 16 | } 17 | 18 | resource "aws_route53_record" "alertmanager_cert_validation" { 19 | for_each = { 20 | for dvo in aws_acm_certificate.alertmanager_cert.domain_validation_options : dvo.domain_name => { 21 | name = dvo.resource_record_name 22 | record = dvo.resource_record_value 23 | type = dvo.resource_record_type 24 | } 25 | } 26 | 27 | name = each.value.name 28 | records = [each.value.record] 29 | type = each.value.type 30 | zone_id = local.zone_id 31 | ttl = 60 32 | 33 | allow_overwrite = true 34 | 35 | depends_on = [aws_acm_certificate.alertmanager_cert] 36 | } 37 | 38 | resource "aws_acm_certificate_validation" "alertmanager_cert" { 39 | certificate_arn = aws_acm_certificate.alertmanager_cert.arn 40 | validation_record_fqdns = [for record in aws_route53_record.alertmanager_cert_validation : record.fqdn] 41 | } 42 | 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **This repo is no longer in use and has been archived** 2 | 3 | # Prometheus configuration on AWS # 4 | 5 | Terraform configuration to manage a Prometheus server running on AWS. 6 | 7 | ## Setup ## 8 | 9 | ### Install dependencies 10 | 11 | brew bundle 12 | tfenv install # this will pick up the version from .terraform-version 13 | 14 | ### Allow access to secrets 15 | 16 | You will need to clone the re-secrets repo into `~/.password-store/re-secrets`: 17 | 18 | git clone git@github.com:alphagov/re-secrets.git ~/.password-store/re-secrets 19 | 20 | ## Deploying Terraform 21 | 22 | ```shell 23 | cd terraform/projects/PROJECT-ENV/ 24 | gds aws re-prom-ENV -- terraform init 25 | gds aws re-prom-ENV -- terraform plan 26 | gds aws re-prom-ENV -- terraform apply 27 | ``` 28 | 29 | eg 30 | 31 | ```shell 32 | cd terraform/projects/app-ecs-albs-staging 33 | gds aws re-prom-staging -- terraform plan 34 | ``` 35 | 36 | ### Deploy EC2 Prometheus with zero downtime 37 | 38 | To avoid all three instances being respun at the same time, you can do one instance at a time using: 39 | 40 | ``` 41 | gds aws re-prom-ENV -- terraform apply -target=module.paas-config.aws_route53_record.prom_ec2_a_record[i] -target=module.prometheus.aws_volume_attachment.attach-prometheus-disk[i] -target=module.prometheus.aws_instance.prometheus[i] -target=module.prometheus.aws_lb_target_group_attachment.prom_target_group_attachment[i] 42 | ``` 43 | 44 | where `i` is `0`, `1` or `2`. 45 | 46 | ## EC2 Prometheus 47 | 48 | Prometheis are not deployed on Amazon ECS and are instead deployed using the prom-ec2 modules onto EC2 instances. For details of how to develop and deploy them see the [terraform/modules/prom-ec2 README](terraform/modules/prom-ec2). 49 | 50 | ## ECS 51 | 52 | Alertmanager and NGINX are deployed on Amazon ECS Fargate. 53 | 54 | ## License 55 | [MIT License](LICENCE) 56 | -------------------------------------------------------------------------------- /tools/grafana_info/README.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | To run these python apps: 4 | 5 | - [create an api key](https://grafana-paas.cloudapps.digital/org/apikeys) with "Viewer" capability 6 | - set it in the GRAFANA_TOKEN environment variable 7 | - create a virtualenv if you want 8 | - run `pip install -r requirements.txt` 9 | 10 | # show_queries.py 11 | 12 | This script scrapes all the PromQL queries from Grafana and shows 13 | them, in sorted order.
14 | 15 | To run it: 16 | 17 | - run `./show_queries.py` 18 | 19 | This directory has a `.python-version` file to be used by 20 | [pyenv](https://github.com/pyenv/pyenv). 21 | 22 | # find_missing_metrics.py 23 | 24 | This script attempts to find metrics that are used in Grafana and the alerts files but are missing from a Prometheus server running an older version. 25 | It shows the expressions used, then the result of an API call to an older Prometheus server (either EC2 or one deployed locally) for each extracted metric (the boolean in the result indicates whether any data points were returned for that metric), and finally reports any metrics that have data points in the latest Prometheus server but none in the older one. 26 | 27 | NB - keywords for PromQL operators used in the Grafana expressions are ignored; other wrongly identified metrics should be added to the `IGNORE_WORDS` list found at the top of the `find_missing_metrics.py` file. 28 | 29 | To run it: 30 | 31 | - set the `OLD_PROM_SERVER` environment variable to an EC2 staging prometheus server, or a locally deployed older prometheus version 32 | - set the `NEW_PROM_SERVER` environment variable to an ECS staging prometheus server, or a locally running instance of the latest Prometheus 33 | - set the `ALERTS_DIR` environment variable to the location of the alerts yml files relative to where you will be executing the python script 34 | - run `./find_missing_metrics.py` 35 | -------------------------------------------------------------------------------- /terraform/projects/infra-networking-production/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 0.13.3" 3 | 4 | backend "s3" { 5 | bucket = "prometheus-production" 6 | key = "infra-networking-modular.tfstate" 7 | region = "eu-west-1" 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = var.aws_region 13 | } 14 | 15 | variable "prometheus_subdomain" { 16 | type = string 17 | description = "Subdomain for prometheus" 18 | default = "monitoring" 19 | } 20 | 21 | variable "aws_region" { 22 | type = string 23 | description = "The AWS region to use."
24 | default = "eu-west-1" 25 | } 26 | 27 | module "infra-networking" { 28 | source = "../../modules/infra-networking" 29 | 30 | environment = "production" 31 | prometheus_subdomain = var.prometheus_subdomain 32 | } 33 | 34 | output "vpc_id" { 35 | value = module.infra-networking.vpc_id 36 | description = "VPC ID where the stack resources are created" 37 | } 38 | 39 | output "private_subnets" { 40 | value = module.infra-networking.private_subnets 41 | description = "List of private subnet IDs" 42 | } 43 | 44 | output "public_subnets" { 45 | value = module.infra-networking.public_subnets 46 | description = "List of public subnet IDs" 47 | } 48 | 49 | output "public_zone_id" { 50 | value = module.infra-networking.public_zone_id 51 | description = "Route 53 Zone ID for publicly visible zone" 52 | } 53 | 54 | output "public_subdomain" { 55 | value = module.infra-networking.public_subdomain 56 | description = "This is the subdomain for root zone" 57 | } 58 | 59 | output "private_zone_id" { 60 | value = module.infra-networking.private_zone_id 61 | description = "Route 53 Zone ID for the internal zone" 62 | } 63 | 64 | output "private_subdomain" { 65 | value = module.infra-networking.private_subdomain 66 | description = "This is the subdomain for private zone" 67 | } 68 | 69 | output "subnets_by_az" { 70 | value = module.infra-networking.subnets_by_az 71 | description = "Map of availability zones to private subnets" 72 | } 73 | 74 | -------------------------------------------------------------------------------- /terraform/projects/infra-networking-staging/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 0.13.3" 3 | 4 | backend "s3" { 5 | bucket = "prometheus-staging" 6 | key = "infra-networking-modular.tfstate" 7 | region = "eu-west-1" 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = var.aws_region 13 | } 14 | 15 | variable "aws_region" { 16 | type = string 17 | description = "The AWS region to use." 
18 | default = "eu-west-1" 19 | } 20 | 21 | variable "prometheus_subdomain" { 22 | type = string 23 | description = "Subdomain for prometheus" 24 | default = "monitoring-staging" 25 | } 26 | 27 | module "infra-networking" { 28 | source = "../../modules/infra-networking" 29 | 30 | environment = "staging" 31 | prometheus_subdomain = var.prometheus_subdomain 32 | } 33 | 34 | output "vpc_id" { 35 | value = module.infra-networking.vpc_id 36 | description = "VPC ID where the stack resources are created" 37 | } 38 | 39 | output "private_subnets" { 40 | value = module.infra-networking.private_subnets 41 | description = "List of private subnet IDs" 42 | } 43 | 44 | output "public_subnets" { 45 | value = module.infra-networking.public_subnets 46 | description = "List of public subnet IDs" 47 | } 48 | 49 | output "public_zone_id" { 50 | value = module.infra-networking.public_zone_id 51 | description = "Route 53 Zone ID for publicly visible zone" 52 | } 53 | 54 | output "public_subdomain" { 55 | value = module.infra-networking.public_subdomain 56 | description = "This is the subdomain for root zone" 57 | } 58 | 59 | output "private_zone_id" { 60 | value = module.infra-networking.private_zone_id 61 | description = "Route 53 Zone ID for the internal zone" 62 | } 63 | 64 | output "private_subdomain" { 65 | value = module.infra-networking.private_subdomain 66 | description = "This is the subdomain for private zone" 67 | } 68 | 69 | output "subnets_by_az" { 70 | value = module.infra-networking.subnets_by_az 71 | description = "Map of availability zones to private subnets" 72 | } 73 | 74 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/variables.tf: -------------------------------------------------------------------------------- 1 | variable "ami_id" {} 2 | 3 | variable "device_mount_path" { 4 | description = "The path to mount the prometheus disk" 5 | default = "/dev/sdh" 6 | } 7 | 8 | variable "data_volume_size" { 9 | description = "The size of the volume that will contain the prometheus data" 10 | default = 250 11 | } 12 | 13 | variable "availability_zones" { 14 | description = "A map of availability zones to subnets" 15 | 16 | type = map(string) 17 | default = {} 18 | } 19 | 20 | variable "subnet_ids" { 21 | type = list(string) 22 | } 23 | 24 | variable "instance_size" { 25 | type = string 26 | description = "This is the default instance size" 27 | default = "m5.large" 28 | } 29 | 30 | variable "target_vpc" { 31 | description = "The VPC in which the system will be deployed" 32 | } 33 | 34 | variable "environment" {} 35 | 36 | variable "vpc_security_groups" { 37 | type = list(string) 38 | default = [] 39 | description = "Security groups to attach to the prometheus instances" 40 | } 41 | 42 | variable "enable_ssh" { 43 | default = false 44 | } 45 | 46 | variable "region" { 47 | default = "eu-west-1" 48 | } 49 | 50 | variable "allowed_cidrs" { 51 | type = list(string) 52 | description = "List of CIDRs which are able to access the prometheus instance, default are GDS ips" 53 | 54 | default = [ 55 | "213.86.153.211/32", 56 | "213.86.153.212/32", 57 | "213.86.153.213/32", 58 | "213.86.153.214/32", 59 | "213.86.153.231/32", 60 | "213.86.153.235/32", 61 | "213.86.153.236/32", 62 | "213.86.153.237/32", 63 | "85.133.67.244/32", 64 | "35.177.37.128/32", 65 | "35.176.252.164/32", 66 | "51.149.9.112/29", # CO 67 | "51.149.9.240/29", # CO 68 | ] 69 | } 70 | 71 | variable "config_bucket" {} 72 | 73 | variable "prometheus_public_fqdns" { 74 | type 
= list(string) 75 | } 76 | 77 | variable "logstash_host" { 78 | default = "" 79 | } 80 | 81 | variable "prometheus_htpasswd" { 82 | default = "" 83 | description = "Contents of basic auth .htpasswd file for NGINX to allow access from Grafana" 84 | } 85 | 86 | variable "prometheus_target_group_arns" { 87 | type = list(string) 88 | default = [] 89 | } 90 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/iam.tf: -------------------------------------------------------------------------------- 1 | #Prepare to attach role to instance 2 | resource "aws_iam_instance_profile" "prometheus_instance_profile" { 3 | name = "prometheus_${var.environment}_config_reader_profile" 4 | role = aws_iam_role.prometheus_role.name 5 | } 6 | 7 | #Create role 8 | resource "aws_iam_role" "prometheus_role" { 9 | name = "prometheus_profile_${var.environment}" 10 | 11 | assume_role_policy = data.aws_iam_policy_document.prometheus_assume_role_policy.json 12 | 13 | tags = merge(local.default_tags, { 14 | Name = "${var.environment}-prometheus" 15 | }) 16 | } 17 | 18 | #Create permission to assume role 19 | data "aws_iam_policy_document" "prometheus_assume_role_policy" { 20 | statement { 21 | actions = ["sts:AssumeRole"] 22 | 23 | principals { 24 | type = "Service" 25 | identifiers = ["ec2.amazonaws.com"] 26 | } 27 | } 28 | } 29 | 30 | #Define the policy to attach the role too 31 | resource "aws_iam_policy" "prometheus_instance_profile" { 32 | name = "prometheus_instance_profile_${var.environment}" 33 | path = "/" 34 | description = "This is the main profile, that has bucket permission and decribe permissions" 35 | 36 | policy = data.aws_iam_policy_document.instance_role_policy.json 37 | } 38 | 39 | #define IAM policy documention 40 | data "aws_iam_policy_document" "instance_role_policy" { 41 | statement { 42 | sid = "ec2Policy" 43 | actions = ["ec2:Describe*"] 44 | resources = ["*"] 45 | } 46 | 47 | statement { 48 | sid = "s3Bucket" 49 | 50 | actions = [ 51 | "s3:Get*", 52 | "s3:ListBucket", 53 | ] 54 | 55 | resources = [ 56 | "arn:aws:s3:::${aws_s3_bucket.prometheus_config.id}/*", 57 | "arn:aws:s3:::${aws_s3_bucket.prometheus_config.id}", 58 | ] 59 | } 60 | } 61 | 62 | #Attach policy to role 63 | resource "aws_iam_role_policy_attachment" "iam_policy" { 64 | role = aws_iam_role.prometheus_role.name 65 | policy_arn = aws_iam_policy.prometheus_instance_profile.arn 66 | } 67 | 68 | resource "aws_iam_role_policy_attachment" "session_manager_access" { 69 | role = aws_iam_role.prometheus_role.name 70 | policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM" 71 | } 72 | 73 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/main.tf: -------------------------------------------------------------------------------- 1 | data "template_file" "prometheus_config_template" { 2 | template = file("${path.module}/prometheus.conf.tpl") 3 | 4 | vars = { 5 | environment = var.environment 6 | } 7 | } 8 | 9 | locals { 10 | prometheus_config = yamldecode(data.template_file.prometheus_config_template.rendered) 11 | final_scrape_configs = concat(local.prometheus_config["scrape_configs"], var.extra_scrape_configs) 12 | final_prometheus_config = merge(local.prometheus_config, { "scrape_configs" = local.final_scrape_configs }) 13 | final_prometheus_config_yaml = yamlencode(local.final_prometheus_config) 14 | } 15 | 16 | resource "aws_route53_record" "prom_ec2_a_record" { 17 | count = 3 18 | 19 | 
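# Creates prom-ec2-1 to prom-ec2-3 in the private zone, each A record pointing
# at the matching Prometheus instance's private IP.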
zone_id = var.private_zone_id 20 | name = "prom-ec2-${count.index + 1}" 21 | type = "A" 22 | ttl = 300 23 | 24 | records = [var.prom_private_ips[count.index]] 25 | } 26 | 27 | resource "aws_s3_bucket_object" "prometheus_config" { 28 | bucket = var.prometheus_config_bucket 29 | key = "prometheus/prometheus.yml" 30 | content = local.final_prometheus_config_yaml 31 | etag = md5(local.final_prometheus_config_yaml) 32 | } 33 | 34 | resource "aws_s3_bucket_object" "alerts-config" { 35 | bucket = var.prometheus_config_bucket 36 | key = "prometheus/alerts/observe-alerts.yml" 37 | source = "${var.alerts_path}observe-alerts.yml" 38 | etag = filemd5("${var.alerts_path}observe-alerts.yml") 39 | } 40 | 41 | resource "aws_s3_bucket_object" "alerts-data-gov-uk-config" { 42 | bucket = var.prometheus_config_bucket 43 | key = "prometheus/alerts/data-gov-uk-alerts.yml" 44 | source = "${var.alerts_path}data-gov-uk-alerts.yml" 45 | etag = filemd5("${var.alerts_path}data-gov-uk-alerts.yml") 46 | } 47 | 48 | resource "aws_s3_bucket_object" "alerts-doc-checking-config" { 49 | bucket = var.prometheus_config_bucket 50 | key = "prometheus/alerts/doc-checking-alerts.yml" 51 | source = "${var.alerts_path}doc-checking-alerts.yml" 52 | etag = filemd5("${var.alerts_path}doc-checking-alerts.yml") 53 | } 54 | 55 | resource "aws_s3_bucket_object" "alerts-notify-config" { 56 | bucket = var.prometheus_config_bucket 57 | key = "prometheus/alerts/notify-alerts.yml" 58 | source = "${var.alerts_path}notify-alerts.yml" 59 | etag = filemd5("${var.alerts_path}notify-alerts.yml") 60 | } 61 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/README.md: -------------------------------------------------------------------------------- 1 | # Example Alert 2 | 3 | Below is an example alert that you can copy and rewrite to create your 4 | own alert. [View the RE 5 | docs](https://reliability-engineering.cloudapps.digital/monitoring-alerts.html#create-and-edit-alerts-using-prometheus) 6 | for more information on what to consider when writing alerts. 7 | 8 | It alerts if the number of 5xx status codes exceeds 25% of total 9 | requests for 120 seconds (2 minutes) or more. 10 | 11 | It is broken down into: 12 | 13 | - `alert`: The alert name, in the format `TeamName_Problem`. 14 | - `expr`: The PromQL query that queries for the data, followed by `>= 15 | 0.25` defining the threshold of values. 16 | - `for`: Optional: The alert fires if the query is over threshold for 17 | this amount of time. 18 | - `labels`: 19 | - `product`: The team name or product for the team that this alert 20 | refers to. For example, "Observe" or "Prometheus". 21 | - `annotations`: 22 | - `summary`: Required: A summary of what the alert shows. 23 | - `description`: Required: A more detailed description of what the alert shows. 24 | - `dashboard_url`: Optional: A link to your team's dashboard (ie Grafana) to see 25 | trends for the alert. 26 | - `runbook`: Optional: A link to your team manual describing what to do about 27 | the alert. 28 | - `logs`: Optional: A link to your logs (ie Kibana URL). 29 | 30 | In the `annotations` section, `{{ $labels.app }}` refers to your team 31 | name, and `{{ $labels.job }}` refers to your app name. 
32 | 33 | ``` 34 | - alert: Example_AppRequestsExcess5xx 35 | expr: sum by(app) (rate(requests{org="example-paas-org", space="example-paas-space", status_range="5xx"}[5m])) / sum by(app) (rate(requests{org="example-paas-org", space="example-paas-space"}[5m])) >= 0.25 36 | for: 120s 37 | labels: 38 | product: "example-team-name" 39 | annotations: 40 | summary: "App {{ $labels.app }} has too many 5xx errors" 41 | description: "App {{ $labels.app }} has 5xx errors in excess of 25% of total requests" 42 | dashboard_url: https://grafana-paas.cloudapps.digital/d//?refresh=1m&orgId=1 43 | runbook: "https://re-team-manual.cloudapps.digital/" 44 | logs: "https://kibana.logit.io/s//app/kibana#/discover" 45 | ``` 46 | -------------------------------------------------------------------------------- /terraform/projects/alertmanager-production/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## Project: alertmanager 3 | * 4 | * Create services and task definitions for the ECS cluster 5 | * 6 | */ 7 | 8 | variable "aws_region" { 9 | type = string 10 | description = "AWS region" 11 | default = "eu-west-1" 12 | } 13 | 14 | data "pass_password" "cronitor_production_url" { 15 | path = "cronitor/cronitor-production-url" 16 | } 17 | 18 | # Resources 19 | # -------------------------------------------------------------- 20 | 21 | ## Providers 22 | 23 | terraform { 24 | required_version = "~> 0.13.3" 25 | 26 | backend "s3" { 27 | bucket = "prometheus-production" 28 | key = "app-ecs-services-modular.tfstate" 29 | region = "eu-west-1" 30 | } 31 | } 32 | 33 | provider "aws" { 34 | region = var.aws_region 35 | } 36 | 37 | provider "pass" { 38 | store_dir = "~/.password-store/re-secrets/observe" 39 | refresh_store = true 40 | } 41 | 42 | variable "remote_state_bucket" { 43 | type = string 44 | description = "S3 bucket we store our terraform state in" 45 | default = "prometheus-production" 46 | } 47 | 48 | module "alertmanager" { 49 | source = "../../modules/alertmanager" 50 | 51 | remote_state_bucket = var.remote_state_bucket 52 | environment = "production" 53 | observe_cronitor = data.pass_password.cronitor_production_url.password 54 | allowed_cidrs = [ 55 | # Office IPs 56 | "213.86.153.211/32", 57 | "213.86.153.212/32", 58 | "213.86.153.213/32", 59 | "213.86.153.214/32", 60 | "213.86.153.231/32", 61 | "213.86.153.235/32", 62 | "213.86.153.236/32", 63 | "213.86.153.237/32", 64 | "85.133.67.244/32", 65 | "51.149.8.0/25", 66 | "51.149.8.128/29", 67 | 68 | # verify prod 69 | "35.178.25.41/32", 70 | 71 | "35.177.2.97/32", 72 | "35.176.169.64/32", 73 | 74 | # verify integration 75 | "3.8.68.252/32", 76 | 77 | "3.8.41.125/32", 78 | "3.8.225.106/32", 79 | 80 | # verify staging 81 | "35.177.140.5/32", 82 | 83 | "18.130.58.164/32", 84 | "35.176.196.169/32", 85 | 86 | # concourse 87 | "35.177.37.128/32", 88 | 89 | "35.176.252.164/32", 90 | 91 | "51.149.9.112/29", # CO 92 | "51.149.9.240/29", # CO 93 | ] 94 | } 95 | 96 | output "alertmanager_ecs_clusters_services" { 97 | value = module.alertmanager.ecs_clusters_services 98 | } 99 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/doc-checking-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: DocChecking 3 | rules: 4 | - alert: AuditEventsNotProcessing 5 | annotations: 6 | message: >- 7 | The audit consumer should be writing audit events to the 8 | database. 
This hasn't happened in a while. 9 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/AuditEventsNotProcessing/ 10 | expr: | 11 | sum without(instance) (rate(audit_consumer_events_processing_attempts_total[5m])) 12 | - 13 | sum without(instance) (rate(audit_consumer_events_processing_failures_total[5m])) 14 | == 0 15 | for: 10m 16 | labels: 17 | product: doc-checking 18 | severity: p4 19 | - alert: AuditEventsFailedProcessing 20 | annotations: 21 | message: >- 22 | The audit consumer has a high error rate when attempting to 23 | write audit events to the database. Those events may have 24 | ended up on the dead letter queue. 25 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/AuditEventsFailedProcessing/ 26 | expr: | 27 | sum without(instance) (rate(audit_consumer_events_processing_failures_total[2m])) > 3 28 | for: 5m 29 | labels: 30 | product: doc-checking 31 | severity: p4 32 | - alert: AuditEventsOnTheDeadLetterQueue 33 | annotations: 34 | message: | 35 | There are unprocessed audit events on the dead letter queue. 36 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/AuditEventsOnTheDeadLetterQueue/ 37 | expr: | 38 | max without(instance) (audit_consumer_dead_letter_queue_approximate_messages) > 0 39 | for: 5m 40 | labels: 41 | product: doc-checking 42 | severity: p4 43 | - alert: RedisNotAvailable 44 | annotations: 45 | message: | 46 | Redis is not available for rate limiting and quota. 47 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/RedisNotAvailable/ 48 | expr: | 49 | (avg by (job) (dcs_dmz_proxy_using_redis_for_rate_limiting) != 1) or (avg by (job) (dcs_agents_using_redis_for_rate_limiting) != 1) 50 | for: 5m 51 | labels: 52 | product: doc-checking 53 | severity: p4 54 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/prometheus.conf.tpl: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 30s 3 | evaluation_interval: 30s 4 | alerting: 5 | alertmanagers: 6 | - scheme: http 7 | dns_sd_configs: 8 | - names: 9 | - 'alertmanager.local.gds-reliability.engineering' 10 | type: 'A' 11 | port: 9093 12 | rule_files: 13 | - "/etc/prometheus/alerts/*" 14 | scrape_configs: 15 | - job_name: prometheus 16 | ec2_sd_configs: 17 | - region: eu-west-1 18 | port: 9090 19 | relabel_configs: 20 | - source_labels: ['__meta_ec2_tag_Environment'] 21 | regex: '${environment}' 22 | action: keep 23 | - source_labels: ['__meta_ec2_tag_Service'] 24 | regex: 'observe-prometheus' 25 | action: keep 26 | - source_labels: ['__meta_ec2_availability_zone'] 27 | target_label: availability_zone 28 | - source_labels: ['__meta_ec2_instance_id'] 29 | replacement: '$1:9090' 30 | target_label: instance 31 | - job_name: paas-ireland-targets 32 | scheme: http 33 | proxy_url: 'http://localhost:8080' 34 | file_sd_configs: 35 | - files: ['/etc/prometheus/ireland-targets/*.json'] 36 | refresh_interval: 30s 37 | relabel_configs: 38 | - target_label: region 39 | replacement: ireland 40 | - job_name: paas-london-targets 41 | scheme: http 42 | proxy_url: 'http://localhost:8080' 43 | file_sd_configs: 44 | - files: ['/etc/prometheus/london-targets/*.json'] 45 | refresh_interval: 30s 46 | relabel_configs: 47 | - target_label: region 48 | replacement: london 49 | - job_name: alertmanager 50 | dns_sd_configs: 51 | - names: 52 | - 'alertmanager.local.gds-reliability.engineering' 53 
| type: 'A' 54 | port: 9093 55 | - job_name: prometheus_node 56 | ec2_sd_configs: 57 | - region: eu-west-1 58 | port: 9100 59 | relabel_configs: 60 | - source_labels: ['__meta_ec2_tag_Environment'] 61 | regex: '${environment}' 62 | action: keep 63 | - source_labels: ['__meta_ec2_tag_Service'] 64 | regex: 'observe-prometheus' 65 | action: keep 66 | - source_labels: ['__meta_ec2_availability_zone'] 67 | target_label: availability_zone 68 | - source_labels: ['__meta_ec2_instance_id'] 69 | replacement: '$1:9100' 70 | target_label: instance 71 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/data-gov-uk-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: DataGovUk 3 | rules: 4 | - alert: DataGovUk_HighCpuUsage 5 | expr: avg(cpu{job="metric-exporter"}) without (exported_instance) >= 80 6 | for: 5m 7 | labels: 8 | product: "data-gov-uk" 9 | annotations: 10 | summary: "App {{ $labels.app }} has high CPU usage" 11 | message: "Application {{ $labels.app }} has been using over 80% CPU (averaged over all instances) for 5 minutes or more" 12 | - alert: DataGovUk_HighDiskUsage 13 | expr: max(disk_utilization{job="metric-exporter"}) without (exported_instance) >= 80 14 | labels: 15 | product: "data-gov-uk" 16 | annotations: 17 | summary: "App {{ $labels.app }} has high disk usage" 18 | message: "Application {{ $labels.app }} has an instance which is using over 80% disk." 19 | - alert: DataGovUk_ElasticSearchIndexSizeIncrease 20 | expr: max without(instance, host, name, es_client_node, es_data_node, es_ingest_node, es_master_node) (delta(elasticsearch_indices_docs{space="data-gov-uk"}[30m])) >= 300 21 | for: 1m 22 | labels: 23 | product: "data-gov-uk" 24 | annotations: 25 | summary: "Index size of Elasticsearch for {{ $labels.job }} has increased significantly" 26 | message: "The index size of Elasticsearch for {{ $labels.job }} has increased by more than 300 documents in the last 30 minutes" 27 | runbook: https://docs.publishing.service.gov.uk/manual/data-gov-uk-troubleshooting.html#different-number-of-datasets-in-ckan-to-find 28 | - alert: DataGovUk_ElasticSearchIndexSizeDecrease 29 | expr: max without(instance, host, name, es_client_node, es_data_node, es_ingest_node, es_master_node) (delta(elasticsearch_indices_docs{space="data-gov-uk"}[30m])) <= -300 30 | for: 1m 31 | labels: 32 | product: "data-gov-uk" 33 | annotations: 34 | summary: "Index size of Elasticsearch for {{ $labels.job }} has decreased significantly" 35 | message: "The index size of Elasticsearch for {{ $labels.job }} has decreased by more than 300 documents in the last 30 minutes" 36 | runbook: https://docs.publishing.service.gov.uk/manual/data-gov-uk-troubleshooting.html#different-number-of-datasets-in-ckan-to-find 37 | - alert: DataGovUk_HighSidekiqEnqueuedJobs 38 | expr: sidekiq_enqueued_jobs{org="gds-data-gov-uk",job="publish-data-production-queue-monitor"} > 800 39 | for: 5m 40 | labels: 41 | product: "data-gov-uk" 42 | annotations: 43 | summary: "Sidekiq's enqueued jobs do not seem to be clearing for Publish Data on production" 44 | message: "Sidekiq has had more than 800 enqueued jobs for Publish Data on production for at least 5 minutes" 45 | runbook: https://docs.publishing.service.gov.uk/manual/data-gov-uk-monitoring.html#sidekiq-publish 46 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/targets.tf: 
-------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "prometheus_targets" { 2 | bucket = "govukobserve-targets-${var.environment}" 3 | acl = "private" 4 | force_destroy = true 5 | 6 | versioning { 7 | enabled = true 8 | } 9 | 10 | tags = merge(local.default_tags, { 11 | Name = "${var.environment}-ireland-targets" 12 | }) 13 | } 14 | 15 | resource "aws_iam_user" "targets_writer" { 16 | name = "targets-writer" 17 | path = "/${var.environment}/" 18 | 19 | tags = merge(local.default_tags, { 20 | Name = "${var.environment}-ireland-targets-writer" 21 | }) 22 | } 23 | 24 | resource "aws_iam_user_policy" "writer_has_full_access_to_targets_bucket" { 25 | name = "targets_bucket_full_access" 26 | user = aws_iam_user.targets_writer.name 27 | 28 | policy = < { "message" => "%{SYSLOG5424PRI}%{NONNEGINT:syslog_ver} +(?:%{TIMESTAMP_ISO8601:syslog_timestamp}|-) +(?:%{HOSTNAME:syslog_host}|-) +(?:%{NOTSPACE:syslog_app}|-) +(?:%{NOTSPACE:syslog_proc}|-) +(?:%{WORD:syslog_msgid}|-) +(?:%{SYSLOG5424SD:syslog_sd}|-|) +%{GREEDYDATA:syslog_msg}" } 6 | # if successful, save original `@timestamp` and `host` fields created by logstash 7 | add_field => [ "received_at", "%{@timestamp}" ] 8 | add_field => [ "received_from", "%{host}" ] 9 | add_tag => ["cf"] 10 | tag_on_failure => ["_syslogparsefailure"] 11 | } 12 | } 13 | 14 | if "cf" in [tags] { 15 | # parse the syslog pri field into severity/facility 16 | if [syslog5424_pri] { 17 | syslog_pri { syslog_pri_field_name => 'syslog5424_pri' } 18 | } 19 | 20 | # replace @timestamp field with the one from syslog 21 | date { match => [ "syslog_timestamp", "ISO8601" ] } 22 | 23 | # if we successfully parsed cf syslog, replace the message and source_host fields 24 | mutate { 25 | replace => [ "source_host", "%{syslog_host}" ] 26 | replace => [ "message", "%{syslog_msg}" ] 27 | } 28 | 29 | # Cloud Foundry passes the app name, space and organisation in the syslog_host 30 | # Filtering them into separate fields makes it easier to query multiple apps in a single Kibana instance 31 | dissect { 32 | mapping => { "syslog_host" => "%{[cf][org]}.%{[cf][space]}.%{[cf][app]}" } 33 | tag_on_failure => ["_sysloghostdissectfailure"] 34 | } 35 | 36 | # Cloud Foundry gorouter logs 37 | if [syslog_proc] =~ "RTR" { 38 | mutate { replace => { "type" => "gorouter" } } 39 | grok { 40 | match => { "syslog_msg" => "%{HOSTNAME:[access][host]} - \[%{TIMESTAMP_ISO8601:router_timestamp}\] \"%{WORD:[access][method]} %{NOTSPACE:[access][url]} HTTP/%{NUMBER:[access][http_version]}\" %{NONNEGINT:[access][response_code]:int} %{NONNEGINT:[access][body_received][bytes]:int} %{NONNEGINT:[access][body_sent][bytes]:int} %{QUOTEDSTRING:[access][referrer]} %{QUOTEDSTRING:[access][agent]} \"%{HOSTPORT:[access][remote_ip_and_port]}\" \"%{HOSTPORT:[access][upstream_ip_and_port]}\" %{GREEDYDATA:router_keys}" } 41 | tag_on_failure => ["_routerparsefailure"] 42 | add_tag => ["gorouter"] 43 | } 44 | # replace @timestamp field with the one from router access log 45 | date { 46 | match => [ "router_timestamp", "ISO8601" ] 47 | } 48 | kv { 49 | source => "router_keys" 50 | target => "router" 51 | value_split => ":" 52 | remove_field => "router_keys" 53 | } 54 | } 55 | 56 | # Application logs 57 | if [syslog_proc] =~ "APP" { 58 | json { 59 | source => "syslog_msg" 60 | add_tag => ["app"] 61 | } 62 | } 63 | 64 | # User agent parsing 65 | if [access][agent] { 66 | useragent { 67 | source => "[access][agent]" 68 | target => "[access][user_agent]" 69 | } 70 | } 71 | } 72 | 
} 73 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## Module: alertmanager 3 | * 4 | * Create services and task definitions for the ECS cluster 5 | * 6 | */ 7 | 8 | variable "aws_region" { 9 | type = string 10 | description = "AWS region" 11 | default = "eu-west-1" 12 | } 13 | 14 | variable "remote_state_bucket" { 15 | type = string 16 | description = "S3 bucket we store our terraform state in" 17 | default = "ecs-monitoring" 18 | } 19 | 20 | variable "environment" { 21 | type = string 22 | description = "Unique name for this collection of resources" 23 | default = "ecs-monitoring" 24 | } 25 | 26 | variable "observe_cronitor" { 27 | type = string 28 | description = "URL to send Observe heartbeats to" 29 | default = "" 30 | } 31 | 32 | variable "allowed_cidrs" { 33 | type = list(string) 34 | description = "List of CIDRs which are able to access alertmanager, default are GDS ips and concourse egress" 35 | 36 | default = [ 37 | "213.86.153.211/32", 38 | "213.86.153.212/32", 39 | "213.86.153.213/32", 40 | "213.86.153.214/32", 41 | "213.86.153.231/32", 42 | "213.86.153.235/32", 43 | "213.86.153.236/32", 44 | "213.86.153.237/32", 45 | "85.133.67.244/32", 46 | "35.177.37.128/32", 47 | "35.176.252.164/32", 48 | "51.149.8.0/25", 49 | "51.149.8.128/29", # CO 50 | "51.149.9.112/29", # CO 51 | "51.149.9.240/29", # CO 52 | ] 53 | } 54 | 55 | locals { 56 | default_tags = { 57 | Terraform = "true" 58 | Project = "alertmanager" 59 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 60 | Environment = var.environment 61 | Service = "alertmanager" 62 | } 63 | vpc_id = data.terraform_remote_state.infra_networking.outputs.vpc_id 64 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 65 | availability_zones = data.aws_subnet.public_subnets.*.availability_zone 66 | } 67 | 68 | # Resources 69 | # -------------------------------------------------------------- 70 | 71 | ## Data sources 72 | data "terraform_remote_state" "infra_networking" { 73 | backend = "s3" 74 | 75 | config = { 76 | bucket = var.remote_state_bucket 77 | key = "infra-networking-modular.tfstate" 78 | region = var.aws_region 79 | } 80 | } 81 | 82 | data "terraform_remote_state" "infra_security_groups" { 83 | backend = "s3" 84 | 85 | config = { 86 | bucket = var.remote_state_bucket 87 | key = "infra-security-groups-modular.tfstate" 88 | region = var.aws_region 89 | } 90 | } 91 | 92 | data "aws_availability_zones" "available" {} 93 | 94 | data "aws_subnet" "public_subnets" { 95 | count = length(data.terraform_remote_state.infra_networking.outputs.public_subnets) 96 | id = data.terraform_remote_state.infra_networking.outputs.public_subnets[count.index] 97 | } 98 | 99 | data "aws_subnet" "private_subnets" { 100 | count = length(data.terraform_remote_state.infra_networking.outputs.private_subnets) 101 | id = data.terraform_remote_state.infra_networking.outputs.private_subnets[count.index] 102 | } 103 | 104 | ## Resources 105 | 106 | resource "aws_cloudwatch_log_group" "task_logs" { 107 | name = var.environment 108 | retention_in_days = 7 109 | 110 | tags = merge(local.default_tags, { 111 | Name = "${var.environment}-alertmanager-task-logs" 112 | }) 113 | } 114 | 115 | ## Outputs 116 | 117 | output "ecs_clusters_services" { 118 | description = "Names of ECS services created, listed by ECS cluster name" 119 | value = transpose({ 120 | for _, 
service in aws_ecs_service.alertmanager_alb: 121 | service.name => [ service.cluster ] 122 | }) 123 | } 124 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/main.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | filebeat_count = var.logstash_host != "" ? 1 : 0 3 | default_tags = { 4 | ManagedBy = "terraform" 5 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 6 | Environment = var.environment 7 | Service = "observe-prometheus" 8 | } 9 | } 10 | 11 | resource "aws_key_pair" "ssh_key" { 12 | count = var.enable_ssh == true ? 1 : 0 13 | key_name = "${var.environment}-prom-key" 14 | public_key = file("~/.ssh/id_rsa.pub") 15 | } 16 | 17 | resource "aws_instance" "prometheus" { 18 | count = length(keys(var.availability_zones)) 19 | 20 | ami = var.ami_id 21 | instance_type = var.instance_size 22 | user_data = data.template_file.user_data_script[count.index].rendered 23 | iam_instance_profile = aws_iam_instance_profile.prometheus_instance_profile.id 24 | subnet_id = var.subnet_ids[count.index] 25 | 26 | associate_public_ip_address = var.enable_ssh 27 | 28 | key_name = var.enable_ssh ? format("%s-prom-key", var.environment) : "" 29 | 30 | vpc_security_group_ids = var.vpc_security_groups 31 | 32 | tags = merge(local.default_tags, { 33 | Name = "paas-${var.environment}-prometheus-${element(keys(var.availability_zones), count.index)}" 34 | }) 35 | } 36 | 37 | resource "aws_volume_attachment" "attach-prometheus-disk" { 38 | count = length(keys(var.availability_zones)) 39 | 40 | device_name = var.device_mount_path 41 | volume_id = aws_ebs_volume.prometheus-disk[count.index].id 42 | instance_id = aws_instance.prometheus[count.index].id 43 | 44 | # Required to work around a bug in terraform https://github.com/hashicorp/terraform/issues/2957 45 | # terraform tries to destroy the attachment before stoping/destorying the instance 46 | skip_destroy = true 47 | } 48 | 49 | resource "aws_ebs_volume" "prometheus-disk" { 50 | count = length(keys(var.availability_zones)) 51 | 52 | availability_zone = element(keys(var.availability_zones), count.index) 53 | size = var.data_volume_size 54 | 55 | tags = merge(local.default_tags, { 56 | Name = "prometheus-disk" 57 | }) 58 | } 59 | 60 | data "template_file" "user_data_script" { 61 | count = length(keys(var.availability_zones)) 62 | 63 | template = file("${path.module}/cloud.conf") 64 | 65 | vars = { 66 | config_bucket = aws_s3_bucket.prometheus_config.id 67 | region = var.region 68 | ireland_targets_bucket = aws_s3_bucket.prometheus_targets.id 69 | london_targets_bucket = aws_s3_bucket.prometheus_london_targets.id 70 | alerts_bucket = aws_s3_bucket.prometheus_config.id 71 | prom_external_url = "https://${var.prometheus_public_fqdns[count.index]}" 72 | logstash_host = var.logstash_host 73 | prometheus_htpasswd = var.prometheus_htpasswd 74 | allowed_cidrs = join("\n ", formatlist("allow %s;", var.allowed_cidrs)) 75 | data_volume_size = var.data_volume_size 76 | } 77 | } 78 | 79 | resource "aws_s3_bucket" "prometheus_config" { 80 | bucket = var.config_bucket 81 | acl = "private" 82 | force_destroy = true 83 | 84 | versioning { 85 | enabled = true 86 | } 87 | 88 | tags = merge(local.default_tags, { 89 | Name = "${var.environment}-prometheus-config" 90 | }) 91 | } 92 | 93 | data "template_file" "filebeat_conf" { 94 | count = local.filebeat_count 95 | template = file("${path.module}/filebeat.yml.tpl") 96 | 97 | vars = { 98 | logstash_host 
= var.logstash_host 99 | environment = var.environment 100 | } 101 | } 102 | 103 | resource "aws_s3_bucket_object" "filebeat" { 104 | count = local.filebeat_count 105 | bucket = var.config_bucket 106 | key = "filebeat/filebeat.yml" 107 | content = data.template_file.filebeat_conf[0].rendered 108 | } 109 | 110 | resource "aws_lb_target_group_attachment" "prom_target_group_attachment" { 111 | count = length(var.prometheus_target_group_arns) 112 | target_group_arn = var.prometheus_target_group_arns[count.index] 113 | target_id = aws_instance.prometheus[count.index].id 114 | port = 80 115 | } 116 | 117 | -------------------------------------------------------------------------------- /terraform/modules/infra-networking/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## module: infra-networking 3 | * 4 | * Terraform module to deploy the networking required for a VPC and 5 | * related services. You will often have multiple VPCs in an account 6 | * 7 | */ 8 | 9 | variable "aws_region" { 10 | type = string 11 | description = "AWS region" 12 | default = "eu-west-1" 13 | } 14 | 15 | variable "environment" { 16 | type = string 17 | description = "Unique name for this collection of resources" 18 | } 19 | 20 | variable "prometheus_subdomain" { 21 | type = string 22 | description = "Subdomain for prometheus" 23 | default = "monitoring" 24 | } 25 | 26 | # locals 27 | # -------------------------------------------------------------- 28 | 29 | locals { 30 | default_tags = { 31 | Terraform = "true" 32 | Project = "infra-networking" 33 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 34 | Environment = var.environment 35 | } 36 | 37 | subdomain_name = "${var.prometheus_subdomain}.gds-reliability.engineering" 38 | private_subdomain_name = "${var.environment}.monitoring.private" 39 | } 40 | 41 | ## Data sources 42 | 43 | data "aws_availability_zones" "available" {} 44 | 45 | ## Resources 46 | 47 | module "vpc" { 48 | source = "terraform-aws-modules/vpc/aws" 49 | version = "3.5.0" 50 | 51 | name = "observe-${var.environment}" 52 | cidr = "10.0.0.0/16" 53 | 54 | # subnets assumes 3 AZs although 3AZs are not implemented elsewhere 55 | azs = data.aws_availability_zones.available.names 56 | private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] 57 | public_subnets = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"] 58 | 59 | create_database_subnet_group = false 60 | 61 | enable_nat_gateway = true 62 | single_nat_gateway = false 63 | 64 | enable_dns_hostnames = true 65 | enable_dns_support = true 66 | 67 | enable_dhcp_options = true 68 | dhcp_options_domain_name = local.private_subdomain_name 69 | 70 | # no `Name` tag unlike other resources as this is taken care of by the vpc module `name` property 71 | tags = local.default_tags 72 | } 73 | 74 | resource "aws_route53_zone" "subdomain" { 75 | name = local.subdomain_name 76 | } 77 | 78 | resource "aws_route53_zone" "private" { 79 | name = local.private_subdomain_name 80 | force_destroy = true 81 | vpc { 82 | vpc_id = module.vpc.vpc_id 83 | } 84 | } 85 | 86 | ## Outputs 87 | 88 | output "vpc_id" { 89 | value = module.vpc.vpc_id 90 | description = "VPC ID where the stack resources are created" 91 | } 92 | 93 | output "private_subnets" { 94 | value = module.vpc.private_subnets 95 | description = "List of private subnet IDs" 96 | } 97 | 98 | output "public_subnets" { 99 | value = module.vpc.public_subnets 100 | description = "List of public subnet IDs" 101 | } 102 | 103 | output 
"public_zone_id" { 104 | value = aws_route53_zone.subdomain.zone_id 105 | description = "Route 53 Zone ID for publicly visible zone" 106 | } 107 | 108 | output "public_subdomain" { 109 | value = aws_route53_zone.subdomain.name 110 | description = "This is the subdomain for root zone" 111 | } 112 | 113 | output "private_zone_id" { 114 | value = aws_route53_zone.private.zone_id 115 | description = "Route 53 Zone ID for the internal zone" 116 | } 117 | 118 | output "private_zone_name" { 119 | value = aws_route53_zone.private.name 120 | description = "Route 53 Zone name for the internal zone" 121 | } 122 | 123 | output "private_subnets_ips" { 124 | value = module.vpc.private_subnets_cidr_blocks 125 | description = "List of private subnet IPs" 126 | } 127 | 128 | output "nat_gateway" { 129 | value = module.vpc.nat_public_ips 130 | description = "List of nat gateway IP" 131 | } 132 | 133 | output "private_subdomain" { 134 | value = aws_route53_zone.private.name 135 | description = "This is the subdomain for private zone" 136 | } 137 | 138 | output "subnets_by_az" { 139 | value = zipmap( 140 | data.aws_availability_zones.available.names, 141 | module.vpc.private_subnets_cidr_blocks, 142 | ) 143 | 144 | description = "Map of availability zones to private subnets" 145 | } 146 | 147 | -------------------------------------------------------------------------------- /terraform/projects/prom-ec2/paas-production/main.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | environment = "production" 3 | config_bucket = "gdsobserve-paas-${local.environment}-config-store" 4 | } 5 | 6 | terraform { 7 | required_version = "~> 0.13.3" 8 | 9 | backend "s3" { 10 | bucket = "govukobserve-tfstate-prom-enclave-paas-production" 11 | key = "prometheus.tfstate" 12 | encrypt = true 13 | region = "eu-west-1" 14 | } 15 | } 16 | 17 | provider "aws" { 18 | region = "eu-west-1" 19 | allowed_account_ids = ["455214962221"] 20 | } 21 | 22 | data "terraform_remote_state" "infra_networking" { 23 | backend = "s3" 24 | 25 | config = { 26 | bucket = "prometheus-${local.environment}" 27 | key = "infra-networking-modular.tfstate" 28 | region = "eu-west-1" 29 | } 30 | } 31 | 32 | data "terraform_remote_state" "infra_security_groups" { 33 | backend = "s3" 34 | 35 | config = { 36 | bucket = "prometheus-${local.environment}" 37 | key = "infra-security-groups-modular.tfstate" 38 | region = "eu-west-1" 39 | } 40 | } 41 | 42 | data "terraform_remote_state" "app_ecs_albs" { 43 | backend = "s3" 44 | 45 | config = { 46 | bucket = "prometheus-${local.environment}" 47 | key = "app-ecs-albs-modular.tfstate" 48 | region = "eu-west-1" 49 | } 50 | } 51 | 52 | provider "pass" { 53 | store_dir = "~/.password-store/re-secrets/observe" 54 | refresh_store = true 55 | } 56 | 57 | data "pass_password" "logstash_endpoint" { 58 | path = "logit/prometheus-paas-logstash-endpoint-prod" 59 | } 60 | 61 | data "pass_password" "prometheus_htpasswd" { 62 | path = "prometheus-basic-auth-htpasswd" 63 | } 64 | 65 | data "pass_password" "dm_elasticsearch_metrics_password" { 66 | path = "dm-elasticsearch-metrics-password" 67 | } 68 | 69 | data "pass_password" "dm_paas_metrics_username" { 70 | path = "dm-paas-metrics-username" 71 | } 72 | 73 | data "pass_password" "dm_paas_metrics_password" { 74 | path = "dm-paas-metrics-password" 75 | } 76 | 77 | module "ami" { 78 | source = "../../../modules/common/ami" 79 | } 80 | 81 | module "prometheus" { 82 | source = "../../../modules/prom-ec2/prometheus" 83 | 84 | ami_id = 
module.ami.ubuntu_focal_ami_id 85 | 86 | target_vpc = data.terraform_remote_state.infra_networking.outputs.vpc_id 87 | enable_ssh = false 88 | 89 | environment = local.environment 90 | config_bucket = local.config_bucket 91 | logstash_host = data.pass_password.logstash_endpoint.password 92 | 93 | prometheus_public_fqdns = data.terraform_remote_state.app_ecs_albs.outputs.prom_public_record_fqdns 94 | 95 | subnet_ids = data.terraform_remote_state.infra_networking.outputs.private_subnets 96 | availability_zones = data.terraform_remote_state.infra_networking.outputs.subnets_by_az 97 | vpc_security_groups = [data.terraform_remote_state.infra_security_groups.outputs.prometheus_ec2_sg_id] 98 | region = "eu-west-1" 99 | 100 | prometheus_htpasswd = data.pass_password.prometheus_htpasswd.password 101 | prometheus_target_group_arns = data.terraform_remote_state.app_ecs_albs.outputs.prometheus_target_group_arns 102 | } 103 | 104 | module "paas-config" { 105 | source = "../../../modules/prom-ec2/paas-config" 106 | 107 | environment = local.environment 108 | 109 | prometheus_config_bucket = module.prometheus.s3_config_bucket 110 | alerts_path = "../../../modules/prom-ec2/alerts-config/alerts/" 111 | 112 | prom_private_ips = module.prometheus.private_ip_addresses 113 | private_zone_id = data.terraform_remote_state.infra_networking.outputs.private_zone_id 114 | 115 | extra_scrape_configs = yamldecode(templatefile("${path.module}/extra-prometheus-scrape-configs.yml.tpl", { 116 | dm_elasticsearch_metrics_password = data.pass_password.dm_elasticsearch_metrics_password.password 117 | dm_paas_metrics_username = data.pass_password.dm_paas_metrics_username.password 118 | dm_paas_metrics_password = data.pass_password.dm_paas_metrics_password.password 119 | })) 120 | } 121 | 122 | output "instance_ids" { 123 | value = "[\n ${join("\n ", module.prometheus.prometheus_instance_id)}\n]" 124 | } 125 | 126 | output "prometheus_config_etag" { 127 | value = module.paas-config.prometheus_config_etag 128 | } 129 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/security-group.tf: -------------------------------------------------------------------------------- 1 | resource "aws_security_group" "alertmanager_alb" { 2 | name = "${var.environment}-alertmanager-alb" 3 | vpc_id = local.vpc_id 4 | description = "Alertmanager ALB" 5 | 6 | tags = merge( 7 | local.default_tags, 8 | { 9 | Name = "alertmanager-alb", 10 | }, 11 | ) 12 | } 13 | 14 | resource "aws_security_group" "alertmanager_task" { 15 | name = "${var.environment}-alertmanager-task" 16 | vpc_id = local.vpc_id 17 | description = "Controls ingress and egress for the alertmanager task" 18 | 19 | tags = merge( 20 | local.default_tags, 21 | { 22 | Name = "alertmanager-task", 23 | }, 24 | ) 25 | } 26 | 27 | # Alertmanager is behind an NLB, so it needs to allow ingress from the 28 | # allowed public internet cidrs directly 29 | resource "aws_security_group_rule" "ingress_from_allowed_cidrs_to_alertmanager_9093" { 30 | security_group_id = aws_security_group.alertmanager_task.id 31 | type = "ingress" 32 | from_port = 9093 33 | to_port = 9093 34 | protocol = "tcp" 35 | cidr_blocks = var.allowed_cidrs 36 | } 37 | 38 | # Alertmanager ALB needs to allow ingress from the allowed public 39 | # internet cidrs 40 | resource "aws_security_group_rule" "ingress_from_allowed_cidrs_to_alertmanager_alb_http" { 41 | security_group_id = aws_security_group.alertmanager_alb.id 42 | type = "ingress" 43 | from_port = 80 44 | to_port = 
80 45 | protocol = "tcp" 46 | cidr_blocks = var.allowed_cidrs 47 | } 48 | 49 | resource "aws_security_group_rule" "ingress_from_allowed_cidrs_to_alertmanager_alb_https" { 50 | security_group_id = aws_security_group.alertmanager_alb.id 51 | type = "ingress" 52 | from_port = 443 53 | to_port = 443 54 | protocol = "tcp" 55 | cidr_blocks = var.allowed_cidrs 56 | } 57 | 58 | # NLB health checks come from the public subnet IP range 59 | resource "aws_security_group_rule" "ingress_from_public_subnets_to_alertmanager_9093" { 60 | security_group_id = aws_security_group.alertmanager_task.id 61 | type = "ingress" 62 | from_port = 9093 63 | to_port = 9093 64 | protocol = "tcp" 65 | cidr_blocks = data.aws_subnet.public_subnets.*.cidr_block 66 | } 67 | 68 | resource "aws_security_group_rule" "ingress_from_alertmanager_alb_to_alertmanager_9093" { 69 | security_group_id = aws_security_group.alertmanager_task.id 70 | source_security_group_id = aws_security_group.alertmanager_alb.id 71 | type = "ingress" 72 | from_port = 9093 73 | to_port = 9093 74 | protocol = "tcp" 75 | } 76 | 77 | resource "aws_security_group_rule" "egress_from_alertmanager_alb_to_alertmanager_9093" { 78 | security_group_id = aws_security_group.alertmanager_alb.id 79 | # source_security_group_id means destination for egress rules 80 | source_security_group_id = aws_security_group.alertmanager_task.id 81 | type = "egress" 82 | from_port = 9093 83 | to_port = 9093 84 | protocol = "tcp" 85 | } 86 | 87 | # TODO: could we make observe prometheus more consistent with external 88 | # prometheis and go via public NLB IPs? 89 | resource "aws_security_group_rule" "ingress_from_prometheus_ec2_to_alertmanager_task" { 90 | security_group_id = aws_security_group.alertmanager_task.id 91 | type = "ingress" 92 | from_port = 9093 93 | to_port = 9093 94 | protocol = "tcp" 95 | source_security_group_id = data.terraform_remote_state.infra_security_groups.outputs.prometheus_ec2_sg_id 96 | } 97 | 98 | 99 | resource "aws_security_group_rule" "ingress_alertmanager_task_meshing" { 100 | security_group_id = aws_security_group.alertmanager_task.id 101 | type = "ingress" 102 | from_port = 9094 103 | to_port = 9094 104 | protocol = "tcp" 105 | source_security_group_id = aws_security_group.alertmanager_task.id 106 | } 107 | 108 | # This rule allows all egress out of alertmanager_task. This is for the following purposes: 109 | # - raising alerts with receivers such as pagerduty and cronitor 110 | # - sending emails via AWS API 111 | # - communicate with other alertmanagers to mesh 112 | resource "aws_security_group_rule" "egress_from_alertmanager_task_to_all" { 113 | security_group_id = aws_security_group.alertmanager_task.id 114 | type = "egress" 115 | from_port = 0 116 | to_port = 0 117 | protocol = "-1" 118 | cidr_blocks = ["0.0.0.0/0"] 119 | } 120 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/alb.tf: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | # ----- alertmanager public ALB ------- 3 | ###################################################################### 4 | # 5 | # 6 | # The ALB serves one main purpose: so we can use ACM certs instead of 7 | # managing our own. We don't actually want it to load-balance; each 8 | # public domain name associated with alertmanager should route to 9 | # exactly one internal alertmanager instance. 
We achieve this by 10 | # using listener rules, so that requests with a particular host: 11 | # header must go to a particular AZ, and running one alertmanager per 12 | # AZ. 13 | 14 | 15 | resource "aws_lb" "alertmanager_alb" { 16 | name = "${var.environment}-alertmanager-alb" 17 | internal = false 18 | load_balancer_type = "application" 19 | 20 | security_groups = [aws_security_group.alertmanager_alb.id] 21 | 22 | subnets = data.terraform_remote_state.infra_networking.outputs.public_subnets 23 | 24 | tags = merge( 25 | local.default_tags, 26 | { 27 | Name = "${var.environment}-alertmanager-alb" 28 | }, 29 | ) 30 | } 31 | 32 | resource "aws_lb_listener" "alertmanager_listener_alb_http" { 33 | load_balancer_arn = aws_lb.alertmanager_alb.arn 34 | port = "80" 35 | protocol = "HTTP" 36 | 37 | default_action { 38 | type = "redirect" 39 | 40 | redirect { 41 | port = "443" 42 | protocol = "HTTPS" 43 | status_code = "HTTP_301" 44 | } 45 | } 46 | } 47 | 48 | resource "aws_lb_listener" "alertmanager_listener_alb_https" { 49 | load_balancer_arn = aws_lb.alertmanager_alb.arn 50 | port = "443" 51 | protocol = "HTTPS" 52 | ssl_policy = "ELBSecurityPolicy-TLS-1-2-2017-01" 53 | certificate_arn = aws_acm_certificate_validation.alertmanager_cert.certificate_arn 54 | 55 | default_action { 56 | type = "forward" 57 | target_group_arn = aws_lb_target_group.alertmanager_all.arn 58 | } 59 | } 60 | 61 | resource "aws_lb_listener_rule" "alertmanager_listener_rule_per_az" { 62 | for_each = toset(local.availability_zones) 63 | 64 | listener_arn = aws_lb_listener.alertmanager_listener_alb_https.arn 65 | 66 | action { 67 | type = "forward" 68 | target_group_arn = aws_lb_target_group.alertmanager_per_az[each.key].arn 69 | } 70 | 71 | condition { 72 | host_header { 73 | values = ["alerts-${each.key}.*"] 74 | } 75 | } 76 | } 77 | 78 | resource "aws_lb_target_group" "alertmanager_per_az" { 79 | for_each = toset(local.availability_zones) 80 | name = "${var.environment}-alerts-${each.key}" 81 | port = 9093 82 | protocol = "HTTP" 83 | vpc_id = local.vpc_id 84 | deregistration_delay = 30 85 | target_type = "ip" 86 | 87 | health_check { 88 | interval = 10 89 | path = "/" 90 | matcher = "200" 91 | protocol = "HTTP" 92 | healthy_threshold = 2 93 | unhealthy_threshold = 2 94 | timeout = "5" 95 | } 96 | 97 | tags = merge( 98 | local.default_tags, 99 | { 100 | Name = "${var.environment}-alertmanager-${each.key}" 101 | }, 102 | ) 103 | } 104 | 105 | resource "aws_lb_target_group" "alertmanager_all" { 106 | name = "${var.environment}-alerts-all" 107 | port = 9093 108 | protocol = "HTTP" 109 | vpc_id = local.vpc_id 110 | deregistration_delay = 30 111 | target_type = "ip" 112 | 113 | health_check { 114 | interval = 10 115 | path = "/" 116 | matcher = "200" 117 | protocol = "HTTP" 118 | healthy_threshold = 2 119 | unhealthy_threshold = 2 120 | timeout = "5" 121 | } 122 | 123 | tags = merge( 124 | local.default_tags, 125 | { 126 | Name = "${var.environment}-alertmanager-all" 127 | }, 128 | ) 129 | } 130 | 131 | resource "aws_route53_record" "alerts_alias" { 132 | zone_id = local.zone_id 133 | name = "alerts" 134 | type = "A" 135 | 136 | alias { 137 | name = aws_lb.alertmanager_alb.dns_name 138 | zone_id = aws_lb.alertmanager_alb.zone_id 139 | evaluate_target_health = false 140 | } 141 | } 142 | 143 | resource "aws_route53_record" "alerts_az_alias" { 144 | for_each = toset(local.availability_zones) 145 | 146 | zone_id = local.zone_id 147 | name = "alerts-${each.key}" 148 | type = "A" 149 | 150 | alias { 151 | name = 
aws_lb.alertmanager_alb.dns_name 152 | zone_id = aws_lb.alertmanager_alb.zone_id 153 | evaluate_target_health = false 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /tools/grafana_info/find_missing_metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import json 3 | import os 4 | import re 5 | import requests 6 | import sys 7 | import yaml 8 | 9 | from bearer_auth import BearerAuth 10 | from grafana_api.grafana_api import GrafanaAPI 11 | 12 | IGNORE_WORDS = [ 13 | "and", 14 | "avg", 15 | "avg_over_time", 16 | "by", 17 | "count", 18 | "deriv", 19 | "exported_instance", 20 | "ignoring", 21 | "increase", 22 | "irate", 23 | "job", 24 | "le", 25 | "max", 26 | "on", 27 | "or", 28 | "rate", 29 | "sort", 30 | "sum", 31 | "time", 32 | "topk", 33 | "without", 34 | ] 35 | 36 | 37 | def exprs_for_dashboard(dashboard): 38 | d = g.get('/dashboards/uid/%s' % dashboard['uid']) 39 | if 'panels' in d['dashboard']: 40 | panels = d['dashboard']['panels'] 41 | for panel in panels: 42 | targets = panel.get('targets', []) 43 | for target in targets: 44 | if 'expr' in target: 45 | yield { 46 | "expr": target['expr'], 47 | "dashboard_title": dashboard['title'], 48 | "panel_title": panel['title'] 49 | } 50 | 51 | 52 | def exprs_for_alerts(): 53 | exprs = [] 54 | for yml_file in [f for f in os.listdir(os.environ.get("ALERTS_DIR")) if re.match(r'.*\.yml', f)]: 55 | with open("{}/{}".format(os.environ.get("ALERTS_DIR"), yml_file), 'r') as stream: 56 | try: 57 | alerts = yaml.load(stream) 58 | 59 | except yaml.YAMLError as exc: 60 | print(exc) 61 | 62 | for rule in alerts["groups"][0]['rules']: 63 | exprs.append({'expr': rule['expr']}) 64 | 65 | return exprs 66 | 67 | 68 | # remove unwanted parts of the expression 69 | def rationalise_expr(expr, pattern, replace="%s"): 70 | matched = re.findall(pattern, expr) 71 | 72 | if matched: 73 | for m in matched: 74 | expr = expr.replace(replace % m, "") 75 | 76 | return expr 77 | 78 | 79 | def extract_words_from_expressions(exprs): 80 | index = 0 81 | words = [] 82 | 83 | print('**** Expressions:') 84 | for e in exprs: 85 | expr = e['expr'] 86 | 87 | if len(expr) > 0: 88 | print(index, expr) 89 | 90 | expr = rationalise_expr(expr, r'\{([^}]+)', "{%s}") # filters 91 | expr = rationalise_expr(expr, r'\[([^]]+)', "[%s]") # time ranges 92 | expr = rationalise_expr(expr, r'\$[_\w]+') # grafana vars 93 | expr = rationalise_expr(expr, r'\([a-z]+\)') # labels 94 | 95 | matched_words = re.findall(r'[^\d\W]+', expr) 96 | words.extend(matched_words) 97 | 98 | index += 1 99 | 100 | return words 101 | 102 | 103 | def check_metric_exists_for_word(words): 104 | index = 0 105 | missing_metric = [] 106 | print('**** Metrics evaluation:') 107 | for w in set(words).difference(IGNORE_WORDS): 108 | r_old = requests.get("{}/api/v1/query?query={}".format(os.environ.get("OLD_PROM_SERVER"), w)) 109 | resp_old = json.loads(r_old.content) 110 | 111 | if resp_old['status'] == 'success': 112 | print('{}: {}, {}'.format(index, len(resp_old['data']['result']) > 0, w)) 113 | 114 | # if old prometheus server doesn't have the metric then check if new prometheus server has the metric 115 | if not len(resp_old['data']['result']): 116 | r_new = requests.get("{}/api/v1/query?query={}".format(os.environ.get("NEW_PROM_SERVER"), w)) 117 | resp_new = json.loads(r_new.content) 118 | # only report it as missing if metrics are found on the new prometheus server 119 | if 
len(resp_new['data']['result']) > 0: 120 | missing_metric.append(w) 121 | else: 122 | print("{}: *** {} - {}".format(index, w, resp_old)) 123 | 124 | index += 1 125 | 126 | return missing_metric 127 | 128 | 129 | if __name__ == "__main__": 130 | try: 131 | token = os.environ['GRAFANA_TOKEN'] 132 | g = GrafanaAPI(BearerAuth(token), 'grafana-paas.cloudapps.digital', protocol='https') 133 | dashboards = g.get('/search?type=dash-db') 134 | exprs = [expr for dashboard in dashboards for expr in exprs_for_dashboard(dashboard)] 135 | exprs.sort(key=lambda e: e['dashboard_title'] + e['panel_title']) 136 | 137 | exprs.extend(exprs_for_alerts()) 138 | 139 | words = extract_words_from_expressions(exprs) 140 | 141 | missing_metric = check_metric_exists_for_word(words) 142 | 143 | print('**** Missing metrics:' if missing_metric else '**** No missing metrics') 144 | for m in missing_metric: 145 | print(m) 146 | 147 | except KeyError as e: 148 | print('Please set the %s environment variable' % e.args[0], file=sys.stderr) 149 | exit(1) 150 | -------------------------------------------------------------------------------- /terraform/modules/infra-security-groups/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## module: infra-security-groups 3 | * 4 | * Central module to manage all security groups. 5 | * 6 | * This is done in a single module to reduce conflicts 7 | * and cascade issues. 8 | * 9 | */ 10 | 11 | variable "aws_region" { 12 | type = string 13 | description = "The AWS region to use." 14 | } 15 | 16 | variable "remote_state_bucket" { 17 | type = string 18 | description = "S3 bucket we store our terraform state in" 19 | } 20 | 21 | variable "environment" { 22 | type = string 23 | description = "Unique name for this collection of resources" 24 | } 25 | 26 | # locals 27 | # -------------------------------------------------------------- 28 | 29 | locals { 30 | default_tags = { 31 | Terraform = "true" 32 | Project = "infra-security-groups" 33 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 34 | Environment = var.environment 35 | } 36 | } 37 | 38 | # Resources 39 | # -------------------------------------------------------------- 40 | 41 | ## Data sources 42 | 43 | data "terraform_remote_state" "infra_networking" { 44 | backend = "s3" 45 | 46 | config = { 47 | bucket = var.remote_state_bucket 48 | key = "infra-networking-modular.tfstate" 49 | region = var.aws_region 50 | } 51 | } 52 | 53 | resource "aws_security_group" "prometheus_alb" { 54 | name = "${var.environment}-prometheus-alb" 55 | vpc_id = data.terraform_remote_state.infra_networking.outputs.vpc_id 56 | description = "Controls ingress and egress for prometheus ALB" 57 | 58 | tags = merge( 59 | local.default_tags, 60 | { 61 | Name = "prometheus-alb", 62 | Service = "observe-prometheus", 63 | }, 64 | ) 65 | } 66 | 67 | # We allow all IPs to access the ALB as Prometheus is fronted by an nginx which controls access to either approved IP 68 | # addresses, or users with basic auth creds 69 | resource "aws_security_group_rule" "ingress_from_public_http_to_prometheus_alb" { 70 | security_group_id = aws_security_group.prometheus_alb.id 71 | type = "ingress" 72 | from_port = 80 73 | to_port = 80 74 | protocol = "tcp" 75 | cidr_blocks = ["0.0.0.0/0"] 76 | } 77 | 78 | resource "aws_security_group_rule" "ingress_from_public_https_to_prometheus_alb" { 79 | security_group_id = aws_security_group.prometheus_alb.id 80 | type = "ingress" 81 | from_port = 443 82 | to_port = 443 83 | 
protocol = "tcp" 84 | cidr_blocks = ["0.0.0.0/0"] 85 | } 86 | 87 | resource "aws_security_group_rule" "egress_from_prometheus_alb_to_prometheus_ec2" { 88 | security_group_id = aws_security_group.prometheus_alb.id 89 | type = "egress" 90 | to_port = 80 91 | from_port = 80 92 | protocol = "tcp" 93 | source_security_group_id = aws_security_group.prometheus_ec2.id 94 | } 95 | 96 | resource "aws_security_group" "prometheus_ec2" { 97 | name = "${var.environment}-prometheus-ec2" 98 | vpc_id = data.terraform_remote_state.infra_networking.outputs.vpc_id 99 | description = "Controls ingress and egress for prometheus EC2 instances" 100 | 101 | tags = merge( 102 | local.default_tags, 103 | { 104 | Name = "prometheus-ec2", 105 | Service = "observe-prometheus", 106 | }, 107 | ) 108 | } 109 | 110 | resource "aws_security_group_rule" "ingress_from_prometheus_alb_to_prometheus_ec2" { 111 | security_group_id = aws_security_group.prometheus_ec2.id 112 | type = "ingress" 113 | to_port = 80 114 | from_port = 80 115 | protocol = "tcp" 116 | source_security_group_id = aws_security_group.prometheus_alb.id 117 | } 118 | 119 | resource "aws_security_group_rule" "ingress_from_prometheus_ec2_to_prometheus_ec2" { 120 | security_group_id = aws_security_group.prometheus_ec2.id 121 | type = "ingress" 122 | to_port = 9090 123 | from_port = 9090 124 | protocol = "tcp" 125 | source_security_group_id = aws_security_group.prometheus_ec2.id 126 | } 127 | 128 | resource "aws_security_group_rule" "ingress_from_prometheus_to_prometheus_node_exporter" { 129 | security_group_id = aws_security_group.prometheus_ec2.id 130 | type = "ingress" 131 | to_port = 9100 132 | from_port = 9100 133 | protocol = "tcp" 134 | source_security_group_id = aws_security_group.prometheus_ec2.id 135 | } 136 | 137 | # This rule allows all egress out of prometheus_ec2. 
This is for the following purposes: 138 | # - downloading packages from package repos 139 | # - calling AWS APIs such as SSM, S3 and EC2 140 | # - scraping alertmanager on port 9093 141 | # - sending alerts to alertmanager on port 9093 142 | # - scraping external targets that run on the PaaS 143 | # - scraping itself and other promethis on port 9090 144 | # - scraping node exporters on port 9100 145 | resource "aws_security_group_rule" "egress_from_prometheus_ec2_to_all" { 146 | security_group_id = aws_security_group.prometheus_ec2.id 147 | type = "egress" 148 | to_port = 0 149 | from_port = 0 150 | protocol = "-1" 151 | cidr_blocks = ["0.0.0.0/0"] 152 | } 153 | 154 | ## Outputs 155 | 156 | output "prometheus_ec2_sg_id" { 157 | value = aws_security_group.prometheus_ec2.id 158 | description = "security group prometheus_ec2 ID" 159 | } 160 | 161 | output "prometheus_alb_sg_id" { 162 | value = aws_security_group.prometheus_alb.id 163 | description = "security group prometheus_alb ID" 164 | } 165 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/templates/alertmanager.tpl: -------------------------------------------------------------------------------- 1 | global: 2 | resolve_timeout: 5m 3 | 4 | smtp_from: "${smtp_from}" 5 | smtp_smarthost: "${smtp_smarthost}" 6 | smtp_auth_username: "${smtp_username}" 7 | smtp_auth_password: "${smtp_password}" 8 | slack_api_url: "${slack_api_url}" 9 | 10 | templates: 11 | - '/etc/alertmanager/default.tmpl' 12 | 13 | route: 14 | receiver: "re-observe-pagerduty" 15 | group_by: 16 | - alertname 17 | - product 18 | - deployment 19 | routes: 20 | - receiver: "autom8-tickets" 21 | repeat_interval: 7d 22 | match: 23 | product: "prometheus" 24 | severity: "ticket" 25 | - receiver: "notify-tickets" 26 | repeat_interval: 7d 27 | match: 28 | product: "notify" 29 | severity: "ticket" 30 | - receiver: "notify-p2" 31 | repeat_interval: 7d 32 | match: 33 | product: "notify" 34 | severity: "p2" 35 | - receiver: "dgu-pagerduty" 36 | match: 37 | product: "data-gov-uk" 38 | - receiver: "govuk-pagerduty" 39 | match: 40 | product: "govuk-accounts" 41 | - receiver: "re-observe-pagerduty" 42 | match: 43 | product: "prometheus" 44 | severity: "page" 45 | - receiver: "observe-cronitor" 46 | group_interval: 1m 47 | repeat_interval: 1m 48 | match: 49 | product: "prometheus" 50 | severity: "constant" 51 | - receiver: "dev-null" 52 | match: 53 | product: "doc-checking" 54 | routes: 55 | - match_re: 56 | space: production|integration 57 | receiver: dcs-slack 58 | routes: 59 | - match: 60 | space: production 61 | severity: p2 62 | receiver: "dcs-p2" 63 | # Verify hub ECS 64 | - receiver: "verify-2ndline-slack" 65 | match: 66 | product: "verify" 67 | routes: 68 | - receiver: "verify-p1" 69 | match: 70 | deployment: prod 71 | severity: p1 72 | - receiver: "verify-p2" 73 | match: 74 | deployment: integration 75 | severity: p1 76 | - receiver: "verify-p3" 77 | match: 78 | severity: ticket 79 | - match: 80 | severity: constant 81 | group_interval: 1m 82 | repeat_interval: 1m 83 | routes: 84 | - match: 85 | deployment: prod 86 | receiver: "verify-prod-cronitor" 87 | - match: 88 | deployment: integration 89 | receiver: "verify-integration-cronitor" 90 | - match: 91 | deployment: staging 92 | receiver: "verify-staging-cronitor" 93 | 94 | receivers: 95 | - name: "re-observe-pagerduty" 96 | pagerduty_configs: 97 | - service_key: "${observe_pagerduty_key}" 98 | - name: "dgu-pagerduty" 99 | pagerduty_configs: 100 | - service_key: 
"${dgu_pagerduty_key}" 101 | - name: "govuk-pagerduty" 102 | pagerduty_configs: 103 | - service_key: "${govuk_pagerduty_key}" 104 | - name: "notify-tickets" 105 | email_configs: 106 | - to: "${notify_zendesk}" 107 | - name: "notify-p2" 108 | pagerduty_configs: 109 | - service_key: "${notify_p2_pagerduty_key}" 110 | - name: "observe-cronitor" 111 | webhook_configs: 112 | - send_resolved: false 113 | url: "${observe_cronitor}" 114 | - name: "verify-prod-cronitor" 115 | webhook_configs: 116 | - send_resolved: false 117 | url: "${verify_prod_cronitor}" 118 | - name: "verify-integration-cronitor" 119 | webhook_configs: 120 | - send_resolved: false 121 | url: "${verify_integration_cronitor}" 122 | - name: "verify-staging-cronitor" 123 | webhook_configs: 124 | - send_resolved: false 125 | url: "${verify_staging_cronitor}" 126 | - name: "verify-2ndline-slack" 127 | slack_configs: &verify-2ndline-slack-configs 128 | - send_resolved: true 129 | channel: '#verify-2ndline' 130 | icon_emoji: ':verify-shield:' 131 | username: alertmanager 132 | - name: "autom8-tickets" 133 | email_configs: 134 | - to: "${autom8_recipient_email}" 135 | slack_configs: 136 | - send_resolved: true 137 | channel: '#re-autom8-alerts' 138 | icon_emoji: ':verify-shield:' 139 | username: alertmanager 140 | color: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}warning{{ else }}danger{{ end }}{{ else }}good{{ end }}' 141 | pretext: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}:warning:{{ else }}:rotating_light:{{ end }}{{ else }}:green_tick:{{ end }} {{ .CommonLabels.alertname }}:{{ .CommonAnnotations.summary }}' 142 | text: |- 143 | *Description:* {{ .CommonAnnotations.message }} 144 | {{ range .Alerts }} 145 | *Details:* 146 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 147 | {{ end }} 148 | {{ end }} 149 | short_fields: true 150 | fields: 151 | - title: Product 152 | value: '{{ .CommonLabels.product }}' 153 | - title: Deployment 154 | value: '{{ .CommonLabels.deployment }}' 155 | actions: 156 | - type: button 157 | text: Runbook 158 | url: '{{ .CommonAnnotations.runbook_url }}' 159 | - name: "dcs-slack" 160 | slack_configs: 161 | - send_resolved: true 162 | channel: '#di-dcs-2ndline' 163 | icon_emoji: ':gsp:' 164 | username: alertmanager 165 | color: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}warning{{ else }}danger{{ end }}{{ else }}good{{ end }}' 166 | pretext: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}:warning:{{ else }}:rotating_light:{{ end }}{{ else }}:green_tick:{{ end }} {{ .CommonLabels.alertname }}:{{ .CommonAnnotations.summary }}' 167 | text: |- 168 | *Description:* {{ .CommonAnnotations.message }} 169 | {{ range .Alerts }} 170 | *Details:* 171 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 172 | {{ end }} 173 | {{ end }} 174 | short_fields: true 175 | fields: 176 | - title: Product 177 | value: '{{ .CommonLabels.product }}' 178 | - title: Namespace 179 | value: '{{ .CommonLabels.namespace }}' 180 | - title: | 181 | {{- if .CommonLabels.job_name -}} 182 | Job 183 | {{- else if .CommonLabels.deployment -}} 184 | Deployment 185 | {{- else if match "^KubePod" .CommonLabels.alertname -}} 186 | Pod 187 | {{- end -}} 188 | value: | 189 | {{- if .CommonLabels.job_name -}} 190 | {{ .CommonLabels.job_name }} 191 | {{- else if .CommonLabels.deployment -}} 192 | {{ .CommonLabels.deployment }} 193 | {{- else if match "^KubePod" .CommonLabels.alertname -}} 194 | {{ 
.CommonLabels.pod }} 195 | {{- end -}} 196 | actions: 197 | - type: button 198 | text: Runbook 199 | url: '{{ .CommonAnnotations.runbook_url }}' 200 | - name: "dcs-p2" 201 | pagerduty_configs: 202 | - service_key: "${dcs_p2_pagerduty_key}" 203 | - name: "verify-p1" 204 | pagerduty_configs: 205 | - service_key: "${verify_p1_pagerduty_key}" 206 | slack_configs: *verify-2ndline-slack-configs 207 | - name: "verify-p2" 208 | pagerduty_configs: 209 | - service_key: "${verify_p2_pagerduty_key}" 210 | slack_configs: *verify-2ndline-slack-configs 211 | - name: "verify-p3" 212 | slack_configs: *verify-2ndline-slack-configs 213 | - name: "dev-null" 214 | -------------------------------------------------------------------------------- /terraform/modules/app-ecs-albs/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## Module: app-ecs-albs 3 | * 4 | * Load balancer for Prometheus 5 | * 6 | */ 7 | 8 | variable "aws_region" { 9 | type = string 10 | description = "AWS region" 11 | } 12 | 13 | variable "remote_state_bucket" { 14 | type = string 15 | description = "S3 bucket we store our terraform state in" 16 | } 17 | 18 | variable "environment" { 19 | type = string 20 | description = "Unique name for this collection of resources" 21 | } 22 | 23 | variable "zone_id" { 24 | type = string 25 | description = "Route 53 zone ID for registering public DNS records" 26 | } 27 | 28 | variable "subnets" { 29 | type = list(string) 30 | description = "Subnets to attach load balancers to" 31 | } 32 | 33 | variable "prometheus_count" { 34 | type = string 35 | description = "Number of prometheus instances to create listener rules and target groups for" 36 | default = "3" 37 | } 38 | 39 | # locals 40 | # -------------------------------------------------------------- 41 | 42 | locals { 43 | default_tags = { 44 | Terraform = "true" 45 | Project = "app-ecs-albs" 46 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 47 | Environment = var.environment 48 | } 49 | 50 | prom_records_count = var.prometheus_count 51 | 52 | # data.aws_route_53.XXX.name has a trailing dot which we remove with replace() to make ACM happy 53 | subdomain = replace(data.aws_route53_zone.public_zone.name, "/\\.$/", "") 54 | vpc_id = data.aws_subnet.first_subnet.vpc_id 55 | } 56 | 57 | ## Data sources 58 | 59 | data "terraform_remote_state" "infra_networking" { 60 | backend = "s3" 61 | 62 | config = { 63 | bucket = var.remote_state_bucket 64 | key = "infra-networking-modular.tfstate" 65 | region = var.aws_region 66 | } 67 | } 68 | 69 | data "terraform_remote_state" "infra_security_groups" { 70 | backend = "s3" 71 | 72 | config = { 73 | bucket = var.remote_state_bucket 74 | key = "infra-security-groups-modular.tfstate" 75 | region = var.aws_region 76 | } 77 | } 78 | 79 | data "aws_route53_zone" "public_zone" { 80 | zone_id = var.zone_id 81 | } 82 | 83 | data "aws_subnet" "first_subnet" { 84 | id = var.subnets[0] 85 | } 86 | 87 | ###################################################################### 88 | # ----- prometheus public ALB ------- 89 | ###################################################################### 90 | 91 | # AWS should manage the certificate renewal automatically 92 | # https://docs.aws.amazon.com/acm/latest/userguide/managed-renewal.html 93 | # If this fails, AWS will email associated with the AWS account 94 | resource "aws_acm_certificate" "prometheus_cert" { 95 | domain_name = "prom.${local.subdomain}" 96 | validation_method = "DNS" 97 | 98 | 
subject_alternative_names = aws_route53_record.prom_alias.*.fqdn 99 | 100 | lifecycle { 101 | # We can't destroy a certificate that's in use, and we can't stop 102 | # using it until the new one is ready. Hence 103 | # create_before_destroy here. 104 | create_before_destroy = true 105 | } 106 | } 107 | 108 | resource "aws_route53_record" "prometheus_cert_validation" { 109 | for_each = { 110 | for dvo in aws_acm_certificate.prometheus_cert.domain_validation_options : dvo.domain_name => { 111 | name = dvo.resource_record_name 112 | record = dvo.resource_record_value 113 | type = dvo.resource_record_type 114 | } 115 | } 116 | 117 | name = each.value.name 118 | records = [each.value.record] 119 | type = each.value.type 120 | zone_id = var.zone_id 121 | ttl = 60 122 | 123 | allow_overwrite = true 124 | 125 | depends_on = [aws_acm_certificate.prometheus_cert] 126 | } 127 | 128 | resource "aws_acm_certificate_validation" "prometheus_cert" { 129 | certificate_arn = aws_acm_certificate.prometheus_cert.arn 130 | validation_record_fqdns = [for record in aws_route53_record.prometheus_cert_validation : record.fqdn] 131 | } 132 | 133 | resource "aws_route53_record" "prom_alias" { 134 | count = local.prom_records_count 135 | 136 | zone_id = var.zone_id 137 | name = "prom-${count.index + 1}" 138 | type = "A" 139 | 140 | alias { 141 | name = aws_lb.prometheus_alb.dns_name 142 | zone_id = aws_lb.prometheus_alb.zone_id 143 | evaluate_target_health = false 144 | } 145 | } 146 | 147 | resource "aws_lb" "prometheus_alb" { 148 | name = "${var.environment}-prometheus-alb" 149 | internal = false 150 | load_balancer_type = "application" 151 | 152 | security_groups = [data.terraform_remote_state.infra_security_groups.outputs.prometheus_alb_sg_id] 153 | 154 | subnets = var.subnets 155 | 156 | tags = merge( 157 | local.default_tags, 158 | { 159 | Name = "${var.environment}-prometheus-alb" 160 | Service = "observe-prometheus" 161 | }, 162 | ) 163 | } 164 | 165 | resource "aws_lb_listener" "prometheus_listener_http" { 166 | load_balancer_arn = aws_lb.prometheus_alb.arn 167 | port = "80" 168 | protocol = "HTTP" 169 | 170 | default_action { 171 | type = "redirect" 172 | 173 | redirect { 174 | port = "443" 175 | protocol = "HTTPS" 176 | status_code = "HTTP_301" 177 | } 178 | } 179 | } 180 | 181 | resource "aws_lb_listener" "prometheus_listener_https" { 182 | load_balancer_arn = aws_lb.prometheus_alb.arn 183 | port = "443" 184 | protocol = "HTTPS" 185 | ssl_policy = "ELBSecurityPolicy-TLS-1-2-2017-01" 186 | certificate_arn = aws_acm_certificate_validation.prometheus_cert.certificate_arn 187 | 188 | default_action { 189 | type = "fixed-response" 190 | 191 | fixed_response { 192 | content_type = "text/plain" 193 | message_body = "Not found" 194 | status_code = "404" 195 | } 196 | } 197 | } 198 | 199 | resource "aws_lb_listener_rule" "prom_listener_https" { 200 | count = var.prometheus_count 201 | 202 | listener_arn = aws_lb_listener.prometheus_listener_https.arn 203 | priority = 100 + count.index 204 | 205 | action { 206 | type = "forward" 207 | target_group_arn = element(aws_lb_target_group.prometheus_tg.*.arn, count.index) 208 | } 209 | 210 | condition { 211 | host_header { 212 | values = ["prom-${count.index + 1}.*"] 213 | } 214 | } 215 | } 216 | 217 | resource "aws_lb_target_group" "prometheus_tg" { 218 | count = var.prometheus_count 219 | 220 | name = "${var.environment}-prom-${count.index + 1}-tg" 221 | port = 80 222 | protocol = "HTTP" 223 | vpc_id = local.vpc_id 224 | deregistration_delay = 30 225 | 226 | health_check 
{ 227 | interval = "10" 228 | path = "/health" # static health check on nginx auth proxy 229 | matcher = "200" 230 | protocol = "HTTP" 231 | healthy_threshold = 2 232 | unhealthy_threshold = 2 233 | timeout = "5" 234 | } 235 | } 236 | 237 | ## Outputs 238 | 239 | output "prom_public_record_fqdns" { 240 | value = aws_route53_record.prom_alias.*.fqdn 241 | description = "Prometheus public DNS FQDNs" 242 | } 243 | 244 | output "prometheus_target_group_ids" { 245 | value = aws_lb_target_group.prometheus_tg.*.arn 246 | description = "Prometheus target group IDs" 247 | } 248 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/observe-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: RE_Observe 3 | rules: 4 | - alert: RE_Observe_Grafana_Down 5 | expr: up{job="grafana-paas"} == 0 6 | for: 5m 7 | labels: 8 | product: "prometheus" 9 | severity: "page" 10 | annotations: 11 | summary: "Prometheus is not able to scrape Grafana" 12 | message: "Prometheus has not successfully scraped {{ $labels.job }} in the last 5 minutes. https://grafana-paas.cloudapps.digital/ may be down." 13 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=()&_a=(columns:!(_source),index:'*-*',interval:h,query:(query_string:(query:'grafana-paas.cloudapps.digital%20AND%20NOT%20access.response_code:200')),sort:!('@timestamp',desc))" 14 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-grafana-down" 15 | 16 | - alert: RE_Observe_AlertManager_Below_Threshold 17 | expr: sum(up{job="alertmanager"}) <= 1 18 | for: 10s 19 | labels: 20 | product: "prometheus" 21 | severity: "page" 22 | annotations: 23 | summary: "There is one or fewer Alertmanagers that can be scraped" 24 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-alertmanager-below-threshold" 25 | 26 | - alert: RE_Observe_Prometheus_Below_Threshold 27 | expr: sum(up{job="prometheus"}) <= 1 28 | for: 10s 29 | labels: 30 | product: "prometheus" 31 | severity: "page" 32 | annotations: 33 | summary: "There is one or fewer Prometheis that can be scraped" 34 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 35 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-below-threshold" 36 | 37 | - alert: RE_Observe_Prometheus_AtLeastOneMissing 38 | expr: sum(up{job="prometheus"}) < 3 39 | for: 3m 40 | labels: 41 | product: "prometheus" 42 | severity: "ticket" 43 | annotations: 44 | summary: "At least one Prometheus can't be scraped" 45 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 46 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-at-least-one-missing" 47 | 48 | - alert: RE_Observe_PrometheusDiskPredictedToFill 49 | expr: | 50 | predict_linear( 51 | 
node_filesystem_avail{job="prometheus_node", mountpoint="/mnt"}[12h], 3 * 24 * 60 * 60 52 | ) <= 0 53 | and on(instance) 54 | (time() - node_creation_time > 12 * 60 * 60) 55 | labels: 56 | product: "prometheus" 57 | severity: "ticket" 58 | annotations: 59 | summary: "Instance {{ $labels.instance }} disk {{ $labels.mountpoint }} is predicted to fill in 72h" 60 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 61 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-disk-predicted-to-fill" 62 | 63 | - alert: RE_Observe_No_Paas_Targets 64 | expr: prometheus_sd_discovered_targets{config=~"paas-(london|ireland)-targets"} == 0 65 | for: 10m 66 | labels: 67 | product: "prometheus" 68 | severity: "page" 69 | annotations: 70 | summary: "No PaaS targets detected" 71 | message: "No PaaS file_sd targets were detected from the service broker. Is there a problem accessing the targets bucket?" 72 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 73 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-no-filesd-targets" 74 | 75 | - alert: RE_Observe_Prometheus_Over_Capacity 76 | expr: sum without(slice)(rate(prometheus_engine_query_duration_seconds_sum{job="prometheus"}[5m])) > 8 77 | for: 10s 78 | labels: 79 | product: "prometheus" 80 | severity: "page" 81 | annotations: 82 | summary: "Service is over capacity." 83 | message: "The service name is {{ $labels.job }}. The URL experiencing the issue is {{ $labels.instance }}." 84 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 85 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-over-capacity" 86 | 87 | - alert: RE_Observe_Prometheus_High_Load 88 | expr: sum without(slice)(rate(prometheus_engine_query_duration_seconds_sum{job="prometheus"}[2h])) > 4 89 | labels: 90 | product: "prometheus" 91 | severity: "ticket" 92 | annotations: 93 | summary: "Service is approaching capacity." 94 | message: "The service name is {{ $labels.job }}. The URL experiencing the issue is {{ $labels.instance }}." 
95 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 96 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-high-load" 97 | 98 | - alert: RE_Observe_Target_Down 99 | expr: up{} == 0 100 | for: 24h 101 | labels: 102 | product: "prometheus" 103 | severity: "ticket" 104 | annotations: 105 | summary: "{{ $labels.job }} target is down" 106 | message: "One of the {{ $labels.job }} targets has been down for 24 hours" 107 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-target-down" 108 | 109 | - alert: RE_Observe_No_Successful_Updates 110 | expr: sum(increase(observe_broker_http_requests_total{code="200", path="/update-targets", method="post"}[30m])) by (region) == 0 111 | for: 12h 112 | labels: 113 | product: "prometheus" 114 | severity: "ticket" 115 | annotations: 116 | summary: "No recent target updates in region '{{ $labels.region }}'" 117 | message: "Target update in region '{{ $labels.region }}' hasn't completed successfully in at least 12h" 118 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-no-successful-updates" 119 | 120 | - alert: AlwaysAlert 121 | annotations: 122 | message: | 123 | This is an alert meant to ensure that the entire alerting pipeline is functional. 124 | This alert is always firing, therefore it should always be firing in Alertmanager 125 | and always fire against a receiver. We use cronitor to alert us if this ever 126 | *doesn't* fire, because this indicates a problem with our alerting pipeline 127 | expr: vector(1) 128 | labels: 129 | product: "prometheus" 130 | severity: "constant" 131 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/cloud.conf: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | package_update: true 3 | package_upgrade: true 4 | packages: ['prometheus', 'prometheus-node-exporter', 'awscli', 'inotify-tools', 'nginx', 'jq'] 5 | 6 | write_files: 7 | - owner: root:root 8 | path: /etc/default/prometheus 9 | permissions: 0444 10 | content: 'ARGS="--storage.tsdb.path=\"/mnt/\" --web.external-url=${prom_external_url} --storage.tsdb.retention=60d --query.timeout=30s"' 11 | - owner: root:root 12 | path: /etc/cron.d/config_pull 13 | permissions: 0755 14 | content: | 15 | * * * * * root flock -w 30 /run/lock/prometheus-config-updates aws s3 sync s3://${config_bucket}/prometheus/ /etc/prometheus/ --region=${region} 16 | @reboot root /root/watch_prometheus_dir 17 | - owner: root:root 18 | path: /etc/cron.d/ireland_targets_pull 19 | permissions: 0755 20 | content: | 21 | # if targets bucket exists then sync it, otherwise this cron runs but has no effect 22 | * * * * * root [ "${ireland_targets_bucket}" != "" ] && aws s3 sync s3://${ireland_targets_bucket}/active/ /etc/prometheus/ireland-targets --region=${region} --delete 23 | - owner: root:root 24 | path: /etc/cron.d/london_targets_pull 25 | permissions: 0755 26 | content: | 27 | # if targets bucket exists then sync it, otherwise this cron runs but has no effect 28 | * * * * * root [ "${london_targets_bucket}" != "" ] && aws s3 sync s3://${london_targets_bucket}/active/ 
/etc/prometheus/london-targets --region=${region} --delete 29 | - owner: root:root 30 | path: /etc/cron.d/alerts_pull 31 | permissions: 0755 32 | content: | 33 | # if alerts bucket exists then sync it, otherwise this cron runs but has no effect 34 | * * * * * root [ "${alerts_bucket}" != "" ] && aws s3 sync s3://${alerts_bucket}/prometheus/alerts/ /etc/prometheus/alerts --region=${region} --delete 35 | - content: | 36 | echo 'Configuring prometheus EBS' 37 | vol="" 38 | while [ -z "$vol" ]; do 39 | # adapted from 40 | # https://medium.com/@moonape1226/mount-aws-ebs-on-ec2-automatically-with-cloud-init-e5e837e5438a 41 | # [Last accessed on 2020-04-02] 42 | vol=$(lsblk | grep -e disk | awk '{sub("G","",$4)} {if ($4+0 == ${data_volume_size}) print $1}') 43 | echo "still waiting for data volume ; sleeping 5" 44 | sleep 5 45 | done 46 | echo "found volume /dev/$vol" 47 | if [ -z "$(lsblk | grep "$vol" | awk '{print $7}')" ] ; then 48 | if [ -z "$(blkid /dev/$vol | grep ext4)" ] ; then 49 | echo "volume /dev/$vol is not formatted ; formatting" 50 | mkfs -F -t ext4 -L 'prometheus_disk' "/dev/$vol" 51 | else 52 | echo "volume /dev/$vol is already formatted" 53 | fi 54 | 55 | echo "volume /dev/$vol is not mounted ; mounting" 56 | mount "/dev/$vol" /mnt 57 | UUID=$(blkid /dev/$vol -s UUID -o value) 58 | if [ -z "$(grep $UUID /etc/fstab)" ] ; then 59 | echo "writing fstab entry" 60 | 61 | echo "UUID=$UUID /mnt ext4 defaults,nofail 0 2" >> /etc/fstab 62 | fi 63 | fi 64 | echo "ensuring fs block size matches volume block size" 65 | resize2fs "/dev/$vol" 66 | path: /root/manage_data_volume.sh 67 | permissions: 0755 68 | - content: | 69 | #!/bin/bash 70 | STATUS_JSON='/srv/prometheus-last-config.json' 71 | 72 | attempt_reload() { 73 | ( 74 | # take out lock to ensure updater doesn't switch the config between the time we 75 | # calculate NEW_HASH and prometheus reads it 76 | flock 321 77 | 78 | # why md5? 
because it should be the same as the s3 etag and so easy to check 79 | export NEW_HASH=$(md5sum /etc/prometheus/prometheus.yml | cut -d ' ' -f 1) 80 | if systemctl reload prometheus ; then 81 | jq -n '{last_successful_config: env.NEW_HASH, last_reload_successful: true}' > $STATUS_JSON 82 | else 83 | touch $STATUS_JSON 84 | jq '{last_successful_config: .last_successful_config, last_reload_successful: false, failed_config: env.NEW_HASH}' $STATUS_JSON > $STATUS_JSON.tmp && mv $STATUS_JSON.tmp $STATUS_JSON # write via a temp file so jq reads the old status before it is overwritten 85 | fi 86 | 87 | ) 321>/run/lock/prometheus-config-updates 88 | } 89 | 90 | systemctl start prometheus # ensure prometheus is started before initial attempt_reload 91 | attempt_reload 92 | 93 | inotifywait -e modify,create,delete,move -m /etc/prometheus | 94 | while read -r directory events; do 95 | attempt_reload 96 | done 97 | path: /root/watch_prometheus_dir 98 | permissions: 0755 99 | - content: | 100 | #!/bin/bash 101 | curl -L -O https://artifacts.elastic.co/downloads/beats/filebeat/filebeat-6.4.2-amd64.deb && sudo dpkg -i filebeat-6.4.2-amd64.deb 102 | aws s3 sync s3://${config_bucket}/filebeat/ /etc/filebeat/ --region=${region} 103 | update-rc.d filebeat defaults 104 | update-rc.d filebeat enable 5 105 | path: /root/setup_filebeat.sh 106 | permissions: 0755 107 | - content: | 108 | server { 109 | listen 8080; 110 | 111 | location / { 112 | set $cleaned_header $arg_cf_app_instance; 113 | if ($arg_cf_app_instance ~* "^(.*)%3A(.*)$") { 114 | set $cleaned_header $1:$2; 115 | } 116 | proxy_http_version 1.1; 117 | proxy_pass https://$host$uri; 118 | proxy_ssl_server_name on; 119 | proxy_set_header Connection ""; 120 | proxy_set_header X-CF-APP-INSTANCE $cleaned_header; 121 | proxy_set_header XX-CF-APP-INSTANCE $cleaned_header; 122 | proxy_set_header Authorization "Bearer $arg_cf_app_guid"; 123 | } 124 | 125 | location /health { 126 | return 200 "Static health check"; 127 | } 128 | 129 | resolver 10.0.0.2 valid=10s; 130 | } 131 | path: /etc/nginx/sites-enabled/paas-proxy 132 | permissions: 0644 133 | - content: | 134 | ${prometheus_htpasswd} 135 | path: /etc/nginx/conf.d/.htpasswd 136 | owner: www-data:www-data 137 | permissions: 0600 138 | # the package-provided default server conflicts with auth-proxy 139 | # below and causes package installation to fail because of a 140 | # duplicate default_server on port 80.
So we wipe the default 141 | # server (and then remove it in runcmd at the bottom) 142 | - content: "" 143 | path: /etc/nginx/sites-enabled/default 144 | - content: | 145 | server { 146 | listen 80 default_server; 147 | 148 | location /health { 149 | # This location is not protected by basic auth because of 150 | # https://stackoverflow.com/questions/40447376/auth-basic-within-location-block-doesnt-work-when-return-is-specified 151 | return 200 "Static health check"; 152 | } 153 | 154 | location = /last-config { 155 | default_type application/json; 156 | alias /srv/prometheus-last-config.json; 157 | } 158 | 159 | location / { 160 | proxy_pass http://localhost:9090; 161 | proxy_set_header X-Real-IP $remote_addr; 162 | proxy_set_header Host $host; 163 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 164 | } 165 | 166 | satisfy any; 167 | auth_basic "Prometheus"; 168 | auth_basic_user_file /etc/nginx/conf.d/.htpasswd; 169 | 170 | real_ip_header X-Forwarded-For; 171 | set_real_ip_from 10.0.0.0/8; 172 | set_real_ip_from 127.0.0.1/32; 173 | ${allowed_cidrs} 174 | deny all; 175 | } 176 | path: /etc/nginx/sites-enabled/auth-proxy 177 | 178 | runcmd: 179 | - rm /etc/nginx/sites-enabled/default 180 | - "if [ -n '${logstash_host}' ]; then /root/setup_filebeat.sh; fi" 181 | - [bash, -c, "/root/manage_data_volume.sh"] 182 | - [bash, -c, "chown -R prometheus /mnt/"] 183 | - [bash, -c, "echo \"node_creation_time `date +%s`\" > /var/lib/prometheus/node-exporter/node-creation-time.prom"] 184 | - [bash, -c, "rm /etc/resolv.conf && sed -e 's/ trust-ad//' < /run/systemd/resolve/stub-resolv.conf > /etc/resolv.conf"] 185 | - [reboot] 186 | -------------------------------------------------------------------------------- /terraform/projects/prom-ec2/paas-production/extra-prometheus-scrape-configs.yml.tpl: -------------------------------------------------------------------------------- 1 | - job_name: dcs-federate 2 | scheme: https 3 | honor_labels: true 4 | honor_timestamps: true 5 | metrics_path: '/federate' 6 | params: 7 | "match[]": 8 | # fetch everything (via https://stackoverflow.com/a/39253848 ) 9 | - '{__name__=~".+"}' 10 | static_configs: 11 | - targets: 12 | - dcs-build-internal-prometheus.london.cloudapps.digital 13 | labels: 14 | federated_from: dcs-build-internal-prometheus.london.cloudapps.digital 15 | - targets: 16 | - dcs-integration-internal-prometheus.london.cloudapps.digital 17 | labels: 18 | federated_from: dcs-integration-internal-prometheus.london.cloudapps.digital 19 | - targets: 20 | - dcs-production-internal-prometheus.london.cloudapps.digital 21 | labels: 22 | federated_from: dcs-production-internal-prometheus.london.cloudapps.digital 23 | 24 | 25 | - job_name: paas_elasticsearch_for_dm 26 | scheme: https 27 | basic_auth: 28 | username: digitalmarketplace 29 | password: ${dm_elasticsearch_metrics_password} 30 | metrics_path: '/federate' 31 | params: 32 | "match[]": 33 | - "{job='aiven'}" 34 | static_configs: 35 | - targets: 36 | - digitalmarketplace-es-metrics.cloudapps.digital 37 | metric_relabel_configs: 38 | # Prepend `paas_es_` so the metrics are easier to find 39 | - action: replace 40 | source_labels: [__name__] 41 | target_label: __name__ 42 | regex: (.*) 43 | replacement: paas_es_$${1} 44 | # Dummy entry to be used below 45 | - &store_this_metric 46 | action: replace 47 | target_label: __store_this__ 48 | replacement: store_this 49 | source_labels: [__name__] 50 | regex: __dummy_metric_name 51 | # One entry for each metric you want to import into Prometheus. 
52 | # (Or remove this and the drop rules below it in order to import all 53 | # nearly 1000 metrics.) 54 | - <<: *store_this_metric 55 | regex: paas_es_disk_free 56 | - <<: *store_this_metric 57 | regex: paas_es_disk_used_percent 58 | - <<: *store_this_metric 59 | regex: paas_es_diskio_io_time 60 | - <<: *store_this_metric 61 | regex: paas_es_diskio_iops_in_progress 62 | - <<: *store_this_metric 63 | regex: paas_es_diskio_read_time 64 | - <<: *store_this_metric 65 | regex: paas_es_diskio_write_time 66 | - <<: *store_this_metric 67 | regex: paas_es_swap_used_percent 68 | - <<: *store_this_metric 69 | regex: paas_es_system_load1 70 | - <<: *store_this_metric 71 | regex: paas_es_system_load5 72 | - <<: *store_this_metric 73 | regex: paas_es_system_load15 74 | - <<: *store_this_metric 75 | regex: paas_es_net_bytes_recv 76 | - <<: *store_this_metric 77 | regex: paas_es_net_bytes_sent 78 | - <<: *store_this_metric 79 | regex: paas_es_elasticsearch_clusterstats_nodes_os_mem_free_percent 80 | - <<: *store_this_metric 81 | regex: paas_es_elasticsearch_clusterstats_nodes_os_mem_used_percent 82 | - <<: *store_this_metric 83 | regex: paas_es_elasticsearch_clusterstats_nodes_process_cpu_percent 84 | - <<: *store_this_metric 85 | regex: paas_es_elasticsearch_clusterstats_indices_count 86 | - <<: *store_this_metric 87 | regex: paas_es_elasticsearch_clusterstats_indices_docs_count 88 | - <<: *store_this_metric 89 | regex: paas_es_elasticsearch_clusterstats_indices_docs_deleted 90 | - <<: *store_this_metric 91 | regex: paas_es_elasticsearch_clusterstats_indices_query_cache_miss_count 92 | - <<: *store_this_metric 93 | regex: paas_es_elasticsearch_clusterstats_indices_store_size_in_bytes 94 | - <<: *store_this_metric 95 | regex: paas_es_elasticsearch_clusterstats_nodes_count_master 96 | - <<: *store_this_metric 97 | regex: paas_es_elasticsearch_clusterstats_nodes_count_total 98 | - <<: *store_this_metric 99 | regex: paas_es_elasticsearch_clusterstats_nodes_fs_available_in_bytes 100 | - <<: *store_this_metric 101 | regex: paas_es_elasticsearch_clusterstats_nodes_fs_free_in_bytes 102 | - <<: *store_this_metric 103 | regex: paas_es_elasticsearch_clusterstats_nodes_fs_total_in_bytes 104 | - <<: *store_this_metric 105 | regex: paas_es_elasticsearch_clusterstats_nodes_jvm_mem_heap_max_in_bytes 106 | - <<: *store_this_metric 107 | regex: paas_es_elasticsearch_clusterstats_nodes_jvm_mem_heap_used_in_bytes 108 | - <<: *store_this_metric 109 | regex: paas_es_elasticsearch_clusterstats_nodes_jvm_threads 110 | - <<: *store_this_metric 111 | regex: paas_es_elasticsearch_clusterstats_nodes_process_open_file_descriptors_avg 112 | - <<: *store_this_metric 113 | regex: paas_es_elasticsearch_clusterstats_nodes_process_open_file_descriptors_max 114 | - <<: *store_this_metric 115 | regex: paas_es_elasticsearch_clusterstats_nodes_process_open_file_descriptors_min 116 | - <<: *store_this_metric 117 | regex: paas_es_elasticsearch_cluster_health_active_primary_shards 118 | - <<: *store_this_metric 119 | regex: paas_es_elasticsearch_cluster_health_active_shards 120 | - <<: *store_this_metric 121 | regex: paas_es_elasticsearch_cluster_health_active_shards_percent_as_number 122 | - <<: *store_this_metric 123 | regex: paas_es_elasticsearch_cluster_health_initializing_shards 124 | - <<: *store_this_metric 125 | regex: paas_es_elasticsearch_cluster_health_number_of_data_nodes 126 | - <<: *store_this_metric 127 | regex: paas_es_elasticsearch_cluster_health_number_of_nodes 128 | - <<: *store_this_metric 129 | regex: 
paas_es_elasticsearch_cluster_health_number_of_pending_tasks 130 | - <<: *store_this_metric 131 | regex: paas_es_elasticsearch_cluster_health_relocating_shards 132 | - <<: *store_this_metric 133 | regex: paas_es_elasticsearch_cluster_health_status_code 134 | - <<: *store_this_metric 135 | regex: paas_es_elasticsearch_cluster_health_task_max_waiting_in_queue_millis 136 | - <<: *store_this_metric 137 | regex: paas_es_elasticsearch_cluster_health_unassigned_shards 138 | - <<: *store_this_metric 139 | regex: paas_es_elasticsearch_indices_docs_count 140 | - <<: *store_this_metric 141 | regex: paas_es_elasticsearch_indices_docs_deleted 142 | - <<: *store_this_metric 143 | regex: paas_es_elasticsearch_indices_request_cache_hit_count 144 | - <<: *store_this_metric 145 | regex: paas_es_elasticsearch_indices_request_cache_miss_count 146 | - <<: *store_this_metric 147 | regex: paas_es_elasticsearch_os_cpu_load_average_15m 148 | - <<: *store_this_metric 149 | regex: paas_es_elasticsearch_os_cpu_load_average_1m 150 | - <<: *store_this_metric 151 | regex: paas_es_elasticsearch_os_cpu_load_average_5m 152 | - <<: *store_this_metric 153 | regex: paas_es_elasticsearch_os_cpu_percent 154 | - <<: *store_this_metric 155 | regex: paas_es_elasticsearch_os_mem_free_percent 156 | - <<: *store_this_metric 157 | regex: paas_es_elasticsearch_os_mem_used_percent 158 | - <<: *store_this_metric 159 | regex: paas_es_elasticsearch_os_swap_total_in_bytes 160 | - <<: *store_this_metric 161 | regex: paas_es_elasticsearch_os_swap_used_in_bytes 162 | - <<: *store_this_metric 163 | regex: paas_es_elasticsearch_process_max_file_descriptors 164 | - <<: *store_this_metric 165 | regex: paas_es_elasticsearch_process_open_file_descriptors 166 | - <<: *store_this_metric 167 | regex: paas_es_elasticsearch_jvm_gc_collectors_old_collection_count 168 | - <<: *store_this_metric 169 | regex: paas_es_elasticsearch_jvm_gc_collectors_old_collection_time_in_millis 170 | - <<: *store_this_metric 171 | regex: paas_es_elasticsearch_jvm_gc_collectors_young_collection_count 172 | - <<: *store_this_metric 173 | regex: paas_es_elasticsearch_jvm_gc_collectors_young_collection_time_in_millis 174 | - <<: *store_this_metric 175 | regex: paas_es_elasticsearch_jvm_mem_heap_used_percent 176 | - <<: *store_this_metric 177 | regex: paas_es_elasticsearch_jvm_uptime_in_millis 178 | # Drop metrics we don't want to keep 179 | - source_labels: [__store_this__] 180 | regex: ^store_this$ 181 | action: keep 182 | # Drop the temporary label 183 | - regex: ^__store_this__$ 184 | action: labeldrop 185 | - job_name: paas_redis_metrics_for_dm 186 | scheme: https 187 | basic_auth: 188 | username: ${dm_paas_metrics_username} 189 | password: ${dm_paas_metrics_password} 190 | static_configs: 191 | - targets: 192 | - redis.metrics.cloud.service.gov.uk 193 | metrics_path: /metrics 194 | scrape_interval: 300s 195 | scrape_timeout: 120s 196 | honor_timestamps: true 197 | metric_relabel_configs: 198 | # Prepend `paas_redis_` so the metrics are easier to find 199 | - action: replace 200 | source_labels: [__name__] 201 | target_label: __name__ 202 | regex: (.*) 203 | replacement: paas_redis_$${1} 204 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/alertmanager-service.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ECS service that runs alertmanager 3 | * 4 | */ 5 | 6 | ### container, task, service definitions 7 | 8 | resource "aws_ecs_cluster" "prometheus_cluster" { 
9 | name = "${var.environment}-ecs-monitoring" 10 | 11 | tags = merge(local.default_tags, { 12 | Name = "${var.environment}-alertmanager" 13 | }) 14 | } 15 | 16 | resource "aws_iam_role" "execution" { 17 | name = "${var.environment}-alertmanager-execution" 18 | 19 | assume_role_policy = <<-EOF 20 | { 21 | "Version": "2012-10-17", 22 | "Statement": [ 23 | { 24 | "Effect": "Allow", 25 | "Principal": { 26 | "Service": "ecs-tasks.amazonaws.com" 27 | }, 28 | "Action": "sts:AssumeRole" 29 | } 30 | ] 31 | } 32 | EOF 33 | 34 | tags = merge(local.default_tags, { 35 | Name = "${var.environment}-alertmanager-execution" 36 | }) 37 | } 38 | 39 | resource "aws_iam_policy" "execution" { 40 | name = "${var.environment}-alertmanager-execution" 41 | 42 | policy = <<-EOF 43 | { 44 | "Version": "2012-10-17", 45 | "Statement": [ 46 | { 47 | "Effect": "Allow", 48 | "Action": [ 49 | "logs:CreateLogStream", 50 | "logs:PutLogEvents" 51 | ], 52 | "Resource": "*" 53 | } 54 | ] 55 | } 56 | EOF 57 | 58 | } 59 | 60 | resource "aws_iam_role_policy_attachment" "execution_execution" { 61 | role = aws_iam_role.execution.name 62 | policy_arn = aws_iam_policy.execution.arn 63 | } 64 | 65 | data "template_file" "alertmanager_nlb_container_defn" { 66 | template = file("${path.module}/task-definitions/alertmanager.json") 67 | 68 | vars = { 69 | alertmanager_config_base64 = base64encode(data.template_file.alertmanager_config_file.rendered) 70 | templates_base64 = base64encode(file("${path.module}/templates/default.tmpl")) 71 | alertmanager_url = "--web.external-url=https://${aws_route53_record.alerts_alias.fqdn}" 72 | log_group = aws_cloudwatch_log_group.task_logs.name 73 | region = var.aws_region 74 | } 75 | 76 | depends_on = [ 77 | module.assertion_alertmanager_config_file_valid_yaml.checked, 78 | ] 79 | } 80 | 81 | module "assertion_alertmanager_nlb_container_defn_valid_json" { 82 | source = "github.com/Invicton-Labs/terraform-null-assertion?ref=47d7354cc5521853fbe8df96b7bb0223bea732cd" 83 | 84 | condition = can(jsondecode(data.template_file.alertmanager_nlb_container_defn.rendered)) 85 | 86 | error_message = "Alertmanager NLB container definition failed JSON parsing" 87 | } 88 | 89 | resource "aws_ecs_task_definition" "alertmanager_nlb" { 90 | family = "${var.environment}-alertmanager" 91 | container_definitions = data.template_file.alertmanager_nlb_container_defn.rendered 92 | network_mode = "awsvpc" 93 | execution_role_arn = aws_iam_role.execution.arn 94 | requires_compatibilities = ["FARGATE"] 95 | cpu = 256 96 | memory = 512 97 | 98 | tags = merge(local.default_tags, { 99 | Name = "${var.environment}-alertmanager" 100 | }) 101 | 102 | depends_on = [ 103 | module.assertion_alertmanager_nlb_container_defn_valid_json.checked, 104 | ] 105 | } 106 | 107 | resource "aws_ecs_service" "alertmanager_alb" { 108 | for_each = { 109 | for _, subnet in data.aws_subnet.private_subnets : 110 | subnet.id => subnet.availability_zone 111 | } 112 | name = "${var.environment}-alertmanager-alb-${each.value}" 113 | cluster = "${var.environment}-ecs-monitoring" 114 | task_definition = aws_ecs_task_definition.alertmanager_nlb.arn 115 | desired_count = 1 116 | launch_type = "FARGATE" 117 | 118 | wait_for_steady_state = true 119 | 120 | load_balancer { 121 | target_group_arn = aws_lb_target_group.alertmanager_all.arn 122 | container_name = "alertmanager" 123 | container_port = 9093 124 | } 125 | 126 | load_balancer { 127 | target_group_arn = aws_lb_target_group.alertmanager_per_az[each.value].arn 128 | container_name = "alertmanager" 129 | 
container_port = 9093 130 | } 131 | 132 | network_configuration { 133 | subnets = [each.key] 134 | security_groups = [aws_security_group.alertmanager_task.id] 135 | } 136 | 137 | service_registries { 138 | registry_arn = aws_service_discovery_service.alertmanager.arn 139 | } 140 | } 141 | 142 | #### alertmanager 143 | 144 | data "pass_password" "observe_pagerduty_key" { 145 | path = "pagerduty/integration-keys/production" 146 | } 147 | 148 | data "pass_password" "dgu_pagerduty_key" { 149 | path = "pagerduty/integration-keys/dgu" 150 | } 151 | 152 | data "pass_password" "govuk_pagerduty_key" { 153 | path = "pagerduty/integration-keys/govuk" 154 | } 155 | 156 | data "pass_password" "verify_p1_pagerduty_key" { 157 | path = "pagerduty/integration-keys/verify-p1" 158 | } 159 | 160 | data "pass_password" "verify_p2_pagerduty_key" { 161 | path = "pagerduty/integration-keys/verify-p2" 162 | } 163 | 164 | data "pass_password" "dcs_p2_pagerduty_key" { 165 | path = "pagerduty/integration-keys/dcs-p2" 166 | } 167 | 168 | data "pass_password" "slack_api_url" { 169 | path = "slack-api-url" 170 | } 171 | 172 | data "pass_password" "notify_zendesk" { 173 | path = "receivers/notify/zendesk" 174 | } 175 | 176 | data "pass_password" "notify_p2_pagerduty_key" { 177 | path = "receivers/notify/p2_pagerduty" 178 | } 179 | 180 | data "pass_password" "autom8_email" { 181 | path = "receivers/autom8/email" 182 | 183 | } 184 | 185 | data "pass_password" "verify_staging_cronitor" { 186 | path = "cronitor/verify-staging-url" 187 | } 188 | 189 | data "pass_password" "verify_integration_cronitor" { 190 | path = "cronitor/verify-integration-url" 191 | } 192 | 193 | data "pass_password" "verify_prod_cronitor" { 194 | path = "cronitor/verify-prod-url" 195 | } 196 | 197 | data "template_file" "alertmanager_config_file" { 198 | template = file("${path.module}/templates/alertmanager.tpl") 199 | 200 | vars = { 201 | observe_pagerduty_key = data.pass_password.observe_pagerduty_key.password 202 | dgu_pagerduty_key = data.pass_password.dgu_pagerduty_key.password 203 | govuk_pagerduty_key = data.pass_password.govuk_pagerduty_key.password 204 | verify_p1_pagerduty_key = data.pass_password.verify_p1_pagerduty_key.password 205 | verify_p2_pagerduty_key = data.pass_password.verify_p2_pagerduty_key.password 206 | dcs_p2_pagerduty_key = data.pass_password.dcs_p2_pagerduty_key.password 207 | slack_api_url = data.pass_password.slack_api_url.password 208 | notify_zendesk = data.pass_password.notify_zendesk.password 209 | notify_p2_pagerduty_key = data.pass_password.notify_p2_pagerduty_key.password 210 | smtp_from = "alerts@${data.terraform_remote_state.infra_networking.outputs.public_subdomain}" 211 | # Port as requested by https://docs.aws.amazon.com/ses/latest/DeveloperGuide/smtp-connect.html 212 | smtp_smarthost = "email-smtp.${var.aws_region}.amazonaws.com:587" 213 | smtp_username = aws_iam_access_key.smtp.id 214 | smtp_password = aws_iam_access_key.smtp.ses_smtp_password_v4 215 | autom8_recipient_email = data.pass_password.autom8_email.password 216 | observe_cronitor = var.observe_cronitor 217 | verify_staging_cronitor = data.pass_password.verify_staging_cronitor.password 218 | verify_integration_cronitor = data.pass_password.verify_integration_cronitor.password 219 | verify_prod_cronitor = data.pass_password.verify_prod_cronitor.password 220 | } 221 | } 222 | 223 | module "assertion_alertmanager_config_file_valid_yaml" { 224 | source = "github.com/Invicton-Labs/terraform-null-assertion?ref=47d7354cc5521853fbe8df96b7bb0223bea732cd" 225 
| 226 | condition = can(yamldecode(data.template_file.alertmanager_config_file.rendered)) 227 | 228 | error_message = "Alertmanager config failed YAML parsing" 229 | } 230 | 231 | ## AWS SES 232 | 233 | resource "aws_ses_domain_identity" "main" { 234 | domain = data.terraform_remote_state.infra_networking.outputs.public_subdomain 235 | } 236 | 237 | resource "aws_route53_record" "txt_amazonses_verification_record" { 238 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 239 | name = "_amazonses.${data.terraform_remote_state.infra_networking.outputs.public_subdomain}" 240 | type = "TXT" 241 | ttl = "600" 242 | records = [aws_ses_domain_identity.main.verification_token] 243 | } 244 | 245 | resource "aws_ses_domain_dkim" "main" { 246 | domain = aws_ses_domain_identity.main.domain 247 | } 248 | 249 | resource "aws_route53_record" "dkim_amazonses_verification_record" { 250 | count = 3 251 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 252 | name = "${element(aws_ses_domain_dkim.main.dkim_tokens, count.index)}._domainkey.${data.terraform_remote_state.infra_networking.outputs.public_subdomain}" 253 | type = "CNAME" 254 | ttl = "600" 255 | records = ["${element(aws_ses_domain_dkim.main.dkim_tokens, count.index)}.dkim.amazonses.com"] 256 | } 257 | 258 | resource "aws_ses_domain_mail_from" "alerts" { 259 | domain = aws_ses_domain_identity.main.domain 260 | mail_from_domain = "mail.${aws_ses_domain_identity.main.domain}" 261 | } 262 | 263 | resource "aws_route53_record" "alerts_ses_domain_mail_from_mx" { 264 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 265 | name = aws_ses_domain_mail_from.alerts.mail_from_domain 266 | type = "MX" 267 | ttl = "600" 268 | records = ["10 feedback-smtp.${var.aws_region}.amazonses.com"] 269 | } 270 | 271 | resource "aws_route53_record" "alerts_ses_domain_mail_from_txt" { 272 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 273 | name = aws_ses_domain_mail_from.alerts.mail_from_domain 274 | type = "TXT" 275 | ttl = "600" 276 | records = ["v=spf1 include:amazonses.com -all"] 277 | } 278 | 279 | # IAM for SMTP 280 | 281 | resource "aws_iam_user" "smtp" { 282 | name = "${var.environment}.smtp" 283 | path = "/system/" 284 | 285 | tags = merge(local.default_tags, { 286 | Name = "${var.environment}-alertmanager-smtp" 287 | }) 288 | } 289 | 290 | resource "aws_iam_access_key" "smtp" { 291 | user = aws_iam_user.smtp.name 292 | } 293 | 294 | resource "aws_iam_user_policy" "smtp_ro" { 295 | name = "${var.environment}.smtp" 296 | user = aws_iam_user.smtp.name 297 | 298 | policy = <