├── .ruby-version ├── .terraform-version ├── tools ├── grafana_info │ ├── .python-version │ ├── requirements.txt │ ├── bearer_auth.py │ ├── show_queries.py │ ├── README.md │ └── find_missing_metrics.py ├── check-alerting-rules.sh └── terraform-format.sh ├── terraform ├── modules │ ├── prom-ec2 │ │ ├── prometheus │ │ │ ├── .ruby-version │ │ │ ├── filebeat.yml.tpl │ │ │ ├── versions.tf │ │ │ ├── output.tf │ │ │ ├── variables.tf │ │ │ ├── iam.tf │ │ │ ├── targets.tf │ │ │ ├── main.tf │ │ │ └── cloud.conf │ │ ├── paas-config │ │ │ ├── outputs.tf │ │ │ ├── versions.tf │ │ │ ├── variables.tf │ │ │ ├── main.tf │ │ │ └── prometheus.conf.tpl │ │ ├── README.md │ │ └── alerts-config │ │ │ └── alerts │ │ │ ├── notify-alerts.yml │ │ │ ├── README.md │ │ │ ├── doc-checking-alerts.yml │ │ │ ├── data-gov-uk-alerts.yml │ │ │ └── observe-alerts.yml │ ├── common │ │ └── ami │ │ │ ├── versions.tf │ │ │ └── main.tf │ ├── app-ecs-albs │ │ ├── versions.tf │ │ └── main.tf │ ├── infra-networking │ │ ├── versions.tf │ │ └── main.tf │ ├── infra-security-groups │ │ ├── versions.tf │ │ └── main.tf │ └── alertmanager │ │ ├── versions.tf │ │ ├── service_discovery.tf │ │ ├── templates │ │ ├── default.tmpl │ │ └── alertmanager.tpl │ │ ├── task-definitions │ │ └── alertmanager.json │ │ ├── certificate.tf │ │ ├── main.tf │ │ ├── security-group.tf │ │ ├── alb.tf │ │ └── alertmanager-service.tf └── projects │ ├── app-ecs-albs-staging │ ├── versions.tf │ └── main.tf │ ├── app-ecs-albs-production │ ├── versions.tf │ └── main.tf │ ├── infra-networking-staging │ ├── versions.tf │ └── main.tf │ ├── infra-networking-production │ ├── versions.tf │ └── main.tf │ ├── infra-security-groups-staging │ ├── versions.tf │ └── main.tf │ ├── infra-security-groups-production │ ├── versions.tf │ └── main.tf │ ├── prom-ec2 │ ├── paas-production │ │ ├── versions.tf │ │ ├── main.tf │ │ └── extra-prometheus-scrape-configs.yml.tpl │ └── paas-staging │ │ ├── versions.tf │ │ └── main.tf │ ├── alertmanager-staging │ ├── versions.tf │ └── main.tf │ └── alertmanager-production │ ├── versions.tf │ └── main.tf ├── Brewfile ├── CODEOWNERS ├── ci ├── deploy.vars.default.yml ├── tasks │ ├── generate-prometheus-test-jq.yml │ ├── wait-ecs-services-stable.yml │ ├── deploy-project.yml │ └── http-ping.yml ├── images │ └── task │ │ ├── assume-role │ │ └── Dockerfile └── deploy.yml ├── .gitignore ├── .travis.yml ├── LICENCE ├── README.md └── logstash └── prometheus-for-paas-production.conf /.ruby-version: -------------------------------------------------------------------------------- 1 | 2.6.1 2 | -------------------------------------------------------------------------------- /.terraform-version: -------------------------------------------------------------------------------- 1 | 0.13.3 2 | -------------------------------------------------------------------------------- /tools/grafana_info/.python-version: -------------------------------------------------------------------------------- 1 | 3.6.6 2 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/.ruby-version: -------------------------------------------------------------------------------- 1 | 2.6.1 2 | -------------------------------------------------------------------------------- /Brewfile: -------------------------------------------------------------------------------- 1 | tap "alphagov/gds" 2 | 3 | brew "jq" 4 | brew "tfenv" 5 | brew "gds-cli" 6 | -------------------------------------------------------------------------------- 
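The pinned tool versions above are consumed by the usual version managers, and the Brewfile drives dependency installation. A minimal setup sketch, assuming Homebrew plus the tfenv it installs (pyenv is not listed in the Brewfile, so treat that step as optional):

```shell
# Install jq, tfenv and gds-cli as declared in the Brewfile
brew bundle

# tfenv picks up 0.13.3 from the .terraform-version file in the repo root
tfenv install

# Optional: pyenv reads tools/grafana_info/.python-version (3.6.6)
# when working on the Grafana scripts
pyenv install 3.6.6
```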
/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/en/articles/about-code-owners 2 | * @alphagov/re-autom8 3 | -------------------------------------------------------------------------------- /tools/grafana_info/requirements.txt: -------------------------------------------------------------------------------- 1 | grafana-api==0.2.4 2 | simplejson==3.16.0 3 | pyyaml>=4.2b1 4 | -------------------------------------------------------------------------------- /ci/deploy.vars.default.yml: -------------------------------------------------------------------------------- 1 | background-image: "" 2 | prometheus-aws-configuration-beta-branch: master 3 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/outputs.tf: -------------------------------------------------------------------------------- 1 | output "prometheus_config_etag" { 2 | value = aws_s3_bucket_object.prometheus_config.etag 3 | } 4 | -------------------------------------------------------------------------------- /terraform/modules/common/ami/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /tools/check-alerting-rules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Check prometheus alerting rules using promtool 4 | # 5 | set -e 6 | 7 | promtool check rules ./terraform/modules/prom-ec2/alerts-config/alerts/*.yml 8 | -------------------------------------------------------------------------------- /terraform/modules/app-ecs-albs/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/infra-networking/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/infra-security-groups/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/app-ecs-albs-staging/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/app-ecs-albs-production/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | 
required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/infra-networking-staging/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/infra-networking-production/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/infra-security-groups-staging/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /terraform/projects/infra-security-groups-production/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /tools/grafana_info/bearer_auth.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | class BearerAuth(requests.auth.AuthBase): 4 | def __init__(self, token): 5 | self.token = token 6 | 7 | def __call__(self, r): 8 | r.headers['Authorization'] = 'Bearer %s' % self.token 9 | return r 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # terraform state files 2 | .terraform/ 3 | *.tfst* 4 | 5 | # editor config stuff 6 | .idea 7 | .idea/*/** 8 | .vscode 9 | .*.swp 10 | 11 | # os files 12 | .DS_Store 13 | 14 | *.plan 15 | 16 | /tools/prometheus-configs/**/data 17 | /tools/prometheus-configs/log-cache-adapter/token 18 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/filebeat.yml.tpl: -------------------------------------------------------------------------------- 1 | filebeat.inputs: 2 | - type: log 3 | enabled: true 4 | paths: 5 | - /var/log/syslog 6 | 7 | output.logstash: 8 | hosts: ["${logstash_host}"] 9 | loadbalance: true 10 | ssl.enabled: true 11 | 12 | tags: ["prometheus", "${environment}"] 13 | -------------------------------------------------------------------------------- /ci/tasks/generate-prometheus-test-jq.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | inputs: 3 | - name: input 4 | outputs: 5 | - name: output 6 | run: 7 | path: sh 8 | args: 9 | - -euxc 10 | - | 11 | echo ".last_successful_config == $(jq '.prometheus_config_etag.value' input/terraform-outputs.json)" > output/test.jq 12 | -------------------------------------------------------------------------------- /terraform/projects/prom-ec2/paas-production/versions.tf: 
-------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | pass = { 8 | source = "camptocamp/pass" 9 | version = "1.4.0" 10 | } 11 | } 12 | required_version = ">= 0.13" 13 | } 14 | -------------------------------------------------------------------------------- /terraform/projects/prom-ec2/paas-staging/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | pass = { 8 | source = "camptocamp/pass" 9 | version = "1.4.0" 10 | } 11 | } 12 | required_version = ">= 0.13" 13 | } 14 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | } 8 | pass = { 9 | source = "camptocamp/pass" 10 | } 11 | template = { 12 | source = "hashicorp/template" 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.13" 3 | required_providers { 4 | aws = { 5 | source = "hashicorp/aws" 6 | } 7 | template = { 8 | source = "hashicorp/template" 9 | } 10 | } 11 | } 12 | 13 | provider "template" { 14 | version = ">= 2" 15 | } 16 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.13" 3 | required_providers { 4 | aws = { 5 | source = "hashicorp/aws" 6 | } 7 | template = { 8 | source = "hashicorp/template" 9 | } 10 | } 11 | } 12 | 13 | provider "template" { 14 | version = ">= 2" 15 | } 16 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/variables.tf: -------------------------------------------------------------------------------- 1 | variable "environment" {} 2 | variable "prometheus_config_bucket" {} 3 | variable "alerts_path" {} 4 | variable "private_zone_id" {} 5 | 6 | variable "prom_private_ips" { 7 | type = list(string) 8 | } 9 | 10 | variable "extra_scrape_configs" { 11 | default = [] 12 | description = "List of scrape configs to append to the Prometheus config" 13 | } 14 | -------------------------------------------------------------------------------- /terraform/projects/alertmanager-staging/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | pass = { 8 | source = "camptocamp/pass" 9 | version = "1.4.0" 10 | } 11 | template = { 12 | source = "hashicorp/template" 13 | version = "2.2.0" 14 | } 15 | } 16 | required_version = ">= 0.13" 17 | } 18 | -------------------------------------------------------------------------------- /terraform/projects/alertmanager-production/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 
4 | source = "hashicorp/aws" 5 | version = "3.15" 6 | } 7 | pass = { 8 | source = "camptocamp/pass" 9 | version = "1.4.0" 10 | } 11 | template = { 12 | source = "hashicorp/template" 13 | version = "2.2.0" 14 | } 15 | } 16 | required_version = ">= 0.13" 17 | } 18 | -------------------------------------------------------------------------------- /tools/terraform-format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eu 3 | 4 | for file in "$@"; do 5 | lint=$(terraform fmt -write=false -diff=true -list=true "${file}") 6 | failed="" 7 | 8 | if [ ! -z "${lint}" ]; then 9 | failed="yes" 10 | echo -e "Your code is not in a canonical format:\n" 11 | echo "${lint}" 12 | echo -e "To apply these changes do 'terraform fmt ${file}'\n" 13 | fi 14 | 15 | if [ "$failed" == "yes" ];then 16 | exit 1 17 | fi 18 | done 19 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Prometheus EC2 module 2 | 3 | There are two modules 4 | 5 | - `prometheus`, which deploys prometheus to the target network. 6 | - `paas-config`, which contains configuration specific to our 7 | prometheus-for-paas deployment 8 | 9 | We deploy using raw Terraform commands, scoped per environment. 10 | 11 | ## Deploying 12 | 13 | To deploy (for example to staging): 14 | 15 | ```shell 16 | cd terraform/projects/prom-ec2/paas-staging/prometheus 17 | gds aws re-prom-staging -- terraform plan 18 | ``` 19 | -------------------------------------------------------------------------------- /ci/images/task/assume-role: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu 4 | 5 | arn="$1" 6 | creds="$(aws \ 7 | sts assume-role \ 8 | --role-arn="$arn" \ 9 | --role-session-name="deploy-concourse-$(date +%s)" \ 10 | --duration 1800 \ 11 | )" 12 | 13 | access_key="$(echo "$creds" | jq -r ".Credentials.AccessKeyId")" 14 | secret_key="$(echo "$creds" | jq -r ".Credentials.SecretAccessKey")" 15 | session_token="$(echo "$creds" | jq -r ".Credentials.SessionToken")" 16 | 17 | echo "export AWS_ACCESS_KEY_ID=\"$access_key\"" 18 | echo "export AWS_SECRET_ACCESS_KEY=\"$secret_key\"" 19 | echo "export AWS_SESSION_TOKEN=\"$session_token\"" 20 | echo "export AWS_DEFAULT_REGION=\"eu-west-1\"" 21 | 22 | -------------------------------------------------------------------------------- /terraform/modules/common/ami/main.tf: -------------------------------------------------------------------------------- 1 | ## Variables 2 | 3 | locals { 4 | canonical_account_id = "099720109477" 5 | } 6 | 7 | ## Data sources 8 | 9 | data "aws_ami" "ubuntu_focal" { 10 | most_recent = true 11 | 12 | filter { 13 | name = "name" 14 | values = ["ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*"] 15 | } 16 | 17 | filter { 18 | name = "architecture" 19 | values = ["x86_64"] 20 | } 21 | 22 | filter { 23 | name = "virtualization-type" 24 | values = ["hvm"] 25 | } 26 | 27 | owners = [local.canonical_account_id] 28 | } 29 | 30 | ## Outputs 31 | 32 | output "ubuntu_focal_ami_id" { 33 | value = data.aws_ami.ubuntu_focal.id 34 | } 35 | 36 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/output.tf: -------------------------------------------------------------------------------- 1 | output "public_ip_address" { 2 | value = 
aws_instance.prometheus.*.public_ip 3 | } 4 | 5 | output "private_ip_addresses" { 6 | value = aws_instance.prometheus.*.private_ip 7 | } 8 | 9 | output "prometheus_instance_id" { 10 | value = aws_instance.prometheus.*.id 11 | } 12 | 13 | output "prometheus_private_dns" { 14 | value = aws_instance.prometheus.*.private_dns 15 | } 16 | 17 | output "prometheus_public_dns" { 18 | value = aws_instance.prometheus.*.public_dns 19 | } 20 | 21 | output "s3_config_bucket" { 22 | value = aws_s3_bucket.prometheus_config.id 23 | } 24 | 25 | output "ec2_instance_profile_name" { 26 | value = aws_iam_instance_profile.prometheus_instance_profile.name 27 | } 28 | 29 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/service_discovery.tf: -------------------------------------------------------------------------------- 1 | resource "aws_service_discovery_private_dns_namespace" "observe" { 2 | name = "local.gds-reliability.engineering" 3 | description = "Observe instances" 4 | vpc = local.vpc_id 5 | } 6 | 7 | resource "aws_service_discovery_service" "alertmanager" { 8 | name = "alertmanager" 9 | 10 | description = "A service to allow alertmanager peers to discover each other" 11 | 12 | dns_config { 13 | namespace_id = aws_service_discovery_private_dns_namespace.observe.id 14 | 15 | dns_records { 16 | ttl = 10 17 | type = "A" 18 | } 19 | 20 | routing_policy = "MULTIVALUE" 21 | } 22 | 23 | health_check_custom_config { 24 | failure_threshold = 2 25 | } 26 | } 27 | 28 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/templates/default.tmpl: -------------------------------------------------------------------------------- 1 | {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} 2 | 3 | {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} 4 | 5 | {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} 6 | {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} 7 | {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }} 8 | {{ define "slack.default.footer" }}{{ end }} 9 | -------------------------------------------------------------------------------- /ci/tasks/wait-ecs-services-stable.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | inputs: 3 | - name: terraform-outputs 4 | params: 5 | DEPLOYER_ARN: 6 | TERRAFORM_VAR: 7 | AWS_REGION: 'eu-west-1' 8 | AWS_DEFAULT_REGION: 'eu-west-1' 9 | run: 10 | path: bash 11 | args: 12 | - -eu 13 | - -c 14 | - | 15 | echo "configuring aws client..." 16 | eval $(assume-role "${DEPLOYER_ARN}") 17 | 18 | jq -c '.[env.TERRAFORM_VAR].value | to_entries | .[]' terraform-outputs/terraform-outputs.json | while read entry ; do 19 | CLUSTER="$(echo ${entry} | jq -r '.key')" 20 | SERVICES="$(echo ${entry} | jq -r '.value | join(" ")')" 21 | 22 | echo "Waiting for services ${SERVICES} of cluster ${CLUSTER} to be stable..." 
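# 'aws ecs wait services-stable' polls ECS until each listed service reaches a
# steady state (running count matches desired count and deployments have settled),
# and exits non-zero if that does not happen before the wait times out.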
23 | 24 | aws ecs wait services-stable \ 25 | --cluster "${CLUSTER}" \ 26 | --services ${SERVICES} 27 | done 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | env: 2 | global: 3 | - TERRAFORM_VERSION=0.13.3 4 | - TERRAFORM_FILE_NAME=terraform_${TERRAFORM_VERSION}_linux_amd64.zip 5 | - TERRAFORM_DOWNLOAD_URL=https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/${TERRAFORM_FILE_NAME} 6 | - PROMETHEUS_VERSION=2.3.2 7 | - PROMETHEUS_FILE_NAME=prometheus-${PROMETHEUS_VERSION}.linux-amd64 8 | - PROMETHEUS_TAR_FILE_NAME=${PROMETHEUS_FILE_NAME}.tar.gz 9 | - PROMETHEUS_DOWNLOAD_URL=https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/${PROMETHEUS_TAR_FILE_NAME} 10 | install: 11 | - wget ${TERRAFORM_DOWNLOAD_URL} 12 | - unzip -o ${TERRAFORM_FILE_NAME} -d /tmp 13 | - export PATH=/tmp:${PATH} 14 | - wget ${PROMETHEUS_DOWNLOAD_URL} 15 | - tar -xvzf ${PROMETHEUS_TAR_FILE_NAME} -C /tmp 16 | - export PATH=/tmp/${PROMETHEUS_FILE_NAME}:${PATH} 17 | 18 | script: 19 | - find . -name '*.tf' | xargs tools/terraform-format.sh 20 | - tools/check-alerting-rules.sh 21 | notifications: 22 | email: false 23 | -------------------------------------------------------------------------------- /ci/tasks/deploy-project.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | inputs: 3 | - name: src 4 | - name: re-secrets 5 | outputs: 6 | - name: outputs 7 | params: 8 | PROJECT: 9 | DEPLOYER_ARN: 10 | GPG_PRIVATE_KEY: 11 | AWS_REGION: 'eu-west-1' 12 | AWS_DEFAULT_REGION: 'eu-west-1' 13 | PASSWORD_STORE_DIR: "re-secrets/observe" 14 | run: 15 | path: bash 16 | args: 17 | - -eu 18 | - -c 19 | - | 20 | BUILD_DIR=$(pwd) 21 | 22 | echo "configuring aws client..." 23 | eval $(assume-role "${DEPLOYER_ARN}") 24 | 25 | echo "configuring re-secrets store..." 26 | echo "${GPG_PRIVATE_KEY}" | gpg --import 27 | mkdir -p $HOME/.password-store 28 | cp -R re-secrets $HOME/.password-store 29 | 30 | echo "terraforming..." 
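# Each project directory configures its own S3 remote state backend, so init
# wires up state and providers, apply runs non-interactively, and the outputs
# are captured as JSON for later pipeline tasks to consume.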
31 | pushd "src/terraform/projects/${PROJECT}" 32 | terraform init 33 | terraform apply -auto-approve 34 | terraform output -json > $BUILD_DIR/outputs/terraform-outputs.json 35 | popd 36 | -------------------------------------------------------------------------------- /terraform/projects/infra-security-groups-staging/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 0.13.3" 3 | 4 | backend "s3" { 5 | bucket = "prometheus-staging" 6 | key = "infra-security-groups-modular.tfstate" 7 | region = "eu-west-1" 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = var.aws_region 13 | } 14 | 15 | variable "aws_region" { 16 | type = string 17 | description = "AWS region" 18 | default = "eu-west-1" 19 | } 20 | 21 | module "infra-security-groups" { 22 | source = "../../modules/infra-security-groups/" 23 | 24 | aws_region = var.aws_region 25 | environment = "staging" 26 | remote_state_bucket = "prometheus-staging" 27 | } 28 | 29 | ## Outputs 30 | 31 | output "prometheus_ec2_sg_id" { 32 | value = module.infra-security-groups.prometheus_ec2_sg_id 33 | description = "security group prometheus_ec2 ID" 34 | } 35 | 36 | output "prometheus_alb_sg_id" { 37 | value = module.infra-security-groups.prometheus_alb_sg_id 38 | description = "security group prometheus_alb ID" 39 | } 40 | -------------------------------------------------------------------------------- /terraform/projects/infra-security-groups-production/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 0.13.3" 3 | 4 | backend "s3" { 5 | bucket = "prometheus-production" 6 | key = "infra-security-groups-modular.tfstate" 7 | region = "eu-west-1" 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = var.aws_region 13 | } 14 | 15 | variable "aws_region" { 16 | type = string 17 | description = "AWS region" 18 | default = "eu-west-1" 19 | } 20 | 21 | module "infra-security-groups" { 22 | source = "../../modules/infra-security-groups/" 23 | 24 | aws_region = var.aws_region 25 | environment = "production" 26 | remote_state_bucket = "prometheus-production" 27 | } 28 | 29 | ## Outputs 30 | 31 | output "prometheus_ec2_sg_id" { 32 | value = module.infra-security-groups.prometheus_ec2_sg_id 33 | description = "security group prometheus_ec2 ID" 34 | } 35 | 36 | output "prometheus_alb_sg_id" { 37 | value = module.infra-security-groups.prometheus_alb_sg_id 38 | description = "security group prometheus_alb ID" 39 | } 40 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Crown Copyright (Government Digital Service) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /ci/tasks/http-ping.yml: -------------------------------------------------------------------------------- 1 | platform: linux 2 | image_resource: 3 | type: docker-image 4 | source: 5 | repository: governmentpaas/curl-ssl 6 | tag: fe3e384e81ccb50842509d7237e3828b293de694 7 | inputs: 8 | - name: response-jq-test 9 | optional: true 10 | params: 11 | URL: 12 | run: 13 | path: sh 14 | args: 15 | - -euxc 16 | - | 17 | DOMAIN=$(echo "${URL}" | awk -F/ '{print $3}') 18 | getent ahosts ${DOMAIN} | cut -d ' ' -f1 | sort | uniq | tee /dev/stderr | while read TARGET_IP ; do 19 | curl \ 20 | --resolve ${DOMAIN}:443:${TARGET_IP} \ 21 | --silent \ 22 | --fail \ 23 | --write-out "${TARGET_IP} %{http_code} %{time_total}s"$'\n' \ 24 | --output curl_output \ 25 | --max-time 5 "${URL}" 26 | 27 | if [[ -e response-jq-test/test.jq ]] ; then 28 | if ! jq -e -f response-jq-test/test.jq curl_output ; then 29 | echo 'Response:' 30 | cat curl_output 31 | echo 'Failed jq test:' 32 | cat response-jq-test/test.jq 33 | # don't spin through attempts too fast 34 | sleep 5 35 | exit 9 36 | fi 37 | fi 38 | done 39 | 40 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/notify-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: GOVUK_Notify 3 | rules: 4 | - alert: GOVUK_Notify_Disk_75_percent_full 5 | expr: max(disk_utilization{space="production", organisation="govuk-notify"}) by (app, space) > 75 6 | for: 5m 7 | labels: 8 | product: "notify" 9 | severity: "ticket" 10 | annotations: 11 | message: "{{ $labels.space }}: disk usage for {{ $labels.app }} is over 75% full. You should redeploy the app to avoid running out of disk space" 12 | grafana: "https://grafana-paas.cloudapps.digital/d/_GlGBNbmk/notify-apps?orgId=2&var-space=production&var-app={{ $labels.app }}" 13 | - alert: GOVUK_Notify_Disk_95_percent_full 14 | expr: max(disk_utilization{space="production", organisation="govuk-notify", app!~"(.*conduit.*)|(.*exporter)"}) by (app, space) > 95 15 | for: 5m 16 | labels: 17 | product: "notify" 18 | severity: "p2" 19 | annotations: 20 | summary: "{{ $labels.space }}: disk usage for {{ $labels.app }} is over 95% full. 
You should redeploy the app to avoid running out of disk space" 21 | grafana: "https://grafana-paas.cloudapps.digital/d/_GlGBNbmk/notify-apps?orgId=2&var-space=production&var-app={{ $labels.app }}" 22 | -------------------------------------------------------------------------------- /tools/grafana_info/show_queries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from grafana_api.grafana_api import GrafanaAPI 4 | from bearer_auth import BearerAuth 5 | import os, sys 6 | 7 | 8 | def exprs_for_dashboard(dashboard): 9 | d = g.get('/dashboards/uid/%s' % dashboard['uid']) 10 | if 'panels' in d['dashboard']: 11 | panels = d['dashboard']['panels'] 12 | for panel in panels: 13 | targets = panel.get('targets',[]) 14 | for target in targets: 15 | if 'expr' in target: 16 | yield (target['expr'], dashboard['title'], panel['title']) 17 | else: 18 | print('***** no panels {}'.format(dashboard['title'])) 19 | 20 | 21 | if __name__ == "__main__": 22 | try: 23 | token = os.environ['GRAFANA_TOKEN'] 24 | g = GrafanaAPI(BearerAuth(token), 'grafana-paas.cloudapps.digital', protocol='https') 25 | dashboards = g.get('/search?type=dash-db') 26 | exprs = [expr for dashboard in dashboards for expr in exprs_for_dashboard(dashboard)] 27 | exprs.sort() 28 | for expr in exprs: 29 | print(expr) 30 | except KeyError as e: 31 | print('Please set the %s environment variable' % e.args[0], file=sys.stderr) 32 | exit(1) 33 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/task-definitions/alertmanager.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "alertmanager", 4 | "image": "prom/alertmanager", 5 | "memoryReservation": 512, 6 | "essential": true, 7 | "portMappings": [ 8 | { 9 | "containerPort": 9093, 10 | "hostPort": 9093 11 | }, 12 | { 13 | "containerPort": 9094, 14 | "hostPort": 9094 15 | } 16 | ], 17 | "environment": [ 18 | { 19 | "Name": "ALERTMANAGER_CONFIG", 20 | "Value": "${alertmanager_config_base64}" 21 | }, 22 | { 23 | "Name": "TEMPLATES", 24 | "Value": "${templates_base64}" 25 | } 26 | ], 27 | "entryPoint": [ 28 | "/bin/sh", 29 | "-c", 30 | "echo \"$ALERTMANAGER_CONFIG\" | base64 -d > /etc/alertmanager/alertmanager.yml; echo \"$TEMPLATES\" | base64 -d > /etc/alertmanager/default.tmpl; /bin/alertmanager --config.file=/etc/alertmanager/alertmanager.yml --cluster.peer=alertmanager.local.gds-reliability.engineering:9094 ${alertmanager_url}" 31 | ], 32 | "logConfiguration": { 33 | "logDriver": "awslogs", 34 | "options": { 35 | "awslogs-group": "${log_group}", 36 | "awslogs-region": "${region}", 37 | "awslogs-stream-prefix": "alertmanager" 38 | } 39 | } 40 | } 41 | ] 42 | -------------------------------------------------------------------------------- /ci/images/task/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | ENV TF_VERSION 0.13.3 4 | ENV TF_ZIP_SHA256 35c662be9d32d38815cde5fa4c9fa61a3b7f39952ecd50ebf92fd1b2ddd6109b 5 | 6 | LABEL ubuntu="20.04" 7 | LABEL terraform="$TF_VERSION" 8 | 9 | ENV TZ=Europe/London 10 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 11 | 12 | RUN apt-get update --yes && \ 13 | apt-get install --yes --no-install-recommends \ 14 | ca-certificates \ 15 | awscli \ 16 | jq \ 17 | curl \ 18 | dnsutils \ 19 | unzip \ 20 | gpg \ 21 | gpg-agent \ 22 | golang \ 23 | git 24 | 25 | WORKDIR /tmp 26 | 
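# Download the pinned Terraform release and verify it against the expected
# SHA-256 checksum before installing the binary into /usr/bin.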
27 | RUN curl https://releases.hashicorp.com/terraform/${TF_VERSION}/terraform_${TF_VERSION}_linux_amd64.zip > terraform.zip && \ 28 | echo "${TF_ZIP_SHA256} terraform.zip" > terraform.sha && \ 29 | sha256sum -c terraform.sha && unzip terraform.zip && mv terraform /usr/bin/terraform && \ 30 | rm terraform.zip && rm terraform.sha 31 | 32 | RUN GO111MODULE=on go get -v github.com/camptocamp/terraform-provider-pass && \ 33 | mkdir -p ~/.terraform.d/plugins/linux_amd64 && \ 34 | mv ~/go/bin/terraform-provider-pass ~/.terraform.d/plugins/linux_amd64/ 35 | 36 | # prom-ec2 terraform expects a pub ssh key even if it doesn't use it 37 | RUN mkdir -p $HOME/.ssh/ && touch $HOME/.ssh/id_rsa.pub 38 | 39 | COPY assume-role /usr/bin/assume-role 40 | 41 | ENTRYPOINT ["bash"] 42 | -------------------------------------------------------------------------------- /terraform/projects/alertmanager-staging/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## Project: alertmanager 3 | * 4 | * Create services and task definitions for the ECS cluster 5 | * 6 | */ 7 | 8 | variable "aws_region" { 9 | type = string 10 | description = "AWS region" 11 | default = "eu-west-1" 12 | } 13 | 14 | data "pass_password" "cronitor_staging_url" { 15 | path = "cronitor/cronitor-staging-url" 16 | } 17 | 18 | # Resources 19 | # -------------------------------------------------------------- 20 | 21 | ## Providers 22 | 23 | terraform { 24 | required_version = "~> 0.13.3" 25 | 26 | backend "s3" { 27 | bucket = "prometheus-staging" 28 | key = "app-ecs-services-modular.tfstate" 29 | region = "eu-west-1" 30 | } 31 | } 32 | 33 | provider "aws" { 34 | region = var.aws_region 35 | } 36 | 37 | provider "pass" { 38 | store_dir = "~/.password-store/re-secrets/observe" 39 | refresh_store = true 40 | } 41 | 42 | variable "remote_state_bucket" { 43 | type = string 44 | description = "S3 bucket we store our terraform state in" 45 | default = "prometheus-staging" 46 | } 47 | 48 | module "alertmanager" { 49 | source = "../../modules/alertmanager" 50 | 51 | remote_state_bucket = var.remote_state_bucket 52 | environment = "staging" 53 | observe_cronitor = data.pass_password.cronitor_staging_url.password 54 | } 55 | 56 | output "alertmanager_ecs_clusters_services" { 57 | value = module.alertmanager.ecs_clusters_services 58 | } 59 | -------------------------------------------------------------------------------- /terraform/projects/app-ecs-albs-staging/main.tf: -------------------------------------------------------------------------------- 1 | ## Providers 2 | 3 | terraform { 4 | required_version = "~> 0.13.3" 5 | 6 | backend "s3" { 7 | bucket = "prometheus-staging" 8 | key = "app-ecs-albs-modular.tfstate" 9 | region = "eu-west-1" 10 | } 11 | } 12 | 13 | provider "aws" { 14 | region = var.aws_region 15 | } 16 | 17 | variable "aws_region" { 18 | type = string 19 | description = "AWS region" 20 | default = "eu-west-1" 21 | } 22 | 23 | variable "remote_state_bucket" { 24 | type = string 25 | description = "S3 bucket we store our terraform state in" 26 | default = "prometheus-staging" 27 | } 28 | 29 | data "terraform_remote_state" "infra_networking" { 30 | backend = "s3" 31 | 32 | config = { 33 | bucket = var.remote_state_bucket 34 | key = "infra-networking-modular.tfstate" 35 | region = var.aws_region 36 | } 37 | } 38 | 39 | module "app-ecs-albs" { 40 | source = "../../modules/app-ecs-albs/" 41 | 42 | aws_region = var.aws_region 43 | environment = "staging" 44 | remote_state_bucket = 
var.remote_state_bucket 45 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 46 | subnets = data.terraform_remote_state.infra_networking.outputs.public_subnets 47 | } 48 | 49 | output "prom_public_record_fqdns" { 50 | value = module.app-ecs-albs.prom_public_record_fqdns 51 | description = "Prometheus public DNS FQDNs" 52 | } 53 | 54 | output "prometheus_target_group_arns" { 55 | value = module.app-ecs-albs.prometheus_target_group_ids 56 | } 57 | -------------------------------------------------------------------------------- /terraform/projects/app-ecs-albs-production/main.tf: -------------------------------------------------------------------------------- 1 | ## Providers 2 | 3 | terraform { 4 | required_version = "~> 0.13.3" 5 | 6 | backend "s3" { 7 | bucket = "prometheus-production" 8 | key = "app-ecs-albs-modular.tfstate" 9 | region = "eu-west-1" 10 | } 11 | } 12 | 13 | provider "aws" { 14 | region = var.aws_region 15 | } 16 | 17 | variable "aws_region" { 18 | type = string 19 | description = "AWS region" 20 | default = "eu-west-1" 21 | } 22 | 23 | variable "remote_state_bucket" { 24 | type = string 25 | description = "S3 bucket we store our terraform state in" 26 | default = "prometheus-production" 27 | } 28 | 29 | data "terraform_remote_state" "infra_networking" { 30 | backend = "s3" 31 | 32 | config = { 33 | bucket = var.remote_state_bucket 34 | key = "infra-networking-modular.tfstate" 35 | region = var.aws_region 36 | } 37 | } 38 | 39 | module "app-ecs-albs" { 40 | source = "../../modules/app-ecs-albs/" 41 | 42 | aws_region = var.aws_region 43 | environment = "production" 44 | remote_state_bucket = var.remote_state_bucket 45 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 46 | subnets = data.terraform_remote_state.infra_networking.outputs.public_subnets 47 | } 48 | 49 | output "prom_public_record_fqdns" { 50 | value = module.app-ecs-albs.prom_public_record_fqdns 51 | description = "Prometheus public DNS FQDNs" 52 | } 53 | 54 | output "prometheus_target_group_arns" { 55 | value = module.app-ecs-albs.prometheus_target_group_ids 56 | } 57 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/certificate.tf: -------------------------------------------------------------------------------- 1 | # AWS should manage the certificate renewal automatically 2 | # https://docs.aws.amazon.com/acm/latest/userguide/managed-renewal.html 3 | # If this fails, AWS will email associated with the AWS account 4 | resource "aws_acm_certificate" "alertmanager_cert" { 5 | domain_name = "alerts.${data.terraform_remote_state.infra_networking.outputs.public_subdomain}" 6 | validation_method = "DNS" 7 | 8 | subject_alternative_names = formatlist("alerts-%s.${data.terraform_remote_state.infra_networking.outputs.public_subdomain}", data.aws_availability_zones.available.names) 9 | 10 | lifecycle { 11 | # We can't destroy a certificate that's in use, and we can't stop 12 | # using it until the new one is ready. Hence 13 | # create_before_destroy here. 
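# With create_before_destroy, Terraform provisions the replacement certificate
# and its validation records before removing the old one, so the listener is
# never left without a valid certificate.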
14 | create_before_destroy = true 15 | } 16 | } 17 | 18 | resource "aws_route53_record" "alertmanager_cert_validation" { 19 | for_each = { 20 | for dvo in aws_acm_certificate.alertmanager_cert.domain_validation_options : dvo.domain_name => { 21 | name = dvo.resource_record_name 22 | record = dvo.resource_record_value 23 | type = dvo.resource_record_type 24 | } 25 | } 26 | 27 | name = each.value.name 28 | records = [each.value.record] 29 | type = each.value.type 30 | zone_id = local.zone_id 31 | ttl = 60 32 | 33 | allow_overwrite = true 34 | 35 | depends_on = [aws_acm_certificate.alertmanager_cert] 36 | } 37 | 38 | resource "aws_acm_certificate_validation" "alertmanager_cert" { 39 | certificate_arn = aws_acm_certificate.alertmanager_cert.arn 40 | validation_record_fqdns = [for record in aws_route53_record.alertmanager_cert_validation : record.fqdn] 41 | } 42 | 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **This repo is no longer in use and has been archived** 2 | 3 | # Prometheus configuration on AWS # 4 | 5 | Terraform configuration to manage a Prometheus server running on AWS. 6 | 7 | ## Setup ## 8 | 9 | ### Install dependencies 10 | 11 | brew bundle 12 | tfenv install # this will pick up the version from .terraform-version 13 | 14 | ### Allow access to secrets 15 | 16 | You will need to clone the re-secrets repo into `~/.password-store/re-secrets`: 17 | 18 | git clone git@github.com:alphagov/re-secrets.git ~/.password-store/re-secrets 19 | 20 | ## Deploying Terraform 21 | 22 | ```shell 23 | cd terraform/projects/PROJECT-ENV/ 24 | gds aws re-prom-ENV -- terraform init 25 | gds aws re-prom-ENV -- terraform plan 26 | gds aws re-prom-ENV -- terraform apply 27 | ``` 28 | 29 | eg 30 | 31 | ```shell 32 | cd terraform/projects/app-ecs-albs-staging 33 | gds aws re-prom-staging -- terraform plan 34 | ``` 35 | 36 | ### Deploy EC2 Prometheus with zero downtime 37 | 38 | To avoid all three instances being respun at the same time, you can do one instance at a time using: 39 | 40 | ``` 41 | gds aws re-prom-ENV -- terraform apply -target=module.paas-config.aws_route53_record.prom_ec2_a_record[i] -target=module.prometheus.aws_volume_attachment.attach-prometheus-disk[i] -target=module.prometheus.aws_instance.prometheus[i] -target=module.prometheus.aws_lb_target_group_attachment.prom_target_group_attachment[i] 42 | ``` 43 | 44 | where `i` is `0`, `1` or `2`. 45 | 46 | ## EC2 Prometheus 47 | 48 | Prometheis are not deployed on Amazon ECS and are instead deployed using the prom-ec2 modules onto EC2 instances. For details of how to develop and deploy them see the [terraform/modules/prom-ec2 README](terraform/modules/prom-ec2). 49 | 50 | ## ECS 51 | 52 | Alertmanager and NGINX are deployed on Amazon ECS Fargate. 53 | 54 | ## License 55 | [MIT License](LICENCE) 56 | -------------------------------------------------------------------------------- /tools/grafana_info/README.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | To run these python apps: 4 | 5 | - [create an api key](https://grafana-paas.cloudapps.digital/org/apikeys) with "Viewer" capability 6 | - set it in the GRAFANA_TOKEN environment variable 7 | - create a virtualenv if you want 8 | - run `pip install -r requirements.txt` 9 | 10 | # show_queries.py 11 | 12 | This script scrapes all the PromQL queries from Grafana and shows 13 | them, in sorted order.
14 | 15 | To run it: 16 | 17 | - run `./show_queries.py` 18 | 19 | This directory has a `.python-version` file to be used by 20 | [pyenv](https://github.com/pyenv/pyenv). 21 | 22 | # find_missing_metrics.py 23 | 24 | This script attempts to find metrics that are used in Grafana and the alerts files but are missing from a Prometheus server running an older version. 25 | It shows the expressions used, then the result of an API call to an older Prometheus server (either EC2 or one deployed locally) for each extracted metric (the boolean in the result indicates whether any data points were returned for that metric), and finally reports any metrics that have data points in the latest Prometheus server but none in the older one. 26 | 27 | NB - keywords for PromQL operators used in the Grafana expressions are ignored; other wrongly identified metrics should be added to the `IGNORE_WORDS` list found at the top of the `find_missing_metrics.py` file. 28 | 29 | To run it: 30 | 31 | - set the `OLD_PROM_SERVER` environment variable to an EC2 staging prometheus server, or a locally deployed older prometheus version 32 | - set the `NEW_PROM_SERVER` environment variable to an ECS staging prometheus server, or a locally running instance of the latest Prometheus 33 | - set the `ALERTS_DIR` environment variable to the location of the alerts yml files relative to where you will be executing the python script 34 | - run `./find_missing_metrics.py` 35 | -------------------------------------------------------------------------------- /terraform/projects/infra-networking-production/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 0.13.3" 3 | 4 | backend "s3" { 5 | bucket = "prometheus-production" 6 | key = "infra-networking-modular.tfstate" 7 | region = "eu-west-1" 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = var.aws_region 13 | } 14 | 15 | variable "prometheus_subdomain" { 16 | type = string 17 | description = "Subdomain for prometheus" 18 | default = "monitoring" 19 | } 20 | 21 | variable "aws_region" { 22 | type = string 23 | description = "The AWS region to use."
24 | default = "eu-west-1" 25 | } 26 | 27 | module "infra-networking" { 28 | source = "../../modules/infra-networking" 29 | 30 | environment = "production" 31 | prometheus_subdomain = var.prometheus_subdomain 32 | } 33 | 34 | output "vpc_id" { 35 | value = module.infra-networking.vpc_id 36 | description = "VPC ID where the stack resources are created" 37 | } 38 | 39 | output "private_subnets" { 40 | value = module.infra-networking.private_subnets 41 | description = "List of private subnet IDs" 42 | } 43 | 44 | output "public_subnets" { 45 | value = module.infra-networking.public_subnets 46 | description = "List of public subnet IDs" 47 | } 48 | 49 | output "public_zone_id" { 50 | value = module.infra-networking.public_zone_id 51 | description = "Route 53 Zone ID for publicly visible zone" 52 | } 53 | 54 | output "public_subdomain" { 55 | value = module.infra-networking.public_subdomain 56 | description = "This is the subdomain for root zone" 57 | } 58 | 59 | output "private_zone_id" { 60 | value = module.infra-networking.private_zone_id 61 | description = "Route 53 Zone ID for the internal zone" 62 | } 63 | 64 | output "private_subdomain" { 65 | value = module.infra-networking.private_subdomain 66 | description = "This is the subdomain for private zone" 67 | } 68 | 69 | output "subnets_by_az" { 70 | value = module.infra-networking.subnets_by_az 71 | description = "Map of availability zones to private subnets" 72 | } 73 | 74 | -------------------------------------------------------------------------------- /terraform/projects/infra-networking-staging/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 0.13.3" 3 | 4 | backend "s3" { 5 | bucket = "prometheus-staging" 6 | key = "infra-networking-modular.tfstate" 7 | region = "eu-west-1" 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = var.aws_region 13 | } 14 | 15 | variable "aws_region" { 16 | type = string 17 | description = "The AWS region to use." 
18 | default = "eu-west-1" 19 | } 20 | 21 | variable "prometheus_subdomain" { 22 | type = string 23 | description = "Subdomain for prometheus" 24 | default = "monitoring-staging" 25 | } 26 | 27 | module "infra-networking" { 28 | source = "../../modules/infra-networking" 29 | 30 | environment = "staging" 31 | prometheus_subdomain = var.prometheus_subdomain 32 | } 33 | 34 | output "vpc_id" { 35 | value = module.infra-networking.vpc_id 36 | description = "VPC ID where the stack resources are created" 37 | } 38 | 39 | output "private_subnets" { 40 | value = module.infra-networking.private_subnets 41 | description = "List of private subnet IDs" 42 | } 43 | 44 | output "public_subnets" { 45 | value = module.infra-networking.public_subnets 46 | description = "List of public subnet IDs" 47 | } 48 | 49 | output "public_zone_id" { 50 | value = module.infra-networking.public_zone_id 51 | description = "Route 53 Zone ID for publicly visible zone" 52 | } 53 | 54 | output "public_subdomain" { 55 | value = module.infra-networking.public_subdomain 56 | description = "This is the subdomain for root zone" 57 | } 58 | 59 | output "private_zone_id" { 60 | value = module.infra-networking.private_zone_id 61 | description = "Route 53 Zone ID for the internal zone" 62 | } 63 | 64 | output "private_subdomain" { 65 | value = module.infra-networking.private_subdomain 66 | description = "This is the subdomain for private zone" 67 | } 68 | 69 | output "subnets_by_az" { 70 | value = module.infra-networking.subnets_by_az 71 | description = "Map of availability zones to private subnets" 72 | } 73 | 74 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/variables.tf: -------------------------------------------------------------------------------- 1 | variable "ami_id" {} 2 | 3 | variable "device_mount_path" { 4 | description = "The path to mount the prometheus disk" 5 | default = "/dev/sdh" 6 | } 7 | 8 | variable "data_volume_size" { 9 | description = "The size of the volume that will contain the prometheus data" 10 | default = 250 11 | } 12 | 13 | variable "availability_zones" { 14 | description = "A map of availability zones to subnets" 15 | 16 | type = map(string) 17 | default = {} 18 | } 19 | 20 | variable "subnet_ids" { 21 | type = list(string) 22 | } 23 | 24 | variable "instance_size" { 25 | type = string 26 | description = "This is the default instance size" 27 | default = "m5.large" 28 | } 29 | 30 | variable "target_vpc" { 31 | description = "The VPC in which the system will be deployed" 32 | } 33 | 34 | variable "environment" {} 35 | 36 | variable "vpc_security_groups" { 37 | type = list(string) 38 | default = [] 39 | description = "Security groups to attach to the prometheus instances" 40 | } 41 | 42 | variable "enable_ssh" { 43 | default = false 44 | } 45 | 46 | variable "region" { 47 | default = "eu-west-1" 48 | } 49 | 50 | variable "allowed_cidrs" { 51 | type = list(string) 52 | description = "List of CIDRs which are able to access the prometheus instance, default are GDS ips" 53 | 54 | default = [ 55 | "213.86.153.211/32", 56 | "213.86.153.212/32", 57 | "213.86.153.213/32", 58 | "213.86.153.214/32", 59 | "213.86.153.231/32", 60 | "213.86.153.235/32", 61 | "213.86.153.236/32", 62 | "213.86.153.237/32", 63 | "85.133.67.244/32", 64 | "35.177.37.128/32", 65 | "35.176.252.164/32", 66 | "51.149.9.112/29", # CO 67 | "51.149.9.240/29", # CO 68 | ] 69 | } 70 | 71 | variable "config_bucket" {} 72 | 73 | variable "prometheus_public_fqdns" { 74 | type 
= list(string) 75 | } 76 | 77 | variable "logstash_host" { 78 | default = "" 79 | } 80 | 81 | variable "prometheus_htpasswd" { 82 | default = "" 83 | description = "Contents of basic auth .htpasswd file for NGINX to allow access from Grafana" 84 | } 85 | 86 | variable "prometheus_target_group_arns" { 87 | type = list(string) 88 | default = [] 89 | } 90 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/iam.tf: -------------------------------------------------------------------------------- 1 | #Prepare to attach role to instance 2 | resource "aws_iam_instance_profile" "prometheus_instance_profile" { 3 | name = "prometheus_${var.environment}_config_reader_profile" 4 | role = aws_iam_role.prometheus_role.name 5 | } 6 | 7 | #Create role 8 | resource "aws_iam_role" "prometheus_role" { 9 | name = "prometheus_profile_${var.environment}" 10 | 11 | assume_role_policy = data.aws_iam_policy_document.prometheus_assume_role_policy.json 12 | 13 | tags = merge(local.default_tags, { 14 | Name = "${var.environment}-prometheus" 15 | }) 16 | } 17 | 18 | #Create permission to assume role 19 | data "aws_iam_policy_document" "prometheus_assume_role_policy" { 20 | statement { 21 | actions = ["sts:AssumeRole"] 22 | 23 | principals { 24 | type = "Service" 25 | identifiers = ["ec2.amazonaws.com"] 26 | } 27 | } 28 | } 29 | 30 | #Define the policy to attach the role too 31 | resource "aws_iam_policy" "prometheus_instance_profile" { 32 | name = "prometheus_instance_profile_${var.environment}" 33 | path = "/" 34 | description = "This is the main profile, that has bucket permission and decribe permissions" 35 | 36 | policy = data.aws_iam_policy_document.instance_role_policy.json 37 | } 38 | 39 | #define IAM policy documention 40 | data "aws_iam_policy_document" "instance_role_policy" { 41 | statement { 42 | sid = "ec2Policy" 43 | actions = ["ec2:Describe*"] 44 | resources = ["*"] 45 | } 46 | 47 | statement { 48 | sid = "s3Bucket" 49 | 50 | actions = [ 51 | "s3:Get*", 52 | "s3:ListBucket", 53 | ] 54 | 55 | resources = [ 56 | "arn:aws:s3:::${aws_s3_bucket.prometheus_config.id}/*", 57 | "arn:aws:s3:::${aws_s3_bucket.prometheus_config.id}", 58 | ] 59 | } 60 | } 61 | 62 | #Attach policy to role 63 | resource "aws_iam_role_policy_attachment" "iam_policy" { 64 | role = aws_iam_role.prometheus_role.name 65 | policy_arn = aws_iam_policy.prometheus_instance_profile.arn 66 | } 67 | 68 | resource "aws_iam_role_policy_attachment" "session_manager_access" { 69 | role = aws_iam_role.prometheus_role.name 70 | policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM" 71 | } 72 | 73 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/main.tf: -------------------------------------------------------------------------------- 1 | data "template_file" "prometheus_config_template" { 2 | template = file("${path.module}/prometheus.conf.tpl") 3 | 4 | vars = { 5 | environment = var.environment 6 | } 7 | } 8 | 9 | locals { 10 | prometheus_config = yamldecode(data.template_file.prometheus_config_template.rendered) 11 | final_scrape_configs = concat(local.prometheus_config["scrape_configs"], var.extra_scrape_configs) 12 | final_prometheus_config = merge(local.prometheus_config, { "scrape_configs" = local.final_scrape_configs }) 13 | final_prometheus_config_yaml = yamlencode(local.final_prometheus_config) 14 | } 15 | 16 | resource "aws_route53_record" "prom_ec2_a_record" { 17 | count = 3 18 | 19 | 
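# Creates prom-ec2-1 to prom-ec2-3 in the private zone, each A record pointing
# at the matching Prometheus instance's private IP.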
zone_id = var.private_zone_id 20 | name = "prom-ec2-${count.index + 1}" 21 | type = "A" 22 | ttl = 300 23 | 24 | records = [var.prom_private_ips[count.index]] 25 | } 26 | 27 | resource "aws_s3_bucket_object" "prometheus_config" { 28 | bucket = var.prometheus_config_bucket 29 | key = "prometheus/prometheus.yml" 30 | content = local.final_prometheus_config_yaml 31 | etag = md5(local.final_prometheus_config_yaml) 32 | } 33 | 34 | resource "aws_s3_bucket_object" "alerts-config" { 35 | bucket = var.prometheus_config_bucket 36 | key = "prometheus/alerts/observe-alerts.yml" 37 | source = "${var.alerts_path}observe-alerts.yml" 38 | etag = filemd5("${var.alerts_path}observe-alerts.yml") 39 | } 40 | 41 | resource "aws_s3_bucket_object" "alerts-data-gov-uk-config" { 42 | bucket = var.prometheus_config_bucket 43 | key = "prometheus/alerts/data-gov-uk-alerts.yml" 44 | source = "${var.alerts_path}data-gov-uk-alerts.yml" 45 | etag = filemd5("${var.alerts_path}data-gov-uk-alerts.yml") 46 | } 47 | 48 | resource "aws_s3_bucket_object" "alerts-doc-checking-config" { 49 | bucket = var.prometheus_config_bucket 50 | key = "prometheus/alerts/doc-checking-alerts.yml" 51 | source = "${var.alerts_path}doc-checking-alerts.yml" 52 | etag = filemd5("${var.alerts_path}doc-checking-alerts.yml") 53 | } 54 | 55 | resource "aws_s3_bucket_object" "alerts-notify-config" { 56 | bucket = var.prometheus_config_bucket 57 | key = "prometheus/alerts/notify-alerts.yml" 58 | source = "${var.alerts_path}notify-alerts.yml" 59 | etag = filemd5("${var.alerts_path}notify-alerts.yml") 60 | } 61 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/README.md: -------------------------------------------------------------------------------- 1 | # Example Alert 2 | 3 | Below is an example alert that you can copy and rewrite to create your 4 | own alert. [View the RE 5 | docs](https://reliability-engineering.cloudapps.digital/monitoring-alerts.html#create-and-edit-alerts-using-prometheus) 6 | for more information on what to consider when writing alerts. 7 | 8 | It alerts if the number of 5xx status codes exceeds 25% of total 9 | requests for 120 seconds (2 minutes) or more. 10 | 11 | It is broken down into: 12 | 13 | - `alert`: The alert name, in the format `TeamName_Problem`. 14 | - `expr`: The PromQL query that queries for the data, followed by `>= 15 | 0.25` defining the threshold of values. 16 | - `for`: Optional: The alert fires if the query is over threshold for 17 | this amount of time. 18 | - `labels`: 19 | - `product`: The team name or product for the team that this alert 20 | refers to. For example, "Observe" or "Prometheus". 21 | - `annotations`: 22 | - `summary`: Required: A summary of what the alert shows. 23 | - `description`: Required: A more detailed description of what the alert shows. 24 | - `dashboard_url`: Optional: A link to your team's dashboard (ie Grafana) to see 25 | trends for the alert. 26 | - `runbook`: Optional: A link to your team manual describing what to do about 27 | the alert. 28 | - `logs`: Optional: A link to your logs (ie Kibana URL). 29 | 30 | In the `annotations` section, `{{ $labels.app }}` refers to your team 31 | name, and `{{ $labels.job }}` refers to your app name. 
32 | 33 | ``` 34 | - alert: Example_AppRequestsExcess5xx 35 | expr: sum by(app) (rate(requests{org="example-paas-org", space="example-paas-space", status_range="5xx"}[5m])) / sum by(app) (rate(requests{org="example-paas-org", space="example-paas-space"}[5m])) >= 0.25 36 | for: 120s 37 | labels: 38 | product: "example-team-name" 39 | annotations: 40 | summary: "App {{ $labels.app }} has too many 5xx errors" 41 | description: "App {{ $labels.app }} has 5xx errors in excess of 25% of total requests" 42 | dashboard_url: https://grafana-paas.cloudapps.digital/d//?refresh=1m&orgId=1 43 | runbook: "https://re-team-manual.cloudapps.digital/" 44 | logs: "https://kibana.logit.io/s//app/kibana#/discover" 45 | ``` 46 | -------------------------------------------------------------------------------- /terraform/projects/alertmanager-production/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## Project: alertmanager 3 | * 4 | * Create services and task definitions for the ECS cluster 5 | * 6 | */ 7 | 8 | variable "aws_region" { 9 | type = string 10 | description = "AWS region" 11 | default = "eu-west-1" 12 | } 13 | 14 | data "pass_password" "cronitor_production_url" { 15 | path = "cronitor/cronitor-production-url" 16 | } 17 | 18 | # Resources 19 | # -------------------------------------------------------------- 20 | 21 | ## Providers 22 | 23 | terraform { 24 | required_version = "~> 0.13.3" 25 | 26 | backend "s3" { 27 | bucket = "prometheus-production" 28 | key = "app-ecs-services-modular.tfstate" 29 | region = "eu-west-1" 30 | } 31 | } 32 | 33 | provider "aws" { 34 | region = var.aws_region 35 | } 36 | 37 | provider "pass" { 38 | store_dir = "~/.password-store/re-secrets/observe" 39 | refresh_store = true 40 | } 41 | 42 | variable "remote_state_bucket" { 43 | type = string 44 | description = "S3 bucket we store our terraform state in" 45 | default = "prometheus-production" 46 | } 47 | 48 | module "alertmanager" { 49 | source = "../../modules/alertmanager" 50 | 51 | remote_state_bucket = var.remote_state_bucket 52 | environment = "production" 53 | observe_cronitor = data.pass_password.cronitor_production_url.password 54 | allowed_cidrs = [ 55 | # Office IPs 56 | "213.86.153.211/32", 57 | "213.86.153.212/32", 58 | "213.86.153.213/32", 59 | "213.86.153.214/32", 60 | "213.86.153.231/32", 61 | "213.86.153.235/32", 62 | "213.86.153.236/32", 63 | "213.86.153.237/32", 64 | "85.133.67.244/32", 65 | "51.149.8.0/25", 66 | "51.149.8.128/29", 67 | 68 | # verify prod 69 | "35.178.25.41/32", 70 | 71 | "35.177.2.97/32", 72 | "35.176.169.64/32", 73 | 74 | # verify integration 75 | "3.8.68.252/32", 76 | 77 | "3.8.41.125/32", 78 | "3.8.225.106/32", 79 | 80 | # verify staging 81 | "35.177.140.5/32", 82 | 83 | "18.130.58.164/32", 84 | "35.176.196.169/32", 85 | 86 | # concourse 87 | "35.177.37.128/32", 88 | 89 | "35.176.252.164/32", 90 | 91 | "51.149.9.112/29", # CO 92 | "51.149.9.240/29", # CO 93 | ] 94 | } 95 | 96 | output "alertmanager_ecs_clusters_services" { 97 | value = module.alertmanager.ecs_clusters_services 98 | } 99 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/doc-checking-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: DocChecking 3 | rules: 4 | - alert: AuditEventsNotProcessing 5 | annotations: 6 | message: >- 7 | The audit consumer should be writing audit events to the 8 | database. 
This hasn't happened in a while. 9 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/AuditEventsNotProcessing/ 10 | expr: | 11 | sum without(instance) (rate(audit_consumer_events_processing_attempts_total[5m])) 12 | - 13 | sum without(instance) (rate(audit_consumer_events_processing_failures_total[5m])) 14 | == 0 15 | for: 10m 16 | labels: 17 | product: doc-checking 18 | severity: p4 19 | - alert: AuditEventsFailedProcessing 20 | annotations: 21 | message: >- 22 | The audit consumer has a high error rate when attempting to 23 | write audit events to the database. Those events may have 24 | ended up on the dead letter queue. 25 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/AuditEventsFailedProcessing/ 26 | expr: | 27 | sum without(instance) (rate(audit_consumer_events_processing_failures_total[2m])) > 3 28 | for: 5m 29 | labels: 30 | product: doc-checking 31 | severity: p4 32 | - alert: AuditEventsOnTheDeadLetterQueue 33 | annotations: 34 | message: | 35 | There are unprocessed audit events on the dead letter queue. 36 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/AuditEventsOnTheDeadLetterQueue/ 37 | expr: | 38 | max without(instance) (audit_consumer_dead_letter_queue_approximate_messages) > 0 39 | for: 5m 40 | labels: 41 | product: doc-checking 42 | severity: p4 43 | - alert: RedisNotAvailable 44 | annotations: 45 | message: | 46 | Redis is not available for rate limiting and quota. 47 | runbook_url: https://dcs-service-manual.cloudapps.digital/responding-to-alerts/RedisNotAvailable/ 48 | expr: | 49 | (avg by (job) (dcs_dmz_proxy_using_redis_for_rate_limiting) != 1) or (avg by (job) (dcs_agents_using_redis_for_rate_limiting) != 1) 50 | for: 5m 51 | labels: 52 | product: doc-checking 53 | severity: p4 54 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/paas-config/prometheus.conf.tpl: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 30s 3 | evaluation_interval: 30s 4 | alerting: 5 | alertmanagers: 6 | - scheme: http 7 | dns_sd_configs: 8 | - names: 9 | - 'alertmanager.local.gds-reliability.engineering' 10 | type: 'A' 11 | port: 9093 12 | rule_files: 13 | - "/etc/prometheus/alerts/*" 14 | scrape_configs: 15 | - job_name: prometheus 16 | ec2_sd_configs: 17 | - region: eu-west-1 18 | port: 9090 19 | relabel_configs: 20 | - source_labels: ['__meta_ec2_tag_Environment'] 21 | regex: '${environment}' 22 | action: keep 23 | - source_labels: ['__meta_ec2_tag_Service'] 24 | regex: 'observe-prometheus' 25 | action: keep 26 | - source_labels: ['__meta_ec2_availability_zone'] 27 | target_label: availability_zone 28 | - source_labels: ['__meta_ec2_instance_id'] 29 | replacement: '$1:9090' 30 | target_label: instance 31 | - job_name: paas-ireland-targets 32 | scheme: http 33 | proxy_url: 'http://localhost:8080' 34 | file_sd_configs: 35 | - files: ['/etc/prometheus/ireland-targets/*.json'] 36 | refresh_interval: 30s 37 | relabel_configs: 38 | - target_label: region 39 | replacement: ireland 40 | - job_name: paas-london-targets 41 | scheme: http 42 | proxy_url: 'http://localhost:8080' 43 | file_sd_configs: 44 | - files: ['/etc/prometheus/london-targets/*.json'] 45 | refresh_interval: 30s 46 | relabel_configs: 47 | - target_label: region 48 | replacement: london 49 | - job_name: alertmanager 50 | dns_sd_configs: 51 | - names: 52 | - 'alertmanager.local.gds-reliability.engineering' 53 
| type: 'A' 54 | port: 9093 55 | - job_name: prometheus_node 56 | ec2_sd_configs: 57 | - region: eu-west-1 58 | port: 9100 59 | relabel_configs: 60 | - source_labels: ['__meta_ec2_tag_Environment'] 61 | regex: '${environment}' 62 | action: keep 63 | - source_labels: ['__meta_ec2_tag_Service'] 64 | regex: 'observe-prometheus' 65 | action: keep 66 | - source_labels: ['__meta_ec2_availability_zone'] 67 | target_label: availability_zone 68 | - source_labels: ['__meta_ec2_instance_id'] 69 | replacement: '$1:9100' 70 | target_label: instance 71 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/data-gov-uk-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: DataGovUk 3 | rules: 4 | - alert: DataGovUk_HighCpuUsage 5 | expr: avg(cpu{job="metric-exporter"}) without (exported_instance) >= 80 6 | for: 5m 7 | labels: 8 | product: "data-gov-uk" 9 | annotations: 10 | summary: "App {{ $labels.app }} has high CPU usage" 11 | message: "Application {{ $labels.app }} has been using over 80% CPU (averaged over all instances) for 5 minutes or more" 12 | - alert: DataGovUk_HighDiskUsage 13 | expr: max(disk_utilization{job="metric-exporter"}) without (exported_instance) >= 80 14 | labels: 15 | product: "data-gov-uk" 16 | annotations: 17 | summary: "App {{ $labels.app }} has high disk usage" 18 | message: "Application {{ $labels.app }} has an instance which is using over 80% disk." 19 | - alert: DataGovUk_ElasticSearchIndexSizeIncrease 20 | expr: max without(instance, host, name, es_client_node, es_data_node, es_ingest_node, es_master_node) (delta(elasticsearch_indices_docs{space="data-gov-uk"}[30m])) >= 300 21 | for: 1m 22 | labels: 23 | product: "data-gov-uk" 24 | annotations: 25 | summary: "Index size of Elasticsearch for {{ $labels.job }} has increased significantly" 26 | message: "The index size of Elasticsearch for {{ $labels.job }} has increased by more than 300 documents in the last 30 minutes" 27 | runbook: https://docs.publishing.service.gov.uk/manual/data-gov-uk-troubleshooting.html#different-number-of-datasets-in-ckan-to-find 28 | - alert: DataGovUk_ElasticSearchIndexSizeDecrease 29 | expr: max without(instance, host, name, es_client_node, es_data_node, es_ingest_node, es_master_node) (delta(elasticsearch_indices_docs{space="data-gov-uk"}[30m])) <= -300 30 | for: 1m 31 | labels: 32 | product: "data-gov-uk" 33 | annotations: 34 | summary: "Index size of Elasticsearch for {{ $labels.job }} has decreased significantly" 35 | message: "The index size of Elasticsearch for {{ $labels.job }} has decreased by more than 300 documents in the last 30 minutes" 36 | runbook: https://docs.publishing.service.gov.uk/manual/data-gov-uk-troubleshooting.html#different-number-of-datasets-in-ckan-to-find 37 | - alert: DataGovUk_HighSidekiqEnqueuedJobs 38 | expr: sidekiq_enqueued_jobs{org="gds-data-gov-uk",job="publish-data-production-queue-monitor"} > 800 39 | for: 5m 40 | labels: 41 | product: "data-gov-uk" 42 | annotations: 43 | summary: "Sidekiq's enqueued jobs do not seem to be clearing for Publish Data on production" 44 | message: "Sidekiq has had more than 800 enqueued jobs for Publish Data on production for at least 5 minutes" 45 | runbook: https://docs.publishing.service.gov.uk/manual/data-gov-uk-monitoring.html#sidekiq-publish 46 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/targets.tf: 
-------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "prometheus_targets" { 2 | bucket = "govukobserve-targets-${var.environment}" 3 | acl = "private" 4 | force_destroy = true 5 | 6 | versioning { 7 | enabled = true 8 | } 9 | 10 | tags = merge(local.default_tags, { 11 | Name = "${var.environment}-ireland-targets" 12 | }) 13 | } 14 | 15 | resource "aws_iam_user" "targets_writer" { 16 | name = "targets-writer" 17 | path = "/${var.environment}/" 18 | 19 | tags = merge(local.default_tags, { 20 | Name = "${var.environment}-ireland-targets-writer" 21 | }) 22 | } 23 | 24 | resource "aws_iam_user_policy" "writer_has_full_access_to_targets_bucket" { 25 | name = "targets_bucket_full_access" 26 | user = aws_iam_user.targets_writer.name 27 | 28 | policy = < { "message" => "%{SYSLOG5424PRI}%{NONNEGINT:syslog_ver} +(?:%{TIMESTAMP_ISO8601:syslog_timestamp}|-) +(?:%{HOSTNAME:syslog_host}|-) +(?:%{NOTSPACE:syslog_app}|-) +(?:%{NOTSPACE:syslog_proc}|-) +(?:%{WORD:syslog_msgid}|-) +(?:%{SYSLOG5424SD:syslog_sd}|-|) +%{GREEDYDATA:syslog_msg}" } 6 | # if successful, save original `@timestamp` and `host` fields created by logstash 7 | add_field => [ "received_at", "%{@timestamp}" ] 8 | add_field => [ "received_from", "%{host}" ] 9 | add_tag => ["cf"] 10 | tag_on_failure => ["_syslogparsefailure"] 11 | } 12 | } 13 | 14 | if "cf" in [tags] { 15 | # parse the syslog pri field into severity/facility 16 | if [syslog5424_pri] { 17 | syslog_pri { syslog_pri_field_name => 'syslog5424_pri' } 18 | } 19 | 20 | # replace @timestamp field with the one from syslog 21 | date { match => [ "syslog_timestamp", "ISO8601" ] } 22 | 23 | # if we successfully parsed cf syslog, replace the message and source_host fields 24 | mutate { 25 | replace => [ "source_host", "%{syslog_host}" ] 26 | replace => [ "message", "%{syslog_msg}" ] 27 | } 28 | 29 | # Cloud Foundry passes the app name, space and organisation in the syslog_host 30 | # Filtering them into separate fields makes it easier to query multiple apps in a single Kibana instance 31 | dissect { 32 | mapping => { "syslog_host" => "%{[cf][org]}.%{[cf][space]}.%{[cf][app]}" } 33 | tag_on_failure => ["_sysloghostdissectfailure"] 34 | } 35 | 36 | # Cloud Foundry gorouter logs 37 | if [syslog_proc] =~ "RTR" { 38 | mutate { replace => { "type" => "gorouter" } } 39 | grok { 40 | match => { "syslog_msg" => "%{HOSTNAME:[access][host]} - \[%{TIMESTAMP_ISO8601:router_timestamp}\] \"%{WORD:[access][method]} %{NOTSPACE:[access][url]} HTTP/%{NUMBER:[access][http_version]}\" %{NONNEGINT:[access][response_code]:int} %{NONNEGINT:[access][body_received][bytes]:int} %{NONNEGINT:[access][body_sent][bytes]:int} %{QUOTEDSTRING:[access][referrer]} %{QUOTEDSTRING:[access][agent]} \"%{HOSTPORT:[access][remote_ip_and_port]}\" \"%{HOSTPORT:[access][upstream_ip_and_port]}\" %{GREEDYDATA:router_keys}" } 41 | tag_on_failure => ["_routerparsefailure"] 42 | add_tag => ["gorouter"] 43 | } 44 | # replace @timestamp field with the one from router access log 45 | date { 46 | match => [ "router_timestamp", "ISO8601" ] 47 | } 48 | kv { 49 | source => "router_keys" 50 | target => "router" 51 | value_split => ":" 52 | remove_field => "router_keys" 53 | } 54 | } 55 | 56 | # Application logs 57 | if [syslog_proc] =~ "APP" { 58 | json { 59 | source => "syslog_msg" 60 | add_tag => ["app"] 61 | } 62 | } 63 | 64 | # User agent parsing 65 | if [access][agent] { 66 | useragent { 67 | source => "[access][agent]" 68 | target => "[access][user_agent]" 69 | } 70 | } 71 | } 72 | 
} 73 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## Module: alertmanager 3 | * 4 | * Create services and task definitions for the ECS cluster 5 | * 6 | */ 7 | 8 | variable "aws_region" { 9 | type = string 10 | description = "AWS region" 11 | default = "eu-west-1" 12 | } 13 | 14 | variable "remote_state_bucket" { 15 | type = string 16 | description = "S3 bucket we store our terraform state in" 17 | default = "ecs-monitoring" 18 | } 19 | 20 | variable "environment" { 21 | type = string 22 | description = "Unique name for this collection of resources" 23 | default = "ecs-monitoring" 24 | } 25 | 26 | variable "observe_cronitor" { 27 | type = string 28 | description = "URL to send Observe heartbeats to" 29 | default = "" 30 | } 31 | 32 | variable "allowed_cidrs" { 33 | type = list(string) 34 | description = "List of CIDRs which are able to access alertmanager, default are GDS ips and concourse egress" 35 | 36 | default = [ 37 | "213.86.153.211/32", 38 | "213.86.153.212/32", 39 | "213.86.153.213/32", 40 | "213.86.153.214/32", 41 | "213.86.153.231/32", 42 | "213.86.153.235/32", 43 | "213.86.153.236/32", 44 | "213.86.153.237/32", 45 | "85.133.67.244/32", 46 | "35.177.37.128/32", 47 | "35.176.252.164/32", 48 | "51.149.8.0/25", 49 | "51.149.8.128/29", # CO 50 | "51.149.9.112/29", # CO 51 | "51.149.9.240/29", # CO 52 | ] 53 | } 54 | 55 | locals { 56 | default_tags = { 57 | Terraform = "true" 58 | Project = "alertmanager" 59 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 60 | Environment = var.environment 61 | Service = "alertmanager" 62 | } 63 | vpc_id = data.terraform_remote_state.infra_networking.outputs.vpc_id 64 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 65 | availability_zones = data.aws_subnet.public_subnets.*.availability_zone 66 | } 67 | 68 | # Resources 69 | # -------------------------------------------------------------- 70 | 71 | ## Data sources 72 | data "terraform_remote_state" "infra_networking" { 73 | backend = "s3" 74 | 75 | config = { 76 | bucket = var.remote_state_bucket 77 | key = "infra-networking-modular.tfstate" 78 | region = var.aws_region 79 | } 80 | } 81 | 82 | data "terraform_remote_state" "infra_security_groups" { 83 | backend = "s3" 84 | 85 | config = { 86 | bucket = var.remote_state_bucket 87 | key = "infra-security-groups-modular.tfstate" 88 | region = var.aws_region 89 | } 90 | } 91 | 92 | data "aws_availability_zones" "available" {} 93 | 94 | data "aws_subnet" "public_subnets" { 95 | count = length(data.terraform_remote_state.infra_networking.outputs.public_subnets) 96 | id = data.terraform_remote_state.infra_networking.outputs.public_subnets[count.index] 97 | } 98 | 99 | data "aws_subnet" "private_subnets" { 100 | count = length(data.terraform_remote_state.infra_networking.outputs.private_subnets) 101 | id = data.terraform_remote_state.infra_networking.outputs.private_subnets[count.index] 102 | } 103 | 104 | ## Resources 105 | 106 | resource "aws_cloudwatch_log_group" "task_logs" { 107 | name = var.environment 108 | retention_in_days = 7 109 | 110 | tags = merge(local.default_tags, { 111 | Name = "${var.environment}-alertmanager-task-logs" 112 | }) 113 | } 114 | 115 | ## Outputs 116 | 117 | output "ecs_clusters_services" { 118 | description = "Names of ECS services created, listed by ECS cluster name" 119 | value = transpose({ 120 | for _, 
service in aws_ecs_service.alertmanager_alb: 121 | service.name => [ service.cluster ] 122 | }) 123 | } 124 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/main.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | filebeat_count = var.logstash_host != "" ? 1 : 0 3 | default_tags = { 4 | ManagedBy = "terraform" 5 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 6 | Environment = var.environment 7 | Service = "observe-prometheus" 8 | } 9 | } 10 | 11 | resource "aws_key_pair" "ssh_key" { 12 | count = var.enable_ssh == true ? 1 : 0 13 | key_name = "${var.environment}-prom-key" 14 | public_key = file("~/.ssh/id_rsa.pub") 15 | } 16 | 17 | resource "aws_instance" "prometheus" { 18 | count = length(keys(var.availability_zones)) 19 | 20 | ami = var.ami_id 21 | instance_type = var.instance_size 22 | user_data = data.template_file.user_data_script[count.index].rendered 23 | iam_instance_profile = aws_iam_instance_profile.prometheus_instance_profile.id 24 | subnet_id = var.subnet_ids[count.index] 25 | 26 | associate_public_ip_address = var.enable_ssh 27 | 28 | key_name = var.enable_ssh ? format("%s-prom-key", var.environment) : "" 29 | 30 | vpc_security_group_ids = var.vpc_security_groups 31 | 32 | tags = merge(local.default_tags, { 33 | Name = "paas-${var.environment}-prometheus-${element(keys(var.availability_zones), count.index)}" 34 | }) 35 | } 36 | 37 | resource "aws_volume_attachment" "attach-prometheus-disk" { 38 | count = length(keys(var.availability_zones)) 39 | 40 | device_name = var.device_mount_path 41 | volume_id = aws_ebs_volume.prometheus-disk[count.index].id 42 | instance_id = aws_instance.prometheus[count.index].id 43 | 44 | # Required to work around a bug in terraform https://github.com/hashicorp/terraform/issues/2957 45 | # terraform tries to destroy the attachment before stoping/destorying the instance 46 | skip_destroy = true 47 | } 48 | 49 | resource "aws_ebs_volume" "prometheus-disk" { 50 | count = length(keys(var.availability_zones)) 51 | 52 | availability_zone = element(keys(var.availability_zones), count.index) 53 | size = var.data_volume_size 54 | 55 | tags = merge(local.default_tags, { 56 | Name = "prometheus-disk" 57 | }) 58 | } 59 | 60 | data "template_file" "user_data_script" { 61 | count = length(keys(var.availability_zones)) 62 | 63 | template = file("${path.module}/cloud.conf") 64 | 65 | vars = { 66 | config_bucket = aws_s3_bucket.prometheus_config.id 67 | region = var.region 68 | ireland_targets_bucket = aws_s3_bucket.prometheus_targets.id 69 | london_targets_bucket = aws_s3_bucket.prometheus_london_targets.id 70 | alerts_bucket = aws_s3_bucket.prometheus_config.id 71 | prom_external_url = "https://${var.prometheus_public_fqdns[count.index]}" 72 | logstash_host = var.logstash_host 73 | prometheus_htpasswd = var.prometheus_htpasswd 74 | allowed_cidrs = join("\n ", formatlist("allow %s;", var.allowed_cidrs)) 75 | data_volume_size = var.data_volume_size 76 | } 77 | } 78 | 79 | resource "aws_s3_bucket" "prometheus_config" { 80 | bucket = var.config_bucket 81 | acl = "private" 82 | force_destroy = true 83 | 84 | versioning { 85 | enabled = true 86 | } 87 | 88 | tags = merge(local.default_tags, { 89 | Name = "${var.environment}-prometheus-config" 90 | }) 91 | } 92 | 93 | data "template_file" "filebeat_conf" { 94 | count = local.filebeat_count 95 | template = file("${path.module}/filebeat.yml.tpl") 96 | 97 | vars = { 98 | logstash_host 
= var.logstash_host 99 | environment = var.environment 100 | } 101 | } 102 | 103 | resource "aws_s3_bucket_object" "filebeat" { 104 | count = local.filebeat_count 105 | bucket = var.config_bucket 106 | key = "filebeat/filebeat.yml" 107 | content = data.template_file.filebeat_conf[0].rendered 108 | } 109 | 110 | resource "aws_lb_target_group_attachment" "prom_target_group_attachment" { 111 | count = length(var.prometheus_target_group_arns) 112 | target_group_arn = var.prometheus_target_group_arns[count.index] 113 | target_id = aws_instance.prometheus[count.index].id 114 | port = 80 115 | } 116 | 117 | -------------------------------------------------------------------------------- /terraform/modules/infra-networking/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## module: infra-networking 3 | * 4 | * Terraform module to deploy the networking required for a VPC and 5 | * related services. You will often have multiple VPCs in an account 6 | * 7 | */ 8 | 9 | variable "aws_region" { 10 | type = string 11 | description = "AWS region" 12 | default = "eu-west-1" 13 | } 14 | 15 | variable "environment" { 16 | type = string 17 | description = "Unique name for this collection of resources" 18 | } 19 | 20 | variable "prometheus_subdomain" { 21 | type = string 22 | description = "Subdomain for prometheus" 23 | default = "monitoring" 24 | } 25 | 26 | # locals 27 | # -------------------------------------------------------------- 28 | 29 | locals { 30 | default_tags = { 31 | Terraform = "true" 32 | Project = "infra-networking" 33 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 34 | Environment = var.environment 35 | } 36 | 37 | subdomain_name = "${var.prometheus_subdomain}.gds-reliability.engineering" 38 | private_subdomain_name = "${var.environment}.monitoring.private" 39 | } 40 | 41 | ## Data sources 42 | 43 | data "aws_availability_zones" "available" {} 44 | 45 | ## Resources 46 | 47 | module "vpc" { 48 | source = "terraform-aws-modules/vpc/aws" 49 | version = "3.5.0" 50 | 51 | name = "observe-${var.environment}" 52 | cidr = "10.0.0.0/16" 53 | 54 | # subnets assumes 3 AZs although 3AZs are not implemented elsewhere 55 | azs = data.aws_availability_zones.available.names 56 | private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] 57 | public_subnets = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"] 58 | 59 | create_database_subnet_group = false 60 | 61 | enable_nat_gateway = true 62 | single_nat_gateway = false 63 | 64 | enable_dns_hostnames = true 65 | enable_dns_support = true 66 | 67 | enable_dhcp_options = true 68 | dhcp_options_domain_name = local.private_subdomain_name 69 | 70 | # no `Name` tag unlike other resources as this is taken care of by the vpc module `name` property 71 | tags = local.default_tags 72 | } 73 | 74 | resource "aws_route53_zone" "subdomain" { 75 | name = local.subdomain_name 76 | } 77 | 78 | resource "aws_route53_zone" "private" { 79 | name = local.private_subdomain_name 80 | force_destroy = true 81 | vpc { 82 | vpc_id = module.vpc.vpc_id 83 | } 84 | } 85 | 86 | ## Outputs 87 | 88 | output "vpc_id" { 89 | value = module.vpc.vpc_id 90 | description = "VPC ID where the stack resources are created" 91 | } 92 | 93 | output "private_subnets" { 94 | value = module.vpc.private_subnets 95 | description = "List of private subnet IDs" 96 | } 97 | 98 | output "public_subnets" { 99 | value = module.vpc.public_subnets 100 | description = "List of public subnet IDs" 101 | } 102 | 103 | output 
"public_zone_id" { 104 | value = aws_route53_zone.subdomain.zone_id 105 | description = "Route 53 Zone ID for publicly visible zone" 106 | } 107 | 108 | output "public_subdomain" { 109 | value = aws_route53_zone.subdomain.name 110 | description = "This is the subdomain for root zone" 111 | } 112 | 113 | output "private_zone_id" { 114 | value = aws_route53_zone.private.zone_id 115 | description = "Route 53 Zone ID for the internal zone" 116 | } 117 | 118 | output "private_zone_name" { 119 | value = aws_route53_zone.private.name 120 | description = "Route 53 Zone name for the internal zone" 121 | } 122 | 123 | output "private_subnets_ips" { 124 | value = module.vpc.private_subnets_cidr_blocks 125 | description = "List of private subnet IPs" 126 | } 127 | 128 | output "nat_gateway" { 129 | value = module.vpc.nat_public_ips 130 | description = "List of nat gateway IP" 131 | } 132 | 133 | output "private_subdomain" { 134 | value = aws_route53_zone.private.name 135 | description = "This is the subdomain for private zone" 136 | } 137 | 138 | output "subnets_by_az" { 139 | value = zipmap( 140 | data.aws_availability_zones.available.names, 141 | module.vpc.private_subnets_cidr_blocks, 142 | ) 143 | 144 | description = "Map of availability zones to private subnets" 145 | } 146 | 147 | -------------------------------------------------------------------------------- /terraform/projects/prom-ec2/paas-production/main.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | environment = "production" 3 | config_bucket = "gdsobserve-paas-${local.environment}-config-store" 4 | } 5 | 6 | terraform { 7 | required_version = "~> 0.13.3" 8 | 9 | backend "s3" { 10 | bucket = "govukobserve-tfstate-prom-enclave-paas-production" 11 | key = "prometheus.tfstate" 12 | encrypt = true 13 | region = "eu-west-1" 14 | } 15 | } 16 | 17 | provider "aws" { 18 | region = "eu-west-1" 19 | allowed_account_ids = ["455214962221"] 20 | } 21 | 22 | data "terraform_remote_state" "infra_networking" { 23 | backend = "s3" 24 | 25 | config = { 26 | bucket = "prometheus-${local.environment}" 27 | key = "infra-networking-modular.tfstate" 28 | region = "eu-west-1" 29 | } 30 | } 31 | 32 | data "terraform_remote_state" "infra_security_groups" { 33 | backend = "s3" 34 | 35 | config = { 36 | bucket = "prometheus-${local.environment}" 37 | key = "infra-security-groups-modular.tfstate" 38 | region = "eu-west-1" 39 | } 40 | } 41 | 42 | data "terraform_remote_state" "app_ecs_albs" { 43 | backend = "s3" 44 | 45 | config = { 46 | bucket = "prometheus-${local.environment}" 47 | key = "app-ecs-albs-modular.tfstate" 48 | region = "eu-west-1" 49 | } 50 | } 51 | 52 | provider "pass" { 53 | store_dir = "~/.password-store/re-secrets/observe" 54 | refresh_store = true 55 | } 56 | 57 | data "pass_password" "logstash_endpoint" { 58 | path = "logit/prometheus-paas-logstash-endpoint-prod" 59 | } 60 | 61 | data "pass_password" "prometheus_htpasswd" { 62 | path = "prometheus-basic-auth-htpasswd" 63 | } 64 | 65 | data "pass_password" "dm_elasticsearch_metrics_password" { 66 | path = "dm-elasticsearch-metrics-password" 67 | } 68 | 69 | data "pass_password" "dm_paas_metrics_username" { 70 | path = "dm-paas-metrics-username" 71 | } 72 | 73 | data "pass_password" "dm_paas_metrics_password" { 74 | path = "dm-paas-metrics-password" 75 | } 76 | 77 | module "ami" { 78 | source = "../../../modules/common/ami" 79 | } 80 | 81 | module "prometheus" { 82 | source = "../../../modules/prom-ec2/prometheus" 83 | 84 | ami_id = 
module.ami.ubuntu_focal_ami_id 85 | 86 | target_vpc = data.terraform_remote_state.infra_networking.outputs.vpc_id 87 | enable_ssh = false 88 | 89 | environment = local.environment 90 | config_bucket = local.config_bucket 91 | logstash_host = data.pass_password.logstash_endpoint.password 92 | 93 | prometheus_public_fqdns = data.terraform_remote_state.app_ecs_albs.outputs.prom_public_record_fqdns 94 | 95 | subnet_ids = data.terraform_remote_state.infra_networking.outputs.private_subnets 96 | availability_zones = data.terraform_remote_state.infra_networking.outputs.subnets_by_az 97 | vpc_security_groups = [data.terraform_remote_state.infra_security_groups.outputs.prometheus_ec2_sg_id] 98 | region = "eu-west-1" 99 | 100 | prometheus_htpasswd = data.pass_password.prometheus_htpasswd.password 101 | prometheus_target_group_arns = data.terraform_remote_state.app_ecs_albs.outputs.prometheus_target_group_arns 102 | } 103 | 104 | module "paas-config" { 105 | source = "../../../modules/prom-ec2/paas-config" 106 | 107 | environment = local.environment 108 | 109 | prometheus_config_bucket = module.prometheus.s3_config_bucket 110 | alerts_path = "../../../modules/prom-ec2/alerts-config/alerts/" 111 | 112 | prom_private_ips = module.prometheus.private_ip_addresses 113 | private_zone_id = data.terraform_remote_state.infra_networking.outputs.private_zone_id 114 | 115 | extra_scrape_configs = yamldecode(templatefile("${path.module}/extra-prometheus-scrape-configs.yml.tpl", { 116 | dm_elasticsearch_metrics_password = data.pass_password.dm_elasticsearch_metrics_password.password 117 | dm_paas_metrics_username = data.pass_password.dm_paas_metrics_username.password 118 | dm_paas_metrics_password = data.pass_password.dm_paas_metrics_password.password 119 | })) 120 | } 121 | 122 | output "instance_ids" { 123 | value = "[\n ${join("\n ", module.prometheus.prometheus_instance_id)}\n]" 124 | } 125 | 126 | output "prometheus_config_etag" { 127 | value = module.paas-config.prometheus_config_etag 128 | } 129 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/security-group.tf: -------------------------------------------------------------------------------- 1 | resource "aws_security_group" "alertmanager_alb" { 2 | name = "${var.environment}-alertmanager-alb" 3 | vpc_id = local.vpc_id 4 | description = "Alertmanager ALB" 5 | 6 | tags = merge( 7 | local.default_tags, 8 | { 9 | Name = "alertmanager-alb", 10 | }, 11 | ) 12 | } 13 | 14 | resource "aws_security_group" "alertmanager_task" { 15 | name = "${var.environment}-alertmanager-task" 16 | vpc_id = local.vpc_id 17 | description = "Controls ingress and egress for the alertmanager task" 18 | 19 | tags = merge( 20 | local.default_tags, 21 | { 22 | Name = "alertmanager-task", 23 | }, 24 | ) 25 | } 26 | 27 | # Alertmanager is behind an NLB, so it needs to allow ingress from the 28 | # allowed public internet cidrs directly 29 | resource "aws_security_group_rule" "ingress_from_allowed_cidrs_to_alertmanager_9093" { 30 | security_group_id = aws_security_group.alertmanager_task.id 31 | type = "ingress" 32 | from_port = 9093 33 | to_port = 9093 34 | protocol = "tcp" 35 | cidr_blocks = var.allowed_cidrs 36 | } 37 | 38 | # Alertmanager ALB needs to allow ingress from the allowed public 39 | # internet cidrs 40 | resource "aws_security_group_rule" "ingress_from_allowed_cidrs_to_alertmanager_alb_http" { 41 | security_group_id = aws_security_group.alertmanager_alb.id 42 | type = "ingress" 43 | from_port = 80 44 | to_port = 
80 45 | protocol = "tcp" 46 | cidr_blocks = var.allowed_cidrs 47 | } 48 | 49 | resource "aws_security_group_rule" "ingress_from_allowed_cidrs_to_alertmanager_alb_https" { 50 | security_group_id = aws_security_group.alertmanager_alb.id 51 | type = "ingress" 52 | from_port = 443 53 | to_port = 443 54 | protocol = "tcp" 55 | cidr_blocks = var.allowed_cidrs 56 | } 57 | 58 | # NLB health checks come from the public subnet IP range 59 | resource "aws_security_group_rule" "ingress_from_public_subnets_to_alertmanager_9093" { 60 | security_group_id = aws_security_group.alertmanager_task.id 61 | type = "ingress" 62 | from_port = 9093 63 | to_port = 9093 64 | protocol = "tcp" 65 | cidr_blocks = data.aws_subnet.public_subnets.*.cidr_block 66 | } 67 | 68 | resource "aws_security_group_rule" "ingress_from_alertmanager_alb_to_alertmanager_9093" { 69 | security_group_id = aws_security_group.alertmanager_task.id 70 | source_security_group_id = aws_security_group.alertmanager_alb.id 71 | type = "ingress" 72 | from_port = 9093 73 | to_port = 9093 74 | protocol = "tcp" 75 | } 76 | 77 | resource "aws_security_group_rule" "egress_from_alertmanager_alb_to_alertmanager_9093" { 78 | security_group_id = aws_security_group.alertmanager_alb.id 79 | # source_security_group_id means destination for egress rules 80 | source_security_group_id = aws_security_group.alertmanager_task.id 81 | type = "egress" 82 | from_port = 9093 83 | to_port = 9093 84 | protocol = "tcp" 85 | } 86 | 87 | # TODO: could we make observe prometheus more consistent with external 88 | # prometheis and go via public NLB IPs? 89 | resource "aws_security_group_rule" "ingress_from_prometheus_ec2_to_alertmanager_task" { 90 | security_group_id = aws_security_group.alertmanager_task.id 91 | type = "ingress" 92 | from_port = 9093 93 | to_port = 9093 94 | protocol = "tcp" 95 | source_security_group_id = data.terraform_remote_state.infra_security_groups.outputs.prometheus_ec2_sg_id 96 | } 97 | 98 | 99 | resource "aws_security_group_rule" "ingress_alertmanager_task_meshing" { 100 | security_group_id = aws_security_group.alertmanager_task.id 101 | type = "ingress" 102 | from_port = 9094 103 | to_port = 9094 104 | protocol = "tcp" 105 | source_security_group_id = aws_security_group.alertmanager_task.id 106 | } 107 | 108 | # This rule allows all egress out of alertmanager_task. This is for the following purposes: 109 | # - raising alerts with receivers such as pagerduty and cronitor 110 | # - sending emails via AWS API 111 | # - communicate with other alertmanagers to mesh 112 | resource "aws_security_group_rule" "egress_from_alertmanager_task_to_all" { 113 | security_group_id = aws_security_group.alertmanager_task.id 114 | type = "egress" 115 | from_port = 0 116 | to_port = 0 117 | protocol = "-1" 118 | cidr_blocks = ["0.0.0.0/0"] 119 | } 120 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/alb.tf: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | # ----- alertmanager public ALB ------- 3 | ###################################################################### 4 | # 5 | # 6 | # The ALB serves one main purpose: so we can use ACM certs instead of 7 | # managing our own. We don't actually want it to load-balance; each 8 | # public domain name associated with alertmanager should route to 9 | # exactly one internal alertmanager instance. 
We achieve this by 10 | # using listener rules, so that requests with a particular host: 11 | # header must go to a particular AZ, and running one alertmanager per 12 | # AZ. 13 | 14 | 15 | resource "aws_lb" "alertmanager_alb" { 16 | name = "${var.environment}-alertmanager-alb" 17 | internal = false 18 | load_balancer_type = "application" 19 | 20 | security_groups = [aws_security_group.alertmanager_alb.id] 21 | 22 | subnets = data.terraform_remote_state.infra_networking.outputs.public_subnets 23 | 24 | tags = merge( 25 | local.default_tags, 26 | { 27 | Name = "${var.environment}-alertmanager-alb" 28 | }, 29 | ) 30 | } 31 | 32 | resource "aws_lb_listener" "alertmanager_listener_alb_http" { 33 | load_balancer_arn = aws_lb.alertmanager_alb.arn 34 | port = "80" 35 | protocol = "HTTP" 36 | 37 | default_action { 38 | type = "redirect" 39 | 40 | redirect { 41 | port = "443" 42 | protocol = "HTTPS" 43 | status_code = "HTTP_301" 44 | } 45 | } 46 | } 47 | 48 | resource "aws_lb_listener" "alertmanager_listener_alb_https" { 49 | load_balancer_arn = aws_lb.alertmanager_alb.arn 50 | port = "443" 51 | protocol = "HTTPS" 52 | ssl_policy = "ELBSecurityPolicy-TLS-1-2-2017-01" 53 | certificate_arn = aws_acm_certificate_validation.alertmanager_cert.certificate_arn 54 | 55 | default_action { 56 | type = "forward" 57 | target_group_arn = aws_lb_target_group.alertmanager_all.arn 58 | } 59 | } 60 | 61 | resource "aws_lb_listener_rule" "alertmanager_listener_rule_per_az" { 62 | for_each = toset(local.availability_zones) 63 | 64 | listener_arn = aws_lb_listener.alertmanager_listener_alb_https.arn 65 | 66 | action { 67 | type = "forward" 68 | target_group_arn = aws_lb_target_group.alertmanager_per_az[each.key].arn 69 | } 70 | 71 | condition { 72 | host_header { 73 | values = ["alerts-${each.key}.*"] 74 | } 75 | } 76 | } 77 | 78 | resource "aws_lb_target_group" "alertmanager_per_az" { 79 | for_each = toset(local.availability_zones) 80 | name = "${var.environment}-alerts-${each.key}" 81 | port = 9093 82 | protocol = "HTTP" 83 | vpc_id = local.vpc_id 84 | deregistration_delay = 30 85 | target_type = "ip" 86 | 87 | health_check { 88 | interval = 10 89 | path = "/" 90 | matcher = "200" 91 | protocol = "HTTP" 92 | healthy_threshold = 2 93 | unhealthy_threshold = 2 94 | timeout = "5" 95 | } 96 | 97 | tags = merge( 98 | local.default_tags, 99 | { 100 | Name = "${var.environment}-alertmanager-${each.key}" 101 | }, 102 | ) 103 | } 104 | 105 | resource "aws_lb_target_group" "alertmanager_all" { 106 | name = "${var.environment}-alerts-all" 107 | port = 9093 108 | protocol = "HTTP" 109 | vpc_id = local.vpc_id 110 | deregistration_delay = 30 111 | target_type = "ip" 112 | 113 | health_check { 114 | interval = 10 115 | path = "/" 116 | matcher = "200" 117 | protocol = "HTTP" 118 | healthy_threshold = 2 119 | unhealthy_threshold = 2 120 | timeout = "5" 121 | } 122 | 123 | tags = merge( 124 | local.default_tags, 125 | { 126 | Name = "${var.environment}-alertmanager-all" 127 | }, 128 | ) 129 | } 130 | 131 | resource "aws_route53_record" "alerts_alias" { 132 | zone_id = local.zone_id 133 | name = "alerts" 134 | type = "A" 135 | 136 | alias { 137 | name = aws_lb.alertmanager_alb.dns_name 138 | zone_id = aws_lb.alertmanager_alb.zone_id 139 | evaluate_target_health = false 140 | } 141 | } 142 | 143 | resource "aws_route53_record" "alerts_az_alias" { 144 | for_each = toset(local.availability_zones) 145 | 146 | zone_id = local.zone_id 147 | name = "alerts-${each.key}" 148 | type = "A" 149 | 150 | alias { 151 | name = 
aws_lb.alertmanager_alb.dns_name 152 | zone_id = aws_lb.alertmanager_alb.zone_id 153 | evaluate_target_health = false 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /tools/grafana_info/find_missing_metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import json 3 | import os 4 | import re 5 | import requests 6 | import sys 7 | import yaml 8 | 9 | from bearer_auth import BearerAuth 10 | from grafana_api.grafana_api import GrafanaAPI 11 | 12 | IGNORE_WORDS = [ 13 | "and", 14 | "avg", 15 | "avg_over_time", 16 | "by", 17 | "count", 18 | "deriv", 19 | "exported_instance", 20 | "ignoring", 21 | "increase", 22 | "irate", 23 | "job", 24 | "le", 25 | "max", 26 | "on", 27 | "or", 28 | "rate", 29 | "sort", 30 | "sum", 31 | "time", 32 | "topk", 33 | "without", 34 | ] 35 | 36 | 37 | def exprs_for_dashboard(dashboard): 38 | d = g.get('/dashboards/uid/%s' % dashboard['uid']) 39 | if 'panels' in d['dashboard']: 40 | panels = d['dashboard']['panels'] 41 | for panel in panels: 42 | targets = panel.get('targets', []) 43 | for target in targets: 44 | if 'expr' in target: 45 | yield { 46 | "expr": target['expr'], 47 | "dashboard_title": dashboard['title'], 48 | "panel_title": panel['title'] 49 | } 50 | 51 | 52 | def exprs_for_alerts(): 53 | exprs = [] 54 | for yml_file in [f for f in os.listdir(os.environ.get("ALERTS_DIR")) if re.match(r'.*\.yml', f)]: 55 | with open("{}/{}".format(os.environ.get("ALERTS_DIR"), yml_file), 'r') as stream: 56 | try: 57 | alerts = yaml.load(stream) 58 | 59 | except yaml.YAMLError as exc: 60 | print(exc) 61 | 62 | for rule in alerts["groups"][0]['rules']: 63 | exprs.append({'expr': rule['expr']}) 64 | 65 | return exprs 66 | 67 | 68 | # remove unwanted parts of the expression 69 | def rationalise_expr(expr, pattern, replace="%s"): 70 | matched = re.findall(pattern, expr) 71 | 72 | if matched: 73 | for m in matched: 74 | expr = expr.replace(replace % m, "") 75 | 76 | return expr 77 | 78 | 79 | def extract_words_from_expressions(exprs): 80 | index = 0 81 | words = [] 82 | 83 | print('**** Expressions:') 84 | for e in exprs: 85 | expr = e['expr'] 86 | 87 | if len(expr) > 0: 88 | print(index, expr) 89 | 90 | expr = rationalise_expr(expr, r'\{([^}]+)', "{%s}") # filters 91 | expr = rationalise_expr(expr, r'\[([^]]+)', "[%s]") # time ranges 92 | expr = rationalise_expr(expr, r'\$[_\w]+') # grafana vars 93 | expr = rationalise_expr(expr, r'\([a-z]+\)') # labels 94 | 95 | matched_words = re.findall(r'[^\d\W]+', expr) 96 | words.extend(matched_words) 97 | 98 | index += 1 99 | 100 | return words 101 | 102 | 103 | def check_metric_exists_for_word(words): 104 | index = 0 105 | missing_metric = [] 106 | print('**** Metrics evaluation:') 107 | for w in set(words).difference(IGNORE_WORDS): 108 | r_old = requests.get("{}/api/v1/query?query={}".format(os.environ.get("OLD_PROM_SERVER"), w)) 109 | resp_old = json.loads(r_old.content) 110 | 111 | if resp_old['status'] == 'success': 112 | print('{}: {}, {}'.format(index, len(resp_old['data']['result']) > 0, w)) 113 | 114 | # if old prometheus server doesn't have the metric then check if new prometheus server has the metric 115 | if not len(resp_old['data']['result']): 116 | r_new = requests.get("{}/api/v1/query?query={}".format(os.environ.get("NEW_PROM_SERVER"), w)) 117 | resp_new = json.loads(r_new.content) 118 | # only report it as missing if metrics are found on the new prometheus server 119 | if 
len(resp_new['data']['result']) > 0: 120 | missing_metric.append(w) 121 | else: 122 | print("{}: *** {} - {}".format(index, w, resp_old)) 123 | 124 | index += 1 125 | 126 | return missing_metric 127 | 128 | 129 | if __name__ == "__main__": 130 | try: 131 | token = os.environ['GRAFANA_TOKEN'] 132 | g = GrafanaAPI(BearerAuth(token), 'grafana-paas.cloudapps.digital', protocol='https') 133 | dashboards = g.get('/search?type=dash-db') 134 | exprs = [expr for dashboard in dashboards for expr in exprs_for_dashboard(dashboard)] 135 | exprs.sort(key=lambda e: e['dashboard_title'] + e['panel_title']) 136 | 137 | exprs.extend(exprs_for_alerts()) 138 | 139 | words = extract_words_from_expressions(exprs) 140 | 141 | missing_metric = check_metric_exists_for_word(words) 142 | 143 | print('**** Missing metrics:' if missing_metric else '**** No missing metrics') 144 | for m in missing_metric: 145 | print(m) 146 | 147 | except KeyError as e: 148 | print('Please set the %s environment variable' % e.args[0], file=sys.stderr) 149 | exit(1) 150 | -------------------------------------------------------------------------------- /terraform/modules/infra-security-groups/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## module: infra-security-groups 3 | * 4 | * Central module to manage all security groups. 5 | * 6 | * This is done in a single module to reduce conflicts 7 | * and cascade issues. 8 | * 9 | */ 10 | 11 | variable "aws_region" { 12 | type = string 13 | description = "The AWS region to use." 14 | } 15 | 16 | variable "remote_state_bucket" { 17 | type = string 18 | description = "S3 bucket we store our terraform state in" 19 | } 20 | 21 | variable "environment" { 22 | type = string 23 | description = "Unique name for this collection of resources" 24 | } 25 | 26 | # locals 27 | # -------------------------------------------------------------- 28 | 29 | locals { 30 | default_tags = { 31 | Terraform = "true" 32 | Project = "infra-security-groups" 33 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 34 | Environment = var.environment 35 | } 36 | } 37 | 38 | # Resources 39 | # -------------------------------------------------------------- 40 | 41 | ## Data sources 42 | 43 | data "terraform_remote_state" "infra_networking" { 44 | backend = "s3" 45 | 46 | config = { 47 | bucket = var.remote_state_bucket 48 | key = "infra-networking-modular.tfstate" 49 | region = var.aws_region 50 | } 51 | } 52 | 53 | resource "aws_security_group" "prometheus_alb" { 54 | name = "${var.environment}-prometheus-alb" 55 | vpc_id = data.terraform_remote_state.infra_networking.outputs.vpc_id 56 | description = "Controls ingress and egress for prometheus ALB" 57 | 58 | tags = merge( 59 | local.default_tags, 60 | { 61 | Name = "prometheus-alb", 62 | Service = "observe-prometheus", 63 | }, 64 | ) 65 | } 66 | 67 | # We allow all IPs to access the ALB as Prometheus is fronted by an nginx which controls access to either approved IP 68 | # addresses, or users with basic auth creds 69 | resource "aws_security_group_rule" "ingress_from_public_http_to_prometheus_alb" { 70 | security_group_id = aws_security_group.prometheus_alb.id 71 | type = "ingress" 72 | from_port = 80 73 | to_port = 80 74 | protocol = "tcp" 75 | cidr_blocks = ["0.0.0.0/0"] 76 | } 77 | 78 | resource "aws_security_group_rule" "ingress_from_public_https_to_prometheus_alb" { 79 | security_group_id = aws_security_group.prometheus_alb.id 80 | type = "ingress" 81 | from_port = 443 82 | to_port = 443 83 | 
protocol = "tcp" 84 | cidr_blocks = ["0.0.0.0/0"] 85 | } 86 | 87 | resource "aws_security_group_rule" "egress_from_prometheus_alb_to_prometheus_ec2" { 88 | security_group_id = aws_security_group.prometheus_alb.id 89 | type = "egress" 90 | to_port = 80 91 | from_port = 80 92 | protocol = "tcp" 93 | source_security_group_id = aws_security_group.prometheus_ec2.id 94 | } 95 | 96 | resource "aws_security_group" "prometheus_ec2" { 97 | name = "${var.environment}-prometheus-ec2" 98 | vpc_id = data.terraform_remote_state.infra_networking.outputs.vpc_id 99 | description = "Controls ingress and egress for prometheus EC2 instances" 100 | 101 | tags = merge( 102 | local.default_tags, 103 | { 104 | Name = "prometheus-ec2", 105 | Service = "observe-prometheus", 106 | }, 107 | ) 108 | } 109 | 110 | resource "aws_security_group_rule" "ingress_from_prometheus_alb_to_prometheus_ec2" { 111 | security_group_id = aws_security_group.prometheus_ec2.id 112 | type = "ingress" 113 | to_port = 80 114 | from_port = 80 115 | protocol = "tcp" 116 | source_security_group_id = aws_security_group.prometheus_alb.id 117 | } 118 | 119 | resource "aws_security_group_rule" "ingress_from_prometheus_ec2_to_prometheus_ec2" { 120 | security_group_id = aws_security_group.prometheus_ec2.id 121 | type = "ingress" 122 | to_port = 9090 123 | from_port = 9090 124 | protocol = "tcp" 125 | source_security_group_id = aws_security_group.prometheus_ec2.id 126 | } 127 | 128 | resource "aws_security_group_rule" "ingress_from_prometheus_to_prometheus_node_exporter" { 129 | security_group_id = aws_security_group.prometheus_ec2.id 130 | type = "ingress" 131 | to_port = 9100 132 | from_port = 9100 133 | protocol = "tcp" 134 | source_security_group_id = aws_security_group.prometheus_ec2.id 135 | } 136 | 137 | # This rule allows all egress out of prometheus_ec2. 
This is for the following purposes: 138 | # - downloading packages from package repos 139 | # - calling AWS APIs such as SSM, S3 and EC2 140 | # - scraping alertmanager on port 9093 141 | # - sending alerts to alertmanager on port 9093 142 | # - scraping external targets that run on the PaaS 143 | # - scraping itself and other promethis on port 9090 144 | # - scraping node exporters on port 9100 145 | resource "aws_security_group_rule" "egress_from_prometheus_ec2_to_all" { 146 | security_group_id = aws_security_group.prometheus_ec2.id 147 | type = "egress" 148 | to_port = 0 149 | from_port = 0 150 | protocol = "-1" 151 | cidr_blocks = ["0.0.0.0/0"] 152 | } 153 | 154 | ## Outputs 155 | 156 | output "prometheus_ec2_sg_id" { 157 | value = aws_security_group.prometheus_ec2.id 158 | description = "security group prometheus_ec2 ID" 159 | } 160 | 161 | output "prometheus_alb_sg_id" { 162 | value = aws_security_group.prometheus_alb.id 163 | description = "security group prometheus_alb ID" 164 | } 165 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/templates/alertmanager.tpl: -------------------------------------------------------------------------------- 1 | global: 2 | resolve_timeout: 5m 3 | 4 | smtp_from: "${smtp_from}" 5 | smtp_smarthost: "${smtp_smarthost}" 6 | smtp_auth_username: "${smtp_username}" 7 | smtp_auth_password: "${smtp_password}" 8 | slack_api_url: "${slack_api_url}" 9 | 10 | templates: 11 | - '/etc/alertmanager/default.tmpl' 12 | 13 | route: 14 | receiver: "re-observe-pagerduty" 15 | group_by: 16 | - alertname 17 | - product 18 | - deployment 19 | routes: 20 | - receiver: "autom8-tickets" 21 | repeat_interval: 7d 22 | match: 23 | product: "prometheus" 24 | severity: "ticket" 25 | - receiver: "notify-tickets" 26 | repeat_interval: 7d 27 | match: 28 | product: "notify" 29 | severity: "ticket" 30 | - receiver: "notify-p2" 31 | repeat_interval: 7d 32 | match: 33 | product: "notify" 34 | severity: "p2" 35 | - receiver: "dgu-pagerduty" 36 | match: 37 | product: "data-gov-uk" 38 | - receiver: "govuk-pagerduty" 39 | match: 40 | product: "govuk-accounts" 41 | - receiver: "re-observe-pagerduty" 42 | match: 43 | product: "prometheus" 44 | severity: "page" 45 | - receiver: "observe-cronitor" 46 | group_interval: 1m 47 | repeat_interval: 1m 48 | match: 49 | product: "prometheus" 50 | severity: "constant" 51 | - receiver: "dev-null" 52 | match: 53 | product: "doc-checking" 54 | routes: 55 | - match_re: 56 | space: production|integration 57 | receiver: dcs-slack 58 | routes: 59 | - match: 60 | space: production 61 | severity: p2 62 | receiver: "dcs-p2" 63 | # Verify hub ECS 64 | - receiver: "verify-2ndline-slack" 65 | match: 66 | product: "verify" 67 | routes: 68 | - receiver: "verify-p1" 69 | match: 70 | deployment: prod 71 | severity: p1 72 | - receiver: "verify-p2" 73 | match: 74 | deployment: integration 75 | severity: p1 76 | - receiver: "verify-p3" 77 | match: 78 | severity: ticket 79 | - match: 80 | severity: constant 81 | group_interval: 1m 82 | repeat_interval: 1m 83 | routes: 84 | - match: 85 | deployment: prod 86 | receiver: "verify-prod-cronitor" 87 | - match: 88 | deployment: integration 89 | receiver: "verify-integration-cronitor" 90 | - match: 91 | deployment: staging 92 | receiver: "verify-staging-cronitor" 93 | 94 | receivers: 95 | - name: "re-observe-pagerduty" 96 | pagerduty_configs: 97 | - service_key: "${observe_pagerduty_key}" 98 | - name: "dgu-pagerduty" 99 | pagerduty_configs: 100 | - service_key: 
"${dgu_pagerduty_key}" 101 | - name: "govuk-pagerduty" 102 | pagerduty_configs: 103 | - service_key: "${govuk_pagerduty_key}" 104 | - name: "notify-tickets" 105 | email_configs: 106 | - to: "${notify_zendesk}" 107 | - name: "notify-p2" 108 | pagerduty_configs: 109 | - service_key: "${notify_p2_pagerduty_key}" 110 | - name: "observe-cronitor" 111 | webhook_configs: 112 | - send_resolved: false 113 | url: "${observe_cronitor}" 114 | - name: "verify-prod-cronitor" 115 | webhook_configs: 116 | - send_resolved: false 117 | url: "${verify_prod_cronitor}" 118 | - name: "verify-integration-cronitor" 119 | webhook_configs: 120 | - send_resolved: false 121 | url: "${verify_integration_cronitor}" 122 | - name: "verify-staging-cronitor" 123 | webhook_configs: 124 | - send_resolved: false 125 | url: "${verify_staging_cronitor}" 126 | - name: "verify-2ndline-slack" 127 | slack_configs: &verify-2ndline-slack-configs 128 | - send_resolved: true 129 | channel: '#verify-2ndline' 130 | icon_emoji: ':verify-shield:' 131 | username: alertmanager 132 | - name: "autom8-tickets" 133 | email_configs: 134 | - to: "${autom8_recipient_email}" 135 | slack_configs: 136 | - send_resolved: true 137 | channel: '#re-autom8-alerts' 138 | icon_emoji: ':verify-shield:' 139 | username: alertmanager 140 | color: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}warning{{ else }}danger{{ end }}{{ else }}good{{ end }}' 141 | pretext: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}:warning:{{ else }}:rotating_light:{{ end }}{{ else }}:green_tick:{{ end }} {{ .CommonLabels.alertname }}:{{ .CommonAnnotations.summary }}' 142 | text: |- 143 | *Description:* {{ .CommonAnnotations.message }} 144 | {{ range .Alerts }} 145 | *Details:* 146 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 147 | {{ end }} 148 | {{ end }} 149 | short_fields: true 150 | fields: 151 | - title: Product 152 | value: '{{ .CommonLabels.product }}' 153 | - title: Deployment 154 | value: '{{ .CommonLabels.deployment }}' 155 | actions: 156 | - type: button 157 | text: Runbook 158 | url: '{{ .CommonAnnotations.runbook_url }}' 159 | - name: "dcs-slack" 160 | slack_configs: 161 | - send_resolved: true 162 | channel: '#di-dcs-2ndline' 163 | icon_emoji: ':gsp:' 164 | username: alertmanager 165 | color: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}warning{{ else }}danger{{ end }}{{ else }}good{{ end }}' 166 | pretext: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "warning" }}:warning:{{ else }}:rotating_light:{{ end }}{{ else }}:green_tick:{{ end }} {{ .CommonLabels.alertname }}:{{ .CommonAnnotations.summary }}' 167 | text: |- 168 | *Description:* {{ .CommonAnnotations.message }} 169 | {{ range .Alerts }} 170 | *Details:* 171 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 172 | {{ end }} 173 | {{ end }} 174 | short_fields: true 175 | fields: 176 | - title: Product 177 | value: '{{ .CommonLabels.product }}' 178 | - title: Namespace 179 | value: '{{ .CommonLabels.namespace }}' 180 | - title: | 181 | {{- if .CommonLabels.job_name -}} 182 | Job 183 | {{- else if .CommonLabels.deployment -}} 184 | Deployment 185 | {{- else if match "^KubePod" .CommonLabels.alertname -}} 186 | Pod 187 | {{- end -}} 188 | value: | 189 | {{- if .CommonLabels.job_name -}} 190 | {{ .CommonLabels.job_name }} 191 | {{- else if .CommonLabels.deployment -}} 192 | {{ .CommonLabels.deployment }} 193 | {{- else if match "^KubePod" .CommonLabels.alertname -}} 194 | {{ 
.CommonLabels.pod }} 195 | {{- end -}} 196 | actions: 197 | - type: button 198 | text: Runbook 199 | url: '{{ .CommonAnnotations.runbook_url }}' 200 | - name: "dcs-p2" 201 | pagerduty_configs: 202 | - service_key: "${dcs_p2_pagerduty_key}" 203 | - name: "verify-p1" 204 | pagerduty_configs: 205 | - service_key: "${verify_p1_pagerduty_key}" 206 | slack_configs: *verify-2ndline-slack-configs 207 | - name: "verify-p2" 208 | pagerduty_configs: 209 | - service_key: "${verify_p2_pagerduty_key}" 210 | slack_configs: *verify-2ndline-slack-configs 211 | - name: "verify-p3" 212 | slack_configs: *verify-2ndline-slack-configs 213 | - name: "dev-null" 214 | -------------------------------------------------------------------------------- /terraform/modules/app-ecs-albs/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ## Module: app-ecs-albs 3 | * 4 | * Load balancer for Prometheus 5 | * 6 | */ 7 | 8 | variable "aws_region" { 9 | type = string 10 | description = "AWS region" 11 | } 12 | 13 | variable "remote_state_bucket" { 14 | type = string 15 | description = "S3 bucket we store our terraform state in" 16 | } 17 | 18 | variable "environment" { 19 | type = string 20 | description = "Unique name for this collection of resources" 21 | } 22 | 23 | variable "zone_id" { 24 | type = string 25 | description = "Route 53 zone ID for registering public DNS records" 26 | } 27 | 28 | variable "subnets" { 29 | type = list(string) 30 | description = "Subnets to attach load balancers to" 31 | } 32 | 33 | variable "prometheus_count" { 34 | type = string 35 | description = "Number of prometheus instances to create listener rules and target groups for" 36 | default = "3" 37 | } 38 | 39 | # locals 40 | # -------------------------------------------------------------- 41 | 42 | locals { 43 | default_tags = { 44 | Terraform = "true" 45 | Project = "app-ecs-albs" 46 | Source = "github.com/alphagov/prometheus-aws-configuration-beta" 47 | Environment = var.environment 48 | } 49 | 50 | prom_records_count = var.prometheus_count 51 | 52 | # data.aws_route_53.XXX.name has a trailing dot which we remove with replace() to make ACM happy 53 | subdomain = replace(data.aws_route53_zone.public_zone.name, "/\\.$/", "") 54 | vpc_id = data.aws_subnet.first_subnet.vpc_id 55 | } 56 | 57 | ## Data sources 58 | 59 | data "terraform_remote_state" "infra_networking" { 60 | backend = "s3" 61 | 62 | config = { 63 | bucket = var.remote_state_bucket 64 | key = "infra-networking-modular.tfstate" 65 | region = var.aws_region 66 | } 67 | } 68 | 69 | data "terraform_remote_state" "infra_security_groups" { 70 | backend = "s3" 71 | 72 | config = { 73 | bucket = var.remote_state_bucket 74 | key = "infra-security-groups-modular.tfstate" 75 | region = var.aws_region 76 | } 77 | } 78 | 79 | data "aws_route53_zone" "public_zone" { 80 | zone_id = var.zone_id 81 | } 82 | 83 | data "aws_subnet" "first_subnet" { 84 | id = var.subnets[0] 85 | } 86 | 87 | ###################################################################### 88 | # ----- prometheus public ALB ------- 89 | ###################################################################### 90 | 91 | # AWS should manage the certificate renewal automatically 92 | # https://docs.aws.amazon.com/acm/latest/userguide/managed-renewal.html 93 | # If this fails, AWS will email associated with the AWS account 94 | resource "aws_acm_certificate" "prometheus_cert" { 95 | domain_name = "prom.${local.subdomain}" 96 | validation_method = "DNS" 97 | 98 | 
subject_alternative_names = aws_route53_record.prom_alias.*.fqdn 99 | 100 | lifecycle { 101 | # We can't destroy a certificate that's in use, and we can't stop 102 | # using it until the new one is ready. Hence 103 | # create_before_destroy here. 104 | create_before_destroy = true 105 | } 106 | } 107 | 108 | resource "aws_route53_record" "prometheus_cert_validation" { 109 | for_each = { 110 | for dvo in aws_acm_certificate.prometheus_cert.domain_validation_options : dvo.domain_name => { 111 | name = dvo.resource_record_name 112 | record = dvo.resource_record_value 113 | type = dvo.resource_record_type 114 | } 115 | } 116 | 117 | name = each.value.name 118 | records = [each.value.record] 119 | type = each.value.type 120 | zone_id = var.zone_id 121 | ttl = 60 122 | 123 | allow_overwrite = true 124 | 125 | depends_on = [aws_acm_certificate.prometheus_cert] 126 | } 127 | 128 | resource "aws_acm_certificate_validation" "prometheus_cert" { 129 | certificate_arn = aws_acm_certificate.prometheus_cert.arn 130 | validation_record_fqdns = [for record in aws_route53_record.prometheus_cert_validation : record.fqdn] 131 | } 132 | 133 | resource "aws_route53_record" "prom_alias" { 134 | count = local.prom_records_count 135 | 136 | zone_id = var.zone_id 137 | name = "prom-${count.index + 1}" 138 | type = "A" 139 | 140 | alias { 141 | name = aws_lb.prometheus_alb.dns_name 142 | zone_id = aws_lb.prometheus_alb.zone_id 143 | evaluate_target_health = false 144 | } 145 | } 146 | 147 | resource "aws_lb" "prometheus_alb" { 148 | name = "${var.environment}-prometheus-alb" 149 | internal = false 150 | load_balancer_type = "application" 151 | 152 | security_groups = [data.terraform_remote_state.infra_security_groups.outputs.prometheus_alb_sg_id] 153 | 154 | subnets = var.subnets 155 | 156 | tags = merge( 157 | local.default_tags, 158 | { 159 | Name = "${var.environment}-prometheus-alb" 160 | Service = "observe-prometheus" 161 | }, 162 | ) 163 | } 164 | 165 | resource "aws_lb_listener" "prometheus_listener_http" { 166 | load_balancer_arn = aws_lb.prometheus_alb.arn 167 | port = "80" 168 | protocol = "HTTP" 169 | 170 | default_action { 171 | type = "redirect" 172 | 173 | redirect { 174 | port = "443" 175 | protocol = "HTTPS" 176 | status_code = "HTTP_301" 177 | } 178 | } 179 | } 180 | 181 | resource "aws_lb_listener" "prometheus_listener_https" { 182 | load_balancer_arn = aws_lb.prometheus_alb.arn 183 | port = "443" 184 | protocol = "HTTPS" 185 | ssl_policy = "ELBSecurityPolicy-TLS-1-2-2017-01" 186 | certificate_arn = aws_acm_certificate_validation.prometheus_cert.certificate_arn 187 | 188 | default_action { 189 | type = "fixed-response" 190 | 191 | fixed_response { 192 | content_type = "text/plain" 193 | message_body = "Not found" 194 | status_code = "404" 195 | } 196 | } 197 | } 198 | 199 | resource "aws_lb_listener_rule" "prom_listener_https" { 200 | count = var.prometheus_count 201 | 202 | listener_arn = aws_lb_listener.prometheus_listener_https.arn 203 | priority = 100 + count.index 204 | 205 | action { 206 | type = "forward" 207 | target_group_arn = element(aws_lb_target_group.prometheus_tg.*.arn, count.index) 208 | } 209 | 210 | condition { 211 | host_header { 212 | values = ["prom-${count.index + 1}.*"] 213 | } 214 | } 215 | } 216 | 217 | resource "aws_lb_target_group" "prometheus_tg" { 218 | count = var.prometheus_count 219 | 220 | name = "${var.environment}-prom-${count.index + 1}-tg" 221 | port = 80 222 | protocol = "HTTP" 223 | vpc_id = local.vpc_id 224 | deregistration_delay = 30 225 | 226 | health_check 
{ 227 | interval = "10" 228 | path = "/health" # static health check on nginx auth proxy 229 | matcher = "200" 230 | protocol = "HTTP" 231 | healthy_threshold = 2 232 | unhealthy_threshold = 2 233 | timeout = "5" 234 | } 235 | } 236 | 237 | ## Outputs 238 | 239 | output "prom_public_record_fqdns" { 240 | value = aws_route53_record.prom_alias.*.fqdn 241 | description = "Prometheus public DNS FQDNs" 242 | } 243 | 244 | output "prometheus_target_group_ids" { 245 | value = aws_lb_target_group.prometheus_tg.*.arn 246 | description = "Prometheus target group IDs" 247 | } 248 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/alerts-config/alerts/observe-alerts.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: RE_Observe 3 | rules: 4 | - alert: RE_Observe_Grafana_Down 5 | expr: up{job="grafana-paas"} == 0 6 | for: 5m 7 | labels: 8 | product: "prometheus" 9 | severity: "page" 10 | annotations: 11 | summary: "Prometheus is not able to scrape Grafana" 12 | message: "Prometheus has not successfully scraped {{ $labels.job }} in the last 5 minutes. https://grafana-paas.cloudapps.digital/ may be down." 13 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=()&_a=(columns:!(_source),index:'*-*',interval:h,query:(query_string:(query:'grafana-paas.cloudapps.digital%20AND%20NOT%20access.response_code:200')),sort:!('@timestamp',desc))" 14 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-grafana-down" 15 | 16 | - alert: RE_Observe_AlertManager_Below_Threshold 17 | expr: sum(up{job="alertmanager"}) <= 1 18 | for: 10s 19 | labels: 20 | product: "prometheus" 21 | severity: "page" 22 | annotations: 23 | summary: "There is one or fewer Alertmanagers that can be scraped" 24 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-alertmanager-below-threshold" 25 | 26 | - alert: RE_Observe_Prometheus_Below_Threshold 27 | expr: sum(up{job="prometheus"}) <= 1 28 | for: 10s 29 | labels: 30 | product: "prometheus" 31 | severity: "page" 32 | annotations: 33 | summary: "There is one or fewer Prometheis that can be scraped" 34 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 35 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-below-threshold" 36 | 37 | - alert: RE_Observe_Prometheus_AtLeastOneMissing 38 | expr: sum(up{job="prometheus"}) < 3 39 | for: 3m 40 | labels: 41 | product: "prometheus" 42 | severity: "ticket" 43 | annotations: 44 | summary: "At least one Prometheus can't be scraped" 45 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 46 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-at-least-one-missing" 47 | 48 | - alert: RE_Observe_PrometheusDiskPredictedToFill 49 | expr: | 50 | predict_linear( 51 | 
node_filesystem_avail{job="prometheus_node", mountpoint="/mnt"}[12h], 3 * 24 * 60 * 60 52 | ) <= 0 53 | and on(instance) 54 | (time() - node_creation_time > 12 * 60 * 60) 55 | labels: 56 | product: "prometheus" 57 | severity: "ticket" 58 | annotations: 59 | summary: "Instance {{ $labels.instance }} disk {{ $labels.mountpoint }} is predicted to fill in 72h" 60 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 61 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-disk-predicted-to-fill" 62 | 63 | - alert: RE_Observe_No_Paas_Targets 64 | expr: prometheus_sd_discovered_targets{config=~"paas-(london|ireland)-targets"} == 0 65 | for: 10m 66 | labels: 67 | product: "prometheus" 68 | severity: "page" 69 | annotations: 70 | summary: "No PaaS targets detected" 71 | message: "No PaaS file_sd targets were detected from the service broker. Is there a problem accessing the targets bucket?" 72 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 73 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-no-filesd-targets" 74 | 75 | - alert: RE_Observe_Prometheus_Over_Capacity 76 | expr: sum without(slice)(rate(prometheus_engine_query_duration_seconds_sum{job="prometheus"}[5m])) > 8 77 | for: 10s 78 | labels: 79 | product: "prometheus" 80 | severity: "page" 81 | annotations: 82 | summary: "Service is over capacity." 83 | message: "The service name is {{ $labels.job }}. The URL experiencing the issue is {{ $labels.instance }}." 84 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 85 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-over-capacity" 86 | 87 | - alert: RE_Observe_Prometheus_High_Load 88 | expr: sum without(slice)(rate(prometheus_engine_query_duration_seconds_sum{job="prometheus"}[2h])) > 4 89 | labels: 90 | product: "prometheus" 91 | severity: "ticket" 92 | annotations: 93 | summary: "Service is approaching capacity." 94 | message: "The service name is {{ $labels.job }}. The URL experiencing the issue is {{ $labels.instance }}." 
95 | logs: "https://kibana.logit.io/s/8fd50110-7b0c-490a-bedf-7544daebbec4/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-15m,mode:quick,to:now))&_a=(columns:!(_source),index:'*-*',interval:auto,query:(query_string:(query:'tags:%20prometheus')),sort:!('@timestamp',desc))" 96 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-prometheus-high-load" 97 | 98 | - alert: RE_Observe_Target_Down 99 | expr: up{} == 0 100 | for: 24h 101 | labels: 102 | product: "prometheus" 103 | severity: "ticket" 104 | annotations: 105 | summary: "{{ $labels.job }} target is down" 106 | message: "One of the {{ $labels.job }} targets has been down for 24 hours" 107 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-target-down" 108 | 109 | - alert: RE_Observe_No_Successful_Updates 110 | expr: sum(increase(observe_broker_http_requests_total{code="200", path="/update-targets", method="post"}[30m])) by (region) == 0 111 | for: 12h 112 | labels: 113 | product: "prometheus" 114 | severity: "ticket" 115 | annotations: 116 | summary: "No recent target updates in region '{{ $labels.region }}'" 117 | message: "Target update in region '{{ $labels.region }}' hasn't completed successfully in at least 12h" 118 | runbook: "https://re-team-manual.cloudapps.digital/prometheus-for-gds-paas-users.html#re-observe-no-successful-updates" 119 | 120 | - alert: AlwaysAlert 121 | annotations: 122 | message: | 123 | This is an alert meant to ensure that the entire alerting pipeline is functional. 124 | This alert is always firing, therefore it should always be firing in Alertmanager 125 | and always fire against a receiver. We use cronitor to alert us if this ever 126 | *doesn't* fire, because this indicates a problem with our alerting pipeline 127 | expr: vector(1) 128 | labels: 129 | product: "prometheus" 130 | severity: "constant" 131 | -------------------------------------------------------------------------------- /terraform/modules/prom-ec2/prometheus/cloud.conf: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | package_update: true 3 | package_upgrade: true 4 | packages: ['prometheus', 'prometheus-node-exporter', 'awscli', 'inotify-tools', 'nginx', 'jq'] 5 | 6 | write_files: 7 | - owner: root:root 8 | path: /etc/default/prometheus 9 | permissions: 0444 10 | content: 'ARGS="--storage.tsdb.path=\"/mnt/\" --web.external-url=${prom_external_url} --storage.tsdb.retention=60d --query.timeout=30s"' 11 | - owner: root:root 12 | path: /etc/cron.d/config_pull 13 | permissions: 0755 14 | content: | 15 | * * * * * root flock -w 30 /run/lock/prometheus-config-updates aws s3 sync s3://${config_bucket}/prometheus/ /etc/prometheus/ --region=${region} 16 | @reboot root /root/watch_prometheus_dir 17 | - owner: root:root 18 | path: /etc/cron.d/ireland_targets_pull 19 | permissions: 0755 20 | content: | 21 | # if targets bucket exists then sync it, otherwise this cron runs but has no effect 22 | * * * * * root [ "${ireland_targets_bucket}" != "" ] && aws s3 sync s3://${ireland_targets_bucket}/active/ /etc/prometheus/ireland-targets --region=${region} --delete 23 | - owner: root:root 24 | path: /etc/cron.d/london_targets_pull 25 | permissions: 0755 26 | content: | 27 | # if targets bucket exists then sync it, otherwise this cron runs but has no effect 28 | * * * * * root [ "${london_targets_bucket}" != "" ] && aws s3 sync s3://${london_targets_bucket}/active/ 
/etc/prometheus/london-targets --region=${region} --delete 29 | - owner: root:root 30 | path: /etc/cron.d/alerts_pull 31 | permissions: 0755 32 | content: | 33 | # if alerts bucket exists then sync it, otherwise this cron runs but has no effect 34 | * * * * * root [ "${alerts_bucket}" != "" ] && aws s3 sync s3://${alerts_bucket}/prometheus/alerts/ /etc/prometheus/alerts --region=${region} --delete 35 | - content: | 36 | echo 'Configuring prometheus EBS' 37 | vol="" 38 | while [ -z "$vol" ]; do 39 | # adapted from 40 | # https://medium.com/@moonape1226/mount-aws-ebs-on-ec2-automatically-with-cloud-init-e5e837e5438a 41 | # [Last accessed on 2020-04-02] 42 | vol=$(lsblk | grep -e disk | awk '{sub("G","",$4)} {if ($4+0 == ${data_volume_size}) print $1}') 43 | echo "still waiting for data volume ; sleeping 5" 44 | sleep 5 45 | done 46 | echo "found volume /dev/$vol" 47 | if [ -z "$(lsblk | grep "$vol" | awk '{print $7}')" ] ; then 48 | if [ -z "$(blkid /dev/$vol | grep ext4)" ] ; then 49 | echo "volume /dev/$vol is not formatted ; formatting" 50 | mkfs -F -t ext4 -L 'prometheus_disk' "/dev/$vol" 51 | else 52 | echo "volume /dev/$vol is already formatted" 53 | fi 54 | 55 | echo "volume /dev/$vol is not mounted ; mounting" 56 | mount "/dev/$vol" /mnt 57 | UUID=$(blkid /dev/$vol -s UUID -o value) 58 | if [ -z "$(grep $UUID /etc/fstab)" ] ; then 59 | echo "writing fstab entry" 60 | 61 | echo "UUID=$UUID /mnt ext4 defaults,nofail 0 2" >> /etc/fstab 62 | fi 63 | fi 64 | echo "ensuring fs block size matches volume block size" 65 | resize2fs "/dev/$vol" 66 | path: /root/manage_data_volume.sh 67 | permissions: 0755 68 | - content: | 69 | #!/bin/bash 70 | STATUS_JSON='/srv/prometheus-last-config.json' 71 | 72 | attempt_reload() { 73 | ( 74 | # take out lock to ensure updater doesn't switch the config between the time we 75 | # calculate NEW_HASH and prometheus reads it 76 | flock 321 77 | 78 | # why md5? 
because it should be the same as the s3 etag and so easy to check 79 | export NEW_HASH=$(md5sum /etc/prometheus/prometheus.yml | cut -d ' ' -f 1) 80 | if systemctl reload prometheus ; then 81 | jq -n '{last_successful_config: env.NEW_HASH, last_reload_successful: true}' > $STATUS_JSON 82 | else 83 | touch $STATUS_JSON 84 | jq '{last_successful_config: .last_successful_config, last_reload_successful: false, failed_config: env.NEW_HASH}' $STATUS_JSON > $STATUS_JSON.tmp && mv $STATUS_JSON.tmp $STATUS_JSON # write via a temp file so jq reads the old status before it is overwritten 85 | fi 86 | 87 | ) 321>/run/lock/prometheus-config-updates 88 | } 89 | 90 | systemctl start prometheus # ensure prometheus is started before initial attempt_reload 91 | attempt_reload 92 | 93 | inotifywait -e modify,create,delete,move -m /etc/prometheus | 94 | while read -r directory events; do 95 | attempt_reload 96 | done 97 | path: /root/watch_prometheus_dir 98 | permissions: 0755 99 | - content: | 100 | #!/bin/bash 101 | curl -L -O https://artifacts.elastic.co/downloads/beats/filebeat/filebeat-6.4.2-amd64.deb && sudo dpkg -i filebeat-6.4.2-amd64.deb 102 | aws s3 sync s3://${config_bucket}/filebeat/ /etc/filebeat/ --region=${region} 103 | update-rc.d filebeat defaults 104 | update-rc.d filebeat enable 5 105 | path: /root/setup_filebeat.sh 106 | permissions: 0755 107 | - content: | 108 | server { 109 | listen 8080; 110 | 111 | location / { 112 | set $cleaned_header $arg_cf_app_instance; 113 | if ($arg_cf_app_instance ~* "^(.*)%3A(.*)$") { 114 | set $cleaned_header $1:$2; 115 | } 116 | proxy_http_version 1.1; 117 | proxy_pass https://$host$uri; 118 | proxy_ssl_server_name on; 119 | proxy_set_header Connection ""; 120 | proxy_set_header X-CF-APP-INSTANCE $cleaned_header; 121 | proxy_set_header XX-CF-APP-INSTANCE $cleaned_header; 122 | proxy_set_header Authorization "Bearer $arg_cf_app_guid"; 123 | } 124 | 125 | location /health { 126 | return 200 "Static health check"; 127 | } 128 | 129 | resolver 10.0.0.2 valid=10s; 130 | } 131 | path: /etc/nginx/sites-enabled/paas-proxy 132 | permissions: 0644 133 | - content: | 134 | ${prometheus_htpasswd} 135 | path: /etc/nginx/conf.d/.htpasswd 136 | owner: www-data:www-data 137 | permissions: 0600 138 | # the package-provided default server conflicts with auth-proxy 139 | # below and causes package installation to fail because of a 140 | # duplicate default_server on port 80.
So we wipe the default 141 | # server (and then remove it in runcmd at the bottom) 142 | - content: "" 143 | path: /etc/nginx/sites-enabled/default 144 | - content: | 145 | server { 146 | listen 80 default_server; 147 | 148 | location /health { 149 | # This location is not protected by basic auth because of 150 | # https://stackoverflow.com/questions/40447376/auth-basic-within-location-block-doesnt-work-when-return-is-specified 151 | return 200 "Static health check"; 152 | } 153 | 154 | location = /last-config { 155 | default_type application/json; 156 | alias /srv/prometheus-last-config.json; 157 | } 158 | 159 | location / { 160 | proxy_pass http://localhost:9090; 161 | proxy_set_header X-Real-IP $remote_addr; 162 | proxy_set_header Host $host; 163 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 164 | } 165 | 166 | satisfy any; 167 | auth_basic "Prometheus"; 168 | auth_basic_user_file /etc/nginx/conf.d/.htpasswd; 169 | 170 | real_ip_header X-Forwarded-For; 171 | set_real_ip_from 10.0.0.0/8; 172 | set_real_ip_from 127.0.0.1/32; 173 | ${allowed_cidrs} 174 | deny all; 175 | } 176 | path: /etc/nginx/sites-enabled/auth-proxy 177 | 178 | runcmd: 179 | - rm /etc/nginx/sites-enabled/default 180 | - "if [ -n '${logstash_host}' ]; then /root/setup_filebeat.sh; fi" 181 | - [bash, -c, "/root/manage_data_volume.sh"] 182 | - [bash, -c, "chown -R prometheus /mnt/"] 183 | - [bash, -c, "echo \"node_creation_time `date +%s`\" > /var/lib/prometheus/node-exporter/node-creation-time.prom"] 184 | - [bash, -c, "rm /etc/resolv.conf && sed -e 's/ trust-ad//' < /run/systemd/resolve/stub-resolv.conf > /etc/resolv.conf"] 185 | - [reboot] 186 | -------------------------------------------------------------------------------- /terraform/projects/prom-ec2/paas-production/extra-prometheus-scrape-configs.yml.tpl: -------------------------------------------------------------------------------- 1 | - job_name: dcs-federate 2 | scheme: https 3 | honor_labels: true 4 | honor_timestamps: true 5 | metrics_path: '/federate' 6 | params: 7 | "match[]": 8 | # fetch everything (via https://stackoverflow.com/a/39253848 ) 9 | - '{__name__=~".+"}' 10 | static_configs: 11 | - targets: 12 | - dcs-build-internal-prometheus.london.cloudapps.digital 13 | labels: 14 | federated_from: dcs-build-internal-prometheus.london.cloudapps.digital 15 | - targets: 16 | - dcs-integration-internal-prometheus.london.cloudapps.digital 17 | labels: 18 | federated_from: dcs-integration-internal-prometheus.london.cloudapps.digital 19 | - targets: 20 | - dcs-production-internal-prometheus.london.cloudapps.digital 21 | labels: 22 | federated_from: dcs-production-internal-prometheus.london.cloudapps.digital 23 | 24 | 25 | - job_name: paas_elasticsearch_for_dm 26 | scheme: https 27 | basic_auth: 28 | username: digitalmarketplace 29 | password: ${dm_elasticsearch_metrics_password} 30 | metrics_path: '/federate' 31 | params: 32 | "match[]": 33 | - "{job='aiven'}" 34 | static_configs: 35 | - targets: 36 | - digitalmarketplace-es-metrics.cloudapps.digital 37 | metric_relabel_configs: 38 | # Prepend `paas_es_` so the metrics are easier to find 39 | - action: replace 40 | source_labels: [__name__] 41 | target_label: __name__ 42 | regex: (.*) 43 | replacement: paas_es_$${1} 44 | # Dummy entry to be used below 45 | - &store_this_metric 46 | action: replace 47 | target_label: __store_this__ 48 | replacement: store_this 49 | source_labels: [__name__] 50 | regex: __dummy_metric_name 51 | # One entry for each metric you want to import into Prometheus. 
52 | # (Or remove this and the drop rules below it in order to import all 53 | # nearly 1000 metrics.) 54 | - <<: *store_this_metric 55 | regex: paas_es_disk_free 56 | - <<: *store_this_metric 57 | regex: paas_es_disk_used_percent 58 | - <<: *store_this_metric 59 | regex: paas_es_diskio_io_time 60 | - <<: *store_this_metric 61 | regex: paas_es_diskio_iops_in_progress 62 | - <<: *store_this_metric 63 | regex: paas_es_diskio_read_time 64 | - <<: *store_this_metric 65 | regex: paas_es_diskio_write_time 66 | - <<: *store_this_metric 67 | regex: paas_es_swap_used_percent 68 | - <<: *store_this_metric 69 | regex: paas_es_system_load1 70 | - <<: *store_this_metric 71 | regex: paas_es_system_load5 72 | - <<: *store_this_metric 73 | regex: paas_es_system_load15 74 | - <<: *store_this_metric 75 | regex: paas_es_net_bytes_recv 76 | - <<: *store_this_metric 77 | regex: paas_es_net_bytes_sent 78 | - <<: *store_this_metric 79 | regex: paas_es_elasticsearch_clusterstats_nodes_os_mem_free_percent 80 | - <<: *store_this_metric 81 | regex: paas_es_elasticsearch_clusterstats_nodes_os_mem_used_percent 82 | - <<: *store_this_metric 83 | regex: paas_es_elasticsearch_clusterstats_nodes_process_cpu_percent 84 | - <<: *store_this_metric 85 | regex: paas_es_elasticsearch_clusterstats_indices_count 86 | - <<: *store_this_metric 87 | regex: paas_es_elasticsearch_clusterstats_indices_docs_count 88 | - <<: *store_this_metric 89 | regex: paas_es_elasticsearch_clusterstats_indices_docs_deleted 90 | - <<: *store_this_metric 91 | regex: paas_es_elasticsearch_clusterstats_indices_query_cache_miss_count 92 | - <<: *store_this_metric 93 | regex: paas_es_elasticsearch_clusterstats_indices_store_size_in_bytes 94 | - <<: *store_this_metric 95 | regex: paas_es_elasticsearch_clusterstats_nodes_count_master 96 | - <<: *store_this_metric 97 | regex: paas_es_elasticsearch_clusterstats_nodes_count_total 98 | - <<: *store_this_metric 99 | regex: paas_es_elasticsearch_clusterstats_nodes_fs_available_in_bytes 100 | - <<: *store_this_metric 101 | regex: paas_es_elasticsearch_clusterstats_nodes_fs_free_in_bytes 102 | - <<: *store_this_metric 103 | regex: paas_es_elasticsearch_clusterstats_nodes_fs_total_in_bytes 104 | - <<: *store_this_metric 105 | regex: paas_es_elasticsearch_clusterstats_nodes_jvm_mem_heap_max_in_bytes 106 | - <<: *store_this_metric 107 | regex: paas_es_elasticsearch_clusterstats_nodes_jvm_mem_heap_used_in_bytes 108 | - <<: *store_this_metric 109 | regex: paas_es_elasticsearch_clusterstats_nodes_jvm_threads 110 | - <<: *store_this_metric 111 | regex: paas_es_elasticsearch_clusterstats_nodes_process_open_file_descriptors_avg 112 | - <<: *store_this_metric 113 | regex: paas_es_elasticsearch_clusterstats_nodes_process_open_file_descriptors_max 114 | - <<: *store_this_metric 115 | regex: paas_es_elasticsearch_clusterstats_nodes_process_open_file_descriptors_min 116 | - <<: *store_this_metric 117 | regex: paas_es_elasticsearch_cluster_health_active_primary_shards 118 | - <<: *store_this_metric 119 | regex: paas_es_elasticsearch_cluster_health_active_shards 120 | - <<: *store_this_metric 121 | regex: paas_es_elasticsearch_cluster_health_active_shards_percent_as_number 122 | - <<: *store_this_metric 123 | regex: paas_es_elasticsearch_cluster_health_initializing_shards 124 | - <<: *store_this_metric 125 | regex: paas_es_elasticsearch_cluster_health_number_of_data_nodes 126 | - <<: *store_this_metric 127 | regex: paas_es_elasticsearch_cluster_health_number_of_nodes 128 | - <<: *store_this_metric 129 | regex: 
paas_es_elasticsearch_cluster_health_number_of_pending_tasks 130 | - <<: *store_this_metric 131 | regex: paas_es_elasticsearch_cluster_health_relocating_shards 132 | - <<: *store_this_metric 133 | regex: paas_es_elasticsearch_cluster_health_status_code 134 | - <<: *store_this_metric 135 | regex: paas_es_elasticsearch_cluster_health_task_max_waiting_in_queue_millis 136 | - <<: *store_this_metric 137 | regex: paas_es_elasticsearch_cluster_health_unassigned_shards 138 | - <<: *store_this_metric 139 | regex: paas_es_elasticsearch_indices_docs_count 140 | - <<: *store_this_metric 141 | regex: paas_es_elasticsearch_indices_docs_deleted 142 | - <<: *store_this_metric 143 | regex: paas_es_elasticsearch_indices_request_cache_hit_count 144 | - <<: *store_this_metric 145 | regex: paas_es_elasticsearch_indices_request_cache_miss_count 146 | - <<: *store_this_metric 147 | regex: paas_es_elasticsearch_os_cpu_load_average_15m 148 | - <<: *store_this_metric 149 | regex: paas_es_elasticsearch_os_cpu_load_average_1m 150 | - <<: *store_this_metric 151 | regex: paas_es_elasticsearch_os_cpu_load_average_5m 152 | - <<: *store_this_metric 153 | regex: paas_es_elasticsearch_os_cpu_percent 154 | - <<: *store_this_metric 155 | regex: paas_es_elasticsearch_os_mem_free_percent 156 | - <<: *store_this_metric 157 | regex: paas_es_elasticsearch_os_mem_used_percent 158 | - <<: *store_this_metric 159 | regex: paas_es_elasticsearch_os_swap_total_in_bytes 160 | - <<: *store_this_metric 161 | regex: paas_es_elasticsearch_os_swap_used_in_bytes 162 | - <<: *store_this_metric 163 | regex: paas_es_elasticsearch_process_max_file_descriptors 164 | - <<: *store_this_metric 165 | regex: paas_es_elasticsearch_process_open_file_descriptors 166 | - <<: *store_this_metric 167 | regex: paas_es_elasticsearch_jvm_gc_collectors_old_collection_count 168 | - <<: *store_this_metric 169 | regex: paas_es_elasticsearch_jvm_gc_collectors_old_collection_time_in_millis 170 | - <<: *store_this_metric 171 | regex: paas_es_elasticsearch_jvm_gc_collectors_young_collection_count 172 | - <<: *store_this_metric 173 | regex: paas_es_elasticsearch_jvm_gc_collectors_young_collection_time_in_millis 174 | - <<: *store_this_metric 175 | regex: paas_es_elasticsearch_jvm_mem_heap_used_percent 176 | - <<: *store_this_metric 177 | regex: paas_es_elasticsearch_jvm_uptime_in_millis 178 | # Drop metrics we don't want to keep 179 | - source_labels: [__store_this__] 180 | regex: ^store_this$ 181 | action: keep 182 | # Drop the temporary label 183 | - regex: ^__store_this__$ 184 | action: labeldrop 185 | - job_name: paas_redis_metrics_for_dm 186 | scheme: https 187 | basic_auth: 188 | username: ${dm_paas_metrics_username} 189 | password: ${dm_paas_metrics_password} 190 | static_configs: 191 | - targets: 192 | - redis.metrics.cloud.service.gov.uk 193 | metrics_path: /metrics 194 | scrape_interval: 300s 195 | scrape_timeout: 120s 196 | honor_timestamps: true 197 | metric_relabel_configs: 198 | # Prepend `paas_redis_` so the metrics are easier to find 199 | - action: replace 200 | source_labels: [__name__] 201 | target_label: __name__ 202 | regex: (.*) 203 | replacement: paas_redis_$${1} 204 | -------------------------------------------------------------------------------- /terraform/modules/alertmanager/alertmanager-service.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * ECS service that runs alertmanager 3 | * 4 | */ 5 | 6 | ### container, task, service definitions 7 | 8 | resource "aws_ecs_cluster" "prometheus_cluster" { 
9 | name = "${var.environment}-ecs-monitoring" 10 | 11 | tags = merge(local.default_tags, { 12 | Name = "${var.environment}-alertmanager" 13 | }) 14 | } 15 | 16 | resource "aws_iam_role" "execution" { 17 | name = "${var.environment}-alertmanager-execution" 18 | 19 | assume_role_policy = <<-EOF 20 | { 21 | "Version": "2012-10-17", 22 | "Statement": [ 23 | { 24 | "Effect": "Allow", 25 | "Principal": { 26 | "Service": "ecs-tasks.amazonaws.com" 27 | }, 28 | "Action": "sts:AssumeRole" 29 | } 30 | ] 31 | } 32 | EOF 33 | 34 | tags = merge(local.default_tags, { 35 | Name = "${var.environment}-alertmanager-execution" 36 | }) 37 | } 38 | 39 | resource "aws_iam_policy" "execution" { 40 | name = "${var.environment}-alertmanager-execution" 41 | 42 | policy = <<-EOF 43 | { 44 | "Version": "2012-10-17", 45 | "Statement": [ 46 | { 47 | "Effect": "Allow", 48 | "Action": [ 49 | "logs:CreateLogStream", 50 | "logs:PutLogEvents" 51 | ], 52 | "Resource": "*" 53 | } 54 | ] 55 | } 56 | EOF 57 | 58 | } 59 | 60 | resource "aws_iam_role_policy_attachment" "execution_execution" { 61 | role = aws_iam_role.execution.name 62 | policy_arn = aws_iam_policy.execution.arn 63 | } 64 | 65 | data "template_file" "alertmanager_nlb_container_defn" { 66 | template = file("${path.module}/task-definitions/alertmanager.json") 67 | 68 | vars = { 69 | alertmanager_config_base64 = base64encode(data.template_file.alertmanager_config_file.rendered) 70 | templates_base64 = base64encode(file("${path.module}/templates/default.tmpl")) 71 | alertmanager_url = "--web.external-url=https://${aws_route53_record.alerts_alias.fqdn}" 72 | log_group = aws_cloudwatch_log_group.task_logs.name 73 | region = var.aws_region 74 | } 75 | 76 | depends_on = [ 77 | module.assertion_alertmanager_config_file_valid_yaml.checked, 78 | ] 79 | } 80 | 81 | module "assertion_alertmanager_nlb_container_defn_valid_json" { 82 | source = "github.com/Invicton-Labs/terraform-null-assertion?ref=47d7354cc5521853fbe8df96b7bb0223bea732cd" 83 | 84 | condition = can(jsondecode(data.template_file.alertmanager_nlb_container_defn.rendered)) 85 | 86 | error_message = "Alertmanager NLB container definition failed JSON parsing" 87 | } 88 | 89 | resource "aws_ecs_task_definition" "alertmanager_nlb" { 90 | family = "${var.environment}-alertmanager" 91 | container_definitions = data.template_file.alertmanager_nlb_container_defn.rendered 92 | network_mode = "awsvpc" 93 | execution_role_arn = aws_iam_role.execution.arn 94 | requires_compatibilities = ["FARGATE"] 95 | cpu = 256 96 | memory = 512 97 | 98 | tags = merge(local.default_tags, { 99 | Name = "${var.environment}-alertmanager" 100 | }) 101 | 102 | depends_on = [ 103 | module.assertion_alertmanager_nlb_container_defn_valid_json.checked, 104 | ] 105 | } 106 | 107 | resource "aws_ecs_service" "alertmanager_alb" { 108 | for_each = { 109 | for _, subnet in data.aws_subnet.private_subnets : 110 | subnet.id => subnet.availability_zone 111 | } 112 | name = "${var.environment}-alertmanager-alb-${each.value}" 113 | cluster = "${var.environment}-ecs-monitoring" 114 | task_definition = aws_ecs_task_definition.alertmanager_nlb.arn 115 | desired_count = 1 116 | launch_type = "FARGATE" 117 | 118 | wait_for_steady_state = true 119 | 120 | load_balancer { 121 | target_group_arn = aws_lb_target_group.alertmanager_all.arn 122 | container_name = "alertmanager" 123 | container_port = 9093 124 | } 125 | 126 | load_balancer { 127 | target_group_arn = aws_lb_target_group.alertmanager_per_az[each.value].arn 128 | container_name = "alertmanager" 129 | 
container_port = 9093 130 | } 131 | 132 | network_configuration { 133 | subnets = [each.key] 134 | security_groups = [aws_security_group.alertmanager_task.id] 135 | } 136 | 137 | service_registries { 138 | registry_arn = aws_service_discovery_service.alertmanager.arn 139 | } 140 | } 141 | 142 | #### alertmanager 143 | 144 | data "pass_password" "observe_pagerduty_key" { 145 | path = "pagerduty/integration-keys/production" 146 | } 147 | 148 | data "pass_password" "dgu_pagerduty_key" { 149 | path = "pagerduty/integration-keys/dgu" 150 | } 151 | 152 | data "pass_password" "govuk_pagerduty_key" { 153 | path = "pagerduty/integration-keys/govuk" 154 | } 155 | 156 | data "pass_password" "verify_p1_pagerduty_key" { 157 | path = "pagerduty/integration-keys/verify-p1" 158 | } 159 | 160 | data "pass_password" "verify_p2_pagerduty_key" { 161 | path = "pagerduty/integration-keys/verify-p2" 162 | } 163 | 164 | data "pass_password" "dcs_p2_pagerduty_key" { 165 | path = "pagerduty/integration-keys/dcs-p2" 166 | } 167 | 168 | data "pass_password" "slack_api_url" { 169 | path = "slack-api-url" 170 | } 171 | 172 | data "pass_password" "notify_zendesk" { 173 | path = "receivers/notify/zendesk" 174 | } 175 | 176 | data "pass_password" "notify_p2_pagerduty_key" { 177 | path = "receivers/notify/p2_pagerduty" 178 | } 179 | 180 | data "pass_password" "autom8_email" { 181 | path = "receivers/autom8/email" 182 | 183 | } 184 | 185 | data "pass_password" "verify_staging_cronitor" { 186 | path = "cronitor/verify-staging-url" 187 | } 188 | 189 | data "pass_password" "verify_integration_cronitor" { 190 | path = "cronitor/verify-integration-url" 191 | } 192 | 193 | data "pass_password" "verify_prod_cronitor" { 194 | path = "cronitor/verify-prod-url" 195 | } 196 | 197 | data "template_file" "alertmanager_config_file" { 198 | template = file("${path.module}/templates/alertmanager.tpl") 199 | 200 | vars = { 201 | observe_pagerduty_key = data.pass_password.observe_pagerduty_key.password 202 | dgu_pagerduty_key = data.pass_password.dgu_pagerduty_key.password 203 | govuk_pagerduty_key = data.pass_password.govuk_pagerduty_key.password 204 | verify_p1_pagerduty_key = data.pass_password.verify_p1_pagerduty_key.password 205 | verify_p2_pagerduty_key = data.pass_password.verify_p2_pagerduty_key.password 206 | dcs_p2_pagerduty_key = data.pass_password.dcs_p2_pagerduty_key.password 207 | slack_api_url = data.pass_password.slack_api_url.password 208 | notify_zendesk = data.pass_password.notify_zendesk.password 209 | notify_p2_pagerduty_key = data.pass_password.notify_p2_pagerduty_key.password 210 | smtp_from = "alerts@${data.terraform_remote_state.infra_networking.outputs.public_subdomain}" 211 | # Port as requested by https://docs.aws.amazon.com/ses/latest/DeveloperGuide/smtp-connect.html 212 | smtp_smarthost = "email-smtp.${var.aws_region}.amazonaws.com:587" 213 | smtp_username = aws_iam_access_key.smtp.id 214 | smtp_password = aws_iam_access_key.smtp.ses_smtp_password_v4 215 | autom8_recipient_email = data.pass_password.autom8_email.password 216 | observe_cronitor = var.observe_cronitor 217 | verify_staging_cronitor = data.pass_password.verify_staging_cronitor.password 218 | verify_integration_cronitor = data.pass_password.verify_integration_cronitor.password 219 | verify_prod_cronitor = data.pass_password.verify_prod_cronitor.password 220 | } 221 | } 222 | 223 | module "assertion_alertmanager_config_file_valid_yaml" { 224 | source = "github.com/Invicton-Labs/terraform-null-assertion?ref=47d7354cc5521853fbe8df96b7bb0223bea732cd" 225 
| 226 | condition = can(yamldecode(data.template_file.alertmanager_config_file.rendered)) 227 | 228 | error_message = "Alertmanager config failed YAML parsing" 229 | } 230 | 231 | ## AWS SES 232 | 233 | resource "aws_ses_domain_identity" "main" { 234 | domain = data.terraform_remote_state.infra_networking.outputs.public_subdomain 235 | } 236 | 237 | resource "aws_route53_record" "txt_amazonses_verification_record" { 238 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 239 | name = "_amazonses.${data.terraform_remote_state.infra_networking.outputs.public_subdomain}" 240 | type = "TXT" 241 | ttl = "600" 242 | records = [aws_ses_domain_identity.main.verification_token] 243 | } 244 | 245 | resource "aws_ses_domain_dkim" "main" { 246 | domain = aws_ses_domain_identity.main.domain 247 | } 248 | 249 | resource "aws_route53_record" "dkim_amazonses_verification_record" { 250 | count = 3 251 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 252 | name = "${element(aws_ses_domain_dkim.main.dkim_tokens, count.index)}._domainkey.${data.terraform_remote_state.infra_networking.outputs.public_subdomain}" 253 | type = "CNAME" 254 | ttl = "600" 255 | records = ["${element(aws_ses_domain_dkim.main.dkim_tokens, count.index)}.dkim.amazonses.com"] 256 | } 257 | 258 | resource "aws_ses_domain_mail_from" "alerts" { 259 | domain = aws_ses_domain_identity.main.domain 260 | mail_from_domain = "mail.${aws_ses_domain_identity.main.domain}" 261 | } 262 | 263 | resource "aws_route53_record" "alerts_ses_domain_mail_from_mx" { 264 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 265 | name = aws_ses_domain_mail_from.alerts.mail_from_domain 266 | type = "MX" 267 | ttl = "600" 268 | records = ["10 feedback-smtp.${var.aws_region}.amazonses.com"] 269 | } 270 | 271 | resource "aws_route53_record" "alerts_ses_domain_mail_from_txt" { 272 | zone_id = data.terraform_remote_state.infra_networking.outputs.public_zone_id 273 | name = aws_ses_domain_mail_from.alerts.mail_from_domain 274 | type = "TXT" 275 | ttl = "600" 276 | records = ["v=spf1 include:amazonses.com -all"] 277 | } 278 | 279 | # IAM for SMTP 280 | 281 | resource "aws_iam_user" "smtp" { 282 | name = "${var.environment}.smtp" 283 | path = "/system/" 284 | 285 | tags = merge(local.default_tags, { 286 | Name = "${var.environment}-alertmanager-smtp" 287 | }) 288 | } 289 | 290 | resource "aws_iam_access_key" "smtp" { 291 | user = aws_iam_user.smtp.name 292 | } 293 | 294 | resource "aws_iam_user_policy" "smtp_ro" { 295 | name = "${var.environment}.smtp" 296 | user = aws_iam_user.smtp.name 297 | 298 | policy = <