├── README.md ├── cadvisor-node-exporter-compose.yml ├── cadvisor-node-exporter-definition.json ├── grafana-definition.json ├── prometheus-grafana-compose.yml ├── prometheus-grafana-definition.json └── prometheus.yml /README.md: -------------------------------------------------------------------------------- 1 | # ECS monitoring using Prometheus and Grafana 2 | Set up ECS monitoring using Prometheus, Grafana, cAdvisor and Node Exporter 3 | 4 | ## Requirements: 5 | 6 | 1. Modify the 'init-container' definition (in prometheus-grafana-compose.yml/prometheus-grafana-definition.json) to download the correct 'prometheus.yml' configuration file. 7 | 2. Modify 'prometheus.yml' to make sure it queries and filters the correct set of EC2 instances that need to be monitored. 8 | Here I used a filter (in the ec2_sd_configs section) with tag 'aws:autoscaling:groupName' and value 'ECS_ASG-2', as all instances in my ECS cluster were tagged with it. 9 | 3. Use an EC2 instance profile for the ECS Cluster instances with permission to make describe calls for EC2 instances. This will be used by Prometheus to discover the instances in your ECS cluster and add them to the Prometheus targets (http://monitor_ec2_public_ip:9090/targets). 10 | 4. Make sure instances in the cluster can reach each other on ports 9090, 9100, and 9200, and open Grafana port 3000 for the IP range from which you need to access the Grafana Dashboard. 11 | 12 | ## Usage Instructions: 13 | 14 | 1. Create Task Definitions for cAdvisor, Node-Exporter, Prometheus and Grafana: 15 | 16 | ``` 17 | aws ecs register-task-definition --cli-input-json file://./cadvisor-node-exporter-definition.json --region us-west-2 18 | aws ecs register-task-definition --cli-input-json file://./prometheus-grafana-definition.json --region us-west-2 19 | ``` 20 | 21 | 2. 
Create a DAEMON Service to run cAdvisor and Node-Exporter on every node in the ECS Cluster: 22 | 23 | ``` 24 | aws ecs create-service --cluster MyWorkingCluster --service-name cadvisor-node-exporter --task-definition cadvisor-node-exporter-definition:1 --launch-type EC2 --scheduling-strategy DAEMON --region us-west-2 25 | ``` 26 | 27 | 3. Run one ECS Task for Prometheus and Grafana in the cluster: 28 | 29 | ``` 30 | aws ecs run-task --cluster MyWorkingCluster --task-definition prometheus-grafana-definition:1 --region us-west-2 31 | ``` 32 | 33 | 4. Access the Grafana Dashboard using the URL: http://monitor_ec2_public_ip:3000 34 | Use user:admin and password:admin to log in and then reset the password. 35 | 36 | 5. After logging in, add the data source: 37 | 38 | Adding the Datasource 39 | 40 | * Fig 1: Monitoring results for Uptime, Containers, Load(Numerical Representation), DiskSpace, Memory, Filesystem Usage, CPU Usage, Memory Usage, Targets Online, Total Memory Usage, Alerts 41 | 42 | 43 | 6. Import or Create a custom Grafana-Dashboard to monitor Docker containers running on the host: 44 | 45 | ![Fig 2](https://miro.medium.com/max/2732/0*W0yXIT_P-1Gc_sY4) 46 | 47 | * Fig 2: Monitoring results for Uptime, Containers, Load(Numerical Representation), DiskSpace, Memory, Filesystem Usage, CPU Usage, Memory Usage, Targets Online, Total Memory Usage, Alerts. 48 | 49 | ![Fig 3](https://miro.medium.com/max/2732/0*EO1JyVMHPkbFEYdk) 50 | 51 | * Fig 3: Monitoring results for CPU Usage, Network Traffic, Load, Available Memory, Node Network Traffic, Disk I/O, Node Memory, Filesystem Available, Container Network Input, Container Network Output. 52 | 53 | ![Fig 4](https://miro.medium.com/max/2732/0*HeRmOCOHeHeJkkBB) 54 | 55 | * Fig 4: Monitoring results for System Load on Node and Cached Memory per Container(Stacked). 
56 | 57 | ![Fig 5](https://miro.medium.com/max/2732/0*Gmqz4PFyP5LjfRNn) 58 | 59 | * Fig 5: Monitoring Results for CPU Usage per container, Sent Traffic per Container, Received Network Traffic per Container, Memory Usage per Container. 60 | 61 | ![Fig 6](https://miro.medium.com/max/2724/0*WjYd0f9n53689GUl) 62 | 63 | * Fig 6: Monitoring results for Network Rx, Network Tx, Tables for Usage Memory, Remaining Memory, Limit Memory. 64 | 65 | ![Fig 7](https://miro.medium.com/max/2728/0*3jj0V42Ph67rZwFN) 66 | 67 | * Fig 7: Monitoring results for Running versions and Metrics. 68 | 69 | 70 | 7. Useful Grafana Dashboards: 71 | - Docker Host Monitoring: 11074, 10619, 395 72 | - Docker Monitoring: 193 73 | - Docker monitoring with Node selection: 8321 74 | -------------------------------------------------------------------------------- /cadvisor-node-exporter-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | node-exporter: 5 | image: prom/node-exporter 6 | ports: 7 | - '9100:9100' 8 | deploy: 9 | resources: 10 | limits: 11 | cpus: '0.25' 12 | memory: '100M' 13 | reservations: 14 | cpus: '0.1' 15 | memory: '50M' 16 | cadvisor-exporter: 17 | image: google/cadvisor 18 | ports: 19 | - "9200:8080" 20 | privileged: true 21 | volumes: 22 | - /:/rootfs:ro 23 | - /cgroup:/cgroup:ro 24 | - /var/run:/var/run:ro 25 | - /var/lib/docker/:/var/lib/docker:ro 26 | - /dev/disk/:/dev/disk:ro 27 | - /cgroup:/sys/fs/cgroup:ro 28 | deploy: 29 | resources: 30 | limits: 31 | cpus: '0.25' 32 | memory: '100M' 33 | reservations: 34 | cpus: '0.1' 35 | memory: '50M' 36 | 37 | -------------------------------------------------------------------------------- /cadvisor-node-exporter-definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "family": "cadvisor-node-exporter-definition", 3 | "taskRoleArn": "arn:aws:iam::342241566140:role/AmazonECSTaskRole", 4 | 
"containerDefinitions": [ 5 | { 6 | "name": "node-exporter", 7 | "image": "prom/node-exporter", 8 | "cpu": 512, 9 | "memory": 256, 10 | "memoryReservation": 128, 11 | "portMappings": [ 12 | { 13 | "containerPort": 9100, 14 | "hostPort": 9100, 15 | "protocol": "tcp" 16 | } 17 | ], 18 | "essential": true 19 | }, 20 | { 21 | "name": "cadvisor-exporter", 22 | "image": "google/cadvisor", 23 | "cpu": 512, 24 | "memory": 256, 25 | "memoryReservation": 128, 26 | "portMappings": [ 27 | { 28 | "containerPort": 8080, 29 | "hostPort": 9200, 30 | "protocol": "tcp" 31 | } 32 | ], 33 | "essential": true, 34 | "mountPoints": [ 35 | { 36 | "sourceVolume": "root", 37 | "containerPath": "/rootfs", 38 | "readOnly": true 39 | }, 40 | { 41 | "sourceVolume": "cgroup", 42 | "containerPath": "/cgroup", 43 | "readOnly": true 44 | }, 45 | { 46 | "sourceVolume": "var_run", 47 | "containerPath": "/var/run", 48 | "readOnly": true 49 | }, 50 | { 51 | "sourceVolume": "var_lib_docker", 52 | "containerPath": "/var/lib/docker", 53 | "readOnly": true 54 | }, 55 | { 56 | "sourceVolume": "dev_disk", 57 | "containerPath": "/dev/disk", 58 | "readOnly": true 59 | }, 60 | { 61 | "sourceVolume": "cgroup", 62 | "containerPath": "/sys/fs/cgroup", 63 | "readOnly": true 64 | } 65 | ], 66 | "privileged": true, 67 | "readonlyRootFilesystem": false 68 | } 69 | ], 70 | "volumes": [ 71 | { 72 | "name": "root", 73 | "host": { 74 | "sourcePath": "/" 75 | } 76 | }, 77 | { 78 | "name": "cgroup", 79 | "host": { 80 | "sourcePath": "/cgroup" 81 | } 82 | }, 83 | { 84 | "name": "var_run", 85 | "host": { 86 | "sourcePath": "/var/run" 87 | } 88 | }, 89 | { 90 | "name": "var_lib_docker", 91 | "host": { 92 | "sourcePath": "/var/lib/docker" 93 | } 94 | }, 95 | { 96 | "name": "dev_disk", 97 | "host": { 98 | "sourcePath": "/dev/disk" 99 | } 100 | } 101 | ], 102 | "requiresCompatibilities": [ 103 | "EC2" 104 | ], 105 | "cpu": "1024", 106 | "memory": "512" 107 | } 108 | 
-------------------------------------------------------------------------------- /grafana-definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "family": "grafana-definition", 3 | "taskRoleArn": "arn:aws:iam::342241566140:role/AmazonECSTaskRole", 4 | "containerDefinitions": [ 5 | { 6 | "name": "grafana", 7 | "image": "grafana/grafana", 8 | "cpu": 512, 9 | "memory": 512, 10 | "memoryReservation": 128, 11 | "portMappings": [ 12 | { 13 | "containerPort": 3000, 14 | "hostPort": 3000, 15 | "protocol": "tcp" 16 | } 17 | ], 18 | "essential": true 19 | } 20 | ], 21 | 22 | "requiresCompatibilities": [ 23 | "EC2" 24 | ], 25 | "cpu": "512", 26 | "memory": "512" 27 | } 28 | -------------------------------------------------------------------------------- /prometheus-grafana-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | prometheus: 5 | image: prom/prometheus:v2.4.0 6 | volumes: 7 | - prometheus_config:/etc/prometheus/ 8 | ports: 9 | - "9090:9090" 10 | restart: always 11 | depends_on: 12 | - init-container 13 | deploy: 14 | resources: 15 | limits: 16 | cpus: '0.25' 17 | memory: '100M' 18 | reservations: 19 | cpus: '0.1' 20 | memory: '50M' 21 | init-container: 22 | image: alpine:latest 23 | volumes: 24 | - prometheus_config:/mnt/ 25 | command: wget https://raw.githubusercontent.com/santosh07bec/ecs-monitoring-with-prometheus-grafana/master/prometheus.yml -P /mnt/ 26 | grafana: 27 | image: grafana/grafana 28 | ports: 29 | - "3000:3000" 30 | restart: always 31 | deploy: 32 | resources: 33 | limits: 34 | cpus: '0.25' 35 | memory: '100M' 36 | reservations: 37 | cpus: '0.1' 38 | memory: '50M' 39 | volumes: 40 | prometheus_config: 41 | -------------------------------------------------------------------------------- /prometheus-grafana-definition.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"family": "prometheus-grafana-definition", 3 | "taskRoleArn": "arn:aws:iam::342241566140:role/AmazonECSTaskRole", 4 | "containerDefinitions": [ 5 | { 6 | "name": "init-container", 7 | "image": "alpine:latest", 8 | "cpu": 0, 9 | "memory": 256, 10 | "memoryReservation": 128, 11 | "essential": false, 12 | "command": [ 13 | "wget", 14 | "https://raw.githubusercontent.com/santosh07bec/ecs-monitoring-with-prometheus-grafana/master/prometheus.yml", 15 | "-P", 16 | "/mnt" 17 | ], 18 | "mountPoints": [ 19 | { 20 | "sourceVolume": "prometheus_config", 21 | "containerPath": "/mnt" 22 | } 23 | ] 24 | }, 25 | { 26 | "name": "prometheus", 27 | "image": "prom/prometheus:v2.4.0", 28 | "cpu": 512, 29 | "memory": 512, 30 | "memoryReservation": 256, 31 | "portMappings": [ 32 | { 33 | "containerPort": 9090, 34 | "hostPort": 9090, 35 | "protocol": "tcp" 36 | } 37 | ], 38 | "essential": true, 39 | "mountPoints": [ 40 | { 41 | "sourceVolume": "prometheus_config", 42 | "containerPath": "/etc/prometheus" 43 | } 44 | ], 45 | "dependsOn": [ 46 | { 47 | "containerName": "init-container", 48 | "condition": "COMPLETE" 49 | } 50 | ] 51 | }, 52 | { 53 | "name": "grafana", 54 | "image": "grafana/grafana", 55 | "cpu": 512, 56 | "memory": 512, 57 | "memoryReservation": 128, 58 | "portMappings": [ 59 | { 60 | "containerPort": 3000, 61 | "hostPort": 3000, 62 | "protocol": "tcp" 63 | } 64 | ], 65 | "essential": true 66 | } 67 | ], 68 | "volumes": [ 69 | { 70 | "name": "prometheus_config", 71 | "dockerVolumeConfiguration": { 72 | "scope": "task", 73 | "driver": "local" 74 | } 75 | } 76 | ], 77 | 78 | "requiresCompatibilities": [ 79 | "EC2" 80 | ], 81 | "cpu": "1024", 82 | "memory": "512" 83 | } 84 | -------------------------------------------------------------------------------- /prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 5s 3 | external_labels: 4 | monitor: 'devopsage-monitor' 5 | scrape_configs: 6 | - job_name: 
'prometheus' 7 | static_configs: 8 | - targets: ['localhost:9090'] ## IP Address of the localhost 9 | - job_name: 'ec2_aws_node_exporter_discovery' 10 | ec2_sd_configs: 11 | - port: 9100 # refers to the port exposed by Node Exporter 12 | #role_arn: arn:aws:iam::342241566140:role/Ec2-Instance-Role 13 | filters: 14 | - name: tag:aws:autoscaling:groupName 15 | values: 16 | - ECS 17 | - job_name: ecs_CAdvisor 18 | ec2_sd_configs: 19 | - port: 9200 # refers to the port exposed by cAdvisor 20 | #role_arn: arn:aws:iam::342241566140:role/Ec2-Instance-Role 21 | filters: 22 | - name: tag:aws:autoscaling:groupName 23 | values: 24 | - ECS 25 | 26 | --------------------------------------------------------------------------------