├── gitlab ├── gitlab.rb ├── runner-config.tml ├── README.md ├── server-docker-compose.yml └── runner-docker-compose.yml ├── unifi-controller ├── README.md └── docker-compose.yml ├── .gitignore ├── copy-docker-volume.sh ├── caddy ├── conf │ ├── redirect_hosts.txt │ └── Caddyfile ├── README.md └── docker-compose.yml ├── prometheus ├── grafana │ ├── swarmprom_dashboards.yml │ ├── datasources │ │ └── prometheus.yaml │ └── dashboards │ │ ├── swarmprom-prometheus-dash.json │ │ ├── swarmprom-services-dash.json │ │ └── swarmprom-nodes-dash.json ├── prometheus │ ├── rules │ │ ├── snitch.rules.yml │ │ ├── healthcheck-io_rules.yml │ │ ├── swarm_task.rules.yml │ │ └── swarm_node.rules.yml │ └── conf │ │ └── prometheus.yml ├── blackbox-exporter │ ├── README.md │ └── blackbox.yml ├── alertmanager │ └── conf │ │ └── alertmanager.yml ├── README.md └── docker-compose.yml ├── portainer ├── README.md └── portainer-agent-stack.yml ├── elk ├── kibana │ └── config │ │ └── kibana.yml ├── elasticsearch │ └── config │ │ └── elasticsearch.yml ├── elastalert │ ├── config.json │ ├── elastalert-test.yml │ └── elastalert.yml ├── README.md ├── lifecycle-policy-request.txt ├── filebeat │ └── config │ │ └── filebeat.yml └── docker-compose.yml ├── vscode-server └── docker-compose.yml ├── pihole ├── README.md └── docker-compose-stack.yml ├── start-cluster.sh ├── shutdown-cluster.sh └── README.md /gitlab/gitlab.rb: -------------------------------------------------------------------------------- 1 | external_url 'https://gitlab.int.belisleonline.com' 2 | gitlab_rails['initial_root_password'] = File.read('/run/secrets/gitlab_root_password') -------------------------------------------------------------------------------- /unifi-controller/README.md: -------------------------------------------------------------------------------- 1 | ## Usage 2 | 3 | On a Docker Swarm Manager node, run the following 4 | ```bash 5 | sudo docker stack deploy --compose-file=docker-compose.yml unifi 6 | ``` 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | prometheus/alertmanager/conf/prod.alertmanager.yml 2 | gitlab/root_password.txt 3 | gitlab/gitlab_omniauth_providers.txt 4 | caddy/conf/google-client.conf 5 | elk/elastalert/smtp_auth_file 6 | vscode-server/password.txt 7 | -------------------------------------------------------------------------------- /copy-docker-volume.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo docker run --rm -v gitlab_gitlab-data:/from alpine ash -c "cd /from ; tar -cf - . " | ssh ubuntu@raspberrypi-delta-2 'sudo docker run --rm -i -v gitlab_gitlab-data:/to alpine ash -c "cd /to ; tar -xpvf - " ' -------------------------------------------------------------------------------- /caddy/conf/redirect_hosts.txt: -------------------------------------------------------------------------------- 1 | prometheus.int.belisleonline.com 2 | kibana.int.belisleonline.com 3 | alertmanager.int.belisleonline.com 4 | grafana.int.belisleonline.com 5 | portainer.int.belisleonline.com 6 | unifi.int.belisleonline.com 7 | code.int.belisleonline.com -------------------------------------------------------------------------------- /prometheus/grafana/swarmprom_dashboards.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'default' 5 | orgId: 1 6 | folder: '' 7 | type: file 8 | disableDeletion: false 9 | editable: true 10 | options: 11 | path: /etc/grafana/dashboards 12 | -------------------------------------------------------------------------------- /portainer/README.md: -------------------------------------------------------------------------------- 1 | ## Usage 2 | 3 | On a Docker Swarm Manager node, run the following (preferably after picking a better shared secret): 4 | ```bash 5 | $ 
export PORTAINER_AGENT_SECRET=changeme 6 | $ sudo -E bash -c 'docker stack deploy --compose-file=portainer-agent-stack.yml portainer' 7 | ``` -------------------------------------------------------------------------------- /elk/kibana/config/kibana.yml: -------------------------------------------------------------------------------- 1 | server.name: kibana 2 | server.host: "0" 3 | elasticsearch.hosts: [ "http://elasticsearch:9200" ] 4 | xpack.monitoring.ui.container.elasticsearch.enabled: false 5 | logging.quiet: true 6 | elastalert-kibana-plugin.serverHost: elastalert 7 | elastalert-kibana-plugin.serverPort: 3030 -------------------------------------------------------------------------------- /prometheus/grafana/datasources/prometheus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | deleteDatasources: 4 | - name: Prometheus 5 | 6 | datasources: 7 | - name: Prometheus 8 | type: prometheus 9 | access: proxy 10 | url: http://prometheus:9090 11 | isDefault: true 12 | version: 1 13 | editable: true 14 | -------------------------------------------------------------------------------- /elk/elasticsearch/config/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | cluster.name: "docker-cluster" 2 | network.host: 0.0.0.0 3 | path.repo: ["/mnt/elasticsearch-snapshot"] 4 | discovery.type: single-node 5 | 6 | xpack.license.self_generated.type: basic 7 | xpack.security.enabled: false 8 | xpack.monitoring.collection.enabled: false 9 | xpack.ml.enabled: false -------------------------------------------------------------------------------- /prometheus/prometheus/rules/snitch.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: meta 3 | rules: 4 | - alert: DeadMansSnitch 5 | expr: vector(1) 6 | labels: 7 | severity: critical 8 | annotations: 9 | description: This is a DeadMansSnitch meant to ensure that the 
entire Alerting 10 | pipeline is functional. 11 | summary: Alerting DeadMansSnitch -------------------------------------------------------------------------------- /prometheus/prometheus/rules/healthcheck-io_rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: meta 3 | rules: 4 | - alert: Healthcheck.io 5 | expr: vector(1) 6 | labels: 7 | severity: critical 8 | annotations: 9 | description: This is a Healthcheck.io integration meant to ensure that the entire Alerting 10 | pipeline is functional. 11 | summary: Healthcheck.io Ping -------------------------------------------------------------------------------- /elk/elastalert/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "appName": "elastalert-server", 3 | "port": 3030, 4 | "wsport": 3333, 5 | "elastalertPath": "/opt/elastalert", 6 | "verbose": false, 7 | "es_debug": false, 8 | "debug": false, 9 | "rulesPath": { 10 | "relative": true, 11 | "path": "/rules" 12 | }, 13 | "templatesPath": { 14 | "relative": true, 15 | "path": "/rule_templates" 16 | }, 17 | "es_host": "elasticsearch", 18 | "es_port": 9200, 19 | "writeback_index": "elastalert_status" 20 | } -------------------------------------------------------------------------------- /caddy/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | Create a file at `conf/google-client.conf` and include the following line (substitute your google client id and secret): 4 | 5 | ``` 6 | google client_id=CLIENT_ID,client_secret=SECRET 7 | ``` 8 | 9 | This config file is created as a `Docker Secret` during stack deploy to secure the oauth secret 10 | 11 | ## Deploy 12 | 13 | On a Docker Swarm Manager node, run the following 14 | ```bash 15 | export CONFIG_VERSION=X 16 | sudo -E bash -c 'docker stack deploy --compose-file=docker-compose.yml caddy' 17 | ``` 
-------------------------------------------------------------------------------- /elk/README.md: -------------------------------------------------------------------------------- 1 | ## Usage 2 | 3 | On a Docker Swarm Manager node, run the following 4 | ```bash 5 | export CONFIG_VERSION=X 6 | sudo -E bash -c 'docker stack deploy --compose-file=docker-compose.yml elk' 7 | ``` 8 | 9 | ## Setup Snapshot and Restore 10 | 11 | NOTE: This is temporarily nonfunctional due to this regression. Should be fixed in next patch release (7.7.1): https://github.com/elastic/kibana/pull/67308 12 | 13 | In the Kibana UI, under `Management -> Elasticsearch -> Snapshot and Restore`, add the following Repository: 14 | * `gluster-backups` -> path: `/mnt/elasticsearch-snapshot` 15 | 16 | -------------------------------------------------------------------------------- /elk/lifecycle-policy-request.txt: -------------------------------------------------------------------------------- 1 | PUT _ilm/policy/filebeat-7.4.1 2 | { 3 | "policy": { 4 | "phases": { 5 | "hot": { 6 | "min_age": "0ms", 7 | "actions": { 8 | "rollover": { 9 | "max_age": "7d", 10 | "max_size": "2gb" 11 | } 12 | } 13 | }, 14 | "warm": { 15 | "actions": {} 16 | }, 17 | "cold": { 18 | "min_age": "14d", 19 | "actions": {} 20 | }, 21 | "delete": { 22 | "min_age": "30d", 23 | "actions": { 24 | "delete": {} 25 | } 26 | } 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /prometheus/blackbox-exporter/README.md: -------------------------------------------------------------------------------- 1 | # blackbox-exporter 2 | Trialing usage of blackbox-exporter within my monitoring setup. I am using `prom/blackbox-exporter` to expose a probe endpoint that prometheus can 'scrape'. For the trial, I will be probing CloudFlare's DoH (DNS over HTTPS) status endpoint to monitor that DNS requests are indeed routed via CF DoH thru my pihole setup. 
3 | 4 | ## Usage 5 | * URL to check current DoH status: https://bbd96f23-eda8-465d-b190-6ddf056cae66.is-doh.cloudflareresolve.com/resolvertest 6 | * URL for Prometheus to scrape: http://blackbox-exporter:9115/probe?target=bbd96f23-eda8-465d-b190-6ddf056cae66.is-doh.cloudflareresolve.com/resolvertest&module=http_2xx_expect_1 7 | 8 | -------------------------------------------------------------------------------- /gitlab/runner-config.tml: -------------------------------------------------------------------------------- 1 | concurrent = 1 2 | check_interval = 0 3 | [session_server] 4 | listen_address = "0.0.0.0:8093" # listen on all available interfaces on port 8093 5 | advertise_address = "raspi-swarm.home.local:8093" 6 | session_timeout = 1800 7 | session_timecheck_interval = 0 8 | [[runners]] 9 | name = "docker-runner" 10 | url = "http://raspi-swarm.home.local:88" 11 | token = "sZcx2xcHAwgHZiTQVbHs" 12 | executor = "docker" 13 | [runners.custom_build_dir] 14 | [runners.docker] 15 | tls_verify = false 16 | image = "alpine:latest" 17 | privileged = false 18 | disable_entrypoint_overwrite = false 19 | oom_kill_disable = false 20 | disable_cache = false 21 | volumes = ["/cache"] 22 | shm_size = 0 23 | [runners.cache] 24 | [runners.cache.s3] 25 | [runners.cache.gcs] -------------------------------------------------------------------------------- /elk/filebeat/config/filebeat.yml: -------------------------------------------------------------------------------- 1 | # filebeat.modules: 2 | # - module: system 3 | # syslog: 4 | # enabled: true 5 | # auth: 6 | # enabled: true 7 | 8 | # https://www.elastic.co/guide/en/beats/filebeat/7.4/configuration-filebeat-options.html 9 | filebeat.inputs: 10 | - type: log 11 | paths: 12 | - /var/log/syslog 13 | - /var/log/daemon.log 14 | - /var/log/glusterfs/*.log 15 | - /var/log/glusterfs/bricks/*.log 16 | 17 | filebeat.autodiscover: 18 | providers: 19 | - type: docker 20 | hints.enabled: true 21 | 22 | processors: 23 | - 
add_cloud_metadata: ~ 24 | - add_docker_metadata: ~ 25 | - add_locale: 26 | format: offset 27 | - add_host_metadata: 28 | netinfo.enabled: true 29 | 30 | output.elasticsearch: 31 | hosts: ['elasticsearch:9200'] 32 | 33 | # setup.dashboards: 34 | # enabled: true 35 | 36 | setup.kibana: 37 | host: "kibana:5601" 38 | protocol: 'http' 39 | 40 | xpack.monitoring: 41 | enabled: true -------------------------------------------------------------------------------- /vscode-server/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | secrets: 3 | password: 4 | file: ./password.txt 5 | services: 6 | vscode-server: 7 | image: linuxserver/code-server 8 | environment: 9 | - PUID=1000 10 | - PGID=1000 11 | - TZ=America/New_York 12 | - DOCKER_MODS=linuxserver/mods:code-server-nodejs|linuxserver/mods:code-server-golang|linuxserver/mods:code-server-docker|linuxserver/mods:code-server-python3 13 | - FILE__PASSWORD=/run/secrets/password #optional 14 | - FILE__SUDO_PASSWORD=/run/secrets/password #optional 15 | - PROXY_DOMAIN=code.int.belisleonline.com #optional 16 | volumes: 17 | - code-server-config:/config 18 | - /var/run/docker.sock:/var/run/docker.sock 19 | networks: 20 | - caddy_internal_net 21 | secrets: 22 | - password 23 | # ports: 24 | # - 8888:8443 25 | networks: 26 | caddy_internal_net: 27 | external: true 28 | volumes: 29 | code-server-config: 30 | driver: glusterfs:latest 31 | name: "shared-volume/vscode-server-config" -------------------------------------------------------------------------------- /pihole/README.md: -------------------------------------------------------------------------------- 1 | ## Usage 2 | 1. Check if the docker_gwbridge ip address is 172.18.0.1: 3 | ```bash 4 | ip -o addr show docker_gwbridge 5 | ``` 6 | 7 | If different, replace 172.18.0.1 with your local docker_gwbridge address in the pihole `DNS1:` docker environment variable. 
This will set the `cloudflared` DoH container as the primary upstream DNS provider. 8 | 9 | ```yaml 10 | environment: 11 | DNS1: "172.18.0.1#5053" 12 | ``` 13 | 14 | 2. On a Docker Swarm Manager node, run: 15 | ```bash 16 | $ export PIHOLE_WEBPASSWORD=admin 17 | $ sudo -E bash -c 'docker stack deploy --compose-file=docker-compose-stack.yml pihole' 18 | ``` 19 | 20 | 3. To change temperature to fahrenheit, run the following against each pihole container. This setting is persisted in the docker volume: 21 | ```sh 22 | docker exec PIHOLE_CONTAINER_NAME pihole -a fahrenheit 23 | ``` 24 | 4. You may want to add hostrecords to dnsmasq, run the following against each pihole container / node: 25 | ```sh 26 | docker exec PIHOLE_CONTAINER_NAME pihole -a hostrecord my.custom.dns.entry 192.168.X.X 27 | ``` 28 | These records are persisted in the dnsmasq docker volume. -------------------------------------------------------------------------------- /start-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | exec 1> >(logger -s -t $(basename $0)) 2>&1 4 | 5 | ##TODO: make sure all manager nodes are UP first!! 6 | 7 | echo "Updating worker node status to active.." 8 | for host in `sudo docker node ls --filter "role=worker" --format="{{.Hostname}}"` 9 | do 10 | echo "Activating node: $host" 11 | CMD_OUTPUT=`sudo docker node update --availability active $host` 12 | echo "CMD_OUTPUT=$CMD_OUTPUT" 13 | done 14 | 15 | 16 | echo "Activated Worker Nodes...waiting 30 seconds to stabilize...." 17 | sleep 30 18 | 19 | 20 | for host in `sudo docker node ls --filter "role=manager" --format="{{.Hostname}}"` 21 | do 22 | echo "Activating node: $host" 23 | CMD_OUTPUT=`sudo docker node update --availability active $host` 24 | echo "CMD_OUTPUT=$CMD_OUTPUT" 25 | done 26 | 27 | 28 | 29 | ##TODO Manually test first! 30 | echo "Recreate mon stack..." 
31 | CMD_OUTPUT=`sudo docker stack deploy -c ./prometheus/docker-compose.yml mon` 32 | echo "Deployed mon stack, CMD_OUTPUT=$CMD_OUTPUT" 33 | echo "Sleeping 30 seconds" 34 | sleep 30 35 | echo "Recreating elk stack..." 36 | CMD_OUTPUT=`sudo docker stack deploy --compose-file=./elk/docker-compose.yml elk` 37 | echo "Deployed elk stack, CMD_OUTPUT=$CMD_OUTPUT" 38 | echo "DONE!" 39 | -------------------------------------------------------------------------------- /caddy/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | configs: 4 | caddyfile: 5 | name: caddyfile-${CONFIG_VERSION:-0} 6 | file: ./conf/Caddyfile 7 | redirect-hosts: 8 | name: redirect-hosts-${CONFIG_VERSION:-0} 9 | file: ./conf/redirect_hosts.txt 10 | 11 | secrets: 12 | google-client-conf-secret: 13 | file: ./conf/google-client.conf 14 | 15 | networks: 16 | internal_net: 17 | driver: overlay 18 | attachable: true 19 | 20 | volumes: 21 | caddy-ssl: 22 | driver: glusterfs:latest 23 | name: "shared-volume/caddy-ssl" 24 | 25 | services: 26 | caddy: 27 | image: jmb12686/caddy:latest 28 | ports: 29 | - target: 80 30 | published: 81 31 | mode: host 32 | - target: 443 33 | published: 444 34 | mode: host 35 | networks: 36 | - internal_net 37 | volumes: 38 | - caddy-ssl:/root/.caddy 39 | - type: bind 40 | source: /home/pi/GeoLite2-Country.mmdb 41 | target: /data/GeoLite2-Country.mmdb 42 | configs: 43 | - source: redirect-hosts 44 | target: /redirect_hosts.txt 45 | - source: caddyfile 46 | target: /etc/Caddyfile 47 | secrets: 48 | - google-client-conf-secret 49 | deploy: 50 | mode: replicated 51 | replicas: 1 52 | placement: 53 | constraints: 54 | - node.hostname == raspberrypi-delta 55 | -------------------------------------------------------------------------------- /unifi-controller/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | unifi: 5 | image: 
jmb12686/unifi:v5.12.66 6 | init: true 7 | environment: 8 | TZ: "America/New_York" 9 | RUNAS_UID0: "false" 10 | UNIFI_UID: 1000 11 | UNIFI_GID: 1000 12 | JVM_MAX_HEAP_SIZE: "375M" 13 | ports: 14 | - "3478:3478/udp" # STUN 15 | - "6789:6789/tcp" # Speed test 16 | - "8080:8080/tcp" # Device/ controller comm. 17 | - "8443:8443/tcp" # Controller GUI/API as seen in a web browser 18 | - "8880:8880/tcp" # HTTP portal redirection 19 | - "8843:8843/tcp" # HTTPS portal redirection 20 | - "10001:10001/udp" # AP discovery 21 | volumes: 22 | - unifi-controller-data:/unifi 23 | networks: 24 | - unifi 25 | deploy: 26 | mode: replicated 27 | replicas: 1 28 | resources: 29 | limits: 30 | memory: 1500M 31 | reservations: 32 | memory: 1500M 33 | stop_grace_period: 60s 34 | log-pipe: 35 | image: bash 36 | command: bash -c 'tail -F /unifi/log/*.log' 37 | volumes: 38 | - unifi-controller-data:/unifi 39 | deploy: 40 | mode: replicated 41 | replicas: 1 42 | 43 | 44 | networks: 45 | unifi: 46 | driver: overlay 47 | 48 | volumes: 49 | unifi-controller-data: 50 | driver: glusterfs:latest 51 | name: "shared-volume/unifi-controller-data" -------------------------------------------------------------------------------- /prometheus/alertmanager/conf/alertmanager.yml: -------------------------------------------------------------------------------- 1 | route: 2 | group_by: [...] 3 | receiver: 'default-receiver' 4 | # All alerts that do not match the following child routes 5 | # will remain at the root node and be dispatched to 'default-receiver'. 
6 | routes: 7 | # All alerts with alertname=DeadMansSnitch 8 | # are dispatched to the deadmanssnitch receiver webhook 9 | - match: 10 | alertname: DeadMansSnitch 11 | receiver: deadmanssnitch 12 | repeat_interval: 5m 13 | - match: 14 | alertname: Healthcheck.io 15 | receiver: healthcheckio 16 | repeat_interval: 30s 17 | group_interval: 1s 18 | 19 | receivers: 20 | - name: 'deadmanssnitch' 21 | webhook_configs: 22 | - send_resolved: false 23 | url: 'https://nosnch.in/SNITCH_URL' 24 | - name: 'healthcheckio' 25 | webhook_configs: 26 | - send_resolved: false 27 | url: 'https://hc-ping.com/UNIQUE_PING_URL' 28 | - name: 'default-receiver' 29 | email_configs: 30 | - send_resolved: true 31 | to: $EMAIL_TO_ACCOUNT 32 | from: $GMAIL_ACCOUNT 33 | smarthost: smtp.gmail.com:587 34 | auth_identity: "$GMAIL_ACCOUNT" 35 | auth_username: "$GMAIL_ACCOUNT" 36 | auth_password: "$GMAIL_AUTH_TOKEN" 37 | text: "{{ .CommonAnnotations.description }}" 38 | - send_resolved: true 39 | to: $EMAIL_TO_ACCOUNT_2 40 | from: $GMAIL_ACCOUNT 41 | smarthost: smtp.gmail.com:587 42 | auth_identity: "$GMAIL_ACCOUNT" 43 | auth_username: "$GMAIL_ACCOUNT" 44 | auth_password: "$GMAIL_AUTH_TOKEN" 45 | text: "{{ .CommonAnnotations.description }}" 46 | -------------------------------------------------------------------------------- /portainer/portainer-agent-stack.yml: -------------------------------------------------------------------------------- 1 | version: "3.2" 2 | 3 | services: 4 | agent: 5 | image: portainer/agent 6 | environment: 7 | # REQUIRED: Should be equal to the service name prefixed by "tasks." 
when 8 | # deployed inside an overlay network 9 | AGENT_CLUSTER_ADDR: tasks.agent 10 | CAP_HOST_MANAGEMENT: 1 11 | AGENT_SECRET: ${PORTAINER_AGENT_SECRET:?err} 12 | # AGENT_PORT: 9001 13 | # LOG_LEVEL: debug 14 | volumes: 15 | - /var/run/docker.sock:/var/run/docker.sock 16 | - /var/lib/docker/volumes:/var/lib/docker/volumes 17 | - /:/host 18 | networks: 19 | - agent_network 20 | deploy: 21 | mode: global 22 | resources: 23 | limits: 24 | memory: 128M 25 | reservations: 26 | memory: 64M 27 | 28 | portainer: 29 | image: portainer/portainer 30 | environment: 31 | AGENT_SECRET: ${PORTAINER_AGENT_SECRET:?err} 32 | command: -H tcp://tasks.agent:9001 --tlsskipverify 33 | # Do not publish ports - force access thru caddy_internal_net 34 | # ports: 35 | # - "9000:9000" 36 | # - "8000:8000" 37 | volumes: 38 | - portainer_data:/data 39 | networks: 40 | - agent_network 41 | - caddy_internal_net 42 | deploy: 43 | mode: replicated 44 | replicas: 1 45 | placement: 46 | constraints: [node.role == manager] 47 | resources: 48 | limits: 49 | memory: 256M 50 | reservations: 51 | memory: 128M 52 | 53 | networks: 54 | agent_network: 55 | driver: overlay 56 | attachable: true 57 | caddy_internal_net: 58 | external: true 59 | 60 | volumes: 61 | portainer_data: 62 | -------------------------------------------------------------------------------- /elk/elastalert/elastalert-test.yml: -------------------------------------------------------------------------------- 1 | # NOTE: This config is used when testing a rule 2 | 3 | # The elasticsearch hostname for metadata writeback 4 | # Note that every rule can have its own elasticsearch host 5 | es_host: elasticsearch 6 | 7 | # The elasticsearch port 8 | es_port: 9200 9 | 10 | # This is the folder that contains the rule yaml files 11 | # Any .yaml file will be loaded as a rule 12 | rules_folder: rules 13 | 14 | # How often ElastAlert will query elasticsearch 15 | # The unit can be anything from weeks to seconds 16 | run_every: 17 | seconds: 5 18 | 
19 | # ElastAlert will buffer results from the most recent 20 | # period of time, in case some log sources are not in real time 21 | buffer_time: 22 | minutes: 1 23 | 24 | # Optional URL prefix for elasticsearch 25 | #es_url_prefix: elasticsearch 26 | 27 | # Connect with TLS to elasticsearch 28 | #use_ssl: True 29 | 30 | # Verify TLS certificates 31 | #verify_certs: True 32 | 33 | # GET request with body is the default option for Elasticsearch. 34 | # If it fails for some reason, you can pass 'GET', 'POST' or 'source'. 35 | # See http://elasticsearch-py.readthedocs.io/en/master/connection.html?highlight=send_get_body_as#transport 36 | # for details 37 | #es_send_get_body_as: GET 38 | 39 | # Optional basic-auth username and password for elasticsearch 40 | #es_username: someusername 41 | #es_password: somepassword 42 | 43 | # The index on es_host which is used for metadata storage 44 | # This can be an unmapped index, but it is recommended that you run 45 | # elastalert-create-index to set a mapping 46 | writeback_index: elastalert_status 47 | 48 | # If an alert fails for some reason, ElastAlert will retry 49 | # sending the alert until this time period has elapsed 50 | alert_time_limit: 51 | days: 2 -------------------------------------------------------------------------------- /shutdown-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | exec 1> >(logger -s -t $(basename $0)) 2>&1 4 | 5 | for host in `sudo docker node ls --filter "role=manager" --format="{{.Hostname}}"` 6 | do 7 | echo "Draining node: $host" 8 | CMD_OUTPUT=`sudo docker node update --availability drain $host` 9 | echo "CMD_OUTPUT=$CMD_OUTPUT" 10 | done 11 | 12 | echo "Drained Manager Nodes...waiting 30 seconds to stabilize...." 
13 | sleep 30 14 | 15 | 16 | for host in `sudo docker node ls --filter "role=worker" --format="{{.Hostname}}"` 17 | do 18 | echo "Draining node: $host" 19 | CMD_OUTPUT=`sudo docker node update --availability drain $host` 20 | echo "CMD_OUTPUT=$CMD_OUTPUT" 21 | done 22 | 23 | echo "Drained Worker Nodes...waiting 30 seconds to stabilize...." 24 | sleep 30 25 | 26 | echo "remove stateful docker stacks [mon & elk] to allow graceful cluster rejoin upon reboot" 27 | CMD_OUTPUT=`sudo docker stack rm elk` 28 | echo "Removed elk stack, CMD_OUTPUT=$CMD_OUTPUT" 29 | CMD_OUTPUT=`sudo docker stack rm mon` 30 | echo "Removed mon stack, CMD_OUTPUT=$CMD_OUTPUT" 31 | 32 | 33 | ## TODO: Need consistent user login name (ubuntu vs pi) 34 | # echo "Shutting down worker nodes" 35 | # for host in `sudo docker node ls --filter "role=worker" --format="{{.Hostname}}"` 36 | # do 37 | # echo "shutting down node: $node" 38 | # CMD_OUTPUT=`sudo shut` 39 | # echo "CMD_OUTPUT=$CMD_OUTPUT" 40 | # done 41 | 42 | #TODO These need to be IPs, DNS is dead at this point 43 | echo "Shutting down worker nodes" 44 | ssh pi@raspberrypi-beta sudo shutdown now 45 | ssh pi@raspberrypi-delta-3 sudo shutdown now 46 | ssh ubuntu@raspberrypi-delta-5 sudo shutdown now 47 | ssh pi@raspberrypicharlie sudo shutdown now 48 | 49 | echo "Shutting down manager nodes" 50 | ssh ubuntu@raspberrypi-delta-2 sudo shutdown now 51 | ssh ubuntu@raspberrypi-delta-4 sudo shutdown now 52 | 53 | 54 | -------------------------------------------------------------------------------- /elk/elastalert/elastalert.yml: -------------------------------------------------------------------------------- 1 | # The elasticsearch hostname for metadata writeback 2 | # Note that every rule can have its own elasticsearch host 3 | es_host: elasticsearch 4 | 5 | # The elasticsearch port 6 | es_port: 9200 7 | 8 | # This is the folder that contains the rule yaml files 9 | # Any .yaml file will be loaded as a rule 10 | rules_folder: rules 11 | 12 | # How 
often ElastAlert will query elasticsearch 13 | # The unit can be anything from weeks to seconds 14 | run_every: 15 | seconds: 10 16 | 17 | # ElastAlert will buffer results from the most recent 18 | # period of time, in case some log sources are not in real time 19 | buffer_time: 20 | minutes: 1 21 | 22 | # Optional URL prefix for elasticsearch 23 | #es_url_prefix: elasticsearch 24 | 25 | # Connect with TLS to elasticsearch 26 | #use_ssl: True 27 | 28 | # Verify TLS certificates 29 | # verify_certs: True 30 | 31 | # GET request with body is the default option for Elasticsearch. 32 | # If it fails for some reason, you can pass 'GET', 'POST' or 'source'. 33 | # See http://elasticsearch-py.readthedocs.io/en/master/connection.html?highlight=send_get_body_as#transport 34 | # for details 35 | #es_send_get_body_as: GET 36 | 37 | # Optional basic-auth username and password for elasticsearch 38 | #es_username: someusername 39 | #es_password: somepassword 40 | 41 | # The index on es_host which is used for metadata storage 42 | # This can be an unmapped index, but it is recommended that you run 43 | # elastalert-create-index to set a mapping 44 | writeback_index: elastalert_status 45 | 46 | # If an alert fails for some reason, ElastAlert will retry 47 | # sending the alert until this time period has elapsed 48 | alert_time_limit: 49 | days: 2 50 | 51 | smtp_host: 'smtp.gmail.com' 52 | smtp_port: 465 53 | smtp_ssl: true 54 | smtp_auth_file: '/run/secrets/elastalert_smtp_auth_file' 55 | from_addr: 'john.belisle.cloud.2019@gmail.com' -------------------------------------------------------------------------------- /gitlab/README.md: -------------------------------------------------------------------------------- 1 | # gitlab server and runner deployment 2 | 3 | ## Deploy the Server 4 | 5 | 1. Create a file called `root_password.txt` in this directory and put your gitlab `root` password in it. The compose stack will add the file as a secret upon deploy. 6 | 7 | 2. 
Deploy the stack: 8 | 9 | ```bash 10 | $ sudo docker stack deploy -c server-docker-compose.yml gitlab-server 11 | ``` 12 | 13 | ## Deploy the Runner stack 14 | 15 | 1. Find your registration token in GitLab by navigating to: "Your project" > "Settings" > "CI/CD" > "Runners settings" > "Specific Runners" (look for registration token). Register it as the `GITLAB_REGISTRATION_TOKEN` secret (substitute your token for the placeholder): 16 | 17 | ```bash 18 | $ printf '%s' 'YOUR_REGISTRATION_TOKEN' | sudo docker secret create GITLAB_REGISTRATION_TOKEN - 19 | ``` 20 | 21 | 2. Find your personal access token in GitLab by navigating to: "Your user account" > "Settings" > "Access Tokens" > "Create personal access token" (for api). Register it as the `GITLAB_PERSONAL_ACCESS_TOKEN` secret (substitute your token for the placeholder): 22 | 23 | ```bash 24 | $ printf '%s' 'YOUR_PERSONAL_ACCESS_TOKEN' | sudo docker secret create GITLAB_PERSONAL_ACCESS_TOKEN - 25 | ``` 26 | 27 | 3. Deploy the stack: 28 | 29 | ```bash 30 | $ sudo docker stack deploy --compose-file runner-docker-compose.yml gitlab-runner 31 | ``` 32 | 33 | 34 | -------------------------------------------------------------------------------- /prometheus/prometheus/conf/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | 5 | external_labels: 6 | monitor: 'promswarm' 7 | 8 | rule_files: 9 | - "swarm_node.rules.yml" 10 | - "swarm_task.rules.yml" 11 | - "snitch.rules.yml" 12 | - "healthcheck-io.rules.yml" 13 | 14 | alerting: 15 | alertmanagers: 16 | - dns_sd_configs: 17 | - names: 18 | - 'tasks.alertmanager' 19 | type: 'A' 20 | port: 9093 21 | 22 | scrape_configs: 23 | - job_name: 'prometheus' 24 | static_configs: 25 | - targets: ['localhost:9090'] 26 | 27 | - job_name: 'dockerd-exporter' 28 | dns_sd_configs: 29 | - names: 30 | - 'tasks.dockerd-exporter' 31 | type: 'A' 32 | port: 9323 33 | 34 | - job_name: 'cadvisor' 35 | dns_sd_configs: 36 | - names: 37 | - 'tasks.cadvisor' 38 | type: 'A' 39 | port: 8080 40 | 41 | - job_name: 'node-exporter' 42 | dns_sd_configs: 43 | - names: 44 | - 
'tasks.node-exporter' 45 | type: 'A' 46 | port: 9100 47 | 48 | - job_name: 'doh-status-probe' 49 | metrics_path: /probe 50 | params: 51 | module: [http_2xx_expect_1] # Look for a HTTP 200 response and expect '1' in body. 52 | static_configs: 53 | - targets: 54 | # - http://prometheus.io # Target to probe with http. 55 | - https://bbd96f23-eda8-465d-b190-6ddf056cae66.is-doh.cloudflareresolve.com/resolvertest # Target to probe with https. 56 | # - http://example.com:8080 # Target to probe with http on port 8080. 57 | relabel_configs: 58 | - source_labels: [__address__] 59 | target_label: __param_target 60 | - source_labels: [__param_target] 61 | target_label: instance 62 | - target_label: __address__ 63 | replacement: doh-status-probe:9115 # The blackbox exporter's real hostname:port. 64 | 65 | 66 | -------------------------------------------------------------------------------- /prometheus/blackbox-exporter/blackbox.yml: -------------------------------------------------------------------------------- 1 | # http://localhost:9115/probe?target=https://bbd96f23-eda8-465d-b190-6ddf056cae66.is-dot.cloudflareresolve.com/resolvertest&module=http_2xx 2 | # http://localhost:9115/probe?target=https://bbd96f23-eda8-465d-b190-6ddf056cae66.is-dot.cloudflareresolve.com/resolvertest&module=http_2xx_example 3 | modules: 4 | http_2xx_expect_1: 5 | prober: http 6 | timeout: 5s 7 | http: 8 | valid_http_versions: ["HTTP/1.1", "HTTP/2"] 9 | valid_status_codes: [] # Defaults to 2xx 10 | method: GET 11 | # headers: 12 | # Host: vhost.example.com 13 | # Accept-Language: en-US 14 | # Origin: example.com 15 | # no_follow_redirects: false 16 | # fail_if_ssl: false 17 | # fail_if_not_ssl: false 18 | # fail_if_body_matches_regexp: 19 | # - "Could not connect to database" 20 | fail_if_body_not_matches_regexp: 21 | - "^1$" 22 | preferred_ip_protocol: "ip4" # defaults to "ip6" 23 | ip_protocol_fallback: false # no fallback to "ip6" 24 | # fail_if_header_matches: # Verifies that no cookies are set 
25 | # - header: Set-Cookie 26 | # allow_missing: true 27 | # regexp: '.*' 28 | # fail_if_header_not_matches: 29 | # - header: Access-Control-Allow-Origin 30 | # regexp: '(\*|example\.com)' 31 | # tls_config: 32 | # insecure_skip_verify: false 33 | # preferred_ip_protocol: "ip4" # defaults to "ip6" 34 | # ip_protocol_fallback: false # no fallback to "ip6" 35 | 36 | 37 | 38 | # curl 'https://bbd96f23-eda8-465d-b190-6ddf056cae66.is-doh.cloudflareresolve.com/resolvertest' -H 'Accept: */*' -H 'Referer: https://www.cloudflare.com/ssl/encrypted-sni/' -H 'Origin: https://www.cloudflare.com' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36' -H 'DNT: 1' -H 'Sec-Fetch-Mode: cors' --compressed -------------------------------------------------------------------------------- /prometheus/prometheus/rules/swarm_task.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: swarm_task.rules.yml 3 | rules: 4 | - alert: ProbeFailed 5 | expr: probe_success == 0 6 | for: 3m 7 | labels: 8 | severity: error 9 | annotations: 10 | summary: "Probe failed {{ $labels.job }} (instance {{ $labels.instance }})" 11 | description: "Probe failed\n VALUE = {{ $value }}\n JOB: {{ $labels.job }}\n LABELS: {{ $labels }}" 12 | - alert: ExporterDown 13 | expr: up == 0 14 | for: 5m 15 | labels: 16 | severity: warning 17 | annotations: 18 | summary: "Exporter down {{ $labels.job }} (instance {{ $labels.instance }})" 19 | description: "Prometheus exporter down\n VALUE = {{ $value }}\n JOB: {{ $labels.job }}\n LABELS: {{ $labels }}" 20 | - alert: task_high_cpu_usage_200 21 | expr: sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m])) 22 | BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) 23 | * 100 > 200 24 | for: 5m 25 | annotations: 26 | description: '{{ 
$labels.container_label_com_docker_swarm_task_name }} on ''{{ 27 | $labels.container_label_com_docker_swarm_node_id }}'' CPU usage is at {{ humanize 28 | $value}}%.' 29 | summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name 30 | }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' 31 | - alert: task_high_memory_usage_3g 32 | expr: sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) 33 | BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) > 3e+09 34 | for: 1m 35 | annotations: 36 | description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{ 37 | $labels.container_label_com_docker_swarm_node_id }}'' memory usage is {{ humanize 38 | $value}}.' 39 | summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name 40 | }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RaspberryPi Docker Swarm Stacks 2 | 3 | ![Custom badge](https://img.shields.io/endpoint?url=https%3A%2F%2Fhealthchecks.io%2Fbadge%2F60ea1ee3-cc42-4799-9a68-e08d81%2FlrujVX8j.shields) 4 | 5 | A collection of Docker Stacks that I run on my home Raspberry Pi Docker Swarm cluster. 6 | 7 | * **Prometheus** - Full metrics and monitoring pipeline. Includes Docker, container, and node based metric collection, alerting, and visualization w/ **Grafana** 8 | * **pihole** - Network wide adblocker implementing DNS over HTTPS (DoH) via **cloudflared** proxy. 9 | * **portainer** - Docker Swarm cluster management UI. 10 | * **GitLab** - GitLab Omnibus deployment with GitLab CI/CD Runner. 11 | * **Elastic (ELK) Stack** - Logging aggregation, analysis, search, and visualization stack. Comprised of **Elasticsearch**, **Kibana**, and **Filebeat**. 
12 | * **Unifi Controller** - Wireless network management software solution from Ubiquiti Networks for administration of Unifi network gear. 13 | 14 | 15 | 16 | ## Setup and Install 17 | Clone the repo, `cd` into each directory and run: 18 | ```bash 19 | sudo docker stack deploy --compose-file=$FILE_NAME $STACK_NAME 20 | ``` 21 | 22 | **Note** - Read thru README in each dir for setup and configuration details of each stack. 23 | 24 | ## Multiarch Docker Images 25 | 26 | Many of the open source products used here do not have vendor supported ARM compatible Docker images or are published under different Docker Hub repositories / tags. Some vendor supported images do have ARM support, but are not fully compatible with Docker Swarm clustering. To overcome these limitations, the following projects were created and use [Docker buildx](https://docs.docker.com/buildx/working-with-buildx/) to publish native multi-architecture images (tutorial [here](https://www.docker.com/blog/multi-arch-images/)). 
Check out these repositories for further information: 27 | 28 | * [jmb12686/docker-cadvisor](https://github.com/jmb12686/docker-cadvisor) 29 | * [jmb12686/node-exporter](https://github.com/jmb12686/node-exporter) 30 | * [jmb12686/docker-swarm-alertmanager](https://github.com/jmb12686/docker-swarm-alertmanager) 31 | * [jmb12686/docker-socat](https://github.com/jmb12686/docker-socat) 32 | * [jmb12686/docker-elasticsearch](https://github.com/jmb12686/docker-elasticsearch) 33 | * [jmb12686/docker-kibana](https://github.com/jmb12686/docker-kibana) 34 | * [jmb12686/docker-filebeat](https://github.com/jmb12686/docker-filebeat) 35 | 36 | Special shout out to these open source ARM compatible projects used: 37 | 38 | * [crazy-max/docker-cloudflared](https://github.com/crazy-max/docker-cloudflared) 39 | * [pi-hole/docker-pi-hole](https://github.com/pi-hole/docker-pi-hole) 40 | * [ulm0/gitlab](https://github.com/ulm0/gitlab) 41 | * [klud/gitlab-runner](https://hub.docker.com/r/klud/gitlab-runner/) 42 | 43 | -------------------------------------------------------------------------------- /pihole/docker-compose-stack.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | # https://github.com/pi-hole/docker-pi-hole/blob/master/README.md 4 | 5 | services: 6 | pihole: 7 | # Pinned to arm/v7 digest pihole/pihole:v5.1.1 8 | image: pihole/pihole@sha256:f26dc1beaec171b2e53e107643fb2ea7dfbb0afea0e9572671d6737b986a8870 9 | hostname: "{{.Node.Hostname}}-{{.Task.ID}}" 10 | deploy: 11 | replicas: 2 12 | update_config: 13 | delay: 30s 14 | resources: 15 | limits: 16 | memory: 200M 17 | reservations: 18 | memory: 100M 19 | # Temporarily disable deployment to Ubuntu 20.04 arm64 nodes due to bug https://github.com/pi-hole/docker-pi-hole/issues/593 20 | placement: 21 | constraints: 22 | - "node.hostname!=raspberrypicharlie" 23 | max_replicas_per_node: 1 24 | # For DHCP it is recommended to remove these ports and instead add: network_mode: 
"host" 25 | ports: 26 | - target: 53 27 | published: 53 28 | protocol: tcp 29 | - target: 53 30 | published: 53 31 | protocol: udp 32 | - target: 67 33 | published: 67 34 | protocol: udp 35 | - target: 80 36 | published: 80 37 | protocol: tcp 38 | - target: 443 39 | published: 443 40 | protocol: tcp 41 | environment: 42 | TZ: "America/New_York" 43 | WEBPASSWORD: "${PIHOLE_WEBPASSWORD:?err}" 44 | DNS1: "172.18.0.1#5053" 45 | DNS2: "172.18.0.1#5053" 46 | DNS_FQDN_REQUIRED: "true" 47 | DNS_BOGUS_PRIV: "true" 48 | DNSSEC: "false" 49 | CONDITIONAL_FORWARDING: "true" 50 | CONDITIONAL_FORWARDING_IP: "192.168.0.1" 51 | CONDITIONAL_FORWARDING_DOMAIN: "home.local" 52 | CONDITIONAL_FORWARDING_REVERSE: "0.168.192.in-addr.arpa" 53 | # Volumes store your data between container upgrades 54 | volumes: 55 | - etc-pihole:/etc/pihole/ 56 | - etc-dnsmasq.d:/etc/dnsmasq.d/ 57 | # run `touch ./var-log/pihole.log` first unless you like errors 58 | - var-log:/var/log/ 59 | dns: 60 | - "127.0.0.1" 61 | - "1.1.1.1" # fallback 62 | 63 | cloudflared: 64 | # Pinned to arm/v7 digest for crazymax/cloudflared:2020.7.4 65 | image: crazymax/cloudflared@sha256:84be412b20d462cad62a34efd417f5ed5edc546e97ac2a2c8094c2556031daff 66 | deploy: 67 | replicas: 2 68 | placement: 69 | max_replicas_per_node: 1 70 | update_config: 71 | delay: 30s 72 | ports: 73 | - target: 5053 74 | published: 5053 75 | protocol: udp 76 | - target: 49312 77 | published: 49312 78 | protocol: tcp 79 | environment: 80 | - "TZ=America/New_York" 81 | - "TUNNEL_DNS_UPSTREAM=https://1.1.1.1/dns-query,https://1.0.0.1/dns-query" 82 | volumes: 83 | etc-pihole: {} 84 | etc-dnsmasq.d: {} 85 | var-log: {} 86 | -------------------------------------------------------------------------------- /gitlab/server-docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | networks: 4 | caddy_internal_net: 5 | external: true 6 | runner_network: 7 | driver: overlay 8 | attachable: true 
9 | 10 | volumes: 11 | gitlab-data: 12 | driver: glusterfs:latest 13 | name: "shared-volume/gitlab/data" 14 | gitlab-config: 15 | driver: glusterfs:latest 16 | name: "shared-volume/gitlab/config" 17 | gitlab-logs: {} 18 | gitlab-runner-config: {} 19 | 20 | secrets: 21 | gitlab_root_password: 22 | file: ./root_password.txt 23 | # gitlab_omniauth_providers: 24 | # file: ./gitlab_omniauth_providers.txt 25 | 26 | services: 27 | gitlab: 28 | image: jmb12686/gitlab:13.1.0 29 | networks: 30 | - caddy_internal_net 31 | - runner_network 32 | volumes: 33 | - gitlab-data:/var/opt/gitlab 34 | - gitlab-logs:/var/log/gitlab 35 | - gitlab-config:/etc/gitlab 36 | environment: 37 | GITLAB_OMNIBUS_CONFIG: | 38 | external_url 'https://gitlab.int.belisleonline.com' # Address to access gitlab from the outside internet 39 | gitlab_rails['initial_root_password'] = File.read('/run/secrets/gitlab_root_password') 40 | ## Override default detection of the 'external_url' to enable usage of external load balancer and TLS termination 41 | ## https://docs.gitlab.com/omnibus/settings/nginx.html#supporting-proxied-ssl 42 | nginx['listen_port'] = 80 43 | nginx['listen_https'] = false 44 | 45 | # Disable Prometheus node_exporter inside Docker. 
46 | node_exporter['enable'] = false 47 | 48 | ## To completely disable prometheus, and all of it's exporters, set to false 49 | prometheus_monitoring['enable'] = false 50 | 51 | ## Set Unicorn timeout and lower processes (2 is the lowest allowed at this moment) 52 | puma['max_threads'] = 2 53 | puma['min_threads'] = 1 54 | puma['worker_processes'] = 2 55 | puma['worker_timeout'] = 60 56 | 57 | ## Set Sidekiq timeout and lower its concurrency to the lowest allowed 58 | sidekiq['shutdown_timeout'] = 4 59 | sidekiq['concurrency'] = 5 60 | 61 | # registry_external_url 'https://registry.gitlab.int.belisleonline.com' 62 | # registry_nginx['listen_port'] = 5001 63 | # registry_nginx['listen_https'] = false 64 | 65 | # gitlab_rails['omniauth_providers'] = File.read('/run/secrets/gitlab_omniauth_providers') 66 | 67 | secrets: 68 | - gitlab_root_password 69 | # - gitlab_omniauth_providers 70 | deploy: 71 | mode: replicated 72 | replicas: 1 73 | resources: 74 | limits: 75 | memory: 4000M 76 | reservations: 77 | memory: 2048M 78 | stop_grace_period: 60s 79 | healthcheck: 80 | test: ["CMD", "/opt/gitlab/bin/gitlab-healthcheck", "--fail", "--max-time", "15"] 81 | interval: 60s 82 | timeout: 30s 83 | retries: 8 84 | start_period: 4m -------------------------------------------------------------------------------- /prometheus/prometheus/rules/swarm_node.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_node.rules.yml 3 | rules: 4 | # absent(((time() - container_last_seen{name=~"pihole_pihole.*"}) < 5)) 5 | ##count(count(container_tasks_state{container_label_com_docker_swarm_node_id =~".+"}) by (container_label_com_docker_swarm_node_id)) < 3 6 | ##count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~".+"}) < 3 7 | - alert: piholeMissing 8 | expr: (count(time() - container_last_seen{name=~".*pihole_pihole.*"} < 30) 
OR vector(0)) < 2 9 | for: 30s 10 | labels: 11 | severity: error 12 | annotations: 13 | description: Less than 2 pihole containers exist! Check status of home DNS setup! 14 | summary: pihole container missing 15 | - alert: less_than_6_nodes 16 | expr: count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~".+"}) < 6 17 | for: 5m 18 | labels: 19 | severity: error 20 | annotations: 21 | description: Number of Swarm nodes is less than 6, check setup to maintain cluster health. 22 | summary: Swarm node went down, check setup. 23 | - alert: less_than_3_swarm_managers 24 | expr: sum(swarm_node_manager) < 3 25 | for: 5m 26 | labels: 27 | severity: error 28 | annotations: 29 | description: Number of Swarm Manager Nodes is less than 3!!! Check cluster to maintain a quorum! 30 | summary: Swarm Manager Node went down, check cluster health. 31 | - alert: node_cpu_usage 32 | expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) 33 | node_meta * 100) BY (node_name)) > 75 34 | for: 5m 35 | labels: 36 | severity: warning 37 | annotations: 38 | description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize 39 | $value}}%. 40 | summary: CPU alert for Swarm node '{{ $labels.node_name }}' 41 | - alert: node_memory_usage 42 | expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) 43 | * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 44 | for: 1m 45 | labels: 46 | severity: warning 47 | annotations: 48 | description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize 49 | $value}}%. 
50 | summary: Memory alert for Swarm node '{{ $labels.node_name }}' 51 | - alert: node_disk_usage 52 | expr: ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) 53 | * 100 / node_filesystem_size_bytes{mountpoint="/"}) * ON(instance) GROUP_LEFT(node_name) 54 | node_meta > 85 55 | for: 1m 56 | labels: 57 | severity: warning 58 | annotations: 59 | description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize 60 | $value}}%. 61 | summary: Disk alert for Swarm node '{{ $labels.node_name }}' 62 | - alert: node_disk_fill_rate_6h 63 | expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 6 * 3600) * ON(instance) 64 | GROUP_LEFT(node_name) node_meta < 0 65 | for: 1h 66 | labels: 67 | severity: critical 68 | annotations: 69 | description: Swarm node {{ $labels.node_name }} disk is going to fill up in 70 | 6h. 71 | summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' 72 | -------------------------------------------------------------------------------- /caddy/conf/Caddyfile: -------------------------------------------------------------------------------- 1 | (ip-filter) { 2 | ipfilter / { 3 | rule allow 4 | ip 192.168.0.1 5 | database /data/GeoLite2-Country.mmdb 6 | country US 7 | } 8 | } 9 | 10 | 11 | https://auth.int.belisleonline.com:443 { 12 | import ip-filter 13 | errors stderr 14 | log / stdout "{combined}" 15 | tls jmb186@gmail.com 16 | redir 302 { 17 | if {path} is / 18 | / /login 19 | } 20 | 21 | login { 22 | import /run/secrets/google-client-conf-secret 23 | redirect_check_referer false 24 | redirect_host_file /redirect_hosts.txt 25 | cookie_domain int.belisleonline.com 26 | } 27 | } 28 | 29 | (int-auth) { 30 | jwt { 31 | token_source cookie jwt_token 32 | path / 33 | redirect https://auth.int.belisleonline.com/login?backTo=https%3A%2F%2F{host}{rewrite_uri_escaped} 34 | allow sub jmb186@gmail.com 35 | } 36 | } 37 | 38 | https://code.int.belisleonline.com:443 { 39 | import ip-filter 40 | 
import int-auth 41 | errors stderr 42 | log / stdout "{combined}" 43 | tls jmb186@gmail.com 44 | proxy / http://vscode-server:8443 { 45 | transparent 46 | websocket 47 | } 48 | } 49 | 50 | https://gitlab.int.belisleonline.com:443 { 51 | import ip-filter 52 | errors stderr 53 | log / stdout "{combined}" 54 | tls jmb186@gmail.com 55 | proxy / http://gitlab:80 { 56 | transparent 57 | } 58 | } 59 | 60 | https://unifi.int.belisleonline.com:443 { 61 | import ip-filter 62 | import int-auth 63 | errors stderr 64 | log / stdout "{combined}" 65 | tls jmb186@gmail.com 66 | proxy / https://raspi-swarm.home.local:8443 { 67 | transparent 68 | websocket 69 | insecure_skip_verify 70 | } 71 | } 72 | 73 | 74 | https://prometheus.int.belisleonline.com:443 { 75 | import ip-filter 76 | import int-auth 77 | errors stderr 78 | log / stdout "{combined}" 79 | tls jmb186@gmail.com 80 | proxy / http://prometheus:9090 { 81 | transparent 82 | } 83 | } 84 | 85 | https://kibana.int.belisleonline.com:443 { 86 | import ip-filter 87 | import int-auth 88 | errors stderr 89 | log / stdout "{combined}" 90 | tls jmb186@gmail.com 91 | proxy / http://kibana:5601 { 92 | transparent 93 | } 94 | } 95 | 96 | https://alertmanager.int.belisleonline.com:443 { 97 | import ip-filter 98 | import int-auth 99 | errors stderr 100 | log / stdout "{combined}" 101 | tls jmb186@gmail.com 102 | proxy / http://alertmanager:9093 { 103 | transparent 104 | } 105 | } 106 | 107 | https://grafana.int.belisleonline.com:443 { 108 | import ip-filter 109 | import int-auth 110 | errors stderr 111 | log / stdout "{combined}" 112 | tls jmb186@gmail.com 113 | proxy / http://grafana:3000 { 114 | transparent 115 | } 116 | } 117 | 118 | https://portainer.int.belisleonline.com:443 { 119 | import ip-filter 120 | import int-auth 121 | errors stderr 122 | log / stdout "{combined}" 123 | tls jmb186@gmail.com 124 | proxy / http://portainer:9000 { 125 | transparent 126 | websocket 127 | } 128 | } 129 | 130 | 
-------------------------------------------------------------------------------- /elk/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | 5 | elasticsearch: 6 | image: jmb12686/elasticsearch:v7.7.1 7 | configs: 8 | - source: elastic_config 9 | target: /usr/share/elasticsearch/config/elasticsearch.yml 10 | volumes: 11 | - /mnt/ssd/elasticsearch-data:/usr/share/elasticsearch/data 12 | - elasticsearch-snapshot:/mnt/elasticsearch-snapshot 13 | - /etc/localtime:/etc/localtime:ro 14 | - /etc/timezone:/etc/timezone:ro 15 | environment: 16 | ES_JAVA_OPTS: "-Xmx1500m -Xms1500m" 17 | ELASTIC_PASSWORD: changeme 18 | # discovery.type: single-node 19 | TZ: America/New_York 20 | networks: 21 | - elk 22 | deploy: 23 | mode: replicated 24 | replicas: 1 25 | placement: 26 | constraints: 27 | - node.labels.ssd==true 28 | resources: 29 | limits: 30 | memory: 2500M 31 | reservations: 32 | memory: 2000M 33 | 34 | elastalert: 35 | image: jmb12686/elastalert:v3.1 36 | # ports: 37 | # - 3030:3030 38 | # - 3333:3333 39 | configs: 40 | - source: elastalert_config 41 | target: config/config.json 42 | - source: elastalert_elastalert 43 | target: /opt/elastalert/config.yaml 44 | - source: elastalert_elastalert_test 45 | target: /opt/elastalert/config-test.yaml 46 | secrets: 47 | - elastalert_smtp_auth_file 48 | volumes: 49 | - elastalert-rules:/opt/elastalert/rules 50 | networks: 51 | - elk 52 | deploy: 53 | mode: replicated 54 | replicas: 1 55 | placement: 56 | constraints: 57 | - node.hostname != raspberrypi-beta 58 | 59 | 60 | kibana: 61 | image: jmb12686/kibana:v7.7.1 62 | # Do not publish ports - force access thru caddy_internal_net 63 | # ports: 64 | # - 5601:5601 65 | 66 | # Legacy - custom runtime commmands to install plugins, spikes CPU and RAM causing infinite crashes... 
67 | # command: /bin/bash -c "/opt/kibana/bin/kibana" 68 | # command: /bin/bash -c "rm -rf /opt/kibana/optimize/* && /opt/kibana/bin/kibana" 69 | # command: /bin/bash -c "/opt/kibana/bin/kibana-plugin install https://github.com/bitsensor/elastalert-kibana-plugin/releases/download/1.1.0/elastalert-kibana-plugin-1.1.0-7.4.1.zip && /opt/kibana/bin/kibana" 70 | 71 | environment: 72 | NODE_OPTIONS: "--max-old-space-size=2048" 73 | TZ: America/New_York 74 | volumes: 75 | - /etc/localtime:/etc/localtime:ro 76 | - /etc/timezone:/etc/timezone:ro 77 | configs: 78 | - source: kibana_config 79 | target: /opt/kibana/config/kibana.yml 80 | networks: 81 | - elk 82 | - caddy_internal_net 83 | deploy: 84 | mode: replicated 85 | replicas: 1 86 | resources: 87 | limits: 88 | memory: 2048M 89 | reservations: 90 | memory: 1024M 91 | 92 | filebeat: 93 | image: jmb12686/filebeat:v7.7.1 94 | hostname: "{{.Node.Hostname}}" 95 | user: root 96 | networks: 97 | - elk 98 | configs: 99 | - source: filebeat_config 100 | target: /usr/share/filebeat/filebeat.yml 101 | volumes: 102 | - filebeat:/usr/share/filebeat/data 103 | - /var/run/docker.sock:/var/run/docker.sock 104 | - /var/lib/docker/containers/:/var/lib/docker/containers/:ro 105 | - /var/log/:/var/log/:ro 106 | environment: # list form must be KEY=VALUE; "KEY:VALUE" does not assign a value 107 | - ELASTICSEARCH_HOST=elasticsearch 108 | - KIBANA_HOST=kibana 109 | command: ["--strict.perms=false"] 110 | deploy: 111 | mode: global 112 | 113 | configs: 114 | elastic_config: 115 | name: elastic_config-${CONFIG_VERSION:-0} 116 | file: ./elasticsearch/config/elasticsearch.yml 117 | kibana_config: 118 | name: kibana_config-${CONFIG_VERSION:-0} 119 | file: ./kibana/config/kibana.yml 120 | filebeat_config: 121 | name: filebeat_config-${CONFIG_VERSION:-0} 122 | file: ./filebeat/config/filebeat.yml 123 | elastalert_config: 124 | name: elastalert_config-${CONFIG_VERSION:-0} 125 | file: ./elastalert/config.json 126 | elastalert_elastalert: 127 | name: elastalert_elastalert-${CONFIG_VERSION:-0} 128 | file: 
./elastalert/elastalert.yml 129 | elastalert_elastalert_test: 130 | name: elastalert_elastalert_test-${CONFIG_VERSION:-0} 131 | file: ./elastalert/elastalert-test.yml 132 | 133 | secrets: 134 | elastalert_smtp_auth_file: 135 | file: ./elastalert/smtp_auth_file 136 | 137 | networks: 138 | elk: 139 | driver: overlay 140 | caddy_internal_net: 141 | external: true 142 | 143 | volumes: 144 | filebeat: {} 145 | elasticsearch-data: {} 146 | elasticsearch-snapshot: 147 | driver: glusterfs:latest 148 | name: "shared-volume/elasticsearch-snapshot" 149 | elastalert-rules: 150 | driver: glusterfs:latest 151 | name: "shared-volume/elastalert-rules" 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /gitlab/runner-docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | secrets: 4 | 5 | # Find your registration token at: "Your project" > "Settings" > "CI/CD" > "Runners settings" > "Specific Runners" (look for registration token) 6 | # Register it as `GITLAB_REGISTRATION_TOKEN`: `docker secret create GITLAB_REGISTRATION_TOKEN YOUR_REGISTRATION_TOKEN` 7 | GITLAB_REGISTRATION_TOKEN: 8 | external: true 9 | # Find your personal access token at: "Your user account" > "Settings" > "Access Tokens" > "Create personal access token" (for api) 10 | # Register it as `GITLAB_PERSONAL_ACCESS_TOKEN`: `docker secret create GITLAB_PERSONAL_ACCESS_TOKEN ` 11 | GITLAB_PERSONAL_ACCESS_TOKEN: 12 | external: true 13 | 14 | networks: 15 | gitlab-server_runner_network: ## Created by 'gitlab-server' stack 16 | external: true 17 | 18 | services: 19 | 20 | # Gitlab Runner - https://gitlab.com/gitlab-org/gitlab-runner 21 | runner: 22 | image: klud/gitlab-runner:13.1.0-alpine # ARM compatible image 23 | environment: 24 | - CONCURRENT=4 25 | - REGISTER_LOCKED=1 26 | - REGISTER_NON_INTERACTIVE=1 27 | - RUNNER_EXECUTOR=docker 28 | - DOCKER_IMAGE=alpine:latest 29 | - 
DOCKER_VOLUMES=/var/run/docker.sock:/var/run/docker.sock 30 | - RUNNER_NAME=docker 31 | - API_URL=http://gitlab:80/api/v4 32 | - CI_SERVER_URL=http://gitlab:80/ci 33 | entrypoint: "bash" 34 | secrets: 35 | - GITLAB_REGISTRATION_TOKEN 36 | command: | 37 | -c ' 38 | set -e 39 | 40 | export REGISTRATION_TOKEN="$$(cat /run/secrets/GITLAB_REGISTRATION_TOKEN)" 41 | printf "REGISTRATION_TOKEN loaded from /run/secrets/GITLAB_REGISTRATION_TOKEN" # do not echo the secret value into service logs 42 | printf "\\n" 43 | 44 | printf "Registering runner...\\n" 45 | gitlab-runner register --non-interactive --tag-list "local" 46 | printf "\\n" 47 | 48 | printf "List runners...\\n" 49 | gitlab-runner list 50 | printf "\\n" 51 | 52 | printf "Manually editing runner configuration...\\n" 53 | sed -i "s/^concurrent = .*/concurrent = $${CONCURRENT}/" /etc/gitlab-runner/config.toml 54 | sed -i "/^\[session_server.*/a \ listen_address = \"0.0.0.0:8093\"\n\ advertise_address = \"raspi-swarm.home.local:8093\"" /etc/gitlab-runner/config.toml 55 | 56 | printf "Running runner...\\n" 57 | gitlab-runner run --user=gitlab-runner --working-directory=/home/gitlab-runner 58 | 59 | ' 60 | volumes: 61 | - /var/run/docker.sock:/var/run/docker.sock 62 | deploy: 63 | mode: replicated 64 | replicas: 1 65 | ports: 66 | - 8093:8093 67 | networks: 68 | - gitlab-server_runner_network 69 | healthcheck: 70 | test: ["CMD-SHELL", "gitlab-runner verify --name docker 2>&1 | grep -q \"is alive\""] 71 | start_period: 10s 72 | interval: 10s 73 | timeout: 10s 74 | retries: 10 75 | 76 | # Gitlab Manager to unregister GitLab Runners 77 | manager: 78 | image: alpine:latest 79 | environment: 80 | - API_URL=http://gitlab:80/api/v4 81 | - CI_SERVER_URL=http://gitlab:80/ci 82 | secrets: 83 | - GITLAB_PERSONAL_ACCESS_TOKEN 84 | entrypoint: sh 85 | command: | 86 | -c ' 87 | set -e 88 | printf "Installing dependencies...\\n" 89 | apk --no-cache add curl jq 90 | sleep 60 91 | printf "\\n" 92 | 93 | export PERSONAL_ACCESS_TOKEN="$$(cat /run/secrets/GITLAB_PERSONAL_ACCESS_TOKEN)" 94 | while true; do 95 | 
printf "Checking runners...\\n" 96 | curl -sS --header "PRIVATE-TOKEN: $${PERSONAL_ACCESS_TOKEN}" "$${API_URL}/runners/all?per_page=100" | \ 97 | jq -c ".[] | select(\"online\"==.status) | .id" | \ 98 | while read RUNNER_ID; do 99 | printf "Runner $${RUNNER_ID} is online\\n" 100 | done 101 | curl -sS --header "PRIVATE-TOKEN: $${PERSONAL_ACCESS_TOKEN}" "$${API_URL}/runners/all?per_page=100" | \ 102 | jq -c ".[] | select(\"online\"!=.status) | .id" | \ 103 | while read RUNNER_ID; do 104 | printf "Deleting runner $${RUNNER_ID}...\\n" 105 | curl -sS --request DELETE --header "PRIVATE-TOKEN: $${PERSONAL_ACCESS_TOKEN}" "$${API_URL}/runners/$${RUNNER_ID}" 106 | done 107 | printf "All offline runners deleted\\n" 108 | printf "Waiting for 24 hours...\\n" 109 | sleep 24h 110 | done 111 | printf "\\n" 112 | ' 113 | deploy: 114 | mode: replicated 115 | replicas: 1 116 | healthcheck: 117 | test: ["CMD-SHELL", "command -v curl"] 118 | start_period: 10s 119 | interval: 10s 120 | timeout: 10s 121 | retries: 10 122 | 123 | # NOT ARM COMPATIBLE 124 | # Gitlab Runner Docker Cleanup - https://gitlab.com/gitlab-org/gitlab-runner-docker-cleanup 125 | # cleaner: 126 | # image: quay.io/gitlab/gitlab-runner-docker-cleanup 127 | # environment: 128 | # - CHECK_PATH=/data 129 | # - LOW_FREE_SPACE=10G 130 | # - EXPECTED_FREE_SPACE=20G 131 | # - LOW_FREE_FILES_COUNT=1048576 132 | # - EXPECTED_FREE_FILES_COUNT=2097152 133 | # - USE_DF=1 134 | # - CHECK_INTERVAL=10s 135 | # - RETRY_INTERVAL=30s 136 | # - DEFAULT_TTL=60m 137 | # volumes: 138 | # - /var/run/docker.sock:/var/run/docker.sock 139 | # - /data:/data 140 | # deploy: 141 | # restart_policy: 142 | # condition: any 143 | # labels: 144 | # - "traefik.enable=false" -------------------------------------------------------------------------------- /prometheus/README.md: -------------------------------------------------------------------------------- 1 | # prometheus 2 | 3 | Docker Swarm monitoring with [Prometheus](https://prometheus.io/), 4 | 
[Grafana](http://grafana.org/), 5 | [cAdvisor](https://github.com/google/cadvisor), 6 | [Node Exporter](https://github.com/prometheus/node_exporter), 7 | [Alert Manager](https://github.com/prometheus/alertmanager) 8 | 9 | Derived from [stefanprodan/swarmprom](https://github.com/stefanprodan/swarmprom) with added support for multi-architecture Docker images (amd64, arm64, and arm/v7). This implementation also eliminates the need to custom-build Prometheus, Grafana, and AlertManager images. 10 | 11 | ## Install 12 | 13 | Clone this repository and run the monitoring stack: 14 | 15 | ```bash 16 | $ git clone https://github.com/jmb12686/raspi-docker-stacks.git 17 | $ cd raspi-docker-stacks/prometheus 18 | 19 | docker stack deploy -c docker-compose.yml mon 20 | ``` 21 | 22 | Prerequisites: 23 | 24 | * Docker CE 17.09.0-ce or Docker EE 17.06.2-ee-3 25 | * Swarm cluster with one manager and a worker node 26 | * Docker engine experimental enabled and metrics address set to `0.0.0.0:9323` 27 | 28 | Services: 29 | 30 | * prometheus (metrics database) `http://<swarm-ip>:9090` 31 | * grafana (visualize metrics) `http://<swarm-ip>:3000` 32 | * node-exporter (host metrics collector) 33 | * cadvisor (containers metrics collector) 34 | * dockerd-exporter (Docker daemon metrics collector, requires Docker experimental metrics-addr to be enabled) 35 | * alertmanager (alerts dispatcher) `http://<swarm-ip>:9093` 36 | 37 | ### custom multiarch node-exporter 38 | 39 | When a node-exporter container starts `node-meta.prom` is generated with the following content: 40 | 41 | ```bash 42 | "node_meta{node_id=\"$NODE_ID\", node_name=\"$NODE_NAME\"} 1" 43 | ``` 44 | 45 | The node ID value is supplied via `{{.Node.ID}}` and the node name is extracted from the `/etc/hostname` 46 | file that is mounted inside the node-exporter container. 
47 | 48 | ```yaml 49 | node-exporter: 50 | image: jmb12686/swarmprom-node-exporter 51 | environment: 52 | - NODE_ID={{.Node.ID}} 53 | volumes: 54 | - /etc/hostname:/etc/nodename 55 | command: 56 | - '-collector.textfile.directory=/etc/node-exporter/' 57 | ``` 58 | 59 | Using the textfile command, you can instruct node-exporter to collect the `node_meta` metric. 60 | Now that you have a metric containing the Docker Swarm node ID and name, you can use it in promql queries. 61 | 62 | Let's say you want to find the available memory on each node, normally you would write something like this: 63 | 64 | ``` 65 | sum(node_memory_MemAvailable) by (instance) 66 | 67 | {instance="10.0.0.5:9100"} 889450496 68 | {instance="10.0.0.13:9100"} 1404162048 69 | {instance="10.0.0.15:9100"} 1406574592 70 | ``` 71 | 72 | The above result is not very helpful since you can't tell what Swarm node is behind the instance IP. 73 | So let's write that query taking into account the node_meta metric: 74 | 75 | ```sql 76 | sum(node_memory_MemAvailable * on(instance) group_left(node_id, node_name) node_meta) by (node_id, node_name) 77 | 78 | {node_id="wrdvtftteo0uaekmdq4dxrn14",node_name="swarm-manager-1"} 889450496 79 | {node_id="moggm3uaq8tax9ptr1if89pi7",node_name="swarm-worker-1"} 1404162048 80 | {node_id="vkdfx99mm5u4xl2drqhnwtnsv",node_name="swarm-worker-2"} 1406574592 81 | ``` 82 | 83 | This is much better. Instead of overlay IPs, now I can see the actual Docker Swarm nodes ID and hostname. Knowing the hostname of your nodes is useful for alerting as well. 84 | 85 | You can define an alert when available memory reaches 10%. You also will receive the hostname in the alert message 86 | and not some overlay IP that you can't correlate to a infrastructure item. 87 | 88 | Maybe you are wondering why you need the node ID if you have the hostname. The node ID will help you match 89 | node-exporter instances to cAdvisor instances. 
All metrics exported by cAdvisor have a label named `container_label_com_docker_swarm_node_id`, 90 | and this label can be used to filter containers metrics by Swarm nodes. 91 | 92 | Let's write a query to find out how many containers are running on a Swarm node. 93 | Knowing from the `node_meta` metric all the nodes IDs you can define a filter with them in Grafana. 94 | Assuming the filter is `$node_id` the container count query should look like this: 95 | 96 | ``` 97 | count(rate(container_last_seen{container_label_com_docker_swarm_node_id=~"$node_id"}[5m])) 98 | ``` 99 | 100 | ## Updating Configs in already deployed Stack 101 | This Prometheus stack utilizes many Docker Configs which are immutable. To support hot deployment and updating of the stack with updates to config files, a versioning mechanism has been added to the Compose file: 102 | ```yml 103 | configs: 104 | prometheus: 105 | name: prometheus-${CONFIG_VERSION:-0} 106 | file: ./prometheus/conf/prometheus.yml 107 | ``` 108 | 109 | Without the `CONFIG_VERSION` env variable set, compose defaults the config file "version" to 0. 110 | 111 | Deploying the stack with `CONFIG_VERSION=1`: 112 | ```bash 113 | $ export CONFIG_VERSION=1 114 | $ sudo -E bash -c 'docker stack deploy -c docker-compose.yml mon' 115 | Creating config prometheus-1 116 | Creating config alert_manager-1 117 | Updating service mon_prometheus (id: 2mdy9h720iofyaqort1qx1qu2) 118 | Updating service mon_alertmanager (id: 68sxctatkw7kwg1ywt4zcik4v) 119 | ``` 120 | 121 | Removing the Docker stack will dispose of all configs created, but there is currently no `prune` for Docker Configs, so maintenance / cleanup to remove unused configs should be periodically performed. 
122 | -------------------------------------------------------------------------------- /prometheus/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | networks: 4 | net: 5 | driver: overlay 6 | attachable: true 7 | caddy_internal_net: 8 | external: true 9 | 10 | volumes: 11 | prometheus: 12 | driver: glusterfs:latest 13 | name: "rpi-gfs" 14 | grafana: {} 15 | alertmanager: {} 16 | 17 | configs: 18 | prometheus: 19 | name: prometheus-${CONFIG_VERSION:-0} 20 | file: ./prometheus/conf/prometheus.yml 21 | node_rules: 22 | name: swarm_node_rules-${CONFIG_VERSION:-0} 23 | file: ./prometheus/rules/swarm_node.rules.yml 24 | task_rules: 25 | name: swarm_task_rules-${CONFIG_VERSION:-0} 26 | file: ./prometheus/rules/swarm_task.rules.yml 27 | meta_rules: 28 | file: ./prometheus/rules/snitch.rules.yml 29 | healthcheck-io_rules: 30 | file: ./prometheus/rules/healthcheck-io_rules.yml 31 | grafana_datasource: 32 | file: ./grafana/datasources/prometheus.yaml 33 | grafana_provisioning_dashboards: 34 | file: ./grafana/swarmprom_dashboards.yml 35 | grafana_dashboards_nodes: 36 | file: ./grafana/dashboards/swarmprom-nodes-dash.json 37 | grafana_dashboards_prometheus: 38 | file: ./grafana/dashboards/swarmprom-prometheus-dash.json 39 | grafana_dashboards_services: 40 | file: ./grafana/dashboards/swarmprom-services-dash.json 41 | alert_manager: 42 | name: alert_manager-${CONFIG_VERSION:-0} 43 | file: ./alertmanager/conf/prod.alertmanager.yml 44 | blackbox_exporter: 45 | file: ./blackbox-exporter/blackbox.yml 46 | 47 | services: 48 | doh-status-probe: 49 | image: prom/blackbox-exporter:v0.16.0 50 | networks: 51 | - net 52 | command: 53 | - "--config.file=/config/blackbox.yml" 54 | configs: 55 | - source: blackbox_exporter 56 | target: /config/blackbox.yml 57 | deploy: 58 | mode: replicated 59 | replicas: 1 60 | resources: 61 | limits: 62 | memory: 32M 63 | 64 | dockerd-exporter: 65 | image: jmb12686/socat 66 | 
networks: 67 | - net 68 | command: -d -d TCP-L:9323,fork TCP:172.18.0.1:9323 69 | deploy: 70 | mode: global 71 | resources: 72 | limits: 73 | memory: 32M 74 | 75 | cadvisor: 76 | image: jmb12686/cadvisor 77 | networks: 78 | - net 79 | command: -logtostderr -docker_only 80 | volumes: 81 | - /var/run/docker.sock:/var/run/docker.sock:ro 82 | - /:/rootfs:ro 83 | - /var/run:/var/run 84 | - /sys:/sys:ro 85 | - /var/lib/docker/:/var/lib/docker:ro 86 | # Do not publish ports - force access thru caddy_internal_net 87 | # ports: 88 | # - 9094:8080 89 | deploy: 90 | mode: global 91 | resources: 92 | limits: 93 | memory: 128M 94 | reservations: 95 | memory: 64M 96 | 97 | grafana: 98 | image: grafana/grafana:6.3.6 99 | networks: 100 | - net 101 | - caddy_internal_net 102 | environment: 103 | - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin} 104 | - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} 105 | - GF_USERS_ALLOW_SIGN_UP=false 106 | - GF_PATHS_PROVISIONING=/etc/grafana/provisioning/ 107 | #- GF_SERVER_ROOT_URL=${GF_SERVER_ROOT_URL:-localhost} 108 | #- GF_SMTP_ENABLED=${GF_SMTP_ENABLED:-false} 109 | #- GF_SMTP_FROM_ADDRESS=${GF_SMTP_FROM_ADDRESS:-grafana@test.com} 110 | #- GF_SMTP_FROM_NAME=${GF_SMTP_FROM_NAME:-Grafana} 111 | #- GF_SMTP_HOST=${GF_SMTP_HOST:-smtp:25} 112 | #- GF_SMTP_USER=${GF_SMTP_USER} 113 | #- GF_SMTP_PASSWORD=${GF_SMTP_PASSWORD} 114 | volumes: 115 | - grafana:/var/lib/grafana 116 | configs: 117 | - source: grafana_datasource 118 | target: /etc/grafana/provisioning/datasources/prometheus.yaml 119 | - source: grafana_provisioning_dashboards 120 | target: /etc/grafana/provisioning/dashboards/swarmprom_dashboards.yml 121 | - source: grafana_dashboards_nodes 122 | target: /etc/grafana/dashboards/swarmprom-nodes-dash.json 123 | - source: grafana_dashboards_prometheus 124 | target: /etc/grafana/dashboards/swarmprom-prometheus-dash.json 125 | - source: grafana_dashboards_services 126 | target: /etc/grafana/dashboards/swarmprom-services-dash.json 127 | # Do not 
publish ports - force access thru caddy_internal_net 128 | # ports: 129 | # - 3000:3000 130 | deploy: 131 | mode: replicated 132 | replicas: 1 133 | resources: 134 | limits: 135 | memory: 128M 136 | reservations: 137 | memory: 64M 138 | 139 | alertmanager: 140 | image: jmb12686/alertmanager-swarm 141 | networks: 142 | - net 143 | - caddy_internal_net 144 | environment: 145 | - SLACK_URL=${SLACK_URL:-https://hooks.slack.com/services/TOKEN} 146 | - SLACK_CHANNEL=${SLACK_CHANNEL:-general} 147 | - SLACK_USER=${SLACK_USER:-alertmanager} 148 | configs: 149 | - source: alert_manager 150 | target: /etc/alertmanager/alertmanager.yml 151 | command: 152 | - "--web.external-url=https://alertmanager.int.belisleonline.com" 153 | - "--config.file=/etc/alertmanager/alertmanager.yml" 154 | - "--storage.path=/alertmanager" 155 | - "--cluster.listen-address=0.0.0.0:8001" 156 | - "--cluster.peer=tasks.alertmanager:8001" 157 | # - '--log.level=debug' 158 | # - '--cluster.advertise-address=' --> This arg gets set in the jmb12686/alertmanager-swarm image entry-point script. 
159 | # It effectively gets set to the eth1 interface IP @ port 8001 to support Swarm networking 160 | # Do not publish ports - force access thru caddy_internal_net 161 | # ports: 162 | # - 9093:9093 163 | volumes: 164 | - alertmanager:/alertmanager 165 | deploy: 166 | mode: replicated 167 | replicas: 2 168 | placement: 169 | max_replicas_per_node: 1 170 | resources: 171 | limits: 172 | memory: 128M 173 | reservations: 174 | memory: 64M 175 | 176 | node-exporter: 177 | image: jmb12686/node-exporter 178 | networks: 179 | - net 180 | environment: 181 | - NODE_ID={{.Node.ID}} 182 | # Do not publish ports - force access thru caddy_internal_net 183 | # ports: 184 | # - 9100:9100 185 | volumes: 186 | - /proc:/host/proc:ro 187 | - /sys:/host/sys:ro 188 | - /:/rootfs:ro 189 | - /etc/hostname:/etc/nodename:ro 190 | command: 191 | - "--path.sysfs=/host/sys" 192 | - "--path.procfs=/host/proc" 193 | - "--path.rootfs=/rootfs" 194 | - "--collector.textfile.directory=/etc/node-exporter/" 195 | - "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)" 196 | - "--no-collector.ipvs" 197 | deploy: 198 | mode: global 199 | resources: 200 | limits: 201 | memory: 32M 202 | 203 | prometheus: 204 | image: prom/prometheus 205 | networks: 206 | - net 207 | - caddy_internal_net 208 | command: 209 | - "--config.file=/etc/prometheus/prometheus.yml" 210 | - "--storage.tsdb.path=/prometheus" 211 | - "--storage.tsdb.retention.time=${PROMETHEUS_RETENTION:-21d}" 212 | - "--storage.tsdb.retention.size=3GB" 213 | - "--web.external-url=https://prometheus.int.belisleonline.com" 214 | volumes: 215 | - prometheus:/prometheus 216 | configs: 217 | - source: prometheus 218 | target: /etc/prometheus/prometheus.yml 219 | - source: node_rules 220 | target: /etc/prometheus/swarm_node.rules.yml 221 | - source: task_rules 222 | target: /etc/prometheus/swarm_task.rules.yml 223 | - source: meta_rules 224 | target: /etc/prometheus/snitch.rules.yml 225 | - source: healthcheck-io_rules 226 | 
target: /etc/prometheus/healthcheck-io.rules.yml 227 | # Do not publish ports - force access thru caddy_internal_net 228 | # ports: 229 | # - 9090:9090 230 | deploy: 231 | mode: replicated 232 | replicas: 1 233 | placement: 234 | constraints: 235 | - node.platform.arch == aarch64 236 | resources: 237 | limits: 238 | memory: 1024M 239 | reservations: 240 | memory: 512M 241 | stop_grace_period: 60s 242 | -------------------------------------------------------------------------------- /prometheus/grafana/dashboards/swarmprom-prometheus-dash.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "$$hashKey": "object:698", 6 | "builtIn": 1, 7 | "datasource": "-- Grafana --", 8 | "enable": true, 9 | "hide": true, 10 | "iconColor": "rgba(0, 211, 255, 1)", 11 | "name": "Annotations & Alerts", 12 | "type": "dashboard" 13 | } 14 | ] 15 | }, 16 | "editable": true, 17 | "gnetId": null, 18 | "graphTooltip": 1, 19 | "links": [ 20 | { 21 | "icon": "info", 22 | "tags": [], 23 | "targetBlank": true, 24 | "title": "Grafana Docs", 25 | "tooltip": "", 26 | "type": "link", 27 | "url": "http://docs.grafana.org/" 28 | }, 29 | { 30 | "icon": "info", 31 | "tags": [], 32 | "targetBlank": true, 33 | "title": "Prometheus Docs", 34 | "type": "link", 35 | "url": "http://prometheus.io/docs/introduction/overview/" 36 | } 37 | ], 38 | "panels": [ 39 | { 40 | "aliasColors": { 41 | "prometheus": "#C15C17", 42 | "{instance=\"localhost:9090\",job=\"prometheus\"}": "#CCA300" 43 | }, 44 | "bars": false, 45 | "dashLength": 10, 46 | "dashes": false, 47 | "datasource": "Prometheus", 48 | "editable": true, 49 | "error": false, 50 | "fill": 0, 51 | "grid": {}, 52 | "gridPos": { 53 | "h": 5, 54 | "w": 6, 55 | "x": 0, 56 | "y": 0 57 | }, 58 | "id": 3, 59 | "legend": { 60 | "avg": false, 61 | "current": false, 62 | "max": false, 63 | "min": false, 64 | "show": true, 65 | "total": false, 66 | "values": false 67 | }, 68 | "lines": 
true, 69 | "linewidth": 1, 70 | "links": [], 71 | "nullPointMode": "connected", 72 | "percentage": false, 73 | "pointradius": 2, 74 | "points": false, 75 | "renderer": "flot", 76 | "seriesOverrides": [], 77 | "spaceLength": 10, 78 | "stack": false, 79 | "steppedLine": false, 80 | "targets": [ 81 | { 82 | "expr": "sum(irate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus\"}[5m]))", 83 | "format": "time_series", 84 | "hide": false, 85 | "interval": "", 86 | "intervalFactor": 2, 87 | "legendFormat": "samples", 88 | "metric": "", 89 | "refId": "A", 90 | "step": 20 91 | } 92 | ], 93 | "thresholds": [], 94 | "timeFrom": null, 95 | "timeShift": null, 96 | "title": "Samples Appended", 97 | "tooltip": { 98 | "shared": true, 99 | "sort": 0, 100 | "value_type": "cumulative" 101 | }, 102 | "type": "graph", 103 | "xaxis": { 104 | "buckets": null, 105 | "mode": "time", 106 | "name": null, 107 | "show": true, 108 | "values": [] 109 | }, 110 | "yaxes": [ 111 | { 112 | "format": "short", 113 | "logBase": 1, 114 | "max": null, 115 | "min": "0", 116 | "show": true 117 | }, 118 | { 119 | "format": "short", 120 | "logBase": 1, 121 | "max": null, 122 | "min": null, 123 | "show": true 124 | } 125 | ] 126 | }, 127 | { 128 | "aliasColors": {}, 129 | "bars": false, 130 | "dashLength": 10, 131 | "dashes": false, 132 | "datasource": "Prometheus", 133 | "editable": true, 134 | "error": false, 135 | "fill": 0, 136 | "grid": {}, 137 | "gridPos": { 138 | "h": 5, 139 | "w": 6, 140 | "x": 6, 141 | "y": 0 142 | }, 143 | "id": 14, 144 | "legend": { 145 | "avg": false, 146 | "current": false, 147 | "max": false, 148 | "min": false, 149 | "show": true, 150 | "total": false, 151 | "values": false 152 | }, 153 | "lines": true, 154 | "linewidth": 1, 155 | "links": [], 156 | "nullPointMode": "connected", 157 | "percentage": false, 158 | "pointradius": 5, 159 | "points": false, 160 | "renderer": "flot", 161 | "seriesOverrides": [], 162 | "spaceLength": 10, 163 | "stack": false, 164 | 
"steppedLine": false, 165 | "targets": [ 166 | { 167 | "expr": "topk(5, max(scrape_duration_seconds) by (job))", 168 | "format": "time_series", 169 | "interval": "", 170 | "intervalFactor": 2, 171 | "legendFormat": "{{job}}", 172 | "metric": "", 173 | "refId": "A", 174 | "step": 20 175 | } 176 | ], 177 | "thresholds": [], 178 | "timeFrom": null, 179 | "timeShift": null, 180 | "title": "Scrape Duration", 181 | "tooltip": { 182 | "shared": true, 183 | "sort": 0, 184 | "value_type": "cumulative" 185 | }, 186 | "type": "graph", 187 | "xaxis": { 188 | "buckets": null, 189 | "mode": "time", 190 | "name": null, 191 | "show": true, 192 | "values": [] 193 | }, 194 | "yaxes": [ 195 | { 196 | "format": "s", 197 | "logBase": 1, 198 | "max": null, 199 | "min": null, 200 | "show": true 201 | }, 202 | { 203 | "format": "short", 204 | "logBase": 1, 205 | "max": null, 206 | "min": null, 207 | "show": true 208 | } 209 | ] 210 | }, 211 | { 212 | "aliasColors": {}, 213 | "bars": false, 214 | "dashLength": 10, 215 | "dashes": false, 216 | "datasource": "Prometheus", 217 | "description": "", 218 | "fill": 0, 219 | "gridPos": { 220 | "h": 5, 221 | "w": 6, 222 | "x": 12, 223 | "y": 0 224 | }, 225 | "id": 16, 226 | "legend": { 227 | "avg": false, 228 | "current": false, 229 | "max": false, 230 | "min": false, 231 | "show": true, 232 | "total": false, 233 | "values": false 234 | }, 235 | "lines": true, 236 | "linewidth": 1, 237 | "links": [], 238 | "nullPointMode": "null", 239 | "percentage": false, 240 | "pointradius": 5, 241 | "points": false, 242 | "renderer": "flot", 243 | "seriesOverrides": [], 244 | "spaceLength": 10, 245 | "stack": false, 246 | "steppedLine": false, 247 | "targets": [ 248 | { 249 | "expr": "sum(process_resident_memory_bytes{job=\"prometheus\"})", 250 | "format": "time_series", 251 | "hide": false, 252 | "interval": "", 253 | "intervalFactor": 2, 254 | "legendFormat": "p8s process resident memory", 255 | "refId": "D", 256 | "step": 20 257 | }, 258 | { 259 | "expr": 
"process_virtual_memory_bytes{job=\"prometheus\"}", 260 | "format": "time_series", 261 | "hide": false, 262 | "intervalFactor": 2, 263 | "legendFormat": "virtual memory", 264 | "refId": "C", 265 | "step": 20 266 | } 267 | ], 268 | "thresholds": [], 269 | "timeFrom": null, 270 | "timeShift": null, 271 | "title": "Memory Profile", 272 | "tooltip": { 273 | "shared": true, 274 | "sort": 2, 275 | "value_type": "individual" 276 | }, 277 | "transparent": false, 278 | "type": "graph", 279 | "xaxis": { 280 | "buckets": null, 281 | "mode": "time", 282 | "name": null, 283 | "show": true, 284 | "values": [] 285 | }, 286 | "yaxes": [ 287 | { 288 | "format": "bytes", 289 | "label": "", 290 | "logBase": 1, 291 | "max": null, 292 | "min": "0", 293 | "show": true 294 | }, 295 | { 296 | "format": "short", 297 | "label": null, 298 | "logBase": 1, 299 | "max": null, 300 | "min": null, 301 | "show": true 302 | } 303 | ] 304 | }, 305 | { 306 | "cacheTimeout": null, 307 | "colorBackground": false, 308 | "colorValue": true, 309 | "colors": [ 310 | "rgba(50, 172, 45, 0.97)", 311 | "rgba(237, 129, 40, 0.89)", 312 | "rgba(245, 54, 54, 0.9)" 313 | ], 314 | "datasource": "Prometheus", 315 | "format": "none", 316 | "gauge": { 317 | "maxValue": 100, 318 | "minValue": 0, 319 | "show": false, 320 | "thresholdLabels": false, 321 | "thresholdMarkers": true 322 | }, 323 | "gridPos": { 324 | "h": 5, 325 | "w": 6, 326 | "x": 18, 327 | "y": 0 328 | }, 329 | "id": 37, 330 | "interval": null, 331 | "links": [], 332 | "mappingType": 1, 333 | "mappingTypes": [ 334 | { 335 | "name": "value to text", 336 | "value": 1 337 | }, 338 | { 339 | "name": "range to text", 340 | "value": 2 341 | } 342 | ], 343 | "maxDataPoints": 100, 344 | "nullPointMode": "connected", 345 | "nullText": null, 346 | "postfix": "", 347 | "postfixFontSize": "50%", 348 | "prefix": "", 349 | "prefixFontSize": "50%", 350 | "rangeMaps": [ 351 | { 352 | "from": "null", 353 | "text": "N/A", 354 | "to": "null" 355 | } 356 | ], 357 | 
"sparkline": { 358 | "fillColor": "rgba(31, 118, 189, 0.18)", 359 | "full": false, 360 | "lineColor": "rgb(31, 120, 193)", 361 | "show": false 362 | }, 363 | "tableColumn": "", 364 | "targets": [ 365 | { 366 | "expr": "prometheus_tsdb_wal_corruptions_total{job=\"prometheus\"}", 367 | "format": "time_series", 368 | "intervalFactor": 2, 369 | "legendFormat": "", 370 | "refId": "A", 371 | "step": 60 372 | } 373 | ], 374 | "thresholds": "0.1,1", 375 | "title": "WAL Corruptions", 376 | "type": "singlestat", 377 | "valueFontSize": "200%", 378 | "valueMaps": [ 379 | { 380 | "op": "=", 381 | "text": "None", 382 | "value": "0" 383 | } 384 | ], 385 | "valueName": "max" 386 | }, 387 | { 388 | "aliasColors": {}, 389 | "bars": false, 390 | "dashLength": 10, 391 | "dashes": false, 392 | "datasource": "Prometheus", 393 | "fill": 0, 394 | "gridPos": { 395 | "h": 5, 396 | "w": 6, 397 | "x": 0, 398 | "y": 5 399 | }, 400 | "id": 29, 401 | "legend": { 402 | "avg": false, 403 | "current": false, 404 | "max": false, 405 | "min": false, 406 | "show": true, 407 | "total": false, 408 | "values": false 409 | }, 410 | "lines": true, 411 | "linewidth": 1, 412 | "links": [], 413 | "nullPointMode": "null", 414 | "percentage": false, 415 | "pointradius": 5, 416 | "points": false, 417 | "renderer": "flot", 418 | "seriesOverrides": [], 419 | "spaceLength": 10, 420 | "stack": false, 421 | "steppedLine": false, 422 | "targets": [ 423 | { 424 | "expr": "sum(prometheus_tsdb_head_active_appenders{job=\"prometheus\"})", 425 | "format": "time_series", 426 | "interval": "", 427 | "intervalFactor": 2, 428 | "legendFormat": "active_appenders", 429 | "metric": "", 430 | "refId": "A", 431 | "step": 20 432 | }, 433 | { 434 | "expr": "sum(process_open_fds{job=\"prometheus\"})", 435 | "format": "time_series", 436 | "interval": "", 437 | "intervalFactor": 2, 438 | "legendFormat": "open_fds", 439 | "refId": "B", 440 | "step": 20 441 | } 442 | ], 443 | "thresholds": [], 444 | "timeFrom": null, 445 | "timeShift": 
null, 446 | "title": "Active Appenders", 447 | "tooltip": { 448 | "shared": true, 449 | "sort": 0, 450 | "value_type": "individual" 451 | }, 452 | "type": "graph", 453 | "xaxis": { 454 | "buckets": null, 455 | "mode": "time", 456 | "name": null, 457 | "show": true, 458 | "values": [] 459 | }, 460 | "yaxes": [ 461 | { 462 | "format": "short", 463 | "label": null, 464 | "logBase": 1, 465 | "max": null, 466 | "min": null, 467 | "show": true 468 | }, 469 | { 470 | "format": "short", 471 | "label": null, 472 | "logBase": 1, 473 | "max": null, 474 | "min": null, 475 | "show": false 476 | } 477 | ] 478 | }, 479 | { 480 | "aliasColors": { 481 | "prometheus": "#F9BA8F", 482 | "{instance=\"localhost:9090\",interval=\"5s\",job=\"prometheus\"}": "#F9BA8F" 483 | }, 484 | "bars": false, 485 | "dashLength": 10, 486 | "dashes": false, 487 | "datasource": "Prometheus", 488 | "editable": true, 489 | "error": false, 490 | "fill": 0, 491 | "grid": {}, 492 | "gridPos": { 493 | "h": 5, 494 | "w": 6, 495 | "x": 6, 496 | "y": 5 497 | }, 498 | "id": 2, 499 | "legend": { 500 | "avg": false, 501 | "current": false, 502 | "max": false, 503 | "min": false, 504 | "show": true, 505 | "total": false, 506 | "values": false 507 | }, 508 | "lines": true, 509 | "linewidth": 1, 510 | "links": [], 511 | "nullPointMode": "connected", 512 | "percentage": false, 513 | "pointradius": 5, 514 | "points": false, 515 | "renderer": "flot", 516 | "seriesOverrides": [], 517 | "spaceLength": 10, 518 | "stack": false, 519 | "steppedLine": false, 520 | "targets": [ 521 | { 522 | "expr": "prometheus_tsdb_blocks_loaded{job=\"prometheus\"}", 523 | "format": "time_series", 524 | "intervalFactor": 2, 525 | "legendFormat": "blocks", 526 | "refId": "A", 527 | "step": 20 528 | } 529 | ], 530 | "thresholds": [], 531 | "timeFrom": null, 532 | "timeShift": null, 533 | "title": "Blocks Loaded", 534 | "tooltip": { 535 | "shared": true, 536 | "sort": 0, 537 | "value_type": "cumulative" 538 | }, 539 | "type": "graph", 540 | 
"xaxis": { 541 | "buckets": null, 542 | "mode": "time", 543 | "name": null, 544 | "show": true, 545 | "values": [] 546 | }, 547 | "yaxes": [ 548 | { 549 | "format": "short", 550 | "logBase": 1, 551 | "max": null, 552 | "min": null, 553 | "show": true 554 | }, 555 | { 556 | "format": "short", 557 | "logBase": 1, 558 | "max": null, 559 | "min": null, 560 | "show": true 561 | } 562 | ] 563 | }, 564 | { 565 | "aliasColors": {}, 566 | "bars": false, 567 | "dashLength": 10, 568 | "dashes": false, 569 | "datasource": "Prometheus", 570 | "decimals": null, 571 | "description": "", 572 | "fill": 0, 573 | "gridPos": { 574 | "h": 5, 575 | "w": 6, 576 | "x": 12, 577 | "y": 5 578 | }, 579 | "id": 33, 580 | "legend": { 581 | "avg": false, 582 | "current": false, 583 | "max": false, 584 | "min": false, 585 | "show": true, 586 | "total": false, 587 | "values": false 588 | }, 589 | "lines": true, 590 | "linewidth": 1, 591 | "links": [], 592 | "nullPointMode": "connected", 593 | "percentage": false, 594 | "pointradius": 5, 595 | "points": false, 596 | "renderer": "flot", 597 | "seriesOverrides": [], 598 | "spaceLength": 10, 599 | "stack": false, 600 | "steppedLine": false, 601 | "targets": [ 602 | { 603 | "expr": "prometheus_tsdb_head_chunks{job=\"prometheus\"}", 604 | "format": "time_series", 605 | "interval": "", 606 | "intervalFactor": 2, 607 | "legendFormat": "chunks", 608 | "refId": "A", 609 | "step": 20 610 | } 611 | ], 612 | "thresholds": [], 613 | "timeFrom": null, 614 | "timeShift": null, 615 | "title": "Head Chunks", 616 | "tooltip": { 617 | "shared": true, 618 | "sort": 0, 619 | "value_type": "individual" 620 | }, 621 | "type": "graph", 622 | "xaxis": { 623 | "buckets": null, 624 | "mode": "time", 625 | "name": null, 626 | "show": true, 627 | "values": [] 628 | }, 629 | "yaxes": [ 630 | { 631 | "format": "short", 632 | "label": null, 633 | "logBase": 1, 634 | "max": null, 635 | "min": null, 636 | "show": true 637 | }, 638 | { 639 | "format": "bytes", 640 | "label": "", 641 
| "logBase": 1, 642 | "max": null, 643 | "min": null, 644 | "show": false 645 | } 646 | ] 647 | }, 648 | { 649 | "aliasColors": {}, 650 | "bars": false, 651 | "dashLength": 10, 652 | "dashes": false, 653 | "datasource": "Prometheus", 654 | "fill": 1, 655 | "gridPos": { 656 | "h": 5, 657 | "w": 6, 658 | "x": 18, 659 | "y": 5 660 | }, 661 | "id": 36, 662 | "legend": { 663 | "avg": false, 664 | "current": false, 665 | "max": false, 666 | "min": false, 667 | "show": true, 668 | "total": false, 669 | "values": false 670 | }, 671 | "lines": true, 672 | "linewidth": 1, 673 | "links": [], 674 | "nullPointMode": "null", 675 | "percentage": false, 676 | "pointradius": 5, 677 | "points": false, 678 | "renderer": "flot", 679 | "seriesOverrides": [ 680 | { 681 | "alias": "duration-p99", 682 | "yaxis": 2 683 | } 684 | ], 685 | "spaceLength": 10, 686 | "stack": false, 687 | "steppedLine": false, 688 | "targets": [ 689 | { 690 | "expr": "prometheus_tsdb_head_gc_duration_seconds{job=\"prometheus\",quantile=\"0.99\"}", 691 | "format": "time_series", 692 | "intervalFactor": 2, 693 | "legendFormat": "duration-p99", 694 | "refId": "A", 695 | "step": 20 696 | }, 697 | { 698 | "expr": "irate(prometheus_tsdb_head_gc_duration_seconds_count{job=\"prometheus\"}[5m])", 699 | "format": "time_series", 700 | "intervalFactor": 2, 701 | "legendFormat": "collections", 702 | "refId": "B", 703 | "step": 20 704 | } 705 | ], 706 | "thresholds": [], 707 | "timeFrom": null, 708 | "timeShift": null, 709 | "title": "Head Block GC Activity", 710 | "tooltip": { 711 | "shared": true, 712 | "sort": 0, 713 | "value_type": "individual" 714 | }, 715 | "type": "graph", 716 | "xaxis": { 717 | "buckets": null, 718 | "mode": "time", 719 | "name": null, 720 | "show": true, 721 | "values": [] 722 | }, 723 | "yaxes": [ 724 | { 725 | "format": "short", 726 | "label": null, 727 | "logBase": 1, 728 | "max": null, 729 | "min": "0", 730 | "show": true 731 | }, 732 | { 733 | "format": "s", 734 | "label": null, 735 | 
"logBase": 1, 736 | "max": null, 737 | "min": "0", 738 | "show": true 739 | } 740 | ] 741 | }, 742 | { 743 | "aliasColors": {}, 744 | "bars": false, 745 | "dashLength": 10, 746 | "dashes": false, 747 | "datasource": "Prometheus", 748 | "decimals": null, 749 | "description": "", 750 | "fill": 0, 751 | "gridPos": { 752 | "h": 5, 753 | "w": 8, 754 | "x": 0, 755 | "y": 10 756 | }, 757 | "id": 20, 758 | "legend": { 759 | "avg": false, 760 | "current": false, 761 | "max": false, 762 | "min": false, 763 | "show": true, 764 | "total": false, 765 | "values": false 766 | }, 767 | "lines": true, 768 | "linewidth": 1, 769 | "links": [], 770 | "nullPointMode": "connected", 771 | "percentage": false, 772 | "pointradius": 5, 773 | "points": false, 774 | "renderer": "flot", 775 | "seriesOverrides": [ 776 | { 777 | "alias": "duration-p99", 778 | "yaxis": 2 779 | } 780 | ], 781 | "spaceLength": 10, 782 | "stack": false, 783 | "steppedLine": false, 784 | "targets": [ 785 | { 786 | "expr": "histogram_quantile(0.99, sum(rate(prometheus_tsdb_compaction_duration_bucket{job=\"prometheus\"}[5m])) by (le))", 787 | "format": "time_series", 788 | "hide": false, 789 | "interval": "", 790 | "intervalFactor": 2, 791 | "legendFormat": "duration-{{p99}}", 792 | "refId": "A", 793 | "step": 20 794 | }, 795 | { 796 | "expr": "irate(prometheus_tsdb_compactions_total{job=\"prometheus\"}[5m])", 797 | "format": "time_series", 798 | "intervalFactor": 2, 799 | "legendFormat": "compactions", 800 | "refId": "B", 801 | "step": 20 802 | }, 803 | { 804 | "expr": "irate(prometheus_tsdb_compactions_failed_total{job=\"prometheus\"}[5m])", 805 | "format": "time_series", 806 | "intervalFactor": 2, 807 | "legendFormat": "failed", 808 | "refId": "C", 809 | "step": 20 810 | }, 811 | { 812 | "expr": "irate(prometheus_tsdb_compactions_triggered_total{job=\"prometheus\"}[5m])", 813 | "format": "time_series", 814 | "intervalFactor": 2, 815 | "legendFormat": "triggered", 816 | "refId": "D", 817 | "step": 20 818 | } 819 | ], 
820 | "thresholds": [], 821 | "timeFrom": null, 822 | "timeShift": null, 823 | "title": "Compaction Activity", 824 | "tooltip": { 825 | "shared": true, 826 | "sort": 0, 827 | "value_type": "individual" 828 | }, 829 | "type": "graph", 830 | "xaxis": { 831 | "buckets": null, 832 | "mode": "time", 833 | "name": null, 834 | "show": true, 835 | "values": [] 836 | }, 837 | "yaxes": [ 838 | { 839 | "format": "short", 840 | "label": null, 841 | "logBase": 1, 842 | "max": null, 843 | "min": "0", 844 | "show": true 845 | }, 846 | { 847 | "format": "s", 848 | "label": "", 849 | "logBase": 1, 850 | "max": null, 851 | "min": "0", 852 | "show": true 853 | } 854 | ] 855 | }, 856 | { 857 | "aliasColors": {}, 858 | "bars": false, 859 | "dashLength": 10, 860 | "dashes": false, 861 | "datasource": "Prometheus", 862 | "fill": 1, 863 | "gridPos": { 864 | "h": 5, 865 | "w": 8, 866 | "x": 8, 867 | "y": 10 868 | }, 869 | "id": 32, 870 | "legend": { 871 | "avg": false, 872 | "current": false, 873 | "max": false, 874 | "min": false, 875 | "show": true, 876 | "total": false, 877 | "values": false 878 | }, 879 | "lines": true, 880 | "linewidth": 1, 881 | "links": [], 882 | "nullPointMode": "null", 883 | "percentage": false, 884 | "pointradius": 5, 885 | "points": false, 886 | "renderer": "flot", 887 | "seriesOverrides": [], 888 | "spaceLength": 10, 889 | "stack": false, 890 | "steppedLine": false, 891 | "targets": [ 892 | { 893 | "expr": "rate(prometheus_tsdb_reloads_total{job=\"prometheus\"}[5m])", 894 | "format": "time_series", 895 | "intervalFactor": 2, 896 | "legendFormat": "reloads", 897 | "refId": "A", 898 | "step": 20 899 | }, 900 | { 901 | "expr": "rate(prometheus_tsdb_reloads_failures_total{job=\"prometheus\"}[5m])", 902 | "format": "time_series", 903 | "hide": false, 904 | "intervalFactor": 2, 905 | "legendFormat": "failures", 906 | "refId": "B", 907 | "step": 20 908 | } 909 | ], 910 | "thresholds": [], 911 | "timeFrom": null, 912 | "timeShift": null, 913 | "title": "Reload Count", 
914 | "tooltip": { 915 | "shared": true, 916 | "sort": 0, 917 | "value_type": "individual" 918 | }, 919 | "type": "graph", 920 | "xaxis": { 921 | "buckets": null, 922 | "mode": "time", 923 | "name": null, 924 | "show": true, 925 | "values": [] 926 | }, 927 | "yaxes": [ 928 | { 929 | "format": "short", 930 | "label": null, 931 | "logBase": 1, 932 | "max": null, 933 | "min": null, 934 | "show": true 935 | }, 936 | { 937 | "format": "short", 938 | "label": null, 939 | "logBase": 1, 940 | "max": null, 941 | "min": null, 942 | "show": true 943 | } 944 | ] 945 | }, 946 | { 947 | "aliasColors": {}, 948 | "bars": false, 949 | "dashLength": 10, 950 | "dashes": false, 951 | "datasource": "Prometheus", 952 | "fill": 0, 953 | "gridPos": { 954 | "h": 5, 955 | "w": 8, 956 | "x": 16, 957 | "y": 10 958 | }, 959 | "id": 38, 960 | "legend": { 961 | "avg": false, 962 | "current": false, 963 | "max": false, 964 | "min": false, 965 | "show": true, 966 | "total": false, 967 | "values": false 968 | }, 969 | "lines": true, 970 | "linewidth": 1, 971 | "links": [], 972 | "nullPointMode": "null", 973 | "percentage": false, 974 | "pointradius": 5, 975 | "points": false, 976 | "renderer": "flot", 977 | "seriesOverrides": [], 978 | "spaceLength": 10, 979 | "stack": false, 980 | "steppedLine": false, 981 | "targets": [ 982 | { 983 | "expr": "prometheus_engine_query_duration_seconds{job=\"prometheus\", quantile=\"0.99\"}", 984 | "format": "time_series", 985 | "intervalFactor": 2, 986 | "legendFormat": "{{slice}}_p99", 987 | "refId": "A", 988 | "step": 20 989 | } 990 | ], 991 | "thresholds": [], 992 | "timeFrom": null, 993 | "timeShift": null, 994 | "title": "Query Durations", 995 | "tooltip": { 996 | "shared": true, 997 | "sort": 0, 998 | "value_type": "individual" 999 | }, 1000 | "type": "graph", 1001 | "xaxis": { 1002 | "buckets": null, 1003 | "mode": "time", 1004 | "name": null, 1005 | "show": true, 1006 | "values": [] 1007 | }, 1008 | "yaxes": [ 1009 | { 1010 | "format": "short", 1011 | 
"label": null, 1012 | "logBase": 1, 1013 | "max": null, 1014 | "min": null, 1015 | "show": true 1016 | }, 1017 | { 1018 | "format": "short", 1019 | "label": null, 1020 | "logBase": 1, 1021 | "max": null, 1022 | "min": null, 1023 | "show": true 1024 | } 1025 | ] 1026 | }, 1027 | { 1028 | "aliasColors": {}, 1029 | "bars": false, 1030 | "dashLength": 10, 1031 | "dashes": false, 1032 | "datasource": "Prometheus", 1033 | "decimals": null, 1034 | "editable": true, 1035 | "error": false, 1036 | "fill": 0, 1037 | "grid": {}, 1038 | "gridPos": { 1039 | "h": 7, 1040 | "w": 12, 1041 | "x": 0, 1042 | "y": 15 1043 | }, 1044 | "id": 35, 1045 | "legend": { 1046 | "alignAsTable": false, 1047 | "avg": false, 1048 | "current": false, 1049 | "hideEmpty": true, 1050 | "max": false, 1051 | "min": false, 1052 | "show": true, 1053 | "total": false, 1054 | "values": false 1055 | }, 1056 | "lines": true, 1057 | "linewidth": 1, 1058 | "links": [], 1059 | "nullPointMode": "connected", 1060 | "percentage": false, 1061 | "pointradius": 5, 1062 | "points": false, 1063 | "renderer": "flot", 1064 | "seriesOverrides": [], 1065 | "spaceLength": 10, 1066 | "stack": false, 1067 | "steppedLine": false, 1068 | "targets": [ 1069 | { 1070 | "expr": "max(prometheus_rule_group_duration_seconds{job=\"prometheus\"}) by (quantile)", 1071 | "format": "time_series", 1072 | "interval": "", 1073 | "intervalFactor": 2, 1074 | "legendFormat": "{{quantile}}", 1075 | "refId": "A", 1076 | "step": 10 1077 | } 1078 | ], 1079 | "thresholds": [], 1080 | "timeFrom": null, 1081 | "timeShift": null, 1082 | "title": "Rule Group Eval Duration", 1083 | "tooltip": { 1084 | "shared": true, 1085 | "sort": 0, 1086 | "value_type": "cumulative" 1087 | }, 1088 | "type": "graph", 1089 | "xaxis": { 1090 | "buckets": null, 1091 | "mode": "time", 1092 | "name": null, 1093 | "show": true, 1094 | "values": [] 1095 | }, 1096 | "yaxes": [ 1097 | { 1098 | "format": "s", 1099 | "label": "", 1100 | "logBase": 1, 1101 | "max": null, 1102 | "min": 
null, 1103 | "show": true 1104 | }, 1105 | { 1106 | "format": "short", 1107 | "logBase": 1, 1108 | "max": null, 1109 | "min": null, 1110 | "show": true 1111 | } 1112 | ] 1113 | }, 1114 | { 1115 | "aliasColors": {}, 1116 | "bars": false, 1117 | "dashLength": 10, 1118 | "dashes": false, 1119 | "datasource": "Prometheus", 1120 | "fill": 1, 1121 | "gridPos": { 1122 | "h": 7, 1123 | "w": 12, 1124 | "x": 12, 1125 | "y": 15 1126 | }, 1127 | "id": 39, 1128 | "legend": { 1129 | "avg": false, 1130 | "current": false, 1131 | "max": false, 1132 | "min": false, 1133 | "show": true, 1134 | "total": false, 1135 | "values": false 1136 | }, 1137 | "lines": true, 1138 | "linewidth": 1, 1139 | "links": [], 1140 | "nullPointMode": "null", 1141 | "percentage": false, 1142 | "pointradius": 5, 1143 | "points": false, 1144 | "renderer": "flot", 1145 | "seriesOverrides": [], 1146 | "spaceLength": 10, 1147 | "stack": true, 1148 | "steppedLine": false, 1149 | "targets": [ 1150 | { 1151 | "expr": "rate(prometheus_rule_group_iterations_missed_total{job=\"prometheus\"}[5m])", 1152 | "format": "time_series", 1153 | "intervalFactor": 2, 1154 | "legendFormat": "missed", 1155 | "refId": "B", 1156 | "step": 10 1157 | }, 1158 | { 1159 | "expr": "rate(prometheus_rule_group_iterations_total{job=\"prometheus\"}[5m])", 1160 | "format": "time_series", 1161 | "intervalFactor": 2, 1162 | "legendFormat": "iterations", 1163 | "refId": "A", 1164 | "step": 10 1165 | } 1166 | ], 1167 | "thresholds": [], 1168 | "timeFrom": null, 1169 | "timeShift": null, 1170 | "title": "Rule Group Eval Activity", 1171 | "tooltip": { 1172 | "shared": true, 1173 | "sort": 0, 1174 | "value_type": "individual" 1175 | }, 1176 | "type": "graph", 1177 | "xaxis": { 1178 | "buckets": null, 1179 | "mode": "time", 1180 | "name": null, 1181 | "show": true, 1182 | "values": [] 1183 | }, 1184 | "yaxes": [ 1185 | { 1186 | "format": "short", 1187 | "label": null, 1188 | "logBase": 1, 1189 | "max": null, 1190 | "min": null, 1191 | "show": true 
1192 | }, 1193 | { 1194 | "format": "short", 1195 | "label": null, 1196 | "logBase": 1, 1197 | "max": null, 1198 | "min": null, 1199 | "show": true 1200 | } 1201 | ] 1202 | } 1203 | ], 1204 | "refresh": "1m", 1205 | "revision": "1.0", 1206 | "schemaVersion": 16, 1207 | "style": "dark", 1208 | "tags": [ 1209 | "prometheus" 1210 | ], 1211 | "templating": { 1212 | "list": [] 1213 | }, 1214 | "time": { 1215 | "from": "now-1h", 1216 | "to": "now" 1217 | }, 1218 | "timepicker": { 1219 | "now": true, 1220 | "refresh_intervals": [ 1221 | "5s", 1222 | "10s", 1223 | "30s", 1224 | "1m", 1225 | "5m", 1226 | "15m", 1227 | "30m", 1228 | "1h", 1229 | "2h", 1230 | "1d" 1231 | ], 1232 | "time_options": [ 1233 | "5m", 1234 | "15m", 1235 | "1h", 1236 | "6h", 1237 | "12h", 1238 | "24h", 1239 | "2d", 1240 | "7d", 1241 | "30d" 1242 | ] 1243 | }, 1244 | "timezone": "browser", 1245 | "title": "Prometheus 2.0 Stats", 1246 | "uid": "mGFfYSRiz", 1247 | "version": 1 1248 | } 1249 | -------------------------------------------------------------------------------- /prometheus/grafana/dashboards/swarmprom-services-dash.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "$$hashKey": "object:429", 6 | "builtIn": 1, 7 | "datasource": "-- Grafana --", 8 | "enable": true, 9 | "hide": true, 10 | "iconColor": "rgba(0, 211, 255, 1)", 11 | "name": "Annotations & Alerts", 12 | "type": "dashboard" 13 | } 14 | ] 15 | }, 16 | "description": "Docker Swarm stacks and services metrics", 17 | "editable": true, 18 | "gnetId": null, 19 | "graphTooltip": 0, 20 | "iteration": 1520585594614, 21 | "links": [], 22 | "panels": [ 23 | { 24 | "cacheTimeout": null, 25 | "colorBackground": false, 26 | "colorValue": false, 27 | "colors": [ 28 | "rgba(245, 54, 54, 0.9)", 29 | "rgba(237, 129, 40, 0.89)", 30 | "rgba(50, 172, 45, 0.97)" 31 | ], 32 | "datasource": null, 33 | "decimals": 0, 34 | "format": "none", 35 | "gauge": { 36 | "maxValue": 
100, 37 | "minValue": 0, 38 | "show": false, 39 | "thresholdLabels": false, 40 | "thresholdMarkers": true 41 | }, 42 | "gridPos": { 43 | "h": 4, 44 | "w": 6, 45 | "x": 0, 46 | "y": 0 47 | }, 48 | "hideTimeOverride": true, 49 | "id": 1, 50 | "interval": null, 51 | "links": [], 52 | "mappingType": 1, 53 | "mappingTypes": [ 54 | { 55 | "name": "value to text", 56 | "value": 1 57 | }, 58 | { 59 | "name": "range to text", 60 | "value": 2 61 | } 62 | ], 63 | "maxDataPoints": 100, 64 | "nullPointMode": "connected", 65 | "nullText": null, 66 | "postfix": "", 67 | "postfixFontSize": "50%", 68 | "prefix": "", 69 | "prefixFontSize": "50%", 70 | "rangeMaps": [ 71 | { 72 | "from": "null", 73 | "text": "N/A", 74 | "to": "null" 75 | } 76 | ], 77 | "sparkline": { 78 | "fillColor": "rgba(31, 118, 189, 0.18)", 79 | "full": false, 80 | "lineColor": "rgb(31, 120, 193)", 81 | "show": false 82 | }, 83 | "tableColumn": "", 84 | "targets": [ 85 | { 86 | "expr": "count(count(container_tasks_state{container_label_com_docker_swarm_node_id =~\"$node_id\"}) by (container_label_com_docker_swarm_node_id))", 87 | "format": "time_series", 88 | "intervalFactor": 2, 89 | "legendFormat": "", 90 | "refId": "A", 91 | "step": 2 92 | } 93 | ], 94 | "thresholds": "", 95 | "timeFrom": "1m", 96 | "title": "Nodes", 97 | "type": "singlestat", 98 | "valueFontSize": "80%", 99 | "valueMaps": [ 100 | { 101 | "op": "=", 102 | "text": "N/A", 103 | "value": "null" 104 | } 105 | ], 106 | "valueName": "avg" 107 | }, 108 | { 109 | "cacheTimeout": null, 110 | "colorBackground": false, 111 | "colorValue": false, 112 | "colors": [ 113 | "rgba(245, 54, 54, 0.9)", 114 | "rgba(237, 129, 40, 0.89)", 115 | "rgba(50, 172, 45, 0.97)" 116 | ], 117 | "datasource": null, 118 | "decimals": 0, 119 | "format": "none", 120 | "gauge": { 121 | "maxValue": 100, 122 | "minValue": 0, 123 | "show": false, 124 | "thresholdLabels": false, 125 | "thresholdMarkers": true 126 | }, 127 | "gridPos": { 128 | "h": 4, 129 | "w": 6, 130 | "x": 6, 131 | 
"y": 0 132 | }, 133 | "hideTimeOverride": true, 134 | "id": 21, 135 | "interval": null, 136 | "links": [], 137 | "mappingType": 1, 138 | "mappingTypes": [ 139 | { 140 | "name": "value to text", 141 | "value": 1 142 | }, 143 | { 144 | "name": "range to text", 145 | "value": 2 146 | } 147 | ], 148 | "maxDataPoints": 100, 149 | "nullPointMode": "connected", 150 | "nullText": null, 151 | "postfix": "", 152 | "postfixFontSize": "50%", 153 | "prefix": "", 154 | "prefixFontSize": "50%", 155 | "rangeMaps": [ 156 | { 157 | "from": "null", 158 | "text": "N/A", 159 | "to": "null" 160 | } 161 | ], 162 | "sparkline": { 163 | "fillColor": "rgba(31, 118, 189, 0.18)", 164 | "full": false, 165 | "lineColor": "rgb(31, 120, 193)", 166 | "show": false 167 | }, 168 | "tableColumn": "", 169 | "targets": [ 170 | { 171 | "expr": "count(count(container_tasks_state{container_label_com_docker_stack_namespace=~\".+\", container_label_com_docker_swarm_node_id=~\"$node_id\"}) by (container_label_com_docker_stack_namespace))", 172 | "format": "time_series", 173 | "intervalFactor": 2, 174 | "legendFormat": "", 175 | "refId": "A", 176 | "step": 2 177 | } 178 | ], 179 | "thresholds": "", 180 | "timeFrom": "1m", 181 | "title": "Stacks", 182 | "type": "singlestat", 183 | "valueFontSize": "80%", 184 | "valueMaps": [ 185 | { 186 | "op": "=", 187 | "text": "N/A", 188 | "value": "null" 189 | } 190 | ], 191 | "valueName": "avg" 192 | }, 193 | { 194 | "cacheTimeout": null, 195 | "colorBackground": false, 196 | "colorValue": false, 197 | "colors": [ 198 | "rgba(245, 54, 54, 0.9)", 199 | "rgba(237, 129, 40, 0.89)", 200 | "rgba(50, 172, 45, 0.97)" 201 | ], 202 | "datasource": null, 203 | "decimals": 0, 204 | "format": "none", 205 | "gauge": { 206 | "maxValue": 100, 207 | "minValue": 0, 208 | "show": false, 209 | "thresholdLabels": false, 210 | "thresholdMarkers": true 211 | }, 212 | "gridPos": { 213 | "h": 4, 214 | "w": 6, 215 | "x": 12, 216 | "y": 0 217 | }, 218 | "hideTimeOverride": true, 219 | "id": 20, 
220 | "interval": null, 221 | "links": [], 222 | "mappingType": 1, 223 | "mappingTypes": [ 224 | { 225 | "name": "value to text", 226 | "value": 1 227 | }, 228 | { 229 | "name": "range to text", 230 | "value": 2 231 | } 232 | ], 233 | "maxDataPoints": 100, 234 | "nullPointMode": "connected", 235 | "nullText": null, 236 | "postfix": "", 237 | "postfixFontSize": "50%", 238 | "prefix": "", 239 | "prefixFontSize": "50%", 240 | "rangeMaps": [ 241 | { 242 | "from": "null", 243 | "text": "N/A", 244 | "to": "null" 245 | } 246 | ], 247 | "sparkline": { 248 | "fillColor": "rgba(31, 118, 189, 0.18)", 249 | "full": false, 250 | "lineColor": "rgb(31, 120, 193)", 251 | "show": false 252 | }, 253 | "tableColumn": "", 254 | "targets": [ 255 | { 256 | "expr": "count(count(container_tasks_state{container_label_com_docker_swarm_service_name=~\".+\", container_label_com_docker_swarm_node_id=~\"$node_id\"}) by (container_label_com_docker_swarm_service_name))", 257 | "format": "time_series", 258 | "intervalFactor": 2, 259 | "refId": "A", 260 | "step": 2 261 | } 262 | ], 263 | "thresholds": "", 264 | "timeFrom": "1m", 265 | "timeShift": null, 266 | "title": "Services", 267 | "type": "singlestat", 268 | "valueFontSize": "80%", 269 | "valueMaps": [ 270 | { 271 | "op": "=", 272 | "text": "N/A", 273 | "value": "null" 274 | } 275 | ], 276 | "valueName": "avg" 277 | }, 278 | { 279 | "cacheTimeout": null, 280 | "colorBackground": false, 281 | "colorValue": false, 282 | "colors": [ 283 | "rgba(245, 54, 54, 0.9)", 284 | "rgba(237, 129, 40, 0.89)", 285 | "rgba(50, 172, 45, 0.97)" 286 | ], 287 | "datasource": null, 288 | "decimals": 0, 289 | "format": "none", 290 | "gauge": { 291 | "maxValue": 100, 292 | "minValue": 0, 293 | "show": false, 294 | "thresholdLabels": false, 295 | "thresholdMarkers": true 296 | }, 297 | "gridPos": { 298 | "h": 4, 299 | "w": 6, 300 | "x": 18, 301 | "y": 0 302 | }, 303 | "hideTimeOverride": true, 304 | "id": 7, 305 | "interval": null, 306 | "links": [], 307 | 
"mappingType": 1, 308 | "mappingTypes": [ 309 | { 310 | "name": "value to text", 311 | "value": 1 312 | }, 313 | { 314 | "name": "range to text", 315 | "value": 2 316 | } 317 | ], 318 | "maxDataPoints": 100, 319 | "nullPointMode": "connected", 320 | "nullText": null, 321 | "postfix": "", 322 | "postfixFontSize": "50%", 323 | "prefix": "", 324 | "prefixFontSize": "50%", 325 | "rangeMaps": [ 326 | { 327 | "from": "null", 328 | "text": "N/A", 329 | "to": "null" 330 | } 331 | ], 332 | "sparkline": { 333 | "fillColor": "rgba(31, 118, 189, 0.18)", 334 | "full": false, 335 | "lineColor": "rgb(31, 120, 193)", 336 | "show": false 337 | }, 338 | "tableColumn": "", 339 | "targets": [ 340 | { 341 | "expr": "count(rate(container_last_seen{container_label_com_docker_swarm_node_id=~\"$node_id\"}[5m])) ", 342 | "format": "time_series", 343 | "intervalFactor": 2, 344 | "refId": "A", 345 | "step": 2 346 | } 347 | ], 348 | "thresholds": "", 349 | "timeFrom": "1m", 350 | "title": "Containers", 351 | "type": "singlestat", 352 | "valueFontSize": "80%", 353 | "valueMaps": [ 354 | { 355 | "op": "=", 356 | "text": "N/A", 357 | "value": "null" 358 | } 359 | ], 360 | "valueName": "avg" 361 | }, 362 | { 363 | "aliasColors": {}, 364 | "bars": true, 365 | "dashLength": 10, 366 | "dashes": false, 367 | "datasource": null, 368 | "decimals": 0, 369 | "fill": 5, 370 | "gridPos": { 371 | "h": 7, 372 | "w": 12, 373 | "x": 0, 374 | "y": 4 375 | }, 376 | "id": 12, 377 | "legend": { 378 | "alignAsTable": true, 379 | "avg": false, 380 | "current": true, 381 | "hideEmpty": true, 382 | "hideZero": true, 383 | "max": false, 384 | "min": false, 385 | "rightSide": true, 386 | "show": true, 387 | "sort": "current", 388 | "sortDesc": true, 389 | "total": false, 390 | "values": true 391 | }, 392 | "lines": false, 393 | "linewidth": 1, 394 | "links": [], 395 | "nullPointMode": "null", 396 | "percentage": false, 397 | "pointradius": 5, 398 | "points": false, 399 | "renderer": "flot", 400 | "seriesOverrides": [], 
401 | "spaceLength": 10, 402 | "stack": true, 403 | "steppedLine": false, 404 | "targets": [ 405 | { 406 | "expr": "sum(rate(container_last_seen{container_label_com_docker_swarm_node_id=~\"$node_id\"}[5m])) by (container_label_com_docker_swarm_service_name)", 407 | "format": "time_series", 408 | "intervalFactor": 10, 409 | "legendFormat": "{{ container_label_com_docker_swarm_service_name }}", 410 | "refId": "A", 411 | "step": 10 412 | } 413 | ], 414 | "thresholds": [], 415 | "timeFrom": null, 416 | "timeShift": null, 417 | "title": "Service Tasks", 418 | "tooltip": { 419 | "shared": true, 420 | "sort": 2, 421 | "value_type": "individual" 422 | }, 423 | "type": "graph", 424 | "xaxis": { 425 | "buckets": null, 426 | "mode": "time", 427 | "name": null, 428 | "show": true, 429 | "values": [] 430 | }, 431 | "yaxes": [ 432 | { 433 | "format": "short", 434 | "label": null, 435 | "logBase": 1, 436 | "max": null, 437 | "min": null, 438 | "show": true 439 | }, 440 | { 441 | "format": "short", 442 | "label": null, 443 | "logBase": 1, 444 | "max": null, 445 | "min": null, 446 | "show": true 447 | } 448 | ] 449 | }, 450 | { 451 | "aliasColors": {}, 452 | "bars": false, 453 | "dashLength": 10, 454 | "dashes": false, 455 | "datasource": null, 456 | "decimals": 0, 457 | "fill": 1, 458 | "gridPos": { 459 | "h": 7, 460 | "w": 12, 461 | "x": 12, 462 | "y": 4 463 | }, 464 | "id": 32, 465 | "legend": { 466 | "alignAsTable": true, 467 | "avg": false, 468 | "current": true, 469 | "hideEmpty": true, 470 | "hideZero": true, 471 | "max": false, 472 | "min": false, 473 | "rightSide": true, 474 | "show": false, 475 | "sort": "current", 476 | "sortDesc": true, 477 | "total": false, 478 | "values": true 479 | }, 480 | "lines": true, 481 | "linewidth": 1, 482 | "links": [], 483 | "nullPointMode": "null", 484 | "percentage": false, 485 | "pointradius": 5, 486 | "points": false, 487 | "renderer": "flot", 488 | "seriesOverrides": [], 489 | "spaceLength": 10, 490 | "stack": false, 491 | 
"steppedLine": false, 492 | "targets": [ 493 | { 494 | "expr": "sum(increase(engine_daemon_health_checks_total[$interval]) * on(instance) group_left(node_id) swarm_node_info{node_id=~\"$node_id\"}) ", 495 | "format": "time_series", 496 | "intervalFactor": 10, 497 | "legendFormat": "checks", 498 | "refId": "A", 499 | "step": 10 500 | }, 501 | { 502 | "expr": "sum(increase(engine_daemon_health_checks_failed_total[$interval]) * on(instance) group_left(node_id) swarm_node_info{node_id=~\"$node_id\"}) ", 503 | "format": "time_series", 504 | "intervalFactor": 10, 505 | "legendFormat": "failed", 506 | "refId": "B", 507 | "step": 10 508 | } 509 | ], 510 | "thresholds": [], 511 | "timeFrom": null, 512 | "timeShift": null, 513 | "title": "Health Checks", 514 | "tooltip": { 515 | "shared": true, 516 | "sort": 2, 517 | "value_type": "individual" 518 | }, 519 | "type": "graph", 520 | "xaxis": { 521 | "buckets": null, 522 | "mode": "time", 523 | "name": null, 524 | "show": true, 525 | "values": [] 526 | }, 527 | "yaxes": [ 528 | { 529 | "format": "short", 530 | "label": null, 531 | "logBase": 1, 532 | "max": null, 533 | "min": null, 534 | "show": true 535 | }, 536 | { 537 | "format": "short", 538 | "label": null, 539 | "logBase": 1, 540 | "max": null, 541 | "min": null, 542 | "show": true 543 | } 544 | ] 545 | }, 546 | { 547 | "aliasColors": {}, 548 | "bars": false, 549 | "dashLength": 10, 550 | "dashes": false, 551 | "datasource": null, 552 | "decimals": 2, 553 | "fill": 1, 554 | "gridPos": { 555 | "h": 7, 556 | "w": 20, 557 | "x": 0, 558 | "y": 11 559 | }, 560 | "id": 22, 561 | "legend": { 562 | "alignAsTable": true, 563 | "avg": true, 564 | "current": false, 565 | "hideEmpty": true, 566 | "hideZero": true, 567 | "max": true, 568 | "min": true, 569 | "rightSide": true, 570 | "show": true, 571 | "sort": "avg", 572 | "sortDesc": true, 573 | "total": false, 574 | "values": true 575 | }, 576 | "lines": true, 577 | "linewidth": 1, 578 | "links": [], 579 | "nullPointMode": "null", 
580 | "percentage": false, 581 | "pointradius": 5, 582 | "points": false, 583 | "renderer": "flot", 584 | "seriesOverrides": [], 585 | "spaceLength": 10, 586 | "stack": true, 587 | "steppedLine": false, 588 | "targets": [ 589 | { 590 | "expr": "sum(irate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_node_id=~\"$node_id\", id=~\"/docker/.*\"}[1m])) by (container_label_com_docker_swarm_service_name) * 100 ", 591 | "format": "time_series", 592 | "intervalFactor": 2, 593 | "legendFormat": "{{container_label_com_docker_swarm_service_name}}", 594 | "refId": "A", 595 | "step": 2 596 | } 597 | ], 598 | "thresholds": [], 599 | "timeFrom": null, 600 | "timeShift": null, 601 | "title": "CPU usage by Service", 602 | "tooltip": { 603 | "shared": true, 604 | "sort": 2, 605 | "value_type": "individual" 606 | }, 607 | "type": "graph", 608 | "xaxis": { 609 | "buckets": null, 610 | "mode": "time", 611 | "name": null, 612 | "show": true, 613 | "values": [] 614 | }, 615 | "yaxes": [ 616 | { 617 | "format": "percent", 618 | "label": null, 619 | "logBase": 1, 620 | "max": null, 621 | "min": null, 622 | "show": true 623 | }, 624 | { 625 | "format": "short", 626 | "label": null, 627 | "logBase": 1, 628 | "max": null, 629 | "min": null, 630 | "show": false 631 | } 632 | ] 633 | }, 634 | { 635 | "cacheTimeout": null, 636 | "colorBackground": false, 637 | "colorValue": false, 638 | "colors": [ 639 | "rgba(245, 54, 54, 0.9)", 640 | "rgba(237, 129, 40, 0.89)", 641 | "rgba(50, 172, 45, 0.97)" 642 | ], 643 | "datasource": null, 644 | "decimals": null, 645 | "format": "percent", 646 | "gauge": { 647 | "maxValue": 100, 648 | "minValue": 0, 649 | "show": true, 650 | "thresholdLabels": false, 651 | "thresholdMarkers": true 652 | }, 653 | "gridPos": { 654 | "h": 7, 655 | "w": 4, 656 | "x": 20, 657 | "y": 11 658 | }, 659 | "hideTimeOverride": true, 660 | "id": 11, 661 | "interval": null, 662 | "links": [], 663 | "mappingType": 1, 664 | "mappingTypes": [ 665 | { 666 | "name": 
"value to text", 667 | "value": 1 668 | }, 669 | { 670 | "name": "range to text", 671 | "value": 2 672 | } 673 | ], 674 | "maxDataPoints": 100, 675 | "nullPointMode": "connected", 676 | "nullText": null, 677 | "postfix": "", 678 | "postfixFontSize": "50%", 679 | "prefix": "", 680 | "prefixFontSize": "50%", 681 | "rangeMaps": [ 682 | { 683 | "from": "null", 684 | "text": "N/A", 685 | "to": "null" 686 | } 687 | ], 688 | "sparkline": { 689 | "fillColor": "rgba(31, 118, 189, 0.18)", 690 | "full": false, 691 | "lineColor": "rgb(31, 120, 193)", 692 | "show": false 693 | }, 694 | "tableColumn": "", 695 | "targets": [ 696 | { 697 | "expr": "sum(irate(node_cpu_seconds_total{mode=\"idle\"}[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) * 100 / count(node_cpu_seconds_total{mode=\"user\"} * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) ", 698 | "format": "time_series", 699 | "intervalFactor": 2, 700 | "legendFormat": "", 701 | "refId": "A", 702 | "step": 2 703 | } 704 | ], 705 | "thresholds": "10,25,100", 706 | "timeFrom": "1m", 707 | "timeShift": null, 708 | "title": "CPU Idle", 709 | "type": "singlestat", 710 | "valueFontSize": "80%", 711 | "valueMaps": [ 712 | { 713 | "op": "=", 714 | "text": "N/A", 715 | "value": "null" 716 | } 717 | ], 718 | "valueName": "avg" 719 | }, 720 | { 721 | "aliasColors": {}, 722 | "bars": false, 723 | "dashLength": 10, 724 | "dashes": false, 725 | "datasource": null, 726 | "decimals": 2, 727 | "fill": 1, 728 | "gridPos": { 729 | "h": 7, 730 | "w": 24, 731 | "x": 0, 732 | "y": 18 733 | }, 734 | "id": 33, 735 | "legend": { 736 | "alignAsTable": true, 737 | "avg": true, 738 | "current": false, 739 | "hideEmpty": true, 740 | "hideZero": true, 741 | "max": false, 742 | "min": false, 743 | "rightSide": true, 744 | "show": true, 745 | "sort": "avg", 746 | "sortDesc": true, 747 | "total": false, 748 | "values": true 749 | }, 750 | "lines": true, 751 | "linewidth": 1, 752 | "links": [], 753 | 
"nullPointMode": "null as zero", 754 | "percentage": false, 755 | "pointradius": 5, 756 | "points": false, 757 | "renderer": "flot", 758 | "seriesOverrides": [], 759 | "spaceLength": 10, 760 | "stack": false, 761 | "steppedLine": false, 762 | "targets": [ 763 | { 764 | "expr": "topk(10, sum(irate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_node_id=~\"$node_id\", id=~\"/docker/.*\"}[$interval])) by (name)) * 100 ", 765 | "format": "time_series", 766 | "intervalFactor": 2, 767 | "legendFormat": "{{name}}", 768 | "refId": "A", 769 | "step": 2 770 | } 771 | ], 772 | "thresholds": [], 773 | "timeFrom": null, 774 | "timeShift": null, 775 | "title": "CPU usage by Container (top 10)", 776 | "tooltip": { 777 | "shared": true, 778 | "sort": 2, 779 | "value_type": "individual" 780 | }, 781 | "type": "graph", 782 | "xaxis": { 783 | "buckets": null, 784 | "mode": "time", 785 | "name": null, 786 | "show": true, 787 | "values": [] 788 | }, 789 | "yaxes": [ 790 | { 791 | "format": "percent", 792 | "label": null, 793 | "logBase": 1, 794 | "max": null, 795 | "min": null, 796 | "show": true 797 | }, 798 | { 799 | "format": "short", 800 | "label": null, 801 | "logBase": 1, 802 | "max": null, 803 | "min": null, 804 | "show": false 805 | } 806 | ] 807 | }, 808 | { 809 | "aliasColors": {}, 810 | "bars": false, 811 | "dashLength": 10, 812 | "dashes": false, 813 | "datasource": null, 814 | "fill": 1, 815 | "gridPos": { 816 | "h": 7, 817 | "w": 20, 818 | "x": 0, 819 | "y": 25 820 | }, 821 | "id": 24, 822 | "legend": { 823 | "alignAsTable": true, 824 | "avg": true, 825 | "current": false, 826 | "max": true, 827 | "min": true, 828 | "rightSide": true, 829 | "show": true, 830 | "sort": "avg", 831 | "sortDesc": true, 832 | "total": false, 833 | "values": true 834 | }, 835 | "lines": true, 836 | "linewidth": 1, 837 | "links": [], 838 | "nullPointMode": "null", 839 | "percentage": false, 840 | "pointradius": 5, 841 | "points": false, 842 | "renderer": "flot", 843 | 
"seriesOverrides": [], 844 | "spaceLength": 10, 845 | "stack": false, 846 | "steppedLine": false, 847 | "targets": [ 848 | { 849 | "expr": "sum(container_memory_usage_bytes{container_label_com_docker_swarm_node_id=~\"$node_id\", id=~\"/docker/.*\"}) by (container_label_com_docker_swarm_service_name) ", 850 | "format": "time_series", 851 | "intervalFactor": 2, 852 | "legendFormat": "Used {{container_label_com_docker_swarm_service_name}}", 853 | "refId": "A", 854 | "step": 2 855 | }, 856 | { 857 | "expr": "sum(container_memory_cache{container_label_com_docker_swarm_node_id=~\"$node_id\", id=~\"/docker/.*\"}) by (container_label_com_docker_swarm_service_name) ", 858 | "format": "time_series", 859 | "intervalFactor": 2, 860 | "legendFormat": "Cached {{container_label_com_docker_swarm_service_name}}", 861 | "refId": "B", 862 | "step": 2 863 | } 864 | ], 865 | "thresholds": [], 866 | "timeFrom": null, 867 | "timeShift": null, 868 | "title": "Memory usage by Service", 869 | "tooltip": { 870 | "shared": true, 871 | "sort": 0, 872 | "value_type": "individual" 873 | }, 874 | "type": "graph", 875 | "xaxis": { 876 | "buckets": null, 877 | "mode": "time", 878 | "name": null, 879 | "show": true, 880 | "values": [] 881 | }, 882 | "yaxes": [ 883 | { 884 | "format": "decbytes", 885 | "label": null, 886 | "logBase": 1, 887 | "max": null, 888 | "min": null, 889 | "show": true 890 | }, 891 | { 892 | "format": "short", 893 | "label": null, 894 | "logBase": 1, 895 | "max": null, 896 | "min": null, 897 | "show": true 898 | } 899 | ] 900 | }, 901 | { 902 | "cacheTimeout": null, 903 | "colorBackground": false, 904 | "colorValue": false, 905 | "colors": [ 906 | "rgba(245, 54, 54, 0.9)", 907 | "rgba(237, 129, 40, 0.89)", 908 | "rgba(50, 172, 45, 0.97)" 909 | ], 910 | "datasource": null, 911 | "format": "percent", 912 | "gauge": { 913 | "maxValue": 100, 914 | "minValue": 0, 915 | "show": true, 916 | "thresholdLabels": false, 917 | "thresholdMarkers": true 918 | }, 919 | "gridPos": { 920 | 
"h": 7, 921 | "w": 4, 922 | "x": 20, 923 | "y": 25 924 | }, 925 | "id": 8, 926 | "interval": null, 927 | "links": [], 928 | "mappingType": 1, 929 | "mappingTypes": [ 930 | { 931 | "name": "value to text", 932 | "value": 1 933 | }, 934 | { 935 | "name": "range to text", 936 | "value": 2 937 | } 938 | ], 939 | "maxDataPoints": 100, 940 | "nullPointMode": "connected", 941 | "nullText": null, 942 | "postfix": "", 943 | "postfixFontSize": "50%", 944 | "prefix": "", 945 | "prefixFontSize": "50%", 946 | "rangeMaps": [ 947 | { 948 | "from": "null", 949 | "text": "N/A", 950 | "to": "null" 951 | } 952 | ], 953 | "sparkline": { 954 | "fillColor": "rgba(31, 118, 189, 0.18)", 955 | "full": false, 956 | "lineColor": "rgb(31, 120, 193)", 957 | "show": false 958 | }, 959 | "tableColumn": "", 960 | "targets": [ 961 | { 962 | "expr": "sum((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 963 | "format": "time_series", 964 | "intervalFactor": 2, 965 | "legendFormat": "", 966 | "refId": "A", 967 | "step": 20 968 | } 969 | ], 970 | "thresholds": "10,25,100", 971 | "title": "Available Memory", 972 | "type": "singlestat", 973 | "valueFontSize": "80%", 974 | "valueMaps": [ 975 | { 976 | "op": "=", 977 | "text": "N/A", 978 | "value": "null" 979 | } 980 | ], 981 | "valueName": "avg" 982 | }, 983 | { 984 | "aliasColors": {}, 985 | "bars": false, 986 | "dashLength": 10, 987 | "dashes": false, 988 | "datasource": null, 989 | "fill": 1, 990 | "gridPos": { 991 | "h": 7, 992 | "w": 24, 993 | "x": 0, 994 | "y": 32 995 | }, 996 | "id": 34, 997 | "legend": { 998 | "alignAsTable": true, 999 | "avg": true, 1000 | "current": false, 1001 | "hideEmpty": false, 1002 | "hideZero": false, 1003 | "max": false, 1004 | "min": false, 1005 | "rightSide": true, 1006 | "show": true, 1007 | "sort": "avg", 1008 | "sortDesc": true, 1009 | 
"total": false, 1010 | "values": true 1011 | }, 1012 | "lines": true, 1013 | "linewidth": 1, 1014 | "links": [], 1015 | "nullPointMode": "null", 1016 | "percentage": false, 1017 | "pointradius": 5, 1018 | "points": false, 1019 | "renderer": "flot", 1020 | "seriesOverrides": [], 1021 | "spaceLength": 10, 1022 | "stack": false, 1023 | "steppedLine": false, 1024 | "targets": [ 1025 | { 1026 | "expr": "topk(10, avg_over_time(container_memory_usage_bytes{container_label_com_docker_swarm_node_id=~\"$node_id\", id=~\"/docker/.*\"}[$interval]))", 1027 | "format": "time_series", 1028 | "intervalFactor": 2, 1029 | "legendFormat": "{{name}}", 1030 | "refId": "A", 1031 | "step": 2 1032 | } 1033 | ], 1034 | "thresholds": [], 1035 | "timeFrom": null, 1036 | "timeShift": null, 1037 | "title": "Memory usage by Container (top 10)", 1038 | "tooltip": { 1039 | "shared": true, 1040 | "sort": 2, 1041 | "value_type": "individual" 1042 | }, 1043 | "type": "graph", 1044 | "xaxis": { 1045 | "buckets": null, 1046 | "mode": "time", 1047 | "name": null, 1048 | "show": true, 1049 | "values": [] 1050 | }, 1051 | "yaxes": [ 1052 | { 1053 | "format": "decbytes", 1054 | "label": null, 1055 | "logBase": 1, 1056 | "max": null, 1057 | "min": null, 1058 | "show": true 1059 | }, 1060 | { 1061 | "format": "short", 1062 | "label": null, 1063 | "logBase": 1, 1064 | "max": null, 1065 | "min": null, 1066 | "show": false 1067 | } 1068 | ] 1069 | }, 1070 | { 1071 | "aliasColors": {}, 1072 | "bars": false, 1073 | "dashLength": 10, 1074 | "dashes": false, 1075 | "datasource": null, 1076 | "fill": 1, 1077 | "gridPos": { 1078 | "h": 7, 1079 | "w": 24, 1080 | "x": 0, 1081 | "y": 39 1082 | }, 1083 | "id": 17, 1084 | "legend": { 1085 | "alignAsTable": true, 1086 | "avg": true, 1087 | "current": false, 1088 | "max": true, 1089 | "min": true, 1090 | "rightSide": true, 1091 | "show": true, 1092 | "sort": "avg", 1093 | "sortDesc": true, 1094 | "total": false, 1095 | "values": true 1096 | }, 1097 | "lines": true, 1098 | 
"linewidth": 1, 1099 | "links": [], 1100 | "nullPointMode": "null", 1101 | "percentage": false, 1102 | "pointradius": 5, 1103 | "points": false, 1104 | "renderer": "flot", 1105 | "seriesOverrides": [], 1106 | "spaceLength": 10, 1107 | "stack": false, 1108 | "steppedLine": false, 1109 | "targets": [ 1110 | { 1111 | "expr": "sum(rate(container_network_receive_bytes_total{container_label_com_docker_swarm_node_id=~\"$node_id\"}[$interval])) by (container_label_com_docker_swarm_service_name)", 1112 | "format": "time_series", 1113 | "intervalFactor": 2, 1114 | "legendFormat": "{{container_label_com_docker_swarm_service_name}}", 1115 | "refId": "A", 1116 | "step": 2 1117 | } 1118 | ], 1119 | "thresholds": [], 1120 | "timeFrom": null, 1121 | "timeShift": null, 1122 | "title": "Network received by Service", 1123 | "tooltip": { 1124 | "shared": true, 1125 | "sort": 0, 1126 | "value_type": "individual" 1127 | }, 1128 | "type": "graph", 1129 | "xaxis": { 1130 | "buckets": null, 1131 | "mode": "time", 1132 | "name": null, 1133 | "show": true, 1134 | "values": [] 1135 | }, 1136 | "yaxes": [ 1137 | { 1138 | "format": "Bps", 1139 | "label": null, 1140 | "logBase": 1, 1141 | "max": null, 1142 | "min": null, 1143 | "show": true 1144 | }, 1145 | { 1146 | "format": "short", 1147 | "label": null, 1148 | "logBase": 1, 1149 | "max": null, 1150 | "min": null, 1151 | "show": true 1152 | } 1153 | ] 1154 | }, 1155 | { 1156 | "aliasColors": {}, 1157 | "bars": false, 1158 | "dashLength": 10, 1159 | "dashes": false, 1160 | "datasource": null, 1161 | "fill": 1, 1162 | "gridPos": { 1163 | "h": 7, 1164 | "w": 24, 1165 | "x": 0, 1166 | "y": 46 1167 | }, 1168 | "id": 25, 1169 | "legend": { 1170 | "alignAsTable": true, 1171 | "avg": true, 1172 | "current": false, 1173 | "max": true, 1174 | "min": true, 1175 | "rightSide": true, 1176 | "show": true, 1177 | "sort": "avg", 1178 | "sortDesc": true, 1179 | "total": false, 1180 | "values": true 1181 | }, 1182 | "lines": true, 1183 | "linewidth": 1, 1184 | 
"links": [], 1185 | "nullPointMode": "null", 1186 | "percentage": false, 1187 | "pointradius": 5, 1188 | "points": false, 1189 | "renderer": "flot", 1190 | "seriesOverrides": [], 1191 | "spaceLength": 10, 1192 | "stack": false, 1193 | "steppedLine": false, 1194 | "targets": [ 1195 | { 1196 | "expr": "sum(rate(container_network_transmit_bytes_total{container_label_com_docker_swarm_node_id=~\"$node_id\"}[$interval])) by (container_label_com_docker_swarm_service_name)", 1197 | "format": "time_series", 1198 | "intervalFactor": 2, 1199 | "legendFormat": "{{container_label_com_docker_swarm_service_name}}", 1200 | "metric": "", 1201 | "refId": "B", 1202 | "step": 2 1203 | } 1204 | ], 1205 | "thresholds": [], 1206 | "timeFrom": null, 1207 | "timeShift": null, 1208 | "title": "Network transmitted by Service", 1209 | "tooltip": { 1210 | "shared": true, 1211 | "sort": 0, 1212 | "value_type": "individual" 1213 | }, 1214 | "type": "graph", 1215 | "xaxis": { 1216 | "buckets": null, 1217 | "mode": "time", 1218 | "name": null, 1219 | "show": true, 1220 | "values": [] 1221 | }, 1222 | "yaxes": [ 1223 | { 1224 | "format": "Bps", 1225 | "label": null, 1226 | "logBase": 1, 1227 | "max": null, 1228 | "min": null, 1229 | "show": true 1230 | }, 1231 | { 1232 | "format": "short", 1233 | "label": null, 1234 | "logBase": 1, 1235 | "max": null, 1236 | "min": null, 1237 | "show": true 1238 | } 1239 | ] 1240 | }, 1241 | { 1242 | "aliasColors": {}, 1243 | "bars": false, 1244 | "dashLength": 10, 1245 | "dashes": false, 1246 | "datasource": null, 1247 | "fill": 1, 1248 | "gridPos": { 1249 | "h": 7, 1250 | "w": 10, 1251 | "x": 0, 1252 | "y": 53 1253 | }, 1254 | "id": 31, 1255 | "legend": { 1256 | "avg": true, 1257 | "current": false, 1258 | "max": false, 1259 | "min": false, 1260 | "show": true, 1261 | "total": false, 1262 | "values": true 1263 | }, 1264 | "lines": true, 1265 | "linewidth": 1, 1266 | "links": [], 1267 | "nullPointMode": "null", 1268 | "percentage": false, 1269 | "pointradius": 5, 
1270 | "points": false, 1271 | "renderer": "flot", 1272 | "seriesOverrides": [], 1273 | "spaceLength": 10, 1274 | "stack": false, 1275 | "steppedLine": false, 1276 | "targets": [ 1277 | { 1278 | "expr": "sum(rate(container_network_receive_bytes_total{id=\"/\"}[$interval])) by (id)", 1279 | "format": "time_series", 1280 | "intervalFactor": 2, 1281 | "legendFormat": "Received", 1282 | "refId": "A", 1283 | "step": 4 1284 | }, 1285 | { 1286 | "expr": "- sum(rate(container_network_transmit_bytes_total{id=\"/\"}[$interval])) by (id)", 1287 | "format": "time_series", 1288 | "intervalFactor": 2, 1289 | "legendFormat": "Transmitted", 1290 | "refId": "B", 1291 | "step": 4 1292 | } 1293 | ], 1294 | "thresholds": [], 1295 | "timeFrom": null, 1296 | "timeShift": null, 1297 | "title": "Cluster Network Traffic", 1298 | "tooltip": { 1299 | "shared": true, 1300 | "sort": 0, 1301 | "value_type": "individual" 1302 | }, 1303 | "type": "graph", 1304 | "xaxis": { 1305 | "buckets": null, 1306 | "mode": "time", 1307 | "name": null, 1308 | "show": true, 1309 | "values": [] 1310 | }, 1311 | "yaxes": [ 1312 | { 1313 | "format": "Bps", 1314 | "label": null, 1315 | "logBase": 1, 1316 | "max": null, 1317 | "min": null, 1318 | "show": true 1319 | }, 1320 | { 1321 | "format": "short", 1322 | "label": null, 1323 | "logBase": 1, 1324 | "max": null, 1325 | "min": null, 1326 | "show": true 1327 | } 1328 | ] 1329 | }, 1330 | { 1331 | "aliasColors": {}, 1332 | "bars": false, 1333 | "dashLength": 10, 1334 | "dashes": false, 1335 | "datasource": null, 1336 | "fill": 1, 1337 | "gridPos": { 1338 | "h": 7, 1339 | "w": 10, 1340 | "x": 10, 1341 | "y": 53 1342 | }, 1343 | "id": 26, 1344 | "legend": { 1345 | "alignAsTable": false, 1346 | "avg": true, 1347 | "current": false, 1348 | "max": true, 1349 | "min": true, 1350 | "rightSide": false, 1351 | "show": true, 1352 | "total": false, 1353 | "values": true 1354 | }, 1355 | "lines": true, 1356 | "linewidth": 1, 1357 | "links": [], 1358 | "nullPointMode": "null", 
1359 | "percentage": false, 1360 | "pointradius": 5, 1361 | "points": false, 1362 | "renderer": "flot", 1363 | "seriesOverrides": [], 1364 | "spaceLength": 10, 1365 | "stack": false, 1366 | "steppedLine": false, 1367 | "targets": [ 1368 | { 1369 | "expr": "sum(irate(container_fs_reads_total[$interval]) )", 1370 | "format": "time_series", 1371 | "intervalFactor": 2, 1372 | "legendFormat": "Reads", 1373 | "refId": "A", 1374 | "step": 4 1375 | }, 1376 | { 1377 | "expr": "sum(irate(container_fs_writes_total[$interval])) ", 1378 | "format": "time_series", 1379 | "intervalFactor": 2, 1380 | "legendFormat": "Writes ", 1381 | "refId": "B", 1382 | "step": 4 1383 | } 1384 | ], 1385 | "thresholds": [], 1386 | "timeFrom": null, 1387 | "timeShift": null, 1388 | "title": "Cluster IOPS", 1389 | "tooltip": { 1390 | "shared": true, 1391 | "sort": 0, 1392 | "value_type": "individual" 1393 | }, 1394 | "type": "graph", 1395 | "xaxis": { 1396 | "buckets": null, 1397 | "mode": "time", 1398 | "name": null, 1399 | "show": true, 1400 | "values": [] 1401 | }, 1402 | "yaxes": [ 1403 | { 1404 | "format": "short", 1405 | "label": null, 1406 | "logBase": 1, 1407 | "max": null, 1408 | "min": null, 1409 | "show": true 1410 | }, 1411 | { 1412 | "format": "short", 1413 | "label": null, 1414 | "logBase": 1, 1415 | "max": null, 1416 | "min": null, 1417 | "show": true 1418 | } 1419 | ] 1420 | }, 1421 | { 1422 | "cacheTimeout": null, 1423 | "colorBackground": false, 1424 | "colorValue": false, 1425 | "colors": [ 1426 | "rgba(245, 54, 54, 0.9)", 1427 | "rgba(237, 129, 40, 0.89)", 1428 | "rgba(50, 172, 45, 0.97)" 1429 | ], 1430 | "datasource": null, 1431 | "format": "percent", 1432 | "gauge": { 1433 | "maxValue": 100, 1434 | "minValue": 0, 1435 | "show": true, 1436 | "thresholdLabels": false, 1437 | "thresholdMarkers": true 1438 | }, 1439 | "gridPos": { 1440 | "h": 7, 1441 | "w": 4, 1442 | "x": 20, 1443 | "y": 53 1444 | }, 1445 | "id": 27, 1446 | "interval": null, 1447 | "links": [], 1448 | 
"mappingType": 1, 1449 | "mappingTypes": [ 1450 | { 1451 | "name": "value to text", 1452 | "value": 1 1453 | }, 1454 | { 1455 | "name": "range to text", 1456 | "value": 2 1457 | } 1458 | ], 1459 | "maxDataPoints": 100, 1460 | "nullPointMode": "connected", 1461 | "nullText": null, 1462 | "postfix": "", 1463 | "postfixFontSize": "50%", 1464 | "prefix": "", 1465 | "prefixFontSize": "50%", 1466 | "rangeMaps": [ 1467 | { 1468 | "from": "null", 1469 | "text": "N/A", 1470 | "to": "null" 1471 | } 1472 | ], 1473 | "sparkline": { 1474 | "fillColor": "rgba(31, 118, 189, 0.18)", 1475 | "full": false, 1476 | "lineColor": "rgb(31, 120, 193)", 1477 | "show": false 1478 | }, 1479 | "tableColumn": "", 1480 | "targets": [ 1481 | { 1482 | "expr": "sum((node_filesystem_free_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 1483 | "format": "time_series", 1484 | "intervalFactor": 2, 1485 | "legendFormat": "", 1486 | "refId": "A", 1487 | "step": 20 1488 | } 1489 | ], 1490 | "thresholds": "10,25,100", 1491 | "title": "Available Disk Space", 1492 | "type": "singlestat", 1493 | "valueFontSize": "80%", 1494 | "valueMaps": [ 1495 | { 1496 | "op": "=", 1497 | "text": "N/A", 1498 | "value": "null" 1499 | } 1500 | ], 1501 | "valueName": "avg" 1502 | }, 1503 | { 1504 | "aliasColors": {}, 1505 | "bars": false, 1506 | "dashLength": 10, 1507 | "dashes": false, 1508 | "datasource": null, 1509 | "decimals": 0, 1510 | "fill": 1, 1511 | "gridPos": { 1512 | "h": 7, 1513 | "w": 12, 1514 | "x": 0, 1515 | "y": 60 1516 | }, 1517 | "id": 29, 1518 | "legend": { 1519 | "alignAsTable": true, 1520 | "avg": false, 1521 | "current": true, 1522 | "hideEmpty": true, 1523 | "hideZero": true, 1524 | "max": false, 1525 | "min": false, 1526 | "rightSide": true, 1527 | "show": true, 1528 | "sort": "current", 1529 | "sortDesc": true, 
1530 | "total": false, 1531 | "values": true 1532 | }, 1533 | "lines": true, 1534 | "linewidth": 1, 1535 | "links": [], 1536 | "nullPointMode": "null", 1537 | "percentage": false, 1538 | "pointradius": 5, 1539 | "points": false, 1540 | "renderer": "flot", 1541 | "seriesOverrides": [], 1542 | "spaceLength": 10, 1543 | "stack": false, 1544 | "steppedLine": false, 1545 | "targets": [ 1546 | { 1547 | "expr": "sum(engine_daemon_container_actions_seconds_count * on(instance) group_left(node_id) swarm_node_info{node_id=~\"$node_id\"}) by (action)", 1548 | "format": "time_series", 1549 | "intervalFactor": 10, 1550 | "legendFormat": "{{action }}", 1551 | "refId": "A", 1552 | "step": 10 1553 | } 1554 | ], 1555 | "thresholds": [], 1556 | "timeFrom": null, 1557 | "timeShift": null, 1558 | "title": "Docker Daemon Container Actions", 1559 | "tooltip": { 1560 | "shared": true, 1561 | "sort": 2, 1562 | "value_type": "individual" 1563 | }, 1564 | "type": "graph", 1565 | "xaxis": { 1566 | "buckets": null, 1567 | "mode": "time", 1568 | "name": null, 1569 | "show": true, 1570 | "values": [] 1571 | }, 1572 | "yaxes": [ 1573 | { 1574 | "format": "short", 1575 | "label": null, 1576 | "logBase": 1, 1577 | "max": null, 1578 | "min": null, 1579 | "show": true 1580 | }, 1581 | { 1582 | "format": "short", 1583 | "label": null, 1584 | "logBase": 1, 1585 | "max": null, 1586 | "min": null, 1587 | "show": true 1588 | } 1589 | ] 1590 | }, 1591 | { 1592 | "aliasColors": {}, 1593 | "bars": false, 1594 | "dashLength": 10, 1595 | "dashes": false, 1596 | "datasource": null, 1597 | "decimals": 0, 1598 | "fill": 1, 1599 | "gridPos": { 1600 | "h": 7, 1601 | "w": 12, 1602 | "x": 12, 1603 | "y": 60 1604 | }, 1605 | "id": 30, 1606 | "legend": { 1607 | "alignAsTable": true, 1608 | "avg": false, 1609 | "current": true, 1610 | "hideEmpty": true, 1611 | "hideZero": true, 1612 | "max": false, 1613 | "min": false, 1614 | "rightSide": true, 1615 | "show": true, 1616 | "sort": "current", 1617 | "sortDesc": true, 
1618 | "total": false, 1619 | "values": true 1620 | }, 1621 | "lines": true, 1622 | "linewidth": 1, 1623 | "links": [], 1624 | "nullPointMode": "null", 1625 | "percentage": false, 1626 | "pointradius": 5, 1627 | "points": false, 1628 | "renderer": "flot", 1629 | "seriesOverrides": [], 1630 | "spaceLength": 10, 1631 | "stack": false, 1632 | "steppedLine": false, 1633 | "targets": [ 1634 | { 1635 | "expr": "sum(engine_daemon_network_actions_seconds_count * on(instance) group_left(node_id) swarm_node_info{node_id=~\"$node_id\"}) by (action)", 1636 | "format": "time_series", 1637 | "intervalFactor": 10, 1638 | "legendFormat": "{{action }}", 1639 | "refId": "A", 1640 | "step": 10 1641 | } 1642 | ], 1643 | "thresholds": [], 1644 | "timeFrom": null, 1645 | "timeShift": null, 1646 | "title": "Docker Daemon Network Actions", 1647 | "tooltip": { 1648 | "shared": true, 1649 | "sort": 2, 1650 | "value_type": "individual" 1651 | }, 1652 | "type": "graph", 1653 | "xaxis": { 1654 | "buckets": null, 1655 | "mode": "time", 1656 | "name": null, 1657 | "show": true, 1658 | "values": [] 1659 | }, 1660 | "yaxes": [ 1661 | { 1662 | "format": "short", 1663 | "label": null, 1664 | "logBase": 1, 1665 | "max": null, 1666 | "min": null, 1667 | "show": true 1668 | }, 1669 | { 1670 | "format": "short", 1671 | "label": null, 1672 | "logBase": 1, 1673 | "max": null, 1674 | "min": null, 1675 | "show": true 1676 | } 1677 | ] 1678 | }, 1679 | { 1680 | "columns": [ 1681 | { 1682 | "text": "Avg", 1683 | "value": "avg" 1684 | } 1685 | ], 1686 | "datasource": null, 1687 | "fontSize": "100%", 1688 | "gridPos": { 1689 | "h": 7, 1690 | "w": 24, 1691 | "x": 0, 1692 | "y": 67 1693 | }, 1694 | "hideTimeOverride": true, 1695 | "id": 28, 1696 | "links": [], 1697 | "pageSize": null, 1698 | "repeat": null, 1699 | "scroll": true, 1700 | "showHeader": true, 1701 | "sort": { 1702 | "col": 0, 1703 | "desc": true 1704 | }, 1705 | "styles": [ 1706 | { 1707 | "alias": "Time", 1708 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 
1709 | "pattern": "Time", 1710 | "type": "hidden" 1711 | }, 1712 | { 1713 | "alias": "", 1714 | "colorMode": null, 1715 | "colors": [ 1716 | "rgba(245, 54, 54, 0.9)", 1717 | "rgba(237, 129, 40, 0.89)", 1718 | "rgba(50, 172, 45, 0.97)" 1719 | ], 1720 | "decimals": 2, 1721 | "pattern": "/.*/", 1722 | "thresholds": [], 1723 | "type": "number", 1724 | "unit": "short" 1725 | } 1726 | ], 1727 | "targets": [ 1728 | { 1729 | "expr": "sum(engine_daemon_engine_info * on(instance) group_left(node_id) swarm_node_info) by (kernel, os, graphdriver, version, node_id)", 1730 | "format": "table", 1731 | "instant": true, 1732 | "intervalFactor": 2, 1733 | "legendFormat": "", 1734 | "refId": "A", 1735 | "step": 2 1736 | } 1737 | ], 1738 | "timeFrom": "1s", 1739 | "title": "Docker Engine Info", 1740 | "transform": "timeseries_to_rows", 1741 | "type": "table" 1742 | } 1743 | ], 1744 | "refresh": "30s", 1745 | "schemaVersion": 16, 1746 | "style": "dark", 1747 | "tags": [ 1748 | "swarmprom" 1749 | ], 1750 | "templating": { 1751 | "list": [ 1752 | { 1753 | "allValue": ".+", 1754 | "current": { 1755 | "text": "All", 1756 | "value": "$__all" 1757 | }, 1758 | "datasource": "Prometheus", 1759 | "hide": 0, 1760 | "includeAll": true, 1761 | "label": "Swarm Node", 1762 | "multi": false, 1763 | "name": "node_id", 1764 | "options": [], 1765 | "query": "node_meta", 1766 | "refresh": 2, 1767 | "regex": "/node_id=\"([^\"]+)\"/", 1768 | "sort": 0, 1769 | "tagValuesQuery": "label_values({node_id=\"$tag\"},node_name)", 1770 | "tags": [ 1771 | "ofdocker", 1772 | "ofmon" 1773 | ], 1774 | "tagsQuery": "label_values(node_meta, node_name)", 1775 | "type": "query", 1776 | "useTags": true 1777 | }, 1778 | { 1779 | "auto": true, 1780 | "auto_count": 30, 1781 | "auto_min": "30s", 1782 | "current": { 1783 | "text": "auto", 1784 | "value": "$__auto_interval_interval" 1785 | }, 1786 | "hide": 0, 1787 | "label": "Interval", 1788 | "name": "interval", 1789 | "options": [ 1790 | { 1791 | "selected": true, 1792 | 
"text": "auto", 1793 | "value": "$__auto_interval_interval" 1794 | }, 1795 | { 1796 | "selected": false, 1797 | "text": "1m", 1798 | "value": "1m" 1799 | }, 1800 | { 1801 | "selected": false, 1802 | "text": "10m", 1803 | "value": "10m" 1804 | }, 1805 | { 1806 | "selected": false, 1807 | "text": "30m", 1808 | "value": "30m" 1809 | }, 1810 | { 1811 | "selected": false, 1812 | "text": "1h", 1813 | "value": "1h" 1814 | }, 1815 | { 1816 | "selected": false, 1817 | "text": "6h", 1818 | "value": "6h" 1819 | }, 1820 | { 1821 | "selected": false, 1822 | "text": "12h", 1823 | "value": "12h" 1824 | }, 1825 | { 1826 | "selected": false, 1827 | "text": "1d", 1828 | "value": "1d" 1829 | }, 1830 | { 1831 | "selected": false, 1832 | "text": "7d", 1833 | "value": "7d" 1834 | }, 1835 | { 1836 | "selected": false, 1837 | "text": "14d", 1838 | "value": "14d" 1839 | }, 1840 | { 1841 | "selected": false, 1842 | "text": "30d", 1843 | "value": "30d" 1844 | } 1845 | ], 1846 | "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", 1847 | "refresh": 2, 1848 | "type": "interval" 1849 | } 1850 | ] 1851 | }, 1852 | "time": { 1853 | "from": "now-15m", 1854 | "to": "now" 1855 | }, 1856 | "timepicker": { 1857 | "refresh_intervals": [ 1858 | "5s", 1859 | "10s", 1860 | "30s", 1861 | "1m", 1862 | "5m", 1863 | "15m", 1864 | "30m", 1865 | "1h", 1866 | "2h", 1867 | "1d" 1868 | ], 1869 | "time_options": [ 1870 | "5m", 1871 | "15m", 1872 | "1h", 1873 | "6h", 1874 | "12h", 1875 | "24h", 1876 | "2d", 1877 | "7d", 1878 | "30d" 1879 | ] 1880 | }, 1881 | "timezone": "", 1882 | "title": "Docker Swarm Services", 1883 | "uid": "zr_baSRmk", 1884 | "version": 1 1885 | } 1886 | -------------------------------------------------------------------------------- /prometheus/grafana/dashboards/swarmprom-nodes-dash.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": 
true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "description": "Docker Swarm nodes metrics", 16 | "editable": true, 17 | "gnetId": null, 18 | "graphTooltip": 0, 19 | "iteration": 1547535746076, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "cacheTimeout": null, 24 | "colorBackground": false, 25 | "colorValue": false, 26 | "colors": [ 27 | "rgba(245, 54, 54, 0.9)", 28 | "rgba(237, 129, 40, 0.89)", 29 | "rgba(50, 172, 45, 0.97)" 30 | ], 31 | "datasource": "Prometheus", 32 | "decimals": 1, 33 | "format": "s", 34 | "gauge": { 35 | "maxValue": 100, 36 | "minValue": 0, 37 | "show": false, 38 | "thresholdLabels": false, 39 | "thresholdMarkers": true 40 | }, 41 | "gridPos": { 42 | "h": 4, 43 | "w": 6, 44 | "x": 0, 45 | "y": 0 46 | }, 47 | "hideTimeOverride": true, 48 | "id": 2, 49 | "interval": null, 50 | "links": [], 51 | "mappingType": 1, 52 | "mappingTypes": [ 53 | { 54 | "name": "value to text", 55 | "value": 1 56 | }, 57 | { 58 | "name": "range to text", 59 | "value": 2 60 | } 61 | ], 62 | "maxDataPoints": 100, 63 | "nullPointMode": "connected", 64 | "nullText": null, 65 | "postfix": "", 66 | "postfixFontSize": "50%", 67 | "prefix": "", 68 | "prefixFontSize": "50%", 69 | "rangeMaps": [ 70 | { 71 | "from": "null", 72 | "text": "N/A", 73 | "to": "null" 74 | } 75 | ], 76 | "sparkline": { 77 | "fillColor": "rgba(31, 118, 189, 0.18)", 78 | "full": false, 79 | "lineColor": "rgb(31, 120, 193)", 80 | "show": false 81 | }, 82 | "tableColumn": "", 83 | "targets": [ 84 | { 85 | "expr": "topk(1, sum((node_time_seconds - node_boot_time_seconds) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name))", 86 | "format": "time_series", 87 | "intervalFactor": 2, 88 | "legendFormat": "", 89 | "refId": "A", 90 | "step": 2 91 | } 92 | ], 93 | "thresholds": "", 94 | "timeFrom": "1m", 95 | "timeShift": null, 96 | "title": "Uptime", 97 | "type": "singlestat", 98 | 
"valueFontSize": "80%", 99 | "valueMaps": [ 100 | { 101 | "op": "=", 102 | "text": "N/A", 103 | "value": "null" 104 | } 105 | ], 106 | "valueName": "avg" 107 | }, 108 | { 109 | "cacheTimeout": null, 110 | "colorBackground": false, 111 | "colorValue": false, 112 | "colors": [ 113 | "rgba(245, 54, 54, 0.9)", 114 | "rgba(237, 129, 40, 0.89)", 115 | "rgba(50, 172, 45, 0.97)" 116 | ], 117 | "datasource": null, 118 | "decimals": 0, 119 | "format": "none", 120 | "gauge": { 121 | "maxValue": 100, 122 | "minValue": 0, 123 | "show": false, 124 | "thresholdLabels": false, 125 | "thresholdMarkers": true 126 | }, 127 | "gridPos": { 128 | "h": 4, 129 | "w": 6, 130 | "x": 6, 131 | "y": 0 132 | }, 133 | "id": 1, 134 | "interval": null, 135 | "links": [], 136 | "mappingType": 1, 137 | "mappingTypes": [ 138 | { 139 | "name": "value to text", 140 | "value": 1 141 | }, 142 | { 143 | "name": "range to text", 144 | "value": 2 145 | } 146 | ], 147 | "maxDataPoints": 100, 148 | "nullPointMode": "connected", 149 | "nullText": null, 150 | "postfix": "", 151 | "postfixFontSize": "50%", 152 | "prefix": "", 153 | "prefixFontSize": "50%", 154 | "rangeMaps": [ 155 | { 156 | "from": "null", 157 | "text": "N/A", 158 | "to": "null" 159 | } 160 | ], 161 | "sparkline": { 162 | "fillColor": "rgba(31, 118, 189, 0.18)", 163 | "full": false, 164 | "lineColor": "rgb(31, 120, 193)", 165 | "show": false 166 | }, 167 | "tableColumn": "", 168 | "targets": [ 169 | { 170 | "expr": "count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 171 | "format": "time_series", 172 | "intervalFactor": 2, 173 | "legendFormat": "", 174 | "refId": "A", 175 | "step": 20 176 | } 177 | ], 178 | "thresholds": "", 179 | "title": "Nodes", 180 | "type": "singlestat", 181 | "valueFontSize": "80%", 182 | "valueMaps": [ 183 | { 184 | "op": "=", 185 | "text": "N/A", 186 | "value": "null" 187 | } 188 | ], 189 | "valueName": "avg" 190 | }, 191 | { 192 | "cacheTimeout": null, 193 | "colorBackground": 
false, 194 | "colorValue": false, 195 | "colors": [ 196 | "rgba(245, 54, 54, 0.9)", 197 | "rgba(237, 129, 40, 0.89)", 198 | "rgba(50, 172, 45, 0.97)" 199 | ], 200 | "datasource": null, 201 | "decimals": 0, 202 | "format": "short", 203 | "gauge": { 204 | "maxValue": 100, 205 | "minValue": 0, 206 | "show": false, 207 | "thresholdLabels": false, 208 | "thresholdMarkers": true 209 | }, 210 | "gridPos": { 211 | "h": 4, 212 | "w": 6, 213 | "x": 12, 214 | "y": 0 215 | }, 216 | "hideTimeOverride": true, 217 | "id": 4, 218 | "interval": null, 219 | "links": [], 220 | "mappingType": 1, 221 | "mappingTypes": [ 222 | { 223 | "name": "value to text", 224 | "value": 1 225 | }, 226 | { 227 | "name": "range to text", 228 | "value": 2 229 | } 230 | ], 231 | "maxDataPoints": 100, 232 | "nullPointMode": "connected", 233 | "nullText": null, 234 | "postfix": "", 235 | "postfixFontSize": "50%", 236 | "prefix": "", 237 | "prefixFontSize": "50%", 238 | "rangeMaps": [ 239 | { 240 | "from": "null", 241 | "text": "N/A", 242 | "to": "null" 243 | } 244 | ], 245 | "sparkline": { 246 | "fillColor": "rgba(31, 118, 189, 0.18)", 247 | "full": false, 248 | "lineColor": "rgb(31, 120, 193)", 249 | "show": false 250 | }, 251 | "tableColumn": "", 252 | "targets": [ 253 | { 254 | "expr": "count(node_cpu_seconds_total{mode=\"idle\"} * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 255 | "format": "time_series", 256 | "intervalFactor": 2, 257 | "legendFormat": "", 258 | "refId": "A", 259 | "step": 2 260 | } 261 | ], 262 | "thresholds": "", 263 | "timeFrom": "1m", 264 | "timeShift": null, 265 | "title": "CPUs", 266 | "type": "singlestat", 267 | "valueFontSize": "80%", 268 | "valueMaps": [ 269 | { 270 | "op": "=", 271 | "text": "N/A", 272 | "value": "null" 273 | } 274 | ], 275 | "valueName": "avg" 276 | }, 277 | { 278 | "cacheTimeout": null, 279 | "colorBackground": false, 280 | "colorValue": false, 281 | "colors": [ 282 | "rgba(245, 54, 54, 0.9)", 283 | "rgba(237, 129, 40, 0.89)", 
284 | "rgba(50, 172, 45, 0.97)" 285 | ], 286 | "datasource": null, 287 | "decimals": null, 288 | "format": "percent", 289 | "gauge": { 290 | "maxValue": 100, 291 | "minValue": 0, 292 | "show": true, 293 | "thresholdLabels": false, 294 | "thresholdMarkers": true 295 | }, 296 | "gridPos": { 297 | "h": 4, 298 | "w": 6, 299 | "x": 18, 300 | "y": 0 301 | }, 302 | "hideTimeOverride": true, 303 | "id": 11, 304 | "interval": null, 305 | "links": [], 306 | "mappingType": 1, 307 | "mappingTypes": [ 308 | { 309 | "name": "value to text", 310 | "value": 1 311 | }, 312 | { 313 | "name": "range to text", 314 | "value": 2 315 | } 316 | ], 317 | "maxDataPoints": 100, 318 | "nullPointMode": "connected", 319 | "nullText": null, 320 | "postfix": "", 321 | "postfixFontSize": "50%", 322 | "prefix": "", 323 | "prefixFontSize": "50%", 324 | "rangeMaps": [ 325 | { 326 | "from": "null", 327 | "text": "N/A", 328 | "to": "null" 329 | } 330 | ], 331 | "sparkline": { 332 | "fillColor": "rgba(31, 118, 189, 0.18)", 333 | "full": false, 334 | "lineColor": "rgb(31, 120, 193)", 335 | "show": false 336 | }, 337 | "tableColumn": "", 338 | "targets": [ 339 | { 340 | "expr": "sum(irate(node_cpu_seconds_total{mode=\"idle\"}[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) * 100 / count(node_cpu_seconds_total{mode=\"user\"} * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) ", 341 | "format": "time_series", 342 | "intervalFactor": 2, 343 | "legendFormat": "", 344 | "refId": "A", 345 | "step": 2 346 | } 347 | ], 348 | "thresholds": "10,25,100", 349 | "timeFrom": "1m", 350 | "timeShift": null, 351 | "title": "CPU Idle", 352 | "type": "singlestat", 353 | "valueFontSize": "80%", 354 | "valueMaps": [ 355 | { 356 | "op": "=", 357 | "text": "N/A", 358 | "value": "null" 359 | } 360 | ], 361 | "valueName": "avg" 362 | }, 363 | { 364 | "aliasColors": {}, 365 | "bars": false, 366 | "dashLength": 10, 367 | "dashes": false, 368 | "datasource": null, 369 | 
"decimals": 2, 370 | "fill": 1, 371 | "gridPos": { 372 | "h": 7, 373 | "w": 12, 374 | "x": 0, 375 | "y": 4 376 | }, 377 | "id": 13, 378 | "legend": { 379 | "alignAsTable": true, 380 | "avg": true, 381 | "current": true, 382 | "hideEmpty": false, 383 | "hideZero": false, 384 | "max": true, 385 | "min": true, 386 | "rightSide": true, 387 | "show": false, 388 | "total": false, 389 | "values": true 390 | }, 391 | "lines": true, 392 | "linewidth": 1, 393 | "links": [], 394 | "nullPointMode": "null", 395 | "percentage": false, 396 | "pointradius": 5, 397 | "points": false, 398 | "renderer": "flot", 399 | "seriesOverrides": [], 400 | "spaceLength": 10, 401 | "stack": false, 402 | "steppedLine": false, 403 | "targets": [ 404 | { 405 | "expr": "node_load5 * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}", 406 | "format": "time_series", 407 | "intervalFactor": 2, 408 | "legendFormat": "load5 {{node_name}}", 409 | "refId": "A", 410 | "step": 2 411 | } 412 | ], 413 | "thresholds": [], 414 | "timeFrom": null, 415 | "timeShift": null, 416 | "title": "System Load by Node", 417 | "tooltip": { 418 | "shared": true, 419 | "sort": 2, 420 | "value_type": "individual" 421 | }, 422 | "type": "graph", 423 | "xaxis": { 424 | "buckets": null, 425 | "mode": "time", 426 | "name": null, 427 | "show": true, 428 | "values": [] 429 | }, 430 | "yaxes": [ 431 | { 432 | "format": "short", 433 | "label": null, 434 | "logBase": 1, 435 | "max": null, 436 | "min": null, 437 | "show": true 438 | }, 439 | { 440 | "format": "short", 441 | "label": null, 442 | "logBase": 1, 443 | "max": null, 444 | "min": null, 445 | "show": true 446 | } 447 | ], 448 | "yaxis": { 449 | "align": false, 450 | "alignLevel": null 451 | } 452 | }, 453 | { 454 | "aliasColors": {}, 455 | "bars": false, 456 | "dashLength": 10, 457 | "dashes": false, 458 | "datasource": null, 459 | "decimals": 2, 460 | "fill": 1, 461 | "gridPos": { 462 | "h": 7, 463 | "w": 12, 464 | "x": 12, 465 | "y": 4 466 | }, 467 | "id": 
14, 468 | "legend": { 469 | "alignAsTable": true, 470 | "avg": true, 471 | "current": true, 472 | "hideEmpty": true, 473 | "hideZero": true, 474 | "max": true, 475 | "min": true, 476 | "rightSide": true, 477 | "show": false, 478 | "total": false, 479 | "values": true 480 | }, 481 | "lines": true, 482 | "linewidth": 1, 483 | "links": [], 484 | "nullPointMode": "null as zero", 485 | "percentage": false, 486 | "pointradius": 5, 487 | "points": false, 488 | "renderer": "flot", 489 | "seriesOverrides": [], 490 | "spaceLength": 10, 491 | "stack": false, 492 | "steppedLine": false, 493 | "targets": [ 494 | { 495 | "expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) by (node_name))", 496 | "format": "time_series", 497 | "intervalFactor": 2, 498 | "legendFormat": "{{node_name}}", 499 | "refId": "A", 500 | "step": 2 501 | } 502 | ], 503 | "thresholds": [], 504 | "timeFrom": null, 505 | "timeShift": null, 506 | "title": "CPU Usage by Node", 507 | "tooltip": { 508 | "shared": true, 509 | "sort": 2, 510 | "value_type": "individual" 511 | }, 512 | "type": "graph", 513 | "xaxis": { 514 | "buckets": null, 515 | "mode": "time", 516 | "name": null, 517 | "show": true, 518 | "values": [] 519 | }, 520 | "yaxes": [ 521 | { 522 | "format": "percent", 523 | "label": null, 524 | "logBase": 1, 525 | "max": "100", 526 | "min": null, 527 | "show": true 528 | }, 529 | { 530 | "format": "short", 531 | "label": null, 532 | "logBase": 1, 533 | "max": null, 534 | "min": null, 535 | "show": true 536 | } 537 | ], 538 | "yaxis": { 539 | "align": false, 540 | "alignLevel": null 541 | } 542 | }, 543 | { 544 | "cacheTimeout": null, 545 | "colorBackground": false, 546 | "colorValue": false, 547 | "colors": [ 548 | "rgba(245, 54, 54, 0.9)", 549 | "rgba(237, 129, 40, 0.89)", 550 | "rgba(50, 172, 45, 0.97)" 551 | ], 552 | "datasource": null, 553 | "decimals": 1, 554 | "format": "decbytes", 555 | "gauge": { 
556 | "maxValue": 100, 557 | "minValue": 0, 558 | "show": false, 559 | "thresholdLabels": false, 560 | "thresholdMarkers": true 561 | }, 562 | "gridPos": { 563 | "h": 4, 564 | "w": 3, 565 | "x": 0, 566 | "y": 11 567 | }, 568 | "hideTimeOverride": true, 569 | "id": 3, 570 | "interval": null, 571 | "links": [], 572 | "mappingType": 1, 573 | "mappingTypes": [ 574 | { 575 | "name": "value to text", 576 | "value": 1 577 | }, 578 | { 579 | "name": "range to text", 580 | "value": 2 581 | } 582 | ], 583 | "maxDataPoints": 100, 584 | "nullPointMode": "connected", 585 | "nullText": null, 586 | "postfix": "", 587 | "postfixFontSize": "50%", 588 | "prefix": "", 589 | "prefixFontSize": "50%", 590 | "rangeMaps": [ 591 | { 592 | "from": "null", 593 | "text": "N/A", 594 | "to": "null" 595 | } 596 | ], 597 | "sparkline": { 598 | "fillColor": "rgba(31, 118, 189, 0.18)", 599 | "full": false, 600 | "lineColor": "rgb(31, 120, 193)", 601 | "show": false 602 | }, 603 | "tableColumn": "", 604 | "targets": [ 605 | { 606 | "expr": "sum(node_memory_MemTotal_bytes * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 607 | "format": "time_series", 608 | "intervalFactor": 2, 609 | "legendFormat": "", 610 | "refId": "A", 611 | "step": 20 612 | } 613 | ], 614 | "thresholds": "", 615 | "timeFrom": null, 616 | "timeShift": null, 617 | "title": "Total Memory", 618 | "type": "singlestat", 619 | "valueFontSize": "80%", 620 | "valueMaps": [ 621 | { 622 | "op": "=", 623 | "text": "N/A", 624 | "value": "null" 625 | } 626 | ], 627 | "valueName": "avg" 628 | }, 629 | { 630 | "cacheTimeout": null, 631 | "colorBackground": false, 632 | "colorValue": false, 633 | "colors": [ 634 | "rgba(245, 54, 54, 0.9)", 635 | "rgba(237, 129, 40, 0.89)", 636 | "rgba(50, 172, 45, 0.97)" 637 | ], 638 | "datasource": null, 639 | "format": "percent", 640 | "gauge": { 641 | "maxValue": 100, 642 | "minValue": 0, 643 | "show": true, 644 | "thresholdLabels": false, 645 | "thresholdMarkers": true 646 | }, 647 | 
"gridPos": { 648 | "h": 4, 649 | "w": 4, 650 | "x": 3, 651 | "y": 11 652 | }, 653 | "id": 8, 654 | "interval": null, 655 | "links": [], 656 | "mappingType": 1, 657 | "mappingTypes": [ 658 | { 659 | "name": "value to text", 660 | "value": 1 661 | }, 662 | { 663 | "name": "range to text", 664 | "value": 2 665 | } 666 | ], 667 | "maxDataPoints": 100, 668 | "nullPointMode": "connected", 669 | "nullText": null, 670 | "postfix": "", 671 | "postfixFontSize": "50%", 672 | "prefix": "", 673 | "prefixFontSize": "50%", 674 | "rangeMaps": [ 675 | { 676 | "from": "null", 677 | "text": "N/A", 678 | "to": "null" 679 | } 680 | ], 681 | "sparkline": { 682 | "fillColor": "rgba(31, 118, 189, 0.18)", 683 | "full": false, 684 | "lineColor": "rgb(31, 120, 193)", 685 | "show": false 686 | }, 687 | "tableColumn": "", 688 | "targets": [ 689 | { 690 | "expr": "sum((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 691 | "format": "time_series", 692 | "intervalFactor": 2, 693 | "legendFormat": "", 694 | "refId": "A", 695 | "step": 20 696 | } 697 | ], 698 | "thresholds": "10,25,100", 699 | "title": "Available Memory", 700 | "type": "singlestat", 701 | "valueFontSize": "80%", 702 | "valueMaps": [ 703 | { 704 | "op": "=", 705 | "text": "N/A", 706 | "value": "null" 707 | } 708 | ], 709 | "valueName": "avg" 710 | }, 711 | { 712 | "cacheTimeout": null, 713 | "colorBackground": false, 714 | "colorValue": false, 715 | "colors": [ 716 | "rgba(245, 54, 54, 0.9)", 717 | "rgba(237, 129, 40, 0.89)", 718 | "rgba(50, 172, 45, 0.97)" 719 | ], 720 | "datasource": null, 721 | "decimals": 1, 722 | "format": "decbytes", 723 | "gauge": { 724 | "maxValue": 100, 725 | "minValue": 0, 726 | "show": false, 727 | "thresholdLabels": false, 728 | "thresholdMarkers": true 729 | }, 730 | "gridPos": { 731 | "h": 4, 732 | "w": 3, 733 | "x": 7, 734 
| "y": 11 735 | }, 736 | "hideTimeOverride": true, 737 | "id": 22, 738 | "interval": null, 739 | "links": [], 740 | "mappingType": 1, 741 | "mappingTypes": [ 742 | { 743 | "name": "value to text", 744 | "value": 1 745 | }, 746 | { 747 | "name": "range to text", 748 | "value": 2 749 | } 750 | ], 751 | "maxDataPoints": 100, 752 | "nullPointMode": "connected", 753 | "nullText": null, 754 | "postfix": "", 755 | "postfixFontSize": "50%", 756 | "prefix": "", 757 | "prefixFontSize": "50%", 758 | "rangeMaps": [ 759 | { 760 | "from": "null", 761 | "text": "N/A", 762 | "to": "null" 763 | } 764 | ], 765 | "sparkline": { 766 | "fillColor": "rgba(31, 118, 189, 0.18)", 767 | "full": false, 768 | "lineColor": "rgb(31, 120, 193)", 769 | "show": false 770 | }, 771 | "tableColumn": "", 772 | "targets": [ 773 | { 774 | "expr": "sum((node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 775 | "format": "time_series", 776 | "intervalFactor": 2, 777 | "legendFormat": "", 778 | "refId": "A", 779 | "step": 20 780 | } 781 | ], 782 | "thresholds": "", 783 | "timeFrom": null, 784 | "timeShift": null, 785 | "title": "Total swap memory used", 786 | "type": "singlestat", 787 | "valueFontSize": "80%", 788 | "valueMaps": [ 789 | { 790 | "op": "=", 791 | "text": "N/A", 792 | "value": "null" 793 | } 794 | ], 795 | "valueName": "avg" 796 | }, 797 | { 798 | "cacheTimeout": null, 799 | "colorBackground": false, 800 | "colorValue": false, 801 | "colors": [ 802 | "rgba(50, 172, 45, 0.97)", 803 | "rgba(237, 129, 40, 0.89)", 804 | "rgba(245, 54, 54, 0.9)" 805 | ], 806 | "datasource": null, 807 | "format": "percent", 808 | "gauge": { 809 | "maxValue": 100, 810 | "minValue": 0, 811 | "show": true, 812 | "thresholdLabels": false, 813 | "thresholdMarkers": true 814 | }, 815 | "gridPos": { 816 | "h": 4, 817 | "w": 4, 818 | "x": 10, 819 | "y": 11 820 | }, 821 | "id": 23, 822 | "interval": null, 823 | "links": [], 824 | "mappingType": 
1, 825 | "mappingTypes": [ 826 | { 827 | "name": "value to text", 828 | "value": 1 829 | }, 830 | { 831 | "name": "range to text", 832 | "value": 2 833 | } 834 | ], 835 | "maxDataPoints": 100, 836 | "nullPointMode": "connected", 837 | "nullText": null, 838 | "postfix": "", 839 | "postfixFontSize": "50%", 840 | "prefix": "", 841 | "prefixFontSize": "50%", 842 | "rangeMaps": [ 843 | { 844 | "from": "null", 845 | "text": "N/A", 846 | "to": "null" 847 | } 848 | ], 849 | "sparkline": { 850 | "fillColor": "rgba(31, 118, 189, 0.18)", 851 | "full": false, 852 | "lineColor": "rgb(31, 120, 193)", 853 | "show": false 854 | }, 855 | "tableColumn": "", 856 | "targets": [ 857 | { 858 | "expr": "sum(((node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / node_memory_SwapTotal_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 859 | "format": "time_series", 860 | "intervalFactor": 2, 861 | "legendFormat": "", 862 | "refId": "A", 863 | "step": 20 864 | } 865 | ], 866 | "thresholds": "5,10,100", 867 | "title": "Used swap memory", 868 | "type": "singlestat", 869 | "valueFontSize": "80%", 870 | "valueMaps": [ 871 | { 872 | "op": "=", 873 | "text": "N/A", 874 | "value": "null" 875 | } 876 | ], 877 | "valueName": "avg" 878 | }, 879 | { 880 | "cacheTimeout": null, 881 | "colorBackground": false, 882 | "colorValue": false, 883 | "colors": [ 884 | "rgba(50, 172, 45, 0.97)", 885 | "rgba(237, 129, 40, 0.89)", 886 | "rgba(245, 54, 54, 0.9)" 887 | ], 888 | "datasource": null, 889 | "format": "percent", 890 | "gauge": { 891 | "maxValue": 100, 892 | "minValue": 0, 893 | "show": true, 894 | "thresholdLabels": false, 895 | "thresholdMarkers": true 896 | }, 897 | "gridPos": { 898 | "h": 4, 899 | "w": 3, 900 | "x": 14, 901 | "y": 11 902 | }, 903 | "id": 24, 904 | "interval": null, 905 | "links": [], 906 | "mappingType": 1, 907 | "mappingTypes": [ 908 | { 909 | "name": 
"value to text", 910 | "value": 1 911 | }, 912 | { 913 | "name": "range to text", 914 | "value": 2 915 | } 916 | ], 917 | "maxDataPoints": 100, 918 | "nullPointMode": "connected", 919 | "nullText": null, 920 | "postfix": "", 921 | "postfixFontSize": "50%", 922 | "prefix": "", 923 | "prefixFontSize": "50%", 924 | "rangeMaps": [ 925 | { 926 | "from": "null", 927 | "text": "N/A", 928 | "to": "null" 929 | } 930 | ], 931 | "sparkline": { 932 | "fillColor": "rgba(31, 118, 189, 0.18)", 933 | "full": false, 934 | "lineColor": "rgb(31, 120, 193)", 935 | "show": false 936 | }, 937 | "tableColumn": "", 938 | "targets": [ 939 | { 940 | "expr": "sum(((node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 941 | "format": "time_series", 942 | "intervalFactor": 2, 943 | "legendFormat": "", 944 | "refId": "A", 945 | "step": 20 946 | } 947 | ], 948 | "thresholds": "5,10,100", 949 | "title": "Swap used / total RAM memory ratio", 950 | "type": "singlestat", 951 | "valueFontSize": "80%", 952 | "valueMaps": [ 953 | { 954 | "op": "=", 955 | "text": "N/A", 956 | "value": "null" 957 | } 958 | ], 959 | "valueName": "avg" 960 | }, 961 | { 962 | "cacheTimeout": null, 963 | "colorBackground": false, 964 | "colorValue": false, 965 | "colors": [ 966 | "rgba(245, 54, 54, 0.9)", 967 | "rgba(237, 129, 40, 0.89)", 968 | "rgba(50, 172, 45, 0.97)" 969 | ], 970 | "datasource": null, 971 | "decimals": 1, 972 | "format": "decbytes", 973 | "gauge": { 974 | "maxValue": 100, 975 | "minValue": 0, 976 | "show": false, 977 | "thresholdLabels": false, 978 | "thresholdMarkers": true 979 | }, 980 | "gridPos": { 981 | "h": 4, 982 | "w": 3, 983 | "x": 17, 984 | "y": 11 985 | }, 986 | "hideTimeOverride": true, 987 | "id": 9, 988 | "interval": null, 989 | "links": [], 990 | "mappingType": 1, 991 | "mappingTypes": [ 992 
| { 993 | "name": "value to text", 994 | "value": 1 995 | }, 996 | { 997 | "name": "range to text", 998 | "value": 2 999 | } 1000 | ], 1001 | "maxDataPoints": 100, 1002 | "nullPointMode": "connected", 1003 | "nullText": null, 1004 | "postfix": "", 1005 | "postfixFontSize": "50%", 1006 | "prefix": "", 1007 | "prefixFontSize": "50%", 1008 | "rangeMaps": [ 1009 | { 1010 | "from": "null", 1011 | "text": "N/A", 1012 | "to": "null" 1013 | } 1014 | ], 1015 | "sparkline": { 1016 | "fillColor": "rgba(31, 118, 189, 0.18)", 1017 | "full": false, 1018 | "lineColor": "rgb(31, 120, 193)", 1019 | "show": false 1020 | }, 1021 | "tableColumn": "", 1022 | "targets": [ 1023 | { 1024 | "expr": "sum(node_filesystem_size_bytes{mountpoint=\"/\"} * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 1025 | "format": "time_series", 1026 | "intervalFactor": 2, 1027 | "legendFormat": "", 1028 | "refId": "A", 1029 | "step": 20 1030 | } 1031 | ], 1032 | "thresholds": "", 1033 | "timeFrom": null, 1034 | "timeShift": null, 1035 | "title": "Total Disk Space", 1036 | "type": "singlestat", 1037 | "valueFontSize": "80%", 1038 | "valueMaps": [ 1039 | { 1040 | "op": "=", 1041 | "text": "N/A", 1042 | "value": "null" 1043 | } 1044 | ], 1045 | "valueName": "avg" 1046 | }, 1047 | { 1048 | "cacheTimeout": null, 1049 | "colorBackground": false, 1050 | "colorValue": false, 1051 | "colors": [ 1052 | "rgba(245, 54, 54, 0.9)", 1053 | "rgba(237, 129, 40, 0.89)", 1054 | "rgba(50, 172, 45, 0.97)" 1055 | ], 1056 | "datasource": null, 1057 | "format": "percent", 1058 | "gauge": { 1059 | "maxValue": 100, 1060 | "minValue": 0, 1061 | "show": true, 1062 | "thresholdLabels": false, 1063 | "thresholdMarkers": true 1064 | }, 1065 | "gridPos": { 1066 | "h": 4, 1067 | "w": 4, 1068 | "x": 20, 1069 | "y": 11 1070 | }, 1071 | "id": 10, 1072 | "interval": null, 1073 | "links": [], 1074 | "mappingType": 1, 1075 | "mappingTypes": [ 1076 | { 1077 | "name": "value to text", 1078 | "value": 1 1079 | }, 1080 | { 
1081 | "name": "range to text", 1082 | "value": 2 1083 | } 1084 | ], 1085 | "maxDataPoints": 100, 1086 | "nullPointMode": "connected", 1087 | "nullText": null, 1088 | "postfix": "", 1089 | "postfixFontSize": "50%", 1090 | "prefix": "", 1091 | "prefixFontSize": "50%", 1092 | "rangeMaps": [ 1093 | { 1094 | "from": "null", 1095 | "text": "N/A", 1096 | "to": "null" 1097 | } 1098 | ], 1099 | "sparkline": { 1100 | "fillColor": "rgba(31, 118, 189, 0.18)", 1101 | "full": false, 1102 | "lineColor": "rgb(31, 120, 193)", 1103 | "show": false 1104 | }, 1105 | "tableColumn": "", 1106 | "targets": [ 1107 | { 1108 | "expr": "sum((node_filesystem_free_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 1109 | "format": "time_series", 1110 | "intervalFactor": 2, 1111 | "legendFormat": "", 1112 | "refId": "A", 1113 | "step": 20 1114 | } 1115 | ], 1116 | "thresholds": "10,25,100", 1117 | "title": "Available Disk Space", 1118 | "type": "singlestat", 1119 | "valueFontSize": "80%", 1120 | "valueMaps": [ 1121 | { 1122 | "op": "=", 1123 | "text": "N/A", 1124 | "value": "null" 1125 | } 1126 | ], 1127 | "valueName": "avg" 1128 | }, 1129 | { 1130 | "aliasColors": {}, 1131 | "bars": false, 1132 | "dashLength": 10, 1133 | "dashes": false, 1134 | "datasource": null, 1135 | "fill": 1, 1136 | "gridPos": { 1137 | "h": 7, 1138 | "w": 24, 1139 | "x": 0, 1140 | "y": 15 1141 | }, 1142 | "id": 15, 1143 | "legend": { 1144 | "alignAsTable": true, 1145 | "avg": true, 1146 | "current": false, 1147 | "max": true, 1148 | "min": true, 1149 | "rightSide": true, 1150 | "show": true, 1151 | "total": false, 1152 | "values": true 1153 | }, 1154 | "lines": true, 1155 | "linewidth": 1, 1156 | "links": [], 1157 | "nullPointMode": "null", 1158 | "percentage": false, 1159 | "pointradius": 5, 1160 | "points": false, 1161 | "renderer": 
"flot", 1162 | "seriesOverrides": [], 1163 | "spaceLength": 10, 1164 | "stack": true, 1165 | "steppedLine": false, 1166 | "targets": [ 1167 | { 1168 | "expr": "sum((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Cached_bytes - node_memory_Buffers_bytes - node_memory_Slab_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1169 | "format": "time_series", 1170 | "intervalFactor": 2, 1171 | "legendFormat": "Used {{node_name}}", 1172 | "refId": "A", 1173 | "step": 2 1174 | }, 1175 | { 1176 | "expr": "sum(node_memory_Cached * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1177 | "format": "time_series", 1178 | "intervalFactor": 2, 1179 | "legendFormat": "Cached {{node_name}}", 1180 | "refId": "B", 1181 | "step": 2 1182 | } 1183 | ], 1184 | "thresholds": [], 1185 | "timeFrom": null, 1186 | "timeShift": null, 1187 | "title": "Memory usage by Node", 1188 | "tooltip": { 1189 | "shared": true, 1190 | "sort": 0, 1191 | "value_type": "individual" 1192 | }, 1193 | "type": "graph", 1194 | "xaxis": { 1195 | "buckets": null, 1196 | "mode": "time", 1197 | "name": null, 1198 | "show": true, 1199 | "values": [] 1200 | }, 1201 | "yaxes": [ 1202 | { 1203 | "format": "decbytes", 1204 | "label": null, 1205 | "logBase": 1, 1206 | "max": null, 1207 | "min": null, 1208 | "show": true 1209 | }, 1210 | { 1211 | "format": "short", 1212 | "label": null, 1213 | "logBase": 1, 1214 | "max": null, 1215 | "min": null, 1216 | "show": true 1217 | } 1218 | ], 1219 | "yaxis": { 1220 | "align": false, 1221 | "alignLevel": null 1222 | } 1223 | }, 1224 | { 1225 | "aliasColors": {}, 1226 | "bars": false, 1227 | "dashLength": 10, 1228 | "dashes": false, 1229 | "datasource": null, 1230 | "fill": 1, 1231 | "gridPos": { 1232 | "h": 7, 1233 | "w": 24, 1234 | "x": 0, 1235 | "y": 22 1236 | }, 1237 | "id": 21, 1238 | "legend": { 1239 | "alignAsTable": true, 1240 | "avg": true, 1241 | "current": false, 1242 | 
"max": true, 1243 | "min": true, 1244 | "rightSide": true, 1245 | "show": true, 1246 | "total": false, 1247 | "values": true 1248 | }, 1249 | "lines": true, 1250 | "linewidth": 1, 1251 | "links": [], 1252 | "nullPointMode": "null", 1253 | "percentage": false, 1254 | "pointradius": 5, 1255 | "points": false, 1256 | "renderer": "flot", 1257 | "seriesOverrides": [], 1258 | "spaceLength": 10, 1259 | "stack": true, 1260 | "steppedLine": false, 1261 | "targets": [ 1262 | { 1263 | "expr": "sum((node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1264 | "format": "time_series", 1265 | "intervalFactor": 2, 1266 | "legendFormat": "Used {{node_name}}", 1267 | "refId": "A", 1268 | "step": 2 1269 | } 1270 | ], 1271 | "thresholds": [], 1272 | "timeFrom": null, 1273 | "timeShift": null, 1274 | "title": "Swap memory usage by Node", 1275 | "tooltip": { 1276 | "shared": true, 1277 | "sort": 0, 1278 | "value_type": "individual" 1279 | }, 1280 | "type": "graph", 1281 | "xaxis": { 1282 | "buckets": null, 1283 | "mode": "time", 1284 | "name": null, 1285 | "show": true, 1286 | "values": [] 1287 | }, 1288 | "yaxes": [ 1289 | { 1290 | "format": "decbytes", 1291 | "label": null, 1292 | "logBase": 1, 1293 | "max": null, 1294 | "min": "0", 1295 | "show": true 1296 | }, 1297 | { 1298 | "format": "short", 1299 | "label": null, 1300 | "logBase": 1, 1301 | "max": null, 1302 | "min": null, 1303 | "show": true 1304 | } 1305 | ], 1306 | "yaxis": { 1307 | "align": false, 1308 | "alignLevel": null 1309 | } 1310 | }, 1311 | { 1312 | "aliasColors": {}, 1313 | "bars": false, 1314 | "dashLength": 10, 1315 | "dashes": false, 1316 | "datasource": null, 1317 | "decimals": 2, 1318 | "fill": 1, 1319 | "gridPos": { 1320 | "h": 7, 1321 | "w": 24, 1322 | "x": 0, 1323 | "y": 29 1324 | }, 1325 | "id": 16, 1326 | "legend": { 1327 | "alignAsTable": true, 1328 | "avg": true, 1329 | "current": false, 1330 | "max": true, 1331 | 
"min": true, 1332 | "rightSide": true, 1333 | "show": true, 1334 | "total": false, 1335 | "values": true 1336 | }, 1337 | "lines": true, 1338 | "linewidth": 1, 1339 | "links": [], 1340 | "nullPointMode": "null as zero", 1341 | "percentage": false, 1342 | "pointradius": 5, 1343 | "points": false, 1344 | "renderer": "flot", 1345 | "seriesOverrides": [], 1346 | "spaceLength": 10, 1347 | "stack": false, 1348 | "steppedLine": false, 1349 | "targets": [ 1350 | { 1351 | "expr": "sum(irate(node_disk_read_bytes_total[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1352 | "format": "time_series", 1353 | "intervalFactor": 2, 1354 | "legendFormat": "Read {{node_name}}", 1355 | "refId": "A", 1356 | "step": 2 1357 | }, 1358 | { 1359 | "expr": "sum(irate(node_disk_written_bytes_total[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1360 | "format": "time_series", 1361 | "intervalFactor": 2, 1362 | "legendFormat": "Written {{node_name}}", 1363 | "refId": "B", 1364 | "step": 2 1365 | } 1366 | ], 1367 | "thresholds": [], 1368 | "timeFrom": null, 1369 | "timeShift": null, 1370 | "title": "Disk I/O by Node", 1371 | "tooltip": { 1372 | "shared": true, 1373 | "sort": 0, 1374 | "value_type": "individual" 1375 | }, 1376 | "type": "graph", 1377 | "xaxis": { 1378 | "buckets": null, 1379 | "mode": "time", 1380 | "name": null, 1381 | "show": true, 1382 | "values": [] 1383 | }, 1384 | "yaxes": [ 1385 | { 1386 | "format": "Bps", 1387 | "label": null, 1388 | "logBase": 1, 1389 | "max": null, 1390 | "min": null, 1391 | "show": true 1392 | }, 1393 | { 1394 | "format": "short", 1395 | "label": null, 1396 | "logBase": 1, 1397 | "max": null, 1398 | "min": null, 1399 | "show": true 1400 | } 1401 | ], 1402 | "yaxis": { 1403 | "align": false, 1404 | "alignLevel": null 1405 | } 1406 | }, 1407 | { 1408 | "aliasColors": {}, 1409 | "bars": false, 1410 | "dashLength": 10, 1411 | "dashes": false, 1412 | 
"datasource": null, 1413 | "decimals": 2, 1414 | "fill": 1, 1415 | "gridPos": { 1416 | "h": 7, 1417 | "w": 12, 1418 | "x": 0, 1419 | "y": 36 1420 | }, 1421 | "id": 18, 1422 | "legend": { 1423 | "alignAsTable": true, 1424 | "avg": true, 1425 | "current": true, 1426 | "max": true, 1427 | "min": true, 1428 | "rightSide": true, 1429 | "show": false, 1430 | "total": false, 1431 | "values": true 1432 | }, 1433 | "lines": true, 1434 | "linewidth": 1, 1435 | "links": [], 1436 | "nullPointMode": "null as zero", 1437 | "percentage": false, 1438 | "pointradius": 5, 1439 | "points": false, 1440 | "renderer": "flot", 1441 | "seriesOverrides": [], 1442 | "spaceLength": 10, 1443 | "stack": false, 1444 | "steppedLine": false, 1445 | "targets": [ 1446 | { 1447 | "expr": "sum(irate(node_disk_reads_completed_total[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1448 | "format": "time_series", 1449 | "intervalFactor": 2, 1450 | "legendFormat": "Reads {{node_name}}", 1451 | "refId": "A", 1452 | "step": 2 1453 | }, 1454 | { 1455 | "expr": "sum(irate(node_disk_writes_completed_total[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1456 | "format": "time_series", 1457 | "intervalFactor": 2, 1458 | "legendFormat": "Writes {{node_name}}", 1459 | "refId": "B", 1460 | "step": 2 1461 | } 1462 | ], 1463 | "thresholds": [], 1464 | "timeFrom": null, 1465 | "timeShift": null, 1466 | "title": "IOPS by Node", 1467 | "tooltip": { 1468 | "shared": true, 1469 | "sort": 0, 1470 | "value_type": "individual" 1471 | }, 1472 | "type": "graph", 1473 | "xaxis": { 1474 | "buckets": null, 1475 | "mode": "time", 1476 | "name": null, 1477 | "show": true, 1478 | "values": [] 1479 | }, 1480 | "yaxes": [ 1481 | { 1482 | "format": "short", 1483 | "label": null, 1484 | "logBase": 1, 1485 | "max": null, 1486 | "min": null, 1487 | "show": true 1488 | }, 1489 | { 1490 | "format": "short", 1491 | "label": null, 1492 | 
"logBase": 1, 1493 | "max": null, 1494 | "min": null, 1495 | "show": true 1496 | } 1497 | ], 1498 | "yaxis": { 1499 | "align": false, 1500 | "alignLevel": null 1501 | } 1502 | }, 1503 | { 1504 | "aliasColors": {}, 1505 | "bars": false, 1506 | "dashLength": 10, 1507 | "dashes": false, 1508 | "datasource": null, 1509 | "decimals": 2, 1510 | "fill": 1, 1511 | "gridPos": { 1512 | "h": 7, 1513 | "w": 12, 1514 | "x": 12, 1515 | "y": 36 1516 | }, 1517 | "id": 19, 1518 | "legend": { 1519 | "alignAsTable": true, 1520 | "avg": true, 1521 | "current": true, 1522 | "hideEmpty": true, 1523 | "hideZero": true, 1524 | "max": true, 1525 | "min": true, 1526 | "rightSide": true, 1527 | "show": false, 1528 | "total": false, 1529 | "values": true 1530 | }, 1531 | "lines": true, 1532 | "linewidth": 1, 1533 | "links": [], 1534 | "nullPointMode": "null as zero", 1535 | "percentage": false, 1536 | "pointradius": 5, 1537 | "points": false, 1538 | "renderer": "flot", 1539 | "seriesOverrides": [], 1540 | "spaceLength": 10, 1541 | "stack": false, 1542 | "steppedLine": false, 1543 | "targets": [ 1544 | { 1545 | "expr": "(avg(irate(node_cpu_seconds_total{mode=\"iowait\"}[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) by (node_name))", 1546 | "format": "time_series", 1547 | "intervalFactor": 2, 1548 | "legendFormat": "{{node_name}}", 1549 | "refId": "A", 1550 | "step": 2 1551 | } 1552 | ], 1553 | "thresholds": [], 1554 | "timeFrom": null, 1555 | "timeShift": null, 1556 | "title": "CPU IO Wait by Node", 1557 | "tooltip": { 1558 | "shared": true, 1559 | "sort": 2, 1560 | "value_type": "individual" 1561 | }, 1562 | "type": "graph", 1563 | "xaxis": { 1564 | "buckets": null, 1565 | "mode": "time", 1566 | "name": null, 1567 | "show": true, 1568 | "values": [] 1569 | }, 1570 | "yaxes": [ 1571 | { 1572 | "format": "percent", 1573 | "label": null, 1574 | "logBase": 1, 1575 | "max": null, 1576 | "min": null, 1577 | "show": true 1578 | }, 1579 | { 1580 | "format": 
"short", 1581 | "label": null, 1582 | "logBase": 1, 1583 | "max": null, 1584 | "min": null, 1585 | "show": true 1586 | } 1587 | ], 1588 | "yaxis": { 1589 | "align": false, 1590 | "alignLevel": null 1591 | } 1592 | }, 1593 | { 1594 | "aliasColors": {}, 1595 | "bars": false, 1596 | "dashLength": 10, 1597 | "dashes": false, 1598 | "datasource": null, 1599 | "decimals": 0, 1600 | "fill": 3, 1601 | "gridPos": { 1602 | "h": 7, 1603 | "w": 18, 1604 | "x": 0, 1605 | "y": 43 1606 | }, 1607 | "id": 12, 1608 | "legend": { 1609 | "alignAsTable": true, 1610 | "avg": false, 1611 | "current": true, 1612 | "hideEmpty": true, 1613 | "hideZero": true, 1614 | "max": false, 1615 | "min": false, 1616 | "rightSide": true, 1617 | "show": true, 1618 | "sort": "current", 1619 | "sortDesc": true, 1620 | "total": false, 1621 | "values": true 1622 | }, 1623 | "lines": true, 1624 | "linewidth": 1, 1625 | "links": [], 1626 | "nullPointMode": "null", 1627 | "percentage": false, 1628 | "pointradius": 5, 1629 | "points": false, 1630 | "renderer": "flot", 1631 | "seriesOverrides": [], 1632 | "spaceLength": 10, 1633 | "stack": true, 1634 | "steppedLine": false, 1635 | "targets": [ 1636 | { 1637 | "expr": "sum(rate(container_last_seen{container_label_com_docker_swarm_node_id=~\"$node_id\"}[5m])) by (container_label_com_docker_swarm_service_name)", 1638 | "format": "time_series", 1639 | "intervalFactor": 10, 1640 | "legendFormat": "{{ container_label_com_docker_swarm_service_name }}", 1641 | "refId": "A", 1642 | "step": 10 1643 | } 1644 | ], 1645 | "thresholds": [], 1646 | "timeFrom": null, 1647 | "timeShift": null, 1648 | "title": "Running Containers by Service", 1649 | "tooltip": { 1650 | "shared": true, 1651 | "sort": 2, 1652 | "value_type": "individual" 1653 | }, 1654 | "type": "graph", 1655 | "xaxis": { 1656 | "buckets": null, 1657 | "mode": "time", 1658 | "name": null, 1659 | "show": true, 1660 | "values": [] 1661 | }, 1662 | "yaxes": [ 1663 | { 1664 | "format": "short", 1665 | "label": null, 
1666 | "logBase": 1, 1667 | "max": null, 1668 | "min": null, 1669 | "show": true 1670 | }, 1671 | { 1672 | "format": "short", 1673 | "label": null, 1674 | "logBase": 1, 1675 | "max": null, 1676 | "min": null, 1677 | "show": true 1678 | } 1679 | ], 1680 | "yaxis": { 1681 | "align": false, 1682 | "alignLevel": null 1683 | } 1684 | }, 1685 | { 1686 | "cacheTimeout": null, 1687 | "colorBackground": false, 1688 | "colorValue": false, 1689 | "colors": [ 1690 | "rgba(245, 54, 54, 0.9)", 1691 | "rgba(237, 129, 40, 0.89)", 1692 | "rgba(50, 172, 45, 0.97)" 1693 | ], 1694 | "datasource": null, 1695 | "format": "none", 1696 | "gauge": { 1697 | "maxValue": 100, 1698 | "minValue": 0, 1699 | "show": false, 1700 | "thresholdLabels": false, 1701 | "thresholdMarkers": true 1702 | }, 1703 | "gridPos": { 1704 | "h": 7, 1705 | "w": 6, 1706 | "x": 18, 1707 | "y": 43 1708 | }, 1709 | "id": 7, 1710 | "interval": null, 1711 | "links": [], 1712 | "mappingType": 1, 1713 | "mappingTypes": [ 1714 | { 1715 | "name": "value to text", 1716 | "value": 1 1717 | }, 1718 | { 1719 | "name": "range to text", 1720 | "value": 2 1721 | } 1722 | ], 1723 | "maxDataPoints": 100, 1724 | "nullPointMode": "connected", 1725 | "nullText": null, 1726 | "postfix": "", 1727 | "postfixFontSize": "50%", 1728 | "prefix": "", 1729 | "prefixFontSize": "50%", 1730 | "rangeMaps": [ 1731 | { 1732 | "from": "null", 1733 | "text": "N/A", 1734 | "to": "null" 1735 | } 1736 | ], 1737 | "sparkline": { 1738 | "fillColor": "rgba(31, 118, 189, 0.18)", 1739 | "full": false, 1740 | "lineColor": "rgb(31, 120, 193)", 1741 | "show": true 1742 | }, 1743 | "tableColumn": "", 1744 | "targets": [ 1745 | { 1746 | "expr": "count(rate(container_last_seen{container_label_com_docker_swarm_node_id=~\"$node_id\"}[5m])) ", 1747 | "format": "time_series", 1748 | "intervalFactor": 2, 1749 | "refId": "A", 1750 | "step": 20 1751 | } 1752 | ], 1753 | "thresholds": "", 1754 | "title": "Total Containers", 1755 | "type": "singlestat", 1756 | 
"valueFontSize": "80%", 1757 | "valueMaps": [ 1758 | { 1759 | "op": "=", 1760 | "text": "N/A", 1761 | "value": "null" 1762 | } 1763 | ], 1764 | "valueName": "avg" 1765 | }, 1766 | { 1767 | "aliasColors": {}, 1768 | "bars": false, 1769 | "dashLength": 10, 1770 | "dashes": false, 1771 | "datasource": null, 1772 | "fill": 1, 1773 | "gridPos": { 1774 | "h": 7, 1775 | "w": 24, 1776 | "x": 0, 1777 | "y": 50 1778 | }, 1779 | "id": 17, 1780 | "legend": { 1781 | "alignAsTable": true, 1782 | "avg": true, 1783 | "current": false, 1784 | "max": true, 1785 | "min": true, 1786 | "rightSide": true, 1787 | "show": true, 1788 | "total": false, 1789 | "values": true 1790 | }, 1791 | "lines": true, 1792 | "linewidth": 1, 1793 | "links": [], 1794 | "nullPointMode": "null", 1795 | "percentage": false, 1796 | "pointradius": 5, 1797 | "points": false, 1798 | "renderer": "flot", 1799 | "seriesOverrides": [], 1800 | "spaceLength": 10, 1801 | "stack": false, 1802 | "steppedLine": false, 1803 | "targets": [ 1804 | { 1805 | "expr": "sum(rate(container_network_receive_bytes_total{container_label_com_docker_swarm_node_id=~\"$node_id\"}[$interval]) * on(container_label_com_docker_swarm_node_id) group_left(node_name) node_meta) by (node_name)", 1806 | "format": "time_series", 1807 | "intervalFactor": 2, 1808 | "legendFormat": "IN {{node_name}}", 1809 | "refId": "A", 1810 | "step": 2 1811 | }, 1812 | { 1813 | "expr": "- sum(rate(container_network_transmit_bytes_total{container_label_com_docker_swarm_node_id=~\"$node_id\"}[$interval]) * on(container_label_com_docker_swarm_node_id) group_left(node_name) node_meta) by (node_name)", 1814 | "format": "time_series", 1815 | "hide": false, 1816 | "intervalFactor": 2, 1817 | "legendFormat": "OUT {{node_name}}", 1818 | "metric": "", 1819 | "refId": "B", 1820 | "step": 2 1821 | } 1822 | ], 1823 | "thresholds": [], 1824 | "timeFrom": null, 1825 | "timeShift": null, 1826 | "title": "Containers Network Traffic by Node", 1827 | "tooltip": { 1828 | "shared": 
true, 1829 | "sort": 0, 1830 | "value_type": "individual" 1831 | }, 1832 | "type": "graph", 1833 | "xaxis": { 1834 | "buckets": null, 1835 | "mode": "time", 1836 | "name": null, 1837 | "show": true, 1838 | "values": [] 1839 | }, 1840 | "yaxes": [ 1841 | { 1842 | "format": "Bps", 1843 | "label": null, 1844 | "logBase": 1, 1845 | "max": null, 1846 | "min": null, 1847 | "show": true 1848 | }, 1849 | { 1850 | "format": "short", 1851 | "label": null, 1852 | "logBase": 1, 1853 | "max": null, 1854 | "min": null, 1855 | "show": true 1856 | } 1857 | ], 1858 | "yaxis": { 1859 | "align": false, 1860 | "alignLevel": null 1861 | } 1862 | }, 1863 | { 1864 | "columns": [], 1865 | "datasource": null, 1866 | "fontSize": "100%", 1867 | "gridPos": { 1868 | "h": 7, 1869 | "w": 24, 1870 | "x": 0, 1871 | "y": 57 1872 | }, 1873 | "hideTimeOverride": true, 1874 | "id": 20, 1875 | "links": [], 1876 | "pageSize": null, 1877 | "scroll": true, 1878 | "showHeader": true, 1879 | "sort": { 1880 | "col": 0, 1881 | "desc": true 1882 | }, 1883 | "styles": [ 1884 | { 1885 | "alias": "Time", 1886 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 1887 | "pattern": "Time", 1888 | "type": "hidden" 1889 | }, 1890 | { 1891 | "alias": "", 1892 | "colorMode": null, 1893 | "colors": [ 1894 | "rgba(245, 54, 54, 0.9)", 1895 | "rgba(237, 129, 40, 0.89)", 1896 | "rgba(50, 172, 45, 0.97)" 1897 | ], 1898 | "decimals": 2, 1899 | "pattern": "/.*/", 1900 | "thresholds": [], 1901 | "type": "number", 1902 | "unit": "short" 1903 | } 1904 | ], 1905 | "targets": [ 1906 | { 1907 | "expr": "sum(node_meta) by (node_id, node_name, instance)", 1908 | "format": "table", 1909 | "instant": true, 1910 | "intervalFactor": 2, 1911 | "refId": "A", 1912 | "step": 2 1913 | } 1914 | ], 1915 | "timeFrom": "1s", 1916 | "title": "Cluster members", 1917 | "transform": "table", 1918 | "type": "table" 1919 | } 1920 | ], 1921 | "refresh": "30s", 1922 | "schemaVersion": 16, 1923 | "style": "dark", 1924 | "tags": [ 1925 | "swarmprom" 1926 | ], 1927 | 
"templating": { 1928 | "list": [ 1929 | { 1930 | "allValue": ".+", 1931 | "current": { 1932 | "text": "All", 1933 | "value": "$__all" 1934 | }, 1935 | "datasource": "Prometheus", 1936 | "hide": 0, 1937 | "includeAll": true, 1938 | "label": "Swarm Node", 1939 | "multi": false, 1940 | "name": "node_id", 1941 | "options": [], 1942 | "query": "node_meta", 1943 | "refresh": 1, 1944 | "regex": "/node_id=\"([^\"]+)\"/", 1945 | "skipUrlSync": false, 1946 | "sort": 0, 1947 | "tagValuesQuery": "label_values({node_id=\"$tag\"},node_name)", 1948 | "tags": [ 1949 | "ofdocker", 1950 | "ofmon" 1951 | ], 1952 | "tagsQuery": "label_values(node_meta, node_name)", 1953 | "type": "query", 1954 | "useTags": true 1955 | }, 1956 | { 1957 | "auto": true, 1958 | "auto_count": 30, 1959 | "auto_min": "30s", 1960 | "current": { 1961 | "text": "auto", 1962 | "value": "$__auto_interval_interval" 1963 | }, 1964 | "hide": 0, 1965 | "label": "Interval", 1966 | "name": "interval", 1967 | "options": [ 1968 | { 1969 | "selected": true, 1970 | "text": "auto", 1971 | "value": "$__auto_interval_interval" 1972 | }, 1973 | { 1974 | "selected": false, 1975 | "text": "1m", 1976 | "value": "1m" 1977 | }, 1978 | { 1979 | "selected": false, 1980 | "text": "10m", 1981 | "value": "10m" 1982 | }, 1983 | { 1984 | "selected": false, 1985 | "text": "30m", 1986 | "value": "30m" 1987 | }, 1988 | { 1989 | "selected": false, 1990 | "text": "1h", 1991 | "value": "1h" 1992 | }, 1993 | { 1994 | "selected": false, 1995 | "text": "6h", 1996 | "value": "6h" 1997 | }, 1998 | { 1999 | "selected": false, 2000 | "text": "12h", 2001 | "value": "12h" 2002 | }, 2003 | { 2004 | "selected": false, 2005 | "text": "1d", 2006 | "value": "1d" 2007 | }, 2008 | { 2009 | "selected": false, 2010 | "text": "7d", 2011 | "value": "7d" 2012 | }, 2013 | { 2014 | "selected": false, 2015 | "text": "14d", 2016 | "value": "14d" 2017 | }, 2018 | { 2019 | "selected": false, 2020 | "text": "30d", 2021 | "value": "30d" 2022 | } 2023 | ], 2024 | "query": 
"1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", 2025 | "refresh": 2, 2026 | "skipUrlSync": false, 2027 | "type": "interval" 2028 | } 2029 | ] 2030 | }, 2031 | "time": { 2032 | "from": "now-15m", 2033 | "to": "now" 2034 | }, 2035 | "timepicker": { 2036 | "refresh_intervals": [ 2037 | "5s", 2038 | "10s", 2039 | "30s", 2040 | "1m", 2041 | "5m", 2042 | "15m", 2043 | "30m", 2044 | "1h", 2045 | "2h", 2046 | "1d" 2047 | ], 2048 | "time_options": [ 2049 | "5m", 2050 | "15m", 2051 | "1h", 2052 | "6h", 2053 | "12h", 2054 | "24h", 2055 | "2d", 2056 | "7d", 2057 | "30d" 2058 | ] 2059 | }, 2060 | "timezone": "", 2061 | "title": "Docker Swarm Nodes", 2062 | "uid": "BPlb-Sgik", 2063 | "version": 3 2064 | } 2065 | --------------------------------------------------------------------------------