├── .gitattributes ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── alertmanager ├── Dockerfile └── conf │ ├── alertmanager.yml │ └── docker-entrypoint.sh ├── caddy └── Caddyfile ├── docker-compose.traefik.yml ├── docker-compose.yml ├── dockerd-exporter └── Caddyfile ├── grafana ├── .dockerignore ├── Dockerfile ├── dashboards │ ├── swarmprom-nodes-dash.json │ ├── swarmprom-prometheus-dash.json │ └── swarmprom-services-dash.json ├── datasources │ └── prometheus.yaml ├── screens │ ├── alertmanager-slack-v2.png │ ├── swarmprom-nodes-dash-v3.png │ ├── swarmprom-prometheus-dash-v3.png │ ├── swarmprom-services-dash-v3.png │ ├── unsee.png │ ├── weave-scope-hosts-v2.png │ └── weave-scope.png └── swarmprom_dashboards.yml ├── node-exporter ├── Dockerfile └── conf │ └── docker-entrypoint.sh ├── prometheus ├── Dockerfile ├── conf │ ├── docker-entrypoint.sh │ ├── prometheus.yml │ └── weave-cortex.yml └── rules │ ├── swarm_node.rules.yml │ └── swarm_task.rules.yml ├── test-compose.yml └── weave-compose.yml /.gitattributes: -------------------------------------------------------------------------------- 1 | # Denote all files that are truly binary and should not be modified. 
2 | *.png binary 3 | *.jpg binary 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.dll 4 | *.so 5 | *.dylib 6 | 7 | # Test binary, build with `go test -c` 8 | *.test 9 | 10 | # Output of the go coverage tool, specifically when used with LiteIDE 11 | *.out 12 | 13 | # Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736 14 | .glide/ 15 | 16 | .idea/ 17 | .DS_Store 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | 3 | services: 4 | - docker 5 | 6 | before_install: 7 | - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 8 | - sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 9 | - sudo apt-get update 10 | - sudo apt-get -y install docker-ce 11 | - sudo service docker restart 12 | 13 | script: 14 | - cd prometheus && docker build -t stefanprodan/swarmprom-prometheus:$TRAVIS_BUILD_NUMBER . 15 | - cd .. && cd node-exporter && docker build -t stefanprodan/swarmprom-node-exporter:$TRAVIS_BUILD_NUMBER . 16 | - cd .. && cd alertmanager && docker build -t stefanprodan/swarmprom-alertmanager:$TRAVIS_BUILD_NUMBER . 17 | - cd .. && cd grafana && docker build -t stefanprodan/swarmprom-grafana:$TRAVIS_BUILD_NUMBER . 
18 | 19 | after_success: 20 | - if [ -z "$DOCKER_USER" ]; then 21 | echo "PR build, skipping Docker Hub push"; 22 | else 23 | docker login -u "$DOCKER_USER" -p "$DOCKER_PASS"; 24 | docker tag stefanprodan/swarmprom-prometheus:$TRAVIS_BUILD_NUMBER stefanprodan/swarmprom-prometheus:v2.5.0; 25 | docker push stefanprodan/swarmprom-prometheus:v2.5.0; 26 | docker tag stefanprodan/swarmprom-node-exporter:$TRAVIS_BUILD_NUMBER stefanprodan/swarmprom-node-exporter:v0.16.0; 27 | docker push stefanprodan/swarmprom-node-exporter:v0.16.0; 28 | docker tag stefanprodan/swarmprom-alertmanager:$TRAVIS_BUILD_NUMBER stefanprodan/swarmprom-alertmanager:v0.15.3; 29 | docker push stefanprodan/swarmprom-alertmanager:v0.15.3; 30 | docker tag stefanprodan/swarmprom-grafana:$TRAVIS_BUILD_NUMBER stefanprodan/swarmprom-grafana:5.3.4; 31 | docker push stefanprodan/swarmprom-grafana:5.3.4; 32 | fi 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Stefan Prodan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # swarmprom 2 | 3 | Swarmprom is a starter kit for Docker Swarm monitoring with [Prometheus](https://prometheus.io/), 4 | [Grafana](http://grafana.org/), 5 | [cAdvisor](https://github.com/google/cadvisor), 6 | [Node Exporter](https://github.com/prometheus/node_exporter), 7 | [Alert Manager](https://github.com/prometheus/alertmanager) 8 | and [Unsee](https://github.com/cloudflare/unsee). 9 | 10 | ## Install 11 | 12 | Clone this repository and run the monitoring stack: 13 | 14 | ```bash 15 | $ git clone https://github.com/stefanprodan/swarmprom.git 16 | $ cd swarmprom 17 | 18 | ADMIN_USER=admin \ 19 | ADMIN_PASSWORD=admin \ 20 | SLACK_URL=https://hooks.slack.com/services/TOKEN \ 21 | SLACK_CHANNEL=devops-alerts \ 22 | SLACK_USER=alertmanager \ 23 | docker stack deploy -c docker-compose.yml mon 24 | ``` 25 | 26 | Prerequisites: 27 | 28 | * Docker CE 17.09.0-ce or Docker EE 17.06.2-ee-3 29 | * Swarm cluster with one manager and a worker node 30 | * Docker engine experimental enabled and metrics address set to `0.0.0.0:9323` 31 | 32 | Services: 33 | 34 | * prometheus (metrics database) `http://<swarm-ip>:9090` 35 | * grafana (visualize metrics) `http://<swarm-ip>:3000` 36 | * node-exporter (host metrics collector) 37 | * cadvisor (containers metrics collector) 38 | * dockerd-exporter (Docker daemon metrics collector, requires Docker experimental metrics-addr to be enabled) 39 | * alertmanager (alerts dispatcher) `http://<swarm-ip>:9093` 40 | * unsee (alert manager dashboard) `http://<swarm-ip>:9094` 41 | * caddy (reverse proxy and basic auth provider for 
prometheus, alertmanager and unsee) 42 | 43 | 44 | ## Alternative install with Traefik and HTTPS 45 | 46 | If you have a Docker Swarm cluster with a global Traefik set up as described in [DockerSwarm.rocks](https://dockerswarm.rocks), you can deploy Swarmprom integrated with that global Traefik proxy. 47 | 48 | This way, each Swarmprom service will have its own domain, and each of them will be served using HTTPS, with certificates generated (and renewed) automatically. 49 | 50 | ### Requisites 51 | 52 | These instructions assume you already have Traefik set up following that guide above, in short: 53 | 54 | * With automatic HTTPS certificate generation. 55 | * A Docker Swarm network `traefik-public`. 56 | * Filtering to only serve containers with a label `traefik.constraint-label=traefik-public`. 57 | 58 | ### Instructions 59 | 60 | * Clone this repository and enter into the directory: 61 | 62 | ```bash 63 | $ git clone https://github.com/stefanprodan/swarmprom.git 64 | $ cd swarmprom 65 | ``` 66 | 67 | * Set and export an `ADMIN_USER` environment variable: 68 | 69 | ```bash 70 | export ADMIN_USER=admin 71 | ``` 72 | 73 | * Set and export an `ADMIN_PASSWORD` environment variable: 74 | 75 | 76 | ```bash 77 | export ADMIN_PASSWORD=changethis 78 | ``` 79 | 80 | * Set and export a hashed version of the `ADMIN_PASSWORD` using `openssl`, it will be used by Traefik's HTTP Basic Auth for most of the services: 81 | 82 | ```bash 83 | export HASHED_PASSWORD=$(openssl passwd -apr1 $ADMIN_PASSWORD) 84 | ``` 85 | 86 | * You can check the contents with: 87 | 88 | ```bash 89 | echo $HASHED_PASSWORD 90 | ``` 91 | 92 | it will look like: 93 | 94 | ``` 95 | $apr1$89eqM5Ro$CxaFELthUKV21DpI3UTQO. 
96 | ``` 97 | 98 | * Create and export an environment variable `DOMAIN`, e.g.: 99 | 100 | ```bash 101 | export DOMAIN=example.com 102 | ``` 103 | 104 | and make sure that the following sub-domains point to your Docker Swarm cluster IPs: 105 | 106 | * `grafana.example.com` 107 | * `alertmanager.example.com` 108 | * `unsee.example.com` 109 | * `prometheus.example.com` 110 | 111 | (and replace `example.com` with your actual domain). 112 | 113 | **Note**: You can also use a subdomain, like `swarmprom.example.com`. Just make sure that the subdomains point to (at least one of) your cluster IPs. Or set up a wildcard subdomain (`*`). 114 | 115 | * If you are using Slack and want to integrate it, set the following environment variables: 116 | 117 | ```bash 118 | export SLACK_URL=https://hooks.slack.com/services/TOKEN 119 | export SLACK_CHANNEL=devops-alerts 120 | export SLACK_USER=alertmanager 121 | ``` 122 | 123 | **Note**: by using `export` when declaring all the environment variables above, the next command will be able to use them. 124 | 125 | * Deploy the Traefik version of the stack: 126 | 127 | 128 | ```bash 129 | docker stack deploy -c docker-compose.traefik.yml swarmprom 130 | ``` 131 | 132 | To test it, go to each URL: 133 | 134 | * `https://grafana.example.com` 135 | * `https://alertmanager.example.com` 136 | * `https://unsee.example.com` 137 | * `https://prometheus.example.com` 138 | 139 | 140 | ## Setup Grafana 141 | 142 | Navigate to `http://<swarm-ip>:3000` and log in with user ***admin*** password ***admin***. 143 | You can change the credentials in the compose file or 144 | by supplying the `ADMIN_USER` and `ADMIN_PASSWORD` environment variables at stack deploy. 
145 | 146 | Swarmprom Grafana is preconfigured with three dashboards and Prometheus as the default data source: 147 | 148 | * Name: Prometheus 149 | * Type: Prometheus 150 | * Url: http://prometheus:9090 151 | * Access: proxy 152 | 153 | After you log in, click on the home drop down, in the left upper corner and you'll see the dashboards there. 154 | 155 | ***Docker Swarm Nodes Dashboard*** 156 | 157 | ![Nodes](https://raw.githubusercontent.com/stefanprodan/swarmprom/master/grafana/screens/swarmprom-nodes-dash-v3.png) 158 | 159 | URL: `http://<swarm-ip>:3000/dashboard/db/docker-swarm-nodes` 160 | 161 | This dashboard shows key metrics for monitoring the resource usage of your Swarm nodes and can be filtered by node ID: 162 | 163 | * Cluster up-time, number of nodes, number of CPUs, CPU idle gauge 164 | * System load average graph, CPU usage graph by node 165 | * Total memory, available memory gauge, total disk space and available storage gauge 166 | * Memory usage graph by node (used and cached) 167 | * I/O usage graph (read and write Bps) 168 | * IOPS usage (read and write operations per second) and CPU IOWait 169 | * Running containers graph by Swarm service and node 170 | * Network usage graph (inbound Bps, outbound Bps) 171 | * Nodes list (instance, node ID, node name) 172 | 173 | ***Docker Swarm Services Dashboard*** 174 | 175 | ![Nodes](https://raw.githubusercontent.com/stefanprodan/swarmprom/master/grafana/screens/swarmprom-services-dash-v3.png) 176 | 177 | URL: `http://<swarm-ip>:3000/dashboard/db/docker-swarm-services` 178 | 179 | This dashboard shows key metrics for monitoring the resource usage of your Swarm stacks and services, can be filtered by node ID: 180 | 181 | * Number of nodes, stacks, services and running containers 182 | * Swarm tasks graph by service name 183 | * Health check graph (total health checks and failed checks) 184 | * CPU usage graph by service and by container (top 10) 185 | * Memory usage graph by service and by container (top 10) 186 | * Network usage 
graph by service (received and transmitted) 187 | * Cluster network traffic and IOPS graphs 188 | * Docker engine container and network actions by node 189 | * Docker engine list (version, node id, OS, kernel, graph driver) 190 | 191 | ***Prometheus Stats Dashboard*** 192 | 193 | ![Nodes](https://raw.githubusercontent.com/stefanprodan/swarmprom/master/grafana/screens/swarmprom-prometheus-dash-v3.png) 194 | 195 | URL: `http://:3000/dashboard/db/prometheus` 196 | 197 | * Uptime, local storage memory chunks and series 198 | * CPU usage graph 199 | * Memory usage graph 200 | * Chunks to persist and persistence urgency graphs 201 | * Chunks ops and checkpoint duration graphs 202 | * Target scrapes, rule evaluation duration, samples ingested rate and scrape duration graphs 203 | 204 | 205 | ## Prometheus service discovery 206 | 207 | In order to collect metrics from Swarm nodes you need to deploy the exporters on each server. 208 | Using global services you don't have to manually deploy the exporters. When you scale up your 209 | cluster, Swarm will launch a cAdvisor, node-exporter and dockerd-exporter instance on the newly created nodes. 210 | All you need is an automated way for Prometheus to reach these instances. 211 | 212 | Running Prometheus on the same overlay network as the exporter services allows you to use the DNS service 213 | discovery. Using the exporters service name, you can configure DNS discovery: 214 | 215 | ```yaml 216 | scrape_configs: 217 | - job_name: 'node-exporter' 218 | dns_sd_configs: 219 | - names: 220 | - 'tasks.node-exporter' 221 | type: 'A' 222 | port: 9100 223 | - job_name: 'cadvisor' 224 | dns_sd_configs: 225 | - names: 226 | - 'tasks.cadvisor' 227 | type: 'A' 228 | port: 8080 229 | - job_name: 'dockerd-exporter' 230 | dns_sd_configs: 231 | - names: 232 | - 'tasks.dockerd-exporter' 233 | type: 'A' 234 | port: 9323 235 | ``` 236 | 237 | When Prometheus runs the DNS lookup, Docker Swarm will return a list of IPs for each task. 
238 | Using these IPs, Prometheus will bypass the Swarm load-balancer and will be able to scrape each exporter 239 | instance. 240 | 241 | The problem with this approach is that you will not be able to tell which exporter runs on which node. 242 | Your Swarm nodes' real IPs are different from the exporters IPs since exporters IPs are dynamically 243 | assigned by Docker and are part of the overlay network. 244 | Swarm doesn't provide any records for the tasks DNS, besides the overlay IP. 245 | If Swarm provides SRV records with the nodes hostname or IP, you can re-label the source 246 | and overwrite the overlay IP with the real IP. 247 | 248 | In order to tell which host a node-exporter instance is running, I had to create a prom file inside 249 | the node-exporter containing the hostname and the Docker Swarm node ID. 250 | 251 | When a node-exporter container starts `node-meta.prom` is generated with the following content: 252 | 253 | ```bash 254 | "node_meta{node_id=\"$NODE_ID\", node_name=\"$NODE_NAME\"} 1" 255 | ``` 256 | 257 | The node ID value is supplied via `{{.Node.ID}}` and the node name is extracted from the `/etc/hostname` 258 | file that is mounted inside the node-exporter container. 259 | 260 | ```yaml 261 | node-exporter: 262 | image: stefanprodan/swarmprom-node-exporter 263 | environment: 264 | - NODE_ID={{.Node.ID}} 265 | volumes: 266 | - /etc/hostname:/etc/nodename 267 | command: 268 | - '-collector.textfile.directory=/etc/node-exporter/' 269 | ``` 270 | 271 | Using the textfile command, you can instruct node-exporter to collect the `node_meta` metric. 272 | Now that you have a metric containing the Docker Swarm node ID and name, you can use it in promql queries. 
273 | 274 | Let's say you want to find the available memory on each node, normally you would write something like this: 275 | 276 | ``` 277 | sum(node_memory_MemAvailable) by (instance) 278 | 279 | {instance="10.0.0.5:9100"} 889450496 280 | {instance="10.0.0.13:9100"} 1404162048 281 | {instance="10.0.0.15:9100"} 1406574592 282 | ``` 283 | 284 | The above result is not very helpful since you can't tell what Swarm node is behind the instance IP. 285 | So let's write that query taking into account the node_meta metric: 286 | 287 | ```sql 288 | sum(node_memory_MemAvailable * on(instance) group_left(node_id, node_name) node_meta) by (node_id, node_name) 289 | 290 | {node_id="wrdvtftteo0uaekmdq4dxrn14",node_name="swarm-manager-1"} 889450496 291 | {node_id="moggm3uaq8tax9ptr1if89pi7",node_name="swarm-worker-1"} 1404162048 292 | {node_id="vkdfx99mm5u4xl2drqhnwtnsv",node_name="swarm-worker-2"} 1406574592 293 | ``` 294 | 295 | This is much better. Instead of overlay IPs, now I can see the actual Docker Swarm node IDs and hostnames. Knowing the hostname of your nodes is useful for alerting as well. 296 | 297 | You can define an alert when available memory reaches 10%. You will also receive the hostname in the alert message 298 | and not some overlay IP that you can't correlate to an infrastructure item. 299 | 300 | Maybe you are wondering why you need the node ID if you have the hostname. The node ID will help you match 301 | node-exporter instances to cAdvisor instances. All metrics exported by cAdvisor have a label named `container_label_com_docker_swarm_node_id`, 302 | and this label can be used to filter containers metrics by Swarm nodes. 303 | 304 | Let's write a query to find out how many containers are running on a Swarm node. 305 | Knowing from the `node_meta` metric all the nodes IDs you can define a filter with them in Grafana. 
306 | Assuming the filter is `$node_id` the container count query should look like this: 307 | 308 | ``` 309 | count(rate(container_last_seen{container_label_com_docker_swarm_node_id=~"$node_id"}[5m])) 310 | ``` 311 | 312 | Another use case for node ID is filtering the metrics provided by the Docker engine daemon. 313 | Docker engine doesn't have a label with the node ID attached on every metric, but there is a `swarm_node_info` 314 | metric that has this label. If you want to find out the number of failed health checks on a Swarm node 315 | you would write a query like this: 316 | 317 | ``` 318 | sum(engine_daemon_health_checks_failed_total * on(instance) group_left(node_id) swarm_node_info{node_id=~"$node_id"}) 319 | ``` 320 | 321 | For now the engine metrics are still experimental. If you want to use dockerd-exporter you have to enable 322 | the experimental feature and set the metrics address to `0.0.0.0:9323`. 323 | 324 | If you are running Docker with systemd create or edit 325 | /etc/systemd/system/docker.service.d/docker.conf file like so: 326 | 327 | ``` 328 | [Service] 329 | ExecStart= 330 | ExecStart=/usr/bin/dockerd \ 331 | --storage-driver=overlay2 \ 332 | --dns 8.8.4.4 --dns 8.8.8.8 \ 333 | --experimental=true \ 334 | --metrics-addr 0.0.0.0:9323 335 | ``` 336 | 337 | Apply the config changes with `systemctl daemon-reload && systemctl restart docker` and 338 | check if the docker_gwbridge ip address is 172.18.0.1: 339 | 340 | ```bash 341 | ip -o addr show docker_gwbridge 342 | ``` 343 | 344 | Replace 172.18.0.1 with your docker_gwbridge address in the compose file: 345 | 346 | ```yaml 347 | dockerd-exporter: 348 | image: stefanprodan/caddy 349 | environment: 350 | - DOCKER_GWBRIDGE_IP=172.18.0.1 351 | ``` 352 | 353 | Collecting Docker Swarm metrics with Prometheus is not a smooth process, and 354 | because of `group_left` queries tend to become more complex. 
355 | In the future I hope Swarm DNS will contain the SRV record for hostname and Docker engine 356 | metrics will expose container metrics replacing cAdvisor all together. 357 | 358 | ## Configure Prometheus 359 | 360 | I've set the Prometheus retention period to 24h, you can change these values in the 361 | compose file or using the env variable `PROMETHEUS_RETENTION`. 362 | 363 | ```yaml 364 | prometheus: 365 | image: stefanprodan/swarmprom-prometheus 366 | command: 367 | - '-storage.tsdb.retention=24h' 368 | deploy: 369 | resources: 370 | limits: 371 | memory: 2048M 372 | reservations: 373 | memory: 1024M 374 | ``` 375 | 376 | When using host volumes you should ensure that Prometheus doesn't get scheduled on different nodes. You can 377 | pin the Prometheus service on a specific host with placement constraints. 378 | 379 | ```yaml 380 | prometheus: 381 | image: stefanprodan/swarmprom-prometheus 382 | volumes: 383 | - prometheus:/prometheus 384 | deploy: 385 | mode: replicated 386 | replicas: 1 387 | placement: 388 | constraints: 389 | - node.labels.monitoring.role == prometheus 390 | ``` 391 | 392 | ## Configure alerting 393 | 394 | The Prometheus swarmprom comes with the following alert rules: 395 | 396 | ***Swarm Node CPU Usage*** 397 | 398 | Alerts when a node CPU usage goes over 80% for five minutes. 399 | 400 | ``` 401 | ALERT node_cpu_usage 402 | IF 100 - (avg(irate(node_cpu{mode="idle"}[1m]) * on(instance) group_left(node_name) node_meta * 100) by (node_name)) > 80 403 | FOR 5m 404 | LABELS { severity="warning" } 405 | ANNOTATIONS { 406 | summary = "CPU alert for Swarm node '{{ $labels.node_name }}'", 407 | description = "Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%.", 408 | } 409 | ``` 410 | ***Swarm Node Memory Alert*** 411 | 412 | Alerts when a node memory usage goes over 80% for five minutes. 
413 | 414 | ``` 415 | ALERT node_memory_usage 416 | IF sum(((node_memory_MemTotal - node_memory_MemAvailable) / node_memory_MemTotal) * on(instance) group_left(node_name) node_meta * 100) by (node_name) > 80 417 | FOR 5m 418 | LABELS { severity="warning" } 419 | ANNOTATIONS { 420 | summary = "Memory alert for Swarm node '{{ $labels.node_name }}'", 421 | description = "Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%.", 422 | } 423 | ``` 424 | ***Swarm Node Disk Alert*** 425 | 426 | Alerts when a node storage usage goes over 85% for five minutes. 427 | 428 | ``` 429 | ALERT node_disk_usage 430 | IF ((node_filesystem_size{mountpoint="/rootfs"} - node_filesystem_free{mountpoint="/rootfs"}) * 100 / node_filesystem_size{mountpoint="/rootfs"}) * on(instance) group_left(node_name) node_meta > 85 431 | FOR 5m 432 | LABELS { severity="warning" } 433 | ANNOTATIONS { 434 | summary = "Disk alert for Swarm node '{{ $labels.node_name }}'", 435 | description = "Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%.", 436 | } 437 | ``` 438 | 439 | ***Swarm Node Disk Fill Rate Alert*** 440 | 441 | Alerts when a node storage is going to run out of free space in six hours. 442 | 443 | ``` 444 | ALERT node_disk_fill_rate_6h 445 | IF predict_linear(node_filesystem_free{mountpoint="/rootfs"}[1h], 6*3600) * on(instance) group_left(node_name) node_meta < 0 446 | FOR 1h 447 | LABELS { severity="critical" } 448 | ANNOTATIONS { 449 | summary = "Disk fill alert for Swarm node '{{ $labels.node_name }}'", 450 | description = "Swarm node {{ $labels.node_name }} disk is going to fill up in 6h.", 451 | } 452 | ``` 453 | 454 | You can add alerts to 455 | [swarm_node](https://github.com/stefanprodan/swarmprom/blob/master/prometheus/rules/swarm_node.rules.yml) 456 | and [swarm_task](https://github.com/stefanprodan/swarmprom/blob/master/prometheus/rules/swarm_task.rules.yml) 457 | files and rerun stack deploy to update them. 
Because these files are mounted inside the Prometheus 458 | container at run time as [Docker configs](https://docs.docker.com/engine/swarm/configs/) 459 | you don't have to bundle them with the image. 460 | 461 | The Alertmanager swarmprom image is configured with the Slack receiver. 462 | In order to receive alerts on Slack you have to provide the Slack API url, 463 | username and channel via environment variables: 464 | 465 | ```yaml 466 | alertmanager: 467 | image: stefanprodan/swarmprom-alertmanager 468 | environment: 469 | - SLACK_URL=${SLACK_URL} 470 | - SLACK_CHANNEL=${SLACK_CHANNEL} 471 | - SLACK_USER=${SLACK_USER} 472 | ``` 473 | 474 | You can install the `stress` package with apt and test out the CPU alert, you should receive something like this: 475 | 476 | ![Alerts](https://raw.githubusercontent.com/stefanprodan/swarmprom/master/grafana/screens/alertmanager-slack-v2.png) 477 | 478 | Cloudflare has made a great dashboard for managing alerts. 479 | Unsee can aggregate alerts from multiple Alertmanager instances, running either in HA mode or separate. 480 | You can access unsee at `http://<swarm-ip>:9094` using the admin user/password set via compose up: 481 | 482 | ![Unsee](https://raw.githubusercontent.com/stefanprodan/swarmprom/master/grafana/screens/unsee.png) 483 | 484 | ## Monitoring applications and backend services 485 | 486 | You can extend swarmprom with special-purpose exporters for services like MongoDB, PostgreSQL, Kafka, 487 | Redis and also instrument your own applications using the Prometheus client libraries. 488 | 489 | In order to scrape other services you need to attach those to the `mon_net` network so Prometheus 490 | can reach them. Or you can attach the `mon_prometheus` service to the networks where your services are running. 
491 | 492 | Once your services are reachable by Prometheus you can add the dns name and port of those services to the 493 | Prometheus config using the `JOBS` environment variable: 494 | 495 | ```yaml 496 | prometheus: 497 | image: stefanprodan/swarmprom-prometheus 498 | environment: 499 | - JOBS=mongo-exporter:9216 kafka-exporter:9216 redis-exporter:9216 500 | ``` 501 | 502 | ## Monitoring production systems 503 | 504 | The swarmprom project is meant as a starting point in developing your own monitoring solution. Before running this 505 | in production you should consider building and publishing your own Prometheus, node exporter and alert manager 506 | images. Docker Swarm doesn't play well with locally built images, the first step would be to setup a secure Docker 507 | registry that your Swarm has access to and push the images there. Your CI system should assign version tags to each 508 | image. Don't rely on the latest tag for continuous deployments, Prometheus will soon reach v2 and the data store 509 | will not be backwards compatible with v1.x. 510 | 511 | Another thing you should consider is having redundancy for Prometheus and alert manager. 512 | You could run them as a service with two replicas pinned on different nodes, or even better, 513 | use a service like Weave Cloud Cortex to ship your metrics outside of your current setup. 514 | You can use Weave Cloud not only as a backup of your 515 | metrics database but you can also define alerts and use it as a data source for your Grafana dashboards. 516 | Having the alerting and monitoring system hosted on a different platform other than your production 517 | is good practice that will allow you to react quickly and efficiently when a major disaster strikes. 
518 | 519 | Swarmprom comes with built-in [Weave Cloud](https://www.weave.works/product/cloud/) integration, 520 | what you need to do is run the weave-compose stack with your Weave service token: 521 | 522 | ```bash 523 | TOKEN=<WEAVE-TOKEN> \ 524 | ADMIN_USER=admin \ 525 | ADMIN_PASSWORD=admin \ 526 | docker stack deploy -c weave-compose.yml mon 527 | ``` 528 | 529 | This will deploy Weave Scope and Prometheus with Weave Cortex as remote write. 530 | The local retention is set to 24h so even if your internet connection drops you'll not lose data 531 | as Prometheus will retry pushing data to Weave Cloud when the connection is up again. 532 | 533 | You can define alerts and notifications routes in Weave Cloud in the same way you would do with alert manager. 534 | 535 | To use Grafana with Weave Cloud you have to reconfigure the Prometheus data source like this: 536 | 537 | * Name: Prometheus 538 | * Type: Prometheus 539 | * Url: https://cloud.weave.works/api/prom 540 | * Access: proxy 541 | * Basic auth: use your service token as password, the user value is ignored 542 | 543 | Weave Scope automatically generates a map of your application, enabling you to intuitively understand, 544 | monitor, and control your microservices-based application. 545 | You can view metrics, tags and metadata of the running processes, containers and hosts. 546 | Scope offers remote access to the Swarm’s nodes and containers, making it easy to diagnose issues in real-time. 
547 | 548 | ![Scope](https://raw.githubusercontent.com/stefanprodan/swarmprom/master/grafana/screens/weave-scope.png) 549 | 550 | ![Scope Hosts](https://raw.githubusercontent.com/stefanprodan/swarmprom/master/grafana/screens/weave-scope-hosts-v2.png) 551 | -------------------------------------------------------------------------------- /alertmanager/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prom/alertmanager:v0.15.3 2 | 3 | COPY conf /etc/alertmanager/ 4 | 5 | ENTRYPOINT [ "/etc/alertmanager/docker-entrypoint.sh" ] 6 | CMD [ "--config.file=/etc/alertmanager/alertmanager.yml", \ 7 | "--storage.path=/alertmanager" ] 8 | -------------------------------------------------------------------------------- /alertmanager/conf/alertmanager.yml: -------------------------------------------------------------------------------- 1 | route: 2 | receiver: 'slack' 3 | 4 | receivers: 5 | - name: 'slack' 6 | slack_configs: 7 | - send_resolved: true 8 | text: "{{ .CommonAnnotations.description }}" 9 | #username: # 10 | #channel: # 11 | #api_url: # 12 | -------------------------------------------------------------------------------- /alertmanager/conf/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -e 2 | 3 | cat /etc/alertmanager/alertmanager.yml |\ 4 | sed "s@#api_url: #@api_url: '$SLACK_URL'@g" |\ 5 | sed "s@#channel: #@channel: '#$SLACK_CHANNEL'@g" |\ 6 | sed "s@#username: #@username: '$SLACK_USER'@g" > /tmp/alertmanager.yml 7 | 8 | mv /tmp/alertmanager.yml /etc/alertmanager/alertmanager.yml 9 | 10 | set -- /bin/alertmanager "$@" 11 | 12 | exec "$@" 13 | -------------------------------------------------------------------------------- /caddy/Caddyfile: -------------------------------------------------------------------------------- 1 | :9090 { 2 | basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD} 3 | proxy / prometheus:9090 { 4 | transparent 5 | } 6 | 7 | errors 
stderr 8 | tls off 9 | } 10 | 11 | :9093 { 12 | basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD} 13 | proxy / alertmanager:9093 { 14 | transparent 15 | } 16 | 17 | errors stderr 18 | tls off 19 | } 20 | 21 | :9094 { 22 | basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD} 23 | proxy / unsee:8080 { 24 | transparent 25 | } 26 | 27 | errors stderr 28 | tls off 29 | } 30 | 31 | :3000 { 32 | proxy / grafana:3000 { 33 | transparent 34 | websocket 35 | } 36 | 37 | errors stderr 38 | tls off 39 | } 40 | 41 | -------------------------------------------------------------------------------- /docker-compose.traefik.yml: -------------------------------------------------------------------------------- 1 | version: "3.3" 2 | 3 | networks: 4 | net: 5 | driver: overlay 6 | attachable: true 7 | traefik-public: 8 | external: true 9 | 10 | volumes: 11 | prometheus: {} 12 | grafana: {} 13 | alertmanager: {} 14 | 15 | configs: 16 | dockerd_config: 17 | file: ./dockerd-exporter/Caddyfile # mounted into dockerd-exporter at /etc/caddy/Caddyfile 18 | node_rules: 19 | file: ./prometheus/rules/swarm_node.rules.yml 20 | task_rules: 21 | file: ./prometheus/rules/swarm_task.rules.yml 22 | 23 | services: 24 | dockerd-exporter: 25 | image: stefanprodan/caddy 26 | networks: 27 | - net 28 | environment: 29 | - DOCKER_GWBRIDGE_IP=172.18.0.1 # upstream host that dockerd-exporter/Caddyfile proxies :9323 to 30 | configs: 31 | - source: dockerd_config 32 | target: /etc/caddy/Caddyfile 33 | deploy: 34 | mode: global 35 | resources: 36 | limits: 37 | memory: 128M 38 | reservations: 39 | memory: 64M 40 | 41 | cadvisor: 42 | image: google/cadvisor 43 | networks: 44 | - net 45 | command: -logtostderr -docker_only 46 | volumes: 47 | - /var/run/docker.sock:/var/run/docker.sock:ro 48 | - /:/rootfs:ro 49 | - /var/run:/var/run 50 | - /sys:/sys:ro 51 | - /var/lib/docker/:/var/lib/docker:ro 52 | deploy: 53 | mode: global 54 | resources: 55 | limits: 56 | memory: 128M 57 | reservations: 58 | memory: 64M 59 | 60 | grafana: 61 | image: stefanprodan/swarmprom-grafana:5.3.4 62 | networks: 63 | - default 64 | - net 65 | - traefik-public 66 | 
environment: 67 | - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin} # "admin" is used when ADMIN_USER is unset or empty 68 | - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} # "admin" is used when ADMIN_PASSWORD is unset or empty 69 | - GF_USERS_ALLOW_SIGN_UP=false 70 | #- GF_SERVER_ROOT_URL=${GF_SERVER_ROOT_URL:-localhost} 71 | #- GF_SMTP_ENABLED=${GF_SMTP_ENABLED:-false} 72 | #- GF_SMTP_FROM_ADDRESS=${GF_SMTP_FROM_ADDRESS:-grafana@test.com} 73 | #- GF_SMTP_FROM_NAME=${GF_SMTP_FROM_NAME:-Grafana} 74 | #- GF_SMTP_HOST=${GF_SMTP_HOST:-smtp:25} 75 | #- GF_SMTP_USER=${GF_SMTP_USER} 76 | #- GF_SMTP_PASSWORD=${GF_SMTP_PASSWORD} 77 | volumes: 78 | - grafana:/var/lib/grafana 79 | deploy: 80 | mode: replicated 81 | replicas: 1 82 | placement: 83 | constraints: 84 | - node.role == manager 85 | resources: 86 | limits: 87 | memory: 128M 88 | reservations: 89 | memory: 64M 90 | labels: 91 | - traefik.enable=true 92 | - traefik.docker.network=traefik-public 93 | - traefik.constraint-label=traefik-public 94 | - traefik.http.routers.swarmprom-grafana-http.rule=Host(`grafana.${DOMAIN?Variable DOMAIN not set}`) 95 | - traefik.http.routers.swarmprom-grafana-http.entrypoints=http 96 | - traefik.http.routers.swarmprom-grafana-http.middlewares=https-redirect # https-redirect is not declared in this file; NOTE(review): presumably supplied by the external Traefik stack 97 | - traefik.http.routers.swarmprom-grafana-https.rule=Host(`grafana.${DOMAIN?Variable DOMAIN not set}`) 98 | - traefik.http.routers.swarmprom-grafana-https.entrypoints=https 99 | - traefik.http.routers.swarmprom-grafana-https.tls=true 100 | - traefik.http.routers.swarmprom-grafana-https.tls.certresolver=le 101 | - traefik.http.services.swarmprom-grafana.loadbalancer.server.port=3000 102 | 103 | alertmanager: 104 | image: stefanprodan/swarmprom-alertmanager:v0.14.0 105 | networks: 106 | - default 107 | - net 108 | - traefik-public 109 | environment: 110 | - SLACK_URL=${SLACK_URL:-https://hooks.slack.com/services/TOKEN} # SLACK_* values are written into alertmanager.yml by alertmanager/conf/docker-entrypoint.sh 111 | - SLACK_CHANNEL=${SLACK_CHANNEL:-general} # no leading '#': the entrypoint script prepends it 112 | - SLACK_USER=${SLACK_USER:-alertmanager} 113 | command: 114 | - '--config.file=/etc/alertmanager/alertmanager.yml' 115 | - '--storage.path=/alertmanager' 116 | 
volumes: 117 | - alertmanager:/alertmanager 118 | deploy: 119 | mode: replicated 120 | replicas: 1 121 | placement: 122 | constraints: 123 | - node.role == manager 124 | resources: 125 | limits: 126 | memory: 128M 127 | reservations: 128 | memory: 64M 129 | labels: 130 | - traefik.enable=true 131 | - traefik.docker.network=traefik-public 132 | - traefik.constraint-label=traefik-public 133 | - traefik.http.routers.swarmprom-alertmanager-http.rule=Host(`alertmanager.${DOMAIN?Variable DOMAIN not set}`) 134 | - traefik.http.routers.swarmprom-alertmanager-http.entrypoints=http 135 | - traefik.http.routers.swarmprom-alertmanager-http.middlewares=https-redirect 136 | - traefik.http.routers.swarmprom-alertmanager-https.rule=Host(`alertmanager.${DOMAIN?Variable DOMAIN not set}`) 137 | - traefik.http.routers.swarmprom-alertmanager-https.entrypoints=https 138 | - traefik.http.routers.swarmprom-alertmanager-https.tls=true 139 | - traefik.http.routers.swarmprom-alertmanager-https.tls.certresolver=le 140 | - traefik.http.services.swarmprom-alertmanager.loadbalancer.server.port=9093 141 | - traefik.http.middlewares.swarmprom-alertmanager-auth.basicauth.users=${ADMIN_USER?Variable ADMIN_USER not set}:${HASHED_PASSWORD?Variable HASHED_PASSWORD not set} # interpolation fails with the quoted message if either variable is unset 142 | - traefik.http.routers.swarmprom-alertmanager-https.middlewares=swarmprom-alertmanager-auth # basic auth is attached to the HTTPS router only 143 | 144 | unsee: 145 | image: cloudflare/unsee:v0.8.0 146 | networks: 147 | - default 148 | - net 149 | - traefik-public 150 | environment: 151 | - "ALERTMANAGER_URIS=default:http://alertmanager:9093" 152 | deploy: 153 | mode: replicated 154 | replicas: 1 155 | labels: 156 | - traefik.enable=true 157 | - traefik.docker.network=traefik-public 158 | - traefik.constraint-label=traefik-public 159 | - traefik.http.routers.swarmprom-unsee-http.rule=Host(`unsee.${DOMAIN?Variable DOMAIN not set}`) 160 | - traefik.http.routers.swarmprom-unsee-http.entrypoints=http 161 | - traefik.http.routers.swarmprom-unsee-http.middlewares=https-redirect 162 
| - traefik.http.routers.swarmprom-unsee-https.rule=Host(`unsee.${DOMAIN?Variable DOMAIN not set}`) 163 | - traefik.http.routers.swarmprom-unsee-https.entrypoints=https 164 | - traefik.http.routers.swarmprom-unsee-https.tls=true 165 | - traefik.http.routers.swarmprom-unsee-https.tls.certresolver=le 166 | - traefik.http.services.swarmprom-unsee.loadbalancer.server.port=8080 167 | - traefik.http.middlewares.swarmprom-unsee-auth.basicauth.users=${ADMIN_USER?Variable ADMIN_USER not set}:${HASHED_PASSWORD?Variable HASHED_PASSWORD not set} 168 | - traefik.http.routers.swarmprom-unsee-https.middlewares=swarmprom-unsee-auth 169 | 170 | node-exporter: 171 | image: stefanprodan/swarmprom-node-exporter:v0.16.0 172 | networks: 173 | - net 174 | environment: 175 | - NODE_ID={{.Node.ID}} 176 | volumes: 177 | - /proc:/host/proc:ro 178 | - /sys:/host/sys:ro 179 | - /:/rootfs:ro 180 | - /etc/hostname:/etc/nodename 181 | command: 182 | - '--path.sysfs=/host/sys' 183 | - '--path.procfs=/host/proc' 184 | - '--collector.textfile.directory=/etc/node-exporter/' 185 | - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)' # "$$" is Compose escaping for a literal "$" in the regex 186 | - '--no-collector.ipvs' 187 | deploy: 188 | mode: global 189 | resources: 190 | limits: 191 | memory: 128M 192 | reservations: 193 | memory: 64M 194 | 195 | prometheus: 196 | image: stefanprodan/swarmprom-prometheus:v2.5.0 197 | networks: 198 | - default 199 | - net 200 | - traefik-public 201 | command: 202 | - '--config.file=/etc/prometheus/prometheus.yml' 203 | - '--storage.tsdb.path=/prometheus' 204 | - '--storage.tsdb.retention=${PROMETHEUS_RETENTION:-24h}' # 24h unless PROMETHEUS_RETENTION is set to a non-empty value 205 | volumes: 206 | - prometheus:/prometheus 207 | configs: 208 | - source: node_rules 209 | target: /etc/prometheus/swarm_node.rules.yml 210 | - source: task_rules 211 | target: /etc/prometheus/swarm_task.rules.yml 212 | deploy: 213 | mode: replicated 214 | replicas: 1 215 | placement: 216 | constraints: 217 | - node.role == manager 218 | resources: 219 | limits: 220 | memory: 
2048M 221 | reservations: 222 | memory: 128M 223 | labels: 224 | - traefik.enable=true 225 | - traefik.docker.network=traefik-public 226 | - traefik.constraint-label=traefik-public 227 | - traefik.http.routers.swarmprom-prometheus-http.rule=Host(`prometheus.${DOMAIN?Variable DOMAIN not set}`) 228 | - traefik.http.routers.swarmprom-prometheus-http.entrypoints=http 229 | - traefik.http.routers.swarmprom-prometheus-http.middlewares=https-redirect 230 | - traefik.http.routers.swarmprom-prometheus-https.rule=Host(`prometheus.${DOMAIN?Variable DOMAIN not set}`) 231 | - traefik.http.routers.swarmprom-prometheus-https.entrypoints=https 232 | - traefik.http.routers.swarmprom-prometheus-https.tls=true 233 | - traefik.http.routers.swarmprom-prometheus-https.tls.certresolver=le 234 | - traefik.http.services.swarmprom-prometheus.loadbalancer.server.port=9090 235 | - traefik.http.middlewares.swarmprom-prometheus-auth.basicauth.users=${ADMIN_USER?Variable ADMIN_USER not set}:${HASHED_PASSWORD?Variable HASHED_PASSWORD not set} 236 | - traefik.http.routers.swarmprom-prometheus-https.middlewares=swarmprom-prometheus-auth 237 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.3" 2 | 3 | networks: 4 | net: 5 | driver: overlay 6 | attachable: true 7 | 8 | volumes: 9 | prometheus: {} 10 | grafana: {} 11 | alertmanager: {} 12 | 13 | configs: 14 | caddy_config: 15 | file: ./caddy/Caddyfile # mounted into the caddy service at /etc/caddy/Caddyfile 16 | dockerd_config: 17 | file: ./dockerd-exporter/Caddyfile 18 | node_rules: 19 | file: ./prometheus/rules/swarm_node.rules.yml 20 | task_rules: 21 | file: ./prometheus/rules/swarm_task.rules.yml 22 | 23 | services: 24 | dockerd-exporter: 25 | image: stefanprodan/caddy 26 | networks: 27 | - net 28 | environment: 29 | - DOCKER_GWBRIDGE_IP=172.18.0.1 30 | configs: 31 | - source: dockerd_config 32 | target: /etc/caddy/Caddyfile 33 | deploy: 34 | mode: global 
35 | resources: 36 | limits: 37 | memory: 128M 38 | reservations: 39 | memory: 64M 40 | 41 | cadvisor: 42 | image: google/cadvisor 43 | networks: 44 | - net 45 | command: -logtostderr -docker_only 46 | volumes: 47 | - /var/run/docker.sock:/var/run/docker.sock:ro 48 | - /:/rootfs:ro 49 | - /var/run:/var/run 50 | - /sys:/sys:ro 51 | - /var/lib/docker/:/var/lib/docker:ro 52 | deploy: 53 | mode: global 54 | resources: 55 | limits: 56 | memory: 128M 57 | reservations: 58 | memory: 64M 59 | 60 | grafana: 61 | image: stefanprodan/swarmprom-grafana:5.3.4 62 | networks: 63 | - net 64 | environment: 65 | - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin} # "admin" is used when ADMIN_USER is unset or empty 66 | - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} # "admin" is used when ADMIN_PASSWORD is unset or empty 67 | - GF_USERS_ALLOW_SIGN_UP=false 68 | #- GF_SERVER_ROOT_URL=${GF_SERVER_ROOT_URL:-localhost} 69 | #- GF_SMTP_ENABLED=${GF_SMTP_ENABLED:-false} 70 | #- GF_SMTP_FROM_ADDRESS=${GF_SMTP_FROM_ADDRESS:-grafana@test.com} 71 | #- GF_SMTP_FROM_NAME=${GF_SMTP_FROM_NAME:-Grafana} 72 | #- GF_SMTP_HOST=${GF_SMTP_HOST:-smtp:25} 73 | #- GF_SMTP_USER=${GF_SMTP_USER} 74 | #- GF_SMTP_PASSWORD=${GF_SMTP_PASSWORD} 75 | volumes: 76 | - grafana:/var/lib/grafana 77 | deploy: 78 | mode: replicated 79 | replicas: 1 80 | placement: 81 | constraints: 82 | - node.role == manager 83 | resources: 84 | limits: 85 | memory: 128M 86 | reservations: 87 | memory: 64M 88 | 89 | alertmanager: 90 | image: stefanprodan/swarmprom-alertmanager:v0.14.0 91 | networks: 92 | - net 93 | environment: 94 | - SLACK_URL=${SLACK_URL:-https://hooks.slack.com/services/TOKEN} # SLACK_* values are written into alertmanager.yml by alertmanager/conf/docker-entrypoint.sh 95 | - SLACK_CHANNEL=${SLACK_CHANNEL:-general} # no leading '#': the entrypoint script prepends it 96 | - SLACK_USER=${SLACK_USER:-alertmanager} 97 | command: 98 | - '--config.file=/etc/alertmanager/alertmanager.yml' 99 | - '--storage.path=/alertmanager' 100 | volumes: 101 | - alertmanager:/alertmanager 102 | deploy: 103 | mode: replicated 104 | replicas: 1 105 | placement: 106 | constraints: 107 | - node.role == manager 108 | resources: 109 | limits: 110 | memory: 128M 111 | reservations: 112 | 
memory: 64M 113 | 114 | unsee: 115 | image: cloudflare/unsee:v0.8.0 116 | networks: 117 | - net 118 | environment: 119 | - "ALERTMANAGER_URIS=default:http://alertmanager:9093" 120 | deploy: 121 | mode: replicated 122 | replicas: 1 123 | 124 | node-exporter: 125 | image: stefanprodan/swarmprom-node-exporter:v0.16.0 126 | networks: 127 | - net 128 | environment: 129 | - NODE_ID={{.Node.ID}} 130 | volumes: 131 | - /proc:/host/proc:ro 132 | - /sys:/host/sys:ro 133 | - /:/rootfs:ro 134 | - /etc/hostname:/etc/nodename 135 | command: 136 | - '--path.sysfs=/host/sys' 137 | - '--path.procfs=/host/proc' 138 | - '--collector.textfile.directory=/etc/node-exporter/' 139 | - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)' # "$$" is Compose escaping for a literal "$" in the regex 140 | - '--no-collector.ipvs' 141 | deploy: 142 | mode: global 143 | resources: 144 | limits: 145 | memory: 128M 146 | reservations: 147 | memory: 64M 148 | 149 | prometheus: 150 | image: stefanprodan/swarmprom-prometheus:v2.5.0 151 | networks: 152 | - net 153 | command: 154 | - '--config.file=/etc/prometheus/prometheus.yml' 155 | - '--storage.tsdb.path=/prometheus' 156 | - '--storage.tsdb.retention=${PROMETHEUS_RETENTION:-24h}' # 24h unless PROMETHEUS_RETENTION is set to a non-empty value 157 | volumes: 158 | - prometheus:/prometheus 159 | configs: 160 | - source: node_rules 161 | target: /etc/prometheus/swarm_node.rules.yml 162 | - source: task_rules 163 | target: /etc/prometheus/swarm_task.rules.yml 164 | deploy: 165 | mode: replicated 166 | replicas: 1 167 | placement: 168 | constraints: 169 | - node.role == manager 170 | resources: 171 | limits: 172 | memory: 2048M 173 | reservations: 174 | memory: 128M 175 | 176 | caddy: 177 | image: stefanprodan/caddy 178 | ports: 179 | - "3000:3000" # proxied to grafana:3000 by caddy/Caddyfile 180 | - "9090:9090" # proxied to prometheus:9090 by caddy/Caddyfile 181 | - "9093:9093" # proxied to alertmanager:9093 by caddy/Caddyfile 182 | - "9094:9094" # proxied to unsee:8080 by caddy/Caddyfile 183 | networks: 184 | - net 185 | environment: 186 | - ADMIN_USER=${ADMIN_USER:-admin} # read by the basicauth directives in caddy/Caddyfile 187 | - ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} # read by the basicauth directives in caddy/Caddyfile 188 | configs: 189 | - source: caddy_config 190 | target: /etc/caddy/Caddyfile 191 | deploy: 192 | mode: 
replicated 193 | replicas: 1 194 | placement: 195 | constraints: 196 | - node.role == manager 197 | resources: 198 | limits: 199 | memory: 128M 200 | reservations: 201 | memory: 64M 202 | healthcheck: 203 | test: ["CMD", "curl", "-f", "http://localhost:3000"] # :3000 is the listener that caddy/Caddyfile proxies to grafana:3000 204 | interval: 5s 205 | timeout: 1s 206 | retries: 5 207 | -------------------------------------------------------------------------------- /dockerd-exporter/Caddyfile: -------------------------------------------------------------------------------- 1 | :9323 { 2 | proxy / {$DOCKER_GWBRIDGE_IP}:9323 { 3 | transparent 4 | } 5 | 6 | errors stderr 7 | tls off 8 | } 9 | -------------------------------------------------------------------------------- /grafana/.dockerignore: -------------------------------------------------------------------------------- 1 | screens/ 2 | -------------------------------------------------------------------------------- /grafana/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM grafana/grafana:5.3.4 2 | # https://hub.docker.com/r/grafana/grafana/tags/ 3 | 4 | COPY datasources /etc/grafana/provisioning/datasources/ 5 | COPY swarmprom_dashboards.yml /etc/grafana/provisioning/dashboards/ 6 | COPY dashboards /etc/grafana/dashboards/ 7 | 8 | ENV GF_SECURITY_ADMIN_PASSWORD=admin \ 9 | GF_SECURITY_ADMIN_USER=admin \ 10 | GF_PATHS_PROVISIONING=/etc/grafana/provisioning/ 11 | -------------------------------------------------------------------------------- /grafana/dashboards/swarmprom-nodes-dash.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "description": "Docker Swarm nodes metrics", 16 | "editable": true, 17 | "gnetId": null, 18 | 
"graphTooltip": 0, 19 | "iteration": 1547535746076, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "cacheTimeout": null, 24 | "colorBackground": false, 25 | "colorValue": false, 26 | "colors": [ 27 | "rgba(245, 54, 54, 0.9)", 28 | "rgba(237, 129, 40, 0.89)", 29 | "rgba(50, 172, 45, 0.97)" 30 | ], 31 | "datasource": "Prometheus", 32 | "decimals": 1, 33 | "format": "s", 34 | "gauge": { 35 | "maxValue": 100, 36 | "minValue": 0, 37 | "show": false, 38 | "thresholdLabels": false, 39 | "thresholdMarkers": true 40 | }, 41 | "gridPos": { 42 | "h": 4, 43 | "w": 6, 44 | "x": 0, 45 | "y": 0 46 | }, 47 | "hideTimeOverride": true, 48 | "id": 2, 49 | "interval": null, 50 | "links": [], 51 | "mappingType": 1, 52 | "mappingTypes": [ 53 | { 54 | "name": "value to text", 55 | "value": 1 56 | }, 57 | { 58 | "name": "range to text", 59 | "value": 2 60 | } 61 | ], 62 | "maxDataPoints": 100, 63 | "nullPointMode": "connected", 64 | "nullText": null, 65 | "postfix": "", 66 | "postfixFontSize": "50%", 67 | "prefix": "", 68 | "prefixFontSize": "50%", 69 | "rangeMaps": [ 70 | { 71 | "from": "null", 72 | "text": "N/A", 73 | "to": "null" 74 | } 75 | ], 76 | "sparkline": { 77 | "fillColor": "rgba(31, 118, 189, 0.18)", 78 | "full": false, 79 | "lineColor": "rgb(31, 120, 193)", 80 | "show": false 81 | }, 82 | "tableColumn": "", 83 | "targets": [ 84 | { 85 | "expr": "topk(1, sum((node_time_seconds - node_boot_time_seconds) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name))", 86 | "format": "time_series", 87 | "intervalFactor": 2, 88 | "legendFormat": "", 89 | "refId": "A", 90 | "step": 2 91 | } 92 | ], 93 | "thresholds": "", 94 | "timeFrom": "1m", 95 | "timeShift": null, 96 | "title": "Uptime", 97 | "type": "singlestat", 98 | "valueFontSize": "80%", 99 | "valueMaps": [ 100 | { 101 | "op": "=", 102 | "text": "N/A", 103 | "value": "null" 104 | } 105 | ], 106 | "valueName": "avg" 107 | }, 108 | { 109 | "cacheTimeout": null, 110 | "colorBackground": false, 111 | 
"colorValue": false, 112 | "colors": [ 113 | "rgba(245, 54, 54, 0.9)", 114 | "rgba(237, 129, 40, 0.89)", 115 | "rgba(50, 172, 45, 0.97)" 116 | ], 117 | "datasource": null, 118 | "decimals": 0, 119 | "format": "none", 120 | "gauge": { 121 | "maxValue": 100, 122 | "minValue": 0, 123 | "show": false, 124 | "thresholdLabels": false, 125 | "thresholdMarkers": true 126 | }, 127 | "gridPos": { 128 | "h": 4, 129 | "w": 6, 130 | "x": 6, 131 | "y": 0 132 | }, 133 | "id": 1, 134 | "interval": null, 135 | "links": [], 136 | "mappingType": 1, 137 | "mappingTypes": [ 138 | { 139 | "name": "value to text", 140 | "value": 1 141 | }, 142 | { 143 | "name": "range to text", 144 | "value": 2 145 | } 146 | ], 147 | "maxDataPoints": 100, 148 | "nullPointMode": "connected", 149 | "nullText": null, 150 | "postfix": "", 151 | "postfixFontSize": "50%", 152 | "prefix": "", 153 | "prefixFontSize": "50%", 154 | "rangeMaps": [ 155 | { 156 | "from": "null", 157 | "text": "N/A", 158 | "to": "null" 159 | } 160 | ], 161 | "sparkline": { 162 | "fillColor": "rgba(31, 118, 189, 0.18)", 163 | "full": false, 164 | "lineColor": "rgb(31, 120, 193)", 165 | "show": false 166 | }, 167 | "tableColumn": "", 168 | "targets": [ 169 | { 170 | "expr": "count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 171 | "format": "time_series", 172 | "intervalFactor": 2, 173 | "legendFormat": "", 174 | "refId": "A", 175 | "step": 20 176 | } 177 | ], 178 | "thresholds": "", 179 | "title": "Nodes", 180 | "type": "singlestat", 181 | "valueFontSize": "80%", 182 | "valueMaps": [ 183 | { 184 | "op": "=", 185 | "text": "N/A", 186 | "value": "null" 187 | } 188 | ], 189 | "valueName": "avg" 190 | }, 191 | { 192 | "cacheTimeout": null, 193 | "colorBackground": false, 194 | "colorValue": false, 195 | "colors": [ 196 | "rgba(245, 54, 54, 0.9)", 197 | "rgba(237, 129, 40, 0.89)", 198 | "rgba(50, 172, 45, 0.97)" 199 | ], 200 | "datasource": null, 201 | "decimals": 0, 202 | "format": "short", 203 | 
"gauge": { 204 | "maxValue": 100, 205 | "minValue": 0, 206 | "show": false, 207 | "thresholdLabels": false, 208 | "thresholdMarkers": true 209 | }, 210 | "gridPos": { 211 | "h": 4, 212 | "w": 6, 213 | "x": 12, 214 | "y": 0 215 | }, 216 | "hideTimeOverride": true, 217 | "id": 4, 218 | "interval": null, 219 | "links": [], 220 | "mappingType": 1, 221 | "mappingTypes": [ 222 | { 223 | "name": "value to text", 224 | "value": 1 225 | }, 226 | { 227 | "name": "range to text", 228 | "value": 2 229 | } 230 | ], 231 | "maxDataPoints": 100, 232 | "nullPointMode": "connected", 233 | "nullText": null, 234 | "postfix": "", 235 | "postfixFontSize": "50%", 236 | "prefix": "", 237 | "prefixFontSize": "50%", 238 | "rangeMaps": [ 239 | { 240 | "from": "null", 241 | "text": "N/A", 242 | "to": "null" 243 | } 244 | ], 245 | "sparkline": { 246 | "fillColor": "rgba(31, 118, 189, 0.18)", 247 | "full": false, 248 | "lineColor": "rgb(31, 120, 193)", 249 | "show": false 250 | }, 251 | "tableColumn": "", 252 | "targets": [ 253 | { 254 | "expr": "count(node_cpu_seconds_total{mode=\"idle\"} * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 255 | "format": "time_series", 256 | "intervalFactor": 2, 257 | "legendFormat": "", 258 | "refId": "A", 259 | "step": 2 260 | } 261 | ], 262 | "thresholds": "", 263 | "timeFrom": "1m", 264 | "timeShift": null, 265 | "title": "CPUs", 266 | "type": "singlestat", 267 | "valueFontSize": "80%", 268 | "valueMaps": [ 269 | { 270 | "op": "=", 271 | "text": "N/A", 272 | "value": "null" 273 | } 274 | ], 275 | "valueName": "avg" 276 | }, 277 | { 278 | "cacheTimeout": null, 279 | "colorBackground": false, 280 | "colorValue": false, 281 | "colors": [ 282 | "rgba(245, 54, 54, 0.9)", 283 | "rgba(237, 129, 40, 0.89)", 284 | "rgba(50, 172, 45, 0.97)" 285 | ], 286 | "datasource": null, 287 | "decimals": null, 288 | "format": "percent", 289 | "gauge": { 290 | "maxValue": 100, 291 | "minValue": 0, 292 | "show": true, 293 | "thresholdLabels": false, 294 | 
"thresholdMarkers": true 295 | }, 296 | "gridPos": { 297 | "h": 4, 298 | "w": 6, 299 | "x": 18, 300 | "y": 0 301 | }, 302 | "hideTimeOverride": true, 303 | "id": 11, 304 | "interval": null, 305 | "links": [], 306 | "mappingType": 1, 307 | "mappingTypes": [ 308 | { 309 | "name": "value to text", 310 | "value": 1 311 | }, 312 | { 313 | "name": "range to text", 314 | "value": 2 315 | } 316 | ], 317 | "maxDataPoints": 100, 318 | "nullPointMode": "connected", 319 | "nullText": null, 320 | "postfix": "", 321 | "postfixFontSize": "50%", 322 | "prefix": "", 323 | "prefixFontSize": "50%", 324 | "rangeMaps": [ 325 | { 326 | "from": "null", 327 | "text": "N/A", 328 | "to": "null" 329 | } 330 | ], 331 | "sparkline": { 332 | "fillColor": "rgba(31, 118, 189, 0.18)", 333 | "full": false, 334 | "lineColor": "rgb(31, 120, 193)", 335 | "show": false 336 | }, 337 | "tableColumn": "", 338 | "targets": [ 339 | { 340 | "expr": "sum(irate(node_cpu_seconds_total{mode=\"idle\"}[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) * 100 / count(node_cpu_seconds_total{mode=\"user\"} * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) ", 341 | "format": "time_series", 342 | "intervalFactor": 2, 343 | "legendFormat": "", 344 | "refId": "A", 345 | "step": 2 346 | } 347 | ], 348 | "thresholds": "10,25,100", 349 | "timeFrom": "1m", 350 | "timeShift": null, 351 | "title": "CPU Idle", 352 | "type": "singlestat", 353 | "valueFontSize": "80%", 354 | "valueMaps": [ 355 | { 356 | "op": "=", 357 | "text": "N/A", 358 | "value": "null" 359 | } 360 | ], 361 | "valueName": "avg" 362 | }, 363 | { 364 | "aliasColors": {}, 365 | "bars": false, 366 | "dashLength": 10, 367 | "dashes": false, 368 | "datasource": null, 369 | "decimals": 2, 370 | "fill": 1, 371 | "gridPos": { 372 | "h": 7, 373 | "w": 12, 374 | "x": 0, 375 | "y": 4 376 | }, 377 | "id": 13, 378 | "legend": { 379 | "alignAsTable": true, 380 | "avg": true, 381 | "current": true, 382 | "hideEmpty": false, 
383 | "hideZero": false, 384 | "max": true, 385 | "min": true, 386 | "rightSide": true, 387 | "show": false, 388 | "total": false, 389 | "values": true 390 | }, 391 | "lines": true, 392 | "linewidth": 1, 393 | "links": [], 394 | "nullPointMode": "null", 395 | "percentage": false, 396 | "pointradius": 5, 397 | "points": false, 398 | "renderer": "flot", 399 | "seriesOverrides": [], 400 | "spaceLength": 10, 401 | "stack": false, 402 | "steppedLine": false, 403 | "targets": [ 404 | { 405 | "expr": "node_load5 * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}", 406 | "format": "time_series", 407 | "intervalFactor": 2, 408 | "legendFormat": "load5 {{node_name}}", 409 | "refId": "A", 410 | "step": 2 411 | } 412 | ], 413 | "thresholds": [], 414 | "timeFrom": null, 415 | "timeShift": null, 416 | "title": "System Load by Node", 417 | "tooltip": { 418 | "shared": true, 419 | "sort": 2, 420 | "value_type": "individual" 421 | }, 422 | "type": "graph", 423 | "xaxis": { 424 | "buckets": null, 425 | "mode": "time", 426 | "name": null, 427 | "show": true, 428 | "values": [] 429 | }, 430 | "yaxes": [ 431 | { 432 | "format": "short", 433 | "label": null, 434 | "logBase": 1, 435 | "max": null, 436 | "min": null, 437 | "show": true 438 | }, 439 | { 440 | "format": "short", 441 | "label": null, 442 | "logBase": 1, 443 | "max": null, 444 | "min": null, 445 | "show": true 446 | } 447 | ], 448 | "yaxis": { 449 | "align": false, 450 | "alignLevel": null 451 | } 452 | }, 453 | { 454 | "aliasColors": {}, 455 | "bars": false, 456 | "dashLength": 10, 457 | "dashes": false, 458 | "datasource": null, 459 | "decimals": 2, 460 | "fill": 1, 461 | "gridPos": { 462 | "h": 7, 463 | "w": 12, 464 | "x": 12, 465 | "y": 4 466 | }, 467 | "id": 14, 468 | "legend": { 469 | "alignAsTable": true, 470 | "avg": true, 471 | "current": true, 472 | "hideEmpty": true, 473 | "hideZero": true, 474 | "max": true, 475 | "min": true, 476 | "rightSide": true, 477 | "show": false, 478 | "total": false, 
479 | "values": true 480 | }, 481 | "lines": true, 482 | "linewidth": 1, 483 | "links": [], 484 | "nullPointMode": "null as zero", 485 | "percentage": false, 486 | "pointradius": 5, 487 | "points": false, 488 | "renderer": "flot", 489 | "seriesOverrides": [], 490 | "spaceLength": 10, 491 | "stack": false, 492 | "steppedLine": false, 493 | "targets": [ 494 | { 495 | "expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) by (node_name))", 496 | "format": "time_series", 497 | "intervalFactor": 2, 498 | "legendFormat": "{{node_name}}", 499 | "refId": "A", 500 | "step": 2 501 | } 502 | ], 503 | "thresholds": [], 504 | "timeFrom": null, 505 | "timeShift": null, 506 | "title": "CPU Usage by Node", 507 | "tooltip": { 508 | "shared": true, 509 | "sort": 2, 510 | "value_type": "individual" 511 | }, 512 | "type": "graph", 513 | "xaxis": { 514 | "buckets": null, 515 | "mode": "time", 516 | "name": null, 517 | "show": true, 518 | "values": [] 519 | }, 520 | "yaxes": [ 521 | { 522 | "format": "percent", 523 | "label": null, 524 | "logBase": 1, 525 | "max": "100", 526 | "min": null, 527 | "show": true 528 | }, 529 | { 530 | "format": "short", 531 | "label": null, 532 | "logBase": 1, 533 | "max": null, 534 | "min": null, 535 | "show": true 536 | } 537 | ], 538 | "yaxis": { 539 | "align": false, 540 | "alignLevel": null 541 | } 542 | }, 543 | { 544 | "cacheTimeout": null, 545 | "colorBackground": false, 546 | "colorValue": false, 547 | "colors": [ 548 | "rgba(245, 54, 54, 0.9)", 549 | "rgba(237, 129, 40, 0.89)", 550 | "rgba(50, 172, 45, 0.97)" 551 | ], 552 | "datasource": null, 553 | "decimals": 1, 554 | "format": "decbytes", 555 | "gauge": { 556 | "maxValue": 100, 557 | "minValue": 0, 558 | "show": false, 559 | "thresholdLabels": false, 560 | "thresholdMarkers": true 561 | }, 562 | "gridPos": { 563 | "h": 4, 564 | "w": 3, 565 | "x": 0, 566 | "y": 11 567 | }, 568 | "hideTimeOverride": 
true, 569 | "id": 3, 570 | "interval": null, 571 | "links": [], 572 | "mappingType": 1, 573 | "mappingTypes": [ 574 | { 575 | "name": "value to text", 576 | "value": 1 577 | }, 578 | { 579 | "name": "range to text", 580 | "value": 2 581 | } 582 | ], 583 | "maxDataPoints": 100, 584 | "nullPointMode": "connected", 585 | "nullText": null, 586 | "postfix": "", 587 | "postfixFontSize": "50%", 588 | "prefix": "", 589 | "prefixFontSize": "50%", 590 | "rangeMaps": [ 591 | { 592 | "from": "null", 593 | "text": "N/A", 594 | "to": "null" 595 | } 596 | ], 597 | "sparkline": { 598 | "fillColor": "rgba(31, 118, 189, 0.18)", 599 | "full": false, 600 | "lineColor": "rgb(31, 120, 193)", 601 | "show": false 602 | }, 603 | "tableColumn": "", 604 | "targets": [ 605 | { 606 | "expr": "sum(node_memory_MemTotal_bytes * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 607 | "format": "time_series", 608 | "intervalFactor": 2, 609 | "legendFormat": "", 610 | "refId": "A", 611 | "step": 20 612 | } 613 | ], 614 | "thresholds": "", 615 | "timeFrom": null, 616 | "timeShift": null, 617 | "title": "Total Memory", 618 | "type": "singlestat", 619 | "valueFontSize": "80%", 620 | "valueMaps": [ 621 | { 622 | "op": "=", 623 | "text": "N/A", 624 | "value": "null" 625 | } 626 | ], 627 | "valueName": "avg" 628 | }, 629 | { 630 | "cacheTimeout": null, 631 | "colorBackground": false, 632 | "colorValue": false, 633 | "colors": [ 634 | "rgba(245, 54, 54, 0.9)", 635 | "rgba(237, 129, 40, 0.89)", 636 | "rgba(50, 172, 45, 0.97)" 637 | ], 638 | "datasource": null, 639 | "format": "percent", 640 | "gauge": { 641 | "maxValue": 100, 642 | "minValue": 0, 643 | "show": true, 644 | "thresholdLabels": false, 645 | "thresholdMarkers": true 646 | }, 647 | "gridPos": { 648 | "h": 4, 649 | "w": 4, 650 | "x": 3, 651 | "y": 11 652 | }, 653 | "id": 8, 654 | "interval": null, 655 | "links": [], 656 | "mappingType": 1, 657 | "mappingTypes": [ 658 | { 659 | "name": "value to text", 660 | "value": 1 661 | }, 
662 | { 663 | "name": "range to text", 664 | "value": 2 665 | } 666 | ], 667 | "maxDataPoints": 100, 668 | "nullPointMode": "connected", 669 | "nullText": null, 670 | "postfix": "", 671 | "postfixFontSize": "50%", 672 | "prefix": "", 673 | "prefixFontSize": "50%", 674 | "rangeMaps": [ 675 | { 676 | "from": "null", 677 | "text": "N/A", 678 | "to": "null" 679 | } 680 | ], 681 | "sparkline": { 682 | "fillColor": "rgba(31, 118, 189, 0.18)", 683 | "full": false, 684 | "lineColor": "rgb(31, 120, 193)", 685 | "show": false 686 | }, 687 | "tableColumn": "", 688 | "targets": [ 689 | { 690 | "expr": "sum((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 691 | "format": "time_series", 692 | "intervalFactor": 2, 693 | "legendFormat": "", 694 | "refId": "A", 695 | "step": 20 696 | } 697 | ], 698 | "thresholds": "10,25,100", 699 | "title": "Available Memory", 700 | "type": "singlestat", 701 | "valueFontSize": "80%", 702 | "valueMaps": [ 703 | { 704 | "op": "=", 705 | "text": "N/A", 706 | "value": "null" 707 | } 708 | ], 709 | "valueName": "avg" 710 | }, 711 | { 712 | "cacheTimeout": null, 713 | "colorBackground": false, 714 | "colorValue": false, 715 | "colors": [ 716 | "rgba(245, 54, 54, 0.9)", 717 | "rgba(237, 129, 40, 0.89)", 718 | "rgba(50, 172, 45, 0.97)" 719 | ], 720 | "datasource": null, 721 | "decimals": 1, 722 | "format": "decbytes", 723 | "gauge": { 724 | "maxValue": 100, 725 | "minValue": 0, 726 | "show": false, 727 | "thresholdLabels": false, 728 | "thresholdMarkers": true 729 | }, 730 | "gridPos": { 731 | "h": 4, 732 | "w": 3, 733 | "x": 7, 734 | "y": 11 735 | }, 736 | "hideTimeOverride": true, 737 | "id": 22, 738 | "interval": null, 739 | "links": [], 740 | "mappingType": 1, 741 | "mappingTypes": [ 742 | { 743 | "name": "value to text", 744 | "value": 1 745 | }, 746 | { 747 | "name": 
"range to text", 748 | "value": 2 749 | } 750 | ], 751 | "maxDataPoints": 100, 752 | "nullPointMode": "connected", 753 | "nullText": null, 754 | "postfix": "", 755 | "postfixFontSize": "50%", 756 | "prefix": "", 757 | "prefixFontSize": "50%", 758 | "rangeMaps": [ 759 | { 760 | "from": "null", 761 | "text": "N/A", 762 | "to": "null" 763 | } 764 | ], 765 | "sparkline": { 766 | "fillColor": "rgba(31, 118, 189, 0.18)", 767 | "full": false, 768 | "lineColor": "rgb(31, 120, 193)", 769 | "show": false 770 | }, 771 | "tableColumn": "", 772 | "targets": [ 773 | { 774 | "expr": "sum((node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 775 | "format": "time_series", 776 | "intervalFactor": 2, 777 | "legendFormat": "", 778 | "refId": "A", 779 | "step": 20 780 | } 781 | ], 782 | "thresholds": "", 783 | "timeFrom": null, 784 | "timeShift": null, 785 | "title": "Total swap memory used", 786 | "type": "singlestat", 787 | "valueFontSize": "80%", 788 | "valueMaps": [ 789 | { 790 | "op": "=", 791 | "text": "N/A", 792 | "value": "null" 793 | } 794 | ], 795 | "valueName": "avg" 796 | }, 797 | { 798 | "cacheTimeout": null, 799 | "colorBackground": false, 800 | "colorValue": false, 801 | "colors": [ 802 | "rgba(50, 172, 45, 0.97)", 803 | "rgba(237, 129, 40, 0.89)", 804 | "rgba(245, 54, 54, 0.9)" 805 | ], 806 | "datasource": null, 807 | "format": "percent", 808 | "gauge": { 809 | "maxValue": 100, 810 | "minValue": 0, 811 | "show": true, 812 | "thresholdLabels": false, 813 | "thresholdMarkers": true 814 | }, 815 | "gridPos": { 816 | "h": 4, 817 | "w": 4, 818 | "x": 10, 819 | "y": 11 820 | }, 821 | "id": 23, 822 | "interval": null, 823 | "links": [], 824 | "mappingType": 1, 825 | "mappingTypes": [ 826 | { 827 | "name": "value to text", 828 | "value": 1 829 | }, 830 | { 831 | "name": "range to text", 832 | "value": 2 833 | } 834 | ], 835 | "maxDataPoints": 100, 836 | "nullPointMode": "connected", 837 | "nullText": 
null, 838 | "postfix": "", 839 | "postfixFontSize": "50%", 840 | "prefix": "", 841 | "prefixFontSize": "50%", 842 | "rangeMaps": [ 843 | { 844 | "from": "null", 845 | "text": "N/A", 846 | "to": "null" 847 | } 848 | ], 849 | "sparkline": { 850 | "fillColor": "rgba(31, 118, 189, 0.18)", 851 | "full": false, 852 | "lineColor": "rgb(31, 120, 193)", 853 | "show": false 854 | }, 855 | "tableColumn": "", 856 | "targets": [ 857 | { 858 | "expr": "sum(((node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / node_memory_SwapTotal_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 859 | "format": "time_series", 860 | "intervalFactor": 2, 861 | "legendFormat": "", 862 | "refId": "A", 863 | "step": 20 864 | } 865 | ], 866 | "thresholds": "5,10,100", 867 | "title": "Used swap memory", 868 | "type": "singlestat", 869 | "valueFontSize": "80%", 870 | "valueMaps": [ 871 | { 872 | "op": "=", 873 | "text": "N/A", 874 | "value": "null" 875 | } 876 | ], 877 | "valueName": "avg" 878 | }, 879 | { 880 | "cacheTimeout": null, 881 | "colorBackground": false, 882 | "colorValue": false, 883 | "colors": [ 884 | "rgba(50, 172, 45, 0.97)", 885 | "rgba(237, 129, 40, 0.89)", 886 | "rgba(245, 54, 54, 0.9)" 887 | ], 888 | "datasource": null, 889 | "format": "percent", 890 | "gauge": { 891 | "maxValue": 100, 892 | "minValue": 0, 893 | "show": true, 894 | "thresholdLabels": false, 895 | "thresholdMarkers": true 896 | }, 897 | "gridPos": { 898 | "h": 4, 899 | "w": 3, 900 | "x": 14, 901 | "y": 11 902 | }, 903 | "id": 24, 904 | "interval": null, 905 | "links": [], 906 | "mappingType": 1, 907 | "mappingTypes": [ 908 | { 909 | "name": "value to text", 910 | "value": 1 911 | }, 912 | { 913 | "name": "range to text", 914 | "value": 2 915 | } 916 | ], 917 | "maxDataPoints": 100, 918 | "nullPointMode": "connected", 919 | "nullText": null, 920 | "postfix": "", 921 | "postfixFontSize": 
"50%", 922 | "prefix": "", 923 | "prefixFontSize": "50%", 924 | "rangeMaps": [ 925 | { 926 | "from": "null", 927 | "text": "N/A", 928 | "to": "null" 929 | } 930 | ], 931 | "sparkline": { 932 | "fillColor": "rgba(31, 118, 189, 0.18)", 933 | "full": false, 934 | "lineColor": "rgb(31, 120, 193)", 935 | "show": false 936 | }, 937 | "tableColumn": "", 938 | "targets": [ 939 | { 940 | "expr": "sum(((node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 941 | "format": "time_series", 942 | "intervalFactor": 2, 943 | "legendFormat": "", 944 | "refId": "A", 945 | "step": 20 946 | } 947 | ], 948 | "thresholds": "5,10,100", 949 | "title": "Swap used / total RAM memory ratio", 950 | "type": "singlestat", 951 | "valueFontSize": "80%", 952 | "valueMaps": [ 953 | { 954 | "op": "=", 955 | "text": "N/A", 956 | "value": "null" 957 | } 958 | ], 959 | "valueName": "avg" 960 | }, 961 | { 962 | "cacheTimeout": null, 963 | "colorBackground": false, 964 | "colorValue": false, 965 | "colors": [ 966 | "rgba(245, 54, 54, 0.9)", 967 | "rgba(237, 129, 40, 0.89)", 968 | "rgba(50, 172, 45, 0.97)" 969 | ], 970 | "datasource": null, 971 | "decimals": 1, 972 | "format": "decbytes", 973 | "gauge": { 974 | "maxValue": 100, 975 | "minValue": 0, 976 | "show": false, 977 | "thresholdLabels": false, 978 | "thresholdMarkers": true 979 | }, 980 | "gridPos": { 981 | "h": 4, 982 | "w": 3, 983 | "x": 17, 984 | "y": 11 985 | }, 986 | "hideTimeOverride": true, 987 | "id": 9, 988 | "interval": null, 989 | "links": [], 990 | "mappingType": 1, 991 | "mappingTypes": [ 992 | { 993 | "name": "value to text", 994 | "value": 1 995 | }, 996 | { 997 | "name": "range to text", 998 | "value": 2 999 | } 1000 | ], 1001 | "maxDataPoints": 100, 1002 | "nullPointMode": "connected", 1003 | "nullText": null, 1004 | "postfix": "", 
1005 | "postfixFontSize": "50%", 1006 | "prefix": "", 1007 | "prefixFontSize": "50%", 1008 | "rangeMaps": [ 1009 | { 1010 | "from": "null", 1011 | "text": "N/A", 1012 | "to": "null" 1013 | } 1014 | ], 1015 | "sparkline": { 1016 | "fillColor": "rgba(31, 118, 189, 0.18)", 1017 | "full": false, 1018 | "lineColor": "rgb(31, 120, 193)", 1019 | "show": false 1020 | }, 1021 | "tableColumn": "", 1022 | "targets": [ 1023 | { 1024 | "expr": "sum(node_filesystem_size_bytes{mountpoint=\"/rootfs\"} * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 1025 | "format": "time_series", 1026 | "intervalFactor": 2, 1027 | "legendFormat": "", 1028 | "refId": "A", 1029 | "step": 20 1030 | } 1031 | ], 1032 | "thresholds": "", 1033 | "timeFrom": null, 1034 | "timeShift": null, 1035 | "title": "Total Disk Space", 1036 | "type": "singlestat", 1037 | "valueFontSize": "80%", 1038 | "valueMaps": [ 1039 | { 1040 | "op": "=", 1041 | "text": "N/A", 1042 | "value": "null" 1043 | } 1044 | ], 1045 | "valueName": "avg" 1046 | }, 1047 | { 1048 | "cacheTimeout": null, 1049 | "colorBackground": false, 1050 | "colorValue": false, 1051 | "colors": [ 1052 | "rgba(245, 54, 54, 0.9)", 1053 | "rgba(237, 129, 40, 0.89)", 1054 | "rgba(50, 172, 45, 0.97)" 1055 | ], 1056 | "datasource": null, 1057 | "format": "percent", 1058 | "gauge": { 1059 | "maxValue": 100, 1060 | "minValue": 0, 1061 | "show": true, 1062 | "thresholdLabels": false, 1063 | "thresholdMarkers": true 1064 | }, 1065 | "gridPos": { 1066 | "h": 4, 1067 | "w": 4, 1068 | "x": 20, 1069 | "y": 11 1070 | }, 1071 | "id": 10, 1072 | "interval": null, 1073 | "links": [], 1074 | "mappingType": 1, 1075 | "mappingTypes": [ 1076 | { 1077 | "name": "value to text", 1078 | "value": 1 1079 | }, 1080 | { 1081 | "name": "range to text", 1082 | "value": 2 1083 | } 1084 | ], 1085 | "maxDataPoints": 100, 1086 | "nullPointMode": "connected", 1087 | "nullText": null, 1088 | "postfix": "", 1089 | "postfixFontSize": "50%", 1090 | "prefix": "", 1091 | 
"prefixFontSize": "50%", 1092 | "rangeMaps": [ 1093 | { 1094 | "from": "null", 1095 | "text": "N/A", 1096 | "to": "null" 1097 | } 1098 | ], 1099 | "sparkline": { 1100 | "fillColor": "rgba(31, 118, 189, 0.18)", 1101 | "full": false, 1102 | "lineColor": "rgb(31, 120, 193)", 1103 | "show": false 1104 | }, 1105 | "tableColumn": "", 1106 | "targets": [ 1107 | { 1108 | "expr": "sum((node_filesystem_free_bytes{mountpoint=\"/rootfs\"} / node_filesystem_size_bytes{mountpoint=\"/rootfs\"}) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 1109 | "format": "time_series", 1110 | "intervalFactor": 2, 1111 | "legendFormat": "", 1112 | "refId": "A", 1113 | "step": 20 1114 | } 1115 | ], 1116 | "thresholds": "10,25,100", 1117 | "title": "Available Disk Space", 1118 | "type": "singlestat", 1119 | "valueFontSize": "80%", 1120 | "valueMaps": [ 1121 | { 1122 | "op": "=", 1123 | "text": "N/A", 1124 | "value": "null" 1125 | } 1126 | ], 1127 | "valueName": "avg" 1128 | }, 1129 | { 1130 | "aliasColors": {}, 1131 | "bars": false, 1132 | "dashLength": 10, 1133 | "dashes": false, 1134 | "datasource": null, 1135 | "fill": 1, 1136 | "gridPos": { 1137 | "h": 7, 1138 | "w": 24, 1139 | "x": 0, 1140 | "y": 15 1141 | }, 1142 | "id": 15, 1143 | "legend": { 1144 | "alignAsTable": true, 1145 | "avg": true, 1146 | "current": false, 1147 | "max": true, 1148 | "min": true, 1149 | "rightSide": true, 1150 | "show": true, 1151 | "total": false, 1152 | "values": true 1153 | }, 1154 | "lines": true, 1155 | "linewidth": 1, 1156 | "links": [], 1157 | "nullPointMode": "null", 1158 | "percentage": false, 1159 | "pointradius": 5, 1160 | "points": false, 1161 | "renderer": "flot", 1162 | "seriesOverrides": [], 1163 | "spaceLength": 10, 1164 | "stack": true, 1165 | "steppedLine": false, 1166 | "targets": [ 1167 | { 1168 | "expr": "sum((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - 
node_memory_Cached_bytes - node_memory_Buffers_bytes - node_memory_Slab_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1169 | "format": "time_series", 1170 | "intervalFactor": 2, 1171 | "legendFormat": "Used {{node_name}}", 1172 | "refId": "A", 1173 | "step": 2 1174 | }, 1175 | { 1176 | "expr": "sum(node_memory_Cached_bytes * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1177 | "format": "time_series", 1178 | "intervalFactor": 2, 1179 | "legendFormat": "Cached {{node_name}}", 1180 | "refId": "B", 1181 | "step": 2 1182 | } 1183 | ], 1184 | "thresholds": [], 1185 | "timeFrom": null, 1186 | "timeShift": null, 1187 | "title": "Memory usage by Node", 1188 | "tooltip": { 1189 | "shared": true, 1190 | "sort": 0, 1191 | "value_type": "individual" 1192 | }, 1193 | "type": "graph", 1194 | "xaxis": { 1195 | "buckets": null, 1196 | "mode": "time", 1197 | "name": null, 1198 | "show": true, 1199 | "values": [] 1200 | }, 1201 | "yaxes": [ 1202 | { 1203 | "format": "decbytes", 1204 | "label": null, 1205 | "logBase": 1, 1206 | "max": null, 1207 | "min": null, 1208 | "show": true 1209 | }, 1210 | { 1211 | "format": "short", 1212 | "label": null, 1213 | "logBase": 1, 1214 | "max": null, 1215 | "min": null, 1216 | "show": true 1217 | } 1218 | ], 1219 | "yaxis": { 1220 | "align": false, 1221 | "alignLevel": null 1222 | } 1223 | }, 1224 | { 1225 | "aliasColors": {}, 1226 | "bars": false, 1227 | "dashLength": 10, 1228 | "dashes": false, 1229 | "datasource": null, 1230 | "fill": 1, 1231 | "gridPos": { 1232 | "h": 7, 1233 | "w": 24, 1234 | "x": 0, 1235 | "y": 22 1236 | }, 1237 | "id": 21, 1238 | "legend": { 1239 | "alignAsTable": true, 1240 | "avg": true, 1241 | "current": false, 1242 | "max": true, 1243 | "min": true, 1244 | "rightSide": true, 1245 | "show": true, 1246 | "total": false, 1247 | "values": true 1248 | }, 1249 | "lines": true, 1250 | "linewidth": 1, 1251 | "links": [], 1252 | "nullPointMode": 
"null", 1253 | "percentage": false, 1254 | "pointradius": 5, 1255 | "points": false, 1256 | "renderer": "flot", 1257 | "seriesOverrides": [], 1258 | "spaceLength": 10, 1259 | "stack": true, 1260 | "steppedLine": false, 1261 | "targets": [ 1262 | { 1263 | "expr": "sum((node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1264 | "format": "time_series", 1265 | "intervalFactor": 2, 1266 | "legendFormat": "Used {{node_name}}", 1267 | "refId": "A", 1268 | "step": 2 1269 | } 1270 | ], 1271 | "thresholds": [], 1272 | "timeFrom": null, 1273 | "timeShift": null, 1274 | "title": "Swap memory usage by Node", 1275 | "tooltip": { 1276 | "shared": true, 1277 | "sort": 0, 1278 | "value_type": "individual" 1279 | }, 1280 | "type": "graph", 1281 | "xaxis": { 1282 | "buckets": null, 1283 | "mode": "time", 1284 | "name": null, 1285 | "show": true, 1286 | "values": [] 1287 | }, 1288 | "yaxes": [ 1289 | { 1290 | "format": "decbytes", 1291 | "label": null, 1292 | "logBase": 1, 1293 | "max": null, 1294 | "min": "0", 1295 | "show": true 1296 | }, 1297 | { 1298 | "format": "short", 1299 | "label": null, 1300 | "logBase": 1, 1301 | "max": null, 1302 | "min": null, 1303 | "show": true 1304 | } 1305 | ], 1306 | "yaxis": { 1307 | "align": false, 1308 | "alignLevel": null 1309 | } 1310 | }, 1311 | { 1312 | "aliasColors": {}, 1313 | "bars": false, 1314 | "dashLength": 10, 1315 | "dashes": false, 1316 | "datasource": null, 1317 | "decimals": 2, 1318 | "fill": 1, 1319 | "gridPos": { 1320 | "h": 7, 1321 | "w": 24, 1322 | "x": 0, 1323 | "y": 29 1324 | }, 1325 | "id": 16, 1326 | "legend": { 1327 | "alignAsTable": true, 1328 | "avg": true, 1329 | "current": false, 1330 | "max": true, 1331 | "min": true, 1332 | "rightSide": true, 1333 | "show": true, 1334 | "total": false, 1335 | "values": true 1336 | }, 1337 | "lines": true, 1338 | "linewidth": 1, 1339 | "links": [], 1340 | "nullPointMode": "null as zero", 
1341 | "percentage": false, 1342 | "pointradius": 5, 1343 | "points": false, 1344 | "renderer": "flot", 1345 | "seriesOverrides": [], 1346 | "spaceLength": 10, 1347 | "stack": false, 1348 | "steppedLine": false, 1349 | "targets": [ 1350 | { 1351 | "expr": "sum(irate(node_disk_read_bytes_total[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1352 | "format": "time_series", 1353 | "intervalFactor": 2, 1354 | "legendFormat": "Read {{node_name}}", 1355 | "refId": "A", 1356 | "step": 2 1357 | }, 1358 | { 1359 | "expr": "sum(irate(node_disk_written_bytes_total[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1360 | "format": "time_series", 1361 | "intervalFactor": 2, 1362 | "legendFormat": "Written {{node_name}}", 1363 | "refId": "B", 1364 | "step": 2 1365 | } 1366 | ], 1367 | "thresholds": [], 1368 | "timeFrom": null, 1369 | "timeShift": null, 1370 | "title": "Disk I/O by Node", 1371 | "tooltip": { 1372 | "shared": true, 1373 | "sort": 0, 1374 | "value_type": "individual" 1375 | }, 1376 | "type": "graph", 1377 | "xaxis": { 1378 | "buckets": null, 1379 | "mode": "time", 1380 | "name": null, 1381 | "show": true, 1382 | "values": [] 1383 | }, 1384 | "yaxes": [ 1385 | { 1386 | "format": "Bps", 1387 | "label": null, 1388 | "logBase": 1, 1389 | "max": null, 1390 | "min": null, 1391 | "show": true 1392 | }, 1393 | { 1394 | "format": "short", 1395 | "label": null, 1396 | "logBase": 1, 1397 | "max": null, 1398 | "min": null, 1399 | "show": true 1400 | } 1401 | ], 1402 | "yaxis": { 1403 | "align": false, 1404 | "alignLevel": null 1405 | } 1406 | }, 1407 | { 1408 | "aliasColors": {}, 1409 | "bars": false, 1410 | "dashLength": 10, 1411 | "dashes": false, 1412 | "datasource": null, 1413 | "decimals": 2, 1414 | "fill": 1, 1415 | "gridPos": { 1416 | "h": 7, 1417 | "w": 12, 1418 | "x": 0, 1419 | "y": 36 1420 | }, 1421 | "id": 18, 1422 | "legend": { 1423 | "alignAsTable": true, 1424 | 
"avg": true, 1425 | "current": true, 1426 | "max": true, 1427 | "min": true, 1428 | "rightSide": true, 1429 | "show": false, 1430 | "total": false, 1431 | "values": true 1432 | }, 1433 | "lines": true, 1434 | "linewidth": 1, 1435 | "links": [], 1436 | "nullPointMode": "null as zero", 1437 | "percentage": false, 1438 | "pointradius": 5, 1439 | "points": false, 1440 | "renderer": "flot", 1441 | "seriesOverrides": [], 1442 | "spaceLength": 10, 1443 | "stack": false, 1444 | "steppedLine": false, 1445 | "targets": [ 1446 | { 1447 | "expr": "sum(irate(node_disk_reads_completed_total[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1448 | "format": "time_series", 1449 | "intervalFactor": 2, 1450 | "legendFormat": "Reads {{node_name}}", 1451 | "refId": "A", 1452 | "step": 2 1453 | }, 1454 | { 1455 | "expr": "sum(irate(node_disk_writes_completed_total[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", 1456 | "format": "time_series", 1457 | "intervalFactor": 2, 1458 | "legendFormat": "Writes {{node_name}}", 1459 | "refId": "B", 1460 | "step": 2 1461 | } 1462 | ], 1463 | "thresholds": [], 1464 | "timeFrom": null, 1465 | "timeShift": null, 1466 | "title": "IOPS by Node", 1467 | "tooltip": { 1468 | "shared": true, 1469 | "sort": 0, 1470 | "value_type": "individual" 1471 | }, 1472 | "type": "graph", 1473 | "xaxis": { 1474 | "buckets": null, 1475 | "mode": "time", 1476 | "name": null, 1477 | "show": true, 1478 | "values": [] 1479 | }, 1480 | "yaxes": [ 1481 | { 1482 | "format": "short", 1483 | "label": null, 1484 | "logBase": 1, 1485 | "max": null, 1486 | "min": null, 1487 | "show": true 1488 | }, 1489 | { 1490 | "format": "short", 1491 | "label": null, 1492 | "logBase": 1, 1493 | "max": null, 1494 | "min": null, 1495 | "show": true 1496 | } 1497 | ], 1498 | "yaxis": { 1499 | "align": false, 1500 | "alignLevel": null 1501 | } 1502 | }, 1503 | { 1504 | "aliasColors": {}, 1505 | 
"bars": false, 1506 | "dashLength": 10, 1507 | "dashes": false, 1508 | "datasource": null, 1509 | "decimals": 2, 1510 | "fill": 1, 1511 | "gridPos": { 1512 | "h": 7, 1513 | "w": 12, 1514 | "x": 12, 1515 | "y": 36 1516 | }, 1517 | "id": 19, 1518 | "legend": { 1519 | "alignAsTable": true, 1520 | "avg": true, 1521 | "current": true, 1522 | "hideEmpty": true, 1523 | "hideZero": true, 1524 | "max": true, 1525 | "min": true, 1526 | "rightSide": true, 1527 | "show": false, 1528 | "total": false, 1529 | "values": true 1530 | }, 1531 | "lines": true, 1532 | "linewidth": 1, 1533 | "links": [], 1534 | "nullPointMode": "null as zero", 1535 | "percentage": false, 1536 | "pointradius": 5, 1537 | "points": false, 1538 | "renderer": "flot", 1539 | "seriesOverrides": [], 1540 | "spaceLength": 10, 1541 | "stack": false, 1542 | "steppedLine": false, 1543 | "targets": [ 1544 | { 1545 | "expr": "(avg(irate(node_cpu_seconds_total{mode=\"iowait\"}[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) by (node_name))", 1546 | "format": "time_series", 1547 | "intervalFactor": 2, 1548 | "legendFormat": "{{node_name}}", 1549 | "refId": "A", 1550 | "step": 2 1551 | } 1552 | ], 1553 | "thresholds": [], 1554 | "timeFrom": null, 1555 | "timeShift": null, 1556 | "title": "CPU IO Wait by Node", 1557 | "tooltip": { 1558 | "shared": true, 1559 | "sort": 2, 1560 | "value_type": "individual" 1561 | }, 1562 | "type": "graph", 1563 | "xaxis": { 1564 | "buckets": null, 1565 | "mode": "time", 1566 | "name": null, 1567 | "show": true, 1568 | "values": [] 1569 | }, 1570 | "yaxes": [ 1571 | { 1572 | "format": "percent", 1573 | "label": null, 1574 | "logBase": 1, 1575 | "max": null, 1576 | "min": null, 1577 | "show": true 1578 | }, 1579 | { 1580 | "format": "short", 1581 | "label": null, 1582 | "logBase": 1, 1583 | "max": null, 1584 | "min": null, 1585 | "show": true 1586 | } 1587 | ], 1588 | "yaxis": { 1589 | "align": false, 1590 | "alignLevel": null 1591 | } 1592 | }, 1593 
| { 1594 | "aliasColors": {}, 1595 | "bars": false, 1596 | "dashLength": 10, 1597 | "dashes": false, 1598 | "datasource": null, 1599 | "decimals": 0, 1600 | "fill": 3, 1601 | "gridPos": { 1602 | "h": 7, 1603 | "w": 18, 1604 | "x": 0, 1605 | "y": 43 1606 | }, 1607 | "id": 12, 1608 | "legend": { 1609 | "alignAsTable": true, 1610 | "avg": false, 1611 | "current": true, 1612 | "hideEmpty": true, 1613 | "hideZero": true, 1614 | "max": false, 1615 | "min": false, 1616 | "rightSide": true, 1617 | "show": true, 1618 | "sort": "current", 1619 | "sortDesc": true, 1620 | "total": false, 1621 | "values": true 1622 | }, 1623 | "lines": true, 1624 | "linewidth": 1, 1625 | "links": [], 1626 | "nullPointMode": "null", 1627 | "percentage": false, 1628 | "pointradius": 5, 1629 | "points": false, 1630 | "renderer": "flot", 1631 | "seriesOverrides": [], 1632 | "spaceLength": 10, 1633 | "stack": true, 1634 | "steppedLine": false, 1635 | "targets": [ 1636 | { 1637 | "expr": "sum(rate(container_last_seen{container_label_com_docker_swarm_node_id=~\"$node_id\"}[5m])) by (container_label_com_docker_swarm_service_name)", 1638 | "format": "time_series", 1639 | "intervalFactor": 10, 1640 | "legendFormat": "{{ container_label_com_docker_swarm_service_name }}", 1641 | "refId": "A", 1642 | "step": 10 1643 | } 1644 | ], 1645 | "thresholds": [], 1646 | "timeFrom": null, 1647 | "timeShift": null, 1648 | "title": "Running Containers by Service", 1649 | "tooltip": { 1650 | "shared": true, 1651 | "sort": 2, 1652 | "value_type": "individual" 1653 | }, 1654 | "type": "graph", 1655 | "xaxis": { 1656 | "buckets": null, 1657 | "mode": "time", 1658 | "name": null, 1659 | "show": true, 1660 | "values": [] 1661 | }, 1662 | "yaxes": [ 1663 | { 1664 | "format": "short", 1665 | "label": null, 1666 | "logBase": 1, 1667 | "max": null, 1668 | "min": null, 1669 | "show": true 1670 | }, 1671 | { 1672 | "format": "short", 1673 | "label": null, 1674 | "logBase": 1, 1675 | "max": null, 1676 | "min": null, 1677 | "show": 
true 1678 | } 1679 | ], 1680 | "yaxis": { 1681 | "align": false, 1682 | "alignLevel": null 1683 | } 1684 | }, 1685 | { 1686 | "cacheTimeout": null, 1687 | "colorBackground": false, 1688 | "colorValue": false, 1689 | "colors": [ 1690 | "rgba(245, 54, 54, 0.9)", 1691 | "rgba(237, 129, 40, 0.89)", 1692 | "rgba(50, 172, 45, 0.97)" 1693 | ], 1694 | "datasource": null, 1695 | "format": "none", 1696 | "gauge": { 1697 | "maxValue": 100, 1698 | "minValue": 0, 1699 | "show": false, 1700 | "thresholdLabels": false, 1701 | "thresholdMarkers": true 1702 | }, 1703 | "gridPos": { 1704 | "h": 7, 1705 | "w": 6, 1706 | "x": 18, 1707 | "y": 43 1708 | }, 1709 | "id": 7, 1710 | "interval": null, 1711 | "links": [], 1712 | "mappingType": 1, 1713 | "mappingTypes": [ 1714 | { 1715 | "name": "value to text", 1716 | "value": 1 1717 | }, 1718 | { 1719 | "name": "range to text", 1720 | "value": 2 1721 | } 1722 | ], 1723 | "maxDataPoints": 100, 1724 | "nullPointMode": "connected", 1725 | "nullText": null, 1726 | "postfix": "", 1727 | "postfixFontSize": "50%", 1728 | "prefix": "", 1729 | "prefixFontSize": "50%", 1730 | "rangeMaps": [ 1731 | { 1732 | "from": "null", 1733 | "text": "N/A", 1734 | "to": "null" 1735 | } 1736 | ], 1737 | "sparkline": { 1738 | "fillColor": "rgba(31, 118, 189, 0.18)", 1739 | "full": false, 1740 | "lineColor": "rgb(31, 120, 193)", 1741 | "show": true 1742 | }, 1743 | "tableColumn": "", 1744 | "targets": [ 1745 | { 1746 | "expr": "count(rate(container_last_seen{container_label_com_docker_swarm_node_id=~\"$node_id\"}[5m])) ", 1747 | "format": "time_series", 1748 | "intervalFactor": 2, 1749 | "refId": "A", 1750 | "step": 20 1751 | } 1752 | ], 1753 | "thresholds": "", 1754 | "title": "Total Containers", 1755 | "type": "singlestat", 1756 | "valueFontSize": "80%", 1757 | "valueMaps": [ 1758 | { 1759 | "op": "=", 1760 | "text": "N/A", 1761 | "value": "null" 1762 | } 1763 | ], 1764 | "valueName": "avg" 1765 | }, 1766 | { 1767 | "aliasColors": {}, 1768 | "bars": false, 1769 | 
"dashLength": 10, 1770 | "dashes": false, 1771 | "datasource": null, 1772 | "fill": 1, 1773 | "gridPos": { 1774 | "h": 7, 1775 | "w": 24, 1776 | "x": 0, 1777 | "y": 50 1778 | }, 1779 | "id": 17, 1780 | "legend": { 1781 | "alignAsTable": true, 1782 | "avg": true, 1783 | "current": false, 1784 | "max": true, 1785 | "min": true, 1786 | "rightSide": true, 1787 | "show": true, 1788 | "total": false, 1789 | "values": true 1790 | }, 1791 | "lines": true, 1792 | "linewidth": 1, 1793 | "links": [], 1794 | "nullPointMode": "null", 1795 | "percentage": false, 1796 | "pointradius": 5, 1797 | "points": false, 1798 | "renderer": "flot", 1799 | "seriesOverrides": [], 1800 | "spaceLength": 10, 1801 | "stack": false, 1802 | "steppedLine": false, 1803 | "targets": [ 1804 | { 1805 | "expr": "sum(rate(container_network_receive_bytes_total{container_label_com_docker_swarm_node_id=~\"$node_id\"}[$interval]) * on(container_label_com_docker_swarm_node_id) group_left(node_name) node_meta) by (node_name)", 1806 | "format": "time_series", 1807 | "intervalFactor": 2, 1808 | "legendFormat": "IN {{node_name}}", 1809 | "refId": "A", 1810 | "step": 2 1811 | }, 1812 | { 1813 | "expr": "- sum(rate(container_network_transmit_bytes_total{container_label_com_docker_swarm_node_id=~\"$node_id\"}[$interval]) * on(container_label_com_docker_swarm_node_id) group_left(node_name) node_meta) by (node_name)", 1814 | "format": "time_series", 1815 | "hide": false, 1816 | "intervalFactor": 2, 1817 | "legendFormat": "OUT {{node_name}}", 1818 | "metric": "", 1819 | "refId": "B", 1820 | "step": 2 1821 | } 1822 | ], 1823 | "thresholds": [], 1824 | "timeFrom": null, 1825 | "timeShift": null, 1826 | "title": "Containers Network Traffic by Node", 1827 | "tooltip": { 1828 | "shared": true, 1829 | "sort": 0, 1830 | "value_type": "individual" 1831 | }, 1832 | "type": "graph", 1833 | "xaxis": { 1834 | "buckets": null, 1835 | "mode": "time", 1836 | "name": null, 1837 | "show": true, 1838 | "values": [] 1839 | }, 1840 | 
"yaxes": [ 1841 | { 1842 | "format": "Bps", 1843 | "label": null, 1844 | "logBase": 1, 1845 | "max": null, 1846 | "min": null, 1847 | "show": true 1848 | }, 1849 | { 1850 | "format": "short", 1851 | "label": null, 1852 | "logBase": 1, 1853 | "max": null, 1854 | "min": null, 1855 | "show": true 1856 | } 1857 | ], 1858 | "yaxis": { 1859 | "align": false, 1860 | "alignLevel": null 1861 | } 1862 | }, 1863 | { 1864 | "columns": [], 1865 | "datasource": null, 1866 | "fontSize": "100%", 1867 | "gridPos": { 1868 | "h": 7, 1869 | "w": 24, 1870 | "x": 0, 1871 | "y": 57 1872 | }, 1873 | "hideTimeOverride": true, 1874 | "id": 20, 1875 | "links": [], 1876 | "pageSize": null, 1877 | "scroll": true, 1878 | "showHeader": true, 1879 | "sort": { 1880 | "col": 0, 1881 | "desc": true 1882 | }, 1883 | "styles": [ 1884 | { 1885 | "alias": "Time", 1886 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 1887 | "pattern": "Time", 1888 | "type": "hidden" 1889 | }, 1890 | { 1891 | "alias": "", 1892 | "colorMode": null, 1893 | "colors": [ 1894 | "rgba(245, 54, 54, 0.9)", 1895 | "rgba(237, 129, 40, 0.89)", 1896 | "rgba(50, 172, 45, 0.97)" 1897 | ], 1898 | "decimals": 2, 1899 | "pattern": "/.*/", 1900 | "thresholds": [], 1901 | "type": "number", 1902 | "unit": "short" 1903 | } 1904 | ], 1905 | "targets": [ 1906 | { 1907 | "expr": "sum(node_meta) by (node_id, node_name, instance)", 1908 | "format": "table", 1909 | "instant": true, 1910 | "intervalFactor": 2, 1911 | "refId": "A", 1912 | "step": 2 1913 | } 1914 | ], 1915 | "timeFrom": "1s", 1916 | "title": "Cluster members", 1917 | "transform": "table", 1918 | "type": "table" 1919 | } 1920 | ], 1921 | "refresh": "30s", 1922 | "schemaVersion": 16, 1923 | "style": "dark", 1924 | "tags": [ 1925 | "swarmprom" 1926 | ], 1927 | "templating": { 1928 | "list": [ 1929 | { 1930 | "allValue": ".+", 1931 | "current": { 1932 | "text": "All", 1933 | "value": "$__all" 1934 | }, 1935 | "datasource": "Prometheus", 1936 | "hide": 0, 1937 | "includeAll": true, 1938 | "label": 
"Swarm Node", 1939 | "multi": false, 1940 | "name": "node_id", 1941 | "options": [], 1942 | "query": "node_meta", 1943 | "refresh": 1, 1944 | "regex": "/node_id=\"([^\"]+)\"/", 1945 | "skipUrlSync": false, 1946 | "sort": 0, 1947 | "tagValuesQuery": "label_values({node_id=\"$tag\"},node_name)", 1948 | "tags": [ 1949 | "ofdocker", 1950 | "ofmon" 1951 | ], 1952 | "tagsQuery": "label_values(node_meta, node_name)", 1953 | "type": "query", 1954 | "useTags": true 1955 | }, 1956 | { 1957 | "auto": true, 1958 | "auto_count": 30, 1959 | "auto_min": "30s", 1960 | "current": { 1961 | "text": "auto", 1962 | "value": "$__auto_interval_interval" 1963 | }, 1964 | "hide": 0, 1965 | "label": "Interval", 1966 | "name": "interval", 1967 | "options": [ 1968 | { 1969 | "selected": true, 1970 | "text": "auto", 1971 | "value": "$__auto_interval_interval" 1972 | }, 1973 | { 1974 | "selected": false, 1975 | "text": "1m", 1976 | "value": "1m" 1977 | }, 1978 | { 1979 | "selected": false, 1980 | "text": "10m", 1981 | "value": "10m" 1982 | }, 1983 | { 1984 | "selected": false, 1985 | "text": "30m", 1986 | "value": "30m" 1987 | }, 1988 | { 1989 | "selected": false, 1990 | "text": "1h", 1991 | "value": "1h" 1992 | }, 1993 | { 1994 | "selected": false, 1995 | "text": "6h", 1996 | "value": "6h" 1997 | }, 1998 | { 1999 | "selected": false, 2000 | "text": "12h", 2001 | "value": "12h" 2002 | }, 2003 | { 2004 | "selected": false, 2005 | "text": "1d", 2006 | "value": "1d" 2007 | }, 2008 | { 2009 | "selected": false, 2010 | "text": "7d", 2011 | "value": "7d" 2012 | }, 2013 | { 2014 | "selected": false, 2015 | "text": "14d", 2016 | "value": "14d" 2017 | }, 2018 | { 2019 | "selected": false, 2020 | "text": "30d", 2021 | "value": "30d" 2022 | } 2023 | ], 2024 | "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", 2025 | "refresh": 2, 2026 | "skipUrlSync": false, 2027 | "type": "interval" 2028 | } 2029 | ] 2030 | }, 2031 | "time": { 2032 | "from": "now-15m", 2033 | "to": "now" 2034 | }, 2035 | "timepicker": { 2036 
| "refresh_intervals": [ 2037 | "5s", 2038 | "10s", 2039 | "30s", 2040 | "1m", 2041 | "5m", 2042 | "15m", 2043 | "30m", 2044 | "1h", 2045 | "2h", 2046 | "1d" 2047 | ], 2048 | "time_options": [ 2049 | "5m", 2050 | "15m", 2051 | "1h", 2052 | "6h", 2053 | "12h", 2054 | "24h", 2055 | "2d", 2056 | "7d", 2057 | "30d" 2058 | ] 2059 | }, 2060 | "timezone": "", 2061 | "title": "Docker Swarm Nodes", 2062 | "uid": "BPlb-Sgik", 2063 | "version": 3 2064 | } 2065 | -------------------------------------------------------------------------------- /grafana/dashboards/swarmprom-prometheus-dash.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "$$hashKey": "object:698", 6 | "builtIn": 1, 7 | "datasource": "-- Grafana --", 8 | "enable": true, 9 | "hide": true, 10 | "iconColor": "rgba(0, 211, 255, 1)", 11 | "name": "Annotations & Alerts", 12 | "type": "dashboard" 13 | } 14 | ] 15 | }, 16 | "editable": true, 17 | "gnetId": null, 18 | "graphTooltip": 1, 19 | "links": [ 20 | { 21 | "icon": "info", 22 | "tags": [], 23 | "targetBlank": true, 24 | "title": "Grafana Docs", 25 | "tooltip": "", 26 | "type": "link", 27 | "url": "http://docs.grafana.org/" 28 | }, 29 | { 30 | "icon": "info", 31 | "tags": [], 32 | "targetBlank": true, 33 | "title": "Prometheus Docs", 34 | "type": "link", 35 | "url": "http://prometheus.io/docs/introduction/overview/" 36 | } 37 | ], 38 | "panels": [ 39 | { 40 | "aliasColors": { 41 | "prometheus": "#C15C17", 42 | "{instance=\"localhost:9090\",job=\"prometheus\"}": "#CCA300" 43 | }, 44 | "bars": false, 45 | "dashLength": 10, 46 | "dashes": false, 47 | "datasource": "Prometheus", 48 | "editable": true, 49 | "error": false, 50 | "fill": 0, 51 | "grid": {}, 52 | "gridPos": { 53 | "h": 5, 54 | "w": 6, 55 | "x": 0, 56 | "y": 0 57 | }, 58 | "id": 3, 59 | "legend": { 60 | "avg": false, 61 | "current": false, 62 | "max": false, 63 | "min": false, 64 | "show": true, 65 | "total": false, 66 | 
"values": false 67 | }, 68 | "lines": true, 69 | "linewidth": 1, 70 | "links": [], 71 | "nullPointMode": "connected", 72 | "percentage": false, 73 | "pointradius": 2, 74 | "points": false, 75 | "renderer": "flot", 76 | "seriesOverrides": [], 77 | "spaceLength": 10, 78 | "stack": false, 79 | "steppedLine": false, 80 | "targets": [ 81 | { 82 | "expr": "sum(irate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus\"}[5m]))", 83 | "format": "time_series", 84 | "hide": false, 85 | "interval": "", 86 | "intervalFactor": 2, 87 | "legendFormat": "samples", 88 | "metric": "", 89 | "refId": "A", 90 | "step": 20 91 | } 92 | ], 93 | "thresholds": [], 94 | "timeFrom": null, 95 | "timeShift": null, 96 | "title": "Samples Appended", 97 | "tooltip": { 98 | "shared": true, 99 | "sort": 0, 100 | "value_type": "cumulative" 101 | }, 102 | "type": "graph", 103 | "xaxis": { 104 | "buckets": null, 105 | "mode": "time", 106 | "name": null, 107 | "show": true, 108 | "values": [] 109 | }, 110 | "yaxes": [ 111 | { 112 | "format": "short", 113 | "logBase": 1, 114 | "max": null, 115 | "min": "0", 116 | "show": true 117 | }, 118 | { 119 | "format": "short", 120 | "logBase": 1, 121 | "max": null, 122 | "min": null, 123 | "show": true 124 | } 125 | ] 126 | }, 127 | { 128 | "aliasColors": {}, 129 | "bars": false, 130 | "dashLength": 10, 131 | "dashes": false, 132 | "datasource": "Prometheus", 133 | "editable": true, 134 | "error": false, 135 | "fill": 0, 136 | "grid": {}, 137 | "gridPos": { 138 | "h": 5, 139 | "w": 6, 140 | "x": 6, 141 | "y": 0 142 | }, 143 | "id": 14, 144 | "legend": { 145 | "avg": false, 146 | "current": false, 147 | "max": false, 148 | "min": false, 149 | "show": true, 150 | "total": false, 151 | "values": false 152 | }, 153 | "lines": true, 154 | "linewidth": 1, 155 | "links": [], 156 | "nullPointMode": "connected", 157 | "percentage": false, 158 | "pointradius": 5, 159 | "points": false, 160 | "renderer": "flot", 161 | "seriesOverrides": [], 162 | "spaceLength": 10, 
163 | "stack": false, 164 | "steppedLine": false, 165 | "targets": [ 166 | { 167 | "expr": "topk(5, max(scrape_duration_seconds) by (job))", 168 | "format": "time_series", 169 | "interval": "", 170 | "intervalFactor": 2, 171 | "legendFormat": "{{job}}", 172 | "metric": "", 173 | "refId": "A", 174 | "step": 20 175 | } 176 | ], 177 | "thresholds": [], 178 | "timeFrom": null, 179 | "timeShift": null, 180 | "title": "Scrape Duration", 181 | "tooltip": { 182 | "shared": true, 183 | "sort": 0, 184 | "value_type": "cumulative" 185 | }, 186 | "type": "graph", 187 | "xaxis": { 188 | "buckets": null, 189 | "mode": "time", 190 | "name": null, 191 | "show": true, 192 | "values": [] 193 | }, 194 | "yaxes": [ 195 | { 196 | "format": "s", 197 | "logBase": 1, 198 | "max": null, 199 | "min": null, 200 | "show": true 201 | }, 202 | { 203 | "format": "short", 204 | "logBase": 1, 205 | "max": null, 206 | "min": null, 207 | "show": true 208 | } 209 | ] 210 | }, 211 | { 212 | "aliasColors": {}, 213 | "bars": false, 214 | "dashLength": 10, 215 | "dashes": false, 216 | "datasource": "Prometheus", 217 | "description": "", 218 | "fill": 0, 219 | "gridPos": { 220 | "h": 5, 221 | "w": 6, 222 | "x": 12, 223 | "y": 0 224 | }, 225 | "id": 16, 226 | "legend": { 227 | "avg": false, 228 | "current": false, 229 | "max": false, 230 | "min": false, 231 | "show": true, 232 | "total": false, 233 | "values": false 234 | }, 235 | "lines": true, 236 | "linewidth": 1, 237 | "links": [], 238 | "nullPointMode": "null", 239 | "percentage": false, 240 | "pointradius": 5, 241 | "points": false, 242 | "renderer": "flot", 243 | "seriesOverrides": [], 244 | "spaceLength": 10, 245 | "stack": false, 246 | "steppedLine": false, 247 | "targets": [ 248 | { 249 | "expr": "sum(process_resident_memory_bytes{job=\"prometheus\"})", 250 | "format": "time_series", 251 | "hide": false, 252 | "interval": "", 253 | "intervalFactor": 2, 254 | "legendFormat": "p8s process resident memory", 255 | "refId": "D", 256 | "step": 20 257 | 
}, 258 | { 259 | "expr": "process_virtual_memory_bytes{job=\"prometheus\"}", 260 | "format": "time_series", 261 | "hide": false, 262 | "intervalFactor": 2, 263 | "legendFormat": "virtual memory", 264 | "refId": "C", 265 | "step": 20 266 | } 267 | ], 268 | "thresholds": [], 269 | "timeFrom": null, 270 | "timeShift": null, 271 | "title": "Memory Profile", 272 | "tooltip": { 273 | "shared": true, 274 | "sort": 2, 275 | "value_type": "individual" 276 | }, 277 | "transparent": false, 278 | "type": "graph", 279 | "xaxis": { 280 | "buckets": null, 281 | "mode": "time", 282 | "name": null, 283 | "show": true, 284 | "values": [] 285 | }, 286 | "yaxes": [ 287 | { 288 | "format": "bytes", 289 | "label": "", 290 | "logBase": 1, 291 | "max": null, 292 | "min": "0", 293 | "show": true 294 | }, 295 | { 296 | "format": "short", 297 | "label": null, 298 | "logBase": 1, 299 | "max": null, 300 | "min": null, 301 | "show": true 302 | } 303 | ] 304 | }, 305 | { 306 | "cacheTimeout": null, 307 | "colorBackground": false, 308 | "colorValue": true, 309 | "colors": [ 310 | "rgba(50, 172, 45, 0.97)", 311 | "rgba(237, 129, 40, 0.89)", 312 | "rgba(245, 54, 54, 0.9)" 313 | ], 314 | "datasource": "Prometheus", 315 | "format": "none", 316 | "gauge": { 317 | "maxValue": 100, 318 | "minValue": 0, 319 | "show": false, 320 | "thresholdLabels": false, 321 | "thresholdMarkers": true 322 | }, 323 | "gridPos": { 324 | "h": 5, 325 | "w": 6, 326 | "x": 18, 327 | "y": 0 328 | }, 329 | "id": 37, 330 | "interval": null, 331 | "links": [], 332 | "mappingType": 1, 333 | "mappingTypes": [ 334 | { 335 | "name": "value to text", 336 | "value": 1 337 | }, 338 | { 339 | "name": "range to text", 340 | "value": 2 341 | } 342 | ], 343 | "maxDataPoints": 100, 344 | "nullPointMode": "connected", 345 | "nullText": null, 346 | "postfix": "", 347 | "postfixFontSize": "50%", 348 | "prefix": "", 349 | "prefixFontSize": "50%", 350 | "rangeMaps": [ 351 | { 352 | "from": "null", 353 | "text": "N/A", 354 | "to": "null" 355 | } 
356 | ], 357 | "sparkline": { 358 | "fillColor": "rgba(31, 118, 189, 0.18)", 359 | "full": false, 360 | "lineColor": "rgb(31, 120, 193)", 361 | "show": false 362 | }, 363 | "tableColumn": "", 364 | "targets": [ 365 | { 366 | "expr": "prometheus_tsdb_wal_corruptions_total{job=\"prometheus\"}", 367 | "format": "time_series", 368 | "intervalFactor": 2, 369 | "legendFormat": "", 370 | "refId": "A", 371 | "step": 60 372 | } 373 | ], 374 | "thresholds": "0.1,1", 375 | "title": "WAL Corruptions", 376 | "type": "singlestat", 377 | "valueFontSize": "200%", 378 | "valueMaps": [ 379 | { 380 | "op": "=", 381 | "text": "None", 382 | "value": "0" 383 | } 384 | ], 385 | "valueName": "max" 386 | }, 387 | { 388 | "aliasColors": {}, 389 | "bars": false, 390 | "dashLength": 10, 391 | "dashes": false, 392 | "datasource": "Prometheus", 393 | "fill": 0, 394 | "gridPos": { 395 | "h": 5, 396 | "w": 6, 397 | "x": 0, 398 | "y": 5 399 | }, 400 | "id": 29, 401 | "legend": { 402 | "avg": false, 403 | "current": false, 404 | "max": false, 405 | "min": false, 406 | "show": true, 407 | "total": false, 408 | "values": false 409 | }, 410 | "lines": true, 411 | "linewidth": 1, 412 | "links": [], 413 | "nullPointMode": "null", 414 | "percentage": false, 415 | "pointradius": 5, 416 | "points": false, 417 | "renderer": "flot", 418 | "seriesOverrides": [], 419 | "spaceLength": 10, 420 | "stack": false, 421 | "steppedLine": false, 422 | "targets": [ 423 | { 424 | "expr": "sum(prometheus_tsdb_head_active_appenders{job=\"prometheus\"})", 425 | "format": "time_series", 426 | "interval": "", 427 | "intervalFactor": 2, 428 | "legendFormat": "active_appenders", 429 | "metric": "", 430 | "refId": "A", 431 | "step": 20 432 | }, 433 | { 434 | "expr": "sum(process_open_fds{job=\"prometheus\"})", 435 | "format": "time_series", 436 | "interval": "", 437 | "intervalFactor": 2, 438 | "legendFormat": "open_fds", 439 | "refId": "B", 440 | "step": 20 441 | } 442 | ], 443 | "thresholds": [], 444 | "timeFrom": null, 445 | 
"timeShift": null, 446 | "title": "Active Appenders", 447 | "tooltip": { 448 | "shared": true, 449 | "sort": 0, 450 | "value_type": "individual" 451 | }, 452 | "type": "graph", 453 | "xaxis": { 454 | "buckets": null, 455 | "mode": "time", 456 | "name": null, 457 | "show": true, 458 | "values": [] 459 | }, 460 | "yaxes": [ 461 | { 462 | "format": "short", 463 | "label": null, 464 | "logBase": 1, 465 | "max": null, 466 | "min": null, 467 | "show": true 468 | }, 469 | { 470 | "format": "short", 471 | "label": null, 472 | "logBase": 1, 473 | "max": null, 474 | "min": null, 475 | "show": false 476 | } 477 | ] 478 | }, 479 | { 480 | "aliasColors": { 481 | "prometheus": "#F9BA8F", 482 | "{instance=\"localhost:9090\",interval=\"5s\",job=\"prometheus\"}": "#F9BA8F" 483 | }, 484 | "bars": false, 485 | "dashLength": 10, 486 | "dashes": false, 487 | "datasource": "Prometheus", 488 | "editable": true, 489 | "error": false, 490 | "fill": 0, 491 | "grid": {}, 492 | "gridPos": { 493 | "h": 5, 494 | "w": 6, 495 | "x": 6, 496 | "y": 5 497 | }, 498 | "id": 2, 499 | "legend": { 500 | "avg": false, 501 | "current": false, 502 | "max": false, 503 | "min": false, 504 | "show": true, 505 | "total": false, 506 | "values": false 507 | }, 508 | "lines": true, 509 | "linewidth": 1, 510 | "links": [], 511 | "nullPointMode": "connected", 512 | "percentage": false, 513 | "pointradius": 5, 514 | "points": false, 515 | "renderer": "flot", 516 | "seriesOverrides": [], 517 | "spaceLength": 10, 518 | "stack": false, 519 | "steppedLine": false, 520 | "targets": [ 521 | { 522 | "expr": "prometheus_tsdb_blocks_loaded{job=\"prometheus\"}", 523 | "format": "time_series", 524 | "intervalFactor": 2, 525 | "legendFormat": "blocks", 526 | "refId": "A", 527 | "step": 20 528 | } 529 | ], 530 | "thresholds": [], 531 | "timeFrom": null, 532 | "timeShift": null, 533 | "title": "Blocks Loaded", 534 | "tooltip": { 535 | "shared": true, 536 | "sort": 0, 537 | "value_type": "cumulative" 538 | }, 539 | "type": "graph", 
540 | "xaxis": { 541 | "buckets": null, 542 | "mode": "time", 543 | "name": null, 544 | "show": true, 545 | "values": [] 546 | }, 547 | "yaxes": [ 548 | { 549 | "format": "short", 550 | "logBase": 1, 551 | "max": null, 552 | "min": null, 553 | "show": true 554 | }, 555 | { 556 | "format": "short", 557 | "logBase": 1, 558 | "max": null, 559 | "min": null, 560 | "show": true 561 | } 562 | ] 563 | }, 564 | { 565 | "aliasColors": {}, 566 | "bars": false, 567 | "dashLength": 10, 568 | "dashes": false, 569 | "datasource": "Prometheus", 570 | "decimals": null, 571 | "description": "", 572 | "fill": 0, 573 | "gridPos": { 574 | "h": 5, 575 | "w": 6, 576 | "x": 12, 577 | "y": 5 578 | }, 579 | "id": 33, 580 | "legend": { 581 | "avg": false, 582 | "current": false, 583 | "max": false, 584 | "min": false, 585 | "show": true, 586 | "total": false, 587 | "values": false 588 | }, 589 | "lines": true, 590 | "linewidth": 1, 591 | "links": [], 592 | "nullPointMode": "connected", 593 | "percentage": false, 594 | "pointradius": 5, 595 | "points": false, 596 | "renderer": "flot", 597 | "seriesOverrides": [], 598 | "spaceLength": 10, 599 | "stack": false, 600 | "steppedLine": false, 601 | "targets": [ 602 | { 603 | "expr": "prometheus_tsdb_head_chunks{job=\"prometheus\"}", 604 | "format": "time_series", 605 | "interval": "", 606 | "intervalFactor": 2, 607 | "legendFormat": "chunks", 608 | "refId": "A", 609 | "step": 20 610 | } 611 | ], 612 | "thresholds": [], 613 | "timeFrom": null, 614 | "timeShift": null, 615 | "title": "Head Chunks", 616 | "tooltip": { 617 | "shared": true, 618 | "sort": 0, 619 | "value_type": "individual" 620 | }, 621 | "type": "graph", 622 | "xaxis": { 623 | "buckets": null, 624 | "mode": "time", 625 | "name": null, 626 | "show": true, 627 | "values": [] 628 | }, 629 | "yaxes": [ 630 | { 631 | "format": "short", 632 | "label": null, 633 | "logBase": 1, 634 | "max": null, 635 | "min": null, 636 | "show": true 637 | }, 638 | { 639 | "format": "bytes", 640 | "label": 
"", 641 | "logBase": 1, 642 | "max": null, 643 | "min": null, 644 | "show": false 645 | } 646 | ] 647 | }, 648 | { 649 | "aliasColors": {}, 650 | "bars": false, 651 | "dashLength": 10, 652 | "dashes": false, 653 | "datasource": "Prometheus", 654 | "fill": 1, 655 | "gridPos": { 656 | "h": 5, 657 | "w": 6, 658 | "x": 18, 659 | "y": 5 660 | }, 661 | "id": 36, 662 | "legend": { 663 | "avg": false, 664 | "current": false, 665 | "max": false, 666 | "min": false, 667 | "show": true, 668 | "total": false, 669 | "values": false 670 | }, 671 | "lines": true, 672 | "linewidth": 1, 673 | "links": [], 674 | "nullPointMode": "null", 675 | "percentage": false, 676 | "pointradius": 5, 677 | "points": false, 678 | "renderer": "flot", 679 | "seriesOverrides": [ 680 | { 681 | "alias": "duration-p99", 682 | "yaxis": 2 683 | } 684 | ], 685 | "spaceLength": 10, 686 | "stack": false, 687 | "steppedLine": false, 688 | "targets": [ 689 | { 690 | "expr": "prometheus_tsdb_head_gc_duration_seconds{job=\"prometheus\",quantile=\"0.99\"}", 691 | "format": "time_series", 692 | "intervalFactor": 2, 693 | "legendFormat": "duration-p99", 694 | "refId": "A", 695 | "step": 20 696 | }, 697 | { 698 | "expr": "irate(prometheus_tsdb_head_gc_duration_seconds_count{job=\"prometheus\"}[5m])", 699 | "format": "time_series", 700 | "intervalFactor": 2, 701 | "legendFormat": "collections", 702 | "refId": "B", 703 | "step": 20 704 | } 705 | ], 706 | "thresholds": [], 707 | "timeFrom": null, 708 | "timeShift": null, 709 | "title": "Head Block GC Activity", 710 | "tooltip": { 711 | "shared": true, 712 | "sort": 0, 713 | "value_type": "individual" 714 | }, 715 | "type": "graph", 716 | "xaxis": { 717 | "buckets": null, 718 | "mode": "time", 719 | "name": null, 720 | "show": true, 721 | "values": [] 722 | }, 723 | "yaxes": [ 724 | { 725 | "format": "short", 726 | "label": null, 727 | "logBase": 1, 728 | "max": null, 729 | "min": "0", 730 | "show": true 731 | }, 732 | { 733 | "format": "s", 734 | "label": null, 735 | 
"logBase": 1, 736 | "max": null, 737 | "min": "0", 738 | "show": true 739 | } 740 | ] 741 | }, 742 | { 743 | "aliasColors": {}, 744 | "bars": false, 745 | "dashLength": 10, 746 | "dashes": false, 747 | "datasource": "Prometheus", 748 | "decimals": null, 749 | "description": "", 750 | "fill": 0, 751 | "gridPos": { 752 | "h": 5, 753 | "w": 8, 754 | "x": 0, 755 | "y": 10 756 | }, 757 | "id": 20, 758 | "legend": { 759 | "avg": false, 760 | "current": false, 761 | "max": false, 762 | "min": false, 763 | "show": true, 764 | "total": false, 765 | "values": false 766 | }, 767 | "lines": true, 768 | "linewidth": 1, 769 | "links": [], 770 | "nullPointMode": "connected", 771 | "percentage": false, 772 | "pointradius": 5, 773 | "points": false, 774 | "renderer": "flot", 775 | "seriesOverrides": [ 776 | { 777 | "alias": "duration-p99", 778 | "yaxis": 2 779 | } 780 | ], 781 | "spaceLength": 10, 782 | "stack": false, 783 | "steppedLine": false, 784 | "targets": [ 785 | { 786 | "expr": "histogram_quantile(0.99, sum(rate(prometheus_tsdb_compaction_duration_bucket{job=\"prometheus\"}[5m])) by (le))", 787 | "format": "time_series", 788 | "hide": false, 789 | "interval": "", 790 | "intervalFactor": 2, 791 | "legendFormat": "duration-p99", 792 | "refId": "A", 793 | "step": 20 794 | }, 795 | { 796 | "expr": "irate(prometheus_tsdb_compactions_total{job=\"prometheus\"}[5m])", 797 | "format": "time_series", 798 | "intervalFactor": 2, 799 | "legendFormat": "compactions", 800 | "refId": "B", 801 | "step": 20 802 | }, 803 | { 804 | "expr": "irate(prometheus_tsdb_compactions_failed_total{job=\"prometheus\"}[5m])", 805 | "format": "time_series", 806 | "intervalFactor": 2, 807 | "legendFormat": "failed", 808 | "refId": "C", 809 | "step": 20 810 | }, 811 | { 812 | "expr": "irate(prometheus_tsdb_compactions_triggered_total{job=\"prometheus\"}[5m])", 813 | "format": "time_series", 814 | "intervalFactor": 2, 815 | "legendFormat": "triggered", 816 | "refId": "D", 817 | "step": 20 818 | } 819 | ],
820 | "thresholds": [], 821 | "timeFrom": null, 822 | "timeShift": null, 823 | "title": "Compaction Activity", 824 | "tooltip": { 825 | "shared": true, 826 | "sort": 0, 827 | "value_type": "individual" 828 | }, 829 | "type": "graph", 830 | "xaxis": { 831 | "buckets": null, 832 | "mode": "time", 833 | "name": null, 834 | "show": true, 835 | "values": [] 836 | }, 837 | "yaxes": [ 838 | { 839 | "format": "short", 840 | "label": null, 841 | "logBase": 1, 842 | "max": null, 843 | "min": "0", 844 | "show": true 845 | }, 846 | { 847 | "format": "s", 848 | "label": "", 849 | "logBase": 1, 850 | "max": null, 851 | "min": "0", 852 | "show": true 853 | } 854 | ] 855 | }, 856 | { 857 | "aliasColors": {}, 858 | "bars": false, 859 | "dashLength": 10, 860 | "dashes": false, 861 | "datasource": "Prometheus", 862 | "fill": 1, 863 | "gridPos": { 864 | "h": 5, 865 | "w": 8, 866 | "x": 8, 867 | "y": 10 868 | }, 869 | "id": 32, 870 | "legend": { 871 | "avg": false, 872 | "current": false, 873 | "max": false, 874 | "min": false, 875 | "show": true, 876 | "total": false, 877 | "values": false 878 | }, 879 | "lines": true, 880 | "linewidth": 1, 881 | "links": [], 882 | "nullPointMode": "null", 883 | "percentage": false, 884 | "pointradius": 5, 885 | "points": false, 886 | "renderer": "flot", 887 | "seriesOverrides": [], 888 | "spaceLength": 10, 889 | "stack": false, 890 | "steppedLine": false, 891 | "targets": [ 892 | { 893 | "expr": "rate(prometheus_tsdb_reloads_total{job=\"prometheus\"}[5m])", 894 | "format": "time_series", 895 | "intervalFactor": 2, 896 | "legendFormat": "reloads", 897 | "refId": "A", 898 | "step": 20 899 | }, 900 | { 901 | "expr": "rate(prometheus_tsdb_reloads_failures_total{job=\"prometheus\"}[5m])", 902 | "format": "time_series", 903 | "hide": false, 904 | "intervalFactor": 2, 905 | "legendFormat": "failures", 906 | "refId": "B", 907 | "step": 20 908 | } 909 | ], 910 | "thresholds": [], 911 | "timeFrom": null, 912 | "timeShift": null, 913 | "title": "Reload Count", 
914 | "tooltip": { 915 | "shared": true, 916 | "sort": 0, 917 | "value_type": "individual" 918 | }, 919 | "type": "graph", 920 | "xaxis": { 921 | "buckets": null, 922 | "mode": "time", 923 | "name": null, 924 | "show": true, 925 | "values": [] 926 | }, 927 | "yaxes": [ 928 | { 929 | "format": "short", 930 | "label": null, 931 | "logBase": 1, 932 | "max": null, 933 | "min": null, 934 | "show": true 935 | }, 936 | { 937 | "format": "short", 938 | "label": null, 939 | "logBase": 1, 940 | "max": null, 941 | "min": null, 942 | "show": true 943 | } 944 | ] 945 | }, 946 | { 947 | "aliasColors": {}, 948 | "bars": false, 949 | "dashLength": 10, 950 | "dashes": false, 951 | "datasource": "Prometheus", 952 | "fill": 0, 953 | "gridPos": { 954 | "h": 5, 955 | "w": 8, 956 | "x": 16, 957 | "y": 10 958 | }, 959 | "id": 38, 960 | "legend": { 961 | "avg": false, 962 | "current": false, 963 | "max": false, 964 | "min": false, 965 | "show": true, 966 | "total": false, 967 | "values": false 968 | }, 969 | "lines": true, 970 | "linewidth": 1, 971 | "links": [], 972 | "nullPointMode": "null", 973 | "percentage": false, 974 | "pointradius": 5, 975 | "points": false, 976 | "renderer": "flot", 977 | "seriesOverrides": [], 978 | "spaceLength": 10, 979 | "stack": false, 980 | "steppedLine": false, 981 | "targets": [ 982 | { 983 | "expr": "prometheus_engine_query_duration_seconds{job=\"prometheus\", quantile=\"0.99\"}", 984 | "format": "time_series", 985 | "intervalFactor": 2, 986 | "legendFormat": "{{slice}}_p99", 987 | "refId": "A", 988 | "step": 20 989 | } 990 | ], 991 | "thresholds": [], 992 | "timeFrom": null, 993 | "timeShift": null, 994 | "title": "Query Durations", 995 | "tooltip": { 996 | "shared": true, 997 | "sort": 0, 998 | "value_type": "individual" 999 | }, 1000 | "type": "graph", 1001 | "xaxis": { 1002 | "buckets": null, 1003 | "mode": "time", 1004 | "name": null, 1005 | "show": true, 1006 | "values": [] 1007 | }, 1008 | "yaxes": [ 1009 | { 1010 | "format": "short", 1011 | 
"label": null, 1012 | "logBase": 1, 1013 | "max": null, 1014 | "min": null, 1015 | "show": true 1016 | }, 1017 | { 1018 | "format": "short", 1019 | "label": null, 1020 | "logBase": 1, 1021 | "max": null, 1022 | "min": null, 1023 | "show": true 1024 | } 1025 | ] 1026 | }, 1027 | { 1028 | "aliasColors": {}, 1029 | "bars": false, 1030 | "dashLength": 10, 1031 | "dashes": false, 1032 | "datasource": "Prometheus", 1033 | "decimals": null, 1034 | "editable": true, 1035 | "error": false, 1036 | "fill": 0, 1037 | "grid": {}, 1038 | "gridPos": { 1039 | "h": 7, 1040 | "w": 12, 1041 | "x": 0, 1042 | "y": 15 1043 | }, 1044 | "id": 35, 1045 | "legend": { 1046 | "alignAsTable": false, 1047 | "avg": false, 1048 | "current": false, 1049 | "hideEmpty": true, 1050 | "max": false, 1051 | "min": false, 1052 | "show": true, 1053 | "total": false, 1054 | "values": false 1055 | }, 1056 | "lines": true, 1057 | "linewidth": 1, 1058 | "links": [], 1059 | "nullPointMode": "connected", 1060 | "percentage": false, 1061 | "pointradius": 5, 1062 | "points": false, 1063 | "renderer": "flot", 1064 | "seriesOverrides": [], 1065 | "spaceLength": 10, 1066 | "stack": false, 1067 | "steppedLine": false, 1068 | "targets": [ 1069 | { 1070 | "expr": "max(prometheus_rule_group_duration_seconds{job=\"prometheus\"}) by (quantile)", 1071 | "format": "time_series", 1072 | "interval": "", 1073 | "intervalFactor": 2, 1074 | "legendFormat": "{{quantile}}", 1075 | "refId": "A", 1076 | "step": 10 1077 | } 1078 | ], 1079 | "thresholds": [], 1080 | "timeFrom": null, 1081 | "timeShift": null, 1082 | "title": "Rule Group Eval Duration", 1083 | "tooltip": { 1084 | "shared": true, 1085 | "sort": 0, 1086 | "value_type": "cumulative" 1087 | }, 1088 | "type": "graph", 1089 | "xaxis": { 1090 | "buckets": null, 1091 | "mode": "time", 1092 | "name": null, 1093 | "show": true, 1094 | "values": [] 1095 | }, 1096 | "yaxes": [ 1097 | { 1098 | "format": "s", 1099 | "label": "", 1100 | "logBase": 1, 1101 | "max": null, 1102 | "min": 
null, 1103 | "show": true 1104 | }, 1105 | { 1106 | "format": "short", 1107 | "logBase": 1, 1108 | "max": null, 1109 | "min": null, 1110 | "show": true 1111 | } 1112 | ] 1113 | }, 1114 | { 1115 | "aliasColors": {}, 1116 | "bars": false, 1117 | "dashLength": 10, 1118 | "dashes": false, 1119 | "datasource": "Prometheus", 1120 | "fill": 1, 1121 | "gridPos": { 1122 | "h": 7, 1123 | "w": 12, 1124 | "x": 12, 1125 | "y": 15 1126 | }, 1127 | "id": 39, 1128 | "legend": { 1129 | "avg": false, 1130 | "current": false, 1131 | "max": false, 1132 | "min": false, 1133 | "show": true, 1134 | "total": false, 1135 | "values": false 1136 | }, 1137 | "lines": true, 1138 | "linewidth": 1, 1139 | "links": [], 1140 | "nullPointMode": "null", 1141 | "percentage": false, 1142 | "pointradius": 5, 1143 | "points": false, 1144 | "renderer": "flot", 1145 | "seriesOverrides": [], 1146 | "spaceLength": 10, 1147 | "stack": true, 1148 | "steppedLine": false, 1149 | "targets": [ 1150 | { 1151 | "expr": "rate(prometheus_rule_group_iterations_missed_total{job=\"prometheus\"}[5m])", 1152 | "format": "time_series", 1153 | "intervalFactor": 2, 1154 | "legendFormat": "missed", 1155 | "refId": "B", 1156 | "step": 10 1157 | }, 1158 | { 1159 | "expr": "rate(prometheus_rule_group_iterations_total{job=\"prometheus\"}[5m])", 1160 | "format": "time_series", 1161 | "intervalFactor": 2, 1162 | "legendFormat": "iterations", 1163 | "refId": "A", 1164 | "step": 10 1165 | } 1166 | ], 1167 | "thresholds": [], 1168 | "timeFrom": null, 1169 | "timeShift": null, 1170 | "title": "Rule Group Eval Activity", 1171 | "tooltip": { 1172 | "shared": true, 1173 | "sort": 0, 1174 | "value_type": "individual" 1175 | }, 1176 | "type": "graph", 1177 | "xaxis": { 1178 | "buckets": null, 1179 | "mode": "time", 1180 | "name": null, 1181 | "show": true, 1182 | "values": [] 1183 | }, 1184 | "yaxes": [ 1185 | { 1186 | "format": "short", 1187 | "label": null, 1188 | "logBase": 1, 1189 | "max": null, 1190 | "min": null, 1191 | "show": true 
1192 | }, 1193 | { 1194 | "format": "short", 1195 | "label": null, 1196 | "logBase": 1, 1197 | "max": null, 1198 | "min": null, 1199 | "show": true 1200 | } 1201 | ] 1202 | } 1203 | ], 1204 | "refresh": "1m", 1205 | "revision": "1.0", 1206 | "schemaVersion": 16, 1207 | "style": "dark", 1208 | "tags": [ 1209 | "prometheus" 1210 | ], 1211 | "templating": { 1212 | "list": [] 1213 | }, 1214 | "time": { 1215 | "from": "now-1h", 1216 | "to": "now" 1217 | }, 1218 | "timepicker": { 1219 | "now": true, 1220 | "refresh_intervals": [ 1221 | "5s", 1222 | "10s", 1223 | "30s", 1224 | "1m", 1225 | "5m", 1226 | "15m", 1227 | "30m", 1228 | "1h", 1229 | "2h", 1230 | "1d" 1231 | ], 1232 | "time_options": [ 1233 | "5m", 1234 | "15m", 1235 | "1h", 1236 | "6h", 1237 | "12h", 1238 | "24h", 1239 | "2d", 1240 | "7d", 1241 | "30d" 1242 | ] 1243 | }, 1244 | "timezone": "browser", 1245 | "title": "Prometheus 2.0 Stats", 1246 | "uid": "mGFfYSRiz", 1247 | "version": 1 1248 | } 1249 | -------------------------------------------------------------------------------- /grafana/dashboards/swarmprom-services-dash.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "$$hashKey": "object:429", 6 | "builtIn": 1, 7 | "datasource": "-- Grafana --", 8 | "enable": true, 9 | "hide": true, 10 | "iconColor": "rgba(0, 211, 255, 1)", 11 | "name": "Annotations & Alerts", 12 | "type": "dashboard" 13 | } 14 | ] 15 | }, 16 | "description": "Docker Swarm stacks and services metrics", 17 | "editable": true, 18 | "gnetId": null, 19 | "graphTooltip": 0, 20 | "iteration": 1520585594614, 21 | "links": [], 22 | "panels": [ 23 | { 24 | "cacheTimeout": null, 25 | "colorBackground": false, 26 | "colorValue": false, 27 | "colors": [ 28 | "rgba(245, 54, 54, 0.9)", 29 | "rgba(237, 129, 40, 0.89)", 30 | "rgba(50, 172, 45, 0.97)" 31 | ], 32 | "datasource": null, 33 | "decimals": 0, 34 | "format": "none", 35 | "gauge": { 36 | "maxValue": 100, 37 | 
"minValue": 0, 38 | "show": false, 39 | "thresholdLabels": false, 40 | "thresholdMarkers": true 41 | }, 42 | "gridPos": { 43 | "h": 4, 44 | "w": 6, 45 | "x": 0, 46 | "y": 0 47 | }, 48 | "hideTimeOverride": true, 49 | "id": 1, 50 | "interval": null, 51 | "links": [], 52 | "mappingType": 1, 53 | "mappingTypes": [ 54 | { 55 | "name": "value to text", 56 | "value": 1 57 | }, 58 | { 59 | "name": "range to text", 60 | "value": 2 61 | } 62 | ], 63 | "maxDataPoints": 100, 64 | "nullPointMode": "connected", 65 | "nullText": null, 66 | "postfix": "", 67 | "postfixFontSize": "50%", 68 | "prefix": "", 69 | "prefixFontSize": "50%", 70 | "rangeMaps": [ 71 | { 72 | "from": "null", 73 | "text": "N/A", 74 | "to": "null" 75 | } 76 | ], 77 | "sparkline": { 78 | "fillColor": "rgba(31, 118, 189, 0.18)", 79 | "full": false, 80 | "lineColor": "rgb(31, 120, 193)", 81 | "show": false 82 | }, 83 | "tableColumn": "", 84 | "targets": [ 85 | { 86 | "expr": "count(count(container_tasks_state{container_label_com_docker_swarm_node_id =~\"$node_id\"}) by (container_label_com_docker_swarm_node_id))", 87 | "format": "time_series", 88 | "intervalFactor": 2, 89 | "legendFormat": "", 90 | "refId": "A", 91 | "step": 2 92 | } 93 | ], 94 | "thresholds": "", 95 | "timeFrom": "1m", 96 | "title": "Nodes", 97 | "type": "singlestat", 98 | "valueFontSize": "80%", 99 | "valueMaps": [ 100 | { 101 | "op": "=", 102 | "text": "N/A", 103 | "value": "null" 104 | } 105 | ], 106 | "valueName": "avg" 107 | }, 108 | { 109 | "cacheTimeout": null, 110 | "colorBackground": false, 111 | "colorValue": false, 112 | "colors": [ 113 | "rgba(245, 54, 54, 0.9)", 114 | "rgba(237, 129, 40, 0.89)", 115 | "rgba(50, 172, 45, 0.97)" 116 | ], 117 | "datasource": null, 118 | "decimals": 0, 119 | "format": "none", 120 | "gauge": { 121 | "maxValue": 100, 122 | "minValue": 0, 123 | "show": false, 124 | "thresholdLabels": false, 125 | "thresholdMarkers": true 126 | }, 127 | "gridPos": { 128 | "h": 4, 129 | "w": 6, 130 | "x": 6, 131 | "y": 0 
132 | }, 133 | "hideTimeOverride": true, 134 | "id": 21, 135 | "interval": null, 136 | "links": [], 137 | "mappingType": 1, 138 | "mappingTypes": [ 139 | { 140 | "name": "value to text", 141 | "value": 1 142 | }, 143 | { 144 | "name": "range to text", 145 | "value": 2 146 | } 147 | ], 148 | "maxDataPoints": 100, 149 | "nullPointMode": "connected", 150 | "nullText": null, 151 | "postfix": "", 152 | "postfixFontSize": "50%", 153 | "prefix": "", 154 | "prefixFontSize": "50%", 155 | "rangeMaps": [ 156 | { 157 | "from": "null", 158 | "text": "N/A", 159 | "to": "null" 160 | } 161 | ], 162 | "sparkline": { 163 | "fillColor": "rgba(31, 118, 189, 0.18)", 164 | "full": false, 165 | "lineColor": "rgb(31, 120, 193)", 166 | "show": false 167 | }, 168 | "tableColumn": "", 169 | "targets": [ 170 | { 171 | "expr": "count(count(container_tasks_state{container_label_com_docker_stack_namespace=~\".+\", container_label_com_docker_swarm_node_id=~\"$node_id\"}) by (container_label_com_docker_stack_namespace))", 172 | "format": "time_series", 173 | "intervalFactor": 2, 174 | "legendFormat": "", 175 | "refId": "A", 176 | "step": 2 177 | } 178 | ], 179 | "thresholds": "", 180 | "timeFrom": "1m", 181 | "title": "Stacks", 182 | "type": "singlestat", 183 | "valueFontSize": "80%", 184 | "valueMaps": [ 185 | { 186 | "op": "=", 187 | "text": "N/A", 188 | "value": "null" 189 | } 190 | ], 191 | "valueName": "avg" 192 | }, 193 | { 194 | "cacheTimeout": null, 195 | "colorBackground": false, 196 | "colorValue": false, 197 | "colors": [ 198 | "rgba(245, 54, 54, 0.9)", 199 | "rgba(237, 129, 40, 0.89)", 200 | "rgba(50, 172, 45, 0.97)" 201 | ], 202 | "datasource": null, 203 | "decimals": 0, 204 | "format": "none", 205 | "gauge": { 206 | "maxValue": 100, 207 | "minValue": 0, 208 | "show": false, 209 | "thresholdLabels": false, 210 | "thresholdMarkers": true 211 | }, 212 | "gridPos": { 213 | "h": 4, 214 | "w": 6, 215 | "x": 12, 216 | "y": 0 217 | }, 218 | "hideTimeOverride": true, 219 | "id": 20, 220 | 
"interval": null, 221 | "links": [], 222 | "mappingType": 1, 223 | "mappingTypes": [ 224 | { 225 | "name": "value to text", 226 | "value": 1 227 | }, 228 | { 229 | "name": "range to text", 230 | "value": 2 231 | } 232 | ], 233 | "maxDataPoints": 100, 234 | "nullPointMode": "connected", 235 | "nullText": null, 236 | "postfix": "", 237 | "postfixFontSize": "50%", 238 | "prefix": "", 239 | "prefixFontSize": "50%", 240 | "rangeMaps": [ 241 | { 242 | "from": "null", 243 | "text": "N/A", 244 | "to": "null" 245 | } 246 | ], 247 | "sparkline": { 248 | "fillColor": "rgba(31, 118, 189, 0.18)", 249 | "full": false, 250 | "lineColor": "rgb(31, 120, 193)", 251 | "show": false 252 | }, 253 | "tableColumn": "", 254 | "targets": [ 255 | { 256 | "expr": "count(count(container_tasks_state{container_label_com_docker_swarm_service_name=~\".+\", container_label_com_docker_swarm_node_id=~\"$node_id\"}) by (container_label_com_docker_swarm_service_name))", 257 | "format": "time_series", 258 | "intervalFactor": 2, 259 | "refId": "A", 260 | "step": 2 261 | } 262 | ], 263 | "thresholds": "", 264 | "timeFrom": "1m", 265 | "timeShift": null, 266 | "title": "Services", 267 | "type": "singlestat", 268 | "valueFontSize": "80%", 269 | "valueMaps": [ 270 | { 271 | "op": "=", 272 | "text": "N/A", 273 | "value": "null" 274 | } 275 | ], 276 | "valueName": "avg" 277 | }, 278 | { 279 | "cacheTimeout": null, 280 | "colorBackground": false, 281 | "colorValue": false, 282 | "colors": [ 283 | "rgba(245, 54, 54, 0.9)", 284 | "rgba(237, 129, 40, 0.89)", 285 | "rgba(50, 172, 45, 0.97)" 286 | ], 287 | "datasource": null, 288 | "decimals": 0, 289 | "format": "none", 290 | "gauge": { 291 | "maxValue": 100, 292 | "minValue": 0, 293 | "show": false, 294 | "thresholdLabels": false, 295 | "thresholdMarkers": true 296 | }, 297 | "gridPos": { 298 | "h": 4, 299 | "w": 6, 300 | "x": 18, 301 | "y": 0 302 | }, 303 | "hideTimeOverride": true, 304 | "id": 7, 305 | "interval": null, 306 | "links": [], 307 | "mappingType": 1, 
308 | "mappingTypes": [ 309 | { 310 | "name": "value to text", 311 | "value": 1 312 | }, 313 | { 314 | "name": "range to text", 315 | "value": 2 316 | } 317 | ], 318 | "maxDataPoints": 100, 319 | "nullPointMode": "connected", 320 | "nullText": null, 321 | "postfix": "", 322 | "postfixFontSize": "50%", 323 | "prefix": "", 324 | "prefixFontSize": "50%", 325 | "rangeMaps": [ 326 | { 327 | "from": "null", 328 | "text": "N/A", 329 | "to": "null" 330 | } 331 | ], 332 | "sparkline": { 333 | "fillColor": "rgba(31, 118, 189, 0.18)", 334 | "full": false, 335 | "lineColor": "rgb(31, 120, 193)", 336 | "show": false 337 | }, 338 | "tableColumn": "", 339 | "targets": [ 340 | { 341 | "expr": "count(rate(container_last_seen{container_label_com_docker_swarm_node_id=~\"$node_id\"}[5m])) ", 342 | "format": "time_series", 343 | "intervalFactor": 2, 344 | "refId": "A", 345 | "step": 2 346 | } 347 | ], 348 | "thresholds": "", 349 | "timeFrom": "1m", 350 | "title": "Containers", 351 | "type": "singlestat", 352 | "valueFontSize": "80%", 353 | "valueMaps": [ 354 | { 355 | "op": "=", 356 | "text": "N/A", 357 | "value": "null" 358 | } 359 | ], 360 | "valueName": "avg" 361 | }, 362 | { 363 | "aliasColors": {}, 364 | "bars": true, 365 | "dashLength": 10, 366 | "dashes": false, 367 | "datasource": null, 368 | "decimals": 0, 369 | "fill": 5, 370 | "gridPos": { 371 | "h": 7, 372 | "w": 12, 373 | "x": 0, 374 | "y": 4 375 | }, 376 | "id": 12, 377 | "legend": { 378 | "alignAsTable": true, 379 | "avg": false, 380 | "current": true, 381 | "hideEmpty": true, 382 | "hideZero": true, 383 | "max": false, 384 | "min": false, 385 | "rightSide": true, 386 | "show": true, 387 | "sort": "current", 388 | "sortDesc": true, 389 | "total": false, 390 | "values": true 391 | }, 392 | "lines": false, 393 | "linewidth": 1, 394 | "links": [], 395 | "nullPointMode": "null", 396 | "percentage": false, 397 | "pointradius": 5, 398 | "points": false, 399 | "renderer": "flot", 400 | "seriesOverrides": [], 401 | 
"spaceLength": 10, 402 | "stack": true, 403 | "steppedLine": false, 404 | "targets": [ 405 | { 406 | "expr": "sum(rate(container_last_seen{container_label_com_docker_swarm_node_id=~\"$node_id\"}[5m])) by (container_label_com_docker_swarm_service_name)", 407 | "format": "time_series", 408 | "intervalFactor": 10, 409 | "legendFormat": "{{ container_label_com_docker_swarm_service_name }}", 410 | "refId": "A", 411 | "step": 10 412 | } 413 | ], 414 | "thresholds": [], 415 | "timeFrom": null, 416 | "timeShift": null, 417 | "title": "Service Tasks", 418 | "tooltip": { 419 | "shared": true, 420 | "sort": 2, 421 | "value_type": "individual" 422 | }, 423 | "type": "graph", 424 | "xaxis": { 425 | "buckets": null, 426 | "mode": "time", 427 | "name": null, 428 | "show": true, 429 | "values": [] 430 | }, 431 | "yaxes": [ 432 | { 433 | "format": "short", 434 | "label": null, 435 | "logBase": 1, 436 | "max": null, 437 | "min": null, 438 | "show": true 439 | }, 440 | { 441 | "format": "short", 442 | "label": null, 443 | "logBase": 1, 444 | "max": null, 445 | "min": null, 446 | "show": true 447 | } 448 | ] 449 | }, 450 | { 451 | "aliasColors": {}, 452 | "bars": false, 453 | "dashLength": 10, 454 | "dashes": false, 455 | "datasource": null, 456 | "decimals": 0, 457 | "fill": 1, 458 | "gridPos": { 459 | "h": 7, 460 | "w": 12, 461 | "x": 12, 462 | "y": 4 463 | }, 464 | "id": 32, 465 | "legend": { 466 | "alignAsTable": true, 467 | "avg": false, 468 | "current": true, 469 | "hideEmpty": true, 470 | "hideZero": true, 471 | "max": false, 472 | "min": false, 473 | "rightSide": true, 474 | "show": false, 475 | "sort": "current", 476 | "sortDesc": true, 477 | "total": false, 478 | "values": true 479 | }, 480 | "lines": true, 481 | "linewidth": 1, 482 | "links": [], 483 | "nullPointMode": "null", 484 | "percentage": false, 485 | "pointradius": 5, 486 | "points": false, 487 | "renderer": "flot", 488 | "seriesOverrides": [], 489 | "spaceLength": 10, 490 | "stack": false, 491 | "steppedLine": 
false, 492 | "targets": [ 493 | { 494 | "expr": "sum(increase(engine_daemon_health_checks_total[$interval]) * on(instance) group_left(node_id) swarm_node_info{node_id=~\"$node_id\"}) ", 495 | "format": "time_series", 496 | "intervalFactor": 10, 497 | "legendFormat": "checks", 498 | "refId": "A", 499 | "step": 10 500 | }, 501 | { 502 | "expr": "sum(increase(engine_daemon_health_checks_failed_total[$interval]) * on(instance) group_left(node_id) swarm_node_info{node_id=~\"$node_id\"}) ", 503 | "format": "time_series", 504 | "intervalFactor": 10, 505 | "legendFormat": "failed", 506 | "refId": "B", 507 | "step": 10 508 | } 509 | ], 510 | "thresholds": [], 511 | "timeFrom": null, 512 | "timeShift": null, 513 | "title": "Health Checks", 514 | "tooltip": { 515 | "shared": true, 516 | "sort": 2, 517 | "value_type": "individual" 518 | }, 519 | "type": "graph", 520 | "xaxis": { 521 | "buckets": null, 522 | "mode": "time", 523 | "name": null, 524 | "show": true, 525 | "values": [] 526 | }, 527 | "yaxes": [ 528 | { 529 | "format": "short", 530 | "label": null, 531 | "logBase": 1, 532 | "max": null, 533 | "min": null, 534 | "show": true 535 | }, 536 | { 537 | "format": "short", 538 | "label": null, 539 | "logBase": 1, 540 | "max": null, 541 | "min": null, 542 | "show": true 543 | } 544 | ] 545 | }, 546 | { 547 | "aliasColors": {}, 548 | "bars": false, 549 | "dashLength": 10, 550 | "dashes": false, 551 | "datasource": null, 552 | "decimals": 2, 553 | "fill": 1, 554 | "gridPos": { 555 | "h": 7, 556 | "w": 20, 557 | "x": 0, 558 | "y": 11 559 | }, 560 | "id": 22, 561 | "legend": { 562 | "alignAsTable": true, 563 | "avg": true, 564 | "current": false, 565 | "hideEmpty": true, 566 | "hideZero": true, 567 | "max": true, 568 | "min": true, 569 | "rightSide": true, 570 | "show": true, 571 | "sort": "avg", 572 | "sortDesc": true, 573 | "total": false, 574 | "values": true 575 | }, 576 | "lines": true, 577 | "linewidth": 1, 578 | "links": [], 579 | "nullPointMode": "null", 580 | 
"percentage": false, 581 | "pointradius": 5, 582 | "points": false, 583 | "renderer": "flot", 584 | "seriesOverrides": [], 585 | "spaceLength": 10, 586 | "stack": true, 587 | "steppedLine": false, 588 | "targets": [ 589 | { 590 | "expr": "sum(irate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_node_id=~\"$node_id\", id=~\"/docker/.*\"}[1m])) by (container_label_com_docker_swarm_service_name) * 100 ", 591 | "format": "time_series", 592 | "intervalFactor": 2, 593 | "legendFormat": "{{container_label_com_docker_swarm_service_name}}", 594 | "refId": "A", 595 | "step": 2 596 | } 597 | ], 598 | "thresholds": [], 599 | "timeFrom": null, 600 | "timeShift": null, 601 | "title": "CPU usage by Service", 602 | "tooltip": { 603 | "shared": true, 604 | "sort": 2, 605 | "value_type": "individual" 606 | }, 607 | "type": "graph", 608 | "xaxis": { 609 | "buckets": null, 610 | "mode": "time", 611 | "name": null, 612 | "show": true, 613 | "values": [] 614 | }, 615 | "yaxes": [ 616 | { 617 | "format": "percent", 618 | "label": null, 619 | "logBase": 1, 620 | "max": null, 621 | "min": null, 622 | "show": true 623 | }, 624 | { 625 | "format": "short", 626 | "label": null, 627 | "logBase": 1, 628 | "max": null, 629 | "min": null, 630 | "show": false 631 | } 632 | ] 633 | }, 634 | { 635 | "cacheTimeout": null, 636 | "colorBackground": false, 637 | "colorValue": false, 638 | "colors": [ 639 | "rgba(245, 54, 54, 0.9)", 640 | "rgba(237, 129, 40, 0.89)", 641 | "rgba(50, 172, 45, 0.97)" 642 | ], 643 | "datasource": null, 644 | "decimals": null, 645 | "format": "percent", 646 | "gauge": { 647 | "maxValue": 100, 648 | "minValue": 0, 649 | "show": true, 650 | "thresholdLabels": false, 651 | "thresholdMarkers": true 652 | }, 653 | "gridPos": { 654 | "h": 7, 655 | "w": 4, 656 | "x": 20, 657 | "y": 11 658 | }, 659 | "hideTimeOverride": true, 660 | "id": 11, 661 | "interval": null, 662 | "links": [], 663 | "mappingType": 1, 664 | "mappingTypes": [ 665 | { 666 | "name": "value to 
text", 667 | "value": 1 668 | }, 669 | { 670 | "name": "range to text", 671 | "value": 2 672 | } 673 | ], 674 | "maxDataPoints": 100, 675 | "nullPointMode": "connected", 676 | "nullText": null, 677 | "postfix": "", 678 | "postfixFontSize": "50%", 679 | "prefix": "", 680 | "prefixFontSize": "50%", 681 | "rangeMaps": [ 682 | { 683 | "from": "null", 684 | "text": "N/A", 685 | "to": "null" 686 | } 687 | ], 688 | "sparkline": { 689 | "fillColor": "rgba(31, 118, 189, 0.18)", 690 | "full": false, 691 | "lineColor": "rgb(31, 120, 193)", 692 | "show": false 693 | }, 694 | "tableColumn": "", 695 | "targets": [ 696 | { 697 | "expr": "sum(irate(node_cpu_seconds_total{mode=\"idle\"}[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) * 100 / count(node_cpu_seconds_total{mode=\"user\"} * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) ", 698 | "format": "time_series", 699 | "intervalFactor": 2, 700 | "legendFormat": "", 701 | "refId": "A", 702 | "step": 2 703 | } 704 | ], 705 | "thresholds": "10,25,100", 706 | "timeFrom": "1m", 707 | "timeShift": null, 708 | "title": "CPU Idle", 709 | "type": "singlestat", 710 | "valueFontSize": "80%", 711 | "valueMaps": [ 712 | { 713 | "op": "=", 714 | "text": "N/A", 715 | "value": "null" 716 | } 717 | ], 718 | "valueName": "avg" 719 | }, 720 | { 721 | "aliasColors": {}, 722 | "bars": false, 723 | "dashLength": 10, 724 | "dashes": false, 725 | "datasource": null, 726 | "decimals": 2, 727 | "fill": 1, 728 | "gridPos": { 729 | "h": 7, 730 | "w": 24, 731 | "x": 0, 732 | "y": 18 733 | }, 734 | "id": 33, 735 | "legend": { 736 | "alignAsTable": true, 737 | "avg": true, 738 | "current": false, 739 | "hideEmpty": true, 740 | "hideZero": true, 741 | "max": false, 742 | "min": false, 743 | "rightSide": true, 744 | "show": true, 745 | "sort": "avg", 746 | "sortDesc": true, 747 | "total": false, 748 | "values": true 749 | }, 750 | "lines": true, 751 | "linewidth": 1, 752 | "links": [], 753 | 
"nullPointMode": "null as zero", 754 | "percentage": false, 755 | "pointradius": 5, 756 | "points": false, 757 | "renderer": "flot", 758 | "seriesOverrides": [], 759 | "spaceLength": 10, 760 | "stack": false, 761 | "steppedLine": false, 762 | "targets": [ 763 | { 764 | "expr": "topk(10, sum(irate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_node_id=~\"$node_id\", id=~\"/docker/.*\"}[$interval])) by (name)) * 100 ", 765 | "format": "time_series", 766 | "intervalFactor": 2, 767 | "legendFormat": "{{name}}", 768 | "refId": "A", 769 | "step": 2 770 | } 771 | ], 772 | "thresholds": [], 773 | "timeFrom": null, 774 | "timeShift": null, 775 | "title": "CPU usage by Container (top 10)", 776 | "tooltip": { 777 | "shared": true, 778 | "sort": 2, 779 | "value_type": "individual" 780 | }, 781 | "type": "graph", 782 | "xaxis": { 783 | "buckets": null, 784 | "mode": "time", 785 | "name": null, 786 | "show": true, 787 | "values": [] 788 | }, 789 | "yaxes": [ 790 | { 791 | "format": "percent", 792 | "label": null, 793 | "logBase": 1, 794 | "max": null, 795 | "min": null, 796 | "show": true 797 | }, 798 | { 799 | "format": "short", 800 | "label": null, 801 | "logBase": 1, 802 | "max": null, 803 | "min": null, 804 | "show": false 805 | } 806 | ] 807 | }, 808 | { 809 | "aliasColors": {}, 810 | "bars": false, 811 | "dashLength": 10, 812 | "dashes": false, 813 | "datasource": null, 814 | "fill": 1, 815 | "gridPos": { 816 | "h": 7, 817 | "w": 20, 818 | "x": 0, 819 | "y": 25 820 | }, 821 | "id": 24, 822 | "legend": { 823 | "alignAsTable": true, 824 | "avg": true, 825 | "current": false, 826 | "max": true, 827 | "min": true, 828 | "rightSide": true, 829 | "show": true, 830 | "sort": "avg", 831 | "sortDesc": true, 832 | "total": false, 833 | "values": true 834 | }, 835 | "lines": true, 836 | "linewidth": 1, 837 | "links": [], 838 | "nullPointMode": "null", 839 | "percentage": false, 840 | "pointradius": 5, 841 | "points": false, 842 | "renderer": "flot", 843 | 
"seriesOverrides": [], 844 | "spaceLength": 10, 845 | "stack": false, 846 | "steppedLine": false, 847 | "targets": [ 848 | { 849 | "expr": "sum(container_memory_usage_bytes{container_label_com_docker_swarm_node_id=~\"$node_id\", id=~\"/docker/.*\"}) by (container_label_com_docker_swarm_service_name) ", 850 | "format": "time_series", 851 | "intervalFactor": 2, 852 | "legendFormat": "Used {{container_label_com_docker_swarm_service_name}}", 853 | "refId": "A", 854 | "step": 2 855 | }, 856 | { 857 | "expr": "sum(container_memory_cache{container_label_com_docker_swarm_node_id=~\"$node_id\", id=~\"/docker/.*\"}) by (container_label_com_docker_swarm_service_name) ", 858 | "format": "time_series", 859 | "intervalFactor": 2, 860 | "legendFormat": "Cached {{container_label_com_docker_swarm_service_name}}", 861 | "refId": "B", 862 | "step": 2 863 | } 864 | ], 865 | "thresholds": [], 866 | "timeFrom": null, 867 | "timeShift": null, 868 | "title": "Memory usage by Service", 869 | "tooltip": { 870 | "shared": true, 871 | "sort": 0, 872 | "value_type": "individual" 873 | }, 874 | "type": "graph", 875 | "xaxis": { 876 | "buckets": null, 877 | "mode": "time", 878 | "name": null, 879 | "show": true, 880 | "values": [] 881 | }, 882 | "yaxes": [ 883 | { 884 | "format": "decbytes", 885 | "label": null, 886 | "logBase": 1, 887 | "max": null, 888 | "min": null, 889 | "show": true 890 | }, 891 | { 892 | "format": "short", 893 | "label": null, 894 | "logBase": 1, 895 | "max": null, 896 | "min": null, 897 | "show": true 898 | } 899 | ] 900 | }, 901 | { 902 | "cacheTimeout": null, 903 | "colorBackground": false, 904 | "colorValue": false, 905 | "colors": [ 906 | "rgba(245, 54, 54, 0.9)", 907 | "rgba(237, 129, 40, 0.89)", 908 | "rgba(50, 172, 45, 0.97)" 909 | ], 910 | "datasource": null, 911 | "format": "percent", 912 | "gauge": { 913 | "maxValue": 100, 914 | "minValue": 0, 915 | "show": true, 916 | "thresholdLabels": false, 917 | "thresholdMarkers": true 918 | }, 919 | "gridPos": { 920 | 
"h": 7, 921 | "w": 4, 922 | "x": 20, 923 | "y": 25 924 | }, 925 | "id": 8, 926 | "interval": null, 927 | "links": [], 928 | "mappingType": 1, 929 | "mappingTypes": [ 930 | { 931 | "name": "value to text", 932 | "value": 1 933 | }, 934 | { 935 | "name": "range to text", 936 | "value": 2 937 | } 938 | ], 939 | "maxDataPoints": 100, 940 | "nullPointMode": "connected", 941 | "nullText": null, 942 | "postfix": "", 943 | "postfixFontSize": "50%", 944 | "prefix": "", 945 | "prefixFontSize": "50%", 946 | "rangeMaps": [ 947 | { 948 | "from": "null", 949 | "text": "N/A", 950 | "to": "null" 951 | } 952 | ], 953 | "sparkline": { 954 | "fillColor": "rgba(31, 118, 189, 0.18)", 955 | "full": false, 956 | "lineColor": "rgb(31, 120, 193)", 957 | "show": false 958 | }, 959 | "tableColumn": "", 960 | "targets": [ 961 | { 962 | "expr": "sum((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 963 | "format": "time_series", 964 | "intervalFactor": 2, 965 | "legendFormat": "", 966 | "refId": "A", 967 | "step": 20 968 | } 969 | ], 970 | "thresholds": "10,25,100", 971 | "title": "Available Memory", 972 | "type": "singlestat", 973 | "valueFontSize": "80%", 974 | "valueMaps": [ 975 | { 976 | "op": "=", 977 | "text": "N/A", 978 | "value": "null" 979 | } 980 | ], 981 | "valueName": "avg" 982 | }, 983 | { 984 | "aliasColors": {}, 985 | "bars": false, 986 | "dashLength": 10, 987 | "dashes": false, 988 | "datasource": null, 989 | "fill": 1, 990 | "gridPos": { 991 | "h": 7, 992 | "w": 24, 993 | "x": 0, 994 | "y": 32 995 | }, 996 | "id": 34, 997 | "legend": { 998 | "alignAsTable": true, 999 | "avg": true, 1000 | "current": false, 1001 | "hideEmpty": false, 1002 | "hideZero": false, 1003 | "max": false, 1004 | "min": false, 1005 | "rightSide": true, 1006 | "show": true, 1007 | "sort": "avg", 1008 | "sortDesc": true, 1009 | 
"total": false, 1010 | "values": true 1011 | }, 1012 | "lines": true, 1013 | "linewidth": 1, 1014 | "links": [], 1015 | "nullPointMode": "null", 1016 | "percentage": false, 1017 | "pointradius": 5, 1018 | "points": false, 1019 | "renderer": "flot", 1020 | "seriesOverrides": [], 1021 | "spaceLength": 10, 1022 | "stack": false, 1023 | "steppedLine": false, 1024 | "targets": [ 1025 | { 1026 | "expr": "topk(10, avg_over_time(container_memory_usage_bytes{container_label_com_docker_swarm_node_id=~\"$node_id\", id=~\"/docker/.*\"}[$interval]))", 1027 | "format": "time_series", 1028 | "intervalFactor": 2, 1029 | "legendFormat": "{{name}}", 1030 | "refId": "A", 1031 | "step": 2 1032 | } 1033 | ], 1034 | "thresholds": [], 1035 | "timeFrom": null, 1036 | "timeShift": null, 1037 | "title": "Memory usage by Container (top 10)", 1038 | "tooltip": { 1039 | "shared": true, 1040 | "sort": 2, 1041 | "value_type": "individual" 1042 | }, 1043 | "type": "graph", 1044 | "xaxis": { 1045 | "buckets": null, 1046 | "mode": "time", 1047 | "name": null, 1048 | "show": true, 1049 | "values": [] 1050 | }, 1051 | "yaxes": [ 1052 | { 1053 | "format": "decbytes", 1054 | "label": null, 1055 | "logBase": 1, 1056 | "max": null, 1057 | "min": null, 1058 | "show": true 1059 | }, 1060 | { 1061 | "format": "short", 1062 | "label": null, 1063 | "logBase": 1, 1064 | "max": null, 1065 | "min": null, 1066 | "show": false 1067 | } 1068 | ] 1069 | }, 1070 | { 1071 | "aliasColors": {}, 1072 | "bars": false, 1073 | "dashLength": 10, 1074 | "dashes": false, 1075 | "datasource": null, 1076 | "fill": 1, 1077 | "gridPos": { 1078 | "h": 7, 1079 | "w": 24, 1080 | "x": 0, 1081 | "y": 39 1082 | }, 1083 | "id": 17, 1084 | "legend": { 1085 | "alignAsTable": true, 1086 | "avg": true, 1087 | "current": false, 1088 | "max": true, 1089 | "min": true, 1090 | "rightSide": true, 1091 | "show": true, 1092 | "sort": "avg", 1093 | "sortDesc": true, 1094 | "total": false, 1095 | "values": true 1096 | }, 1097 | "lines": true, 1098 | 
"linewidth": 1, 1099 | "links": [], 1100 | "nullPointMode": "null", 1101 | "percentage": false, 1102 | "pointradius": 5, 1103 | "points": false, 1104 | "renderer": "flot", 1105 | "seriesOverrides": [], 1106 | "spaceLength": 10, 1107 | "stack": false, 1108 | "steppedLine": false, 1109 | "targets": [ 1110 | { 1111 | "expr": "sum(rate(container_network_receive_bytes_total{container_label_com_docker_swarm_node_id=~\"$node_id\"}[$interval])) by (container_label_com_docker_swarm_service_name)", 1112 | "format": "time_series", 1113 | "intervalFactor": 2, 1114 | "legendFormat": "{{container_label_com_docker_swarm_service_name}}", 1115 | "refId": "A", 1116 | "step": 2 1117 | } 1118 | ], 1119 | "thresholds": [], 1120 | "timeFrom": null, 1121 | "timeShift": null, 1122 | "title": "Network received by Service", 1123 | "tooltip": { 1124 | "shared": true, 1125 | "sort": 0, 1126 | "value_type": "individual" 1127 | }, 1128 | "type": "graph", 1129 | "xaxis": { 1130 | "buckets": null, 1131 | "mode": "time", 1132 | "name": null, 1133 | "show": true, 1134 | "values": [] 1135 | }, 1136 | "yaxes": [ 1137 | { 1138 | "format": "Bps", 1139 | "label": null, 1140 | "logBase": 1, 1141 | "max": null, 1142 | "min": null, 1143 | "show": true 1144 | }, 1145 | { 1146 | "format": "short", 1147 | "label": null, 1148 | "logBase": 1, 1149 | "max": null, 1150 | "min": null, 1151 | "show": true 1152 | } 1153 | ] 1154 | }, 1155 | { 1156 | "aliasColors": {}, 1157 | "bars": false, 1158 | "dashLength": 10, 1159 | "dashes": false, 1160 | "datasource": null, 1161 | "fill": 1, 1162 | "gridPos": { 1163 | "h": 7, 1164 | "w": 24, 1165 | "x": 0, 1166 | "y": 46 1167 | }, 1168 | "id": 25, 1169 | "legend": { 1170 | "alignAsTable": true, 1171 | "avg": true, 1172 | "current": false, 1173 | "max": true, 1174 | "min": true, 1175 | "rightSide": true, 1176 | "show": true, 1177 | "sort": "avg", 1178 | "sortDesc": true, 1179 | "total": false, 1180 | "values": true 1181 | }, 1182 | "lines": true, 1183 | "linewidth": 1, 1184 | 
"links": [], 1185 | "nullPointMode": "null", 1186 | "percentage": false, 1187 | "pointradius": 5, 1188 | "points": false, 1189 | "renderer": "flot", 1190 | "seriesOverrides": [], 1191 | "spaceLength": 10, 1192 | "stack": false, 1193 | "steppedLine": false, 1194 | "targets": [ 1195 | { 1196 | "expr": "sum(rate(container_network_transmit_bytes_total{container_label_com_docker_swarm_node_id=~\"$node_id\"}[$interval])) by (container_label_com_docker_swarm_service_name)", 1197 | "format": "time_series", 1198 | "intervalFactor": 2, 1199 | "legendFormat": "{{container_label_com_docker_swarm_service_name}}", 1200 | "metric": "", 1201 | "refId": "B", 1202 | "step": 2 1203 | } 1204 | ], 1205 | "thresholds": [], 1206 | "timeFrom": null, 1207 | "timeShift": null, 1208 | "title": "Network transmitted by Service", 1209 | "tooltip": { 1210 | "shared": true, 1211 | "sort": 0, 1212 | "value_type": "individual" 1213 | }, 1214 | "type": "graph", 1215 | "xaxis": { 1216 | "buckets": null, 1217 | "mode": "time", 1218 | "name": null, 1219 | "show": true, 1220 | "values": [] 1221 | }, 1222 | "yaxes": [ 1223 | { 1224 | "format": "Bps", 1225 | "label": null, 1226 | "logBase": 1, 1227 | "max": null, 1228 | "min": null, 1229 | "show": true 1230 | }, 1231 | { 1232 | "format": "short", 1233 | "label": null, 1234 | "logBase": 1, 1235 | "max": null, 1236 | "min": null, 1237 | "show": true 1238 | } 1239 | ] 1240 | }, 1241 | { 1242 | "aliasColors": {}, 1243 | "bars": false, 1244 | "dashLength": 10, 1245 | "dashes": false, 1246 | "datasource": null, 1247 | "fill": 1, 1248 | "gridPos": { 1249 | "h": 7, 1250 | "w": 10, 1251 | "x": 0, 1252 | "y": 53 1253 | }, 1254 | "id": 31, 1255 | "legend": { 1256 | "avg": true, 1257 | "current": false, 1258 | "max": false, 1259 | "min": false, 1260 | "show": true, 1261 | "total": false, 1262 | "values": true 1263 | }, 1264 | "lines": true, 1265 | "linewidth": 1, 1266 | "links": [], 1267 | "nullPointMode": "null", 1268 | "percentage": false, 1269 | "pointradius": 5, 
1270 | "points": false, 1271 | "renderer": "flot", 1272 | "seriesOverrides": [], 1273 | "spaceLength": 10, 1274 | "stack": false, 1275 | "steppedLine": false, 1276 | "targets": [ 1277 | { 1278 | "expr": "sum(rate(container_network_receive_bytes_total{id=\"/\"}[$interval])) by (id)", 1279 | "format": "time_series", 1280 | "intervalFactor": 2, 1281 | "legendFormat": "Received", 1282 | "refId": "A", 1283 | "step": 4 1284 | }, 1285 | { 1286 | "expr": "- sum(rate(container_network_transmit_bytes_total{id=\"/\"}[$interval])) by (id)", 1287 | "format": "time_series", 1288 | "intervalFactor": 2, 1289 | "legendFormat": "Transmitted", 1290 | "refId": "B", 1291 | "step": 4 1292 | } 1293 | ], 1294 | "thresholds": [], 1295 | "timeFrom": null, 1296 | "timeShift": null, 1297 | "title": "Cluster Network Traffic", 1298 | "tooltip": { 1299 | "shared": true, 1300 | "sort": 0, 1301 | "value_type": "individual" 1302 | }, 1303 | "type": "graph", 1304 | "xaxis": { 1305 | "buckets": null, 1306 | "mode": "time", 1307 | "name": null, 1308 | "show": true, 1309 | "values": [] 1310 | }, 1311 | "yaxes": [ 1312 | { 1313 | "format": "Bps", 1314 | "label": null, 1315 | "logBase": 1, 1316 | "max": null, 1317 | "min": null, 1318 | "show": true 1319 | }, 1320 | { 1321 | "format": "short", 1322 | "label": null, 1323 | "logBase": 1, 1324 | "max": null, 1325 | "min": null, 1326 | "show": true 1327 | } 1328 | ] 1329 | }, 1330 | { 1331 | "aliasColors": {}, 1332 | "bars": false, 1333 | "dashLength": 10, 1334 | "dashes": false, 1335 | "datasource": null, 1336 | "fill": 1, 1337 | "gridPos": { 1338 | "h": 7, 1339 | "w": 10, 1340 | "x": 10, 1341 | "y": 53 1342 | }, 1343 | "id": 26, 1344 | "legend": { 1345 | "alignAsTable": false, 1346 | "avg": true, 1347 | "current": false, 1348 | "max": true, 1349 | "min": true, 1350 | "rightSide": false, 1351 | "show": true, 1352 | "total": false, 1353 | "values": true 1354 | }, 1355 | "lines": true, 1356 | "linewidth": 1, 1357 | "links": [], 1358 | "nullPointMode": "null", 
1359 | "percentage": false, 1360 | "pointradius": 5, 1361 | "points": false, 1362 | "renderer": "flot", 1363 | "seriesOverrides": [], 1364 | "spaceLength": 10, 1365 | "stack": false, 1366 | "steppedLine": false, 1367 | "targets": [ 1368 | { 1369 | "expr": "sum(irate(container_fs_reads_total[$interval]) )", 1370 | "format": "time_series", 1371 | "intervalFactor": 2, 1372 | "legendFormat": "Reads", 1373 | "refId": "A", 1374 | "step": 4 1375 | }, 1376 | { 1377 | "expr": "sum(irate(container_fs_writes_total[$interval])) ", 1378 | "format": "time_series", 1379 | "intervalFactor": 2, 1380 | "legendFormat": "Writes ", 1381 | "refId": "B", 1382 | "step": 4 1383 | } 1384 | ], 1385 | "thresholds": [], 1386 | "timeFrom": null, 1387 | "timeShift": null, 1388 | "title": "Cluster IOPS", 1389 | "tooltip": { 1390 | "shared": true, 1391 | "sort": 0, 1392 | "value_type": "individual" 1393 | }, 1394 | "type": "graph", 1395 | "xaxis": { 1396 | "buckets": null, 1397 | "mode": "time", 1398 | "name": null, 1399 | "show": true, 1400 | "values": [] 1401 | }, 1402 | "yaxes": [ 1403 | { 1404 | "format": "short", 1405 | "label": null, 1406 | "logBase": 1, 1407 | "max": null, 1408 | "min": null, 1409 | "show": true 1410 | }, 1411 | { 1412 | "format": "short", 1413 | "label": null, 1414 | "logBase": 1, 1415 | "max": null, 1416 | "min": null, 1417 | "show": true 1418 | } 1419 | ] 1420 | }, 1421 | { 1422 | "cacheTimeout": null, 1423 | "colorBackground": false, 1424 | "colorValue": false, 1425 | "colors": [ 1426 | "rgba(245, 54, 54, 0.9)", 1427 | "rgba(237, 129, 40, 0.89)", 1428 | "rgba(50, 172, 45, 0.97)" 1429 | ], 1430 | "datasource": null, 1431 | "format": "percent", 1432 | "gauge": { 1433 | "maxValue": 100, 1434 | "minValue": 0, 1435 | "show": true, 1436 | "thresholdLabels": false, 1437 | "thresholdMarkers": true 1438 | }, 1439 | "gridPos": { 1440 | "h": 7, 1441 | "w": 4, 1442 | "x": 20, 1443 | "y": 53 1444 | }, 1445 | "id": 27, 1446 | "interval": null, 1447 | "links": [], 1448 | 
"mappingType": 1, 1449 | "mappingTypes": [ 1450 | { 1451 | "name": "value to text", 1452 | "value": 1 1453 | }, 1454 | { 1455 | "name": "range to text", 1456 | "value": 2 1457 | } 1458 | ], 1459 | "maxDataPoints": 100, 1460 | "nullPointMode": "connected", 1461 | "nullText": null, 1462 | "postfix": "", 1463 | "postfixFontSize": "50%", 1464 | "prefix": "", 1465 | "prefixFontSize": "50%", 1466 | "rangeMaps": [ 1467 | { 1468 | "from": "null", 1469 | "text": "N/A", 1470 | "to": "null" 1471 | } 1472 | ], 1473 | "sparkline": { 1474 | "fillColor": "rgba(31, 118, 189, 0.18)", 1475 | "full": false, 1476 | "lineColor": "rgb(31, 120, 193)", 1477 | "show": false 1478 | }, 1479 | "tableColumn": "", 1480 | "targets": [ 1481 | { 1482 | "expr": "sum((node_filesystem_free_bytes{mountpoint=\"/rootfs\"} / node_filesystem_size_bytes{mountpoint=\"/rootfs\"}) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"})", 1483 | "format": "time_series", 1484 | "intervalFactor": 2, 1485 | "legendFormat": "", 1486 | "refId": "A", 1487 | "step": 20 1488 | } 1489 | ], 1490 | "thresholds": "10,25,100", 1491 | "title": "Available Disk Space", 1492 | "type": "singlestat", 1493 | "valueFontSize": "80%", 1494 | "valueMaps": [ 1495 | { 1496 | "op": "=", 1497 | "text": "N/A", 1498 | "value": "null" 1499 | } 1500 | ], 1501 | "valueName": "avg" 1502 | }, 1503 | { 1504 | "aliasColors": {}, 1505 | "bars": false, 1506 | "dashLength": 10, 1507 | "dashes": false, 1508 | "datasource": null, 1509 | "decimals": 0, 1510 | "fill": 1, 1511 | "gridPos": { 1512 | "h": 7, 1513 | "w": 12, 1514 | "x": 0, 1515 | "y": 60 1516 | }, 1517 | "id": 29, 1518 | "legend": { 1519 | "alignAsTable": true, 1520 | "avg": false, 1521 | "current": true, 1522 | "hideEmpty": true, 1523 | "hideZero": true, 1524 | "max": false, 1525 | "min": false, 1526 | "rightSide": true, 1527 | "show": true, 1528 | "sort": "current", 1529 | 
"sortDesc": true, 1530 | "total": false, 1531 | "values": true 1532 | }, 1533 | "lines": true, 1534 | "linewidth": 1, 1535 | "links": [], 1536 | "nullPointMode": "null", 1537 | "percentage": false, 1538 | "pointradius": 5, 1539 | "points": false, 1540 | "renderer": "flot", 1541 | "seriesOverrides": [], 1542 | "spaceLength": 10, 1543 | "stack": false, 1544 | "steppedLine": false, 1545 | "targets": [ 1546 | { 1547 | "expr": "sum(engine_daemon_container_actions_seconds_count * on(instance) group_left(node_id) swarm_node_info{node_id=~\"$node_id\"}) by (action)", 1548 | "format": "time_series", 1549 | "intervalFactor": 10, 1550 | "legendFormat": "{{action }}", 1551 | "refId": "A", 1552 | "step": 10 1553 | } 1554 | ], 1555 | "thresholds": [], 1556 | "timeFrom": null, 1557 | "timeShift": null, 1558 | "title": "Docker Daemon Container Actions", 1559 | "tooltip": { 1560 | "shared": true, 1561 | "sort": 2, 1562 | "value_type": "individual" 1563 | }, 1564 | "type": "graph", 1565 | "xaxis": { 1566 | "buckets": null, 1567 | "mode": "time", 1568 | "name": null, 1569 | "show": true, 1570 | "values": [] 1571 | }, 1572 | "yaxes": [ 1573 | { 1574 | "format": "short", 1575 | "label": null, 1576 | "logBase": 1, 1577 | "max": null, 1578 | "min": null, 1579 | "show": true 1580 | }, 1581 | { 1582 | "format": "short", 1583 | "label": null, 1584 | "logBase": 1, 1585 | "max": null, 1586 | "min": null, 1587 | "show": true 1588 | } 1589 | ] 1590 | }, 1591 | { 1592 | "aliasColors": {}, 1593 | "bars": false, 1594 | "dashLength": 10, 1595 | "dashes": false, 1596 | "datasource": null, 1597 | "decimals": 0, 1598 | "fill": 1, 1599 | "gridPos": { 1600 | "h": 7, 1601 | "w": 12, 1602 | "x": 12, 1603 | "y": 60 1604 | }, 1605 | "id": 30, 1606 | "legend": { 1607 | "alignAsTable": true, 1608 | "avg": false, 1609 | "current": true, 1610 | "hideEmpty": true, 1611 | "hideZero": true, 1612 | "max": false, 1613 | "min": false, 1614 | "rightSide": true, 1615 | "show": true, 1616 | "sort": "current", 1617 | 
"sortDesc": true, 1618 | "total": false, 1619 | "values": true 1620 | }, 1621 | "lines": true, 1622 | "linewidth": 1, 1623 | "links": [], 1624 | "nullPointMode": "null", 1625 | "percentage": false, 1626 | "pointradius": 5, 1627 | "points": false, 1628 | "renderer": "flot", 1629 | "seriesOverrides": [], 1630 | "spaceLength": 10, 1631 | "stack": false, 1632 | "steppedLine": false, 1633 | "targets": [ 1634 | { 1635 | "expr": "sum(engine_daemon_network_actions_seconds_count * on(instance) group_left(node_id) swarm_node_info{node_id=~\"$node_id\"}) by (action)", 1636 | "format": "time_series", 1637 | "intervalFactor": 10, 1638 | "legendFormat": "{{action }}", 1639 | "refId": "A", 1640 | "step": 10 1641 | } 1642 | ], 1643 | "thresholds": [], 1644 | "timeFrom": null, 1645 | "timeShift": null, 1646 | "title": "Docker Daemon Network Actions", 1647 | "tooltip": { 1648 | "shared": true, 1649 | "sort": 2, 1650 | "value_type": "individual" 1651 | }, 1652 | "type": "graph", 1653 | "xaxis": { 1654 | "buckets": null, 1655 | "mode": "time", 1656 | "name": null, 1657 | "show": true, 1658 | "values": [] 1659 | }, 1660 | "yaxes": [ 1661 | { 1662 | "format": "short", 1663 | "label": null, 1664 | "logBase": 1, 1665 | "max": null, 1666 | "min": null, 1667 | "show": true 1668 | }, 1669 | { 1670 | "format": "short", 1671 | "label": null, 1672 | "logBase": 1, 1673 | "max": null, 1674 | "min": null, 1675 | "show": true 1676 | } 1677 | ] 1678 | }, 1679 | { 1680 | "columns": [ 1681 | { 1682 | "text": "Avg", 1683 | "value": "avg" 1684 | } 1685 | ], 1686 | "datasource": null, 1687 | "fontSize": "100%", 1688 | "gridPos": { 1689 | "h": 7, 1690 | "w": 24, 1691 | "x": 0, 1692 | "y": 67 1693 | }, 1694 | "hideTimeOverride": true, 1695 | "id": 28, 1696 | "links": [], 1697 | "pageSize": null, 1698 | "repeat": null, 1699 | "scroll": true, 1700 | "showHeader": true, 1701 | "sort": { 1702 | "col": 0, 1703 | "desc": true 1704 | }, 1705 | "styles": [ 1706 | { 1707 | "alias": "Time", 1708 | "dateFormat": 
"YYYY-MM-DD HH:mm:ss", 1709 | "pattern": "Time", 1710 | "type": "hidden" 1711 | }, 1712 | { 1713 | "alias": "", 1714 | "colorMode": null, 1715 | "colors": [ 1716 | "rgba(245, 54, 54, 0.9)", 1717 | "rgba(237, 129, 40, 0.89)", 1718 | "rgba(50, 172, 45, 0.97)" 1719 | ], 1720 | "decimals": 2, 1721 | "pattern": "/.*/", 1722 | "thresholds": [], 1723 | "type": "number", 1724 | "unit": "short" 1725 | } 1726 | ], 1727 | "targets": [ 1728 | { 1729 | "expr": "sum(engine_daemon_engine_info * on(instance) group_left(node_id) swarm_node_info) by (kernel, os, graphdriver, version, node_id)", 1730 | "format": "table", 1731 | "instant": true, 1732 | "intervalFactor": 2, 1733 | "legendFormat": "", 1734 | "refId": "A", 1735 | "step": 2 1736 | } 1737 | ], 1738 | "timeFrom": "1s", 1739 | "title": "Docker Engine Info", 1740 | "transform": "timeseries_to_rows", 1741 | "type": "table" 1742 | } 1743 | ], 1744 | "refresh": "30s", 1745 | "schemaVersion": 16, 1746 | "style": "dark", 1747 | "tags": [ 1748 | "swarmprom" 1749 | ], 1750 | "templating": { 1751 | "list": [ 1752 | { 1753 | "allValue": ".+", 1754 | "current": { 1755 | "text": "All", 1756 | "value": "$__all" 1757 | }, 1758 | "datasource": "Prometheus", 1759 | "hide": 0, 1760 | "includeAll": true, 1761 | "label": "Swarm Node", 1762 | "multi": false, 1763 | "name": "node_id", 1764 | "options": [], 1765 | "query": "node_meta", 1766 | "refresh": 2, 1767 | "regex": "/node_id=\"([^\"]+)\"/", 1768 | "sort": 0, 1769 | "tagValuesQuery": "label_values({node_id=\"$tag\"},node_name)", 1770 | "tags": [ 1771 | "ofdocker", 1772 | "ofmon" 1773 | ], 1774 | "tagsQuery": "label_values(node_meta, node_name)", 1775 | "type": "query", 1776 | "useTags": true 1777 | }, 1778 | { 1779 | "auto": true, 1780 | "auto_count": 30, 1781 | "auto_min": "30s", 1782 | "current": { 1783 | "text": "auto", 1784 | "value": "$__auto_interval_interval" 1785 | }, 1786 | "hide": 0, 1787 | "label": "Interval", 1788 | "name": "interval", 1789 | "options": [ 1790 | { 1791 | 
"selected": true, 1792 | "text": "auto", 1793 | "value": "$__auto_interval_interval" 1794 | }, 1795 | { 1796 | "selected": false, 1797 | "text": "1m", 1798 | "value": "1m" 1799 | }, 1800 | { 1801 | "selected": false, 1802 | "text": "10m", 1803 | "value": "10m" 1804 | }, 1805 | { 1806 | "selected": false, 1807 | "text": "30m", 1808 | "value": "30m" 1809 | }, 1810 | { 1811 | "selected": false, 1812 | "text": "1h", 1813 | "value": "1h" 1814 | }, 1815 | { 1816 | "selected": false, 1817 | "text": "6h", 1818 | "value": "6h" 1819 | }, 1820 | { 1821 | "selected": false, 1822 | "text": "12h", 1823 | "value": "12h" 1824 | }, 1825 | { 1826 | "selected": false, 1827 | "text": "1d", 1828 | "value": "1d" 1829 | }, 1830 | { 1831 | "selected": false, 1832 | "text": "7d", 1833 | "value": "7d" 1834 | }, 1835 | { 1836 | "selected": false, 1837 | "text": "14d", 1838 | "value": "14d" 1839 | }, 1840 | { 1841 | "selected": false, 1842 | "text": "30d", 1843 | "value": "30d" 1844 | } 1845 | ], 1846 | "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", 1847 | "refresh": 2, 1848 | "type": "interval" 1849 | } 1850 | ] 1851 | }, 1852 | "time": { 1853 | "from": "now-15m", 1854 | "to": "now" 1855 | }, 1856 | "timepicker": { 1857 | "refresh_intervals": [ 1858 | "5s", 1859 | "10s", 1860 | "30s", 1861 | "1m", 1862 | "5m", 1863 | "15m", 1864 | "30m", 1865 | "1h", 1866 | "2h", 1867 | "1d" 1868 | ], 1869 | "time_options": [ 1870 | "5m", 1871 | "15m", 1872 | "1h", 1873 | "6h", 1874 | "12h", 1875 | "24h", 1876 | "2d", 1877 | "7d", 1878 | "30d" 1879 | ] 1880 | }, 1881 | "timezone": "", 1882 | "title": "Docker Swarm Services", 1883 | "uid": "zr_baSRmk", 1884 | "version": 1 1885 | } 1886 | -------------------------------------------------------------------------------- /grafana/datasources/prometheus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | deleteDatasources: 4 | - name: Prometheus 5 | 6 | datasources: 7 | - name: Prometheus 8 | type: 
prometheus 9 | access: proxy 10 | url: http://prometheus:9090 11 | isDefault: true 12 | version: 1 13 | editable: true 14 | -------------------------------------------------------------------------------- /grafana/screens/alertmanager-slack-v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefanprodan/swarmprom/28957ff51aea9d69fab079783096555fe9bc44b4/grafana/screens/alertmanager-slack-v2.png -------------------------------------------------------------------------------- /grafana/screens/swarmprom-nodes-dash-v3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefanprodan/swarmprom/28957ff51aea9d69fab079783096555fe9bc44b4/grafana/screens/swarmprom-nodes-dash-v3.png -------------------------------------------------------------------------------- /grafana/screens/swarmprom-prometheus-dash-v3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefanprodan/swarmprom/28957ff51aea9d69fab079783096555fe9bc44b4/grafana/screens/swarmprom-prometheus-dash-v3.png -------------------------------------------------------------------------------- /grafana/screens/swarmprom-services-dash-v3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefanprodan/swarmprom/28957ff51aea9d69fab079783096555fe9bc44b4/grafana/screens/swarmprom-services-dash-v3.png -------------------------------------------------------------------------------- /grafana/screens/unsee.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefanprodan/swarmprom/28957ff51aea9d69fab079783096555fe9bc44b4/grafana/screens/unsee.png -------------------------------------------------------------------------------- /grafana/screens/weave-scope-hosts-v2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefanprodan/swarmprom/28957ff51aea9d69fab079783096555fe9bc44b4/grafana/screens/weave-scope-hosts-v2.png -------------------------------------------------------------------------------- /grafana/screens/weave-scope.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefanprodan/swarmprom/28957ff51aea9d69fab079783096555fe9bc44b4/grafana/screens/weave-scope.png -------------------------------------------------------------------------------- /grafana/swarmprom_dashboards.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'default' 5 | orgId: 1 6 | folder: '' 7 | type: file 8 | disableDeletion: false 9 | editable: true 10 | options: 11 | path: /etc/grafana/dashboards 12 | -------------------------------------------------------------------------------- /node-exporter/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prom/node-exporter:v0.16.0 2 | 3 | ENV NODE_ID=none 4 | 5 | USER root 6 | 7 | COPY conf /etc/node-exporter/ 8 | 9 | ENTRYPOINT [ "/etc/node-exporter/docker-entrypoint.sh" ] 10 | CMD [ "/bin/node_exporter" ] 11 | -------------------------------------------------------------------------------- /node-exporter/conf/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -e 2 | 3 | NODE_NAME=$(cat /etc/nodename) 4 | echo "node_meta{node_id=\"$NODE_ID\", container_label_com_docker_swarm_node_id=\"$NODE_ID\", node_name=\"$NODE_NAME\"} 1" > /etc/node-exporter/node-meta.prom 5 | 6 | set -- /bin/node_exporter "$@" 7 | 8 | exec "$@" 9 | -------------------------------------------------------------------------------- /prometheus/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM prom/prometheus:v2.5.0 2 | # https://hub.docker.com/r/prom/prometheus/tags/ 3 | 4 | ENV WEAVE_TOKEN=none 5 | 6 | COPY conf /etc/prometheus/ 7 | 8 | ENTRYPOINT [ "/etc/prometheus/docker-entrypoint.sh" ] 9 | CMD [ "--config.file=/etc/prometheus/prometheus.yml", \ 10 | "--storage.tsdb.path=/prometheus" ] 11 | -------------------------------------------------------------------------------- /prometheus/conf/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -e 2 | 3 | cat /etc/prometheus/prometheus.yml > /tmp/prometheus.yml 4 | cat /etc/prometheus/weave-cortex.yml | \ 5 | sed "s@#password: #@password: '$WEAVE_TOKEN'@g" > /tmp/weave-cortex.yml 6 | 7 | #JOBS=mongo-exporter:9111 redis-exporter:9112 8 | 9 | if [ ${JOBS+x} ]; then 10 | 11 | for job in $JOBS 12 | do 13 | echo "adding job $job" 14 | 15 | SERVICE=$(echo "$job" | cut -d":" -f1) 16 | PORT=$(echo "$job" | cut -d":" -f2) 17 | 18 | cat >>/tmp/prometheus.yml <>/tmp/weave-cortex.yml <# 5 | 6 | global: 7 | scrape_interval: 15s 8 | evaluation_interval: 15s 9 | 10 | external_labels: 11 | monitor: 'promswarm' 12 | 13 | scrape_configs: 14 | - job_name: 'prometheus' 15 | static_configs: 16 | - targets: ['localhost:9090'] 17 | 18 | - job_name: 'dockerd-exporter' 19 | dns_sd_configs: 20 | - names: 21 | - 'tasks.dockerd-exporter' 22 | type: 'A' 23 | port: 9323 24 | 25 | - job_name: 'cadvisor' 26 | dns_sd_configs: 27 | - names: 28 | - 'tasks.cadvisor' 29 | type: 'A' 30 | port: 8080 31 | 32 | - job_name: 'node-exporter' 33 | dns_sd_configs: 34 | - names: 35 | - 'tasks.node-exporter' 36 | type: 'A' 37 | port: 9100 38 | -------------------------------------------------------------------------------- /prometheus/rules/swarm_node.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: 
/1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_node.rules.yml 3 | rules: 4 | - alert: node_cpu_usage 5 | expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) 6 | node_meta * 100) BY (node_name)) > 50 7 | for: 1m 8 | labels: 9 | severity: warning 10 | annotations: 11 | description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize 12 | $value}}%. 13 | summary: CPU alert for Swarm node '{{ $labels.node_name }}' 14 | - alert: node_memory_usage 15 | expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) 16 | * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80 17 | for: 1m 18 | labels: 19 | severity: warning 20 | annotations: 21 | description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize 22 | $value}}%. 23 | summary: Memory alert for Swarm node '{{ $labels.node_name }}' 24 | - alert: node_disk_usage 25 | expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) 26 | * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) 27 | node_meta > 85 28 | for: 1m 29 | labels: 30 | severity: warning 31 | annotations: 32 | description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize 33 | $value}}%. 34 | summary: Disk alert for Swarm node '{{ $labels.node_name }}' 35 | - alert: node_disk_fill_rate_6h 36 | expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance) 37 | GROUP_LEFT(node_name) node_meta < 0 38 | for: 1h 39 | labels: 40 | severity: critical 41 | annotations: 42 | description: Swarm node {{ $labels.node_name }} disk is going to fill up in 43 | 6h. 
44 | summary: Disk fill alert for Swarm node '{{ $labels.node_name }}' 45 | -------------------------------------------------------------------------------- /prometheus/rules/swarm_task.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_task.rules.yml 3 | rules: 4 | - alert: task_high_cpu_usage_50 5 | expr: sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m])) 6 | BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) 7 | * 100 > 50 8 | for: 1m 9 | annotations: 10 | description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{ 11 | $labels.container_label_com_docker_swarm_node_id }}'' CPU usage is at {{ humanize 12 | $value}}%.' 13 | summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name 14 | }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' 15 | - alert: task_high_memory_usage_1g 16 | expr: sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) 17 | BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) > 1e+09 18 | for: 1m 19 | annotations: 20 | description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{ 21 | $labels.container_label_com_docker_swarm_node_id }}'' memory usage is {{ humanize 22 | $value}}.' 
23 | summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name 24 | }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}' 25 | -------------------------------------------------------------------------------- /test-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.3" 2 | 3 | networks: 4 | net: 5 | driver: overlay 6 | attachable: true 7 | mon_net: 8 | external: true 9 | 10 | services: 11 | 12 | mongo: 13 | image: healthcheck/mongo:latest 14 | networks: 15 | - net 16 | deploy: 17 | mode: replicated 18 | replicas: 1 19 | placement: 20 | constraints: 21 | - node.role != manager 22 | 23 | mongo-exporter: 24 | image: forekshub/percona-mongodb-exporter:latest 25 | networks: 26 | - net 27 | - mon_net 28 | ports: 29 | - "9216:9216" 30 | environment: 31 | - MONGODB_URL=mongodb://mongo:27017 32 | deploy: 33 | mode: replicated 34 | replicas: 1 35 | placement: 36 | constraints: 37 | - node.role == manager 38 | -------------------------------------------------------------------------------- /weave-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.3" 2 | 3 | networks: 4 | net: 5 | driver: overlay 6 | attachable: true 7 | 8 | volumes: 9 | prometheus: {} 10 | grafana: {} 11 | 12 | configs: 13 | caddy_config: 14 | file: ./caddy/Caddyfile 15 | dockerd_config: 16 | file: ./dockerd-exporter/Caddyfile 17 | 18 | services: 19 | dockerd-exporter: 20 | image: stefanprodan/caddy 21 | networks: 22 | - net 23 | environment: 24 | - DOCKER_GWBRIDGE_IP=172.18.0.1 25 | configs: 26 | - source: dockerd_config 27 | target: /etc/caddy/Caddyfile 28 | deploy: 29 | mode: global 30 | 31 | cadvisor: 32 | image: google/cadvisor 33 | networks: 34 | - net 35 | command: -logtostderr -docker_only 36 | volumes: 37 | - /var/run/docker.sock:/var/run/docker.sock:ro 38 | - /:/rootfs:ro 39 | - /var/run:/var/run 40 | - /sys:/sys:ro 41 | - 
/var/lib/docker/:/var/lib/docker:ro 42 | deploy: 43 | mode: global 44 | 45 | grafana: 46 | image: stefanprodan/swarmprom-grafana:4.6.3 47 | networks: 48 | - net 49 | environment: 50 | - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin} 51 | - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} 52 | - GF_USERS_ALLOW_SIGN_UP=false 53 | #- GF_SERVER_ROOT_URL=${GF_SERVER_ROOT_URL:-localhost} 54 | #- GF_SMTP_ENABLED=${GF_SMTP_ENABLED:-false} 55 | #- GF_SMTP_FROM_ADDRESS=${GF_SMTP_FROM_ADDRESS:-grafana@test.com} 56 | #- GF_SMTP_FROM_NAME=${GF_SMTP_FROM_NAME:-Grafana} 57 | #- GF_SMTP_HOST=${GF_SMTP_HOST:-smtp:25} 58 | #- GF_SMTP_USER=${GF_SMTP_USER} 59 | #- GF_SMTP_PASSWORD=${GF_SMTP_PASSWORD} 60 | volumes: 61 | - grafana:/var/lib/grafana 62 | deploy: 63 | mode: replicated 64 | replicas: 1 65 | placement: 66 | constraints: 67 | - node.role == manager 68 | 69 | node-exporter: 70 | image: stefanprodan/swarmprom-node-exporter:v0.15.2 71 | networks: 72 | - net 73 | environment: 74 | - NODE_ID={{.Node.ID}} 75 | volumes: 76 | - /proc:/host/proc:ro 77 | - /sys:/host/sys:ro 78 | - /:/rootfs:ro 79 | - /etc/hostname:/etc/nodename 80 | command: 81 | - '--path.sysfs=/host/sys' 82 | - '--path.procfs=/host/proc' 83 | - '--collector.textfile.directory=/etc/node-exporter/' 84 | - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)' 85 | # no collectors are explicitly enabled here, because the defaults are just fine, 86 | # see https://github.com/prometheus/node_exporter 87 | # disable the ipvs collector because it floods the node-exporter logs with errors on CentOS 7 VMs 88 | - '--no-collector.ipvs' 89 | deploy: 90 | mode: global 91 | 92 | caddy: 93 | image: stefanprodan/caddy 94 | ports: 95 | - "3000:3000" 96 | - "9090:9090" 97 | networks: 98 | - net 99 | environment: 100 | - ADMIN_USER=${ADMIN_USER:-admin} 101 | - ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} 102 | configs: 103 | - source: caddy_config 104 | target: /etc/caddy/Caddyfile 105 | deploy: 106 | mode:
replicated 107 | replicas: 1 108 | placement: 109 | constraints: 110 | - node.role == manager 111 | 112 | prometheus: 113 | image: stefanprodan/swarmprom-prometheus:v2.2.0-rc.0 114 | networks: 115 | - net 116 | environment: 117 | - WEAVE_TOKEN=$TOKEN 118 | #- JOBS=mongo-exporter:9216 119 | command: 120 | - '--config.file=/etc/prometheus/weave-cortex.yml' 121 | - '--web.console.libraries=/etc/prometheus/console_libraries' 122 | - '--web.console.templates=/etc/prometheus/consoles' 123 | - '--storage.tsdb.path=/prometheus' 124 | - '--storage.tsdb.retention=${PROMETHEUS_RETENTION:-24h}' 125 | volumes: 126 | - prometheus:/prometheus 127 | deploy: 128 | mode: replicated 129 | replicas: 1 130 | placement: 131 | constraints: 132 | - node.role == manager 133 | 134 | scope-launcher: 135 | image: weaveworks/scope-swarm-launcher 136 | networks: 137 | - net 138 | command: scope launch --service-token=$TOKEN 139 | volumes: 140 | - /var/run/docker.sock:/var/run/docker.sock 141 | deploy: 142 | mode: global 143 | restart_policy: 144 | condition: none 145 | --------------------------------------------------------------------------------