├── README.md ├── inventories └── hosts.yml ├── playbooks └── monitoring.yml └── roles ├── observer ├── defaults │ └── main.yml ├── files │ ├── grafana │ │ ├── dashboards │ │ │ └── main │ │ │ │ ├── cadvisor.json │ │ │ │ └── node-exporter.json │ │ └── provisioning │ │ │ ├── dashboards │ │ │ └── all.yml │ │ │ └── datasources │ │ │ └── all.yml │ ├── prometheus_alerts_rules.yml │ └── prometheus_main.yml ├── tasks │ └── main.yml └── templates │ └── alertmanager │ └── alertmanager.j2 └── target ├── defaults └── main.yml └── tasks └── main.yml /README.md: -------------------------------------------------------------------------------- 1 | # Ansible 2 | 3 | This repository was created using Ansible version 2.13.3. If you have problems launching playbooks, make sure your Ansible version is at least this recent. 4 | 5 | ## Architecture 6 | 7 | This repository is composed of multiple folders, used as follows: 8 | 9 | ```. 10 | ├── README.md -> this file 11 | ├── inventories -> hosts inventory files 12 | │ └── hosts.yml -> describes the different hosts 13 | ├── playbooks -> ansible playbooks 14 | ├── roles -> ansible roles 15 | ``` 16 | 17 | ## Using ansible 18 | 19 | To use Ansible, you must first install it. Follow the steps described [here](https://docs.ansible.com/ansible/latest/installation_guide/index.html), depending on your OS. 
20 | -------------------------------------------------------------------------------- /inventories/hosts.yml: -------------------------------------------------------------------------------- 1 | all: 2 | children: 3 | observer: 4 | hosts: 5 | padok-observer: 6 | ansible_host: 192.168.0.1 7 | target: 8 | hosts: 9 | padok-observer: 10 | ansible_host: 192.168.0.1 11 | padok-target-1: 12 | ansible_host: 192.168.0.10 13 | padok-target-2: 14 | ansible_host: 192.168.0.11 -------------------------------------------------------------------------------- /playbooks/monitoring.yml: -------------------------------------------------------------------------------- 1 | - name: Install Observability stack (targets) 2 | hosts: target 3 | tags: 4 | - monitoring 5 | - target 6 | roles: 7 | - ../roles/target 8 | 9 | - name: Install Observability stack (observer) 10 | hosts: observer 11 | tags: 12 | - monitoring 13 | - observer 14 | roles: 15 | - ../roles/observer 16 | -------------------------------------------------------------------------------- /roles/observer/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | prometheus_version: v2.40.1 3 | grafana_version: "9.2.5" 4 | alertmanager_version: v0.24.0 5 | alertmanager_smtp_password: !vault | 6 | $ANSIBLE_VAULT;1.1;AES256 7 | 64306663363562356132323065396635636630373031303739323666373262663961393132316333 8 | 6135653763363566303331313639633030623530646239310a353236343035643132646230333466 9 | 36336439376131333630346563323833313164353265313264643232373465633561663331396133 10 | 3163303166373166390a396131303239356139653063616437363933333130393563646338663933 11 | 3966 12 | -------------------------------------------------------------------------------- /roles/observer/files/grafana/dashboards/main/cadvisor.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "DS_PROMETHEUS", 5 | "label": "prometheus", 6 
| "description": "Prometheus as the datasource is obligatory", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "grafana", 15 | "id": "grafana", 16 | "name": "Grafana", 17 | "version": "7.4.5" 18 | }, 19 | { 20 | "type": "panel", 21 | "id": "graph", 22 | "name": "Graph", 23 | "version": "" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "prometheus", 28 | "name": "Prometheus", 29 | "version": "1.0.0" 30 | }, 31 | { 32 | "type": "panel", 33 | "id": "table", 34 | "name": "Table", 35 | "version": "" 36 | } 37 | ], 38 | "annotations": { 39 | "list": [ 40 | { 41 | "builtIn": 1, 42 | "datasource": "-- Grafana --", 43 | "enable": true, 44 | "hide": true, 45 | "iconColor": "rgba(0, 211, 255, 1)", 46 | "name": "Annotations & Alerts", 47 | "type": "dashboard" 48 | } 49 | ] 50 | }, 51 | "editable": true, 52 | "gnetId": 14282, 53 | "graphTooltip": 0, 54 | "id": null, 55 | "iteration": 1617715580880, 56 | "links": [], 57 | "panels": [ 58 | { 59 | "collapsed": false, 60 | "datasource": "Prometheus", 61 | "gridPos": { 62 | "h": 1, 63 | "w": 24, 64 | "x": 0, 65 | "y": 0 66 | }, 67 | "id": 8, 68 | "panels": [], 69 | "title": "CPU", 70 | "type": "row" 71 | }, 72 | { 73 | "aliasColors": {}, 74 | "bars": false, 75 | "dashLength": 10, 76 | "dashes": false, 77 | "datasource": "Prometheus", 78 | "fieldConfig": { 79 | "defaults": { 80 | "custom": {} 81 | }, 82 | "overrides": [] 83 | }, 84 | "fill": 1, 85 | "fillGradient": 0, 86 | "gridPos": { 87 | "h": 7, 88 | "w": 24, 89 | "x": 0, 90 | "y": 1 91 | }, 92 | "hiddenSeries": false, 93 | "id": 15, 94 | "legend": { 95 | "alignAsTable": true, 96 | "avg": true, 97 | "current": false, 98 | "max": true, 99 | "min": false, 100 | "rightSide": true, 101 | "show": true, 102 | "total": false, 103 | "values": true 104 | }, 105 | "lines": true, 106 | "linewidth": 1, 107 | "nullPointMode": "null as zero", 108 | "options": { 109 | "alertThreshold": true 110 | 
}, 111 | "percentage": false, 112 | "pluginVersion": "7.4.5", 113 | "pointradius": 2, 114 | "points": false, 115 | "renderer": "flot", 116 | "seriesOverrides": [], 117 | "spaceLength": 10, 118 | "stack": true, 119 | "steppedLine": false, 120 | "targets": [ 121 | { 122 | "expr": "sum(rate(container_cpu_usage_seconds_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name) *100", 123 | "hide": false, 124 | "interval": "", 125 | "legendFormat": "{{name}}", 126 | "refId": "A" 127 | } 128 | ], 129 | "thresholds": [], 130 | "timeFrom": null, 131 | "timeRegions": [], 132 | "timeShift": null, 133 | "title": "CPU Usage", 134 | "tooltip": { 135 | "shared": true, 136 | "sort": 0, 137 | "value_type": "individual" 138 | }, 139 | "type": "graph", 140 | "xaxis": { 141 | "buckets": null, 142 | "mode": "time", 143 | "name": null, 144 | "show": true, 145 | "values": [] 146 | }, 147 | "yaxes": [ 148 | { 149 | "$$hashKey": "object:606", 150 | "format": "percent", 151 | "label": null, 152 | "logBase": 1, 153 | "max": null, 154 | "min": null, 155 | "show": true 156 | }, 157 | { 158 | "$$hashKey": "object:607", 159 | "format": "short", 160 | "label": null, 161 | "logBase": 1, 162 | "max": null, 163 | "min": null, 164 | "show": true 165 | } 166 | ], 167 | "yaxis": { 168 | "align": false, 169 | "alignLevel": null 170 | } 171 | }, 172 | { 173 | "collapsed": false, 174 | "datasource": "Prometheus", 175 | "gridPos": { 176 | "h": 1, 177 | "w": 24, 178 | "x": 0, 179 | "y": 8 180 | }, 181 | "id": 11, 182 | "panels": [], 183 | "title": "Memory", 184 | "type": "row" 185 | }, 186 | { 187 | "aliasColors": {}, 188 | "bars": false, 189 | "dashLength": 10, 190 | "dashes": false, 191 | "datasource": "Prometheus", 192 | "fieldConfig": { 193 | "defaults": { 194 | "custom": {} 195 | }, 196 | "overrides": [] 197 | }, 198 | "fill": 1, 199 | "fillGradient": 0, 200 | "gridPos": { 201 | "h": 8, 202 | "w": 12, 203 | "x": 0, 204 | "y": 9 205 | }, 206 | "hiddenSeries": false, 207 | "id": 9, 208 
| "legend": { 209 | "alignAsTable": true, 210 | "avg": true, 211 | "current": false, 212 | "max": true, 213 | "min": false, 214 | "rightSide": true, 215 | "show": true, 216 | "total": false, 217 | "values": true 218 | }, 219 | "lines": true, 220 | "linewidth": 1, 221 | "nullPointMode": "null as zero", 222 | "options": { 223 | "alertThreshold": true 224 | }, 225 | "percentage": false, 226 | "pluginVersion": "7.4.5", 227 | "pointradius": 2, 228 | "points": false, 229 | "renderer": "flot", 230 | "seriesOverrides": [], 231 | "spaceLength": 10, 232 | "stack": true, 233 | "steppedLine": false, 234 | "targets": [ 235 | { 236 | "expr": "sum(container_memory_rss{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)", 237 | "hide": false, 238 | "interval": "", 239 | "legendFormat": "{{name}}", 240 | "refId": "A" 241 | } 242 | ], 243 | "thresholds": [], 244 | "timeFrom": null, 245 | "timeRegions": [], 246 | "timeShift": null, 247 | "title": "Memory Usage", 248 | "tooltip": { 249 | "shared": true, 250 | "sort": 0, 251 | "value_type": "individual" 252 | }, 253 | "type": "graph", 254 | "xaxis": { 255 | "buckets": null, 256 | "mode": "time", 257 | "name": null, 258 | "show": true, 259 | "values": [] 260 | }, 261 | "yaxes": [ 262 | { 263 | "$$hashKey": "object:606", 264 | "format": "bytes", 265 | "label": null, 266 | "logBase": 1, 267 | "max": null, 268 | "min": null, 269 | "show": true 270 | }, 271 | { 272 | "$$hashKey": "object:607", 273 | "format": "short", 274 | "label": null, 275 | "logBase": 1, 276 | "max": null, 277 | "min": null, 278 | "show": true 279 | } 280 | ], 281 | "yaxis": { 282 | "align": false, 283 | "alignLevel": null 284 | } 285 | }, 286 | { 287 | "aliasColors": {}, 288 | "bars": false, 289 | "dashLength": 10, 290 | "dashes": false, 291 | "datasource": "Prometheus", 292 | "fieldConfig": { 293 | "defaults": { 294 | "custom": {} 295 | }, 296 | "overrides": [] 297 | }, 298 | "fill": 1, 299 | "fillGradient": 0, 300 | "gridPos": { 301 | "h": 8, 302 | "w": 
12, 303 | "x": 12, 304 | "y": 9 305 | }, 306 | "hiddenSeries": false, 307 | "id": 14, 308 | "legend": { 309 | "alignAsTable": true, 310 | "avg": true, 311 | "current": false, 312 | "max": true, 313 | "min": false, 314 | "rightSide": true, 315 | "show": true, 316 | "total": false, 317 | "values": true 318 | }, 319 | "lines": true, 320 | "linewidth": 1, 321 | "nullPointMode": "null as zero", 322 | "options": { 323 | "alertThreshold": true 324 | }, 325 | "percentage": false, 326 | "pluginVersion": "7.4.5", 327 | "pointradius": 2, 328 | "points": false, 329 | "renderer": "flot", 330 | "seriesOverrides": [], 331 | "spaceLength": 10, 332 | "stack": true, 333 | "steppedLine": false, 334 | "targets": [ 335 | { 336 | "expr": "sum(container_memory_cache{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)", 337 | "hide": false, 338 | "interval": "", 339 | "legendFormat": "{{name}}", 340 | "refId": "A" 341 | } 342 | ], 343 | "thresholds": [], 344 | "timeFrom": null, 345 | "timeRegions": [], 346 | "timeShift": null, 347 | "title": "Memory Cached", 348 | "tooltip": { 349 | "shared": true, 350 | "sort": 0, 351 | "value_type": "individual" 352 | }, 353 | "type": "graph", 354 | "xaxis": { 355 | "buckets": null, 356 | "mode": "time", 357 | "name": null, 358 | "show": true, 359 | "values": [] 360 | }, 361 | "yaxes": [ 362 | { 363 | "$$hashKey": "object:606", 364 | "format": "bytes", 365 | "label": null, 366 | "logBase": 1, 367 | "max": null, 368 | "min": null, 369 | "show": true 370 | }, 371 | { 372 | "$$hashKey": "object:607", 373 | "format": "short", 374 | "label": null, 375 | "logBase": 1, 376 | "max": null, 377 | "min": null, 378 | "show": true 379 | } 380 | ], 381 | "yaxis": { 382 | "align": false, 383 | "alignLevel": null 384 | } 385 | }, 386 | { 387 | "collapsed": false, 388 | "datasource": "Prometheus", 389 | "gridPos": { 390 | "h": 1, 391 | "w": 24, 392 | "x": 0, 393 | "y": 17 394 | }, 395 | "id": 2, 396 | "panels": [], 397 | "title": "Network", 398 | "type": 
"row" 399 | }, 400 | { 401 | "aliasColors": {}, 402 | "bars": false, 403 | "dashLength": 10, 404 | "dashes": false, 405 | "datasource": "Prometheus", 406 | "fieldConfig": { 407 | "defaults": { 408 | "custom": {} 409 | }, 410 | "overrides": [] 411 | }, 412 | "fill": 1, 413 | "fillGradient": 0, 414 | "gridPos": { 415 | "h": 8, 416 | "w": 12, 417 | "x": 0, 418 | "y": 18 419 | }, 420 | "hiddenSeries": false, 421 | "id": 4, 422 | "legend": { 423 | "alignAsTable": true, 424 | "avg": true, 425 | "current": false, 426 | "hideEmpty": false, 427 | "hideZero": false, 428 | "max": true, 429 | "min": false, 430 | "rightSide": true, 431 | "show": true, 432 | "sideWidth": null, 433 | "total": false, 434 | "values": true 435 | }, 436 | "lines": true, 437 | "linewidth": 1, 438 | "nullPointMode": "null", 439 | "options": { 440 | "alertThreshold": true 441 | }, 442 | "percentage": false, 443 | "pluginVersion": "7.4.5", 444 | "pointradius": 2, 445 | "points": false, 446 | "renderer": "flot", 447 | "seriesOverrides": [], 448 | "spaceLength": 10, 449 | "stack": false, 450 | "steppedLine": false, 451 | "targets": [ 452 | { 453 | "expr": "sum(rate(container_network_receive_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)", 454 | "hide": false, 455 | "interval": "", 456 | "legendFormat": "{{name}}", 457 | "refId": "A" 458 | } 459 | ], 460 | "thresholds": [], 461 | "timeFrom": null, 462 | "timeRegions": [], 463 | "timeShift": null, 464 | "title": "Received Network Traffic", 465 | "tooltip": { 466 | "shared": true, 467 | "sort": 0, 468 | "value_type": "individual" 469 | }, 470 | "type": "graph", 471 | "xaxis": { 472 | "buckets": null, 473 | "mode": "time", 474 | "name": null, 475 | "show": true, 476 | "values": [] 477 | }, 478 | "yaxes": [ 479 | { 480 | "$$hashKey": "object:674", 481 | "format": "Bps", 482 | "label": null, 483 | "logBase": 1, 484 | "max": null, 485 | "min": null, 486 | "show": true 487 | }, 488 | { 489 | "$$hashKey": "object:675", 490 | 
"format": "short", 491 | "label": null, 492 | "logBase": 1, 493 | "max": null, 494 | "min": null, 495 | "show": true 496 | } 497 | ], 498 | "yaxis": { 499 | "align": false, 500 | "alignLevel": null 501 | } 502 | }, 503 | { 504 | "aliasColors": {}, 505 | "bars": false, 506 | "dashLength": 10, 507 | "dashes": false, 508 | "datasource": "Prometheus", 509 | "fieldConfig": { 510 | "defaults": { 511 | "custom": {} 512 | }, 513 | "overrides": [] 514 | }, 515 | "fill": 1, 516 | "fillGradient": 0, 517 | "gridPos": { 518 | "h": 8, 519 | "w": 12, 520 | "x": 12, 521 | "y": 18 522 | }, 523 | "hiddenSeries": false, 524 | "id": 6, 525 | "legend": { 526 | "alignAsTable": true, 527 | "avg": true, 528 | "current": false, 529 | "max": true, 530 | "min": false, 531 | "rightSide": true, 532 | "show": true, 533 | "total": false, 534 | "values": true 535 | }, 536 | "lines": true, 537 | "linewidth": 1, 538 | "nullPointMode": "null", 539 | "options": { 540 | "alertThreshold": true 541 | }, 542 | "percentage": false, 543 | "pluginVersion": "7.4.5", 544 | "pointradius": 2, 545 | "points": false, 546 | "renderer": "flot", 547 | "seriesOverrides": [], 548 | "spaceLength": 10, 549 | "stack": false, 550 | "steppedLine": false, 551 | "targets": [ 552 | { 553 | "expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)", 554 | "interval": "", 555 | "legendFormat": "{{name}}", 556 | "refId": "A" 557 | } 558 | ], 559 | "thresholds": [], 560 | "timeFrom": null, 561 | "timeRegions": [], 562 | "timeShift": null, 563 | "title": "Sent Network Traffic", 564 | "tooltip": { 565 | "shared": true, 566 | "sort": 0, 567 | "value_type": "individual" 568 | }, 569 | "type": "graph", 570 | "xaxis": { 571 | "buckets": null, 572 | "mode": "time", 573 | "name": null, 574 | "show": true, 575 | "values": [] 576 | }, 577 | "yaxes": [ 578 | { 579 | "$$hashKey": "object:832", 580 | "format": "Bps", 581 | "label": null, 582 | "logBase": 1, 583 | "max": 
null, 584 | "min": null, 585 | "show": true 586 | }, 587 | { 588 | "$$hashKey": "object:833", 589 | "format": "short", 590 | "label": null, 591 | "logBase": 1, 592 | "max": null, 593 | "min": null, 594 | "show": true 595 | } 596 | ], 597 | "yaxis": { 598 | "align": false, 599 | "alignLevel": null 600 | } 601 | }, 602 | { 603 | "collapsed": false, 604 | "datasource": "Prometheus", 605 | "gridPos": { 606 | "h": 1, 607 | "w": 24, 608 | "x": 0, 609 | "y": 26 610 | }, 611 | "id": 19, 612 | "panels": [], 613 | "title": "Misc", 614 | "type": "row" 615 | }, 616 | { 617 | "datasource": "Prometheus", 618 | "fieldConfig": { 619 | "defaults": { 620 | "custom": { 621 | "align": null, 622 | "filterable": false 623 | }, 624 | "mappings": [], 625 | "thresholds": { 626 | "mode": "absolute", 627 | "steps": [ 628 | { 629 | "color": "green", 630 | "value": null 631 | }, 632 | { 633 | "color": "red", 634 | "value": 80 635 | } 636 | ] 637 | } 638 | }, 639 | "overrides": [ 640 | { 641 | "matcher": { 642 | "id": "byName", 643 | "options": "id" 644 | }, 645 | "properties": [ 646 | { 647 | "id": "custom.width", 648 | "value": 260 649 | } 650 | ] 651 | }, 652 | { 653 | "matcher": { 654 | "id": "byName", 655 | "options": "Running" 656 | }, 657 | "properties": [ 658 | { 659 | "id": "unit", 660 | "value": "d" 661 | }, 662 | { 663 | "id": "decimals", 664 | "value": 1 665 | }, 666 | { 667 | "id": "custom.displayMode", 668 | "value": "color-text" 669 | }, 670 | { 671 | "id": "color", 672 | "value": { 673 | "fixedColor": "dark-green", 674 | "mode": "fixed" 675 | } 676 | } 677 | ] 678 | } 679 | ] 680 | }, 681 | "gridPos": { 682 | "h": 10, 683 | "w": 24, 684 | "x": 0, 685 | "y": 27 686 | }, 687 | "id": 17, 688 | "options": { 689 | "showHeader": true, 690 | "sortBy": [] 691 | }, 692 | "pluginVersion": "7.4.5", 693 | "targets": [ 694 | { 695 | "expr": "(time() - container_start_time_seconds{instance=~\"$host\",name=~\"$container\",name=~\".+\"})/86400", 696 | "format": "table", 697 | "instant": true, 
698 | "interval": "", 699 | "legendFormat": "{{name}}", 700 | "refId": "A" 701 | } 702 | ], 703 | "timeFrom": null, 704 | "timeShift": null, 705 | "title": "Containers Info", 706 | "transformations": [ 707 | { 708 | "id": "filterFieldsByName", 709 | "options": { 710 | "include": { 711 | "names": [ 712 | "container_label_com_docker_compose_project", 713 | "container_label_com_docker_compose_project_working_dir", 714 | "image", 715 | "instance", 716 | "name", 717 | "Value", 718 | "container_label_com_docker_compose_service" 719 | ] 720 | } 721 | } 722 | }, 723 | { 724 | "id": "organize", 725 | "options": { 726 | "excludeByName": {}, 727 | "indexByName": {}, 728 | "renameByName": { 729 | "Value": "Running", 730 | "container_label_com_docker_compose_project": "Label", 731 | "container_label_com_docker_compose_project_working_dir": "Working dir", 732 | "container_label_com_docker_compose_service": "Service", 733 | "image": "Registry Image", 734 | "instance": "Instance", 735 | "name": "Name" 736 | } 737 | } 738 | } 739 | ], 740 | "type": "table" 741 | } 742 | ], 743 | "schemaVersion": 27, 744 | "style": "dark", 745 | "tags": [ 746 | "cadvisor", 747 | "docker" 748 | ], 749 | "templating": { 750 | "list": [ 751 | { 752 | "allValue": ".*", 753 | "current": {}, 754 | "datasource": "Prometheus", 755 | "definition": "label_values({__name__=~\"container.*\"},instance)", 756 | "description": null, 757 | "error": null, 758 | "hide": 0, 759 | "includeAll": true, 760 | "label": "Host", 761 | "multi": false, 762 | "name": "host", 763 | "options": [], 764 | "query": { 765 | "query": "label_values({__name__=~\"container.*\"},instance)", 766 | "refId": "Prometheus-host-Variable-Query" 767 | }, 768 | "refresh": 1, 769 | "regex": "", 770 | "skipUrlSync": false, 771 | "sort": 5, 772 | "tagValuesQuery": "", 773 | "tags": [], 774 | "tagsQuery": "", 775 | "type": "query", 776 | "useTags": false 777 | }, 778 | { 779 | "allValue": ".*", 780 | "current": {}, 781 | "datasource": "Prometheus", 
782 | "definition": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)", 783 | "description": null, 784 | "error": null, 785 | "hide": 0, 786 | "includeAll": true, 787 | "label": "Container", 788 | "multi": false, 789 | "name": "container", 790 | "options": [], 791 | "query": { 792 | "query": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)", 793 | "refId": "Prometheus-container-Variable-Query" 794 | }, 795 | "refresh": 1, 796 | "regex": "", 797 | "skipUrlSync": false, 798 | "sort": 0, 799 | "tagValuesQuery": "", 800 | "tags": [], 801 | "tagsQuery": "", 802 | "type": "query", 803 | "useTags": false 804 | } 805 | ] 806 | }, 807 | "time": { 808 | "from": "now-6h", 809 | "to": "now" 810 | }, 811 | "timepicker": {}, 812 | "timezone": "", 813 | "title": "Cadvisor exporter", 814 | "uid": "pMEd7m0Mz", 815 | "version": 1, 816 | "description": "Simple exporter for cadvisor only" 817 | } -------------------------------------------------------------------------------- /roles/observer/files/grafana/provisioning/dashboards/all.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: Pre-loaded local dashboards 5 | type: file 6 | options: 7 | foldersFromFilesStructure: true 8 | path: /var/lib/grafana/dashboards -------------------------------------------------------------------------------- /roles/observer/files/grafana/provisioning/datasources/all.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: Prometheus 5 | type: prometheus 6 | access: proxy 7 | url: http://192.168.0.1:9090 -------------------------------------------------------------------------------- /roles/observer/files/prometheus_alerts_rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: AllInstances 3 | rules: 4 | - alert: InstanceDown 5 | expr: up == 0 6 
| for: 1m 7 | annotations: 8 | title: 'Instance {{ $labels.instance }} down' 9 | description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.' 10 | labels: 11 | severity: 'critical' -------------------------------------------------------------------------------- /roles/observer/files/prometheus_main.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | 4 | scrape_configs: 5 | - job_name: prometheus 6 | scrape_interval: 30s 7 | static_configs: 8 | - targets: ["localhost:9090"] 9 | 10 | - job_name: node-exporter 11 | scrape_interval: 30s 12 | static_configs: 13 | - targets: ["192.168.0.1:9100", "192.168.0.10:9100", "192.168.0.11:9100"] 14 | 15 | - job_name: cadvisor 16 | scrape_interval: 30s 17 | static_configs: 18 | - targets: ["192.168.0.1:9101", "192.168.0.10:9101", "192.168.0.11:9101"] 19 | 20 | rule_files: 21 | - prometheus_alerts_rules.yml 22 | 23 | alerting: 24 | alertmanagers: 25 | - static_configs: 26 | - targets: 27 | - 192.168.0.1:9093 28 | -------------------------------------------------------------------------------- /roles/observer/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Create Folder /srv/prometheus if not exist 3 | file: 4 | path: /srv/prometheus 5 | mode: 0755 6 | state: directory 7 | 8 | - name: Create Folder /srv/grafana if not exist 9 | file: 10 | path: /srv/grafana 11 | mode: 0755 12 | state: directory 13 | 14 | - name: Create Folder /srv/alertmanager if not exist 15 | file: 16 | path: /srv/alertmanager 17 | mode: 0755 18 | state: directory 19 | 20 | - name: Create prometheus configuration file 21 | copy: 22 | dest: /srv/prometheus/prometheus.yml 23 | src: prometheus_main.yml 24 | mode: 0644 25 | 26 | - name: Create prometheus alert configuration file 27 | copy: 28 | dest: /srv/prometheus/prometheus_alerts_rules.yml 29 | src: prometheus_alerts_rules.yml 30 | mode: 0644 31 | 32 | - 
name: Create grafana configuration files 33 | copy: 34 | dest: /srv/ 35 | src: grafana 36 | mode: 0644 37 | 38 | - name: Create alertmanager configuration file 39 | template: 40 | dest: /srv/alertmanager/alertmanager.yml 41 | src: alertmanager/alertmanager.j2 42 | mode: 0644 43 | 44 | - name: Create Prometheus container 45 | docker_container: 46 | name: prometheus 47 | restart_policy: always 48 | image: prom/prometheus:{{ prometheus_version }} 49 | volumes: 50 | - /srv/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml 51 | - /srv/prometheus/prometheus_alerts_rules.yml:/etc/prometheus/prometheus_alerts_rules.yml 52 | - prometheus_main_data:/prometheus 53 | command: > 54 | --config.file=/etc/prometheus/prometheus.yml 55 | --storage.tsdb.path=/prometheus 56 | --web.console.libraries=/etc/prometheus/console_libraries 57 | --web.console.templates=/etc/prometheus/consoles 58 | --web.enable-lifecycle 59 | published_ports: "9090:9090" 60 | 61 | - name: Create Grafana container 62 | docker_container: 63 | name: grafana 64 | restart_policy: always 65 | image: grafana/grafana:{{ grafana_version }} 66 | volumes: 67 | - grafana-data:/var/lib/grafana 68 | - /srv/grafana/provisioning:/etc/grafana/provisioning 69 | - /srv/grafana/dashboards:/var/lib/grafana/dashboards 70 | env: 71 | GF_AUTH_ANONYMOUS_ENABLED: "true" 72 | GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer" 73 | published_ports: "3000:3000" 74 | 75 | - name: Create Alertmanager container 76 | docker_container: 77 | name: alertmanager 78 | restart_policy: always 79 | image: prom/alertmanager:{{ alertmanager_version }} 80 | volumes: 81 | - alertmanager-data:/data 82 | - /srv/alertmanager:/config 83 | command: > 84 | --config.file=/config/alertmanager.yml 85 | --log.level=debug 86 | published_ports: "9093:9093" 87 | -------------------------------------------------------------------------------- /roles/observer/templates/alertmanager/alertmanager.j2: 
-------------------------------------------------------------------------------- 1 | route: 2 | receiver: "mail" 3 | repeat_interval: 4h 4 | group_by: [ alertname ] 5 | 6 | receivers: 7 | - name: "mail" 8 | email_configs: 9 | - smarthost: "outlook.office365.com:587" 10 | auth_username: "test@padok.fr" 11 | auth_password: "{{ alertmanager_smtp_password }}" 12 | from: "test@padok.fr" 13 | to: "test@padok.fr" -------------------------------------------------------------------------------- /roles/target/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | node_exporter_version: v1.4.0 3 | cadvisor_version: v0.46.0 -------------------------------------------------------------------------------- /roles/target/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Create NodeExporter 3 | docker_container: 4 | name: node-exporter 5 | restart_policy: always 6 | image: prom/node-exporter:{{ node_exporter_version }} 7 | volumes: 8 | - /proc:/host/proc:ro 9 | - /sys:/host/sys:ro 10 | - /:/rootfs:ro 11 | command: > 12 | --path.procfs=/host/proc 13 | --path.rootfs=/rootfs 14 | --path.sysfs=/host/sys 15 | --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/) 16 | published_ports: "9100:9100" 17 | 18 | - name: Create cAdvisor 19 | docker_container: 20 | name: cadvisor 21 | restart_policy: always 22 | image: gcr.io/cadvisor/cadvisor:{{ cadvisor_version }} 23 | volumes: 24 | - /:/rootfs:ro 25 | - /var/run:/var/run:ro 26 | - /sys:/sys:ro 27 | - /var/lib/docker/:/var/lib/docker:ro 28 | - /dev/disk/:/dev/disk:ro 29 | published_ports: "9101:8080" --------------------------------------------------------------------------------