├── README.md ├── alertmanager-example ├── alertmanager-deployment.yaml ├── alertmanager.yml ├── generalrules.yaml ├── prometheus-example.yaml └── prometheus.yml ├── grafana ├── dashboard-capacity.yaml └── datasource.yaml ├── operator ├── alertmanager.yaml ├── grafana │ └── prometheus.yaml ├── prometheusrules.yaml ├── service-prometheus.yaml └── servicemonitor-coredns.yaml ├── prometheus-svc.yaml ├── pushgateway └── pushgateway.yaml ├── redis_prometheus_exporter.yaml ├── storage └── prometheus-example.yaml └── traefik-prom.yaml /README.md: -------------------------------------------------------------------------------- 1 | # prometheus-monitoring-guide -------------------------------------------------------------------------------- /alertmanager-example/alertmanager-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: alertmanager-deployment 5 | labels: 6 | app: alertmanager 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: alertmanager 12 | template: 13 | metadata: 14 | labels: 15 | app: alertmanager 16 | spec: 17 | containers: 18 | - name: alertmanager 19 | image: prom/alertmanager 20 | volumeMounts: 21 | - name: alertmanager-config 22 | mountPath: /etc/alertmanager/alertmanager.yml 23 | subPath: alertmanager.yml 24 | ports: 25 | - containerPort: 9093 26 | volumes: 27 | - name: alertmanager-config 28 | configMap: 29 | name: alertmanager-config 30 | --- 31 | kind: Service 32 | apiVersion: v1 33 | metadata: 34 | name: alertmanager-service 35 | spec: 36 | selector: 37 | app: alertmanager 38 | ports: 39 | - name: alertui 40 | protocol: TCP 41 | port: 9093 42 | targetPort: 9093 43 | -------------------------------------------------------------------------------- /alertmanager-example/alertmanager.yml: -------------------------------------------------------------------------------- 1 | global: 2 | resolve_timeout: 5m 3 | route: 4 | 
group_by: ['alertname'] 5 | group_wait: 10s 6 | group_interval: 10s 7 | repeat_interval: 1h 8 | receiver: 'sysdig-test' 9 | receivers: 10 | - name: 'sysdig-test' 11 | webhook_configs: 12 | - url: 'your url here' 13 | -------------------------------------------------------------------------------- /alertmanager-example/generalrules.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: example 3 | rules: 4 | - alert: DeadMansSwitch 5 | annotations: 6 | description: This is a DeadMansSwitch meant to ensure that the entire Alerting 7 | pipeline is functional. 8 | summary: Alerting DeadMansSwitch 9 | expr: vector(1) 10 | labels: 11 | severity: none 12 | 13 | -------------------------------------------------------------------------------- /alertmanager-example/prometheus-example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: prometheus-deployment 5 | labels: 6 | app: prometheus 7 | purpose: example 8 | spec: 9 | replicas: 2 10 | selector: 11 | matchLabels: 12 | app: prometheus 13 | purpose: example 14 | template: 15 | metadata: 16 | labels: 17 | app: prometheus 18 | purpose: example 19 | spec: 20 | containers: 21 | - name: prometheus-example 22 | image: prom/prometheus 23 | volumeMounts: 24 | - name: config-volume 25 | mountPath: /etc/prometheus/prometheus.yml 26 | subPath: prometheus.yml 27 | - name: rules-general 28 | mountPath: /etc/prometheus/rules/generalrules.yaml 29 | subPath: generalrules.yaml 30 | ports: 31 | - containerPort: 9090 32 | volumes: 33 | - name: config-volume 34 | configMap: 35 | name: prometheus-example-cm 36 | - name: rules-general 37 | configMap: 38 | name: prometheus-rules-general 39 | 40 | --- 41 | kind: Service 42 | apiVersion: v1 43 | metadata: 44 | name: prometheus-example-service 45 | spec: 46 | selector: 47 | app: prometheus 48 | purpose: example 49 | ports: 50 | - name: promui 
51 | protocol: TCP 52 | port: 9090 53 | targetPort: 9090 54 | -------------------------------------------------------------------------------- /alertmanager-example/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | external_labels: 4 | monitor: 'sysdig-prometheus' 5 | scrape_configs: 6 | - job_name: 'prometheus' 7 | scrape_interval: 5s 8 | static_configs: 9 | - targets: ['localhost:9090'] 10 | rule_files: 11 | - /etc/prometheus/rules/*.yaml 12 | alerting: 13 | alertmanagers: 14 | - scheme: http 15 | static_configs: 16 | - targets: 17 | - "alertmanager-service:9093" 18 | 19 | -------------------------------------------------------------------------------- /grafana/dashboard-capacity.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | kube-capacity-planning.json: | 4 | { 5 | "annotations": { 6 | "list": [ 7 | { 8 | "builtIn": 1, 9 | "datasource": "-- Grafana --", 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "type": "dashboard" 15 | } 16 | ] 17 | }, 18 | "editable": true, 19 | "gnetId": 22, 20 | "graphTooltip": 0, 21 | "id": 2, 22 | "links": [], 23 | "panels": [ 24 | { 25 | "aliasColors": {}, 26 | "bars": false, 27 | "dashLength": 10, 28 | "dashes": false, 29 | "datasource": "Prometheus-server", 30 | "editable": true, 31 | "error": false, 32 | "fill": 1, 33 | "grid": { 34 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 35 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 36 | }, 37 | "gridPos": { 38 | "h": 7, 39 | "w": 12, 40 | "x": 0, 41 | "y": 0 42 | }, 43 | "id": 3, 44 | "isNew": false, 45 | "legend": { 46 | "alignAsTable": false, 47 | "avg": false, 48 | "current": false, 49 | "hideEmpty": false, 50 | "hideZero": false, 51 | "max": false, 52 | "min": false, 53 | "rightSide": false, 54 | "show": true, 55 | "total": false, 56 | 
"values": false 57 | }, 58 | "lines": true, 59 | "linewidth": 2, 60 | "links": [], 61 | "nullPointMode": "connected", 62 | "percentage": false, 63 | "pointradius": 5, 64 | "points": false, 65 | "renderer": "flot", 66 | "seriesOverrides": [], 67 | "spaceLength": 10, 68 | "stack": false, 69 | "steppedLine": false, 70 | "targets": [ 71 | { 72 | "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", 73 | "hide": false, 74 | "intervalFactor": 10, 75 | "legendFormat": "", 76 | "refId": "A", 77 | "step": 50 78 | } 79 | ], 80 | "thresholds": [], 81 | "timeFrom": null, 82 | "timeShift": null, 83 | "title": "Idle CPU", 84 | "tooltip": { 85 | "msResolution": false, 86 | "shared": true, 87 | "sort": 0, 88 | "value_type": "cumulative" 89 | }, 90 | "type": "graph", 91 | "xaxis": { 92 | "buckets": null, 93 | "mode": "time", 94 | "name": null, 95 | "show": true, 96 | "values": [] 97 | }, 98 | "yaxes": [ 99 | { 100 | "format": "percent", 101 | "label": "cpu usage", 102 | "logBase": 1, 103 | "min": 0, 104 | "show": true 105 | }, 106 | { 107 | "format": "short", 108 | "logBase": 1, 109 | "show": true 110 | } 111 | ] 112 | }, 113 | { 114 | "aliasColors": {}, 115 | "bars": false, 116 | "dashLength": 10, 117 | "dashes": false, 118 | "datasource": "Prometheus-server", 119 | "editable": true, 120 | "error": false, 121 | "fill": 1, 122 | "grid": { 123 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 124 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 125 | }, 126 | "gridPos": { 127 | "h": 7, 128 | "w": 12, 129 | "x": 12, 130 | "y": 0 131 | }, 132 | "id": 9, 133 | "isNew": false, 134 | "legend": { 135 | "alignAsTable": false, 136 | "avg": false, 137 | "current": false, 138 | "hideEmpty": false, 139 | "hideZero": false, 140 | "max": false, 141 | "min": false, 142 | "rightSide": false, 143 | "show": true, 144 | "total": false, 145 | "values": false 146 | }, 147 | "lines": true, 148 | "linewidth": 2, 149 | "links": [], 150 | "nullPointMode": "connected", 151 | "percentage": false, 152 | 
"pointradius": 5, 153 | "points": false, 154 | "renderer": "flot", 155 | "seriesOverrides": [], 156 | "spaceLength": 10, 157 | "stack": false, 158 | "steppedLine": false, 159 | "targets": [ 160 | { 161 | "expr": "sum(node_load1)", 162 | "intervalFactor": 4, 163 | "legendFormat": "load 1m", 164 | "refId": "A", 165 | "step": 20, 166 | "target": "" 167 | }, 168 | { 169 | "expr": "sum(node_load5)", 170 | "intervalFactor": 4, 171 | "legendFormat": "load 5m", 172 | "refId": "B", 173 | "step": 20, 174 | "target": "" 175 | }, 176 | { 177 | "expr": "sum(node_load15)", 178 | "intervalFactor": 4, 179 | "legendFormat": "load 15m", 180 | "refId": "C", 181 | "step": 20, 182 | "target": "" 183 | } 184 | ], 185 | "thresholds": [], 186 | "timeFrom": null, 187 | "timeShift": null, 188 | "title": "System Load", 189 | "tooltip": { 190 | "msResolution": false, 191 | "shared": true, 192 | "sort": 0, 193 | "value_type": "cumulative" 194 | }, 195 | "type": "graph", 196 | "xaxis": { 197 | "buckets": null, 198 | "mode": "time", 199 | "name": null, 200 | "show": true, 201 | "values": [] 202 | }, 203 | "yaxes": [ 204 | { 205 | "format": "percentunit", 206 | "logBase": 1, 207 | "show": true 208 | }, 209 | { 210 | "format": "short", 211 | "logBase": 1, 212 | "show": true 213 | } 214 | ] 215 | }, 216 | { 217 | "aliasColors": {}, 218 | "bars": false, 219 | "dashLength": 10, 220 | "dashes": false, 221 | "datasource": "Prometheus-server", 222 | "editable": true, 223 | "error": false, 224 | "fill": 1, 225 | "grid": { 226 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 227 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 228 | }, 229 | "gridPos": { 230 | "h": 7, 231 | "w": 18, 232 | "x": 0, 233 | "y": 7 234 | }, 235 | "id": 4, 236 | "isNew": false, 237 | "legend": { 238 | "alignAsTable": false, 239 | "avg": false, 240 | "current": false, 241 | "hideEmpty": false, 242 | "hideZero": false, 243 | "max": false, 244 | "min": false, 245 | "rightSide": false, 246 | "show": true, 247 | "total": false, 248 | 
"values": false 249 | }, 250 | "lines": true, 251 | "linewidth": 2, 252 | "links": [], 253 | "nullPointMode": "connected", 254 | "percentage": false, 255 | "pointradius": 5, 256 | "points": false, 257 | "renderer": "flot", 258 | "seriesOverrides": [ 259 | { 260 | "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", 261 | "yaxis": 2 262 | } 263 | ], 264 | "spaceLength": 10, 265 | "stack": true, 266 | "steppedLine": false, 267 | "targets": [ 268 | { 269 | "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", 270 | "intervalFactor": 2, 271 | "legendFormat": "memory usage", 272 | "metric": "memo", 273 | "refId": "A", 274 | "step": 10, 275 | "target": "" 276 | }, 277 | { 278 | "expr": "sum(node_memory_Buffers)", 279 | "interval": "", 280 | "intervalFactor": 2, 281 | "legendFormat": "memory buffers", 282 | "metric": "memo", 283 | "refId": "B", 284 | "step": 10, 285 | "target": "" 286 | }, 287 | { 288 | "expr": "sum(node_memory_Cached)", 289 | "interval": "", 290 | "intervalFactor": 2, 291 | "legendFormat": "memory cached", 292 | "metric": "memo", 293 | "refId": "C", 294 | "step": 10, 295 | "target": "" 296 | }, 297 | { 298 | "expr": "sum(node_memory_MemFree)", 299 | "interval": "", 300 | "intervalFactor": 2, 301 | "legendFormat": "memory free", 302 | "metric": "memo", 303 | "refId": "D", 304 | "step": 10, 305 | "target": "" 306 | } 307 | ], 308 | "thresholds": [], 309 | "timeFrom": null, 310 | "timeShift": null, 311 | "title": "Memory Usage", 312 | "tooltip": { 313 | "msResolution": false, 314 | "shared": true, 315 | "sort": 0, 316 | "value_type": "individual" 317 | }, 318 | "type": "graph", 319 | "xaxis": { 320 | "buckets": null, 321 | "mode": "time", 322 | "name": null, 323 | "show": true, 324 | "values": [] 325 | }, 326 | "yaxes": [ 327 | { 328 | "format": "bytes", 329 | "logBase": 1, 330 | "min": "0", 331 | "show": true 332 | }, 333 | { 334 | "format": "short", 335 | 
"logBase": 1, 336 | "show": true 337 | } 338 | ] 339 | }, 340 | { 341 | "cacheTimeout": null, 342 | "colorBackground": false, 343 | "colorValue": false, 344 | "colors": [ 345 | "rgba(50, 172, 45, 0.97)", 346 | "rgba(237, 129, 40, 0.89)", 347 | "rgba(245, 54, 54, 0.9)" 348 | ], 349 | "datasource": "Prometheus-server", 350 | "editable": true, 351 | "format": "percent", 352 | "gauge": { 353 | "maxValue": 100, 354 | "minValue": 0, 355 | "show": true, 356 | "thresholdLabels": false, 357 | "thresholdMarkers": true 358 | }, 359 | "gridPos": { 360 | "h": 7, 361 | "w": 6, 362 | "x": 18, 363 | "y": 7 364 | }, 365 | "hideTimeOverride": false, 366 | "id": 5, 367 | "interval": null, 368 | "links": [], 369 | "mappingType": 1, 370 | "mappingTypes": [ 371 | { 372 | "name": "value to text", 373 | "value": 1 374 | }, 375 | { 376 | "name": "range to text", 377 | "value": 2 378 | } 379 | ], 380 | "maxDataPoints": 100, 381 | "nullPointMode": "connected", 382 | "nullText": null, 383 | "postfix": "", 384 | "postfixFontSize": "50%", 385 | "prefix": "", 386 | "prefixFontSize": "50%", 387 | "rangeMaps": [ 388 | { 389 | "from": "null", 390 | "text": "N/A", 391 | "to": "null" 392 | } 393 | ], 394 | "sparkline": { 395 | "fillColor": "rgba(31, 118, 189, 0.18)", 396 | "full": false, 397 | "lineColor": "rgb(31, 120, 193)", 398 | "show": false 399 | }, 400 | "tableColumn": "", 401 | "targets": [ 402 | { 403 | "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", 404 | "intervalFactor": 2, 405 | "metric": "", 406 | "refId": "A", 407 | "step": 60, 408 | "target": "" 409 | } 410 | ], 411 | "thresholds": "80, 90", 412 | "title": "Memory Usage", 413 | "transparent": false, 414 | "type": "singlestat", 415 | "valueFontSize": "80%", 416 | "valueMaps": [ 417 | { 418 | "op": "=", 419 | "text": "N/A", 420 | "value": "null" 421 | } 422 | ], 423 | "valueName": "avg" 424 | }, 425 | { 426 | "aliasColors": {}, 427 
| "bars": false, 428 | "dashLength": 10, 429 | "dashes": false, 430 | "datasource": "Prometheus-server", 431 | "editable": true, 432 | "error": false, 433 | "fill": 1, 434 | "grid": { 435 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 436 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 437 | }, 438 | "gridPos": { 439 | "h": 7, 440 | "w": 18, 441 | "x": 0, 442 | "y": 14 443 | }, 444 | "id": 6, 445 | "isNew": false, 446 | "legend": { 447 | "alignAsTable": false, 448 | "avg": false, 449 | "current": false, 450 | "hideEmpty": false, 451 | "hideZero": false, 452 | "max": false, 453 | "min": false, 454 | "rightSide": false, 455 | "show": true, 456 | "total": false, 457 | "values": false 458 | }, 459 | "lines": true, 460 | "linewidth": 2, 461 | "links": [], 462 | "nullPointMode": "connected", 463 | "percentage": false, 464 | "pointradius": 5, 465 | "points": false, 466 | "renderer": "flot", 467 | "seriesOverrides": [ 468 | { 469 | "alias": "read", 470 | "yaxis": 1 471 | }, 472 | { 473 | "alias": "{instance=\"172.17.0.1:9100\"}", 474 | "yaxis": 2 475 | }, 476 | { 477 | "alias": "io time", 478 | "yaxis": 2 479 | } 480 | ], 481 | "spaceLength": 10, 482 | "stack": false, 483 | "steppedLine": false, 484 | "targets": [ 485 | { 486 | "expr": "sum(rate(node_disk_bytes_read[5m]))", 487 | "hide": false, 488 | "intervalFactor": 4, 489 | "legendFormat": "read", 490 | "refId": "A", 491 | "step": 20, 492 | "target": "" 493 | }, 494 | { 495 | "expr": "sum(rate(node_disk_bytes_written[5m]))", 496 | "intervalFactor": 4, 497 | "legendFormat": "written", 498 | "refId": "B", 499 | "step": 20 500 | }, 501 | { 502 | "expr": "sum(rate(node_disk_io_time_ms[5m]))", 503 | "intervalFactor": 4, 504 | "legendFormat": "io time", 505 | "refId": "C", 506 | "step": 20 507 | } 508 | ], 509 | "thresholds": [], 510 | "timeFrom": null, 511 | "timeShift": null, 512 | "title": "Disk I/O", 513 | "tooltip": { 514 | "msResolution": false, 515 | "shared": true, 516 | "sort": 0, 517 | "value_type": "cumulative" 
518 | }, 519 | "type": "graph", 520 | "xaxis": { 521 | "buckets": null, 522 | "mode": "time", 523 | "name": null, 524 | "show": true, 525 | "values": [] 526 | }, 527 | "yaxes": [ 528 | { 529 | "format": "bytes", 530 | "logBase": 1, 531 | "show": true 532 | }, 533 | { 534 | "format": "ms", 535 | "logBase": 1, 536 | "show": true 537 | } 538 | ] 539 | }, 540 | { 541 | "cacheTimeout": null, 542 | "colorBackground": false, 543 | "colorValue": false, 544 | "colors": [ 545 | "rgba(50, 172, 45, 0.97)", 546 | "rgba(237, 129, 40, 0.89)", 547 | "rgba(245, 54, 54, 0.9)" 548 | ], 549 | "datasource": "Prometheus-server", 550 | "editable": true, 551 | "format": "percentunit", 552 | "gauge": { 553 | "maxValue": 1, 554 | "minValue": 0, 555 | "show": true, 556 | "thresholdLabels": false, 557 | "thresholdMarkers": true 558 | }, 559 | "gridPos": { 560 | "h": 7, 561 | "w": 6, 562 | "x": 18, 563 | "y": 14 564 | }, 565 | "hideTimeOverride": false, 566 | "id": 12, 567 | "interval": null, 568 | "links": [], 569 | "mappingType": 1, 570 | "mappingTypes": [ 571 | { 572 | "name": "value to text", 573 | "value": 1 574 | }, 575 | { 576 | "name": "range to text", 577 | "value": 2 578 | } 579 | ], 580 | "maxDataPoints": 100, 581 | "nullPointMode": "connected", 582 | "nullText": null, 583 | "postfix": "", 584 | "postfixFontSize": "50%", 585 | "prefix": "", 586 | "prefixFontSize": "50%", 587 | "rangeMaps": [ 588 | { 589 | "from": "null", 590 | "text": "N/A", 591 | "to": "null" 592 | } 593 | ], 594 | "sparkline": { 595 | "fillColor": "rgba(31, 118, 189, 0.18)", 596 | "full": false, 597 | "lineColor": "rgb(31, 120, 193)", 598 | "show": false 599 | }, 600 | "tableColumn": "", 601 | "targets": [ 602 | { 603 | "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", 604 | "intervalFactor": 2, 605 | "refId": "A", 606 | "step": 60, 607 | "target": "" 608 | } 609 | ], 610 | "thresholds": "0.75, 0.9", 611 | 
"title": "Disk Space Usage", 612 | "transparent": false, 613 | "type": "singlestat", 614 | "valueFontSize": "80%", 615 | "valueMaps": [ 616 | { 617 | "op": "=", 618 | "text": "N/A", 619 | "value": "null" 620 | } 621 | ], 622 | "valueName": "current" 623 | }, 624 | { 625 | "aliasColors": {}, 626 | "bars": false, 627 | "dashLength": 10, 628 | "dashes": false, 629 | "datasource": "Prometheus-server", 630 | "editable": true, 631 | "error": false, 632 | "fill": 1, 633 | "grid": { 634 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 635 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 636 | }, 637 | "gridPos": { 638 | "h": 7, 639 | "w": 12, 640 | "x": 0, 641 | "y": 21 642 | }, 643 | "id": 8, 644 | "isNew": false, 645 | "legend": { 646 | "alignAsTable": false, 647 | "avg": false, 648 | "current": false, 649 | "hideEmpty": false, 650 | "hideZero": false, 651 | "max": false, 652 | "min": false, 653 | "rightSide": false, 654 | "show": true, 655 | "total": false, 656 | "values": false 657 | }, 658 | "lines": true, 659 | "linewidth": 2, 660 | "links": [], 661 | "nullPointMode": "connected", 662 | "percentage": false, 663 | "pointradius": 5, 664 | "points": false, 665 | "renderer": "flot", 666 | "seriesOverrides": [ 667 | { 668 | "alias": "transmitted", 669 | "yaxis": 2 670 | } 671 | ], 672 | "spaceLength": 10, 673 | "stack": false, 674 | "steppedLine": false, 675 | "targets": [ 676 | { 677 | "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", 678 | "hide": false, 679 | "intervalFactor": 2, 680 | "legendFormat": "", 681 | "refId": "A", 682 | "step": 10, 683 | "target": "" 684 | } 685 | ], 686 | "thresholds": [], 687 | "timeFrom": null, 688 | "timeShift": null, 689 | "title": "Network Received", 690 | "tooltip": { 691 | "msResolution": false, 692 | "shared": true, 693 | "sort": 0, 694 | "value_type": "cumulative" 695 | }, 696 | "type": "graph", 697 | "xaxis": { 698 | "buckets": null, 699 | "mode": "time", 700 | "name": null, 701 | "show": true, 702 | "values": 
[] 703 | }, 704 | "yaxes": [ 705 | { 706 | "format": "bytes", 707 | "logBase": 1, 708 | "show": true 709 | }, 710 | { 711 | "format": "bytes", 712 | "logBase": 1, 713 | "show": true 714 | } 715 | ] 716 | }, 717 | { 718 | "aliasColors": {}, 719 | "bars": false, 720 | "dashLength": 10, 721 | "dashes": false, 722 | "datasource": "Prometheus-server", 723 | "editable": true, 724 | "error": false, 725 | "fill": 1, 726 | "grid": { 727 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 728 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 729 | }, 730 | "gridPos": { 731 | "h": 7, 732 | "w": 12, 733 | "x": 12, 734 | "y": 21 735 | }, 736 | "id": 10, 737 | "isNew": false, 738 | "legend": { 739 | "alignAsTable": false, 740 | "avg": false, 741 | "current": false, 742 | "hideEmpty": false, 743 | "hideZero": false, 744 | "max": false, 745 | "min": false, 746 | "rightSide": false, 747 | "show": true, 748 | "total": false, 749 | "values": false 750 | }, 751 | "lines": true, 752 | "linewidth": 2, 753 | "links": [], 754 | "nullPointMode": "connected", 755 | "percentage": false, 756 | "pointradius": 5, 757 | "points": false, 758 | "renderer": "flot", 759 | "seriesOverrides": [ 760 | { 761 | "alias": "transmitted", 762 | "yaxis": 2 763 | } 764 | ], 765 | "spaceLength": 10, 766 | "stack": false, 767 | "steppedLine": false, 768 | "targets": [ 769 | { 770 | "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", 771 | "hide": false, 772 | "intervalFactor": 2, 773 | "legendFormat": "", 774 | "refId": "B", 775 | "step": 10, 776 | "target": "" 777 | } 778 | ], 779 | "thresholds": [], 780 | "timeFrom": null, 781 | "timeShift": null, 782 | "title": "Network Transmitted", 783 | "tooltip": { 784 | "msResolution": false, 785 | "shared": true, 786 | "sort": 0, 787 | "value_type": "cumulative" 788 | }, 789 | "type": "graph", 790 | "xaxis": { 791 | "buckets": null, 792 | "mode": "time", 793 | "name": null, 794 | "show": true, 795 | "values": [] 796 | }, 797 | "yaxes": [ 798 | { 799 | 
"format": "bytes", 800 | "logBase": 1, 801 | "show": true 802 | }, 803 | { 804 | "format": "bytes", 805 | "logBase": 1, 806 | "show": true 807 | } 808 | ] 809 | }, 810 | { 811 | "aliasColors": {}, 812 | "bars": false, 813 | "dashLength": 10, 814 | "dashes": false, 815 | "datasource": "Prometheus-server", 816 | "editable": true, 817 | "error": false, 818 | "fill": 1, 819 | "grid": { 820 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 821 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 822 | }, 823 | "gridPos": { 824 | "h": 7, 825 | "w": 18, 826 | "x": 0, 827 | "y": 28 828 | }, 829 | "id": 11, 830 | "isNew": true, 831 | "legend": { 832 | "alignAsTable": false, 833 | "avg": false, 834 | "current": false, 835 | "hideEmpty": false, 836 | "hideZero": false, 837 | "max": false, 838 | "min": false, 839 | "rightSide": false, 840 | "show": true, 841 | "total": false, 842 | "values": false 843 | }, 844 | "lines": true, 845 | "linewidth": 2, 846 | "links": [], 847 | "nullPointMode": "connected", 848 | "percentage": false, 849 | "pointradius": 5, 850 | "points": false, 851 | "renderer": "flot", 852 | "seriesOverrides": [], 853 | "spaceLength": 11, 854 | "stack": false, 855 | "steppedLine": false, 856 | "targets": [ 857 | { 858 | "expr": "sum(kube_pod_info)", 859 | "format": "time_series", 860 | "intervalFactor": 2, 861 | "legendFormat": "Current number of Pods", 862 | "refId": "A", 863 | "step": 10 864 | }, 865 | { 866 | "expr": "sum(kube_node_status_capacity_pods)", 867 | "format": "time_series", 868 | "intervalFactor": 2, 869 | "legendFormat": "Maximum capacity of pods", 870 | "refId": "B", 871 | "step": 10 872 | } 873 | ], 874 | "thresholds": [], 875 | "timeFrom": null, 876 | "timeShift": null, 877 | "title": "Cluster Pod Utilization", 878 | "tooltip": { 879 | "msResolution": false, 880 | "shared": true, 881 | "sort": 0, 882 | "value_type": "individual" 883 | }, 884 | "type": "graph", 885 | "xaxis": { 886 | "buckets": null, 887 | "mode": "time", 888 | "name": null, 889 | 
"show": true, 890 | "values": [] 891 | }, 892 | "yaxes": [ 893 | { 894 | "format": "short", 895 | "logBase": 1, 896 | "show": true 897 | }, 898 | { 899 | "format": "short", 900 | "logBase": 1, 901 | "show": true 902 | } 903 | ] 904 | }, 905 | { 906 | "cacheTimeout": null, 907 | "colorBackground": false, 908 | "colorValue": false, 909 | "colors": [ 910 | "rgba(50, 172, 45, 0.97)", 911 | "rgba(237, 129, 40, 0.89)", 912 | "rgba(245, 54, 54, 0.9)" 913 | ], 914 | "datasource": "Prometheus-server", 915 | "editable": true, 916 | "format": "percent", 917 | "gauge": { 918 | "maxValue": 100, 919 | "minValue": 0, 920 | "show": true, 921 | "thresholdLabels": false, 922 | "thresholdMarkers": true 923 | }, 924 | "gridPos": { 925 | "h": 7, 926 | "w": 6, 927 | "x": 18, 928 | "y": 28 929 | }, 930 | "hideTimeOverride": false, 931 | "id": 7, 932 | "interval": null, 933 | "links": [], 934 | "mappingType": 1, 935 | "mappingTypes": [ 936 | { 937 | "name": "value to text", 938 | "value": 1 939 | }, 940 | { 941 | "name": "range to text", 942 | "value": 2 943 | } 944 | ], 945 | "maxDataPoints": 100, 946 | "nullPointMode": "connected", 947 | "nullText": null, 948 | "postfix": "", 949 | "postfixFontSize": "50%", 950 | "prefix": "", 951 | "prefixFontSize": "50%", 952 | "rangeMaps": [ 953 | { 954 | "from": "null", 955 | "text": "N/A", 956 | "to": "null" 957 | } 958 | ], 959 | "sparkline": { 960 | "fillColor": "rgba(31, 118, 189, 0.18)", 961 | "full": false, 962 | "lineColor": "rgb(31, 120, 193)", 963 | "show": false 964 | }, 965 | "tableColumn": "", 966 | "targets": [ 967 | { 968 | "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", 969 | "format": "time_series", 970 | "intervalFactor": 2, 971 | "legendFormat": "", 972 | "refId": "A", 973 | "step": 60, 974 | "target": "" 975 | } 976 | ], 977 | "thresholds": "80, 90", 978 | "title": "Pod Utilization", 979 | "transparent": false, 980 | "type": "singlestat", 981 | 
"valueFontSize": "80%", 982 | "valueMaps": [ 983 | { 984 | "op": "=", 985 | "text": "N/A", 986 | "value": "null" 987 | } 988 | ], 989 | "valueName": "current" 990 | } 991 | ], 992 | "refresh": false, 993 | "schemaVersion": 16, 994 | "style": "dark", 995 | "tags": [], 996 | "templating": { 997 | "list": [] 998 | }, 999 | "time": { 1000 | "from": "now-1h", 1001 | "to": "now" 1002 | }, 1003 | "timepicker": { 1004 | "refresh_intervals": [ 1005 | "5s", 1006 | "10s", 1007 | "30s", 1008 | "1m", 1009 | "5m", 1010 | "15m", 1011 | "30m", 1012 | "1h", 1013 | "2h", 1014 | "1d" 1015 | ], 1016 | "time_options": [ 1017 | "5m", 1018 | "15m", 1019 | "1h", 1020 | "6h", 1021 | "12h", 1022 | "24h", 1023 | "2d", 1024 | "7d", 1025 | "30d" 1026 | ] 1027 | }, 1028 | "timezone": "browser", 1029 | "title": "Kubernetes Capacity Planning", 1030 | "uid": "MpzrdxIik", 1031 | "version": 2 1032 | } 1033 | kind: ConfigMap 1034 | metadata: 1035 | labels: 1036 | grafana_dashboard: "true" 1037 | name: dashboard-k8s-capacity 1038 | -------------------------------------------------------------------------------- /grafana/datasource.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: sample-grafana-datasource 5 | labels: 6 | grafana_datasource: "true" 7 | data: 8 | datasource.yaml: |- 9 | apiVersion: 1 10 | datasources: 11 | - name: Prometheus-server 12 | type: prometheus 13 | access: proxy 14 | orgId: 1 15 | url: http://prometheus-example-service:9090 16 | isDefault: true 17 | -------------------------------------------------------------------------------- /operator/alertmanager.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Alertmanager 3 | metadata: 4 | labels: 5 | alertmanager: main 6 | name: main 7 | namespace: monitoring 8 | spec: 9 | baseImage: quay.io/prometheus/alertmanager 10 | nodeSelector: 11 | 
beta.kubernetes.io/os: linux 12 | replicas: 1 13 | serviceAccountName: alertmanager-main 14 | version: v0.15.2 15 | -------------------------------------------------------------------------------- /operator/grafana/prometheus.yaml: -------------------------------------------------------------------------------- 1 | { 2 | "apiVersion": 1, 3 | "datasources": [ 4 | { 5 | "access": "proxy", 6 | "editable": false, 7 | "name": "prometheus", 8 | "orgId": 1, 9 | "type": "prometheus", 10 | "url": "http://prometheus-k8s.monitoring.svc:9090", 11 | "version": 1 12 | }, 13 | { 14 | "access": "proxy", 15 | "editable": false, 16 | "name": "service-prometheus", 17 | "orgId": 1, 18 | "type": "prometheus", 19 | "url": "http://service-prometheus.monitoring.svc:9090", 20 | "version": 1 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /operator/prometheusrules.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | prometheus: service-prometheus 6 | role: alert-rules 7 | name: prometheus-service-rules 8 | namespace: monitoring 9 | spec: 10 | groups: 11 | - name: general.rules 12 | rules: 13 | - alert: TargetDown-serviceprom 14 | annotations: 15 | description: '{{ $value }}% of {{ $labels.job }} targets are down.' 16 | summary: Targets are down 17 | expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 18 | for: 10m 19 | labels: 20 | severity: warning 21 | - alert: DeadMansSwitch-serviceprom 22 | annotations: 23 | description: This is a DeadMansSwitch meant to ensure that the entire Alerting 24 | pipeline is functional. 
25 | summary: Alerting DeadMansSwitch 26 | expr: vector(1) 27 | labels: 28 | severity: none 29 | -------------------------------------------------------------------------------- /operator/service-prometheus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Prometheus 3 | metadata: 4 | labels: 5 | app: prometheus 6 | prometheus: service-prometheus 7 | name: service-prometheus 8 | namespace: monitoring 9 | spec: 10 | alerting: 11 | alertmanagers: 12 | - name: alertmanager-main 13 | namespace: monitoring 14 | port: web 15 | baseImage: quay.io/prometheus/prometheus 16 | logLevel: info 17 | paused: false 18 | replicas: 2 19 | retention: 2d 20 | routePrefix: / 21 | ruleSelector: 22 | matchLabels: 23 | prometheus: service-prometheus 24 | role: alert-rules 25 | serviceAccountName: prometheus-k8s 26 | serviceMonitorSelector: 27 | matchExpressions: 28 | - key: serviceapp 29 | operator: Exists 30 | 31 | -------------------------------------------------------------------------------- /operator/servicemonitor-coredns.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | labels: 5 | serviceapp: coredns-servicemonitor 6 | name: coredns-servicemonitor 7 | namespace: monitoring 8 | spec: 9 | endpoints: 10 | - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 11 | interval: 15s 12 | port: metrics 13 | namespaceSelector: 14 | matchNames: 15 | - coredns 16 | selector: 17 | matchLabels: 18 | release: coredns 19 | 20 | -------------------------------------------------------------------------------- /prometheus-svc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | prometheus: service-prometheus 6 | name: service-prometheus 7 | namespace: monitoring 8 | spec: 9 | 
ports: 10 | - name: web 11 | port: 9090 12 | protocol: TCP 13 | targetPort: web 14 | selector: 15 | app: prometheus 16 | prometheus: service-prometheus 17 | -------------------------------------------------------------------------------- /pushgateway/pushgateway.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: pushgateway-deployment 5 | labels: 6 | app: pushgateway 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: pushgateway 12 | template: 13 | metadata: 14 | labels: 15 | app: pushgateway 16 | spec: 17 | containers: 18 | - name: pushgateway 19 | image: prom/pushgateway 20 | ports: 21 | - containerPort: 9091 22 | --- 23 | kind: Service 24 | apiVersion: v1 25 | metadata: 26 | name: pushgateway-service 27 | spec: 28 | selector: 29 | app: pushgateway 30 | ports: 31 | - name: pushgateway 32 | protocol: TCP 33 | port: 9091 34 | targetPort: 9091 35 | -------------------------------------------------------------------------------- /redis_prometheus_exporter.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: redis 5 | spec: 6 | replicas: 1 7 | # apps/v1 requires an explicit selector that matches the pod template labels. 8 | selector: 9 | matchLabels: 10 | app: redis 11 | template: 12 | metadata: 13 | annotations: 14 | prometheus.io/scrape: "true" 15 | prometheus.io/port: "9121" 16 | labels: 17 | app: redis 18 | spec: 19 | containers: 20 | - name: redis 21 | image: redis:4 22 | resources: 23 | requests: 24 | cpu: 100m 25 | memory: 100Mi 26 | ports: 27 | - containerPort: 6379 28 | - name: redis-exporter 29 | image: oliver006/redis_exporter:latest 30 | resources: 31 | requests: 32 | cpu: 100m 33 | memory: 100Mi 34 | ports: 35 | - containerPort: 9121 36 | --- 37 | kind: Service 38 | apiVersion: v1 39 | metadata: 40 | name: redis 41 | spec: 42 | selector: 43 | app: redis 44 | ports: 45 | - name: redis 46 | protocol: TCP 47 | port: 6379 48 | targetPort: 6379 49 | - name: 
prom 50 | protocol: TCP 51 | port: 9121 52 | targetPort: 9121 53 | -------------------------------------------------------------------------------- /storage/prometheus-example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: StatefulSet 3 | metadata: 4 | name: prometheus-deployment 5 | labels: 6 | app: prometheus 7 | purpose: example 8 | spec: 9 | serviceName: prometheus 10 | replicas: 2 11 | selector: 12 | matchLabels: 13 | app: prometheus 14 | purpose: example 15 | volumeClaimTemplates: 16 | - metadata: 17 | name: prometheus-metrics-db 18 | spec: 19 | accessModes: 20 | - ReadWriteOnce 21 | resources: 22 | requests: 23 | storage: 50Gi 24 | template: 25 | metadata: 26 | labels: 27 | app: prometheus 28 | purpose: example 29 | spec: 30 | securityContext: 31 | runAsUser: 1000 32 | fsGroup: 2000 33 | runAsNonRoot: true 34 | containers: 35 | - args: 36 | - --storage.tsdb.path=/data 37 | # --storage.tsdb.retention is deprecated and rejected by Prometheus 3.x; use the .time form. 38 | - --storage.tsdb.retention.time=400d 39 | - --config.file=/etc/prometheus/prometheus.yml 40 | name: prometheus-example 41 | image: prom/prometheus 42 | volumeMounts: 43 | - name: config-volume 44 | mountPath: /etc/prometheus/prometheus.yml 45 | subPath: prometheus.yml 46 | - name: rules-general 47 | mountPath: /etc/prometheus/rules/generalrules.yaml 48 | subPath: generalrules.yaml 49 | - name: prometheus-metrics-db 50 | mountPath: /data 51 | ports: 52 | - containerPort: 9090 53 | volumes: 54 | - name: config-volume 55 | configMap: 56 | name: prometheus-example-cm 57 | - name: rules-general 58 | configMap: 59 | name: prometheus-rules-general 60 | 61 | --- 62 | kind: Service 63 | apiVersion: v1 64 | metadata: 65 | name: prometheus-example-service 66 | spec: 67 | selector: 68 | app: prometheus 69 | purpose: example 70 | ports: 71 | - name: promui 72 | protocol: TCP 73 | port: 9090 74 | targetPort: 9090 75 | -------------------------------------------------------------------------------- /traefik-prom.yaml: 
--------------------------------------------------------------------------------
  1 | ---
  2 | # Read-only access to the objects Traefik watches.
  3 | # rbac.authorization.k8s.io/v1 replaces the removed v1beta1 version.
  4 | kind: ClusterRole
  5 | apiVersion: rbac.authorization.k8s.io/v1
  6 | metadata:
  7 |   name: traefik-ingress-controller
  8 | rules:
  9 |   - apiGroups:
 10 |       - ""
 11 |     resources:
 12 |       - services
 13 |       - endpoints
 14 |       - secrets
 15 |     verbs:
 16 |       - get
 17 |       - list
 18 |       - watch
 19 |   - apiGroups:
 20 |       - extensions
 21 |     resources:
 22 |       - ingresses
 23 |     verbs:
 24 |       - get
 25 |       - list
 26 |       - watch
 27 | ---
 28 | # Grants the ClusterRole above to the ServiceAccount in namespace "default".
 29 | kind: ClusterRoleBinding
 30 | apiVersion: rbac.authorization.k8s.io/v1
 31 | metadata:
 32 |   name: traefik-ingress-controller
 33 | roleRef:
 34 |   apiGroup: rbac.authorization.k8s.io
 35 |   kind: ClusterRole
 36 |   name: traefik-ingress-controller
 37 | subjects:
 38 | - kind: ServiceAccount
 39 |   name: traefik-ingress-controller
 40 |   namespace: default
 41 | ---
 42 | apiVersion: v1
 43 | kind: ServiceAccount
 44 | metadata:
 45 |   name: traefik-ingress-controller
 46 | ---
 47 | # Traefik static configuration, mounted into the pod at /config/traefik.toml.
 48 | apiVersion: v1
 49 | kind: ConfigMap
 50 | data:
 51 |   traefik.toml: |
 52 |     debug = false
 53 |     defaultEntryPoints = ["http","https"]
 54 |     [entryPoints]
 55 |     [entryPoints.http]
 56 |     address = ":80"
 57 |     [entryPoints.http.redirect]
 58 |     entryPoint = "https"
 59 |     [entryPoints.https]
 60 |     address = ":443"
 61 |     [entryPoints.https.tls]
 62 |     [metrics]
 63 |     [metrics.prometheus]
 64 |     buckets = [0.1,0.3,1.2,5.0]
 65 | metadata:
 66 |   name: traefik-conf
 67 | ---
 68 | # apps/v1 replaces extensions/v1beta1 and requires an explicit spec.selector.
 69 | kind: Deployment
 70 | apiVersion: apps/v1
 71 | metadata:
 72 |   name: traefik-ingress-controller
 73 |   labels:
 74 |     k8s-app: traefik-ingress-lb
 75 | spec:
 76 |   replicas: 1
 77 |   # Must match the pod template labels below.
 78 |   selector:
 79 |     matchLabels:
 80 |       k8s-app: traefik-ingress-lb
 81 |   template:
 82 |     metadata:
 83 |       labels:
 84 |         k8s-app: traefik-ingress-lb
 85 |         name: traefik-ingress-lb
 86 |     spec:
 87 |       # serviceAccountName supersedes the deprecated serviceAccount field.
 88 |       serviceAccountName: traefik-ingress-controller
 89 |       terminationGracePeriodSeconds: 60
 90 |       volumes:
 91 |       - name: config
 92 |         configMap:
 93 |           name: traefik-conf
 94 |       containers:
 95 |       # Pinned to 1.7: the --web/--kubernetes flags and the TOML layout above
 96 |       # are Traefik 1.x syntax; the untagged image resolves to a newer major
 97 |       # version with a different configuration syntax.
 98 |       - image: traefik:1.7
 99 |         name: traefik-ingress-lb
100 |         imagePullPolicy: Always
101 |         volumeMounts:
102 |         - mountPath: "/config"
103 |           name: "config"
104 |         ports:
105 |         - containerPort: 80
106 |         - containerPort: 443
107 |         - containerPort: 8080
108 |         args:
109 |         - --configfile=/config/traefik.toml
110 |         - --web
111 |         - --web.metrics.prometheus
112 |         - --kubernetes
113 |         - --logLevel=DEBUG
114 | ---
115 | # Exposes http (80), https (443) and admin (8080) of the Traefik pods.
116 | apiVersion: v1
117 | kind: Service
118 | metadata:
119 |   labels:
120 |     k8s-app: traefik-ingress-lb
121 |   name: traefik
122 | spec:
123 |   ports:
124 |   - name: http
125 |     port: 80
126 |     protocol: TCP
127 |     targetPort: 80
128 |   - name: https
129 |     port: 443
130 |     protocol: TCP
131 |     targetPort: 443
132 |   - name: admin
133 |     port: 8080
134 |     protocol: TCP
135 |     targetPort: 8080
136 |   selector:
137 |     k8s-app: traefik-ingress-lb
138 |
--------------------------------------------------------------------------------