├── .gitignore ├── delete.sh ├── dashboard.yaml ├── datasource-template.yaml ├── run.sh ├── README.md └── dashboards └── basic-coiled-dashboard.json /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | datasources/*.yaml 3 | -------------------------------------------------------------------------------- /delete.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker rm -f grafana 3 | docker volume rm grafana-storage 4 | -------------------------------------------------------------------------------- /dashboard.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: dashboards 5 | options: 6 | path: /var/lib/grafana/dashboards/basic-coiled-dashboard.json 7 | -------------------------------------------------------------------------------- /datasource-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: Prometheus 5 | type: prometheus 6 | access: proxy 7 | url: {endpoint} 8 | jsonData: 9 | timeInterval: 5s 10 | httpMethod: POST 11 | prometheusType: Prometheus 12 | sigV4Auth: true 13 | sigV4AuthType: keys 14 | sigV4Region: {region} 15 | secureJsonData: 16 | sigV4AccessKey: {key} 17 | sigV4SecretKey: {secret} 18 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker volume create grafana-storage 3 | docker run -d -p 3000:3000 \ 4 | --name=grafana \ 5 | -e "GF_AUTH_SIGV4_AUTH_ENABLED=true" \ 6 | -v $(pwd)/datasources/:/etc/grafana/provisioning/datasources/ \ 7 | -v $(pwd)/dashboard.yaml:/etc/grafana/provisioning/dashboards/dashboard.yaml \ 8 | -v $(pwd)/dashboards/:/var/lib/grafana/dashboards/ \ 9 | -v grafana-storage:/var/lib/grafana \ 10 | 
grafana/grafana-oss 11 | 12 | # this will work on macOS 13 | sleep 1 14 | open http://127.0.0.1:3000/d/GvbFsqKVk/coiled-cluster-metrics-basic 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grafana via Docker 2 | 3 | Metrics from Coiled clusters are collected and stored in Prometheus. Some of these metrics are exposed in the Coiled web app, but for full access to your cluster metrics, you'll want to use Grafana. 4 | 5 | There are many ways to run Grafana. You can use a hosted Grafana service (Grafana Cloud, Amazon Managed Grafana). You can run Grafana on a VM—we do this for internal Coiled users. 6 | 7 | But for an individual user, it's easiest to just run Grafana locally using Docker. That's what we'll do here. 8 | 9 | The included config will start Grafana, add your datasource and the basic Coiled dashboard, so it's very easy to get started looking at Coiled cluster metrics. 10 | 11 | # Get started running Grafana 12 | 13 | Your Coiled account will need to be configured with a "single-tenant" Prometheus workspace (or you're Coiled staff and have access to our shared workspace). 14 | 15 | Exporting the datasource configuration requires `coiled>=0.5.3`. 16 | 17 | 1. `cd` into this repo and run `coiled setup prometheus-datasource` to export configuration for your Prometheus storage as a datasource YAML file (or manually make one based on `datasource-template.yaml` and put it in the `datasources/` directory) 18 | 2. `./run.sh` 19 | 3. On macOS, this will open a web browser automatically, but if that doesn't work, go to the [Grafana dashboard](http://localhost:3000/d/GvbFsqKVk/coiled-cluster-metrics-basic) 20 | 21 | The initial username and password are both 'admin'; you can then change the password if you want. 22 | 23 | If you want to stop and start Grafana, just run `docker stop grafana` / `docker start grafana`. 
24 | 25 | If you want to get rid of Grafana entirely, run `./delete.sh`. 26 | 27 | # How this works 28 | 29 | For persistent storage, we create a Docker volume so that Grafana settings will be persisted. 30 | 31 | The Docker run command mounts the datasource and dashboard so that they are "provisioned" when Grafana starts. 32 | 33 | We also need to set an ENV var so that SigV4 authentication is enabled; this is needed for accessing Amazon Managed Prometheus. 34 | -------------------------------------------------------------------------------- /dashboards/basic-coiled-dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "grafana", 8 | "uid": "-- Grafana --" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 1, 27 | "id": 18, 28 | "links": [ 29 | { 30 | "asDropdown": false, 31 | "icon": "external link", 32 | "includeVars": false, 33 | "keepTime": false, 34 | "tags": [], 35 | "targetBlank": true, 36 | "title": "Cluster Details (prod)", 37 | "tooltip": "", 38 | "type": "link", 39 | "url": "https://cloud.coiled.io/${account}/clusters/${cluster_id}/details" 40 | } 41 | ], 42 | "liveNow": false, 43 | "panels": [ 44 | { 45 | "datasource": { 46 | "type": "prometheus", 47 | "uid": "${datasource}" 48 | }, 49 | "fieldConfig": { 50 | "defaults": { 51 | "color": { 52 | "mode": "thresholds" 53 | }, 54 | "mappings": [], 55 | "thresholds": { 56 | "mode": "absolute", 57 | "steps": [ 58 | { 59 | "color": "green", 60 | "value": null 61 | }, 62 | { 63 | "color": "red", 64 | "value": 80 65 | } 66 | ] 67 | }, 68 | "unit": "dtdurations" 69 | }, 
70 | "overrides": [] 71 | }, 72 | "gridPos": { 73 | "h": 3, 74 | "w": 3, 75 | "x": 0, 76 | "y": 0 77 | }, 78 | "id": 79, 79 | "options": { 80 | "colorMode": "none", 81 | "graphMode": "none", 82 | "justifyMode": "auto", 83 | "orientation": "auto", 84 | "reduceOptions": { 85 | "calcs": [ 86 | "lastNotNull" 87 | ], 88 | "fields": "", 89 | "values": false 90 | }, 91 | "textMode": "auto" 92 | }, 93 | "pluginVersion": "9.1.5", 94 | "targets": [ 95 | { 96 | "datasource": { 97 | "type": "prometheus", 98 | "uid": "${datasource}" 99 | }, 100 | "editorMode": "code", 101 | "expr": "sum_over_time(\n # any workers connected\n count(\n sum(dask_scheduler_workers{cluster_id=\"$cluster_id\",state!=\"connected\"})\n # only trust metric when scheduler is running\n and on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0\n \n \n )[12h:1s]\n)\n", 102 | "format": "table", 103 | "hide": false, 104 | "legendFormat": "__auto", 105 | "range": true, 106 | "refId": "Time with non-trivial write" 107 | } 108 | ], 109 | "title": "Workers Connected", 110 | "type": "stat" 111 | }, 112 | { 113 | "datasource": { 114 | "type": "prometheus", 115 | "uid": "${datasource}" 116 | }, 117 | "fieldConfig": { 118 | "defaults": { 119 | "color": { 120 | "mode": "thresholds" 121 | }, 122 | "mappings": [], 123 | "thresholds": { 124 | "mode": "absolute", 125 | "steps": [ 126 | { 127 | "color": "green", 128 | "value": null 129 | }, 130 | { 131 | "color": "red", 132 | "value": 80 133 | } 134 | ] 135 | }, 136 | "unit": "dtdurations" 137 | }, 138 | "overrides": [] 139 | }, 140 | "gridPos": { 141 | "h": 3, 142 | "w": 3, 143 | "x": 3, 144 | "y": 0 145 | }, 146 | "id": 68, 147 | "options": { 148 | "colorMode": "none", 149 | "graphMode": "none", 150 | "justifyMode": "auto", 151 | "orientation": "auto", 152 | "reduceOptions": { 153 | "calcs": [ 154 | "lastNotNull" 155 | ], 156 | "fields": "", 157 | "values": false 158 | }, 159 | "textMode": "auto" 160 | }, 161 | 
"pluginVersion": "9.1.5", 162 | "targets": [ 163 | { 164 | "datasource": { 165 | "type": "prometheus", 166 | "uid": "${datasource}" 167 | }, 168 | "editorMode": "code", 169 | "expr": "sum_over_time(\n (dask_scheduler_tasks{cluster_id=\"$cluster_id\",state=\"processing\"} > bool 0)[12h:1s]\n)\n", 170 | "format": "table", 171 | "hide": false, 172 | "legendFormat": "__auto", 173 | "range": true, 174 | "refId": "Time with non-trivial write" 175 | } 176 | ], 177 | "title": "Processing Tasks", 178 | "type": "stat" 179 | }, 180 | { 181 | "datasource": { 182 | "type": "prometheus", 183 | "uid": "${datasource}" 184 | }, 185 | "fieldConfig": { 186 | "defaults": { 187 | "color": { 188 | "mode": "thresholds" 189 | }, 190 | "mappings": [], 191 | "thresholds": { 192 | "mode": "absolute", 193 | "steps": [ 194 | { 195 | "color": "green", 196 | "value": null 197 | }, 198 | { 199 | "color": "red", 200 | "value": 120 201 | } 202 | ] 203 | }, 204 | "unit": "dtdurations" 205 | }, 206 | "overrides": [] 207 | }, 208 | "gridPos": { 209 | "h": 3, 210 | "w": 3, 211 | "x": 6, 212 | "y": 0 213 | }, 214 | "id": 72, 215 | "options": { 216 | "colorMode": "value", 217 | "graphMode": "area", 218 | "justifyMode": "auto", 219 | "orientation": "auto", 220 | "reduceOptions": { 221 | "calcs": [ 222 | "lastNotNull" 223 | ], 224 | "fields": "", 225 | "values": false 226 | }, 227 | "textMode": "auto" 228 | }, 229 | "pluginVersion": "9.1.5", 230 | "targets": [ 231 | { 232 | "datasource": { 233 | "type": "prometheus", 234 | "uid": "${datasource}" 235 | }, 236 | "editorMode": "code", 237 | "expr": "sum_over_time(\n(\n\n# all instances are idle\n(\n# idle workers / connected workers\ndask_scheduler_workers{cluster_id=\"$cluster_id\",state=\"idle\"}\n/ on () sum(dask_scheduler_workers{cluster_id=\"$cluster_id\",state!=\"connected\"})\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0\n) == 
1\n\n)[12h:1s]\n) \n", 238 | "format": "table", 239 | "hide": false, 240 | "legendFormat": "__auto", 241 | "range": true, 242 | "refId": "Time with non-trivial write" 243 | } 244 | ], 245 | "title": "All Workers \"Idle\"", 246 | "type": "stat" 247 | }, 248 | { 249 | "datasource": { 250 | "type": "prometheus", 251 | "uid": "${datasource}" 252 | }, 253 | "fieldConfig": { 254 | "defaults": { 255 | "color": { 256 | "mode": "thresholds" 257 | }, 258 | "mappings": [], 259 | "noValue": "0", 260 | "thresholds": { 261 | "mode": "absolute", 262 | "steps": [ 263 | { 264 | "color": "green", 265 | "value": null 266 | }, 267 | { 268 | "color": "red", 269 | "value": 120 270 | } 271 | ] 272 | }, 273 | "unit": "dtdurations" 274 | }, 275 | "overrides": [] 276 | }, 277 | "gridPos": { 278 | "h": 3, 279 | "w": 3, 280 | "x": 9, 281 | "y": 0 282 | }, 283 | "id": 70, 284 | "options": { 285 | "colorMode": "value", 286 | "graphMode": "area", 287 | "justifyMode": "auto", 288 | "orientation": "auto", 289 | "reduceOptions": { 290 | "calcs": [ 291 | "lastNotNull" 292 | ], 293 | "fields": "", 294 | "values": false 295 | }, 296 | "textMode": "auto" 297 | }, 298 | "pluginVersion": "9.1.5", 299 | "targets": [ 300 | { 301 | "datasource": { 302 | "type": "prometheus", 303 | "uid": "${datasource}" 304 | }, 305 | "editorMode": "code", 306 | "expr": "sum_over_time(\n(\n count(\n # average CPU over entire cluster < 10%\n avg(sum by(instance,cpu) (irate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode!=\"idle\"}[$__rate_interval]))) < .1\n # when tasks were processing\n and on () dask_scheduler_tasks{cluster_id=\"$cluster_id\",state=\"processing\"} > 0\n )\n)[12h:1s]\n) \n", 307 | "format": "table", 308 | "hide": false, 309 | "legendFormat": "__auto", 310 | "range": true, 311 | "refId": "Time with non-trivial write" 312 | } 313 | ], 314 | "title": "* Idle CPU (<10% avg)", 315 | "type": "stat" 316 | }, 317 | { 318 | "datasource": { 319 | "type": "prometheus", 320 | "uid": 
"${datasource}" 321 | }, 322 | "fieldConfig": { 323 | "defaults": { 324 | "color": { 325 | "mode": "thresholds" 326 | }, 327 | "mappings": [], 328 | "noValue": "0", 329 | "thresholds": { 330 | "mode": "absolute", 331 | "steps": [ 332 | { 333 | "color": "green", 334 | "value": null 335 | }, 336 | { 337 | "color": "red", 338 | "value": 120 339 | } 340 | ] 341 | }, 342 | "unit": "dtdurations" 343 | }, 344 | "overrides": [] 345 | }, 346 | "gridPos": { 347 | "h": 3, 348 | "w": 3, 349 | "x": 12, 350 | "y": 0 351 | }, 352 | "id": 81, 353 | "options": { 354 | "colorMode": "value", 355 | "graphMode": "area", 356 | "justifyMode": "auto", 357 | "orientation": "auto", 358 | "reduceOptions": { 359 | "calcs": [ 360 | "lastNotNull" 361 | ], 362 | "fields": "", 363 | "values": false 364 | }, 365 | "textMode": "auto" 366 | }, 367 | "pluginVersion": "9.1.5", 368 | "targets": [ 369 | { 370 | "datasource": { 371 | "type": "prometheus", 372 | "uid": "${datasource}" 373 | }, 374 | "editorMode": "code", 375 | "expr": "sum_over_time(\n(\n# for at least 20% of workers\n(\n # number of instances with\n count(\n # 60% or higher mem util (i.e., less than 40% available)\n (node_memory_MemAvailable_bytes{cluster_id=\"$cluster_id\", spec=\"worker\"} / node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\", spec=\"worker\"}) < .4\n )\n) / count(count by (instance) (sum_over_time(dask_worker_threads{cluster_id=\"$cluster_id\"}[2m]) > 0)) > bool .2\n# when tasks were processing\n# and on () dask_scheduler_tasks{cluster_id=\"$cluster_id\",state=\"processing\"} > 0\n\n)[12h:1s]\n) \n", 376 | "format": "table", 377 | "hide": false, 378 | "legendFormat": "__auto", 379 | "range": true, 380 | "refId": "Time with non-trivial write" 381 | } 382 | ], 383 | "title": "Memory Pressure (20% of workers >60%)", 384 | "type": "stat" 385 | }, 386 | { 387 | "datasource": { 388 | "type": "prometheus", 389 | "uid": "${datasource}" 390 | }, 391 | "fieldConfig": { 392 | "defaults": { 393 | "color": { 394 | "mode": 
"thresholds" 395 | }, 396 | "mappings": [], 397 | "noValue": "0", 398 | "thresholds": { 399 | "mode": "absolute", 400 | "steps": [ 401 | { 402 | "color": "green", 403 | "value": null 404 | }, 405 | { 406 | "color": "red", 407 | "value": 120 408 | } 409 | ] 410 | }, 411 | "unit": "dtdurations" 412 | }, 413 | "overrides": [] 414 | }, 415 | "gridPos": { 416 | "h": 3, 417 | "w": 3, 418 | "x": 15, 419 | "y": 0 420 | }, 421 | "id": 73, 422 | "options": { 423 | "colorMode": "value", 424 | "graphMode": "area", 425 | "justifyMode": "auto", 426 | "orientation": "auto", 427 | "reduceOptions": { 428 | "calcs": [ 429 | "lastNotNull" 430 | ], 431 | "fields": "", 432 | "values": false 433 | }, 434 | "textMode": "auto" 435 | }, 436 | "pluginVersion": "9.1.5", 437 | "targets": [ 438 | { 439 | "datasource": { 440 | "type": "prometheus", 441 | "uid": "${datasource}" 442 | }, 443 | "editorMode": "code", 444 | "expr": "sum_over_time( \n count(\n # for at least 1 worker\n (\n # number of instances with\n count(\n # 80% or higher mem util (i.e., less than 20% available)\n (node_memory_MemAvailable_bytes{cluster_id=\"$cluster_id\", spec=\"worker\"} / node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\", spec=\"worker\"}) < .2\n )\n ) > 1\n )[12h:1s]\n) \n", 445 | "format": "table", 446 | "hide": false, 447 | "legendFormat": "__auto", 448 | "range": true, 449 | "refId": "Time with non-trivial write" 450 | } 451 | ], 452 | "title": "Instance 80% Memory", 453 | "type": "stat" 454 | }, 455 | { 456 | "datasource": { 457 | "type": "prometheus", 458 | "uid": "${datasource}" 459 | }, 460 | "fieldConfig": { 461 | "defaults": { 462 | "color": { 463 | "mode": "thresholds" 464 | }, 465 | "mappings": [], 466 | "noValue": "0", 467 | "thresholds": { 468 | "mode": "absolute", 469 | "steps": [ 470 | { 471 | "color": "green", 472 | "value": null 473 | }, 474 | { 475 | "color": "red", 476 | "value": 120 477 | } 478 | ] 479 | }, 480 | "unit": "dtdurations" 481 | }, 482 | "overrides": [] 483 | }, 484 | 
"gridPos": { 485 | "h": 3, 486 | "w": 3, 487 | "x": 18, 488 | "y": 0 489 | }, 490 | "id": 69, 491 | "options": { 492 | "colorMode": "value", 493 | "graphMode": "area", 494 | "justifyMode": "auto", 495 | "orientation": "auto", 496 | "reduceOptions": { 497 | "calcs": [ 498 | "lastNotNull" 499 | ], 500 | "fields": "", 501 | "values": false 502 | }, 503 | "textMode": "auto" 504 | }, 505 | "pluginVersion": "9.1.5", 506 | "targets": [ 507 | { 508 | "datasource": { 509 | "type": "prometheus", 510 | "uid": "${datasource}" 511 | }, 512 | "editorMode": "code", 513 | "expr": "sum_over_time(\n(\n# for at least 20% of workers\n(\n count(\n # write rate is at least 20MiB/s\n rate(node_disk_written_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval])\n and rate(node_disk_written_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval]) > 20000000\n # while there are tasks processing\n and on () dask_scheduler_tasks{cluster_id=\"$cluster_id\",state=\"processing\"} > 0\n )\n / count(count by (instance) (sum_over_time(dask_worker_threads{cluster_id=\"$cluster_id\"}[2m]) > 0))\n) > bool .2\n)[12h:1s]\n) ", 514 | "format": "table", 515 | "hide": false, 516 | "legendFormat": "__auto", 517 | "range": true, 518 | "refId": "Time with non-trivial write" 519 | } 520 | ], 521 | "title": "* Non-Trivial Write", 522 | "type": "stat" 523 | }, 524 | { 525 | "datasource": { 526 | "type": "prometheus", 527 | "uid": "${datasource}" 528 | }, 529 | "fieldConfig": { 530 | "defaults": { 531 | "color": { 532 | "mode": "thresholds" 533 | }, 534 | "mappings": [], 535 | "noValue": "0", 536 | "thresholds": { 537 | "mode": "absolute", 538 | "steps": [ 539 | { 540 | "color": "green", 541 | "value": null 542 | }, 543 | { 544 | "color": "red", 545 | "value": 120 546 | } 547 | ] 548 | }, 549 | "unit": "dtdurations" 550 | }, 551 | "overrides": [] 552 | }, 553 | "gridPos": { 554 | "h": 3, 555 | "w": 3, 556 | "x": 21, 557 | "y": 0 558 | }, 559 | "id": 75, 560 | "options": { 
561 | "colorMode": "value", 562 | "graphMode": "area", 563 | "justifyMode": "auto", 564 | "orientation": "auto", 565 | "reduceOptions": { 566 | "calcs": [ 567 | "lastNotNull" 568 | ], 569 | "fields": "", 570 | "values": false 571 | }, 572 | "textMode": "auto" 573 | }, 574 | "pluginVersion": "9.1.5", 575 | "targets": [ 576 | { 577 | "datasource": { 578 | "type": "prometheus", 579 | "uid": "${datasource}" 580 | }, 581 | "editorMode": "code", 582 | "expr": "sum_over_time(\n(\n # at least one instance with recv rate over 10MiB/s\n count(\n avg (\n rate(node_network_receive_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval])\n ) > 10000000\n )\n # while there are tasks processing\n and on () dask_scheduler_tasks{cluster_id=\"$cluster_id\",state=\"processing\"} > 0\n\n)[12h:1s]\n) \n", 583 | "format": "table", 584 | "hide": false, 585 | "legendFormat": "__auto", 586 | "range": true, 587 | "refId": "Time with non-trivial write" 588 | } 589 | ], 590 | "title": "* Non-Trivial Recv (>10MiB/s Avg)", 591 | "type": "stat" 592 | }, 593 | { 594 | "datasource": { 595 | "type": "prometheus", 596 | "uid": "${datasource}" 597 | }, 598 | "fieldConfig": { 599 | "defaults": { 600 | "color": { 601 | "mode": "thresholds" 602 | }, 603 | "mappings": [], 604 | "thresholds": { 605 | "mode": "absolute", 606 | "steps": [ 607 | { 608 | "color": "green", 609 | "value": null 610 | }, 611 | { 612 | "color": "red", 613 | "value": 1 614 | } 615 | ] 616 | }, 617 | "unit": "none" 618 | }, 619 | "overrides": [] 620 | }, 621 | "gridPos": { 622 | "h": 3, 623 | "w": 3, 624 | "x": 0, 625 | "y": 3 626 | }, 627 | "id": 80, 628 | "options": { 629 | "colorMode": "value", 630 | "graphMode": "none", 631 | "justifyMode": "auto", 632 | "orientation": "auto", 633 | "reduceOptions": { 634 | "calcs": [ 635 | "max" 636 | ], 637 | "fields": "", 638 | "values": false 639 | }, 640 | "textMode": "auto" 641 | }, 642 | "pluginVersion": "9.1.5", 643 | "targets": [ 644 | { 645 | "datasource": { 646 | 
"type": "prometheus", 647 | "uid": "${datasource}" 648 | }, 649 | "editorMode": "code", 650 | "expr": "dask_scheduler_tasks{cluster_id=\"$cluster_id\",state=\"erred\"}", 651 | "format": "table", 652 | "hide": false, 653 | "legendFormat": "__auto", 654 | "range": true, 655 | "refId": "Time with non-trivial write" 656 | } 657 | ], 658 | "title": "Erred Tasks", 659 | "type": "stat" 660 | }, 661 | { 662 | "datasource": { 663 | "type": "prometheus", 664 | "uid": "${datasource}" 665 | }, 666 | "fieldConfig": { 667 | "defaults": { 668 | "color": { 669 | "mode": "thresholds" 670 | }, 671 | "decimals": 0, 672 | "mappings": [], 673 | "noValue": "0", 674 | "thresholds": { 675 | "mode": "absolute", 676 | "steps": [ 677 | { 678 | "color": "green", 679 | "value": null 680 | }, 681 | { 682 | "color": "red", 683 | "value": 120 684 | } 685 | ] 686 | }, 687 | "unit": "dtdurations" 688 | }, 689 | "overrides": [] 690 | }, 691 | "gridPos": { 692 | "h": 3, 693 | "w": 3, 694 | "x": 3, 695 | "y": 3 696 | }, 697 | "id": 84, 698 | "options": { 699 | "colorMode": "value", 700 | "graphMode": "area", 701 | "justifyMode": "auto", 702 | "orientation": "auto", 703 | "reduceOptions": { 704 | "calcs": [ 705 | "lastNotNull" 706 | ], 707 | "fields": "", 708 | "values": false 709 | }, 710 | "textMode": "auto" 711 | }, 712 | "pluginVersion": "9.1.5", 713 | "targets": [ 714 | { 715 | "datasource": { 716 | "type": "prometheus", 717 | "uid": "${datasource}" 718 | }, 719 | "editorMode": "code", 720 | "exemplar": false, 721 | "expr": "count_over_time(\n(\n\n # any workers aren't reporting dask metrics\n (\n count(\n sum by (instance)(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\",spec=\"worker\"}[$__rate_interval]) > 0)\n )\n - count(dask_worker_threads{cluster_id=\"$cluster_id\"})\n ) > 0\n # only trust metric when scheduler is running\n and on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0\n\n)[12h:1s]\n) ", 722 | "hide": false, 723 | 
"instant": false, 724 | "legendFormat": "cores", 725 | "range": true, 726 | "refId": "cores (host)" 727 | } 728 | ], 729 | "title": "Worker Event Loop Blocked", 730 | "type": "stat" 731 | }, 732 | { 733 | "datasource": { 734 | "type": "prometheus", 735 | "uid": "WzUZ3pn4k" 736 | }, 737 | "gridPos": { 738 | "h": 3, 739 | "w": 7, 740 | "x": 17, 741 | "y": 3 742 | }, 743 | "id": 83, 744 | "options": { 745 | "content": "* Duration only includes times\n when tasks on scheduler were\n in processing state", 746 | "mode": "html" 747 | }, 748 | "pluginVersion": "9.1.5", 749 | "transparent": true, 750 | "type": "text" 751 | }, 752 | { 753 | "datasource": { 754 | "type": "prometheus", 755 | "uid": "${datasource}" 756 | }, 757 | "fieldConfig": { 758 | "defaults": { 759 | "color": { 760 | "mode": "palette-classic" 761 | }, 762 | "custom": { 763 | "axisCenteredZero": false, 764 | "axisColorMode": "text", 765 | "axisLabel": "", 766 | "axisPlacement": "auto", 767 | "axisWidth": 100, 768 | "barAlignment": 0, 769 | "drawStyle": "line", 770 | "fillOpacity": 100, 771 | "gradientMode": "none", 772 | "hideFrom": { 773 | "legend": false, 774 | "tooltip": false, 775 | "viz": false 776 | }, 777 | "lineInterpolation": "stepBefore", 778 | "lineWidth": 0, 779 | "pointSize": 1, 780 | "scaleDistribution": { 781 | "type": "linear" 782 | }, 783 | "showPoints": "auto", 784 | "spanNulls": false, 785 | "stacking": { 786 | "group": "A", 787 | "mode": "normal" 788 | }, 789 | "thresholdsStyle": { 790 | "mode": "off" 791 | } 792 | }, 793 | "mappings": [], 794 | "thresholds": { 795 | "mode": "absolute", 796 | "steps": [ 797 | { 798 | "color": "green", 799 | "value": null 800 | }, 801 | { 802 | "color": "red", 803 | "value": 80 804 | } 805 | ] 806 | }, 807 | "unit": "dtdurations" 808 | }, 809 | "overrides": [ 810 | { 811 | "matcher": { 812 | "id": "byFrameRefID", 813 | "options": "count" 814 | }, 815 | "properties": [ 816 | { 817 | "id": "unit", 818 | "value": "none" 819 | } 820 | ] 821 | }, 822 | { 823 | 
"matcher": { 824 | "id": "byFrameRefID", 825 | "options": "total time" 826 | }, 827 | "properties": [ 828 | { 829 | "id": "custom.fillOpacity", 830 | "value": 0 831 | }, 832 | { 833 | "id": "custom.stacking", 834 | "value": { 835 | "group": "A", 836 | "mode": "none" 837 | } 838 | }, 839 | { 840 | "id": "color", 841 | "value": { 842 | "fixedColor": "green", 843 | "mode": "fixed" 844 | } 845 | } 846 | ] 847 | }, 848 | { 849 | "matcher": { 850 | "id": "byFrameRefID", 851 | "options": "capacity" 852 | }, 853 | "properties": [ 854 | { 855 | "id": "custom.fillOpacity", 856 | "value": 0 857 | }, 858 | { 859 | "id": "custom.lineWidth", 860 | "value": 1 861 | }, 862 | { 863 | "id": "custom.stacking", 864 | "value": { 865 | "group": "A", 866 | "mode": "none" 867 | } 868 | }, 869 | { 870 | "id": "color", 871 | "value": { 872 | "mode": "fixed" 873 | } 874 | } 875 | ] 876 | } 877 | ] 878 | }, 879 | "gridPos": { 880 | "h": 8, 881 | "w": 20, 882 | "x": 0, 883 | "y": 6 884 | }, 885 | "id": 86, 886 | "options": { 887 | "legend": { 888 | "calcs": [], 889 | "displayMode": "list", 890 | "placement": "bottom", 891 | "showLegend": true 892 | }, 893 | "tooltip": { 894 | "mode": "multi", 895 | "sort": "desc" 896 | } 897 | }, 898 | "targets": [ 899 | { 900 | "datasource": { 901 | "type": "prometheus", 902 | "uid": "${datasource}" 903 | }, 904 | "editorMode": "code", 905 | "expr": "rate(dask_scheduler_tasks_compute_seconds_total{cluster_id=\"$cluster_id\"}[$__rate_interval])", 906 | "hide": false, 907 | "legendFormat": "{{task_prefix_name}}", 908 | "range": true, 909 | "refId": "duration" 910 | }, 911 | { 912 | "datasource": { 913 | "type": "prometheus", 914 | "uid": "${datasource}" 915 | }, 916 | "editorMode": "code", 917 | "expr": "sum(rate(dask_scheduler_tasks_compute_seconds_total{cluster_id=\"$cluster_id\"}[$__rate_interval]) > 0)", 918 | "hide": false, 919 | "legendFormat": "total time", 920 | "range": true, 921 | "refId": "total time" 922 | }, 923 | { 924 | "datasource": { 925 | 
"type": "prometheus", 926 | "uid": "${datasource}" 927 | }, 928 | "editorMode": "code", 929 | "expr": "sum(dask_worker_threads{cluster_id=\"$cluster_id\"})", 930 | "hide": false, 931 | "legendFormat": "Capacity", 932 | "range": true, 933 | "refId": "capacity" 934 | } 935 | ], 936 | "title": "Task Prefix Compute Time", 937 | "type": "timeseries" 938 | }, 939 | { 940 | "datasource": { 941 | "type": "prometheus", 942 | "uid": "${datasource}" 943 | }, 944 | "fieldConfig": { 945 | "defaults": { 946 | "color": { 947 | "mode": "palette-classic" 948 | }, 949 | "custom": { 950 | "hideFrom": { 951 | "legend": false, 952 | "tooltip": false, 953 | "viz": false 954 | } 955 | }, 956 | "mappings": [], 957 | "unit": "dtdurations" 958 | }, 959 | "overrides": [ 960 | { 961 | "matcher": { 962 | "id": "byFrameRefID", 963 | "options": "count" 964 | }, 965 | "properties": [ 966 | { 967 | "id": "unit", 968 | "value": "none" 969 | } 970 | ] 971 | } 972 | ] 973 | }, 974 | "gridPos": { 975 | "h": 8, 976 | "w": 4, 977 | "x": 20, 978 | "y": 6 979 | }, 980 | "id": 88, 981 | "options": { 982 | "legend": { 983 | "displayMode": "list", 984 | "placement": "bottom", 985 | "showLegend": true 986 | }, 987 | "pieType": "pie", 988 | "reduceOptions": { 989 | "calcs": [ 990 | "lastNotNull" 991 | ], 992 | "fields": "", 993 | "values": false 994 | }, 995 | "tooltip": { 996 | "mode": "single", 997 | "sort": "none" 998 | } 999 | }, 1000 | "targets": [ 1001 | { 1002 | "datasource": { 1003 | "type": "prometheus", 1004 | "uid": "${datasource}" 1005 | }, 1006 | "editorMode": "code", 1007 | "expr": "max_over_time(dask_scheduler_tasks_compute_seconds_total{cluster_id=\"$cluster_id\"}[6h])", 1008 | "hide": false, 1009 | "legendFormat": "{{task_prefix_name}}", 1010 | "range": true, 1011 | "refId": "duration" 1012 | } 1013 | ], 1014 | "title": "Task Prefix Compute Time", 1015 | "type": "piechart" 1016 | }, 1017 | { 1018 | "collapsed": false, 1019 | "gridPos": { 1020 | "h": 1, 1021 | "w": 24, 1022 | "x": 0, 1023 | 
"y": 14 1024 | }, 1025 | "id": 60, 1026 | "panels": [], 1027 | "title": "Worker", 1028 | "type": "row" 1029 | }, 1030 | { 1031 | "datasource": { 1032 | "type": "prometheus", 1033 | "uid": "${datasource}" 1034 | }, 1035 | "fieldConfig": { 1036 | "defaults": { 1037 | "color": { 1038 | "mode": "palette-classic" 1039 | }, 1040 | "custom": { 1041 | "axisCenteredZero": false, 1042 | "axisColorMode": "text", 1043 | "axisLabel": "", 1044 | "axisPlacement": "auto", 1045 | "axisSoftMax": 100, 1046 | "axisSoftMin": 0, 1047 | "barAlignment": 0, 1048 | "drawStyle": "line", 1049 | "fillOpacity": 40, 1050 | "gradientMode": "none", 1051 | "hideFrom": { 1052 | "legend": false, 1053 | "tooltip": false, 1054 | "viz": false 1055 | }, 1056 | "lineInterpolation": "linear", 1057 | "lineWidth": 1, 1058 | "pointSize": 3, 1059 | "scaleDistribution": { 1060 | "type": "linear" 1061 | }, 1062 | "showPoints": "never", 1063 | "spanNulls": false, 1064 | "stacking": { 1065 | "group": "A", 1066 | "mode": "normal" 1067 | }, 1068 | "thresholdsStyle": { 1069 | "mode": "off" 1070 | } 1071 | }, 1072 | "decimals": 0, 1073 | "mappings": [], 1074 | "max": 100, 1075 | "min": 0, 1076 | "thresholds": { 1077 | "mode": "absolute", 1078 | "steps": [ 1079 | { 1080 | "color": "green", 1081 | "value": null 1082 | }, 1083 | { 1084 | "color": "red", 1085 | "value": 80 1086 | } 1087 | ] 1088 | }, 1089 | "unit": "percent" 1090 | }, 1091 | "overrides": [ 1092 | { 1093 | "matcher": { 1094 | "id": "byName", 1095 | "options": "80pct" 1096 | }, 1097 | "properties": [ 1098 | { 1099 | "id": "custom.fillOpacity", 1100 | "value": 0 1101 | }, 1102 | { 1103 | "id": "custom.stacking", 1104 | "value": { 1105 | "group": "A", 1106 | "mode": "none" 1107 | } 1108 | }, 1109 | { 1110 | "id": "custom.lineWidth", 1111 | "value": 2 1112 | }, 1113 | { 1114 | "id": "color", 1115 | "value": { 1116 | "fixedColor": "#ccccdc", 1117 | "mode": "fixed" 1118 | } 1119 | } 1120 | ] 1121 | }, 1122 | { 1123 | "matcher": { 1124 | "id": "byName", 1125 | 
"options": "Max" 1126 | }, 1127 | "properties": [ 1128 | { 1129 | "id": "custom.fillOpacity", 1130 | "value": 0 1131 | }, 1132 | { 1133 | "id": "color", 1134 | "value": { 1135 | "fixedColor": "dark-orange", 1136 | "mode": "fixed" 1137 | } 1138 | }, 1139 | { 1140 | "id": "custom.lineWidth", 1141 | "value": 2 1142 | }, 1143 | { 1144 | "id": "custom.stacking", 1145 | "value": { 1146 | "group": "A", 1147 | "mode": "none" 1148 | } 1149 | } 1150 | ] 1151 | } 1152 | ] 1153 | }, 1154 | "gridPos": { 1155 | "h": 6, 1156 | "w": 12, 1157 | "x": 0, 1158 | "y": 15 1159 | }, 1160 | "id": 26, 1161 | "options": { 1162 | "legend": { 1163 | "calcs": [], 1164 | "displayMode": "list", 1165 | "placement": "bottom", 1166 | "showLegend": true 1167 | }, 1168 | "tooltip": { 1169 | "mode": "single", 1170 | "sort": "none" 1171 | } 1172 | }, 1173 | "targets": [ 1174 | { 1175 | "datasource": { 1176 | "type": "prometheus", 1177 | "uid": "${datasource}" 1178 | }, 1179 | "dimensions": {}, 1180 | "editorMode": "code", 1181 | "expr": "avg by(mode) (irate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode!=\"idle\"}[$__rate_interval])) * 100", 1182 | "expression": "", 1183 | "hide": false, 1184 | "id": "", 1185 | "label": "", 1186 | "legendFormat": "{{mode}}", 1187 | "matchExact": true, 1188 | "metricEditorMode": 0, 1189 | "metricName": "", 1190 | "metricQueryType": 0, 1191 | "namespace": "", 1192 | "period": "", 1193 | "queryMode": "Metrics", 1194 | "range": true, 1195 | "refId": "avg worker cpu", 1196 | "region": "default", 1197 | "sqlExpression": "", 1198 | "statistic": "Average" 1199 | }, 1200 | { 1201 | "datasource": { 1202 | "type": "prometheus", 1203 | "uid": "${datasource}" 1204 | }, 1205 | "dimensions": {}, 1206 | "editorMode": "code", 1207 | "exemplar": false, 1208 | "expr": "quantile(0.8, \navg by(coiled_instance) (100-irate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode=\"idle\"}[$__rate_interval])*100)\n)", 1209 | "expression": "", 1210 | 
"format": "time_series", 1211 | "hide": false, 1212 | "id": "", 1213 | "instant": false, 1214 | "label": "", 1215 | "legendFormat": "80pct", 1216 | "matchExact": true, 1217 | "metricEditorMode": 0, 1218 | "metricName": "", 1219 | "metricQueryType": 0, 1220 | "namespace": "", 1221 | "period": "", 1222 | "queryMode": "Metrics", 1223 | "range": true, 1224 | "refId": "80pct worker cpu", 1225 | "region": "default", 1226 | "sqlExpression": "", 1227 | "statistic": "Average" 1228 | }, 1229 | { 1230 | "datasource": { 1231 | "type": "prometheus", 1232 | "uid": "${datasource}" 1233 | }, 1234 | "dimensions": {}, 1235 | "editorMode": "code", 1236 | "exemplar": false, 1237 | "expr": "max(\navg by(coiled_instance) (100-irate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode=\"idle\"}[$__rate_interval])*100)\n)", 1238 | "expression": "", 1239 | "format": "time_series", 1240 | "hide": false, 1241 | "id": "", 1242 | "instant": false, 1243 | "label": "", 1244 | "legendFormat": "Max", 1245 | "matchExact": true, 1246 | "metricEditorMode": 0, 1247 | "metricName": "", 1248 | "metricQueryType": 0, 1249 | "namespace": "", 1250 | "period": "", 1251 | "queryMode": "Metrics", 1252 | "range": true, 1253 | "refId": "Max worker cpu", 1254 | "region": "default", 1255 | "sqlExpression": "", 1256 | "statistic": "Average" 1257 | } 1258 | ], 1259 | "title": "Average Worker CPU", 1260 | "type": "timeseries" 1261 | }, 1262 | { 1263 | "datasource": { 1264 | "type": "prometheus", 1265 | "uid": "${datasource}" 1266 | }, 1267 | "fieldConfig": { 1268 | "defaults": { 1269 | "color": { 1270 | "mode": "palette-classic" 1271 | }, 1272 | "custom": { 1273 | "axisCenteredZero": false, 1274 | "axisColorMode": "text", 1275 | "axisLabel": "", 1276 | "axisPlacement": "auto", 1277 | "barAlignment": 0, 1278 | "drawStyle": "line", 1279 | "fillOpacity": 20, 1280 | "gradientMode": "none", 1281 | "hideFrom": { 1282 | "legend": false, 1283 | "tooltip": false, 1284 | "viz": false 1285 | }, 1286 | 
"lineInterpolation": "smooth", 1287 | "lineWidth": 1, 1288 | "pointSize": 5, 1289 | "scaleDistribution": { 1290 | "type": "linear" 1291 | }, 1292 | "showPoints": "never", 1293 | "spanNulls": false, 1294 | "stacking": { 1295 | "group": "A", 1296 | "mode": "normal" 1297 | }, 1298 | "thresholdsStyle": { 1299 | "mode": "off" 1300 | } 1301 | }, 1302 | "mappings": [], 1303 | "thresholds": { 1304 | "mode": "absolute", 1305 | "steps": [ 1306 | { 1307 | "color": "green", 1308 | "value": null 1309 | } 1310 | ] 1311 | }, 1312 | "unit": "bytes" 1313 | }, 1314 | "overrides": [ 1315 | { 1316 | "matcher": { 1317 | "id": "byName", 1318 | "options": "Max per instance recv" 1319 | }, 1320 | "properties": [ 1321 | { 1322 | "id": "custom.stacking", 1323 | "value": { 1324 | "group": "A", 1325 | "mode": "none" 1326 | } 1327 | }, 1328 | { 1329 | "id": "custom.drawStyle", 1330 | "value": "line" 1331 | }, 1332 | { 1333 | "id": "custom.lineWidth", 1334 | "value": 2 1335 | }, 1336 | { 1337 | "id": "custom.fillOpacity", 1338 | "value": 0 1339 | }, 1340 | { 1341 | "id": "color", 1342 | "value": { 1343 | "fixedColor": "text", 1344 | "mode": "fixed" 1345 | } 1346 | }, 1347 | { 1348 | "id": "thresholds", 1349 | "value": { 1350 | "mode": "absolute", 1351 | "steps": [ 1352 | { 1353 | "color": "green", 1354 | "value": null 1355 | }, 1356 | { 1357 | "color": "red", 1358 | "value": 10000000 1359 | } 1360 | ] 1361 | } 1362 | }, 1363 | { 1364 | "id": "custom.thresholdsStyle", 1365 | "value": { 1366 | "mode": "off" 1367 | } 1368 | } 1369 | ] 1370 | } 1371 | ] 1372 | }, 1373 | "gridPos": { 1374 | "h": 6, 1375 | "w": 12, 1376 | "x": 12, 1377 | "y": 15 1378 | }, 1379 | "id": 29, 1380 | "options": { 1381 | "legend": { 1382 | "calcs": [], 1383 | "displayMode": "list", 1384 | "placement": "bottom", 1385 | "showLegend": true 1386 | }, 1387 | "tooltip": { 1388 | "mode": "single", 1389 | "sort": "none" 1390 | } 1391 | }, 1392 | "targets": [ 1393 | { 1394 | "datasource": { 1395 | "type": "prometheus", 1396 | 
"uid": "${datasource}" 1397 | }, 1398 | "dimensions": {}, 1399 | "editorMode": "code", 1400 | "expr": "sum((rate(node_network_transmit_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval])))", 1401 | "expression": "", 1402 | "id": "", 1403 | "label": "", 1404 | "legendFormat": "Sent", 1405 | "matchExact": true, 1406 | "metricEditorMode": 0, 1407 | "metricName": "", 1408 | "metricQueryType": 0, 1409 | "namespace": "", 1410 | "period": "", 1411 | "queryMode": "Metrics", 1412 | "range": true, 1413 | "refId": "network - total cluster sent rate", 1414 | "region": "default", 1415 | "sqlExpression": "", 1416 | "statistic": "Average" 1417 | }, 1418 | { 1419 | "datasource": { 1420 | "type": "prometheus", 1421 | "uid": "${datasource}" 1422 | }, 1423 | "dimensions": {}, 1424 | "editorMode": "code", 1425 | "expr": "sum((rate(node_network_receive_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval])))", 1426 | "expression": "", 1427 | "hide": false, 1428 | "id": "", 1429 | "label": "", 1430 | "legendFormat": "Recv", 1431 | "matchExact": true, 1432 | "metricEditorMode": 0, 1433 | "metricName": "", 1434 | "metricQueryType": 0, 1435 | "namespace": "", 1436 | "period": "", 1437 | "queryMode": "Metrics", 1438 | "range": true, 1439 | "refId": "network - total cluster recv rate", 1440 | "region": "default", 1441 | "sqlExpression": "", 1442 | "statistic": "Average" 1443 | }, 1444 | { 1445 | "datasource": { 1446 | "type": "prometheus", 1447 | "uid": "${datasource}" 1448 | }, 1449 | "dimensions": {}, 1450 | "editorMode": "code", 1451 | "expr": "max((rate(node_network_receive_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval])))", 1452 | "expression": "", 1453 | "hide": false, 1454 | "id": "", 1455 | "label": "", 1456 | "legendFormat": "Max per instance recv", 1457 | "matchExact": true, 1458 | "metricEditorMode": 0, 1459 | "metricName": "", 1460 | "metricQueryType": 0, 1461 | "namespace": "", 1462 | "period": "", 1463 | 
"queryMode": "Metrics", 1464 | "range": true, 1465 | "refId": "network - max worker recv rate", 1466 | "region": "default", 1467 | "sqlExpression": "", 1468 | "statistic": "Average" 1469 | } 1470 | ], 1471 | "title": "Worker Net Sent/Recv Rate", 1472 | "type": "timeseries" 1473 | }, 1474 | { 1475 | "datasource": { 1476 | "type": "prometheus", 1477 | "uid": "${datasource}" 1478 | }, 1479 | "fieldConfig": { 1480 | "defaults": { 1481 | "color": { 1482 | "mode": "palette-classic" 1483 | }, 1484 | "custom": { 1485 | "axisCenteredZero": false, 1486 | "axisColorMode": "text", 1487 | "axisLabel": "", 1488 | "axisPlacement": "auto", 1489 | "barAlignment": 0, 1490 | "drawStyle": "line", 1491 | "fillOpacity": 70, 1492 | "gradientMode": "none", 1493 | "hideFrom": { 1494 | "legend": false, 1495 | "tooltip": false, 1496 | "viz": false 1497 | }, 1498 | "lineInterpolation": "linear", 1499 | "lineWidth": 0, 1500 | "pointSize": 5, 1501 | "scaleDistribution": { 1502 | "type": "linear" 1503 | }, 1504 | "showPoints": "never", 1505 | "spanNulls": false, 1506 | "stacking": { 1507 | "group": "A", 1508 | "mode": "normal" 1509 | }, 1510 | "thresholdsStyle": { 1511 | "mode": "off" 1512 | } 1513 | }, 1514 | "mappings": [], 1515 | "thresholds": { 1516 | "mode": "absolute", 1517 | "steps": [ 1518 | { 1519 | "color": "green", 1520 | "value": null 1521 | }, 1522 | { 1523 | "color": "red", 1524 | "value": 80 1525 | } 1526 | ] 1527 | }, 1528 | "unit": "bytes" 1529 | }, 1530 | "overrides": [ 1531 | { 1532 | "matcher": { 1533 | "id": "byName", 1534 | "options": "cluster total memory" 1535 | }, 1536 | "properties": [ 1537 | { 1538 | "id": "custom.stacking", 1539 | "value": { 1540 | "group": "A", 1541 | "mode": "none" 1542 | } 1543 | }, 1544 | { 1545 | "id": "custom.drawStyle", 1546 | "value": "line" 1547 | }, 1548 | { 1549 | "id": "custom.fillOpacity", 1550 | "value": 0 1551 | }, 1552 | { 1553 | "id": "custom.lineWidth", 1554 | "value": 2 1555 | }, 1556 | { 1557 | "id": "color", 1558 | "value": { 1559 
| "fixedColor": "text", 1560 | "mode": "fixed" 1561 | } 1562 | } 1563 | ] 1564 | }, 1565 | { 1566 | "matcher": { 1567 | "id": "byName", 1568 | "options": "managed" 1569 | }, 1570 | "properties": [ 1571 | { 1572 | "id": "color", 1573 | "value": { 1574 | "fixedColor": "blue", 1575 | "mode": "fixed" 1576 | } 1577 | } 1578 | ] 1579 | }, 1580 | { 1581 | "matcher": { 1582 | "id": "byName", 1583 | "options": "unmanaged" 1584 | }, 1585 | "properties": [ 1586 | { 1587 | "id": "color", 1588 | "value": { 1589 | "fixedColor": "super-light-blue", 1590 | "mode": "fixed" 1591 | } 1592 | } 1593 | ] 1594 | }, 1595 | { 1596 | "matcher": { 1597 | "id": "byName", 1598 | "options": "spilled" 1599 | }, 1600 | "properties": [ 1601 | { 1602 | "id": "color", 1603 | "value": { 1604 | "fixedColor": "orange", 1605 | "mode": "fixed" 1606 | } 1607 | } 1608 | ] 1609 | }, 1610 | { 1611 | "matcher": { 1612 | "id": "byName", 1613 | "options": "cluster total occupied" 1614 | }, 1615 | "properties": [ 1616 | { 1617 | "id": "custom.fillOpacity", 1618 | "value": 0 1619 | }, 1620 | { 1621 | "id": "custom.lineWidth", 1622 | "value": 1 1623 | }, 1624 | { 1625 | "id": "color", 1626 | "value": { 1627 | "fixedColor": "super-light-orange", 1628 | "mode": "fixed" 1629 | } 1630 | }, 1631 | { 1632 | "id": "custom.stacking", 1633 | "value": { 1634 | "group": "A", 1635 | "mode": "none" 1636 | } 1637 | } 1638 | ] 1639 | }, 1640 | { 1641 | "matcher": { 1642 | "id": "byName", 1643 | "options": "dask total memory" 1644 | }, 1645 | "properties": [ 1646 | { 1647 | "id": "custom.drawStyle", 1648 | "value": "line" 1649 | }, 1650 | { 1651 | "id": "custom.stacking", 1652 | "value": { 1653 | "group": "A", 1654 | "mode": "none" 1655 | } 1656 | }, 1657 | { 1658 | "id": "color", 1659 | "value": { 1660 | "fixedColor": "dark-blue", 1661 | "mode": "fixed" 1662 | } 1663 | }, 1664 | { 1665 | "id": "custom.fillOpacity", 1666 | "value": 0 1667 | }, 1668 | { 1669 | "id": "custom.lineWidth", 1670 | "value": 1 1671 | } 1672 | ] 1673 | }, 
1674 | { 1675 | "matcher": { 1676 | "id": "byName", 1677 | "options": "gap" 1678 | }, 1679 | "properties": [ 1680 | { 1681 | "id": "custom.stacking", 1682 | "value": { 1683 | "group": "A", 1684 | "mode": "none" 1685 | } 1686 | }, 1687 | { 1688 | "id": "custom.fillOpacity", 1689 | "value": 0 1690 | }, 1691 | { 1692 | "id": "custom.lineWidth", 1693 | "value": 2 1694 | } 1695 | ] 1696 | } 1697 | ] 1698 | }, 1699 | "gridPos": { 1700 | "h": 6, 1701 | "w": 12, 1702 | "x": 0, 1703 | "y": 21 1704 | }, 1705 | "id": 66, 1706 | "options": { 1707 | "legend": { 1708 | "calcs": [ 1709 | "max" 1710 | ], 1711 | "displayMode": "list", 1712 | "placement": "bottom", 1713 | "showLegend": true 1714 | }, 1715 | "tooltip": { 1716 | "mode": "single", 1717 | "sort": "none" 1718 | } 1719 | }, 1720 | "targets": [ 1721 | { 1722 | "datasource": { 1723 | "type": "prometheus", 1724 | "uid": "${datasource}" 1725 | }, 1726 | "editorMode": "code", 1727 | "expr": "sum (dask_worker_memory_bytes{cluster_id=\"$cluster_id\", type=\"managed\"})\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 1728 | "hide": false, 1729 | "legendFormat": "managed", 1730 | "range": true, 1731 | "refId": "total managed mem" 1732 | }, 1733 | { 1734 | "datasource": { 1735 | "type": "prometheus", 1736 | "uid": "${datasource}" 1737 | }, 1738 | "editorMode": "code", 1739 | "expr": "sum (dask_worker_memory_bytes{cluster_id=\"$cluster_id\", type=\"unmanaged\"})\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 1740 | "hide": false, 1741 | "legendFormat": "unmanaged", 1742 | "range": true, 1743 | "refId": "total unmanaged mem" 1744 | }, 1745 | { 1746 | "datasource": { 1747 | "type": "prometheus", 1748 | "uid": "${datasource}" 1749 | }, 1750 | "editorMode": "code", 1751 | "expr": "sum 
(dask_worker_memory_bytes{cluster_id=\"$cluster_id\", type=\"spilled\"})\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 1752 | "hide": false, 1753 | "legendFormat": "spilled", 1754 | "range": true, 1755 | "refId": "total spilled mem" 1756 | }, 1757 | { 1758 | "datasource": { 1759 | "type": "prometheus", 1760 | "uid": "${datasource}" 1761 | }, 1762 | "editorMode": "code", 1763 | "expr": "sum(node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\", spec=\"worker\"})\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 1764 | "hide": false, 1765 | "legendFormat": "cluster total memory", 1766 | "range": true, 1767 | "refId": "cluster mem total" 1768 | }, 1769 | { 1770 | "datasource": { 1771 | "type": "prometheus", 1772 | "uid": "${datasource}" 1773 | }, 1774 | "editorMode": "code", 1775 | "expr": "sum(node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\", spec=\"worker\"} - node_memory_MemAvailable_bytes{cluster_id=\"$cluster_id\", spec=\"worker\"})\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 1776 | "hide": false, 1777 | "legendFormat": "cluster total occupied", 1778 | "range": true, 1779 | "refId": "cluster mem occupied" 1780 | }, 1781 | { 1782 | "datasource": { 1783 | "type": "prometheus", 1784 | "uid": "${datasource}" 1785 | }, 1786 | "editorMode": "code", 1787 | "expr": "sum (dask_worker_memory_bytes{cluster_id=\"$cluster_id\", type=~\"unmanaged|managed\"})\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 1788 | "hide": false, 1789 | "legendFormat": "dask total memory", 1790 | "range": true, 1791 | "refId": "dask 
mem line" 1792 | } 1793 | ], 1794 | "title": "Cluster Worker Memory", 1795 | "type": "timeseries" 1796 | }, 1797 | { 1798 | "datasource": { 1799 | "type": "prometheus", 1800 | "uid": "${datasource}" 1801 | }, 1802 | "fieldConfig": { 1803 | "defaults": { 1804 | "color": { 1805 | "mode": "palette-classic" 1806 | }, 1807 | "custom": { 1808 | "axisCenteredZero": false, 1809 | "axisColorMode": "text", 1810 | "axisLabel": "", 1811 | "axisPlacement": "auto", 1812 | "barAlignment": 0, 1813 | "drawStyle": "line", 1814 | "fillOpacity": 20, 1815 | "gradientMode": "none", 1816 | "hideFrom": { 1817 | "legend": false, 1818 | "tooltip": false, 1819 | "viz": false 1820 | }, 1821 | "lineInterpolation": "smooth", 1822 | "lineWidth": 1, 1823 | "pointSize": 5, 1824 | "scaleDistribution": { 1825 | "type": "linear" 1826 | }, 1827 | "showPoints": "never", 1828 | "spanNulls": false, 1829 | "stacking": { 1830 | "group": "A", 1831 | "mode": "normal" 1832 | }, 1833 | "thresholdsStyle": { 1834 | "mode": "off" 1835 | } 1836 | }, 1837 | "mappings": [], 1838 | "thresholds": { 1839 | "mode": "absolute", 1840 | "steps": [ 1841 | { 1842 | "color": "green", 1843 | "value": null 1844 | }, 1845 | { 1846 | "color": "red", 1847 | "value": 80 1848 | } 1849 | ] 1850 | }, 1851 | "unit": "bytes" 1852 | }, 1853 | "overrides": [ 1854 | { 1855 | "matcher": { 1856 | "id": "byRegexp", 1857 | "options": "Max per instance .*" 1858 | }, 1859 | "properties": [ 1860 | { 1861 | "id": "custom.lineWidth", 1862 | "value": 3 1863 | }, 1864 | { 1865 | "id": "custom.drawStyle", 1866 | "value": "line" 1867 | }, 1868 | { 1869 | "id": "custom.stacking", 1870 | "value": { 1871 | "group": "A", 1872 | "mode": "none" 1873 | } 1874 | }, 1875 | { 1876 | "id": "custom.fillOpacity", 1877 | "value": 0 1878 | } 1879 | ] 1880 | }, 1881 | { 1882 | "matcher": { 1883 | "id": "byName", 1884 | "options": "Max per instance read+write" 1885 | }, 1886 | "properties": [ 1887 | { 1888 | "id": "color", 1889 | "value": { 1890 | "fixedColor": "text", 
1891 | "mode": "fixed" 1892 | } 1893 | } 1894 | ] 1895 | } 1896 | ] 1897 | }, 1898 | "gridPos": { 1899 | "h": 6, 1900 | "w": 12, 1901 | "x": 12, 1902 | "y": 21 1903 | }, 1904 | "id": 30, 1905 | "options": { 1906 | "legend": { 1907 | "calcs": [], 1908 | "displayMode": "list", 1909 | "placement": "bottom", 1910 | "showLegend": true 1911 | }, 1912 | "tooltip": { 1913 | "mode": "single", 1914 | "sort": "none" 1915 | } 1916 | }, 1917 | "targets": [ 1918 | { 1919 | "datasource": { 1920 | "type": "prometheus", 1921 | "uid": "${datasource}" 1922 | }, 1923 | "dimensions": {}, 1924 | "editorMode": "code", 1925 | "expr": "sum (rate(node_disk_read_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval]))", 1926 | "expression": "", 1927 | "id": "", 1928 | "label": "", 1929 | "legendFormat": "Read", 1930 | "matchExact": true, 1931 | "metricEditorMode": 0, 1932 | "metricName": "", 1933 | "metricQueryType": 0, 1934 | "namespace": "", 1935 | "period": "", 1936 | "queryMode": "Metrics", 1937 | "range": true, 1938 | "refId": "disk - total cluster read rate", 1939 | "region": "default", 1940 | "sqlExpression": "", 1941 | "statistic": "Average" 1942 | }, 1943 | { 1944 | "datasource": { 1945 | "type": "prometheus", 1946 | "uid": "${datasource}" 1947 | }, 1948 | "dimensions": {}, 1949 | "editorMode": "code", 1950 | "expr": "sum (rate(node_disk_written_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval]))", 1951 | "expression": "", 1952 | "hide": false, 1953 | "id": "", 1954 | "label": "", 1955 | "legendFormat": "Write", 1956 | "matchExact": true, 1957 | "metricEditorMode": 0, 1958 | "metricName": "", 1959 | "metricQueryType": 0, 1960 | "namespace": "", 1961 | "period": "", 1962 | "queryMode": "Metrics", 1963 | "range": true, 1964 | "refId": "disk - total cluster write rate", 1965 | "region": "default", 1966 | "sqlExpression": "", 1967 | "statistic": "Average" 1968 | }, 1969 | { 1970 | "datasource": { 1971 | "type": "prometheus", 1972 | "uid": 
"${datasource}" 1973 | }, 1974 | "dimensions": {}, 1975 | "editorMode": "code", 1976 | "expr": "max (rate(node_disk_read_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval]) + rate(node_disk_written_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval]))", 1977 | "expression": "", 1978 | "hide": false, 1979 | "id": "", 1980 | "label": "", 1981 | "legendFormat": "Max per instance read+write", 1982 | "matchExact": true, 1983 | "metricEditorMode": 0, 1984 | "metricName": "", 1985 | "metricQueryType": 0, 1986 | "namespace": "", 1987 | "period": "", 1988 | "queryMode": "Metrics", 1989 | "range": true, 1990 | "refId": "disk - max individual io (read+write)", 1991 | "region": "default", 1992 | "sqlExpression": "", 1993 | "statistic": "Average" 1994 | } 1995 | ], 1996 | "title": "Worker Disk Rate", 1997 | "type": "timeseries" 1998 | }, 1999 | { 2000 | "datasource": { 2001 | "type": "prometheus", 2002 | "uid": "${datasource}" 2003 | }, 2004 | "fieldConfig": { 2005 | "defaults": { 2006 | "color": { 2007 | "mode": "palette-classic" 2008 | }, 2009 | "custom": { 2010 | "axisCenteredZero": false, 2011 | "axisColorMode": "text", 2012 | "axisLabel": "", 2013 | "axisPlacement": "auto", 2014 | "barAlignment": 0, 2015 | "drawStyle": "line", 2016 | "fillOpacity": 40, 2017 | "gradientMode": "none", 2018 | "hideFrom": { 2019 | "legend": false, 2020 | "tooltip": false, 2021 | "viz": false 2022 | }, 2023 | "lineInterpolation": "linear", 2024 | "lineStyle": { 2025 | "fill": "solid" 2026 | }, 2027 | "lineWidth": 1, 2028 | "pointSize": 3, 2029 | "scaleDistribution": { 2030 | "type": "linear" 2031 | }, 2032 | "showPoints": "never", 2033 | "spanNulls": false, 2034 | "stacking": { 2035 | "group": "A", 2036 | "mode": "none" 2037 | }, 2038 | "thresholdsStyle": { 2039 | "mode": "off" 2040 | } 2041 | }, 2042 | "decimals": 0, 2043 | "mappings": [], 2044 | "thresholds": { 2045 | "mode": "absolute", 2046 | "steps": [ 2047 | { 2048 | "color": "green" 2049 | 
}, 2050 | { 2051 | "color": "red", 2052 | "value": 80 2053 | } 2054 | ] 2055 | }, 2056 | "unit": "none" 2057 | }, 2058 | "overrides": [ 2059 | { 2060 | "matcher": { 2061 | "id": "byName", 2062 | "options": "threads (for responding workers)" 2063 | }, 2064 | "properties": [ 2065 | { 2066 | "id": "color", 2067 | "value": { 2068 | "fixedColor": "dark-blue", 2069 | "mode": "fixed" 2070 | } 2071 | }, 2072 | { 2073 | "id": "custom.fillOpacity", 2074 | "value": 0 2075 | }, 2076 | { 2077 | "id": "custom.lineWidth", 2078 | "value": 2 2079 | } 2080 | ] 2081 | }, 2082 | { 2083 | "matcher": { 2084 | "id": "byName", 2085 | "options": "cores" 2086 | }, 2087 | "properties": [ 2088 | { 2089 | "id": "custom.fillOpacity", 2090 | "value": 0 2091 | }, 2092 | { 2093 | "id": "color", 2094 | "value": { 2095 | "fixedColor": "text", 2096 | "mode": "fixed" 2097 | } 2098 | }, 2099 | { 2100 | "id": "custom.lineWidth", 2101 | "value": 2 2102 | } 2103 | ] 2104 | }, 2105 | { 2106 | "matcher": { 2107 | "id": "byName", 2108 | "options": "executing" 2109 | }, 2110 | "properties": [ 2111 | { 2112 | "id": "color", 2113 | "value": { 2114 | "fixedColor": "green", 2115 | "mode": "fixed" 2116 | } 2117 | } 2118 | ] 2119 | } 2120 | ] 2121 | }, 2122 | "gridPos": { 2123 | "h": 6, 2124 | "w": 12, 2125 | "x": 0, 2126 | "y": 27 2127 | }, 2128 | "id": 13, 2129 | "options": { 2130 | "legend": { 2131 | "calcs": [], 2132 | "displayMode": "list", 2133 | "placement": "bottom", 2134 | "showLegend": true 2135 | }, 2136 | "tooltip": { 2137 | "mode": "single", 2138 | "sort": "none" 2139 | } 2140 | }, 2141 | "targets": [ 2142 | { 2143 | "datasource": { 2144 | "type": "prometheus", 2145 | "uid": "${datasource}" 2146 | }, 2147 | "editorMode": "code", 2148 | "exemplar": false, 2149 | "expr": "count(sum by (instance, cpu)(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\",spec=\"worker\"}[$__rate_interval]) > 0))", 2150 | "hide": false, 2151 | "instant": false, 2152 | "legendFormat": "cores", 2153 | "range": true, 2154 | 
"refId": "cores (host)" 2155 | }, 2156 | { 2157 | "datasource": { 2158 | "type": "prometheus", 2159 | "uid": "${datasource}" 2160 | }, 2161 | "dimensions": {}, 2162 | "editorMode": "code", 2163 | "expr": "sum by(state) (dask_worker_tasks{cluster_id=\"$cluster_id\",state=\"executing\"})\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 2164 | "expression": "", 2165 | "hide": false, 2166 | "id": "", 2167 | "label": "", 2168 | "legendFormat": "executing", 2169 | "matchExact": true, 2170 | "metricEditorMode": 0, 2171 | "metricName": "", 2172 | "metricQueryType": 0, 2173 | "namespace": "", 2174 | "period": "", 2175 | "queryMode": "Metrics", 2176 | "range": true, 2177 | "refId": "executing (dask)", 2178 | "region": "default", 2179 | "sqlExpression": "", 2180 | "statistic": "Average" 2181 | }, 2182 | { 2183 | "datasource": { 2184 | "type": "prometheus", 2185 | "uid": "${datasource}" 2186 | }, 2187 | "editorMode": "code", 2188 | "exemplar": false, 2189 | "expr": "sum(dask_worker_threads{cluster_id=\"$cluster_id\"})\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 2190 | "hide": false, 2191 | "instant": false, 2192 | "legendFormat": "threads (for responding workers)", 2193 | "range": true, 2194 | "refId": "threads (dask)" 2195 | } 2196 | ], 2197 | "title": "Cluster Occupancy", 2198 | "type": "timeseries" 2199 | }, 2200 | { 2201 | "datasource": { 2202 | "type": "prometheus", 2203 | "uid": "${datasource}" 2204 | }, 2205 | "fieldConfig": { 2206 | "defaults": { 2207 | "color": { 2208 | "mode": "palette-classic" 2209 | }, 2210 | "custom": { 2211 | "axisCenteredZero": false, 2212 | "axisColorMode": "text", 2213 | "axisLabel": "", 2214 | "axisPlacement": "auto", 2215 | "barAlignment": 0, 2216 | "drawStyle": "line", 2217 | "fillOpacity": 70, 2218 | 
"gradientMode": "none", 2219 | "hideFrom": { 2220 | "legend": false, 2221 | "tooltip": false, 2222 | "viz": false 2223 | }, 2224 | "lineInterpolation": "stepAfter", 2225 | "lineStyle": { 2226 | "fill": "solid" 2227 | }, 2228 | "lineWidth": 0, 2229 | "pointSize": 3, 2230 | "scaleDistribution": { 2231 | "type": "linear" 2232 | }, 2233 | "showPoints": "never", 2234 | "spanNulls": false, 2235 | "stacking": { 2236 | "group": "A", 2237 | "mode": "normal" 2238 | }, 2239 | "thresholdsStyle": { 2240 | "mode": "off" 2241 | } 2242 | }, 2243 | "mappings": [], 2244 | "thresholds": { 2245 | "mode": "absolute", 2246 | "steps": [ 2247 | { 2248 | "color": "green" 2249 | }, 2250 | { 2251 | "color": "red", 2252 | "value": 80 2253 | } 2254 | ] 2255 | }, 2256 | "unit": "none" 2257 | }, 2258 | "overrides": [ 2259 | { 2260 | "matcher": { 2261 | "id": "byName", 2262 | "options": "waiting" 2263 | }, 2264 | "properties": [ 2265 | { 2266 | "id": "color", 2267 | "value": { 2268 | "fixedColor": "yellow", 2269 | "mode": "fixed" 2270 | } 2271 | } 2272 | ] 2273 | }, 2274 | { 2275 | "matcher": { 2276 | "id": "byName", 2277 | "options": "processing" 2278 | }, 2279 | "properties": [ 2280 | { 2281 | "id": "color", 2282 | "value": { 2283 | "fixedColor": "green", 2284 | "mode": "fixed" 2285 | } 2286 | } 2287 | ] 2288 | }, 2289 | { 2290 | "matcher": { 2291 | "id": "byName", 2292 | "options": "memory" 2293 | }, 2294 | "properties": [ 2295 | { 2296 | "id": "color", 2297 | "value": { 2298 | "fixedColor": "yellow", 2299 | "mode": "fixed" 2300 | } 2301 | } 2302 | ] 2303 | }, 2304 | { 2305 | "matcher": { 2306 | "id": "byName", 2307 | "options": "erred" 2308 | }, 2309 | "properties": [ 2310 | { 2311 | "id": "color", 2312 | "value": { 2313 | "fixedColor": "dark-red", 2314 | "mode": "fixed" 2315 | } 2316 | } 2317 | ] 2318 | } 2319 | ] 2320 | }, 2321 | "gridPos": { 2322 | "h": 6, 2323 | "w": 12, 2324 | "x": 12, 2325 | "y": 27 2326 | }, 2327 | "id": 14, 2328 | "options": { 2329 | "legend": { 2330 | "calcs": [], 
2331 | "displayMode": "list", 2332 | "placement": "bottom", 2333 | "showLegend": true 2334 | }, 2335 | "tooltip": { 2336 | "mode": "single", 2337 | "sort": "none" 2338 | } 2339 | }, 2340 | "targets": [ 2341 | { 2342 | "datasource": { 2343 | "type": "prometheus", 2344 | "uid": "${datasource}" 2345 | }, 2346 | "dimensions": {}, 2347 | "editorMode": "code", 2348 | "expr": "sum by(state) (dask_scheduler_tasks{cluster_id=\"$cluster_id\"}) != 0\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 2349 | "expression": "", 2350 | "hide": false, 2351 | "id": "", 2352 | "label": "", 2353 | "legendFormat": "__auto", 2354 | "matchExact": true, 2355 | "metricEditorMode": 0, 2356 | "metricName": "", 2357 | "metricQueryType": 0, 2358 | "namespace": "", 2359 | "period": "", 2360 | "queryMode": "Metrics", 2361 | "range": true, 2362 | "refId": "scheduler task state counts", 2363 | "region": "default", 2364 | "sqlExpression": "", 2365 | "statistic": "Average" 2366 | } 2367 | ], 2368 | "title": "Task States (Scheduler)", 2369 | "type": "timeseries" 2370 | }, 2371 | { 2372 | "datasource": { 2373 | "type": "prometheus", 2374 | "uid": "${datasource}" 2375 | }, 2376 | "fieldConfig": { 2377 | "defaults": { 2378 | "color": { 2379 | "mode": "palette-classic" 2380 | }, 2381 | "custom": { 2382 | "axisCenteredZero": false, 2383 | "axisColorMode": "text", 2384 | "axisLabel": "", 2385 | "axisPlacement": "auto", 2386 | "barAlignment": 0, 2387 | "drawStyle": "line", 2388 | "fillOpacity": 80, 2389 | "gradientMode": "none", 2390 | "hideFrom": { 2391 | "legend": false, 2392 | "tooltip": false, 2393 | "viz": false 2394 | }, 2395 | "lineInterpolation": "stepAfter", 2396 | "lineWidth": 0, 2397 | "pointSize": 3, 2398 | "scaleDistribution": { 2399 | "type": "linear" 2400 | }, 2401 | "showPoints": "never", 2402 | "spanNulls": false, 2403 | "stacking": { 2404 | "group": "A", 2405 | "mode": "normal" 
2406 | }, 2407 | "thresholdsStyle": { 2408 | "mode": "off" 2409 | } 2410 | }, 2411 | "decimals": 0, 2412 | "mappings": [], 2413 | "thresholds": { 2414 | "mode": "absolute", 2415 | "steps": [ 2416 | { 2417 | "color": "green" 2418 | }, 2419 | { 2420 | "color": "red", 2421 | "value": 80 2422 | } 2423 | ] 2424 | }, 2425 | "unit": "none" 2426 | }, 2427 | "overrides": [ 2428 | { 2429 | "matcher": { 2430 | "id": "byName", 2431 | "options": "idle" 2432 | }, 2433 | "properties": [ 2434 | { 2435 | "id": "color", 2436 | "value": { 2437 | "fixedColor": "blue", 2438 | "mode": "fixed" 2439 | } 2440 | } 2441 | ] 2442 | }, 2443 | { 2444 | "matcher": { 2445 | "id": "byName", 2446 | "options": "saturated" 2447 | }, 2448 | "properties": [ 2449 | { 2450 | "id": "color", 2451 | "value": { 2452 | "fixedColor": "dark-red", 2453 | "mode": "fixed" 2454 | } 2455 | } 2456 | ] 2457 | }, 2458 | { 2459 | "matcher": { 2460 | "id": "byName", 2461 | "options": "neither idle nor saturated" 2462 | }, 2463 | "properties": [ 2464 | { 2465 | "id": "color", 2466 | "value": { 2467 | "fixedColor": "text", 2468 | "mode": "fixed" 2469 | } 2470 | } 2471 | ] 2472 | } 2473 | ] 2474 | }, 2475 | "gridPos": { 2476 | "h": 6, 2477 | "w": 12, 2478 | "x": 0, 2479 | "y": 33 2480 | }, 2481 | "id": 10, 2482 | "options": { 2483 | "legend": { 2484 | "calcs": [], 2485 | "displayMode": "list", 2486 | "placement": "bottom", 2487 | "showLegend": true 2488 | }, 2489 | "tooltip": { 2490 | "mode": "single", 2491 | "sort": "none" 2492 | } 2493 | }, 2494 | "targets": [ 2495 | { 2496 | "datasource": { 2497 | "type": "prometheus", 2498 | "uid": "${datasource}" 2499 | }, 2500 | "dimensions": {}, 2501 | "editorMode": "code", 2502 | "expr": "dask_scheduler_workers{cluster_id=\"$cluster_id\",state!=\"connected\"}\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 2503 | "expression": "", 2504 | "hide": false, 2505 | "id": "", 
2506 | "label": "", 2507 | "legendFormat": "{{state}}", 2508 | "matchExact": true, 2509 | "metricEditorMode": 0, 2510 | "metricName": "", 2511 | "metricQueryType": 0, 2512 | "namespace": "", 2513 | "period": "", 2514 | "queryMode": "Metrics", 2515 | "range": true, 2516 | "refId": "worker state counts", 2517 | "region": "default", 2518 | "sqlExpression": "", 2519 | "statistic": "Average" 2520 | }, 2521 | { 2522 | "datasource": { 2523 | "type": "prometheus", 2524 | "uid": "${datasource}" 2525 | }, 2526 | "dimensions": {}, 2527 | "editorMode": "code", 2528 | "expr": "(dask_scheduler_workers{cluster_id=\"$cluster_id\",state=\"connected\"} - on () sum(dask_scheduler_workers{cluster_id=\"$cluster_id\",state!=\"connected\"}))\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 2529 | "expression": "", 2530 | "hide": false, 2531 | "id": "", 2532 | "label": "", 2533 | "legendFormat": "neither idle nor saturated", 2534 | "matchExact": true, 2535 | "metricEditorMode": 0, 2536 | "metricName": "", 2537 | "metricQueryType": 0, 2538 | "namespace": "", 2539 | "period": "", 2540 | "queryMode": "Metrics", 2541 | "range": true, 2542 | "refId": "uncategorized worker state count", 2543 | "region": "default", 2544 | "sqlExpression": "", 2545 | "statistic": "Average" 2546 | } 2547 | ], 2548 | "title": "Worker Count", 2549 | "type": "timeseries" 2550 | }, 2551 | { 2552 | "datasource": { 2553 | "type": "prometheus", 2554 | "uid": "${datasource}" 2555 | }, 2556 | "fieldConfig": { 2557 | "defaults": { 2558 | "color": { 2559 | "mode": "palette-classic" 2560 | }, 2561 | "custom": { 2562 | "axisCenteredZero": false, 2563 | "axisColorMode": "text", 2564 | "axisLabel": "", 2565 | "axisPlacement": "auto", 2566 | "barAlignment": 0, 2567 | "drawStyle": "line", 2568 | "fillOpacity": 70, 2569 | "gradientMode": "none", 2570 | "hideFrom": { 2571 | "legend": false, 2572 | "tooltip": false, 2573 
| "viz": false 2574 | }, 2575 | "lineInterpolation": "stepAfter", 2576 | "lineStyle": { 2577 | "fill": "solid" 2578 | }, 2579 | "lineWidth": 0, 2580 | "pointSize": 3, 2581 | "scaleDistribution": { 2582 | "type": "linear" 2583 | }, 2584 | "showPoints": "never", 2585 | "spanNulls": false, 2586 | "stacking": { 2587 | "group": "A", 2588 | "mode": "normal" 2589 | }, 2590 | "thresholdsStyle": { 2591 | "mode": "off" 2592 | } 2593 | }, 2594 | "mappings": [], 2595 | "thresholds": { 2596 | "mode": "absolute", 2597 | "steps": [ 2598 | { 2599 | "color": "green" 2600 | }, 2601 | { 2602 | "color": "red", 2603 | "value": 80 2604 | } 2605 | ] 2606 | }, 2607 | "unit": "none" 2608 | }, 2609 | "overrides": [] 2610 | }, 2611 | "gridPos": { 2612 | "h": 6, 2613 | "w": 12, 2614 | "x": 12, 2615 | "y": 33 2616 | }, 2617 | "id": 48, 2618 | "options": { 2619 | "legend": { 2620 | "calcs": [], 2621 | "displayMode": "list", 2622 | "placement": "bottom", 2623 | "showLegend": true 2624 | }, 2625 | "tooltip": { 2626 | "mode": "single", 2627 | "sort": "none" 2628 | } 2629 | }, 2630 | "targets": [ 2631 | { 2632 | "datasource": { 2633 | "type": "prometheus", 2634 | "uid": "${datasource}" 2635 | }, 2636 | "dimensions": {}, 2637 | "editorMode": "code", 2638 | "expr": "sum by(state) (dask_worker_tasks{cluster_id=\"$cluster_id\",state!=\"executing\"}) != 0\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 2639 | "expression": "", 2640 | "id": "", 2641 | "label": "", 2642 | "legendFormat": "__auto", 2643 | "matchExact": true, 2644 | "metricEditorMode": 0, 2645 | "metricName": "", 2646 | "metricQueryType": 0, 2647 | "namespace": "", 2648 | "period": "", 2649 | "queryMode": "Metrics", 2650 | "range": true, 2651 | "refId": "worker task state counts", 2652 | "region": "default", 2653 | "sqlExpression": "", 2654 | "statistic": "Average" 2655 | } 2656 | ], 2657 | "title": "Task States (Workers)", 
2658 | "type": "timeseries" 2659 | }, 2660 | { 2661 | "collapsed": false, 2662 | "gridPos": { 2663 | "h": 1, 2664 | "w": 24, 2665 | "x": 0, 2666 | "y": 39 2667 | }, 2668 | "id": 58, 2669 | "panels": [], 2670 | "title": "Scheduler", 2671 | "type": "row" 2672 | }, 2673 | { 2674 | "datasource": { 2675 | "type": "prometheus", 2676 | "uid": "${datasource}" 2677 | }, 2678 | "fieldConfig": { 2679 | "defaults": { 2680 | "color": { 2681 | "mode": "palette-classic" 2682 | }, 2683 | "custom": { 2684 | "axisCenteredZero": false, 2685 | "axisColorMode": "text", 2686 | "axisLabel": "", 2687 | "axisPlacement": "auto", 2688 | "barAlignment": 0, 2689 | "drawStyle": "line", 2690 | "fillOpacity": 40, 2691 | "gradientMode": "none", 2692 | "hideFrom": { 2693 | "legend": false, 2694 | "tooltip": false, 2695 | "viz": false 2696 | }, 2697 | "lineInterpolation": "linear", 2698 | "lineWidth": 1, 2699 | "pointSize": 3, 2700 | "scaleDistribution": { 2701 | "type": "linear" 2702 | }, 2703 | "showPoints": "never", 2704 | "spanNulls": false, 2705 | "stacking": { 2706 | "group": "A", 2707 | "mode": "normal" 2708 | }, 2709 | "thresholdsStyle": { 2710 | "mode": "off" 2711 | } 2712 | }, 2713 | "mappings": [], 2714 | "max": 100, 2715 | "thresholds": { 2716 | "mode": "absolute", 2717 | "steps": [ 2718 | { 2719 | "color": "green" 2720 | }, 2721 | { 2722 | "color": "red", 2723 | "value": 80 2724 | } 2725 | ] 2726 | }, 2727 | "unit": "percent" 2728 | }, 2729 | "overrides": [ 2730 | { 2731 | "matcher": { 2732 | "id": "byFrameRefID", 2733 | "options": "max cpu" 2734 | }, 2735 | "properties": [ 2736 | { 2737 | "id": "custom.stacking", 2738 | "value": { 2739 | "group": "A", 2740 | "mode": "none" 2741 | } 2742 | }, 2743 | { 2744 | "id": "custom.fillOpacity", 2745 | "value": 0 2746 | } 2747 | ] 2748 | }, 2749 | { 2750 | "matcher": { 2751 | "id": "byName", 2752 | "options": "max single core util" 2753 | }, 2754 | "properties": [ 2755 | { 2756 | "id": "color", 2757 | "value": { 2758 | "fixedColor": "text", 2759 
| "mode": "fixed" 2760 | } 2761 | } 2762 | ] 2763 | } 2764 | ] 2765 | }, 2766 | "gridPos": { 2767 | "h": 6, 2768 | "w": 12, 2769 | "x": 0, 2770 | "y": 40 2771 | }, 2772 | "id": 23, 2773 | "links": [], 2774 | "options": { 2775 | "legend": { 2776 | "calcs": [], 2777 | "displayMode": "list", 2778 | "placement": "bottom", 2779 | "showLegend": true 2780 | }, 2781 | "tooltip": { 2782 | "mode": "single", 2783 | "sort": "none" 2784 | } 2785 | }, 2786 | "targets": [ 2787 | { 2788 | "datasource": { 2789 | "type": "prometheus", 2790 | "uid": "${datasource}" 2791 | }, 2792 | "dimensions": {}, 2793 | "editorMode": "code", 2794 | "expr": "avg by(mode) (rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\", mode!=\"idle\"}[$__rate_interval])) * 100", 2795 | "expression": "", 2796 | "hide": false, 2797 | "id": "", 2798 | "label": "", 2799 | "legendFormat": "__auto", 2800 | "matchExact": true, 2801 | "metricEditorMode": 0, 2802 | "metricName": "", 2803 | "metricQueryType": 0, 2804 | "namespace": "", 2805 | "period": "", 2806 | "queryMode": "Metrics", 2807 | "range": true, 2808 | "refId": "A", 2809 | "region": "default", 2810 | "sqlExpression": "", 2811 | "statistic": "Average" 2812 | }, 2813 | { 2814 | "datasource": { 2815 | "type": "prometheus", 2816 | "uid": "${datasource}" 2817 | }, 2818 | "dimensions": {}, 2819 | "editorMode": "code", 2820 | "expr": "max(sum by (cpu) (rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\", mode!=\"idle\"}[$__rate_interval]))) * 100", 2821 | "expression": "", 2822 | "hide": false, 2823 | "id": "", 2824 | "label": "", 2825 | "legendFormat": "max single core util", 2826 | "matchExact": true, 2827 | "metricEditorMode": 0, 2828 | "metricName": "", 2829 | "metricQueryType": 0, 2830 | "namespace": "", 2831 | "period": "", 2832 | "queryMode": "Metrics", 2833 | "range": true, 2834 | "refId": "max cpu", 2835 | "region": "default", 2836 | "sqlExpression": "", 2837 | "statistic": "Average" 2838 | } 2839 | ], 2840 | 
"title": "Scheduler CPU", 2841 | "type": "timeseries" 2842 | }, 2843 | { 2844 | "datasource": { 2845 | "type": "prometheus", 2846 | "uid": "${datasource}" 2847 | }, 2848 | "fieldConfig": { 2849 | "defaults": { 2850 | "color": { 2851 | "mode": "palette-classic" 2852 | }, 2853 | "custom": { 2854 | "axisCenteredZero": false, 2855 | "axisColorMode": "text", 2856 | "axisLabel": "", 2857 | "axisPlacement": "auto", 2858 | "barAlignment": 0, 2859 | "drawStyle": "bars", 2860 | "fillOpacity": 100, 2861 | "gradientMode": "none", 2862 | "hideFrom": { 2863 | "legend": false, 2864 | "tooltip": false, 2865 | "viz": false 2866 | }, 2867 | "lineInterpolation": "linear", 2868 | "lineWidth": 1, 2869 | "pointSize": 5, 2870 | "scaleDistribution": { 2871 | "type": "linear" 2872 | }, 2873 | "showPoints": "auto", 2874 | "spanNulls": false, 2875 | "stacking": { 2876 | "group": "A", 2877 | "mode": "none" 2878 | }, 2879 | "thresholdsStyle": { 2880 | "mode": "off" 2881 | } 2882 | }, 2883 | "mappings": [], 2884 | "thresholds": { 2885 | "mode": "absolute", 2886 | "steps": [ 2887 | { 2888 | "color": "green" 2889 | }, 2890 | { 2891 | "color": "red", 2892 | "value": 80 2893 | } 2894 | ] 2895 | }, 2896 | "unit": "bytes" 2897 | }, 2898 | "overrides": [] 2899 | }, 2900 | "gridPos": { 2901 | "h": 6, 2902 | "w": 12, 2903 | "x": 12, 2904 | "y": 40 2905 | }, 2906 | "id": 1, 2907 | "options": { 2908 | "legend": { 2909 | "calcs": [], 2910 | "displayMode": "list", 2911 | "placement": "bottom", 2912 | "showLegend": true 2913 | }, 2914 | "tooltip": { 2915 | "mode": "single", 2916 | "sort": "none" 2917 | } 2918 | }, 2919 | "targets": [ 2920 | { 2921 | "datasource": { 2922 | "type": "cloudwatch", 2923 | "uid": "${datasource}" 2924 | }, 2925 | "dimensions": { 2926 | "ClusterName": "$ClusterName", 2927 | "InstanceSpecName": "scheduler", 2928 | "interface": "ens5" 2929 | }, 2930 | "editorMode": "code", 2931 | "expr": "sum(rate(node_network_receive_bytes_total{cluster_id=\"$cluster_id\", 
spec=\"scheduler\"}[$__rate_interval]))", 2932 | "expression": "", 2933 | "format": "time_series", 2934 | "hide": false, 2935 | "id": "", 2936 | "key": "Q-5ce1a6dc-002b-4e23-9c02-76e07e13fb73-0", 2937 | "label": "", 2938 | "legendFormat": "Recv", 2939 | "matchExact": false, 2940 | "metricEditorMode": 0, 2941 | "metricName": "net_bytes_sent", 2942 | "metricQueryType": 0, 2943 | "namespace": "Coiled/Clusters", 2944 | "period": "$period", 2945 | "queryMode": "Metrics", 2946 | "range": true, 2947 | "refId": "Recv", 2948 | "region": "$region", 2949 | "sqlExpression": "", 2950 | "statistic": "Sum" 2951 | }, 2952 | { 2953 | "datasource": { 2954 | "type": "cloudwatch", 2955 | "uid": "${datasource}" 2956 | }, 2957 | "dimensions": { 2958 | "ClusterName": "$ClusterName", 2959 | "InstanceSpecName": "scheduler", 2960 | "interface": "ens5" 2961 | }, 2962 | "editorMode": "code", 2963 | "expr": "sum(rate(node_network_transmit_bytes_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval]))", 2964 | "expression": "", 2965 | "format": "time_series", 2966 | "hide": false, 2967 | "id": "", 2968 | "key": "Q-5ce1a6dc-002b-4e23-9c02-76e07e13fb73-0", 2969 | "label": "", 2970 | "legendFormat": "Sent", 2971 | "matchExact": false, 2972 | "metricEditorMode": 0, 2973 | "metricName": "net_bytes_sent", 2974 | "metricQueryType": 0, 2975 | "namespace": "Coiled/Clusters", 2976 | "period": "$period", 2977 | "queryMode": "Metrics", 2978 | "range": true, 2979 | "refId": "Sent", 2980 | "region": "$region", 2981 | "sqlExpression": "", 2982 | "statistic": "Sum" 2983 | } 2984 | ], 2985 | "title": "Scheduler Bytes Sent/Recv Rate", 2986 | "type": "timeseries" 2987 | }, 2988 | { 2989 | "datasource": { 2990 | "type": "prometheus", 2991 | "uid": "${datasource}" 2992 | }, 2993 | "fieldConfig": { 2994 | "defaults": { 2995 | "color": { 2996 | "mode": "palette-classic" 2997 | }, 2998 | "custom": { 2999 | "axisCenteredZero": false, 3000 | "axisColorMode": "text", 3001 | "axisLabel": "", 3002 | 
"axisPlacement": "auto", 3003 | "barAlignment": 0, 3004 | "drawStyle": "line", 3005 | "fillOpacity": 0, 3006 | "gradientMode": "none", 3007 | "hideFrom": { 3008 | "legend": false, 3009 | "tooltip": false, 3010 | "viz": false 3011 | }, 3012 | "lineInterpolation": "linear", 3013 | "lineWidth": 1, 3014 | "pointSize": 5, 3015 | "scaleDistribution": { 3016 | "type": "linear" 3017 | }, 3018 | "showPoints": "never", 3019 | "spanNulls": false, 3020 | "stacking": { 3021 | "group": "A", 3022 | "mode": "none" 3023 | }, 3024 | "thresholdsStyle": { 3025 | "mode": "off" 3026 | } 3027 | }, 3028 | "mappings": [], 3029 | "max": 1, 3030 | "min": 0, 3031 | "thresholds": { 3032 | "mode": "absolute", 3033 | "steps": [ 3034 | { 3035 | "color": "green" 3036 | }, 3037 | { 3038 | "color": "red", 3039 | "value": 80 3040 | } 3041 | ] 3042 | }, 3043 | "unit": "percentunit" 3044 | }, 3045 | "overrides": [] 3046 | }, 3047 | "gridPos": { 3048 | "h": 6, 3049 | "w": 12, 3050 | "x": 0, 3051 | "y": 46 3052 | }, 3053 | "id": 45, 3054 | "options": { 3055 | "legend": { 3056 | "calcs": [], 3057 | "displayMode": "list", 3058 | "placement": "bottom", 3059 | "showLegend": true 3060 | }, 3061 | "tooltip": { 3062 | "mode": "single", 3063 | "sort": "none" 3064 | } 3065 | }, 3066 | "targets": [ 3067 | { 3068 | "datasource": { 3069 | "type": "prometheus", 3070 | "uid": "${datasource}" 3071 | }, 3072 | "editorMode": "code", 3073 | "expr": "\n (\n node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\", spec=\"scheduler\"}\n - node_memory_MemAvailable_bytes{cluster_id=\"$cluster_id\", spec=\"scheduler\"}\n ) / node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\", spec=\"scheduler\"}\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 3074 | "legendFormat": "Non-Free Memory", 3075 | "range": true, 3076 | "refId": "A" 3077 | } 3078 | ], 3079 | "title": "Scheduler Memory", 3080 | "type": "timeseries" 3081 | 
}, 3082 | { 3083 | "datasource": { 3084 | "type": "prometheus", 3085 | "uid": "${datasource}" 3086 | }, 3087 | "fieldConfig": { 3088 | "defaults": { 3089 | "color": { 3090 | "mode": "palette-classic" 3091 | }, 3092 | "custom": { 3093 | "axisCenteredZero": false, 3094 | "axisColorMode": "text", 3095 | "axisLabel": "", 3096 | "axisPlacement": "auto", 3097 | "barAlignment": 0, 3098 | "drawStyle": "line", 3099 | "fillOpacity": 40, 3100 | "gradientMode": "none", 3101 | "hideFrom": { 3102 | "legend": false, 3103 | "tooltip": false, 3104 | "viz": false 3105 | }, 3106 | "lineInterpolation": "linear", 3107 | "lineWidth": 1, 3108 | "pointSize": 5, 3109 | "scaleDistribution": { 3110 | "type": "linear" 3111 | }, 3112 | "showPoints": "never", 3113 | "spanNulls": false, 3114 | "stacking": { 3115 | "group": "A", 3116 | "mode": "none" 3117 | }, 3118 | "thresholdsStyle": { 3119 | "mode": "off" 3120 | } 3121 | }, 3122 | "mappings": [], 3123 | "thresholds": { 3124 | "mode": "absolute", 3125 | "steps": [ 3126 | { 3127 | "color": "green" 3128 | }, 3129 | { 3130 | "color": "red", 3131 | "value": 80 3132 | } 3133 | ] 3134 | }, 3135 | "unit": "bytes" 3136 | }, 3137 | "overrides": [ 3138 | { 3139 | "matcher": { 3140 | "id": "byName", 3141 | "options": "Recv (avg)" 3142 | }, 3143 | "properties": [ 3144 | { 3145 | "id": "custom.stacking", 3146 | "value": { 3147 | "group": "A", 3148 | "mode": "none" 3149 | } 3150 | }, 3151 | { 3152 | "id": "custom.drawStyle", 3153 | "value": "line" 3154 | }, 3155 | { 3156 | "id": "custom.fillOpacity", 3157 | "value": 0 3158 | }, 3159 | { 3160 | "id": "custom.lineWidth", 3161 | "value": 3 3162 | }, 3163 | { 3164 | "id": "color", 3165 | "value": { 3166 | "fixedColor": "dark-purple", 3167 | "mode": "fixed" 3168 | } 3169 | } 3170 | ] 3171 | }, 3172 | { 3173 | "matcher": { 3174 | "id": "byName", 3175 | "options": "Recv (max)" 3176 | }, 3177 | "properties": [ 3178 | { 3179 | "id": "custom.stacking", 3180 | "value": { 3181 | "group": "A", 3182 | "mode": "none" 3183 
| } 3184 | }, 3185 | { 3186 | "id": "custom.drawStyle", 3187 | "value": "line" 3188 | }, 3189 | { 3190 | "id": "custom.lineWidth", 3191 | "value": 2 3192 | }, 3193 | { 3194 | "id": "custom.fillOpacity", 3195 | "value": 0 3196 | } 3197 | ] 3198 | }, 3199 | { 3200 | "matcher": { 3201 | "id": "byName", 3202 | "options": "Sent" 3203 | }, 3204 | "properties": [ 3205 | { 3206 | "id": "color", 3207 | "value": { 3208 | "fixedColor": "dark-green", 3209 | "mode": "fixed" 3210 | } 3211 | } 3212 | ] 3213 | }, 3214 | { 3215 | "matcher": { 3216 | "id": "byName", 3217 | "options": "Recv (after docker pull)" 3218 | }, 3219 | "properties": [ 3220 | { 3221 | "id": "color", 3222 | "value": { 3223 | "fixedColor": "blue", 3224 | "mode": "fixed" 3225 | } 3226 | } 3227 | ] 3228 | } 3229 | ] 3230 | }, 3231 | "gridPos": { 3232 | "h": 6, 3233 | "w": 12, 3234 | "x": 12, 3235 | "y": 46 3236 | }, 3237 | "id": 56, 3238 | "options": { 3239 | "legend": { 3240 | "calcs": [], 3241 | "displayMode": "list", 3242 | "placement": "bottom", 3243 | "showLegend": true 3244 | }, 3245 | "tooltip": { 3246 | "mode": "single", 3247 | "sort": "none" 3248 | } 3249 | }, 3250 | "targets": [ 3251 | { 3252 | "datasource": { 3253 | "type": "prometheus", 3254 | "uid": "${datasource}" 3255 | }, 3256 | "dimensions": {}, 3257 | "editorMode": "code", 3258 | "expr": "node_network_transmit_bytes_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 3259 | "expression": "", 3260 | "hide": false, 3261 | "id": "", 3262 | "label": "", 3263 | "legendFormat": "Sent", 3264 | "matchExact": true, 3265 | "metricEditorMode": 0, 3266 | "metricName": "", 3267 | "metricQueryType": 0, 3268 | "namespace": "", 3269 | "period": "", 3270 | "queryMode": "Metrics", 3271 | "range": true, 3272 | "refId": "A", 3273 | "region": "default", 3274 | "sqlExpression": "", 3275 | "statistic": 
"Average" 3276 | }, 3277 | { 3278 | "datasource": { 3279 | "type": "prometheus", 3280 | "uid": "${datasource}" 3281 | }, 3282 | "dimensions": {}, 3283 | "editorMode": "code", 3284 | "expr": "-(node_network_receive_bytes_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"} - min_over_time(node_network_receive_bytes_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[12h]))\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 3285 | "expression": "", 3286 | "hide": false, 3287 | "id": "", 3288 | "label": "", 3289 | "legendFormat": "Recv (after docker pull)", 3290 | "matchExact": true, 3291 | "metricEditorMode": 0, 3292 | "metricName": "", 3293 | "metricQueryType": 0, 3294 | "namespace": "", 3295 | "period": "", 3296 | "queryMode": "Metrics", 3297 | "range": true, 3298 | "refId": "B", 3299 | "region": "default", 3300 | "sqlExpression": "", 3301 | "statistic": "Average" 3302 | } 3303 | ], 3304 | "title": "Scheduler Net Sent/Recv Total", 3305 | "type": "timeseries" 3306 | }, 3307 | { 3308 | "collapsed": false, 3309 | "gridPos": { 3310 | "h": 1, 3311 | "w": 24, 3312 | "x": 0, 3313 | "y": 52 3314 | }, 3315 | "id": 62, 3316 | "panels": [], 3317 | "title": "Worker — Disk/Network Details", 3318 | "type": "row" 3319 | }, 3320 | { 3321 | "datasource": { 3322 | "type": "prometheus", 3323 | "uid": "${datasource}" 3324 | }, 3325 | "fieldConfig": { 3326 | "defaults": { 3327 | "color": { 3328 | "mode": "palette-classic" 3329 | }, 3330 | "custom": { 3331 | "axisCenteredZero": false, 3332 | "axisColorMode": "text", 3333 | "axisLabel": "", 3334 | "axisPlacement": "auto", 3335 | "barAlignment": 0, 3336 | "drawStyle": "line", 3337 | "fillOpacity": 0, 3338 | "gradientMode": "none", 3339 | "hideFrom": { 3340 | "legend": false, 3341 | "tooltip": false, 3342 | "viz": false 3343 | }, 3344 | "lineInterpolation": "linear", 3345 | "lineWidth": 1, 3346 | "pointSize": 5, 3347 | 
"scaleDistribution": { 3348 | "type": "linear" 3349 | }, 3350 | "showPoints": "never", 3351 | "spanNulls": false, 3352 | "stacking": { 3353 | "group": "A", 3354 | "mode": "none" 3355 | }, 3356 | "thresholdsStyle": { 3357 | "mode": "line" 3358 | } 3359 | }, 3360 | "decimals": 1, 3361 | "mappings": [], 3362 | "thresholds": { 3363 | "mode": "absolute", 3364 | "steps": [ 3365 | { 3366 | "color": "green" 3367 | }, 3368 | { 3369 | "color": "red", 3370 | "value": 90 3371 | } 3372 | ] 3373 | }, 3374 | "unit": "percentunit" 3375 | }, 3376 | "overrides": [] 3377 | }, 3378 | "gridPos": { 3379 | "h": 6, 3380 | "w": 12, 3381 | "x": 0, 3382 | "y": 53 3383 | }, 3384 | "id": 9, 3385 | "options": { 3386 | "legend": { 3387 | "calcs": [], 3388 | "displayMode": "list", 3389 | "placement": "bottom", 3390 | "showLegend": true 3391 | }, 3392 | "tooltip": { 3393 | "mode": "single", 3394 | "sort": "none" 3395 | } 3396 | }, 3397 | "targets": [ 3398 | { 3399 | "datasource": { 3400 | "type": "prometheus", 3401 | "uid": "${datasource}" 3402 | }, 3403 | "dimensions": {}, 3404 | "editorMode": "code", 3405 | "exemplar": false, 3406 | "expr": "(node_filesystem_size_bytes{cluster_id=\"$cluster_id\", mountpoint=\"/\", spec=\"worker\"} - node_filesystem_free_bytes{cluster_id=\"$cluster_id\", mountpoint=\"/\", spec=\"worker\"})/node_filesystem_size_bytes{cluster_id=\"$cluster_id\", mountpoint=\"/\", spec=\"worker\"}", 3407 | "expression": "", 3408 | "id": "", 3409 | "instant": false, 3410 | "label": "", 3411 | "legendFormat": "{{coiled_instance}}", 3412 | "matchExact": true, 3413 | "metricEditorMode": 0, 3414 | "metricName": "", 3415 | "metricQueryType": 0, 3416 | "namespace": "", 3417 | "period": "", 3418 | "queryMode": "Metrics", 3419 | "range": true, 3420 | "refId": "A", 3421 | "region": "default", 3422 | "sqlExpression": "", 3423 | "statistic": "Average" 3424 | } 3425 | ], 3426 | "title": "Worker Disk Used (Attached Disk)", 3427 | "type": "timeseries" 3428 | }, 3429 | { 3430 | "datasource": { 3431 
| "type": "prometheus", 3432 | "uid": "${datasource}" 3433 | }, 3434 | "fieldConfig": { 3435 | "defaults": { 3436 | "color": { 3437 | "mode": "palette-classic" 3438 | }, 3439 | "custom": { 3440 | "axisCenteredZero": false, 3441 | "axisColorMode": "text", 3442 | "axisLabel": "", 3443 | "axisPlacement": "auto", 3444 | "barAlignment": 0, 3445 | "drawStyle": "line", 3446 | "fillOpacity": 0, 3447 | "gradientMode": "none", 3448 | "hideFrom": { 3449 | "legend": false, 3450 | "tooltip": false, 3451 | "viz": false 3452 | }, 3453 | "lineInterpolation": "linear", 3454 | "lineWidth": 1, 3455 | "pointSize": 5, 3456 | "scaleDistribution": { 3457 | "type": "linear" 3458 | }, 3459 | "showPoints": "never", 3460 | "spanNulls": false, 3461 | "stacking": { 3462 | "group": "A", 3463 | "mode": "none" 3464 | }, 3465 | "thresholdsStyle": { 3466 | "mode": "line" 3467 | } 3468 | }, 3469 | "decimals": 1, 3470 | "mappings": [], 3471 | "thresholds": { 3472 | "mode": "absolute", 3473 | "steps": [ 3474 | { 3475 | "color": "green" 3476 | }, 3477 | { 3478 | "color": "red", 3479 | "value": 90 3480 | } 3481 | ] 3482 | }, 3483 | "unit": "percentunit" 3484 | }, 3485 | "overrides": [] 3486 | }, 3487 | "gridPos": { 3488 | "h": 6, 3489 | "w": 12, 3490 | "x": 12, 3491 | "y": 53 3492 | }, 3493 | "id": 55, 3494 | "options": { 3495 | "legend": { 3496 | "calcs": [], 3497 | "displayMode": "list", 3498 | "placement": "bottom", 3499 | "showLegend": true 3500 | }, 3501 | "tooltip": { 3502 | "mode": "single", 3503 | "sort": "none" 3504 | } 3505 | }, 3506 | "targets": [ 3507 | { 3508 | "datasource": { 3509 | "type": "prometheus", 3510 | "uid": "${datasource}" 3511 | }, 3512 | "dimensions": {}, 3513 | "editorMode": "code", 3514 | "exemplar": false, 3515 | "expr": "(node_filesystem_size_bytes{cluster_id=\"$cluster_id\", mountpoint=\"/scratch\", spec=\"worker\"} - node_filesystem_free_bytes{cluster_id=\"$cluster_id\", mountpoint=\"/scratch\", spec=\"worker\"})/node_filesystem_size_bytes{cluster_id=\"$cluster_id\", 
mountpoint=\"/scratch\", spec=\"worker\"}", 3516 | "expression": "", 3517 | "id": "", 3518 | "instant": false, 3519 | "label": "", 3520 | "legendFormat": "{{coiled_instance}}", 3521 | "matchExact": true, 3522 | "metricEditorMode": 0, 3523 | "metricName": "", 3524 | "metricQueryType": 0, 3525 | "namespace": "", 3526 | "period": "", 3527 | "queryMode": "Metrics", 3528 | "range": true, 3529 | "refId": "A", 3530 | "region": "default", 3531 | "sqlExpression": "", 3532 | "statistic": "Average" 3533 | } 3534 | ], 3535 | "title": "Worker Scratch Used (NVMe)", 3536 | "type": "timeseries" 3537 | }, 3538 | { 3539 | "datasource": { 3540 | "type": "prometheus", 3541 | "uid": "${datasource}" 3542 | }, 3543 | "fieldConfig": { 3544 | "defaults": { 3545 | "color": { 3546 | "mode": "palette-classic" 3547 | }, 3548 | "custom": { 3549 | "axisCenteredZero": false, 3550 | "axisColorMode": "text", 3551 | "axisLabel": "", 3552 | "axisPlacement": "auto", 3553 | "barAlignment": 0, 3554 | "drawStyle": "line", 3555 | "fillOpacity": 20, 3556 | "gradientMode": "none", 3557 | "hideFrom": { 3558 | "legend": false, 3559 | "tooltip": false, 3560 | "viz": false 3561 | }, 3562 | "lineInterpolation": "linear", 3563 | "lineWidth": 1, 3564 | "pointSize": 5, 3565 | "scaleDistribution": { 3566 | "type": "linear" 3567 | }, 3568 | "showPoints": "never", 3569 | "spanNulls": false, 3570 | "stacking": { 3571 | "group": "A", 3572 | "mode": "none" 3573 | }, 3574 | "thresholdsStyle": { 3575 | "mode": "off" 3576 | } 3577 | }, 3578 | "mappings": [], 3579 | "thresholds": { 3580 | "mode": "absolute", 3581 | "steps": [ 3582 | { 3583 | "color": "green" 3584 | }, 3585 | { 3586 | "color": "red", 3587 | "value": 80 3588 | } 3589 | ] 3590 | }, 3591 | "unit": "bytes" 3592 | }, 3593 | "overrides": [ 3594 | { 3595 | "matcher": { 3596 | "id": "byName", 3597 | "options": "Recv (avg)" 3598 | }, 3599 | "properties": [ 3600 | { 3601 | "id": "custom.stacking", 3602 | "value": { 3603 | "group": "A", 3604 | "mode": "none" 3605 | } 
3606 | }, 3607 | { 3608 | "id": "custom.drawStyle", 3609 | "value": "line" 3610 | }, 3611 | { 3612 | "id": "custom.fillOpacity", 3613 | "value": 0 3614 | }, 3615 | { 3616 | "id": "custom.lineWidth", 3617 | "value": 3 3618 | }, 3619 | { 3620 | "id": "color", 3621 | "value": { 3622 | "fixedColor": "dark-purple", 3623 | "mode": "fixed" 3624 | } 3625 | } 3626 | ] 3627 | }, 3628 | { 3629 | "matcher": { 3630 | "id": "byName", 3631 | "options": "Recv (max)" 3632 | }, 3633 | "properties": [ 3634 | { 3635 | "id": "custom.stacking", 3636 | "value": { 3637 | "group": "A", 3638 | "mode": "none" 3639 | } 3640 | }, 3641 | { 3642 | "id": "custom.drawStyle", 3643 | "value": "line" 3644 | }, 3645 | { 3646 | "id": "custom.lineWidth", 3647 | "value": 2 3648 | }, 3649 | { 3650 | "id": "custom.fillOpacity", 3651 | "value": 0 3652 | } 3653 | ] 3654 | } 3655 | ] 3656 | }, 3657 | "gridPos": { 3658 | "h": 6, 3659 | "w": 12, 3660 | "x": 0, 3661 | "y": 59 3662 | }, 3663 | "id": 52, 3664 | "options": { 3665 | "legend": { 3666 | "calcs": [], 3667 | "displayMode": "list", 3668 | "placement": "bottom", 3669 | "showLegend": true 3670 | }, 3671 | "tooltip": { 3672 | "mode": "single", 3673 | "sort": "none" 3674 | } 3675 | }, 3676 | "targets": [ 3677 | { 3678 | "datasource": { 3679 | "type": "prometheus", 3680 | "uid": "${datasource}" 3681 | }, 3682 | "dimensions": {}, 3683 | "editorMode": "code", 3684 | "expr": "max (rate(node_network_transmit_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval]))", 3685 | "expression": "", 3686 | "id": "", 3687 | "label": "", 3688 | "legendFormat": "Sent", 3689 | "matchExact": true, 3690 | "metricEditorMode": 0, 3691 | "metricName": "", 3692 | "metricQueryType": 0, 3693 | "namespace": "", 3694 | "period": "", 3695 | "queryMode": "Metrics", 3696 | "range": true, 3697 | "refId": "A", 3698 | "region": "default", 3699 | "sqlExpression": "", 3700 | "statistic": "Average" 3701 | }, 3702 | { 3703 | "datasource": { 3704 | "type": "prometheus", 3705 | 
"uid": "${datasource}" 3706 | }, 3707 | "dimensions": {}, 3708 | "editorMode": "code", 3709 | "expr": "max (rate(node_network_receive_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"}[$__rate_interval]))", 3710 | "expression": "", 3711 | "hide": false, 3712 | "id": "", 3713 | "label": "", 3714 | "legendFormat": "Recv", 3715 | "matchExact": true, 3716 | "metricEditorMode": 0, 3717 | "metricName": "", 3718 | "metricQueryType": 0, 3719 | "namespace": "", 3720 | "period": "", 3721 | "queryMode": "Metrics", 3722 | "range": true, 3723 | "refId": "B", 3724 | "region": "default", 3725 | "sqlExpression": "", 3726 | "statistic": "Average" 3727 | } 3728 | ], 3729 | "title": "Worker Net Sent/Recv Max Rate", 3730 | "type": "timeseries" 3731 | }, 3732 | { 3733 | "datasource": { 3734 | "type": "prometheus", 3735 | "uid": "${datasource}" 3736 | }, 3737 | "fieldConfig": { 3738 | "defaults": { 3739 | "color": { 3740 | "mode": "palette-classic" 3741 | }, 3742 | "custom": { 3743 | "axisCenteredZero": false, 3744 | "axisColorMode": "text", 3745 | "axisLabel": "", 3746 | "axisPlacement": "auto", 3747 | "barAlignment": 0, 3748 | "drawStyle": "line", 3749 | "fillOpacity": 20, 3750 | "gradientMode": "none", 3751 | "hideFrom": { 3752 | "legend": false, 3753 | "tooltip": false, 3754 | "viz": false 3755 | }, 3756 | "lineInterpolation": "linear", 3757 | "lineWidth": 1, 3758 | "pointSize": 5, 3759 | "scaleDistribution": { 3760 | "type": "linear" 3761 | }, 3762 | "showPoints": "never", 3763 | "spanNulls": false, 3764 | "stacking": { 3765 | "group": "A", 3766 | "mode": "none" 3767 | }, 3768 | "thresholdsStyle": { 3769 | "mode": "off" 3770 | } 3771 | }, 3772 | "mappings": [], 3773 | "thresholds": { 3774 | "mode": "absolute", 3775 | "steps": [ 3776 | { 3777 | "color": "green" 3778 | }, 3779 | { 3780 | "color": "red", 3781 | "value": 80 3782 | } 3783 | ] 3784 | }, 3785 | "unit": "bytes" 3786 | }, 3787 | "overrides": [ 3788 | { 3789 | "matcher": { 3790 | "id": "byName", 3791 | "options": "Recv 
(avg)" 3792 | }, 3793 | "properties": [ 3794 | { 3795 | "id": "custom.stacking", 3796 | "value": { 3797 | "group": "A", 3798 | "mode": "none" 3799 | } 3800 | }, 3801 | { 3802 | "id": "custom.drawStyle", 3803 | "value": "line" 3804 | }, 3805 | { 3806 | "id": "custom.fillOpacity", 3807 | "value": 0 3808 | }, 3809 | { 3810 | "id": "custom.lineWidth", 3811 | "value": 3 3812 | }, 3813 | { 3814 | "id": "color", 3815 | "value": { 3816 | "fixedColor": "dark-purple", 3817 | "mode": "fixed" 3818 | } 3819 | } 3820 | ] 3821 | }, 3822 | { 3823 | "matcher": { 3824 | "id": "byName", 3825 | "options": "Recv (max)" 3826 | }, 3827 | "properties": [ 3828 | { 3829 | "id": "custom.stacking", 3830 | "value": { 3831 | "group": "A", 3832 | "mode": "none" 3833 | } 3834 | }, 3835 | { 3836 | "id": "custom.drawStyle", 3837 | "value": "line" 3838 | }, 3839 | { 3840 | "id": "custom.lineWidth", 3841 | "value": 2 3842 | }, 3843 | { 3844 | "id": "custom.fillOpacity", 3845 | "value": 0 3846 | } 3847 | ] 3848 | } 3849 | ] 3850 | }, 3851 | "gridPos": { 3852 | "h": 6, 3853 | "w": 12, 3854 | "x": 12, 3855 | "y": 59 3856 | }, 3857 | "id": 34, 3858 | "options": { 3859 | "legend": { 3860 | "calcs": [], 3861 | "displayMode": "list", 3862 | "placement": "bottom", 3863 | "showLegend": true 3864 | }, 3865 | "tooltip": { 3866 | "mode": "single", 3867 | "sort": "none" 3868 | } 3869 | }, 3870 | "targets": [ 3871 | { 3872 | "datasource": { 3873 | "type": "prometheus", 3874 | "uid": "${datasource}" 3875 | }, 3876 | "dimensions": {}, 3877 | "editorMode": "code", 3878 | "expr": "sum(node_network_transmit_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"})\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 3879 | "expression": "", 3880 | "id": "", 3881 | "label": "", 3882 | "legendFormat": "Sent", 3883 | "matchExact": true, 3884 | "metricEditorMode": 0, 3885 | "metricName": "", 3886 | 
"metricQueryType": 0, 3887 | "namespace": "", 3888 | "period": "", 3889 | "queryMode": "Metrics", 3890 | "range": true, 3891 | "refId": "A", 3892 | "region": "default", 3893 | "sqlExpression": "", 3894 | "statistic": "Average" 3895 | }, 3896 | { 3897 | "datasource": { 3898 | "type": "prometheus", 3899 | "uid": "${datasource}" 3900 | }, 3901 | "dimensions": {}, 3902 | "editorMode": "code", 3903 | "expr": "sum(node_network_receive_bytes_total{cluster_id=\"$cluster_id\", spec=\"worker\"})\n# only trust metric when scheduler is running\nand on () sum(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval])) > 0", 3904 | "expression": "", 3905 | "hide": false, 3906 | "id": "", 3907 | "label": "", 3908 | "legendFormat": "Recv", 3909 | "matchExact": true, 3910 | "metricEditorMode": 0, 3911 | "metricName": "", 3912 | "metricQueryType": 0, 3913 | "namespace": "", 3914 | "period": "", 3915 | "queryMode": "Metrics", 3916 | "range": true, 3917 | "refId": "B", 3918 | "region": "default", 3919 | "sqlExpression": "", 3920 | "statistic": "Average" 3921 | } 3922 | ], 3923 | "title": "Worker Net Sent/Recv Total", 3924 | "type": "timeseries" 3925 | }, 3926 | { 3927 | "collapsed": false, 3928 | "gridPos": { 3929 | "h": 1, 3930 | "w": 24, 3931 | "x": 0, 3932 | "y": 65 3933 | }, 3934 | "id": 64, 3935 | "panels": [], 3936 | "title": "Worker — Memory/CPU Details", 3937 | "type": "row" 3938 | }, 3939 | { 3940 | "datasource": { 3941 | "type": "prometheus", 3942 | "uid": "${datasource}" 3943 | }, 3944 | "fieldConfig": { 3945 | "defaults": { 3946 | "color": { 3947 | "mode": "palette-classic" 3948 | }, 3949 | "custom": { 3950 | "axisCenteredZero": false, 3951 | "axisColorMode": "text", 3952 | "axisLabel": "", 3953 | "axisPlacement": "auto", 3954 | "barAlignment": 0, 3955 | "drawStyle": "line", 3956 | "fillOpacity": 40, 3957 | "gradientMode": "none", 3958 | "hideFrom": { 3959 | "legend": false, 3960 | "tooltip": false, 3961 | "viz": false 3962 | }, 3963 | 
"lineInterpolation": "linear", 3964 | "lineWidth": 1, 3965 | "pointSize": 3, 3966 | "scaleDistribution": { 3967 | "type": "linear" 3968 | }, 3969 | "showPoints": "never", 3970 | "spanNulls": false, 3971 | "stacking": { 3972 | "group": "A", 3973 | "mode": "normal" 3974 | }, 3975 | "thresholdsStyle": { 3976 | "mode": "off" 3977 | } 3978 | }, 3979 | "mappings": [], 3980 | "thresholds": { 3981 | "mode": "absolute", 3982 | "steps": [ 3983 | { 3984 | "color": "green" 3985 | }, 3986 | { 3987 | "color": "red", 3988 | "value": 80 3989 | } 3990 | ] 3991 | }, 3992 | "unit": "percent" 3993 | }, 3994 | "overrides": [ 3995 | { 3996 | "matcher": { 3997 | "id": "byFrameRefID", 3998 | "options": "max cpu" 3999 | }, 4000 | "properties": [ 4001 | { 4002 | "id": "custom.stacking", 4003 | "value": { 4004 | "group": "A", 4005 | "mode": "none" 4006 | } 4007 | }, 4008 | { 4009 | "id": "custom.fillOpacity", 4010 | "value": 0 4011 | } 4012 | ] 4013 | }, 4014 | { 4015 | "matcher": { 4016 | "id": "byName", 4017 | "options": "max single core util" 4018 | }, 4019 | "properties": [ 4020 | { 4021 | "id": "color", 4022 | "value": { 4023 | "fixedColor": "text", 4024 | "mode": "fixed" 4025 | } 4026 | } 4027 | ] 4028 | }, 4029 | { 4030 | "matcher": { 4031 | "id": "byName", 4032 | "options": "cores" 4033 | }, 4034 | "properties": [ 4035 | { 4036 | "id": "custom.stacking", 4037 | "value": { 4038 | "group": "A", 4039 | "mode": "none" 4040 | } 4041 | }, 4042 | { 4043 | "id": "color", 4044 | "value": { 4045 | "fixedColor": "text", 4046 | "mode": "fixed" 4047 | } 4048 | }, 4049 | { 4050 | "id": "custom.fillOpacity", 4051 | "value": 0 4052 | }, 4053 | { 4054 | "id": "custom.lineWidth", 4055 | "value": 2 4056 | } 4057 | ] 4058 | }, 4059 | { 4060 | "matcher": { 4061 | "id": "byName", 4062 | "options": "single-core capacity" 4063 | }, 4064 | "properties": [ 4065 | { 4066 | "id": "custom.stacking", 4067 | "value": { 4068 | "group": "A", 4069 | "mode": "none" 4070 | } 4071 | }, 4072 | { 4073 | "id": "color", 
4074 | "value": { 4075 | "fixedColor": "text", 4076 | "mode": "fixed" 4077 | } 4078 | } 4079 | ] 4080 | } 4081 | ] 4082 | }, 4083 | "gridPos": { 4084 | "h": 6, 4085 | "w": 24, 4086 | "x": 0, 4087 | "y": 66 4088 | }, 4089 | "id": 93, 4090 | "links": [], 4091 | "options": { 4092 | "legend": { 4093 | "calcs": [], 4094 | "displayMode": "list", 4095 | "placement": "bottom", 4096 | "showLegend": true 4097 | }, 4098 | "tooltip": { 4099 | "mode": "multi", 4100 | "sort": "desc" 4101 | } 4102 | }, 4103 | "targets": [ 4104 | { 4105 | "datasource": { 4106 | "type": "prometheus", 4107 | "uid": "${datasource}" 4108 | }, 4109 | "dimensions": {}, 4110 | "editorMode": "code", 4111 | "expr": "sum by(mode) (rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode!=\"idle\"}[$__rate_interval])) * 100", 4112 | "expression": "", 4113 | "hide": false, 4114 | "id": "", 4115 | "label": "", 4116 | "legendFormat": "{{mode}}", 4117 | "matchExact": true, 4118 | "metricEditorMode": 0, 4119 | "metricName": "", 4120 | "metricQueryType": 0, 4121 | "namespace": "", 4122 | "period": "", 4123 | "queryMode": "Metrics", 4124 | "range": true, 4125 | "refId": "A", 4126 | "region": "default", 4127 | "sqlExpression": "", 4128 | "statistic": "Average" 4129 | }, 4130 | { 4131 | "datasource": { 4132 | "type": "prometheus", 4133 | "uid": "${datasource}" 4134 | }, 4135 | "dimensions": {}, 4136 | "editorMode": "code", 4137 | "expr": "max(sum by (cpu) (rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode!=\"idle\"}[$__rate_interval]))) * 100", 4138 | "expression": "", 4139 | "hide": true, 4140 | "id": "", 4141 | "label": "", 4142 | "legendFormat": "max single core util", 4143 | "matchExact": true, 4144 | "metricEditorMode": 0, 4145 | "metricName": "", 4146 | "metricQueryType": 0, 4147 | "namespace": "", 4148 | "period": "", 4149 | "queryMode": "Metrics", 4150 | "range": true, 4151 | "refId": "max cpu", 4152 | "region": "default", 4153 | "sqlExpression": "", 4154 | 
"statistic": "Average" 4155 | }, 4156 | { 4157 | "datasource": { 4158 | "type": "prometheus", 4159 | "uid": "${datasource}" 4160 | }, 4161 | "editorMode": "code", 4162 | "expr": "count(sum by (instance, cpu) (node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\"})) * 100", 4163 | "hide": false, 4164 | "legendFormat": "cores", 4165 | "range": true, 4166 | "refId": "B" 4167 | }, 4168 | { 4169 | "datasource": { 4170 | "type": "prometheus", 4171 | "uid": "${datasource}" 4172 | }, 4173 | "editorMode": "code", 4174 | "expr": "count(sum by (instance) (node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\"})) * 100", 4175 | "hide": false, 4176 | "legendFormat": "single-core capacity", 4177 | "range": true, 4178 | "refId": "C" 4179 | } 4180 | ], 4181 | "title": "Aggregate Worker CPU", 4182 | "type": "timeseries" 4183 | }, 4184 | { 4185 | "datasource": { 4186 | "type": "prometheus", 4187 | "uid": "${datasource}" 4188 | }, 4189 | "fieldConfig": { 4190 | "defaults": { 4191 | "color": { 4192 | "mode": "palette-classic" 4193 | }, 4194 | "custom": { 4195 | "axisCenteredZero": false, 4196 | "axisColorMode": "text", 4197 | "axisLabel": "", 4198 | "axisPlacement": "auto", 4199 | "barAlignment": 0, 4200 | "drawStyle": "line", 4201 | "fillOpacity": 0, 4202 | "gradientMode": "none", 4203 | "hideFrom": { 4204 | "legend": false, 4205 | "tooltip": false, 4206 | "viz": false 4207 | }, 4208 | "lineInterpolation": "linear", 4209 | "lineWidth": 1, 4210 | "pointSize": 5, 4211 | "scaleDistribution": { 4212 | "type": "linear" 4213 | }, 4214 | "showPoints": "never", 4215 | "spanNulls": false, 4216 | "stacking": { 4217 | "group": "A", 4218 | "mode": "none" 4219 | }, 4220 | "thresholdsStyle": { 4221 | "mode": "off" 4222 | } 4223 | }, 4224 | "decimals": 1, 4225 | "mappings": [], 4226 | "max": 1, 4227 | "thresholds": { 4228 | "mode": "absolute", 4229 | "steps": [ 4230 | { 4231 | "color": "green" 4232 | }, 4233 | { 4234 | "color": "red", 4235 | "value": 80 4236 | } 4237 | ] 
4238 | }, 4239 | "unit": "percentunit" 4240 | }, 4241 | "overrides": [ 4242 | { 4243 | "matcher": { 4244 | "id": "byName", 4245 | "options": "total" 4246 | }, 4247 | "properties": [ 4248 | { 4249 | "id": "color", 4250 | "value": { 4251 | "fixedColor": "text", 4252 | "mode": "fixed" 4253 | } 4254 | }, 4255 | { 4256 | "id": "custom.lineWidth", 4257 | "value": 2 4258 | } 4259 | ] 4260 | } 4261 | ] 4262 | }, 4263 | "gridPos": { 4264 | "h": 7, 4265 | "w": 24, 4266 | "x": 0, 4267 | "y": 72 4268 | }, 4269 | "id": 36, 4270 | "options": { 4271 | "legend": { 4272 | "calcs": [], 4273 | "displayMode": "list", 4274 | "placement": "bottom", 4275 | "showLegend": true 4276 | }, 4277 | "tooltip": { 4278 | "mode": "single", 4279 | "sort": "none" 4280 | } 4281 | }, 4282 | "targets": [ 4283 | { 4284 | "datasource": { 4285 | "type": "prometheus", 4286 | "uid": "${datasource}" 4287 | }, 4288 | "dimensions": {}, 4289 | "editorMode": "code", 4290 | "expr": "max by(mode)\n(rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\",spec=\"worker\", mode!=\"idle\"}[$__rate_interval]))", 4291 | "expression": "", 4292 | "hide": false, 4293 | "id": "", 4294 | "label": "", 4295 | "legendFormat": "{{mode}}", 4296 | "matchExact": true, 4297 | "metricEditorMode": 0, 4298 | "metricName": "", 4299 | "metricQueryType": 0, 4300 | "namespace": "", 4301 | "period": "", 4302 | "queryMode": "Metrics", 4303 | "range": true, 4304 | "refId": "B", 4305 | "region": "default", 4306 | "sqlExpression": "", 4307 | "statistic": "Average" 4308 | }, 4309 | { 4310 | "datasource": { 4311 | "type": "prometheus", 4312 | "uid": "${datasource}" 4313 | }, 4314 | "editorMode": "code", 4315 | "expr": "max(\n (1-irate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode=\"idle\"}[$__rate_interval]))\n)", 4316 | "hide": false, 4317 | "legendFormat": "total", 4318 | "range": true, 4319 | "refId": "C" 4320 | } 4321 | ], 4322 | "title": "Worker CPU Type", 4323 | "type": "timeseries" 4324 | }, 4325 | { 4326 | 
"datasource": { 4327 | "type": "prometheus", 4328 | "uid": "${datasource}" 4329 | }, 4330 | "fieldConfig": { 4331 | "defaults": { 4332 | "color": { 4333 | "mode": "palette-classic" 4334 | }, 4335 | "custom": { 4336 | "axisCenteredZero": false, 4337 | "axisColorMode": "text", 4338 | "axisLabel": "", 4339 | "axisPlacement": "auto", 4340 | "barAlignment": 0, 4341 | "drawStyle": "line", 4342 | "fillOpacity": 0, 4343 | "gradientMode": "none", 4344 | "hideFrom": { 4345 | "legend": false, 4346 | "tooltip": false, 4347 | "viz": false 4348 | }, 4349 | "lineInterpolation": "linear", 4350 | "lineWidth": 1, 4351 | "pointSize": 5, 4352 | "scaleDistribution": { 4353 | "type": "linear" 4354 | }, 4355 | "showPoints": "never", 4356 | "spanNulls": false, 4357 | "stacking": { 4358 | "group": "A", 4359 | "mode": "none" 4360 | }, 4361 | "thresholdsStyle": { 4362 | "mode": "off" 4363 | } 4364 | }, 4365 | "decimals": 1, 4366 | "mappings": [], 4367 | "max": 1, 4368 | "thresholds": { 4369 | "mode": "absolute", 4370 | "steps": [ 4371 | { 4372 | "color": "green" 4373 | }, 4374 | { 4375 | "color": "red", 4376 | "value": 80 4377 | } 4378 | ] 4379 | }, 4380 | "unit": "percentunit" 4381 | }, 4382 | "overrides": [ 4383 | { 4384 | "matcher": { 4385 | "id": "byName", 4386 | "options": "80pct" 4387 | }, 4388 | "properties": [ 4389 | { 4390 | "id": "color", 4391 | "value": { 4392 | "fixedColor": "dark-blue", 4393 | "mode": "fixed" 4394 | } 4395 | }, 4396 | { 4397 | "id": "custom.fillBelowTo", 4398 | "value": "20pct" 4399 | }, 4400 | { 4401 | "id": "custom.fillOpacity", 4402 | "value": 50 4403 | } 4404 | ] 4405 | }, 4406 | { 4407 | "matcher": { 4408 | "id": "byName", 4409 | "options": "20pct" 4410 | }, 4411 | "properties": [ 4412 | { 4413 | "id": "color", 4414 | "value": { 4415 | "fixedColor": "dark-blue", 4416 | "mode": "fixed" 4417 | } 4418 | } 4419 | ] 4420 | }, 4421 | { 4422 | "matcher": { 4423 | "id": "byName", 4424 | "options": "min" 4425 | }, 4426 | "properties": [ 4427 | { 4428 | "id": 
"color", 4429 | "value": { 4430 | "fixedColor": "super-light-blue", 4431 | "mode": "fixed" 4432 | } 4433 | }, 4434 | { 4435 | "id": "custom.lineWidth", 4436 | "value": 0 4437 | } 4438 | ] 4439 | }, 4440 | { 4441 | "matcher": { 4442 | "id": "byName", 4443 | "options": "max" 4444 | }, 4445 | "properties": [ 4446 | { 4447 | "id": "color", 4448 | "value": { 4449 | "fixedColor": "super-light-blue", 4450 | "mode": "fixed" 4451 | } 4452 | }, 4453 | { 4454 | "id": "custom.fillBelowTo", 4455 | "value": "min" 4456 | }, 4457 | { 4458 | "id": "custom.fillOpacity", 4459 | "value": 20 4460 | }, 4461 | { 4462 | "id": "custom.lineWidth", 4463 | "value": 0 4464 | } 4465 | ] 4466 | }, 4467 | { 4468 | "matcher": { 4469 | "id": "byName", 4470 | "options": "median (50pct)" 4471 | }, 4472 | "properties": [ 4473 | { 4474 | "id": "color", 4475 | "value": { 4476 | "fixedColor": "light-blue", 4477 | "mode": "fixed" 4478 | } 4479 | } 4480 | ] 4481 | }, 4482 | { 4483 | "matcher": { 4484 | "id": "byName", 4485 | "options": "mean" 4486 | }, 4487 | "properties": [ 4488 | { 4489 | "id": "color", 4490 | "value": { 4491 | "fixedColor": "text", 4492 | "mode": "fixed" 4493 | } 4494 | } 4495 | ] 4496 | } 4497 | ] 4498 | }, 4499 | "gridPos": { 4500 | "h": 7, 4501 | "w": 24, 4502 | "x": 0, 4503 | "y": 79 4504 | }, 4505 | "id": 89, 4506 | "options": { 4507 | "legend": { 4508 | "calcs": [], 4509 | "displayMode": "list", 4510 | "placement": "bottom", 4511 | "showLegend": true 4512 | }, 4513 | "tooltip": { 4514 | "mode": "single", 4515 | "sort": "none" 4516 | } 4517 | }, 4518 | "targets": [ 4519 | { 4520 | "datasource": { 4521 | "type": "prometheus", 4522 | "uid": "${datasource}" 4523 | }, 4524 | "editorMode": "code", 4525 | "expr": "max(\n (1-irate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode=\"idle\"}[$__rate_interval]))\n)", 4526 | "hide": false, 4527 | "legendFormat": "max", 4528 | "range": true, 4529 | "refId": "max" 4530 | }, 4531 | { 4532 | "datasource": { 4533 | "type": 
"prometheus", 4534 | "uid": "${datasource}" 4535 | }, 4536 | "editorMode": "code", 4537 | "expr": "quantile(\n 0.8,\n (1-irate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode=\"idle\"}[$__rate_interval]))\n)", 4538 | "hide": false, 4539 | "legendFormat": "80pct", 4540 | "range": true, 4541 | "refId": "80pct" 4542 | }, 4543 | { 4544 | "datasource": { 4545 | "type": "prometheus", 4546 | "uid": "${datasource}" 4547 | }, 4548 | "editorMode": "code", 4549 | "expr": "quantile(\n 0.2,\n (1-irate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode=\"idle\"}[$__rate_interval]))\n)", 4550 | "hide": false, 4551 | "legendFormat": "20pct", 4552 | "range": true, 4553 | "refId": "20pct" 4554 | }, 4555 | { 4556 | "datasource": { 4557 | "type": "prometheus", 4558 | "uid": "${datasource}" 4559 | }, 4560 | "editorMode": "code", 4561 | "expr": "min(\n (1-irate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode=\"idle\"}[$__rate_interval]))\n)", 4562 | "hide": false, 4563 | "legendFormat": "min", 4564 | "range": true, 4565 | "refId": "min" 4566 | }, 4567 | { 4568 | "datasource": { 4569 | "type": "prometheus", 4570 | "uid": "${datasource}" 4571 | }, 4572 | "editorMode": "code", 4573 | "expr": "quantile(\n 0.5,\n (1-irate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode=\"idle\"}[$__rate_interval]))\n)", 4574 | "hide": false, 4575 | "legendFormat": "median (50pct)", 4576 | "range": true, 4577 | "refId": "median (50pct)" 4578 | }, 4579 | { 4580 | "datasource": { 4581 | "type": "prometheus", 4582 | "uid": "${datasource}" 4583 | }, 4584 | "editorMode": "code", 4585 | "expr": "avg(\n (1-irate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"worker\", mode=\"idle\"}[$__rate_interval]))\n)", 4586 | "hide": false, 4587 | "legendFormat": "mean", 4588 | "range": true, 4589 | "refId": "mean" 4590 | } 4591 | ], 4592 | "title": "Worker CPU Spread (all cores)", 4593 | "type": "timeseries" 4594 | }, 
4595 | { 4596 | "datasource": { 4597 | "type": "prometheus", 4598 | "uid": "${datasource}" 4599 | }, 4600 | "fieldConfig": { 4601 | "defaults": { 4602 | "color": { 4603 | "mode": "palette-classic" 4604 | }, 4605 | "custom": { 4606 | "axisCenteredZero": false, 4607 | "axisColorMode": "text", 4608 | "axisLabel": "", 4609 | "axisPlacement": "auto", 4610 | "barAlignment": 0, 4611 | "drawStyle": "line", 4612 | "fillOpacity": 0, 4613 | "gradientMode": "none", 4614 | "hideFrom": { 4615 | "legend": false, 4616 | "tooltip": false, 4617 | "viz": false 4618 | }, 4619 | "lineInterpolation": "linear", 4620 | "lineWidth": 1, 4621 | "pointSize": 5, 4622 | "scaleDistribution": { 4623 | "type": "linear" 4624 | }, 4625 | "showPoints": "never", 4626 | "spanNulls": false, 4627 | "stacking": { 4628 | "group": "A", 4629 | "mode": "none" 4630 | }, 4631 | "thresholdsStyle": { 4632 | "mode": "off" 4633 | } 4634 | }, 4635 | "mappings": [], 4636 | "min": 0, 4637 | "thresholds": { 4638 | "mode": "absolute", 4639 | "steps": [ 4640 | { 4641 | "color": "green" 4642 | }, 4643 | { 4644 | "color": "red", 4645 | "value": 80 4646 | } 4647 | ] 4648 | }, 4649 | "unit": "bytes" 4650 | }, 4651 | "overrides": [ 4652 | { 4653 | "matcher": { 4654 | "id": "byName", 4655 | "options": "max" 4656 | }, 4657 | "properties": [ 4658 | { 4659 | "id": "color", 4660 | "value": { 4661 | "fixedColor": "super-light-green", 4662 | "mode": "fixed" 4663 | } 4664 | }, 4665 | { 4666 | "id": "custom.fillBelowTo", 4667 | "value": "min" 4668 | }, 4669 | { 4670 | "id": "custom.fillOpacity", 4671 | "value": 20 4672 | }, 4673 | { 4674 | "id": "custom.lineWidth", 4675 | "value": 0 4676 | } 4677 | ] 4678 | }, 4679 | { 4680 | "matcher": { 4681 | "id": "byName", 4682 | "options": "min" 4683 | }, 4684 | "properties": [ 4685 | { 4686 | "id": "color", 4687 | "value": { 4688 | "fixedColor": "super-light-green", 4689 | "mode": "fixed" 4690 | } 4691 | }, 4692 | { 4693 | "id": "custom.lineWidth", 4694 | "value": 0 4695 | } 4696 | ] 4697 | }, 
4698 | { 4699 | "matcher": { 4700 | "id": "byName", 4701 | "options": "pct80" 4702 | }, 4703 | "properties": [ 4704 | { 4705 | "id": "color", 4706 | "value": { 4707 | "fixedColor": "dark-green", 4708 | "mode": "fixed" 4709 | } 4710 | }, 4711 | { 4712 | "id": "custom.fillBelowTo", 4713 | "value": "pct20" 4714 | }, 4715 | { 4716 | "id": "custom.fillOpacity", 4717 | "value": 50 4718 | } 4719 | ] 4720 | }, 4721 | { 4722 | "matcher": { 4723 | "id": "byName", 4724 | "options": "pct20" 4725 | }, 4726 | "properties": [ 4727 | { 4728 | "id": "color", 4729 | "value": { 4730 | "fixedColor": "dark-green", 4731 | "mode": "fixed" 4732 | } 4733 | } 4734 | ] 4735 | }, 4736 | { 4737 | "matcher": { 4738 | "id": "byName", 4739 | "options": "avg" 4740 | }, 4741 | "properties": [ 4742 | { 4743 | "id": "color", 4744 | "value": { 4745 | "fixedColor": "text", 4746 | "mode": "fixed" 4747 | } 4748 | } 4749 | ] 4750 | }, 4751 | { 4752 | "matcher": { 4753 | "id": "byFrameRefID", 4754 | "options": "limit" 4755 | }, 4756 | "properties": [ 4757 | { 4758 | "id": "color", 4759 | "value": { 4760 | "fixedColor": "dark-red", 4761 | "mode": "fixed" 4762 | } 4763 | }, 4764 | { 4765 | "id": "custom.lineWidth", 4766 | "value": 2 4767 | } 4768 | ] 4769 | }, 4770 | { 4771 | "matcher": { 4772 | "id": "byName", 4773 | "options": "60% of limit (default spill threshold)" 4774 | }, 4775 | "properties": [ 4776 | { 4777 | "id": "color", 4778 | "value": { 4779 | "fixedColor": "yellow", 4780 | "mode": "fixed" 4781 | } 4782 | } 4783 | ] 4784 | }, 4785 | { 4786 | "matcher": { 4787 | "id": "byName", 4788 | "options": "median (50pct)" 4789 | }, 4790 | "properties": [ 4791 | { 4792 | "id": "color", 4793 | "value": { 4794 | "fixedColor": "super-light-green", 4795 | "mode": "fixed" 4796 | } 4797 | } 4798 | ] 4799 | } 4800 | ] 4801 | }, 4802 | "gridPos": { 4803 | "h": 7, 4804 | "w": 24, 4805 | "x": 0, 4806 | "y": 86 4807 | }, 4808 | "id": 91, 4809 | "options": { 4810 | "legend": { 4811 | "calcs": [], 4812 | "displayMode": 
"list", 4813 | "placement": "bottom", 4814 | "showLegend": true 4815 | }, 4816 | "tooltip": { 4817 | "mode": "single", 4818 | "sort": "none" 4819 | } 4820 | }, 4821 | "targets": [ 4822 | { 4823 | "datasource": { 4824 | "type": "prometheus", 4825 | "uid": "${datasource}" 4826 | }, 4827 | "editorMode": "code", 4828 | "expr": "max(node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"} - node_memory_MemAvailable_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"})", 4829 | "legendFormat": "max", 4830 | "range": true, 4831 | "refId": "max" 4832 | }, 4833 | { 4834 | "datasource": { 4835 | "type": "prometheus", 4836 | "uid": "${datasource}" 4837 | }, 4838 | "editorMode": "code", 4839 | "expr": "quantile(0.8,node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"} - node_memory_MemAvailable_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"})", 4840 | "hide": false, 4841 | "legendFormat": "pct80", 4842 | "range": true, 4843 | "refId": "pct80" 4844 | }, 4845 | { 4846 | "datasource": { 4847 | "type": "prometheus", 4848 | "uid": "${datasource}" 4849 | }, 4850 | "editorMode": "code", 4851 | "expr": "quantile(0.2,node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"} - node_memory_MemAvailable_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"})", 4852 | "hide": false, 4853 | "legendFormat": "pct20", 4854 | "range": true, 4855 | "refId": "pct20" 4856 | }, 4857 | { 4858 | "datasource": { 4859 | "type": "prometheus", 4860 | "uid": "${datasource}" 4861 | }, 4862 | "editorMode": "code", 4863 | "expr": "min(node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"} - node_memory_MemAvailable_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"})", 4864 | "hide": false, 4865 | "legendFormat": "min", 4866 | "range": true, 4867 | "refId": "min" 4868 | }, 4869 | { 4870 | "datasource": { 4871 | "type": "prometheus", 4872 | "uid": "${datasource}" 4873 | }, 4874 | "editorMode": "code", 4875 | "expr": 
"avg(node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"} - node_memory_MemAvailable_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"})", 4876 | "hide": false, 4877 | "legendFormat": "avg", 4878 | "range": true, 4879 | "refId": "avg" 4880 | }, 4881 | { 4882 | "datasource": { 4883 | "type": "prometheus", 4884 | "uid": "${datasource}" 4885 | }, 4886 | "editorMode": "code", 4887 | "expr": "quantile(0.5, node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"} - node_memory_MemAvailable_bytes{cluster_id=\"$cluster_id\",spec=\"worker\"})", 4888 | "hide": false, 4889 | "legendFormat": "median (50pct)", 4890 | "range": true, 4891 | "refId": "median" 4892 | }, 4893 | { 4894 | "datasource": { 4895 | "type": "prometheus", 4896 | "uid": "${datasource}" 4897 | }, 4898 | "editorMode": "code", 4899 | "expr": "min(\n node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\", spec=\"worker\"}\n # only trust metric when scheduler is running\n and on () rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval]) > 0\n)", 4900 | "hide": false, 4901 | "legendFormat": "limit", 4902 | "range": true, 4903 | "refId": "limit" 4904 | }, 4905 | { 4906 | "datasource": { 4907 | "type": "prometheus", 4908 | "uid": "${datasource}" 4909 | }, 4910 | "editorMode": "code", 4911 | "expr": "min(\n node_memory_MemTotal_bytes{cluster_id=\"$cluster_id\", spec=\"worker\"}\n # only trust metric when scheduler is running\n and on () rate(node_cpu_seconds_total{cluster_id=\"$cluster_id\", spec=\"scheduler\"}[$__rate_interval]) > 0\n) * 0.6", 4912 | "hide": false, 4913 | "legendFormat": "60% of limit (default spill threshold)", 4914 | "range": true, 4915 | "refId": "60% of limit" 4916 | } 4917 | ], 4918 | "title": "Worker Memory Spread", 4919 | "type": "timeseries" 4920 | }, 4921 | { 4922 | "datasource": { 4923 | "type": "prometheus", 4924 | "uid": "${datasource}" 4925 | }, 4926 | "fieldConfig": { 4927 | "defaults": { 4928 | "color": { 4929 | 
"mode": "palette-classic" 4930 | }, 4931 | "custom": { 4932 | "axisCenteredZero": false, 4933 | "axisColorMode": "text", 4934 | "axisLabel": "", 4935 | "axisPlacement": "auto", 4936 | "barAlignment": 0, 4937 | "drawStyle": "line", 4938 | "fillOpacity": 0, 4939 | "gradientMode": "none", 4940 | "hideFrom": { 4941 | "legend": false, 4942 | "tooltip": false, 4943 | "viz": false 4944 | }, 4945 | "lineInterpolation": "linear", 4946 | "lineWidth": 1, 4947 | "pointSize": 5, 4948 | "scaleDistribution": { 4949 | "type": "linear" 4950 | }, 4951 | "showPoints": "never", 4952 | "spanNulls": false, 4953 | "stacking": { 4954 | "group": "A", 4955 | "mode": "none" 4956 | }, 4957 | "thresholdsStyle": { 4958 | "mode": "off" 4959 | } 4960 | }, 4961 | "mappings": [], 4962 | "thresholds": { 4963 | "mode": "absolute", 4964 | "steps": [ 4965 | { 4966 | "color": "green" 4967 | }, 4968 | { 4969 | "color": "red", 4970 | "value": 80 4971 | } 4972 | ] 4973 | }, 4974 | "unit": "bytes" 4975 | }, 4976 | "overrides": [ 4977 | { 4978 | "matcher": { 4979 | "id": "byName", 4980 | "options": "Total memory" 4981 | }, 4982 | "properties": [ 4983 | { 4984 | "id": "custom.stacking", 4985 | "value": { 4986 | "group": "A", 4987 | "mode": "none" 4988 | } 4989 | }, 4990 | { 4991 | "id": "custom.drawStyle", 4992 | "value": "line" 4993 | }, 4994 | { 4995 | "id": "custom.fillOpacity", 4996 | "value": 0 4997 | }, 4998 | { 4999 | "id": "custom.lineWidth", 5000 | "value": 2 5001 | }, 5002 | { 5003 | "id": "color", 5004 | "value": { 5005 | "fixedColor": "green", 5006 | "mode": "fixed" 5007 | } 5008 | } 5009 | ] 5010 | } 5011 | ] 5012 | }, 5013 | "gridPos": { 5014 | "h": 6, 5015 | "w": 12, 5016 | "x": 0, 5017 | "y": 93 5018 | }, 5019 | "id": 42, 5020 | "options": { 5021 | "legend": { 5022 | "calcs": [], 5023 | "displayMode": "list", 5024 | "placement": "bottom", 5025 | "showLegend": true 5026 | }, 5027 | "tooltip": { 5028 | "mode": "single", 5029 | "sort": "none" 5030 | } 5031 | }, 5032 | "targets": [ 5033 | { 5034 | 
"datasource": { 5035 | "type": "prometheus", 5036 | "uid": "${datasource}" 5037 | }, 5038 | "editorMode": "code", 5039 | "expr": "min(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"unmanaged\"})", 5040 | "hide": false, 5041 | "legendFormat": "min", 5042 | "range": true, 5043 | "refId": "min" 5044 | }, 5045 | { 5046 | "datasource": { 5047 | "type": "prometheus", 5048 | "uid": "${datasource}" 5049 | }, 5050 | "editorMode": "code", 5051 | "expr": "avg(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"unmanaged\"})", 5052 | "hide": false, 5053 | "legendFormat": "avg", 5054 | "range": true, 5055 | "refId": "avg" 5056 | }, 5057 | { 5058 | "datasource": { 5059 | "type": "prometheus", 5060 | "uid": "${datasource}" 5061 | }, 5062 | "editorMode": "code", 5063 | "expr": "quantile(0.8,(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"unmanaged\"}))", 5064 | "hide": false, 5065 | "legendFormat": "80pct", 5066 | "range": true, 5067 | "refId": "80pct" 5068 | }, 5069 | { 5070 | "datasource": { 5071 | "type": "prometheus", 5072 | "uid": "${datasource}" 5073 | }, 5074 | "editorMode": "code", 5075 | "expr": "max(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"unmanaged\"})", 5076 | "hide": false, 5077 | "legendFormat": "max", 5078 | "range": true, 5079 | "refId": "max" 5080 | } 5081 | ], 5082 | "title": "Unmanaged Memory", 5083 | "type": "timeseries" 5084 | }, 5085 | { 5086 | "datasource": { 5087 | "type": "prometheus", 5088 | "uid": "${datasource}" 5089 | }, 5090 | "fieldConfig": { 5091 | "defaults": { 5092 | "color": { 5093 | "mode": "palette-classic" 5094 | }, 5095 | "custom": { 5096 | "axisCenteredZero": false, 5097 | "axisColorMode": "text", 5098 | "axisLabel": "", 5099 | "axisPlacement": "auto", 5100 | "barAlignment": 0, 5101 | "drawStyle": "line", 5102 | "fillOpacity": 0, 5103 | "gradientMode": "none", 5104 | "hideFrom": { 5105 | "legend": false, 5106 | "tooltip": false, 5107 | "viz": false 5108 | }, 5109 | "lineInterpolation": 
"linear", 5110 | "lineWidth": 1, 5111 | "pointSize": 5, 5112 | "scaleDistribution": { 5113 | "type": "linear" 5114 | }, 5115 | "showPoints": "never", 5116 | "spanNulls": false, 5117 | "stacking": { 5118 | "group": "A", 5119 | "mode": "none" 5120 | }, 5121 | "thresholdsStyle": { 5122 | "mode": "off" 5123 | } 5124 | }, 5125 | "mappings": [], 5126 | "thresholds": { 5127 | "mode": "absolute", 5128 | "steps": [ 5129 | { 5130 | "color": "green" 5131 | }, 5132 | { 5133 | "color": "red", 5134 | "value": 80 5135 | } 5136 | ] 5137 | }, 5138 | "unit": "bytes" 5139 | }, 5140 | "overrides": [ 5141 | { 5142 | "matcher": { 5143 | "id": "byName", 5144 | "options": "Total memory" 5145 | }, 5146 | "properties": [ 5147 | { 5148 | "id": "custom.stacking", 5149 | "value": { 5150 | "group": "A", 5151 | "mode": "none" 5152 | } 5153 | }, 5154 | { 5155 | "id": "custom.drawStyle", 5156 | "value": "line" 5157 | }, 5158 | { 5159 | "id": "custom.fillOpacity", 5160 | "value": 0 5161 | }, 5162 | { 5163 | "id": "custom.lineWidth", 5164 | "value": 2 5165 | }, 5166 | { 5167 | "id": "color", 5168 | "value": { 5169 | "fixedColor": "green", 5170 | "mode": "fixed" 5171 | } 5172 | } 5173 | ] 5174 | } 5175 | ] 5176 | }, 5177 | "gridPos": { 5178 | "h": 6, 5179 | "w": 12, 5180 | "x": 12, 5181 | "y": 93 5182 | }, 5183 | "id": 43, 5184 | "options": { 5185 | "legend": { 5186 | "calcs": [], 5187 | "displayMode": "list", 5188 | "placement": "bottom", 5189 | "showLegend": true 5190 | }, 5191 | "tooltip": { 5192 | "mode": "single", 5193 | "sort": "none" 5194 | } 5195 | }, 5196 | "targets": [ 5197 | { 5198 | "datasource": { 5199 | "type": "prometheus", 5200 | "uid": "${datasource}" 5201 | }, 5202 | "editorMode": "code", 5203 | "expr": "min(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"managed\"})", 5204 | "hide": false, 5205 | "legendFormat": "min", 5206 | "range": true, 5207 | "refId": "min" 5208 | }, 5209 | { 5210 | "datasource": { 5211 | "type": "prometheus", 5212 | "uid": "${datasource}" 5213 | }, 
5214 | "editorMode": "code", 5215 | "expr": "avg(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"managed\"})", 5216 | "hide": false, 5217 | "legendFormat": "avg", 5218 | "range": true, 5219 | "refId": "avg" 5220 | }, 5221 | { 5222 | "datasource": { 5223 | "type": "prometheus", 5224 | "uid": "${datasource}" 5225 | }, 5226 | "editorMode": "code", 5227 | "expr": "quantile(0.8,(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"managed\"}))", 5228 | "hide": false, 5229 | "legendFormat": "80pct", 5230 | "range": true, 5231 | "refId": "80pct" 5232 | }, 5233 | { 5234 | "datasource": { 5235 | "type": "prometheus", 5236 | "uid": "${datasource}" 5237 | }, 5238 | "editorMode": "code", 5239 | "expr": "max(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"managed\"})", 5240 | "hide": false, 5241 | "legendFormat": "max", 5242 | "range": true, 5243 | "refId": "max" 5244 | } 5245 | ], 5246 | "title": "Managed Memory", 5247 | "type": "timeseries" 5248 | }, 5249 | { 5250 | "datasource": { 5251 | "type": "prometheus", 5252 | "uid": "${datasource}" 5253 | }, 5254 | "fieldConfig": { 5255 | "defaults": { 5256 | "color": { 5257 | "mode": "palette-classic" 5258 | }, 5259 | "custom": { 5260 | "axisCenteredZero": false, 5261 | "axisColorMode": "text", 5262 | "axisLabel": "", 5263 | "axisPlacement": "auto", 5264 | "barAlignment": 0, 5265 | "drawStyle": "line", 5266 | "fillOpacity": 0, 5267 | "gradientMode": "none", 5268 | "hideFrom": { 5269 | "legend": false, 5270 | "tooltip": false, 5271 | "viz": false 5272 | }, 5273 | "lineInterpolation": "linear", 5274 | "lineWidth": 1, 5275 | "pointSize": 5, 5276 | "scaleDistribution": { 5277 | "type": "linear" 5278 | }, 5279 | "showPoints": "never", 5280 | "spanNulls": false, 5281 | "stacking": { 5282 | "group": "A", 5283 | "mode": "none" 5284 | }, 5285 | "thresholdsStyle": { 5286 | "mode": "off" 5287 | } 5288 | }, 5289 | "mappings": [], 5290 | "thresholds": { 5291 | "mode": "absolute", 5292 | "steps": [ 5293 | { 5294 | 
"color": "green" 5295 | }, 5296 | { 5297 | "color": "red", 5298 | "value": 80 5299 | } 5300 | ] 5301 | }, 5302 | "unit": "bytes" 5303 | }, 5304 | "overrides": [ 5305 | { 5306 | "matcher": { 5307 | "id": "byName", 5308 | "options": "Total memory" 5309 | }, 5310 | "properties": [ 5311 | { 5312 | "id": "custom.stacking", 5313 | "value": { 5314 | "group": "A", 5315 | "mode": "none" 5316 | } 5317 | }, 5318 | { 5319 | "id": "custom.drawStyle", 5320 | "value": "line" 5321 | }, 5322 | { 5323 | "id": "custom.fillOpacity", 5324 | "value": 0 5325 | }, 5326 | { 5327 | "id": "custom.lineWidth", 5328 | "value": 2 5329 | }, 5330 | { 5331 | "id": "color", 5332 | "value": { 5333 | "fixedColor": "green", 5334 | "mode": "fixed" 5335 | } 5336 | } 5337 | ] 5338 | } 5339 | ] 5340 | }, 5341 | "gridPos": { 5342 | "h": 6, 5343 | "w": 12, 5344 | "x": 0, 5345 | "y": 99 5346 | }, 5347 | "id": 41, 5348 | "options": { 5349 | "legend": { 5350 | "calcs": [], 5351 | "displayMode": "list", 5352 | "placement": "bottom", 5353 | "showLegend": true 5354 | }, 5355 | "tooltip": { 5356 | "mode": "single", 5357 | "sort": "none" 5358 | } 5359 | }, 5360 | "targets": [ 5361 | { 5362 | "datasource": { 5363 | "type": "prometheus", 5364 | "uid": "${datasource}" 5365 | }, 5366 | "editorMode": "code", 5367 | "expr": "min(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"spilled\"})", 5368 | "hide": false, 5369 | "legendFormat": "min", 5370 | "range": true, 5371 | "refId": "min" 5372 | }, 5373 | { 5374 | "datasource": { 5375 | "type": "prometheus", 5376 | "uid": "${datasource}" 5377 | }, 5378 | "editorMode": "code", 5379 | "expr": "avg(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"spilled\"})", 5380 | "hide": false, 5381 | "legendFormat": "avg", 5382 | "range": true, 5383 | "refId": "avg" 5384 | }, 5385 | { 5386 | "datasource": { 5387 | "type": "prometheus", 5388 | "uid": "${datasource}" 5389 | }, 5390 | "editorMode": "code", 5391 | "expr": 
"quantile(0.8,(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"spilled\"}))", 5392 | "hide": false, 5393 | "legendFormat": "80pct", 5394 | "range": true, 5395 | "refId": "80pct" 5396 | }, 5397 | { 5398 | "datasource": { 5399 | "type": "prometheus", 5400 | "uid": "${datasource}" 5401 | }, 5402 | "editorMode": "code", 5403 | "expr": "max(dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type=\"spilled\"})", 5404 | "hide": false, 5405 | "legendFormat": "max", 5406 | "range": true, 5407 | "refId": "max" 5408 | } 5409 | ], 5410 | "title": "Spilled Memory", 5411 | "type": "timeseries" 5412 | }, 5413 | { 5414 | "datasource": { 5415 | "type": "prometheus", 5416 | "uid": "${datasource}" 5417 | }, 5418 | "fieldConfig": { 5419 | "defaults": { 5420 | "color": { 5421 | "mode": "palette-classic" 5422 | }, 5423 | "custom": { 5424 | "axisCenteredZero": false, 5425 | "axisColorMode": "text", 5426 | "axisLabel": "", 5427 | "axisPlacement": "auto", 5428 | "barAlignment": 0, 5429 | "drawStyle": "line", 5430 | "fillOpacity": 0, 5431 | "gradientMode": "none", 5432 | "hideFrom": { 5433 | "legend": false, 5434 | "tooltip": false, 5435 | "viz": false 5436 | }, 5437 | "lineInterpolation": "linear", 5438 | "lineWidth": 1, 5439 | "pointSize": 5, 5440 | "scaleDistribution": { 5441 | "type": "linear" 5442 | }, 5443 | "showPoints": "never", 5444 | "spanNulls": false, 5445 | "stacking": { 5446 | "group": "A", 5447 | "mode": "none" 5448 | }, 5449 | "thresholdsStyle": { 5450 | "mode": "off" 5451 | } 5452 | }, 5453 | "mappings": [], 5454 | "thresholds": { 5455 | "mode": "absolute", 5456 | "steps": [ 5457 | { 5458 | "color": "green" 5459 | }, 5460 | { 5461 | "color": "red", 5462 | "value": 80 5463 | } 5464 | ] 5465 | }, 5466 | "unit": "bytes" 5467 | }, 5468 | "overrides": [ 5469 | { 5470 | "matcher": { 5471 | "id": "byName", 5472 | "options": "Total memory" 5473 | }, 5474 | "properties": [ 5475 | { 5476 | "id": "custom.stacking", 5477 | "value": { 5478 | "group": "A", 5479 | "mode": 
"none" 5480 | } 5481 | }, 5482 | { 5483 | "id": "custom.drawStyle", 5484 | "value": "line" 5485 | }, 5486 | { 5487 | "id": "custom.fillOpacity", 5488 | "value": 0 5489 | }, 5490 | { 5491 | "id": "custom.lineWidth", 5492 | "value": 2 5493 | }, 5494 | { 5495 | "id": "color", 5496 | "value": { 5497 | "fixedColor": "green", 5498 | "mode": "fixed" 5499 | } 5500 | } 5501 | ] 5502 | } 5503 | ] 5504 | }, 5505 | "gridPos": { 5506 | "h": 6, 5507 | "w": 12, 5508 | "x": 12, 5509 | "y": 99 5510 | }, 5511 | "id": 53, 5512 | "options": { 5513 | "legend": { 5514 | "calcs": [], 5515 | "displayMode": "list", 5516 | "placement": "bottom", 5517 | "showLegend": true 5518 | }, 5519 | "tooltip": { 5520 | "mode": "single", 5521 | "sort": "none" 5522 | } 5523 | }, 5524 | "targets": [ 5525 | { 5526 | "datasource": { 5527 | "type": "prometheus", 5528 | "uid": "${datasource}" 5529 | }, 5530 | "editorMode": "code", 5531 | "expr": "min(sum by(coiled_instance) (dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type!=\"unmanaged\"}))", 5532 | "hide": false, 5533 | "legendFormat": "min", 5534 | "range": true, 5535 | "refId": "min" 5536 | }, 5537 | { 5538 | "datasource": { 5539 | "type": "prometheus", 5540 | "uid": "${datasource}" 5541 | }, 5542 | "editorMode": "code", 5543 | "expr": "avg(sum by(coiled_instance) (dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type!=\"unmanaged\"}))", 5544 | "hide": false, 5545 | "legendFormat": "avg", 5546 | "range": true, 5547 | "refId": "avg" 5548 | }, 5549 | { 5550 | "datasource": { 5551 | "type": "prometheus", 5552 | "uid": "${datasource}" 5553 | }, 5554 | "editorMode": "code", 5555 | "expr": "quantile(0.8, (sum by(coiled_instance) (dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type!=\"unmanaged\"})))", 5556 | "hide": false, 5557 | "legendFormat": "80pct", 5558 | "range": true, 5559 | "refId": "80pct" 5560 | }, 5561 | { 5562 | "datasource": { 5563 | "type": "prometheus", 5564 | "uid": "${datasource}" 5565 | }, 5566 | "editorMode": "code", 5567 | 
"expr": "max(sum by(coiled_instance) (dask_worker_memory_bytes{cluster_id=\"$cluster_id\",type!=\"unmanaged\"}))", 5568 | "hide": false, 5569 | "legendFormat": "max", 5570 | "range": true, 5571 | "refId": "max" 5572 | } 5573 | ], 5574 | "title": "Managed+Spilled memory", 5575 | "type": "timeseries" 5576 | } 5577 | ], 5578 | "refresh": false, 5579 | "schemaVersion": 37, 5580 | "style": "dark", 5581 | "tags": [], 5582 | "templating": { 5583 | "list": [ 5584 | { 5585 | "current": { 5586 | "selected": true, 5587 | "text": "default", 5588 | "value": "default" 5589 | }, 5590 | "hide": 0, 5591 | "includeAll": false, 5592 | "label": "Data Source", 5593 | "multi": false, 5594 | "name": "datasource", 5595 | "options": [], 5596 | "query": "prometheus", 5597 | "queryValue": "", 5598 | "refresh": 1, 5599 | "regex": "", 5600 | "skipUrlSync": false, 5601 | "type": "datasource" 5602 | }, 5603 | { 5604 | "allValue": ".*", 5605 | "current": { 5606 | "selected": false, 5607 | "text": "All", 5608 | "value": "$__all" 5609 | }, 5610 | "datasource": { 5611 | "type": "prometheus", 5612 | "uid": "${datasource}" 5613 | }, 5614 | "definition": "label_values(node_cpu_seconds_total, env)", 5615 | "hide": 0, 5616 | "includeAll": true, 5617 | "label": "Env", 5618 | "multi": false, 5619 | "name": "env", 5620 | "options": [], 5621 | "query": { 5622 | "query": "label_values(node_cpu_seconds_total, env)", 5623 | "refId": "StandardVariableQuery" 5624 | }, 5625 | "refresh": 2, 5626 | "regex": "", 5627 | "skipUrlSync": false, 5628 | "sort": 0, 5629 | "type": "query" 5630 | }, 5631 | { 5632 | "datasource": { 5633 | "type": "prometheus", 5634 | "uid": "${datasource}" 5635 | }, 5636 | "definition": "label_values(node_cpu_seconds_total{env=~\"$env\"}, account)", 5637 | "hide": 0, 5638 | "includeAll": false, 5639 | "label": "Account", 5640 | "multi": false, 5641 | "name": "account", 5642 | "options": [], 5643 | "query": { 5644 | "query": "label_values(node_cpu_seconds_total{env=~\"$env\"}, account)", 5645 | 
"refId": "StandardVariableQuery" 5646 | }, 5647 | "refresh": 2, 5648 | "regex": "", 5649 | "skipUrlSync": false, 5650 | "sort": 0, 5651 | "type": "query" 5652 | }, 5653 | { 5654 | "datasource": { 5655 | "type": "prometheus", 5656 | "uid": "${datasource}" 5657 | }, 5658 | "definition": "node_cpu_seconds_total{account=\"$account\",env=~\"$env\"}", 5659 | "hide": 0, 5660 | "includeAll": false, 5661 | "label": "Cluster", 5662 | "multi": false, 5663 | "name": "cluster", 5664 | "options": [], 5665 | "query": { 5666 | "query": "node_cpu_seconds_total{account=\"$account\",env=~\"$env\"}", 5667 | "refId": "StandardVariableQuery" 5668 | }, 5669 | "refresh": 2, 5670 | "regex": "/cluster=\"(.*?)\".*/", 5671 | "skipUrlSync": false, 5672 | "sort": 0, 5673 | "type": "query" 5674 | }, 5675 | { 5676 | "datasource": { 5677 | "type": "prometheus", 5678 | "uid": "${datasource}" 5679 | }, 5680 | "definition": "label_values(node_cpu_seconds_total{account=\"$account\",env=~\"$env\",cluster=\"$cluster\"}, cluster_id)", 5681 | "hide": 0, 5682 | "includeAll": false, 5683 | "label": "Cluster ID", 5684 | "multi": false, 5685 | "name": "cluster_id", 5686 | "options": [], 5687 | "query": { 5688 | "query": "label_values(node_cpu_seconds_total{account=\"$account\",env=~\"$env\",cluster=\"$cluster\"}, cluster_id)", 5689 | "refId": "StandardVariableQuery" 5690 | }, 5691 | "refresh": 2, 5692 | "regex": "", 5693 | "skipUrlSync": false, 5694 | "sort": 0, 5695 | "type": "query" 5696 | } 5697 | ] 5698 | }, 5699 | "time": { 5700 | "from": "now-30m", 5701 | "to": "now" 5702 | }, 5703 | "timepicker": { 5704 | "refresh_intervals": [ 5705 | "5s", 5706 | "30s", 5707 | "1m", 5708 | "30m" 5709 | ] 5710 | }, 5711 | "timezone": "utc", 5712 | "title": "Coiled Cluster Metrics — Basic", 5713 | "uid": "GvbFsqKVk", 5714 | "version": 80, 5715 | "weekStart": "" 5716 | } --------------------------------------------------------------------------------