├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── alertmanager └── config.yml ├── caddy └── Caddyfile ├── config ├── docker-compose.exporters.yml ├── docker-compose.yml ├── dottk_checkout.png ├── grafana └── provisioning │ ├── dashboards │ ├── dashboard.yml │ ├── default.yml │ ├── docker_containers.json │ ├── docker_host.json │ ├── indexing.json │ ├── metrics.json │ ├── monitor_services.json │ ├── nginx_container.json │ └── postgres.json │ └── datasources │ ├── postgres.yml │ └── prometeus.yml ├── helpers └── aws │ ├── README.md │ ├── cadvisor_ecs_task_definition.json │ ├── node_exporter_task_definition.json │ └── prometheus.yml ├── prometheus ├── alert.rules └── prometheus.yml └── screens ├── Grafana_Docker_Containers.png ├── Grafana_Docker_Host.png ├── Grafana_Prometheus.png └── Slack_Notifications.png /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text eol=lf 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). 
Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | *.jpg binary 44 | *.png binary 45 | *.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *DS_Store 2 | .vs/ 3 | .vscode/ 4 | .idea/ 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Stefan Prodan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | graphprotocol-infrastructure 2 | ======== 3 | 4 | A monitoring solution for hosting a graph node on a single Docker host with [Prometheus](https://prometheus.io/), [Grafana](http://grafana.org/), [cAdvisor](https://github.com/google/cadvisor), 5 | [NodeExporter](https://github.com/prometheus/node_exporter) and alerting with [AlertManager](https://github.com/prometheus/alertmanager). 6 | 7 | The monitoring configuration adapted the template by the graph team in the [mission control repository](https://github.com/graphprotocol/mission-control-indexer). 8 | 9 | The main difference is that it's cheaper. The full deployment on google cloud costs $500 per month while this simple docker compose script can be hosted on any bare metal server with more than 10 cores at about the same performance. The drawback is that no backups are created. 10 | 11 | The data is stored in named volumes on the docker host and can be exported / copied over to a bigger machine once more performance is needed. 12 | 13 | The minimum competitive configuration I would assume to be the CPX51 VPS at [Hetzner](https://hetzner.cloud/?ref=uqph3EQTVIIR). By signing up using my referral link you can save 20€ and I get 10€ bonus for more experiments. 14 | 15 | You will need an archive node to complete the testnet challenge. For testing purposes I can offer mine but make no guarantees regarding performance. 16 | 17 | ## Get a domain 18 | 19 | To enable SSL on your host you should get a domain. 20 | 21 | You can use any domain and any registrar that allows you to edit DNS records to point subdomains to your IP address. 22 | 23 | For a free option go to [dotTK](http://www.dot.tk) and find a free domain name. Create an account and complete the registration. 24 | 25 | Hint: the shop is a bit broken.
On the first try to checkout the shopping cart was empty for me, but there is a link to go back to the search results. Click this to go back, put the domain in the shopping cart again, next time it worked. 26 | 27 | In the last step choose "use dns" and enter the IP address of your server for 2 subdomains like in the picture. You can choose up to 12 months for free. 28 | 29 | ![dotTK checkout](/dottk_checkout.png) 30 | 31 | Under "Service > My Domains > Manage Domain > Manage Freenom DNS" you can add more subdomains later for e.g. the Grafana dashboard. 32 | 33 | ## Install 34 | 35 | Prerequisites: 36 | 37 | * Docker Engine >= 1.13 38 | * Docker Compose >= 1.11 39 | 40 | On a fresh Ubuntu server login via ssh and execute the following commands: 41 | 42 | ```bash 43 | apt update -y 44 | apt install docker.io docker-compose httpie 45 | ``` 46 | 47 | Clone this repository on your Docker host, cd into graphprotocol-infrastructure directory and run compose up: 48 | 49 | ```bash 50 | git clone https://github.com/butterfly-academy/graphprotocol-infrastructure.git 51 | cd graphprotocol-infrastructure 52 | 53 | EMAIL=my@email INDEX_HOST=index.mydomain.tk QUERY_HOST=query.mydomain.tk ADMIN_USER=admin ADMIN_PASSWORD=change_me ETHEREUM="mainnet:" ETHEREUM_START_BLOCK=7710671 docker-compose up -d 54 | ``` 55 | 56 | The ADMIN_USER and ADMIN_PASSWORD will be used by Grafana, Prometheus and AlertManager. 57 | QUERY_HOST and INDEX_HOST should point to the subdomains created earlier. 
58 | 59 | Containers: 60 | 61 | * Graph Node (indexer / query node) `http://:8000` 62 | * Postgres Database 63 | * Prometheus (metrics database) `http://:9090` 64 | * Prometheus-Pushgateway (push acceptor for ephemeral and batch jobs) `http://:9091` 65 | * AlertManager (alerts management) `http://:9093` 66 | * Grafana (visualize metrics) `http://:3000` 67 | * NodeExporter (host metrics collector) 68 | * cAdvisor (containers metrics collector) 69 | * Caddy (reverse proxy and basic auth provider for prometheus and alertmanager) 70 | 71 | ## Indexing Subgraphs 72 | 73 | Connect via ssh to the server and issue the following commands to index the subgraphs required for phase 0 of the testnet challenge. 74 | 75 | ```bash 76 | http post 127.0.0.1:8020 jsonrpc="2.0" method="subgraph_create" id="2" params:='{"name": "synthetixio-team/synthetix"}' 77 | http post 127.0.0.1:8020 jsonrpc="2.0" id="2" method="subgraph_deploy" params:='{"name": "synthetixio-team/synthetix", "ipfs_hash": "Qme2hDXrkBpuXAYEuwGPAjr6zwiMZV4FHLLBa3BHzatBWx"}' 78 | 79 | http post 127.0.0.1:8020 jsonrpc="2.0" method="subgraph_create" id="2" params:='{"name": "uniswap/uniswap-v2"}' 80 | http post 127.0.0.1:8020 jsonrpc="2.0" id="2" method="subgraph_deploy" params:='{"name": "uniswap/uniswap-v2", "ipfs_hash": "QmXKwSEMirgWVn41nRzkT3hpUBw29cp619Gx58XW6mPhZP"}' 81 | 82 | http post 127.0.0.1:8020 jsonrpc="2.0" method="subgraph_create" id="1" params:='{"name": "molochventures/moloch"}' 83 | http post 127.0.0.1:8020 jsonrpc="2.0" id="1" method="subgraph_deploy" params:='{"name": "molochventures/moloch", "ipfs_hash": "QmTXzATwNfgGVukV1fX2T6xw9f6LAYRVWpsdXyRWzUR2H9"}' 84 | 85 | http post 127.0.0.1:8020 jsonrpc="2.0" method="subgraph_create" id="4" params:='{"name": "jannis/gravity"}' 86 | http post 127.0.0.1:8020 jsonrpc="2.0" id="4" method="subgraph_deploy" params:='{"name": "jannis/gravity", "ipfs_hash": "QmbeDC4G8iPAUJ6tRBu99vwyYkaSiFwtXWKwwYkoNphV4X"}' 87 | ``` 88 | 89 | ## Debugging 90 | 91 | In case of
problems you can access the log output of each container (e.g. graph-node) with the command 92 | 93 | ```bash 94 | docker logs --follow --tail 100 graph-node 95 | ``` 96 | 97 | ## Setup Grafana 98 | 99 | Navigate to `http://:3000` and login with user ***admin*** password ***admin***. You can change the credentials in the compose file or by supplying the `ADMIN_USER` and `ADMIN_PASSWORD` environment variables on compose up. The config file can be added directly in grafana part like this 100 | ``` 101 | grafana: 102 | image: grafana/grafana:5.2.4 103 | env_file: 104 | - config 105 | 106 | ``` 107 | and the config file format should have this content 108 | ``` 109 | GF_SECURITY_ADMIN_USER=admin 110 | GF_SECURITY_ADMIN_PASSWORD=changeme 111 | GF_USERS_ALLOW_SIGN_UP=false 112 | ``` 113 | If you want to change the password, you have to remove this entry, otherwise the change will not take effect 114 | ``` 115 | - grafana_data:/var/lib/grafana 116 | ``` 117 | 118 | Grafana is preconfigured with dashboards and Prometheus plus Postgres as the default data source: 119 | 120 | ***Docker Host Dashboard*** 121 | 122 | ![Host](https://raw.githubusercontent.com/stefanprodan/dockprom/master/screens/Grafana_Docker_Host.png) 123 | 124 | The Docker Host Dashboard shows key metrics for monitoring the resource usage of your server: 125 | 126 | * Server uptime, CPU idle percent, number of CPU cores, available memory, swap and storage 127 | * System load average graph, running and blocked by IO processes graph, interrupts graph 128 | * CPU usage graph by mode (guest, idle, iowait, irq, nice, softirq, steal, system, user) 129 | * Memory usage graph by distribution (used, free, buffers, cached) 130 | * IO usage graph (read Bps, write Bps and IO time) 131 | * Network usage graph by device (inbound Bps, Outbound Bps) 132 | * Swap usage and activity graphs 133 | 134 | For storage and particularly Free Storage graph, you have to specify the fstype in grafana graph request.
135 | You can find it in `grafana/provisioning/dashboards/docker_host.json`, at line 480 : 136 | 137 | "expr": "sum(node_filesystem_free_bytes{fstype=\"btrfs\"})", 138 | 139 | I work on BTRFS, so I need to change `aufs` to `btrfs`. 140 | 141 | You can find the right value for your system in Prometheus `http://:9090` by launching this request : 142 | 143 | node_filesystem_free_bytes 144 | 145 | ***Docker Containers Dashboard*** 146 | 147 | ![Containers](https://raw.githubusercontent.com/stefanprodan/dockprom/master/screens/Grafana_Docker_Containers.png) 148 | 149 | The Docker Containers Dashboard shows key metrics for monitoring running containers: 150 | 151 | * Total containers CPU load, memory and storage usage 152 | * Running containers graph, system load graph, IO usage graph 153 | * Container CPU usage graph 154 | * Container memory usage graph 155 | * Container cached memory usage graph 156 | * Container network inbound usage graph 157 | * Container network outbound usage graph 158 | 159 | Note that this dashboard doesn't show the containers that are part of the monitoring stack.
160 | 161 | ***Monitor Services Dashboard*** 162 | 163 | ![Monitor Services](https://raw.githubusercontent.com/stefanprodan/dockprom/master/screens/Grafana_Prometheus.png) 164 | 165 | The Monitor Services Dashboard shows key metrics for monitoring the containers that make up the monitoring stack: 166 | 167 | * Prometheus container uptime, monitoring stack total memory usage, Prometheus local storage memory chunks and series 168 | * Container CPU usage graph 169 | * Container memory usage graph 170 | * Prometheus chunks to persist and persistence urgency graphs 171 | * Prometheus chunks ops and checkpoint duration graphs 172 | * Prometheus samples ingested rate, target scrapes and scrape duration graphs 173 | * Prometheus HTTP requests graph 174 | * Prometheus alerts graph 175 | 176 | ## Define alerts 177 | 178 | Three alert groups have been setup within the [alert.rules](https://github.com/stefanprodan/dockprom/blob/master/prometheus/alert.rules) configuration file: 179 | 180 | * Monitoring services alerts [targets](https://github.com/stefanprodan/dockprom/blob/master/prometheus/alert.rules#L2-L11) 181 | * Docker Host alerts [host](https://github.com/stefanprodan/dockprom/blob/master/prometheus/alert.rules#L13-L40) 182 | * Docker Containers alerts [containers](https://github.com/stefanprodan/dockprom/blob/master/prometheus/alert.rules#L42-L69) 183 | 184 | You can modify the alert rules and reload them by making a HTTP POST call to Prometheus: 185 | 186 | ``` 187 | curl -X POST http://admin:admin@:9090/-/reload 188 | ``` 189 | 190 | ***Monitoring services alerts*** 191 | 192 | Trigger an alert if any of the monitoring targets (node-exporter and cAdvisor) are down for more than 30 seconds: 193 | 194 | ```yaml 195 | - alert: monitor_service_down 196 | expr: up == 0 197 | for: 30s 198 | labels: 199 | severity: critical 200 | annotations: 201 | summary: "Monitor service non-operational" 202 | description: "Service {{ $labels.instance }} is down." 
203 | ``` 204 | 205 | ***Docker Host alerts*** 206 | 207 | Trigger an alert if the Docker host CPU is under high load for more than 30 seconds: 208 | 209 | ```yaml 210 | - alert: high_cpu_load 211 | expr: node_load1 > 1.5 212 | for: 30s 213 | labels: 214 | severity: warning 215 | annotations: 216 | summary: "Server under high load" 217 | description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." 218 | ``` 219 | 220 | Modify the load threshold based on your CPU cores. 221 | 222 | Trigger an alert if the Docker host memory is almost full: 223 | 224 | ```yaml 225 | - alert: high_memory_load 226 | expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85 227 | for: 30s 228 | labels: 229 | severity: warning 230 | annotations: 231 | summary: "Server memory is almost full" 232 | description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." 233 | ``` 234 | 235 | Trigger an alert if the Docker host storage is almost full: 236 | 237 | ```yaml 238 | - alert: high_storage_load 239 | expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85 240 | for: 30s 241 | labels: 242 | severity: warning 243 | annotations: 244 | summary: "Server storage is almost full" 245 | description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." 
246 | ``` 247 | 248 | ***Docker Containers alerts*** 249 | 250 | Trigger an alert if a container is down for more than 30 seconds: 251 | 252 | ```yaml 253 | - alert: jenkins_down 254 | expr: absent(container_memory_usage_bytes{name="jenkins"}) 255 | for: 30s 256 | labels: 257 | severity: critical 258 | annotations: 259 | summary: "Jenkins down" 260 | description: "Jenkins container is down for more than 30 seconds." 261 | ``` 262 | 263 | Trigger an alert if a container is using more than 10% of total CPU cores for more than 30 seconds: 264 | 265 | ```yaml 266 | - alert: jenkins_high_cpu 267 | expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10 268 | for: 30s 269 | labels: 270 | severity: warning 271 | annotations: 272 | summary: "Jenkins high CPU usage" 273 | description: "Jenkins CPU usage is {{ humanize $value}}%." 274 | ``` 275 | 276 | Trigger an alert if a container is using more than 1.2GB of RAM for more than 30 seconds: 277 | 278 | ```yaml 279 | - alert: jenkins_high_memory 280 | expr: sum(container_memory_usage_bytes{name="jenkins"}) > 1200000000 281 | for: 30s 282 | labels: 283 | severity: warning 284 | annotations: 285 | summary: "Jenkins high memory usage" 286 | description: "Jenkins memory consumption is at {{ humanize $value}}." 287 | ``` 288 | 289 | ## Setup alerting 290 | 291 | The AlertManager service is responsible for handling alerts sent by Prometheus server. 292 | AlertManager can send notifications via email, Pushover, Slack, HipChat or any other system that exposes a webhook interface. 293 | A complete list of integrations can be found [here](https://prometheus.io/docs/alerting/configuration). 294 | 295 | You can view and silence notifications by accessing `http://:9093`. 296 | 297 | The notification receivers can be configured in [alertmanager/config.yml](https://github.com/stefanprodan/dockprom/blob/master/alertmanager/config.yml) file. 
298 | 299 | To receive alerts via Slack you need to make a custom integration by choosing ***incoming web hooks*** in your Slack team app page. 300 | You can find more details on setting up Slack integration [here](http://www.robustperception.io/using-slack-with-the-alertmanager/). 301 | 302 | Copy the Slack Webhook URL into the ***api_url*** field and specify a Slack ***channel***. 303 | 304 | ```yaml 305 | route: 306 | receiver: 'slack' 307 | group_by: ['...'] 308 | 309 | receivers: 310 | - name: 'slack' 311 | slack_configs: 312 | - send_resolved: true 313 | text: "{{ .CommonAnnotations.description }}" 314 | username: 'Prometheus' 315 | channel: '#' 316 | api_url: 'https://hooks.slack.com/services/' 317 | ``` 318 | 319 | ![Slack Notifications](https://raw.githubusercontent.com/stefanprodan/dockprom/master/screens/Slack_Notifications.png) 320 | 321 | 322 | ## Using Email for alerting 323 | 324 | ```yaml 325 | route: 326 | receiver: 'email' 327 | group_by: ['...'] 328 | 329 | receivers: 330 | - name: 'email' 331 | email_configs: 332 | - to: receiver@email.com 333 | from: my@email.com 334 | smarthost: mail.server.biz:587 335 | auth_username: my@email.com 336 | auth_password: password 337 | require_tls: true 338 | ``` 339 | 340 | Note: setting up sending alerts from popular services like Gmail is more complicated due to higher security precautions. You need App passwords and stuff. Take it easy and use a small but standards-compliant provider. 341 | 342 | 343 | ## Sending metrics to the Pushgateway 344 | 345 | The [pushgateway](https://github.com/prometheus/pushgateway) is used to collect data from batch jobs or from services. 346 | 347 | To push data, simply execute: 348 | 349 | echo "some_metric 3.14" | curl --data-binary @- http://user:password@localhost:9091/metrics/job/some_job 350 | 351 | Please replace the `user:password` part with your user and password set in the initial configuration (default: `admin:admin`).
352 | 353 | ## Updating Grafana to v5.2.2 354 | 355 | [In Grafana versions >= 5.1 the id of the grafana user has been changed](http://docs.grafana.org/installation/docker/#migration-from-a-previous-version-of-the-docker-container-to-5-1-or-later). Unfortunately this means that files created prior to 5.1 won’t have the correct permissions for later versions. 356 | 357 | | Version | User | User ID | 358 | |:-------:|:-------:|:-------:| 359 | | < 5.1 | grafana | 104 | 360 | | \>= 5.1 | grafana | 472 | 361 | 362 | There are two possible solutions to this problem. 363 | - Change ownership from 104 to 472 364 | - Start the upgraded container as user 104 365 | 366 | ##### Specifying a user in docker-compose.yml 367 | 368 | To change ownership of the files run your grafana container as root and modify the permissions. 369 | 370 | First perform a `docker-compose down` then modify your docker-compose.yml to include the `user: root` option: 371 | 372 | ``` 373 | grafana: 374 | image: grafana/grafana:5.2.2 375 | container_name: grafana 376 | volumes: 377 | - grafana_data:/var/lib/grafana 378 | - ./grafana/datasources:/etc/grafana/datasources 379 | - ./grafana/dashboards:/etc/grafana/dashboards 380 | - ./grafana/setup.sh:/setup.sh 381 | entrypoint: /setup.sh 382 | user: root 383 | environment: 384 | - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin} 385 | - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} 386 | - GF_USERS_ALLOW_SIGN_UP=false 387 | restart: unless-stopped 388 | expose: 389 | - 3000 390 | networks: 391 | - monitor-net 392 | labels: 393 | org.label-schema.group: "monitoring" 394 | ``` 395 | 396 | Perform a `docker-compose up -d` and then issue the following commands: 397 | 398 | ``` 399 | docker exec -it --user root grafana bash 400 | 401 | # in the container you just started: 402 | chown -R root:root /etc/grafana && \ 403 | chmod -R a+r /etc/grafana && \ 404 | chown -R grafana:grafana /var/lib/grafana && \ 405 | chown -R grafana:grafana /usr/share/grafana 406 | 
``` 407 | 408 | To run the grafana container as `user: 104` change your `docker-compose.yml` like such: 409 | 410 | ``` 411 | grafana: 412 | image: grafana/grafana:5.2.2 413 | container_name: grafana 414 | volumes: 415 | - grafana_data:/var/lib/grafana 416 | - ./grafana/datasources:/etc/grafana/datasources 417 | - ./grafana/dashboards:/etc/grafana/dashboards 418 | - ./grafana/setup.sh:/setup.sh 419 | entrypoint: /setup.sh 420 | user: "104" 421 | environment: 422 | - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin} 423 | - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} 424 | - GF_USERS_ALLOW_SIGN_UP=false 425 | restart: unless-stopped 426 | expose: 427 | - 3000 428 | networks: 429 | - monitor-net 430 | labels: 431 | org.label-schema.group: "monitoring" 432 | ``` 433 | -------------------------------------------------------------------------------- /alertmanager/config.yml: -------------------------------------------------------------------------------- 1 | route: 2 | receiver: 'slack' 3 | 4 | receivers: 5 | - name: 'slack' 6 | slack_configs: 7 | - send_resolved: true 8 | text: "{{ .CommonAnnotations.description }}" 9 | username: 'Prometheus' 10 | channel: '#' 11 | api_url: 'https://hooks.slack.com/services/' 12 | 13 | -------------------------------------------------------------------------------- /caddy/Caddyfile: -------------------------------------------------------------------------------- 1 | :9090 { 2 | basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD} 3 | proxy / prometheus:9090 { 4 | transparent 5 | } 6 | 7 | errors stderr 8 | tls off 9 | } 10 | 11 | :9093 { 12 | basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD} 13 | proxy / alertmanager:9093 { 14 | transparent 15 | } 16 | 17 | errors stderr 18 | tls off 19 | } 20 | 21 | :9091 { 22 | basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD} 23 | proxy / pushgateway:9091 { 24 | transparent 25 | } 26 | 27 | errors stderr 28 | tls off 29 | } 30 | 31 | :3000 { 32 | proxy / grafana:3000 { 33 | transparent 34 | websocket 35 | } 36 | 37 | 
errors stderr 38 | tls off 39 | } 40 | 41 | :8000 { 42 | proxy / graph-node:8000 { 43 | transparent 44 | } 45 | 46 | errors stderr 47 | tls off 48 | } 49 | 50 | :8001 { 51 | proxy / graph-node:8001 { 52 | transparent 53 | websocket 54 | } 55 | 56 | errors stderr 57 | tls off 58 | } -------------------------------------------------------------------------------- /config: -------------------------------------------------------------------------------- 1 | GF_SECURITY_ADMIN_USER=admin 2 | GF_SECURITY_ADMIN_PASSWORD=changeme 3 | GF_USERS_ALLOW_SIGN_UP=false 4 | -------------------------------------------------------------------------------- /docker-compose.exporters.yml: -------------------------------------------------------------------------------- 1 | version: '2.1' 2 | 3 | services: 4 | 5 | nodeexporter: 6 | image: prom/node-exporter:v1.0.1 7 | container_name: nodeexporter 8 | volumes: 9 | - /proc:/host/proc:ro 10 | - /sys:/host/sys:ro 11 | - /:/rootfs:ro 12 | command: 13 | - '--path.procfs=/host/proc' 14 | - '--path.rootfs=/rootfs' 15 | - '--path.sysfs=/host/sys' 16 | - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)' 17 | restart: unless-stopped 18 | network_mode: host 19 | labels: 20 | org.label-schema.group: "monitoring" 21 | 22 | cadvisor: 23 | image: gcr.io/cadvisor/cadvisor:v0.37.0 24 | container_name: cadvisor 25 | volumes: 26 | - /:/rootfs:ro 27 | - /var/run:/var/run:rw 28 | - /sys:/sys:ro 29 | - /var/lib/docker/:/var/lib/docker:ro 30 | - /cgroup:/cgroup:ro 31 | restart: unless-stopped 32 | network_mode: host 33 | labels: 34 | org.label-schema.group: "monitoring" 35 | 36 | 37 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.1' 2 | 3 | networks: 4 | monitor-net: 5 | driver: bridge 6 | 7 | volumes: 8 | prometheus_data: {} 9 | grafana_data: {} 10 | postgres_data: {} 11 | 
nginx_certs: {} 12 | nginx_vhost: {} 13 | nginx_html: {} 14 | 15 | services: 16 | graph-node: 17 | image: graphprotocol/graph-node 18 | container_name: graph-node 19 | depends_on: 20 | - postgres 21 | environment: 22 | postgres_host: postgres:5432 23 | postgres_user: ${DB_USER:-graph-node} 24 | postgres_pass: ${DB_PASS:-let-me-in} 25 | postgres_db: ${DB_NAME:-graph-node} 26 | ipfs: '${IPFS_ENDPOINT:-https://testnet.thegraph.com/ipfs/}' 27 | ethereum: '${ETHEREUM}' 28 | ETHEREUM_START_BLOCK: ${ETHEREUM_START_BLOCK:-7710671} 29 | RUST_LOG: info 30 | VIRTUAL_HOST: ${QUERY_HOST:-query.butterflylabs.tk} 31 | VIRTUAL_PORT: 8000 32 | LETSENCRYPT_HOST: ${QUERY_HOST:-query.butterflylabs.tk} 33 | restart: unless-stopped 34 | ports: 35 | - 127.0.0.1:8020:8020 36 | expose: 37 | - 8001 38 | - 8000 39 | - 8030 40 | - 8040 41 | networks: 42 | - monitor-net 43 | nginx-proxy: 44 | image: jwilder/nginx-proxy 45 | container_name: nginx-proxy 46 | ports: 47 | - "80:80" 48 | - "443:443" 49 | volumes: 50 | - /var/run/docker.sock:/tmp/docker.sock:ro 51 | - nginx_certs:/etc/nginx/certs 52 | - nginx_vhost:/etc/nginx/vhost.d 53 | - nginx_html:/usr/share/nginx/html 54 | networks: 55 | - monitor-net 56 | restart: unless-stopped 57 | nginx-ssl: 58 | image: jrcs/letsencrypt-nginx-proxy-companion 59 | container_name: nginx-ssl 60 | volumes: 61 | - /var/run/docker.sock:/var/run/docker.sock:ro 62 | - nginx_certs:/etc/nginx/certs 63 | - nginx_vhost:/etc/nginx/vhost.d 64 | - nginx_html:/usr/share/nginx/html 65 | restart: unless-stopped 66 | networks: 67 | - monitor-net 68 | restart: unless-stopped 69 | environment: 70 | NGINX_PROXY_CONTAINER: nginx-proxy 71 | DEFAULT_EMAIL: ${EMAIL:-sebastian.heyden@gmail.com} 72 | postgres: 73 | image: postgres 74 | container_name: postgres 75 | command: ["postgres", "-cshared_preload_libraries=pg_stat_statements"] 76 | restart: unless-stopped 77 | environment: 78 | POSTGRES_USER: ${DB_USER:-graph-node} 79 | POSTGRES_PASSWORD: ${DB_PASS:-let-me-in} 80 | 
POSTGRES_DB: ${DB_NAME:-graph-node} 81 | volumes: 82 | - postgres_data:/var/lib/postgresql/data 83 | networks: 84 | - monitor-net 85 | expose: 86 | - 5432 87 | 88 | prometheus: 89 | image: prom/prometheus:v2.20.0 90 | container_name: prometheus 91 | volumes: 92 | - ./prometheus:/etc/prometheus 93 | - prometheus_data:/prometheus 94 | command: 95 | - '--config.file=/etc/prometheus/prometheus.yml' 96 | - '--storage.tsdb.path=/prometheus' 97 | - '--web.console.libraries=/etc/prometheus/console_libraries' 98 | - '--web.console.templates=/etc/prometheus/consoles' 99 | - '--storage.tsdb.retention.time=200h' 100 | - '--web.enable-lifecycle' 101 | restart: unless-stopped 102 | expose: 103 | - 9090 104 | networks: 105 | - monitor-net 106 | labels: 107 | org.label-schema.group: "monitoring" 108 | 109 | alertmanager: 110 | image: prom/alertmanager:v0.21.0 111 | container_name: alertmanager 112 | volumes: 113 | - ./alertmanager:/etc/alertmanager 114 | command: 115 | - '--config.file=/etc/alertmanager/config.yml' 116 | - '--storage.path=/alertmanager' 117 | restart: unless-stopped 118 | expose: 119 | - 9093 120 | networks: 121 | - monitor-net 122 | labels: 123 | org.label-schema.group: "monitoring" 124 | 125 | nodeexporter: 126 | image: prom/node-exporter:v1.0.1 127 | container_name: nodeexporter 128 | volumes: 129 | - /proc:/host/proc:ro 130 | - /sys:/host/sys:ro 131 | - /:/rootfs:ro 132 | command: 133 | - '--path.procfs=/host/proc' 134 | - '--path.rootfs=/rootfs' 135 | - '--path.sysfs=/host/sys' 136 | - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)' 137 | restart: unless-stopped 138 | expose: 139 | - 9100 140 | networks: 141 | - monitor-net 142 | labels: 143 | org.label-schema.group: "monitoring" 144 | 145 | cadvisor: 146 | image: gcr.io/cadvisor/cadvisor:v0.37.0 147 | container_name: cadvisor 148 | volumes: 149 | - /:/rootfs:ro 150 | - /var/run:/var/run:rw 151 | - /sys:/sys:ro 152 | - /var/lib/docker:/var/lib/docker:ro 153 | - /cgroup:/cgroup:ro 
# Linux only; this mount does not work on macOS 154 | restart: unless-stopped 155 | expose: 156 | - 8080 157 | networks: 158 | - monitor-net 159 | labels: 160 | org.label-schema.group: "monitoring" 161 | 162 | grafana: 163 | image: grafana/grafana:7.1.1 164 | container_name: grafana 165 | volumes: 166 | - grafana_data:/var/lib/grafana 167 | - ./grafana/provisioning:/etc/grafana/provisioning 168 | environment: 169 | - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin} 170 | - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} 171 | - GF_USERS_ALLOW_SIGN_UP=false 172 | - GF_SECURITY_DISABLE_BRUTE_FORCE_LOGIN_PROTECTION=true 173 | - postgres_host=postgres 174 | - postgres_user=${DB_USER:-graph-node} 175 | - postgres_pass=${DB_PASS:-let-me-in} 176 | - postgres_db=${DB_NAME:-graph-node} 177 | - VIRTUAL_HOST=${INDEX_HOST:-index.butterflylabs.tk} 178 | - VIRTUAL_PORT=3000 179 | - LETSENCRYPT_HOST=${INDEX_HOST:-index.butterflylabs.tk} 180 | restart: unless-stopped 181 | expose: 182 | - 3000 183 | networks: 184 | - monitor-net 185 | labels: 186 | org.label-schema.group: "monitoring" 187 | 188 | pushgateway: 189 | image: prom/pushgateway:v1.2.0 190 | container_name: pushgateway 191 | restart: unless-stopped 192 | expose: 193 | - 9091 194 | networks: 195 | - monitor-net 196 | labels: 197 | org.label-schema.group: "monitoring" 198 | 199 | caddy: 200 | image: stefanprodan/caddy 201 | container_name: caddy 202 | ports: 203 | #- "3000:3000" 204 | #- "8000:8000" 205 | - "8001:8001" 206 | - "9090:9090" 207 | - "9093:9093" 208 | - "9091:9091" 209 | volumes: 210 | - ./caddy:/etc/caddy 211 | environment: 212 | - ADMIN_USER=${ADMIN_USER:-admin} 213 | - ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} 214 | restart: unless-stopped 215 | networks: 216 | - monitor-net 217 | labels: 218 | org.label-schema.group: "monitoring" 219 | -------------------------------------------------------------------------------- /dottk_checkout.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/butterfly-academy/graphprotocol-infrastructure/5a92f9b92949019c7a149f70908fe5caee95f425/dottk_checkout.png -------------------------------------------------------------------------------- /grafana/provisioning/dashboards/dashboard.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'Prometheus' 5 | orgId: 1 6 | folder: '' 7 | type: file 8 | disableDeletion: false 9 | editable: true 10 | allowUiUpdates: true 11 | options: 12 | path: /etc/grafana/provisioning/dashboards -------------------------------------------------------------------------------- /grafana/provisioning/dashboards/default.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | providers: 3 | - name: 'default' 4 | orgId: 1 5 | folder: '' 6 | folderUid: '' 7 | type: file 8 | allowUiUpdates: true 9 | updateIntervalSeconds: 31536000 10 | options: 11 | path: /etc/grafana/provisioning/dashboards 12 | -------------------------------------------------------------------------------- /grafana/provisioning/dashboards/docker_containers.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": null, 3 | "title": "Docker Containers", 4 | "description": "Containers metrics", 5 | "tags": [ 6 | "docker" 7 | ], 8 | "style": "dark", 9 | "timezone": "browser", 10 | "editable": true, 11 | "hideControls": false, 12 | "sharedCrosshair": true, 13 | "rows": [ 14 | { 15 | "collapse": false, 16 | "editable": true, 17 | "height": "150px", 18 | "panels": [ 19 | { 20 | "cacheTimeout": null, 21 | "colorBackground": false, 22 | "colorValue": false, 23 | "colors": [ 24 | "rgba(50, 172, 45, 0.97)", 25 | "rgba(237, 129, 40, 0.89)", 26 | "rgba(245, 54, 54, 0.9)" 27 | ], 28 | "datasource": "prometheus", 29 | "decimals": 2, 30 | "editable": true, 31 | "error": false, 32 | "format": "percent", 33 | "gauge": { 34 | "maxValue": 
100, 35 | "minValue": 0, 36 | "show": true, 37 | "thresholdLabels": false, 38 | "thresholdMarkers": true 39 | }, 40 | "id": 4, 41 | "interval": null, 42 | "isNew": true, 43 | "links": [], 44 | "mappingType": 1, 45 | "mappingTypes": [ 46 | { 47 | "name": "value to text", 48 | "value": 1 49 | }, 50 | { 51 | "name": "range to text", 52 | "value": 2 53 | } 54 | ], 55 | "maxDataPoints": 100, 56 | "nullPointMode": "connected", 57 | "nullText": null, 58 | "postfix": "", 59 | "postfixFontSize": "50%", 60 | "prefix": "", 61 | "prefixFontSize": "50%", 62 | "rangeMaps": [ 63 | { 64 | "from": "null", 65 | "text": "N/A", 66 | "to": "null" 67 | } 68 | ], 69 | "span": 2, 70 | "sparkline": { 71 | "fillColor": "rgba(31, 118, 189, 0.18)", 72 | "full": false, 73 | "lineColor": "rgb(31, 120, 193)", 74 | "show": false 75 | }, 76 | "targets": [ 77 | { 78 | "expr": "sum(rate(container_cpu_user_seconds_total{image!=\"\"}[1m])) / count(node_cpu_seconds_total{mode=\"user\"}) * 100", 79 | "interval": "10s", 80 | "intervalFactor": 1, 81 | "legendFormat": "", 82 | "refId": "A", 83 | "step": 10 84 | } 85 | ], 86 | "thresholds": "65, 90", 87 | "title": "CPU Load", 88 | "transparent": false, 89 | "type": "singlestat", 90 | "valueFontSize": "80%", 91 | "valueMaps": [ 92 | { 93 | "op": "=", 94 | "text": "N/A", 95 | "value": "null" 96 | } 97 | ], 98 | "valueName": "avg", 99 | "timeFrom": "10s", 100 | "hideTimeOverride": true 101 | }, 102 | { 103 | "cacheTimeout": null, 104 | "colorBackground": false, 105 | "colorValue": false, 106 | "colors": [ 107 | "rgba(245, 54, 54, 0.9)", 108 | "rgba(237, 129, 40, 0.89)", 109 | "rgba(50, 172, 45, 0.97)" 110 | ], 111 | "datasource": "prometheus", 112 | "editable": true, 113 | "error": false, 114 | "format": "none", 115 | "gauge": { 116 | "maxValue": 100, 117 | "minValue": 0, 118 | "show": false, 119 | "thresholdLabels": false, 120 | "thresholdMarkers": true 121 | }, 122 | "id": 7, 123 | "interval": null, 124 | "isNew": true, 125 | "links": [], 126 | 
"mappingType": 1, 127 | "mappingTypes": [ 128 | { 129 | "name": "value to text", 130 | "value": 1 131 | }, 132 | { 133 | "name": "range to text", 134 | "value": 2 135 | } 136 | ], 137 | "maxDataPoints": 100, 138 | "nullPointMode": "connected", 139 | "nullText": null, 140 | "postfix": "", 141 | "postfixFontSize": "50%", 142 | "prefix": "", 143 | "prefixFontSize": "50%", 144 | "rangeMaps": [ 145 | { 146 | "from": "null", 147 | "text": "N/A", 148 | "to": "null" 149 | } 150 | ], 151 | "span": 2, 152 | "sparkline": { 153 | "fillColor": "rgba(31, 118, 189, 0.18)", 154 | "full": false, 155 | "lineColor": "rgb(31, 120, 193)", 156 | "show": false 157 | }, 158 | "targets": [ 159 | { 160 | "expr": "machine_cpu_cores", 161 | "interval": "", 162 | "intervalFactor": 2, 163 | "legendFormat": "", 164 | "metric": "machine_cpu_cores", 165 | "refId": "A", 166 | "step": 20 167 | } 168 | ], 169 | "thresholds": "", 170 | "title": "CPU Cores", 171 | "type": "singlestat", 172 | "valueFontSize": "80%", 173 | "valueMaps": [ 174 | { 175 | "op": "=", 176 | "text": "N/A", 177 | "value": "null" 178 | } 179 | ], 180 | "valueName": "avg" 181 | }, 182 | { 183 | "cacheTimeout": null, 184 | "colorBackground": false, 185 | "colorValue": false, 186 | "colors": [ 187 | "rgba(50, 172, 45, 0.97)", 188 | "rgba(237, 129, 40, 0.89)", 189 | "rgba(245, 54, 54, 0.9)" 190 | ], 191 | "datasource": "prometheus", 192 | "editable": true, 193 | "error": false, 194 | "format": "percent", 195 | "gauge": { 196 | "maxValue": 100, 197 | "minValue": 0, 198 | "show": true, 199 | "thresholdLabels": false, 200 | "thresholdMarkers": true 201 | }, 202 | "id": 5, 203 | "interval": null, 204 | "isNew": true, 205 | "links": [], 206 | "mappingType": 1, 207 | "mappingTypes": [ 208 | { 209 | "name": "value to text", 210 | "value": 1 211 | }, 212 | { 213 | "name": "range to text", 214 | "value": 2 215 | } 216 | ], 217 | "maxDataPoints": 100, 218 | "nullPointMode": "connected", 219 | "nullText": null, 220 | "postfix": "", 221 | 
"postfixFontSize": "50%", 222 | "prefix": "", 223 | "prefixFontSize": "50%", 224 | "rangeMaps": [ 225 | { 226 | "from": "null", 227 | "text": "N/A", 228 | "to": "null" 229 | } 230 | ], 231 | "span": 2, 232 | "sparkline": { 233 | "fillColor": "rgba(31, 118, 189, 0.18)", 234 | "full": false, 235 | "lineColor": "rgb(31, 120, 193)", 236 | "show": false 237 | }, 238 | "targets": [ 239 | { 240 | "expr": "(sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100", 241 | "interval": "10s", 242 | "intervalFactor": 2, 243 | "legendFormat": "", 244 | "refId": "A", 245 | "step": 20 246 | } 247 | ], 248 | "thresholds": "65, 90", 249 | "title": "Memory Load", 250 | "transparent": false, 251 | "type": "singlestat", 252 | "valueFontSize": "80%", 253 | "valueMaps": [ 254 | { 255 | "op": "=", 256 | "text": "N/A", 257 | "value": "null" 258 | } 259 | ], 260 | "valueName": "avg", 261 | "timeFrom": "10s", 262 | "hideTimeOverride": true 263 | }, 264 | { 265 | "cacheTimeout": null, 266 | "colorBackground": false, 267 | "colorValue": false, 268 | "colors": [ 269 | "rgba(245, 54, 54, 0.9)", 270 | "rgba(237, 129, 40, 0.89)", 271 | "rgba(50, 172, 45, 0.97)" 272 | ], 273 | "datasource": "prometheus", 274 | "decimals": 2, 275 | "editable": true, 276 | "error": false, 277 | "format": "bytes", 278 | "gauge": { 279 | "maxValue": 100, 280 | "minValue": 0, 281 | "show": false, 282 | "thresholdLabels": false, 283 | "thresholdMarkers": true 284 | }, 285 | "id": 2, 286 | "interval": null, 287 | "isNew": true, 288 | "links": [], 289 | "mappingType": 1, 290 | "mappingTypes": [ 291 | { 292 | "name": "value to text", 293 | "value": 1 294 | }, 295 | { 296 | "name": "range to text", 297 | "value": 2 298 | } 299 | ], 300 | "maxDataPoints": 100, 301 | "nullPointMode": "connected", 302 | "nullText": null, 303 | "postfix": "", 304 | "postfixFontSize": "50%", 305 | "prefix": "", 306 | "prefixFontSize": "50%", 307 
| "rangeMaps": [ 308 | { 309 | "from": "null", 310 | "text": "N/A", 311 | "to": "null" 312 | } 313 | ], 314 | "span": 2, 315 | "sparkline": { 316 | "fillColor": "rgba(31, 118, 189, 0.18)", 317 | "full": false, 318 | "lineColor": "rgb(31, 120, 193)", 319 | "show": false 320 | }, 321 | "targets": [ 322 | { 323 | "expr": "sum(container_memory_usage_bytes{image!=\"\"})", 324 | "interval": "10s", 325 | "intervalFactor": 2, 326 | "legendFormat": "", 327 | "refId": "A", 328 | "step": 20 329 | } 330 | ], 331 | "thresholds": "", 332 | "timeFrom": "10s", 333 | "title": "Used Memory", 334 | "transparent": false, 335 | "type": "singlestat", 336 | "valueFontSize": "80%", 337 | "valueMaps": [ 338 | { 339 | "op": "=", 340 | "text": "N/A", 341 | "value": "null" 342 | } 343 | ], 344 | "valueName": "avg", 345 | "hideTimeOverride": true 346 | }, 347 | { 348 | "cacheTimeout": null, 349 | "colorBackground": false, 350 | "colorValue": false, 351 | "colors": [ 352 | "rgba(50, 172, 45, 0.97)", 353 | "rgba(237, 129, 40, 0.89)", 354 | "rgba(245, 54, 54, 0.9)" 355 | ], 356 | "datasource": "prometheus", 357 | "decimals": null, 358 | "editable": true, 359 | "error": false, 360 | "format": "percent", 361 | "gauge": { 362 | "maxValue": 100, 363 | "minValue": 0, 364 | "show": true, 365 | "thresholdLabels": false, 366 | "thresholdMarkers": true 367 | }, 368 | "id": 6, 369 | "interval": null, 370 | "isNew": true, 371 | "links": [], 372 | "mappingType": 1, 373 | "mappingTypes": [ 374 | { 375 | "name": "value to text", 376 | "value": 1 377 | }, 378 | { 379 | "name": "range to text", 380 | "value": 2 381 | } 382 | ], 383 | "maxDataPoints": 100, 384 | "nullPointMode": "connected", 385 | "nullText": null, 386 | "postfix": "", 387 | "postfixFontSize": "50%", 388 | "prefix": "", 389 | "prefixFontSize": "50%", 390 | "rangeMaps": [ 391 | { 392 | "from": "null", 393 | "text": "N/A", 394 | "to": "null" 395 | } 396 | ], 397 | "span": 2, 398 | "sparkline": { 399 | "fillColor": "rgba(31, 118, 189, 0.18)", 400 | 
"full": false, 401 | "lineColor": "rgb(31, 120, 193)", 402 | "show": false 403 | }, 404 | "targets": [ 405 | { 406 | "expr": "(node_filesystem_size_bytes{fstype=\"aufs\"} - node_filesystem_free_bytes{fstype=\"aufs\"}) / node_filesystem_size_bytes{fstype=\"aufs\"} * 100", 407 | "interval": "30s", 408 | "intervalFactor": 1, 409 | "legendFormat": "", 410 | "refId": "A", 411 | "step": 30 412 | } 413 | ], 414 | "thresholds": "65, 90", 415 | "title": "Storage Load", 416 | "transparent": false, 417 | "type": "singlestat", 418 | "valueFontSize": "80%", 419 | "valueMaps": [ 420 | { 421 | "op": "=", 422 | "text": "N/A", 423 | "value": "null" 424 | } 425 | ], 426 | "valueName": "avg", 427 | "timeFrom": "10s", 428 | "hideTimeOverride": true 429 | }, 430 | { 431 | "cacheTimeout": null, 432 | "colorBackground": false, 433 | "colorValue": false, 434 | "colors": [ 435 | "rgba(245, 54, 54, 0.9)", 436 | "rgba(237, 129, 40, 0.89)", 437 | "rgba(50, 172, 45, 0.97)" 438 | ], 439 | "datasource": "prometheus", 440 | "decimals": 2, 441 | "editable": true, 442 | "error": false, 443 | "format": "bytes", 444 | "gauge": { 445 | "maxValue": 100, 446 | "minValue": 0, 447 | "show": false, 448 | "thresholdLabels": false, 449 | "thresholdMarkers": true 450 | }, 451 | "id": 3, 452 | "interval": null, 453 | "isNew": true, 454 | "links": [], 455 | "mappingType": 1, 456 | "mappingTypes": [ 457 | { 458 | "name": "value to text", 459 | "value": 1 460 | }, 461 | { 462 | "name": "range to text", 463 | "value": 2 464 | } 465 | ], 466 | "maxDataPoints": 100, 467 | "nullPointMode": "connected", 468 | "nullText": null, 469 | "postfix": "", 470 | "postfixFontSize": "50%", 471 | "prefix": "", 472 | "prefixFontSize": "50%", 473 | "rangeMaps": [ 474 | { 475 | "from": "null", 476 | "text": "N/A", 477 | "to": "null" 478 | } 479 | ], 480 | "span": 2, 481 | "sparkline": { 482 | "fillColor": "rgba(31, 118, 189, 0.18)", 483 | "full": false, 484 | "lineColor": "rgb(31, 120, 193)", 485 | "show": false 486 | }, 487 | 
"targets": [ 488 | { 489 | "expr": "sum(container_fs_usage_bytes)", 490 | "interval": "30s", 491 | "intervalFactor": 2, 492 | "refId": "A", 493 | "step": 60 494 | } 495 | ], 496 | "thresholds": "", 497 | "title": "Used Storage", 498 | "transparent": false, 499 | "type": "singlestat", 500 | "valueFontSize": "80%", 501 | "valueMaps": [ 502 | { 503 | "op": "=", 504 | "text": "N/A", 505 | "value": "null" 506 | } 507 | ], 508 | "valueName": "avg", 509 | "timeFrom": "10s", 510 | "hideTimeOverride": true 511 | } 512 | ], 513 | "title": "Overview" 514 | }, 515 | { 516 | "collapse": false, 517 | "editable": true, 518 | "height": "150px", 519 | "panels": [ 520 | { 521 | "aliasColors": {}, 522 | "bars": true, 523 | "datasource": "prometheus", 524 | "decimals": 0, 525 | "editable": true, 526 | "error": false, 527 | "fill": 1, 528 | "grid": { 529 | "threshold1": null, 530 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 531 | "threshold2": null, 532 | "threshold2Color": "rgba(234, 112, 112, 0.22)", 533 | "thresholdLine": false 534 | }, 535 | "id": 9, 536 | "isNew": true, 537 | "legend": { 538 | "avg": false, 539 | "current": false, 540 | "max": false, 541 | "min": false, 542 | "show": false, 543 | "total": false, 544 | "values": false 545 | }, 546 | "lines": false, 547 | "linewidth": 2, 548 | "links": [], 549 | "nullPointMode": "connected", 550 | "percentage": false, 551 | "pointradius": 5, 552 | "points": false, 553 | "renderer": "flot", 554 | "seriesOverrides": [], 555 | "span": 4, 556 | "stack": false, 557 | "steppedLine": false, 558 | "targets": [ 559 | { 560 | "expr": "scalar(count(container_memory_usage_bytes{image!=\"\"}) > 0)", 561 | "interval": "", 562 | "intervalFactor": 2, 563 | "legendFormat": "containers", 564 | "refId": "A", 565 | "step": 2 566 | } 567 | ], 568 | "timeFrom": null, 569 | "timeShift": null, 570 | "title": "Running Containers", 571 | "tooltip": { 572 | "msResolution": true, 573 | "shared": true, 574 | "sort": 0, 575 | "value_type": "cumulative" 576 | 
}, 577 | "type": "graph", 578 | "xaxis": { 579 | "show": true 580 | }, 581 | "yaxes": [ 582 | { 583 | "format": "none", 584 | "label": "", 585 | "logBase": 1, 586 | "max": null, 587 | "min": 0, 588 | "show": true 589 | }, 590 | { 591 | "format": "short", 592 | "label": null, 593 | "logBase": 1, 594 | "max": null, 595 | "min": null, 596 | "show": false 597 | } 598 | ] 599 | }, 600 | { 601 | "aliasColors": {}, 602 | "bars": true, 603 | "datasource": "prometheus", 604 | "decimals": 2, 605 | "editable": true, 606 | "error": false, 607 | "fill": 1, 608 | "grid": { 609 | "threshold1": null, 610 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 611 | "threshold2": null, 612 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 613 | }, 614 | "id": 10, 615 | "isNew": true, 616 | "legend": { 617 | "avg": false, 618 | "current": false, 619 | "max": false, 620 | "min": false, 621 | "show": false, 622 | "total": false, 623 | "values": false 624 | }, 625 | "lines": false, 626 | "linewidth": 2, 627 | "links": [], 628 | "nullPointMode": "connected", 629 | "percentage": false, 630 | "pointradius": 5, 631 | "points": false, 632 | "renderer": "flot", 633 | "seriesOverrides": [ 634 | { 635 | "alias": "load 1m", 636 | "color": "#BF1B00" 637 | } 638 | ], 639 | "span": 4, 640 | "stack": false, 641 | "steppedLine": false, 642 | "targets": [ 643 | { 644 | "expr": "node_load1", 645 | "interval": "", 646 | "intervalFactor": 2, 647 | "legendFormat": "load 1m", 648 | "metric": "node_load1", 649 | "refId": "A", 650 | "step": 2 651 | } 652 | ], 653 | "timeFrom": null, 654 | "timeShift": null, 655 | "title": "System Load", 656 | "tooltip": { 657 | "msResolution": true, 658 | "shared": true, 659 | "sort": 0, 660 | "value_type": "cumulative" 661 | }, 662 | "type": "graph", 663 | "xaxis": { 664 | "show": true 665 | }, 666 | "yaxes": [ 667 | { 668 | "format": "short", 669 | "label": null, 670 | "logBase": 1, 671 | "max": null, 672 | "min": 0, 673 | "show": true 674 | }, 675 | { 676 | "format": "short", 
677 | "label": null, 678 | "logBase": 1, 679 | "max": null, 680 | "min": null, 681 | "show": false 682 | } 683 | ] 684 | }, 685 | { 686 | "aliasColors": {}, 687 | "bars": false, 688 | "datasource": "prometheus", 689 | "editable": true, 690 | "error": false, 691 | "fill": 1, 692 | "grid": { 693 | "threshold1": null, 694 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 695 | "threshold2": null, 696 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 697 | }, 698 | "id": 15, 699 | "isNew": true, 700 | "legend": { 701 | "alignAsTable": true, 702 | "avg": true, 703 | "current": false, 704 | "max": true, 705 | "min": true, 706 | "rightSide": true, 707 | "show": false, 708 | "total": false, 709 | "values": true 710 | }, 711 | "lines": true, 712 | "linewidth": 2, 713 | "links": [], 714 | "nullPointMode": "connected", 715 | "percentage": false, 716 | "pointradius": 5, 717 | "points": false, 718 | "renderer": "flot", 719 | "seriesOverrides": [ 720 | { 721 | "alias": "read", 722 | "yaxis": 1 723 | }, 724 | { 725 | "alias": "written", 726 | "yaxis": 1 727 | }, 728 | { 729 | "alias": "io time", 730 | "yaxis": 2 731 | } 732 | ], 733 | "span": 4, 734 | "stack": false, 735 | "steppedLine": false, 736 | "targets": [ 737 | { 738 | "expr": "sum(irate(node_disk_read_bytes_total[5m]))", 739 | "interval": "2s", 740 | "intervalFactor": 4, 741 | "legendFormat": "read", 742 | "metric": "", 743 | "refId": "A", 744 | "step": 8 745 | }, 746 | { 747 | "expr": "sum(irate(node_disk_written_bytes_total[5m]))", 748 | "interval": "2s", 749 | "intervalFactor": 4, 750 | "legendFormat": "written", 751 | "metric": "", 752 | "refId": "B", 753 | "step": 8 754 | }, 755 | { 756 | "expr": "sum(irate(node_disk_io_time_seconds_total[5m]))", 757 | "interval": "2s", 758 | "intervalFactor": 4, 759 | "legendFormat": "io time", 760 | "metric": "", 761 | "refId": "C", 762 | "step": 8 763 | } 764 | ], 765 | "timeFrom": null, 766 | "timeShift": null, 767 | "title": "I/O Usage", 768 | "tooltip": { 769 | "msResolution": 
true, 770 | "shared": true, 771 | "sort": 0, 772 | "value_type": "cumulative" 773 | }, 774 | "type": "graph", 775 | "xaxis": { 776 | "show": true 777 | }, 778 | "yaxes": [ 779 | { 780 | "format": "bytes", 781 | "label": null, 782 | "logBase": 1, 783 | "max": null, 784 | "min": null, 785 | "show": true 786 | }, 787 | { 788 | "format": "ms", 789 | "label": null, 790 | "logBase": 1, 791 | "max": null, 792 | "min": null, 793 | "show": true 794 | } 795 | ] 796 | } 797 | ], 798 | "title": "Host stats" 799 | }, 800 | { 801 | "collapse": false, 802 | "editable": true, 803 | "height": "250px", 804 | "panels": [ 805 | { 806 | "aliasColors": {}, 807 | "bars": false, 808 | "datasource": "prometheus", 809 | "decimals": 2, 810 | "editable": true, 811 | "error": false, 812 | "fill": 1, 813 | "grid": { 814 | "threshold1": null, 815 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 816 | "threshold2": null, 817 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 818 | }, 819 | "id": 8, 820 | "isNew": true, 821 | "legend": { 822 | "alignAsTable": true, 823 | "avg": true, 824 | "current": false, 825 | "max": true, 826 | "min": true, 827 | "rightSide": true, 828 | "show": true, 829 | "total": false, 830 | "values": true 831 | }, 832 | "lines": true, 833 | "linewidth": 2, 834 | "links": [], 835 | "nullPointMode": "connected", 836 | "percentage": false, 837 | "pointradius": 5, 838 | "points": false, 839 | "renderer": "flot", 840 | "seriesOverrides": [], 841 | "span": 12, 842 | "stack": false, 843 | "steppedLine": false, 844 | "targets": [ 845 | { 846 | "expr": "sum by (name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_label_org_label_schema_group=\"\"}[1m])) / scalar(count(node_cpu_seconds_total{mode=\"user\"})) * 100", 847 | "intervalFactor": 10, 848 | "legendFormat": "{{ name }}", 849 | "metric": "container_cpu_user_seconds_total", 850 | "refId": "A", 851 | "step": 10 852 | } 853 | ], 854 | "timeFrom": null, 855 | "timeShift": null, 856 | "title": "Container CPU Usage", 
857 | "tooltip": { 858 | "msResolution": true, 859 | "shared": true, 860 | "sort": 2, 861 | "value_type": "cumulative" 862 | }, 863 | "type": "graph", 864 | "xaxis": { 865 | "show": true 866 | }, 867 | "yaxes": [ 868 | { 869 | "format": "percent", 870 | "label": null, 871 | "logBase": 1, 872 | "max": null, 873 | "min": 0, 874 | "show": true 875 | }, 876 | { 877 | "format": "short", 878 | "label": null, 879 | "logBase": 1, 880 | "max": null, 881 | "min": null, 882 | "show": false 883 | } 884 | ] 885 | } 886 | ], 887 | "title": "CPU" 888 | }, 889 | { 890 | "collapse": false, 891 | "editable": true, 892 | "height": "250px", 893 | "panels": [ 894 | { 895 | "aliasColors": {}, 896 | "bars": false, 897 | "datasource": "prometheus", 898 | "decimals": 2, 899 | "editable": true, 900 | "error": false, 901 | "fill": 1, 902 | "grid": { 903 | "threshold1": null, 904 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 905 | "threshold2": null, 906 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 907 | }, 908 | "id": 11, 909 | "isNew": true, 910 | "legend": { 911 | "alignAsTable": true, 912 | "avg": true, 913 | "current": false, 914 | "max": true, 915 | "min": true, 916 | "rightSide": true, 917 | "show": true, 918 | "total": false, 919 | "values": true 920 | }, 921 | "lines": true, 922 | "linewidth": 2, 923 | "links": [], 924 | "nullPointMode": "connected", 925 | "percentage": false, 926 | "pointradius": 5, 927 | "points": false, 928 | "renderer": "flot", 929 | "seriesOverrides": [], 930 | "span": 12, 931 | "stack": false, 932 | "steppedLine": false, 933 | "targets": [ 934 | { 935 | "expr": "sum by (name)(container_memory_usage_bytes{image!=\"\",container_label_org_label_schema_group=\"\"})", 936 | "intervalFactor": 1, 937 | "legendFormat": "{{ name }}", 938 | "metric": "container_memory_usage", 939 | "refId": "A", 940 | "step": 1 941 | } 942 | ], 943 | "timeFrom": null, 944 | "timeShift": null, 945 | "title": "Container Memory Usage", 946 | "tooltip": { 947 | "msResolution": true, 
948 | "shared": true, 949 | "sort": 0, 950 | "value_type": "cumulative" 951 | }, 952 | "type": "graph", 953 | "xaxis": { 954 | "show": true 955 | }, 956 | "yaxes": [ 957 | { 958 | "format": "bytes", 959 | "label": null, 960 | "logBase": 1, 961 | "max": null, 962 | "min": 0, 963 | "show": true 964 | }, 965 | { 966 | "format": "short", 967 | "label": null, 968 | "logBase": 1, 969 | "max": null, 970 | "min": null, 971 | "show": false 972 | } 973 | ] 974 | }, 975 | { 976 | "aliasColors": {}, 977 | "bars": false, 978 | "datasource": "prometheus", 979 | "decimals": 2, 980 | "editable": true, 981 | "error": false, 982 | "fill": 1, 983 | "grid": { 984 | "threshold1": null, 985 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 986 | "threshold2": null, 987 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 988 | }, 989 | "id": 12, 990 | "isNew": true, 991 | "legend": { 992 | "alignAsTable": true, 993 | "avg": true, 994 | "current": false, 995 | "max": true, 996 | "min": true, 997 | "rightSide": true, 998 | "show": true, 999 | "total": false, 1000 | "values": true 1001 | }, 1002 | "lines": true, 1003 | "linewidth": 2, 1004 | "links": [], 1005 | "nullPointMode": "connected", 1006 | "percentage": false, 1007 | "pointradius": 5, 1008 | "points": false, 1009 | "renderer": "flot", 1010 | "seriesOverrides": [], 1011 | "span": 12, 1012 | "stack": false, 1013 | "steppedLine": false, 1014 | "targets": [ 1015 | { 1016 | "expr": "sum by (name) (container_memory_cache{image!=\"\",container_label_org_label_schema_group=\"\"})", 1017 | "intervalFactor": 2, 1018 | "legendFormat": "{{name}}", 1019 | "metric": "container_memory_cache", 1020 | "refId": "A", 1021 | "step": 2 1022 | } 1023 | ], 1024 | "timeFrom": null, 1025 | "timeShift": null, 1026 | "title": "Container Cached Memory Usage", 1027 | "tooltip": { 1028 | "msResolution": true, 1029 | "shared": true, 1030 | "sort": 0, 1031 | "value_type": "cumulative" 1032 | }, 1033 | "type": "graph", 1034 | "xaxis": { 1035 | "show": true 1036 | }, 
1037 | "yaxes": [ 1038 | { 1039 | "format": "bytes", 1040 | "label": null, 1041 | "logBase": 1, 1042 | "max": null, 1043 | "min": 0, 1044 | "show": true 1045 | }, 1046 | { 1047 | "format": "short", 1048 | "label": null, 1049 | "logBase": 1, 1050 | "max": null, 1051 | "min": null, 1052 | "show": false 1053 | } 1054 | ] 1055 | } 1056 | ], 1057 | "title": "Memory" 1058 | }, 1059 | { 1060 | "collapse": false, 1061 | "editable": true, 1062 | "height": "250px", 1063 | "panels": [ 1064 | { 1065 | "aliasColors": {}, 1066 | "bars": false, 1067 | "datasource": "prometheus", 1068 | "decimals": 2, 1069 | "editable": true, 1070 | "error": false, 1071 | "fill": 1, 1072 | "grid": { 1073 | "threshold1": null, 1074 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 1075 | "threshold2": null, 1076 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 1077 | }, 1078 | "id": 13, 1079 | "isNew": true, 1080 | "legend": { 1081 | "alignAsTable": true, 1082 | "avg": true, 1083 | "current": false, 1084 | "max": true, 1085 | "min": true, 1086 | "rightSide": true, 1087 | "show": true, 1088 | "total": false, 1089 | "values": true 1090 | }, 1091 | "lines": true, 1092 | "linewidth": 2, 1093 | "links": [], 1094 | "nullPointMode": "connected", 1095 | "percentage": false, 1096 | "pointradius": 5, 1097 | "points": false, 1098 | "renderer": "flot", 1099 | "seriesOverrides": [], 1100 | "span": 12, 1101 | "stack": false, 1102 | "steppedLine": false, 1103 | "targets": [ 1104 | { 1105 | "expr": "sum by (name) (rate(container_network_receive_bytes_total{image!=\"\",container_label_org_label_schema_group=\"\"}[1m]))", 1106 | "intervalFactor": 10, 1107 | "legendFormat": "{{ name }}", 1108 | "metric": "container_network_receive_bytes_total", 1109 | "refId": "A", 1110 | "step": 10 1111 | } 1112 | ], 1113 | "timeFrom": null, 1114 | "timeShift": null, 1115 | "title": "Container Network Input", 1116 | "tooltip": { 1117 | "msResolution": true, 1118 | "shared": true, 1119 | "sort": 2, 1120 | "value_type": "cumulative" 
1121 | }, 1122 | "type": "graph", 1123 | "xaxis": { 1124 | "show": true 1125 | }, 1126 | "yaxes": [ 1127 | { 1128 | "format": "bytes", 1129 | "label": null, 1130 | "logBase": 1, 1131 | "max": null, 1132 | "min": 0, 1133 | "show": true 1134 | }, 1135 | { 1136 | "format": "short", 1137 | "label": null, 1138 | "logBase": 1, 1139 | "max": null, 1140 | "min": null, 1141 | "show": false 1142 | } 1143 | ] 1144 | }, 1145 | { 1146 | "aliasColors": {}, 1147 | "bars": false, 1148 | "datasource": "prometheus", 1149 | "decimals": 2, 1150 | "editable": true, 1151 | "error": false, 1152 | "fill": 1, 1153 | "grid": { 1154 | "threshold1": null, 1155 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 1156 | "threshold2": null, 1157 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 1158 | }, 1159 | "id": 14, 1160 | "isNew": true, 1161 | "legend": { 1162 | "alignAsTable": true, 1163 | "avg": true, 1164 | "current": false, 1165 | "max": true, 1166 | "min": true, 1167 | "rightSide": true, 1168 | "show": true, 1169 | "total": false, 1170 | "values": true 1171 | }, 1172 | "lines": true, 1173 | "linewidth": 2, 1174 | "links": [], 1175 | "nullPointMode": "connected", 1176 | "percentage": false, 1177 | "pointradius": 5, 1178 | "points": false, 1179 | "renderer": "flot", 1180 | "seriesOverrides": [], 1181 | "span": 12, 1182 | "stack": false, 1183 | "steppedLine": false, 1184 | "targets": [ 1185 | { 1186 | "expr": "sum by (name) (rate(container_network_transmit_bytes_total{image!=\"\",container_label_org_label_schema_group=\"\"}[1m]))", 1187 | "intervalFactor": 10, 1188 | "legendFormat": "{{ name }}", 1189 | "metric": "container_network_transmit_bytes_total", 1190 | "refId": "A", 1191 | "step": 10 1192 | } 1193 | ], 1194 | "timeFrom": null, 1195 | "timeShift": null, 1196 | "title": "Container Network Output", 1197 | "tooltip": { 1198 | "msResolution": true, 1199 | "shared": true, 1200 | "sort": 2, 1201 | "value_type": "cumulative" 1202 | }, 1203 | "type": "graph", 1204 | "xaxis": { 1205 | "show": 
true 1206 | }, 1207 | "yaxes": [ 1208 | { 1209 | "format": "bytes", 1210 | "label": null, 1211 | "logBase": 1, 1212 | "max": null, 1213 | "min": 0, 1214 | "show": true 1215 | }, 1216 | { 1217 | "format": "short", 1218 | "label": null, 1219 | "logBase": 1, 1220 | "max": null, 1221 | "min": null, 1222 | "show": false 1223 | } 1224 | ] 1225 | } 1226 | ], 1227 | "title": "Network" 1228 | } 1229 | ], 1230 | "time": { 1231 | "from": "now-15m", 1232 | "to": "now" 1233 | }, 1234 | "timepicker": { 1235 | "refresh_intervals": [ 1236 | "5s", 1237 | "10s", 1238 | "30s", 1239 | "1m", 1240 | "5m", 1241 | "15m", 1242 | "30m", 1243 | "1h", 1244 | "2h", 1245 | "1d" 1246 | ], 1247 | "time_options": [ 1248 | "5m", 1249 | "15m", 1250 | "1h", 1251 | "6h", 1252 | "12h", 1253 | "24h", 1254 | "2d", 1255 | "7d", 1256 | "30d" 1257 | ] 1258 | }, 1259 | "templating": { 1260 | "list": [] 1261 | }, 1262 | "annotations": { 1263 | "list": [] 1264 | }, 1265 | "refresh": "10s", 1266 | "schemaVersion": 12, 1267 | "version": 8, 1268 | "links": [], 1269 | "gnetId": null 1270 | } -------------------------------------------------------------------------------- /grafana/provisioning/dashboards/docker_host.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": null, 3 | "title": "Docker Host", 4 | "description": "Docker host metrics", 5 | "tags": [ 6 | "system" 7 | ], 8 | "style": "dark", 9 | "timezone": "browser", 10 | "editable": true, 11 | "hideControls": false, 12 | "sharedCrosshair": true, 13 | "rows": [ 14 | { 15 | "collapse": false, 16 | "editable": true, 17 | "height": "100px", 18 | "panels": [ 19 | { 20 | "cacheTimeout": null, 21 | "colorBackground": false, 22 | "colorValue": false, 23 | "colors": [ 24 | "rgba(245, 54, 54, 0.9)", 25 | "rgba(237, 129, 40, 0.89)", 26 | "rgba(50, 172, 45, 0.97)" 27 | ], 28 | "datasource": "prometheus", 29 | "decimals": 1, 30 | "editable": true, 31 | "error": false, 32 | "format": "s", 33 | "gauge": { 34 | "maxValue": 
100, 35 | "minValue": 0, 36 | "show": false, 37 | "thresholdLabels": false, 38 | "thresholdMarkers": true 39 | }, 40 | "id": 1, 41 | "interval": null, 42 | "isNew": true, 43 | "links": [], 44 | "mappingType": 1, 45 | "mappingTypes": [ 46 | { 47 | "name": "value to text", 48 | "value": 1 49 | }, 50 | { 51 | "name": "range to text", 52 | "value": 2 53 | } 54 | ], 55 | "maxDataPoints": 100, 56 | "nullPointMode": "connected", 57 | "nullText": null, 58 | "postfix": "s", 59 | "postfixFontSize": "80%", 60 | "prefix": "", 61 | "prefixFontSize": "50%", 62 | "rangeMaps": [ 63 | { 64 | "from": "null", 65 | "text": "N/A", 66 | "to": "null" 67 | } 68 | ], 69 | "span": 2, 70 | "sparkline": { 71 | "fillColor": "rgba(31, 118, 189, 0.18)", 72 | "full": false, 73 | "lineColor": "rgb(31, 120, 193)", 74 | "show": false 75 | }, 76 | "targets": [ 77 | { 78 | "expr": "node_time_seconds - node_boot_time_seconds", 79 | "interval": "30s", 80 | "intervalFactor": 1, 81 | "refId": "A", 82 | "step": 30 83 | } 84 | ], 85 | "thresholds": "", 86 | "title": "Uptime", 87 | "type": "singlestat", 88 | "valueFontSize": "80%", 89 | "valueMaps": [ 90 | { 91 | "op": "=", 92 | "text": "N/A", 93 | "value": "null" 94 | } 95 | ], 96 | "valueName": "avg", 97 | "timeFrom": "10s", 98 | "hideTimeOverride": true 99 | }, 100 | { 101 | "cacheTimeout": null, 102 | "colorBackground": false, 103 | "colorValue": false, 104 | "colors": [ 105 | "rgba(245, 54, 54, 0.9)", 106 | "rgba(237, 129, 40, 0.89)", 107 | "rgba(50, 172, 45, 0.97)" 108 | ], 109 | "datasource": "prometheus", 110 | "editable": true, 111 | "error": false, 112 | "format": "percent", 113 | "gauge": { 114 | "maxValue": 100, 115 | "minValue": 0, 116 | "show": false, 117 | "thresholdLabels": false, 118 | "thresholdMarkers": true 119 | }, 120 | "id": 13, 121 | "interval": null, 122 | "isNew": true, 123 | "links": [], 124 | "mappingType": 1, 125 | "mappingTypes": [ 126 | { 127 | "name": "value to text", 128 | "value": 1 129 | }, 130 | { 131 | "name": "range to 
text", 132 | "value": 2 133 | } 134 | ], 135 | "maxDataPoints": 100, 136 | "nullPointMode": "connected", 137 | "nullText": null, 138 | "postfix": "", 139 | "postfixFontSize": "50%", 140 | "prefix": "", 141 | "prefixFontSize": "50%", 142 | "rangeMaps": [ 143 | { 144 | "from": "null", 145 | "text": "N/A", 146 | "to": "null" 147 | } 148 | ], 149 | "span": 2, 150 | "sparkline": { 151 | "fillColor": "rgba(31, 118, 189, 0.18)", 152 | "full": false, 153 | "lineColor": "rgb(31, 120, 193)", 154 | "show": false 155 | }, 156 | "targets": [ 157 | { 158 | "expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[1m])) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))", 159 | "interval": "10s", 160 | "intervalFactor": 2, 161 | "legendFormat": "", 162 | "refId": "A", 163 | "step": 20 164 | } 165 | ], 166 | "thresholds": "", 167 | "title": "CPU Idle", 168 | "type": "singlestat", 169 | "valueFontSize": "80%", 170 | "valueMaps": [ 171 | { 172 | "op": "=", 173 | "text": "N/A", 174 | "value": "null" 175 | } 176 | ], 177 | "valueName": "avg", 178 | "timeFrom": "10s", 179 | "hideTimeOverride": true 180 | }, 181 | { 182 | "cacheTimeout": null, 183 | "colorBackground": false, 184 | "colorValue": false, 185 | "colors": [ 186 | "rgba(245, 54, 54, 0.9)", 187 | "rgba(237, 129, 40, 0.89)", 188 | "rgba(50, 172, 45, 0.97)" 189 | ], 190 | "datasource": "prometheus", 191 | "editable": true, 192 | "error": false, 193 | "format": "none", 194 | "gauge": { 195 | "maxValue": 100, 196 | "minValue": 0, 197 | "show": false, 198 | "thresholdLabels": false, 199 | "thresholdMarkers": true 200 | }, 201 | "id": 12, 202 | "interval": null, 203 | "isNew": true, 204 | "links": [], 205 | "mappingType": 1, 206 | "mappingTypes": [ 207 | { 208 | "name": "value to text", 209 | "value": 1 210 | }, 211 | { 212 | "name": "range to text", 213 | "value": 2 214 | } 215 | ], 216 | "maxDataPoints": 100, 217 | "nullPointMode": "connected", 218 | "nullText": null, 219 | "postfix": "", 220 | "postfixFontSize": "50%", 
221 | "prefix": "", 222 | "prefixFontSize": "50%", 223 | "rangeMaps": [ 224 | { 225 | "from": "null", 226 | "text": "N/A", 227 | "to": "null" 228 | } 229 | ], 230 | "span": 2, 231 | "sparkline": { 232 | "fillColor": "rgba(31, 118, 189, 0.18)", 233 | "full": false, 234 | "lineColor": "rgb(31, 120, 193)", 235 | "show": false 236 | }, 237 | "targets": [ 238 | { 239 | "expr": "machine_cpu_cores", 240 | "intervalFactor": 2, 241 | "metric": "machine_cpu_cores", 242 | "refId": "A", 243 | "step": 2 244 | } 245 | ], 246 | "thresholds": "", 247 | "title": "CPU Cores", 248 | "type": "singlestat", 249 | "valueFontSize": "80%", 250 | "valueMaps": [ 251 | { 252 | "op": "=", 253 | "text": "N/A", 254 | "value": "null" 255 | } 256 | ], 257 | "valueName": "avg", 258 | "timeFrom": "10s", 259 | "hideTimeOverride": true 260 | }, 261 | { 262 | "cacheTimeout": null, 263 | "colorBackground": false, 264 | "colorValue": false, 265 | "colors": [ 266 | "rgba(245, 54, 54, 0.9)", 267 | "rgba(237, 129, 40, 0.89)", 268 | "rgba(50, 172, 45, 0.97)" 269 | ], 270 | "datasource": "prometheus", 271 | "editable": true, 272 | "error": false, 273 | "format": "bytes", 274 | "gauge": { 275 | "maxValue": 100, 276 | "minValue": 0, 277 | "show": false, 278 | "thresholdLabels": false, 279 | "thresholdMarkers": true 280 | }, 281 | "id": 2, 282 | "interval": null, 283 | "isNew": true, 284 | "links": [], 285 | "mappingType": 1, 286 | "mappingTypes": [ 287 | { 288 | "name": "value to text", 289 | "value": 1 290 | }, 291 | { 292 | "name": "range to text", 293 | "value": 2 294 | } 295 | ], 296 | "maxDataPoints": 100, 297 | "nullPointMode": "connected", 298 | "nullText": null, 299 | "postfix": "", 300 | "postfixFontSize": "50%", 301 | "prefix": "", 302 | "prefixFontSize": "50%", 303 | "rangeMaps": [ 304 | { 305 | "from": "null", 306 | "text": "N/A", 307 | "to": "null" 308 | } 309 | ], 310 | "span": 2, 311 | "sparkline": { 312 | "fillColor": "rgba(31, 118, 189, 0.18)", 313 | "full": false, 314 | "lineColor": "rgb(31, 
120, 193)", 315 | "show": false 316 | }, 317 | "targets": [ 318 | { 319 | "expr": "node_memory_MemAvailable_bytes", 320 | "interval": "30s", 321 | "intervalFactor": 2, 322 | "legendFormat": "", 323 | "refId": "A", 324 | "step": 60 325 | } 326 | ], 327 | "thresholds": "", 328 | "title": "Available Memory", 329 | "type": "singlestat", 330 | "valueFontSize": "80%", 331 | "valueMaps": [ 332 | { 333 | "op": "=", 334 | "text": "N/A", 335 | "value": "null" 336 | } 337 | ], 338 | "valueName": "avg", 339 | "timeFrom": "10s", 340 | "hideTimeOverride": true 341 | }, 342 | { 343 | "cacheTimeout": null, 344 | "colorBackground": false, 345 | "colorValue": false, 346 | "colors": [ 347 | "rgba(245, 54, 54, 0.9)", 348 | "rgba(237, 129, 40, 0.89)", 349 | "rgba(50, 172, 45, 0.97)" 350 | ], 351 | "datasource": "prometheus", 352 | "editable": true, 353 | "error": false, 354 | "format": "bytes", 355 | "gauge": { 356 | "maxValue": 100, 357 | "minValue": 0, 358 | "show": false, 359 | "thresholdLabels": false, 360 | "thresholdMarkers": true 361 | }, 362 | "id": 3, 363 | "interval": null, 364 | "isNew": true, 365 | "links": [], 366 | "mappingType": 1, 367 | "mappingTypes": [ 368 | { 369 | "name": "value to text", 370 | "value": 1 371 | }, 372 | { 373 | "name": "range to text", 374 | "value": 2 375 | } 376 | ], 377 | "maxDataPoints": 100, 378 | "nullPointMode": "connected", 379 | "nullText": null, 380 | "postfix": "", 381 | "postfixFontSize": "50%", 382 | "prefix": "", 383 | "prefixFontSize": "50%", 384 | "rangeMaps": [ 385 | { 386 | "from": "null", 387 | "text": "N/A", 388 | "to": "null" 389 | } 390 | ], 391 | "span": 2, 392 | "sparkline": { 393 | "fillColor": "rgba(31, 118, 189, 0.18)", 394 | "full": false, 395 | "lineColor": "rgb(31, 120, 193)", 396 | "show": false 397 | }, 398 | "targets": [ 399 | { 400 | "expr": "node_memory_SwapFree_bytes", 401 | "interval": "30s", 402 | "intervalFactor": 2, 403 | "refId": "A", 404 | "step": 60 405 | } 406 | ], 407 | "thresholds": "", 408 | "title": 
"Free Swap", 409 | "type": "singlestat", 410 | "valueFontSize": "80%", 411 | "valueMaps": [ 412 | { 413 | "op": "=", 414 | "text": "N/A", 415 | "value": "null" 416 | } 417 | ], 418 | "valueName": "avg", 419 | "timeFrom": "10s", 420 | "hideTimeOverride": true 421 | }, 422 | { 423 | "cacheTimeout": null, 424 | "colorBackground": false, 425 | "colorValue": false, 426 | "colors": [ 427 | "rgba(245, 54, 54, 0.9)", 428 | "rgba(237, 129, 40, 0.89)", 429 | "rgba(50, 172, 45, 0.97)" 430 | ], 431 | "datasource": "prometheus", 432 | "editable": true, 433 | "error": false, 434 | "format": "bytes", 435 | "gauge": { 436 | "maxValue": 100, 437 | "minValue": 0, 438 | "show": false, 439 | "thresholdLabels": false, 440 | "thresholdMarkers": true 441 | }, 442 | "id": 4, 443 | "interval": null, 444 | "isNew": true, 445 | "links": [], 446 | "mappingType": 1, 447 | "mappingTypes": [ 448 | { 449 | "name": "value to text", 450 | "value": 1 451 | }, 452 | { 453 | "name": "range to text", 454 | "value": 2 455 | } 456 | ], 457 | "maxDataPoints": 100, 458 | "nullPointMode": "connected", 459 | "nullText": null, 460 | "postfix": "", 461 | "postfixFontSize": "50%", 462 | "prefix": "", 463 | "prefixFontSize": "50%", 464 | "rangeMaps": [ 465 | { 466 | "from": "null", 467 | "text": "N/A", 468 | "to": "null" 469 | } 470 | ], 471 | "span": 2, 472 | "sparkline": { 473 | "fillColor": "rgba(31, 118, 189, 0.18)", 474 | "full": false, 475 | "lineColor": "rgb(31, 120, 193)", 476 | "show": false 477 | }, 478 | "targets": [ 479 | { 480 | "expr": "sum(node_filesystem_free_bytes{fstype=\"aufs\"})", 481 | "interval": "30s", 482 | "intervalFactor": 1, 483 | "legendFormat": "", 484 | "refId": "A", 485 | "step": 30 486 | } 487 | ], 488 | "thresholds": "", 489 | "title": "Free Storage", 490 | "type": "singlestat", 491 | "valueFontSize": "80%", 492 | "valueMaps": [ 493 | { 494 | "op": "=", 495 | "text": "N/A", 496 | "value": "null" 497 | } 498 | ], 499 | "valueName": "avg", 500 | "timeFrom": "10s", 501 | 
"hideTimeOverride": true 502 | } 503 | ], 504 | "title": "Available resources" 505 | }, 506 | { 507 | "collapse": false, 508 | "editable": true, 509 | "height": "150px", 510 | "panels": [ 511 | { 512 | "aliasColors": {}, 513 | "bars": true, 514 | "datasource": "prometheus", 515 | "decimals": 2, 516 | "editable": true, 517 | "error": false, 518 | "fill": 1, 519 | "grid": { 520 | "threshold1": null, 521 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 522 | "threshold2": null, 523 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 524 | }, 525 | "id": 9, 526 | "isNew": true, 527 | "legend": { 528 | "avg": false, 529 | "current": false, 530 | "max": false, 531 | "min": false, 532 | "show": false, 533 | "total": false, 534 | "values": false 535 | }, 536 | "lines": false, 537 | "linewidth": 2, 538 | "links": [], 539 | "nullPointMode": "connected", 540 | "percentage": false, 541 | "pointradius": 5, 542 | "points": false, 543 | "renderer": "flot", 544 | "seriesOverrides": [ 545 | { 546 | "alias": "load 1m", 547 | "color": "#1F78C1" 548 | } 549 | ], 550 | "span": 4, 551 | "stack": false, 552 | "steppedLine": false, 553 | "targets": [ 554 | { 555 | "expr": "node_load1", 556 | "interval": "10s", 557 | "intervalFactor": 1, 558 | "legendFormat": "load 1m", 559 | "refId": "A", 560 | "step": 10 561 | } 562 | ], 563 | "timeFrom": null, 564 | "timeShift": null, 565 | "title": "Load Average 1m", 566 | "tooltip": { 567 | "msResolution": true, 568 | "shared": true, 569 | "sort": 0, 570 | "value_type": "cumulative" 571 | }, 572 | "type": "graph", 573 | "xaxis": { 574 | "show": true 575 | }, 576 | "yaxes": [ 577 | { 578 | "format": "short", 579 | "label": null, 580 | "logBase": 1, 581 | "max": null, 582 | "min": 0, 583 | "show": true 584 | }, 585 | { 586 | "format": "short", 587 | "label": null, 588 | "logBase": 1, 589 | "max": null, 590 | "min": null, 591 | "show": false 592 | } 593 | ] 594 | }, 595 | { 596 | "aliasColors": {}, 597 | "bars": true, 598 | "datasource": "prometheus", 599 | 
"editable": true, 600 | "error": false, 601 | "fill": 1, 602 | "grid": { 603 | "threshold1": null, 604 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 605 | "threshold2": null, 606 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 607 | }, 608 | "id": 10, 609 | "isNew": true, 610 | "legend": { 611 | "avg": false, 612 | "current": false, 613 | "max": false, 614 | "min": false, 615 | "show": false, 616 | "total": false, 617 | "values": false 618 | }, 619 | "lines": false, 620 | "linewidth": 2, 621 | "links": [], 622 | "nullPointMode": "connected", 623 | "percentage": false, 624 | "pointradius": 5, 625 | "points": false, 626 | "renderer": "flot", 627 | "seriesOverrides": [ 628 | { 629 | "alias": "blocked by I/O", 630 | "color": "#58140C" 631 | } 632 | ], 633 | "span": 4, 634 | "stack": true, 635 | "steppedLine": false, 636 | "targets": [ 637 | { 638 | "expr": "node_procs_running", 639 | "interval": "10s", 640 | "intervalFactor": 1, 641 | "legendFormat": "running", 642 | "metric": "node_procs_running", 643 | "refId": "A", 644 | "step": 10 645 | }, 646 | { 647 | "expr": "node_procs_blocked", 648 | "interval": "10s", 649 | "intervalFactor": 1, 650 | "legendFormat": "blocked by I/O", 651 | "metric": "node_procs_blocked", 652 | "refId": "B", 653 | "step": 10 654 | } 655 | ], 656 | "timeFrom": null, 657 | "timeShift": null, 658 | "title": "Processes", 659 | "tooltip": { 660 | "msResolution": true, 661 | "shared": true, 662 | "sort": 2, 663 | "value_type": "individual" 664 | }, 665 | "type": "graph", 666 | "xaxis": { 667 | "show": true 668 | }, 669 | "yaxes": [ 670 | { 671 | "format": "short", 672 | "label": null, 673 | "logBase": 1, 674 | "max": null, 675 | "min": 0, 676 | "show": true 677 | }, 678 | { 679 | "format": "short", 680 | "label": null, 681 | "logBase": 1, 682 | "max": null, 683 | "min": null, 684 | "show": false 685 | } 686 | ] 687 | }, 688 | { 689 | "aliasColors": {}, 690 | "bars": true, 691 | "datasource": "prometheus", 692 | "editable": true, 693 | "error": 
false, 694 | "fill": 1, 695 | "grid": { 696 | "threshold1": null, 697 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 698 | "threshold2": null, 699 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 700 | }, 701 | "id": 11, 702 | "isNew": true, 703 | "legend": { 704 | "avg": false, 705 | "current": false, 706 | "max": false, 707 | "min": false, 708 | "show": false, 709 | "total": false, 710 | "values": false 711 | }, 712 | "lines": false, 713 | "linewidth": 2, 714 | "links": [], 715 | "nullPointMode": "connected", 716 | "percentage": false, 717 | "pointradius": 5, 718 | "points": false, 719 | "renderer": "flot", 720 | "seriesOverrides": [ 721 | { 722 | "alias": "interrupts", 723 | "color": "#806EB7" 724 | } 725 | ], 726 | "span": 4, 727 | "stack": false, 728 | "steppedLine": false, 729 | "targets": [ 730 | { 731 | "expr": " irate(node_intr_total[5m])", 732 | "interval": "10s", 733 | "intervalFactor": 1, 734 | "legendFormat": "interrupts", 735 | "metric": "node_intr_total", 736 | "refId": "A", 737 | "step": 10 738 | } 739 | ], 740 | "timeFrom": null, 741 | "timeShift": null, 742 | "title": "Interrupts", 743 | "tooltip": { 744 | "msResolution": true, 745 | "shared": true, 746 | "sort": 0, 747 | "value_type": "cumulative" 748 | }, 749 | "type": "graph", 750 | "xaxis": { 751 | "show": true 752 | }, 753 | "yaxes": [ 754 | { 755 | "format": "short", 756 | "label": null, 757 | "logBase": 1, 758 | "max": null, 759 | "min": null, 760 | "show": true 761 | }, 762 | { 763 | "format": "short", 764 | "label": null, 765 | "logBase": 1, 766 | "max": null, 767 | "min": null, 768 | "show": false 769 | } 770 | ] 771 | } 772 | ], 773 | "title": "Load" 774 | }, 775 | { 776 | "collapse": false, 777 | "editable": true, 778 | "height": "250px", 779 | "panels": [ 780 | { 781 | "aliasColors": {}, 782 | "bars": false, 783 | "datasource": "prometheus", 784 | "decimals": 2, 785 | "editable": true, 786 | "error": false, 787 | "fill": 4, 788 | "grid": { 789 | "threshold1": null, 790 | 
"threshold1Color": "rgba(216, 200, 27, 0.27)", 791 | "threshold2": null, 792 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 793 | }, 794 | "id": 5, 795 | "isNew": true, 796 | "legend": { 797 | "alignAsTable": true, 798 | "avg": true, 799 | "current": false, 800 | "max": true, 801 | "min": true, 802 | "rightSide": true, 803 | "show": true, 804 | "total": false, 805 | "values": true 806 | }, 807 | "lines": true, 808 | "linewidth": 2, 809 | "links": [], 810 | "nullPointMode": "connected", 811 | "percentage": false, 812 | "pointradius": 5, 813 | "points": false, 814 | "renderer": "flot", 815 | "seriesOverrides": [], 816 | "span": 12, 817 | "stack": true, 818 | "steppedLine": false, 819 | "targets": [ 820 | { 821 | "expr": "sum(rate(node_cpu_seconds_total[1m])) by (mode) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))", 822 | "intervalFactor": 10, 823 | "legendFormat": "{{ mode }}", 824 | "metric": "node_cpu_seconds_total", 825 | "refId": "A", 826 | "step": 10 827 | } 828 | ], 829 | "timeFrom": null, 830 | "timeShift": null, 831 | "title": "CPU Usage", 832 | "tooltip": { 833 | "msResolution": true, 834 | "shared": true, 835 | "sort": 2, 836 | "value_type": "individual" 837 | }, 838 | "type": "graph", 839 | "xaxis": { 840 | "show": true 841 | }, 842 | "yaxes": [ 843 | { 844 | "format": "percent", 845 | "label": null, 846 | "logBase": 1, 847 | "max": 100, 848 | "min": 0, 849 | "show": true 850 | }, 851 | { 852 | "format": "short", 853 | "label": null, 854 | "logBase": 1, 855 | "max": null, 856 | "min": 0, 857 | "show": true 858 | } 859 | ] 860 | } 861 | ], 862 | "title": "CPU" 863 | }, 864 | { 865 | "collapse": false, 866 | "editable": true, 867 | "height": "250px", 868 | "panels": [ 869 | { 870 | "aliasColors": {}, 871 | "bars": false, 872 | "datasource": "prometheus", 873 | "decimals": 2, 874 | "editable": true, 875 | "error": false, 876 | "fill": 4, 877 | "grid": { 878 | "threshold1": null, 879 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 880 | 
"threshold2": null, 881 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 882 | }, 883 | "id": 6, 884 | "isNew": true, 885 | "legend": { 886 | "alignAsTable": true, 887 | "avg": true, 888 | "current": false, 889 | "max": true, 890 | "min": true, 891 | "rightSide": true, 892 | "show": true, 893 | "total": false, 894 | "values": true 895 | }, 896 | "lines": true, 897 | "linewidth": 2, 898 | "links": [], 899 | "nullPointMode": "null", 900 | "percentage": false, 901 | "pointradius": 5, 902 | "points": false, 903 | "renderer": "flot", 904 | "seriesOverrides": [ 905 | { 906 | "alias": "Used", 907 | "color": "#BF1B00" 908 | }, 909 | { 910 | "alias": "Free", 911 | "color": "#7EB26D" 912 | }, 913 | { 914 | "alias": "Buffers", 915 | "color": "#6ED0E0" 916 | }, 917 | { 918 | "alias": "Cached", 919 | "color": "#EF843C" 920 | } 921 | ], 922 | "span": 12, 923 | "stack": true, 924 | "steppedLine": false, 925 | "targets": [ 926 | { 927 | "expr": "node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)", 928 | "intervalFactor": 1, 929 | "legendFormat": "Used", 930 | "refId": "A", 931 | "step": 1 932 | }, 933 | { 934 | "expr": "node_memory_MemFree_bytes", 935 | "intervalFactor": 1, 936 | "legendFormat": "Free", 937 | "refId": "B", 938 | "step": 1 939 | }, 940 | { 941 | "expr": "node_memory_Buffers_bytes", 942 | "intervalFactor": 1, 943 | "legendFormat": "Buffers", 944 | "refId": "C", 945 | "step": 1 946 | }, 947 | { 948 | "expr": "node_memory_Cached_bytes", 949 | "intervalFactor": 1, 950 | "legendFormat": "Cached", 951 | "refId": "D", 952 | "step": 1 953 | } 954 | ], 955 | "timeFrom": null, 956 | "timeShift": null, 957 | "title": "Memory Usage", 958 | "tooltip": { 959 | "msResolution": true, 960 | "shared": true, 961 | "sort": 2, 962 | "value_type": "individual" 963 | }, 964 | "type": "graph", 965 | "xaxis": { 966 | "show": true 967 | }, 968 | "yaxes": [ 969 | { 970 | "format": "bytes", 971 | "label": null, 972 | "logBase": 1, 
973 | "max": null, 974 | "min": null, 975 | "show": true 976 | }, 977 | { 978 | "format": "short", 979 | "label": null, 980 | "logBase": 1, 981 | "max": null, 982 | "min": null, 983 | "show": true 984 | } 985 | ] 986 | } 987 | ], 988 | "title": "Memory" 989 | }, 990 | { 991 | "collapse": false, 992 | "editable": true, 993 | "height": "250px", 994 | "panels": [ 995 | { 996 | "aliasColors": {}, 997 | "bars": false, 998 | "datasource": "prometheus", 999 | "decimals": 2, 1000 | "editable": true, 1001 | "error": false, 1002 | "fill": 1, 1003 | "grid": { 1004 | "threshold1": null, 1005 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 1006 | "threshold2": null, 1007 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 1008 | }, 1009 | "id": 7, 1010 | "isNew": true, 1011 | "legend": { 1012 | "alignAsTable": true, 1013 | "avg": true, 1014 | "current": false, 1015 | "max": true, 1016 | "min": true, 1017 | "rightSide": true, 1018 | "show": true, 1019 | "total": false, 1020 | "values": true 1021 | }, 1022 | "lines": true, 1023 | "linewidth": 2, 1024 | "links": [], 1025 | "nullPointMode": "connected", 1026 | "percentage": false, 1027 | "pointradius": 5, 1028 | "points": false, 1029 | "renderer": "flot", 1030 | "seriesOverrides": [ 1031 | { 1032 | "alias": "read", 1033 | "yaxis": 1 1034 | }, 1035 | { 1036 | "alias": "written", 1037 | "yaxis": 1 1038 | }, 1039 | { 1040 | "alias": "io time", 1041 | "yaxis": 2 1042 | } 1043 | ], 1044 | "span": 12, 1045 | "stack": false, 1046 | "steppedLine": false, 1047 | "targets": [ 1048 | { 1049 | "expr": "sum(irate(node_disk_read_bytes_total[1m]))", 1050 | "interval": "", 1051 | "intervalFactor": 1, 1052 | "legendFormat": "read", 1053 | "metric": "node_disk_read_bytes_total", 1054 | "refId": "A", 1055 | "step": 1 1056 | }, 1057 | { 1058 | "expr": "sum(irate(node_disk_written_bytes_total[1m]))", 1059 | "intervalFactor": 1, 1060 | "legendFormat": "written", 1061 | "metric": "node_disk_written_bytes_total", 1062 | "refId": "B", 1063 | "step": 1 1064 | 
}, 1065 | { 1066 | "expr": "sum(irate(node_disk_io_time_seconds_total[1m]))", 1067 | "intervalFactor": 1, 1068 | "legendFormat": "io time", 1069 | "metric": "node_disk_io_time_seconds_total", 1070 | "refId": "C", 1071 | "step": 1 1072 | } 1073 | ], 1074 | "timeFrom": null, 1075 | "timeShift": null, 1076 | "title": "I/O Usage", 1077 | "tooltip": { 1078 | "msResolution": true, 1079 | "shared": true, 1080 | "sort": 0, 1081 | "value_type": "cumulative" 1082 | }, 1083 | "type": "graph", 1084 | "xaxis": { 1085 | "show": true 1086 | }, 1087 | "yaxes": [ 1088 | { 1089 | "format": "Bps", 1090 | "label": null, 1091 | "logBase": 1, 1092 | "max": null, 1093 | "min": 0, 1094 | "show": true 1095 | }, 1096 | { 1097 | "format": "ms", 1098 | "label": null, 1099 | "logBase": 1, 1100 | "max": null, 1101 | "min": null, 1102 | "show": true 1103 | } 1104 | ] 1105 | } 1106 | ], 1107 | "title": "I/O" 1108 | }, 1109 | { 1110 | "collapse": false, 1111 | "editable": true, 1112 | "height": "250px", 1113 | "panels": [ 1114 | { 1115 | "aliasColors": {}, 1116 | "bars": false, 1117 | "datasource": "prometheus", 1118 | "decimals": 2, 1119 | "editable": true, 1120 | "error": false, 1121 | "fill": 4, 1122 | "grid": { 1123 | "threshold1": null, 1124 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 1125 | "threshold2": null, 1126 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 1127 | }, 1128 | "id": 8, 1129 | "isNew": true, 1130 | "legend": { 1131 | "alignAsTable": true, 1132 | "avg": true, 1133 | "current": false, 1134 | "max": true, 1135 | "min": true, 1136 | "rightSide": true, 1137 | "show": true, 1138 | "total": false, 1139 | "values": true 1140 | }, 1141 | "lines": true, 1142 | "linewidth": 2, 1143 | "links": [], 1144 | "nullPointMode": "connected", 1145 | "percentage": false, 1146 | "pointradius": 5, 1147 | "points": false, 1148 | "renderer": "flot", 1149 | "seriesOverrides": [], 1150 | "span": 12, 1151 | "stack": true, 1152 | "steppedLine": false, 1153 | "targets": [ 1154 | { 1155 | "expr": 
"irate(node_network_receive_bytes_total{device!=\"lo\"}[1m])", 1156 | "intervalFactor": 1, 1157 | "legendFormat": "In: {{ device }}", 1158 | "metric": "node_network_receive_bytes_total", 1159 | "refId": "A", 1160 | "step": 1 1161 | }, 1162 | { 1163 | "expr": "irate(node_network_transmit_bytes_total{device!=\"lo\"}[1m])", 1164 | "intervalFactor": 1, 1165 | "legendFormat": "Out: {{ device }}", 1166 | "metric": "node_network_transmit_bytes_total", 1167 | "refId": "B", 1168 | "step": 1 1169 | } 1170 | ], 1171 | "timeFrom": null, 1172 | "timeShift": null, 1173 | "title": "Network Usage", 1174 | "tooltip": { 1175 | "msResolution": true, 1176 | "shared": true, 1177 | "sort": 2, 1178 | "value_type": "individual" 1179 | }, 1180 | "type": "graph", 1181 | "xaxis": { 1182 | "show": true 1183 | }, 1184 | "yaxes": [ 1185 | { 1186 | "format": "Bps", 1187 | "label": null, 1188 | "logBase": 1, 1189 | "max": null, 1190 | "min": 0, 1191 | "show": true 1192 | }, 1193 | { 1194 | "format": "short", 1195 | "label": null, 1196 | "logBase": 1, 1197 | "max": null, 1198 | "min": null, 1199 | "show": false 1200 | } 1201 | ] 1202 | } 1203 | ], 1204 | "title": "Network" 1205 | }, 1206 | { 1207 | "collapse": false, 1208 | "editable": true, 1209 | "height": "250px", 1210 | "panels": [ 1211 | { 1212 | "aliasColors": {}, 1213 | "bars": false, 1214 | "datasource": "prometheus", 1215 | "decimals": 2, 1216 | "editable": true, 1217 | "error": false, 1218 | "fill": 4, 1219 | "grid": { 1220 | "threshold1": null, 1221 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 1222 | "threshold2": null, 1223 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 1224 | }, 1225 | "id": 14, 1226 | "isNew": true, 1227 | "legend": { 1228 | "alignAsTable": true, 1229 | "avg": true, 1230 | "current": false, 1231 | "max": true, 1232 | "min": true, 1233 | "rightSide": false, 1234 | "show": true, 1235 | "total": false, 1236 | "values": true 1237 | }, 1238 | "lines": true, 1239 | "linewidth": 2, 1240 | "links": [], 1241 | 
"nullPointMode": "connected", 1242 | "percentage": false, 1243 | "pointradius": 5, 1244 | "points": false, 1245 | "renderer": "flot", 1246 | "seriesOverrides": [ 1247 | { 1248 | "alias": "Used", 1249 | "color": "#890F02" 1250 | }, 1251 | { 1252 | "alias": "Free", 1253 | "color": "#7EB26D" 1254 | } 1255 | ], 1256 | "span": 6, 1257 | "stack": true, 1258 | "steppedLine": false, 1259 | "targets": [ 1260 | { 1261 | "expr": "node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes", 1262 | "interval": "10s", 1263 | "intervalFactor": 1, 1264 | "legendFormat": "Used", 1265 | "refId": "A", 1266 | "step": 10 1267 | }, 1268 | { 1269 | "expr": "node_memory_SwapFree_bytes", 1270 | "interval": "10s", 1271 | "intervalFactor": 1, 1272 | "legendFormat": "Free", 1273 | "refId": "B", 1274 | "step": 10 1275 | } 1276 | ], 1277 | "timeFrom": null, 1278 | "timeShift": null, 1279 | "title": "Swap Usage", 1280 | "tooltip": { 1281 | "msResolution": true, 1282 | "shared": true, 1283 | "sort": 2, 1284 | "value_type": "individual" 1285 | }, 1286 | "type": "graph", 1287 | "xaxis": { 1288 | "show": true 1289 | }, 1290 | "yaxes": [ 1291 | { 1292 | "format": "bytes", 1293 | "label": null, 1294 | "logBase": 1, 1295 | "max": null, 1296 | "min": 0, 1297 | "show": true 1298 | }, 1299 | { 1300 | "format": "short", 1301 | "label": null, 1302 | "logBase": 1, 1303 | "max": null, 1304 | "min": null, 1305 | "show": false 1306 | } 1307 | ] 1308 | }, 1309 | { 1310 | "aliasColors": {}, 1311 | "bars": false, 1312 | "datasource": "prometheus", 1313 | "decimals": 2, 1314 | "editable": true, 1315 | "error": false, 1316 | "fill": 1, 1317 | "grid": { 1318 | "threshold1": null, 1319 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 1320 | "threshold2": null, 1321 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 1322 | }, 1323 | "id": 15, 1324 | "isNew": true, 1325 | "legend": { 1326 | "alignAsTable": true, 1327 | "avg": true, 1328 | "current": false, 1329 | "max": true, 1330 | "min": true, 1331 | "show": true, 1332 | 
"total": false, 1333 | "values": true 1334 | }, 1335 | "lines": true, 1336 | "linewidth": 2, 1337 | "links": [], 1338 | "nullPointMode": "connected", 1339 | "percentage": false, 1340 | "pointradius": 5, 1341 | "points": false, 1342 | "renderer": "flot", 1343 | "seriesOverrides": [], 1344 | "span": 6, 1345 | "stack": false, 1346 | "steppedLine": false, 1347 | "targets": [ 1348 | { 1349 | "expr": "rate(node_vmstat_pswpin[1m]) * 4096 or irate(node_vmstat_pswpin[5m]) * 4096", 1350 | "interval": "10s", 1351 | "intervalFactor": 1, 1352 | "legendFormat": "In", 1353 | "refId": "A", 1354 | "step": 10 1355 | }, 1356 | { 1357 | "expr": "rate(node_vmstat_pswpout[1m]) * 4096 or irate(node_vmstat_pswpout[5m]) * 4096", 1358 | "interval": "10s", 1359 | "intervalFactor": 1, 1360 | "legendFormat": "Out", 1361 | "refId": "B", 1362 | "step": 10 1363 | } 1364 | ], 1365 | "timeFrom": null, 1366 | "timeShift": null, 1367 | "title": "Swap I/O", 1368 | "tooltip": { 1369 | "msResolution": true, 1370 | "shared": true, 1371 | "sort": 0, 1372 | "value_type": "cumulative" 1373 | }, 1374 | "type": "graph", 1375 | "xaxis": { 1376 | "show": true 1377 | }, 1378 | "yaxes": [ 1379 | { 1380 | "format": "Bps", 1381 | "label": null, 1382 | "logBase": 1, 1383 | "max": null, 1384 | "min": 0, 1385 | "show": true 1386 | }, 1387 | { 1388 | "format": "short", 1389 | "label": null, 1390 | "logBase": 1, 1391 | "max": null, 1392 | "min": null, 1393 | "show": false 1394 | } 1395 | ] 1396 | } 1397 | ], 1398 | "title": "Swap" 1399 | } 1400 | ], 1401 | "time": { 1402 | "from": "now-15m", 1403 | "to": "now" 1404 | }, 1405 | "timepicker": { 1406 | "refresh_intervals": [ 1407 | "5s", 1408 | "10s", 1409 | "30s", 1410 | "1m", 1411 | "5m", 1412 | "15m", 1413 | "30m", 1414 | "1h", 1415 | "2h", 1416 | "1d" 1417 | ], 1418 | "time_options": [ 1419 | "5m", 1420 | "15m", 1421 | "1h", 1422 | "6h", 1423 | "12h", 1424 | "24h", 1425 | "2d", 1426 | "7d", 1427 | "30d" 1428 | ] 1429 | }, 1430 | "templating": { 1431 | "list": [] 
1432 | }, 1433 | "annotations": { 1434 | "list": [] 1435 | }, 1436 | "refresh": "10s", 1437 | "schemaVersion": 12, 1438 | "version": 2, 1439 | "links": [], 1440 | "gnetId": null 1441 | } -------------------------------------------------------------------------------- /grafana/provisioning/dashboards/indexing.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 4, 19 | "iteration": 1590595997503, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "columns": [], 24 | "datasource": "postgres", 25 | "fieldConfig": { 26 | "defaults": { 27 | "custom": {} 28 | }, 29 | "overrides": [] 30 | }, 31 | "fontSize": "90%", 32 | "gridPos": { 33 | "h": 10, 34 | "w": 24, 35 | "x": 0, 36 | "y": 0 37 | }, 38 | "id": 10, 39 | "links": [], 40 | "pageSize": null, 41 | "scroll": true, 42 | "showHeader": true, 43 | "sort": { 44 | "col": 6, 45 | "desc": true 46 | }, 47 | "styles": [ 48 | { 49 | "alias": "blocks behind", 50 | "align": "auto", 51 | "colorMode": null, 52 | "colors": ["rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)"], 53 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 54 | "decimals": 0, 55 | "mappingType": 1, 56 | "pattern": "blocks_behind_network", 57 | "thresholds": [""], 58 | "type": "number", 59 | "unit": "none" 60 | }, 61 | { 62 | "alias": "", 63 | "align": "auto", 64 | "colorMode": null, 65 | "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 66 | "decimals": 1, 67 | "pattern": "lag", 68 | "thresholds": [], 69 | "type": "number", 70 | "unit": "m" 71 | }, 72 | { 73 | "alias": "syn", 74 | "align": "auto", 75 | "colorMode": null, 76 | "colors": 
["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 77 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 78 | "decimals": 2, 79 | "mappingType": 1, 80 | "pattern": "synced", 81 | "thresholds": [], 82 | "type": "string", 83 | "unit": "short", 84 | "valueMaps": [ 85 | { 86 | "text": "✓", 87 | "value": "true" 88 | }, 89 | { 90 | "text": "—", 91 | "value": "false" 92 | } 93 | ] 94 | } 95 | ], 96 | "targets": [ 97 | { 98 | "format": "table", 99 | "group": [], 100 | "metricColumn": "none", 101 | "rawQuery": true, 102 | "rawSql": "-- grafana ignore\nselect\n g.name || (case\n when g.pending_version = v.id then ' (P)'\n when g.current_version = v.id then ' (C)'\n else ' (U)' end) as subgraph_name,\n d.id as deployment,\n s.id as schema,\n replace(a.node_id, 'index_node_','') as nodeId,\n d.synced::text,\n n.name as network,\n (n.head_block_number - d.latest_ethereum_block_number) as blocks_behind_network,\n (case when n.name = 'mainnet' then ((n.head_block_number - d.latest_ethereum_block_number)/4)::text\n when n.name = 'rinkeby' then ((n.head_block_number - d.latest_ethereum_block_number)/4)::text\n when n.name = 'kovan' then ((n.head_block_number - d.latest_ethereum_block_number)/15)::text\n when n.name = 'poa-core' then ((n.head_block_number - d.latest_ethereum_block_number)/12)::text\n else 'ø' end) as lag\nfrom subgraphs.subgraph_deployment as d,\n subgraphs.subgraph_deployment_assignment as a,\n subgraphs.subgraph_version as v,\n subgraphs.subgraph as g,\n subgraphs.ethereum_contract_data_source as ds,\n subgraphs.subgraph_manifest m,\n ethereum_networks as n,\n deployment_schemas as s\nwhere g.id = v.subgraph\n and v.id in (g.pending_version, g.current_version)\n and a.id = d.id\n and m.id = d.manifest \n and ds.id = m.data_sources[1]\n and s.subgraph = d.id\n and v.deployment = d.id\n and not d.failed\n and n.name = ds.network\norder by blocks_behind_network desc, subgraph_name;", 103 | "refId": "A", 104 | "select": [ 105 | [ 106 | { 107 | 
"params": ["value"], 108 | "type": "column" 109 | } 110 | ] 111 | ], 112 | "timeColumn": "time", 113 | "where": [ 114 | { 115 | "name": "$__timeFilter", 116 | "params": [], 117 | "type": "macro" 118 | } 119 | ] 120 | } 121 | ], 122 | "timeFrom": null, 123 | "timeShift": null, 124 | "title": "Subgraph Block vs. Network Block", 125 | "transform": "table", 126 | "type": "table-old" 127 | }, 128 | { 129 | "columns": [], 130 | "datasource": "postgres", 131 | "fieldConfig": { 132 | "defaults": { 133 | "custom": {} 134 | }, 135 | "overrides": [] 136 | }, 137 | "fontSize": "100%", 138 | "gridPos": { 139 | "h": 8, 140 | "w": 24, 141 | "x": 0, 142 | "y": 10 143 | }, 144 | "id": 33, 145 | "pageSize": null, 146 | "showHeader": true, 147 | "sort": { 148 | "col": 0, 149 | "desc": true 150 | }, 151 | "styles": [ 152 | { 153 | "alias": "", 154 | "align": "auto", 155 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 156 | "decimals": null, 157 | "pattern": "/vid|block_number/", 158 | "type": "number", 159 | "unit": "none" 160 | }, 161 | { 162 | "alias": "", 163 | "align": "right", 164 | "colorMode": null, 165 | "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 166 | "decimals": 2, 167 | "pattern": "/.*/", 168 | "thresholds": [], 169 | "type": "number", 170 | "unit": "none" 171 | } 172 | ], 173 | "targets": [ 174 | { 175 | "format": "table", 176 | "group": [], 177 | "metricColumn": "none", 178 | "rawQuery": true, 179 | "rawSql": "SELECT\n vid, block_number, subgraph_id, message, handler\nFROM subgraphs.subgraph_error\nORDER BY vid DESC", 180 | "refId": "A", 181 | "select": [ 182 | [ 183 | { 184 | "params": ["value"], 185 | "type": "column" 186 | } 187 | ] 188 | ], 189 | "table": "subgraphs.subgraph_error", 190 | "timeColumn": "time", 191 | "where": [] 192 | } 193 | ], 194 | "timeFrom": null, 195 | "timeShift": null, 196 | "title": "Fatal errors", 197 | "transform": "table", 198 | "type": "table-old" 199 | }, 200 | { 201 | "columns": [], 202 | 
"datasource": "postgres", 203 | "fieldConfig": { 204 | "defaults": { 205 | "custom": {} 206 | }, 207 | "overrides": [] 208 | }, 209 | "fontSize": "100%", 210 | "gridPos": { 211 | "h": 7, 212 | "w": 24, 213 | "x": 0, 214 | "y": 18 215 | }, 216 | "id": 6, 217 | "links": [], 218 | "pageSize": null, 219 | "scroll": true, 220 | "showHeader": true, 221 | "sort": { 222 | "col": 1, 223 | "desc": true 224 | }, 225 | "styles": [ 226 | { 227 | "alias": "", 228 | "align": "auto", 229 | "colorMode": null, 230 | "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 231 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 232 | "decimals": 0, 233 | "mappingType": 1, 234 | "pattern": "num_assigned", 235 | "thresholds": [], 236 | "type": "number", 237 | "unit": "short" 238 | } 239 | ], 240 | "targets": [ 241 | { 242 | "format": "table", 243 | "group": [], 244 | "metricColumn": "none", 245 | "rawQuery": true, 246 | "rawSql": "select replace(node_id, 'index_node_', '') as nodeId,\n count(*) as num_assigned,\n array_agg(id) as deployments\n from subgraphs.subgraph_deployment_assignment\n group by node_id\n order by node_id asc;\n", 247 | "refId": "A", 248 | "select": [ 249 | [ 250 | { 251 | "params": ["value"], 252 | "type": "column" 253 | } 254 | ] 255 | ], 256 | "timeColumn": "time", 257 | "where": [ 258 | { 259 | "name": "$__timeFilter", 260 | "params": [], 261 | "type": "macro" 262 | } 263 | ] 264 | } 265 | ], 266 | "timeFrom": null, 267 | "timeShift": null, 268 | "title": "Subgraph Node Assignments", 269 | "transform": "table", 270 | "type": "table-old" 271 | }, 272 | { 273 | "cacheTimeout": null, 274 | "datasource": "postgres", 275 | "fieldConfig": { 276 | "defaults": { 277 | "custom": { 278 | "align": null 279 | }, 280 | "mappings": [], 281 | "nullValueMode": "connected", 282 | "thresholds": { 283 | "mode": "absolute", 284 | "steps": [ 285 | { 286 | "color": "green", 287 | "value": null 288 | }, 289 | { 290 | "color": "red", 291 | "value": 80 292 | } 
293 | ] 294 | }, 295 | "unit": "none" 296 | }, 297 | "overrides": [ 298 | { 299 | "matcher": { 300 | "id": "byName", 301 | "options": "name" 302 | }, 303 | "properties": [ 304 | { 305 | "id": "displayName", 306 | "value": "Network name" 307 | } 308 | ] 309 | }, 310 | { 311 | "matcher": { 312 | "id": "byName", 313 | "options": "block_number" 314 | }, 315 | "properties": [ 316 | { 317 | "id": "displayName", 318 | "value": "Block number" 319 | } 320 | ] 321 | }, 322 | { 323 | "matcher": { 324 | "id": "byName", 325 | "options": "block_hash" 326 | }, 327 | "properties": [ 328 | { 329 | "id": "displayName", 330 | "value": "Block Hash" 331 | } 332 | ] 333 | }, 334 | { 335 | "matcher": { 336 | "id": "byName", 337 | "options": "Block number" 338 | }, 339 | "properties": [ 340 | { 341 | "id": "custom.width", 342 | "value": 101 343 | } 344 | ] 345 | }, 346 | { 347 | "matcher": { 348 | "id": "byName", 349 | "options": "Network name" 350 | }, 351 | "properties": [ 352 | { 353 | "id": "custom.width", 354 | "value": 116 355 | } 356 | ] 357 | }, 358 | { 359 | "matcher": { 360 | "id": "byName", 361 | "options": "Block Hash" 362 | }, 363 | "properties": [ 364 | { 365 | "id": "custom.width", 366 | "value": 396 367 | } 368 | ] 369 | } 370 | ] 371 | }, 372 | "gridPos": { 373 | "h": 8, 374 | "w": 12, 375 | "x": 0, 376 | "y": 25 377 | }, 378 | "id": 2, 379 | "interval": "10s", 380 | "links": [], 381 | "maxDataPoints": 100, 382 | "options": { 383 | "showHeader": true, 384 | "sortBy": [] 385 | }, 386 | "pluginVersion": "7.0.1", 387 | "targets": [ 388 | { 389 | "format": "table", 390 | "group": [], 391 | "metricColumn": "none", 392 | "rawQuery": true, 393 | "rawSql": "SELECT\n name,\n head_block_number AS \"block_number\",\n head_block_hash AS \"block_hash\"\nFROM ethereum_networks\nORDER BY name;", 394 | "refId": "A", 395 | "select": [ 396 | [ 397 | { 398 | "params": ["value"], 399 | "type": "column" 400 | } 401 | ] 402 | ], 403 | "table": "ethereum_networks", 404 | "timeColumn": "time", 
405 | "where": [] 406 | } 407 | ], 408 | "timeFrom": null, 409 | "timeShift": null, 410 | "title": "Network Head Blocks", 411 | "type": "table" 412 | }, 413 | { 414 | "alert": { 415 | "alertRuleTags": {}, 416 | "conditions": [ 417 | { 418 | "evaluator": { 419 | "params": [0], 420 | "type": "gt" 421 | }, 422 | "operator": { 423 | "type": "and" 424 | }, 425 | "query": { 426 | "params": ["A", "3m", "now"] 427 | }, 428 | "reducer": { 429 | "params": [], 430 | "type": "avg" 431 | }, 432 | "type": "query" 433 | } 434 | ], 435 | "executionErrorState": "alerting", 436 | "for": "5m", 437 | "frequency": "1m", 438 | "handler": 1, 439 | "name": "Deployed subgraphs that have not started alert", 440 | "noDataState": "no_data", 441 | "notifications": [ 442 | { 443 | "uid": "okc8ZyRZz" 444 | } 445 | ] 446 | }, 447 | "aliasColors": {}, 448 | "bars": false, 449 | "dashLength": 10, 450 | "dashes": false, 451 | "datasource": "postgres", 452 | "fieldConfig": { 453 | "defaults": { 454 | "custom": {} 455 | }, 456 | "overrides": [] 457 | }, 458 | "fill": 1, 459 | "fillGradient": 0, 460 | "gridPos": { 461 | "h": 8, 462 | "w": 12, 463 | "x": 12, 464 | "y": 25 465 | }, 466 | "hiddenSeries": false, 467 | "id": 29, 468 | "legend": { 469 | "avg": false, 470 | "current": false, 471 | "max": false, 472 | "min": false, 473 | "show": true, 474 | "total": false, 475 | "values": false 476 | }, 477 | "lines": true, 478 | "linewidth": 1, 479 | "nullPointMode": "null", 480 | "options": { 481 | "dataLinks": [] 482 | }, 483 | "percentage": false, 484 | "pointradius": 2, 485 | "points": false, 486 | "renderer": "flot", 487 | "seriesOverrides": [], 488 | "spaceLength": 10, 489 | "stack": false, 490 | "steppedLine": false, 491 | "targets": [ 492 | { 493 | "format": "time_series", 494 | "group": [], 495 | "metricColumn": "none", 496 | "rawQuery": true, 497 | "rawSql": "select now() as time, count(*) as num_subgraphs\n from (\n select d.id, d.latest_ethereum_block_number block\n from 
subgraphs.subgraph_deployment d,\n subgraphs.subgraph_deployment_assignment a\n where d.id = a.id\n and not d.failed) a\n where a.block < 100;\n", 498 | "refId": "A", 499 | "select": [ 500 | [ 501 | { 502 | "params": ["value"], 503 | "type": "column" 504 | } 505 | ] 506 | ], 507 | "timeColumn": "time", 508 | "where": [ 509 | { 510 | "name": "$__timeFilter", 511 | "params": [], 512 | "type": "macro" 513 | } 514 | ] 515 | } 516 | ], 517 | "thresholds": [ 518 | { 519 | "colorMode": "critical", 520 | "fill": true, 521 | "line": true, 522 | "op": "gt", 523 | "value": 0 524 | } 525 | ], 526 | "timeFrom": null, 527 | "timeRegions": [], 528 | "timeShift": null, 529 | "title": "Deployed subgraphs that have not started", 530 | "tooltip": { 531 | "shared": true, 532 | "sort": 0, 533 | "value_type": "individual" 534 | }, 535 | "type": "graph", 536 | "xaxis": { 537 | "buckets": null, 538 | "mode": "time", 539 | "name": null, 540 | "show": true, 541 | "values": [] 542 | }, 543 | "yaxes": [ 544 | { 545 | "format": "short", 546 | "label": null, 547 | "logBase": 1, 548 | "max": null, 549 | "min": null, 550 | "show": true 551 | }, 552 | { 553 | "format": "short", 554 | "label": null, 555 | "logBase": 1, 556 | "max": null, 557 | "min": null, 558 | "show": true 559 | } 560 | ], 561 | "yaxis": { 562 | "align": false, 563 | "alignLevel": null 564 | } 565 | } 566 | ], 567 | "refresh": "", 568 | "schemaVersion": 25, 569 | "style": "dark", 570 | "tags": [], 571 | "templating": { 572 | "list": [ 573 | { 574 | "datasource": "Elasticsearch", 575 | "filters": [], 576 | "hide": 0, 577 | "label": "", 578 | "name": "Filters", 579 | "skipUrlSync": false, 580 | "type": "adhoc" 581 | } 582 | ] 583 | }, 584 | "time": { 585 | "from": "now-6h", 586 | "to": "now" 587 | }, 588 | "timepicker": { 589 | "refresh_intervals": ["10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"], 590 | "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] 591 | }, 592 | "timezone": "", 593 | "title": 
"Indexing Status", 594 | "uid": "7rcuDImZk", 595 | "version": 4 596 | } 597 | -------------------------------------------------------------------------------- /grafana/provisioning/dashboards/nginx_container.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": null, 3 | "title": "Nginx", 4 | "description": "Nginx exporter metrics", 5 | "tags": [ 6 | "nginx" 7 | ], 8 | "style": "dark", 9 | "timezone": "browser", 10 | "editable": true, 11 | "hideControls": false, 12 | "sharedCrosshair": true, 13 | "rows": [ 14 | { 15 | "collapse": false, 16 | "editable": true, 17 | "height": "250px", 18 | "panels": [ 19 | { 20 | "aliasColors": {}, 21 | "bars": false, 22 | "datasource": "prometheus", 23 | "decimals": 2, 24 | "editable": true, 25 | "error": false, 26 | "fill": 1, 27 | "grid": { 28 | "threshold1": null, 29 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 30 | "threshold2": null, 31 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 32 | }, 33 | "id": 3, 34 | "isNew": true, 35 | "legend": { 36 | "alignAsTable": true, 37 | "avg": true, 38 | "current": true, 39 | "max": true, 40 | "min": true, 41 | "rightSide": true, 42 | "show": true, 43 | "total": false, 44 | "values": true 45 | }, 46 | "lines": true, 47 | "linewidth": 2, 48 | "links": [], 49 | "nullPointMode": "connected", 50 | "percentage": false, 51 | "pointradius": 5, 52 | "points": false, 53 | "renderer": "flot", 54 | "seriesOverrides": [], 55 | "span": 12, 56 | "stack": false, 57 | "steppedLine": false, 58 | "targets": [ 59 | { 60 | "expr": "sum(irate(nginx_connections_processed_total{stage=\"any\"}[5m])) by (stage)", 61 | "hide": false, 62 | "interval": "", 63 | "intervalFactor": 10, 64 | "legendFormat": "requests", 65 | "metric": "", 66 | "refId": "B", 67 | "step": 10 68 | } 69 | ], 70 | "timeFrom": null, 71 | "timeShift": null, 72 | "title": "Requests/sec", 73 | "tooltip": { 74 | "msResolution": false, 75 | "shared": true, 76 | "sort": 0, 77 | "value_type": 
"cumulative" 78 | }, 79 | "type": "graph", 80 | "xaxis": { 81 | "show": true 82 | }, 83 | "yaxes": [ 84 | { 85 | "format": "short", 86 | "label": null, 87 | "logBase": 1, 88 | "max": null, 89 | "min": 0, 90 | "show": true 91 | }, 92 | { 93 | "format": "short", 94 | "label": null, 95 | "logBase": 1, 96 | "max": null, 97 | "min": null, 98 | "show": true 99 | } 100 | ] 101 | }, 102 | { 103 | "aliasColors": {}, 104 | "bars": false, 105 | "datasource": "prometheus", 106 | "decimals": 2, 107 | "editable": true, 108 | "error": false, 109 | "fill": 1, 110 | "grid": { 111 | "threshold1": null, 112 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 113 | "threshold2": null, 114 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 115 | }, 116 | "id": 2, 117 | "isNew": true, 118 | "legend": { 119 | "alignAsTable": true, 120 | "avg": true, 121 | "current": true, 122 | "max": true, 123 | "min": true, 124 | "rightSide": true, 125 | "show": true, 126 | "total": false, 127 | "values": true 128 | }, 129 | "lines": true, 130 | "linewidth": 2, 131 | "links": [], 132 | "nullPointMode": "connected", 133 | "percentage": false, 134 | "pointradius": 5, 135 | "points": false, 136 | "renderer": "flot", 137 | "seriesOverrides": [], 138 | "span": 12, 139 | "stack": false, 140 | "steppedLine": false, 141 | "targets": [ 142 | { 143 | "expr": "sum(nginx_connections_current) by (state)", 144 | "interval": "", 145 | "intervalFactor": 2, 146 | "legendFormat": "{{state}}", 147 | "metric": "", 148 | "refId": "A", 149 | "step": 2 150 | } 151 | ], 152 | "timeFrom": null, 153 | "timeShift": null, 154 | "title": "Connections", 155 | "tooltip": { 156 | "msResolution": false, 157 | "shared": true, 158 | "sort": 0, 159 | "value_type": "cumulative" 160 | }, 161 | "type": "graph", 162 | "xaxis": { 163 | "show": true 164 | }, 165 | "yaxes": [ 166 | { 167 | "format": "short", 168 | "label": null, 169 | "logBase": 1, 170 | "max": null, 171 | "min": 0, 172 | "show": true 173 | }, 174 | { 175 | "format": "short", 176 | 
"label": null, 177 | "logBase": 1, 178 | "max": null, 179 | "min": null, 180 | "show": true 181 | } 182 | ] 183 | }, 184 | { 185 | "aliasColors": {}, 186 | "bars": false, 187 | "datasource": "prometheus", 188 | "decimals": 2, 189 | "editable": true, 190 | "error": false, 191 | "fill": 1, 192 | "grid": { 193 | "threshold1": null, 194 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 195 | "threshold2": null, 196 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 197 | }, 198 | "id": 1, 199 | "isNew": true, 200 | "legend": { 201 | "alignAsTable": true, 202 | "avg": true, 203 | "current": true, 204 | "max": true, 205 | "min": true, 206 | "rightSide": true, 207 | "show": true, 208 | "total": false, 209 | "values": true 210 | }, 211 | "lines": true, 212 | "linewidth": 2, 213 | "links": [], 214 | "nullPointMode": "connected", 215 | "percentage": false, 216 | "pointradius": 5, 217 | "points": false, 218 | "renderer": "flot", 219 | "seriesOverrides": [], 220 | "span": 12, 221 | "stack": false, 222 | "steppedLine": false, 223 | "targets": [ 224 | { 225 | "expr": "sum(irate(nginx_connections_processed_total{stage!=\"any\"}[5m])) by (stage)", 226 | "hide": false, 227 | "interval": "", 228 | "intervalFactor": 10, 229 | "legendFormat": "{{stage}}", 230 | "metric": "", 231 | "refId": "B", 232 | "step": 10 233 | } 234 | ], 235 | "timeFrom": null, 236 | "timeShift": null, 237 | "title": "Connections rate", 238 | "tooltip": { 239 | "msResolution": false, 240 | "shared": true, 241 | "sort": 0, 242 | "value_type": "cumulative" 243 | }, 244 | "type": "graph", 245 | "xaxis": { 246 | "show": true 247 | }, 248 | "yaxes": [ 249 | { 250 | "format": "short", 251 | "label": null, 252 | "logBase": 1, 253 | "max": null, 254 | "min": 0, 255 | "show": true 256 | }, 257 | { 258 | "format": "short", 259 | "label": null, 260 | "logBase": 1, 261 | "max": null, 262 | "min": null, 263 | "show": true 264 | } 265 | ] 266 | } 267 | ], 268 | "title": "Nginx exporter metrics" 269 | }, 270 | { 271 | 
"collapse": false, 272 | "editable": true, 273 | "height": "250px", 274 | "panels": [ 275 | { 276 | "aliasColors": {}, 277 | "bars": false, 278 | "datasource": null, 279 | "editable": true, 280 | "error": false, 281 | "fill": 1, 282 | "grid": { 283 | "threshold1": null, 284 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 285 | "threshold2": null, 286 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 287 | }, 288 | "id": 4, 289 | "isNew": true, 290 | "legend": { 291 | "alignAsTable": true, 292 | "avg": true, 293 | "current": true, 294 | "max": true, 295 | "min": true, 296 | "rightSide": true, 297 | "show": true, 298 | "total": false, 299 | "values": true 300 | }, 301 | "lines": true, 302 | "linewidth": 2, 303 | "links": [], 304 | "nullPointMode": "connected", 305 | "percentage": false, 306 | "pointradius": 5, 307 | "points": false, 308 | "renderer": "flot", 309 | "seriesOverrides": [], 310 | "span": 12, 311 | "stack": false, 312 | "steppedLine": false, 313 | "targets": [ 314 | { 315 | "expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100", 316 | "intervalFactor": 2, 317 | "legendFormat": "nginx", 318 | "refId": "A", 319 | "step": 2 320 | } 321 | ], 322 | "timeFrom": null, 323 | "timeShift": null, 324 | "title": "CPU usage", 325 | "tooltip": { 326 | "msResolution": false, 327 | "shared": true, 328 | "sort": 0, 329 | "value_type": "cumulative" 330 | }, 331 | "type": "graph", 332 | "xaxis": { 333 | "show": true 334 | }, 335 | "yaxes": [ 336 | { 337 | "format": "short", 338 | "label": null, 339 | "logBase": 1, 340 | "max": null, 341 | "min": null, 342 | "show": true 343 | }, 344 | { 345 | "format": "short", 346 | "label": null, 347 | "logBase": 1, 348 | "max": null, 349 | "min": null, 350 | "show": true 351 | } 352 | ] 353 | } 354 | ], 355 | "title": "Nginx container metrics" 356 | } 357 | ], 358 | "time": { 359 | "from": "now-15m", 360 | "to": "now" 361 | }, 362 | "timepicker": { 363 | 
"refresh_intervals": [ 364 | "5s", 365 | "10s", 366 | "30s", 367 | "1m", 368 | "5m", 369 | "15m", 370 | "30m", 371 | "1h", 372 | "2h", 373 | "1d" 374 | ], 375 | "time_options": [ 376 | "5m", 377 | "15m", 378 | "1h", 379 | "6h", 380 | "12h", 381 | "24h", 382 | "2d", 383 | "7d", 384 | "30d" 385 | ] 386 | }, 387 | "templating": { 388 | "list": [] 389 | }, 390 | "annotations": { 391 | "list": [] 392 | }, 393 | "refresh": "10s", 394 | "schemaVersion": 12, 395 | "version": 9, 396 | "links": [], 397 | "gnetId": null 398 | } -------------------------------------------------------------------------------- /grafana/provisioning/dashboards/postgres.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 3, 19 | "links": [], 20 | "panels": [ 21 | { 22 | "columns": [], 23 | "datasource": "postgres", 24 | "fontSize": "100%", 25 | "gridPos": { 26 | "h": 8, 27 | "w": 24, 28 | "x": 0, 29 | "y": 0 30 | }, 31 | "id": 4, 32 | "links": [], 33 | "pageSize": null, 34 | "scroll": true, 35 | "showHeader": true, 36 | "sort": { 37 | "col": 1, 38 | "desc": true 39 | }, 40 | "styles": [ 41 | { 42 | "alias": "Age", 43 | "align": "auto", 44 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 45 | "pattern": "age", 46 | "type": "number", 47 | "unit": "s" 48 | }, 49 | { 50 | "alias": "", 51 | "align": "auto", 52 | "colorMode": null, 53 | "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 54 | "decimals": 2, 55 | "pattern": "/.*/", 56 | "thresholds": [], 57 | "type": "number", 58 | "unit": "short" 59 | } 60 | ], 61 | "targets": [ 62 | { 63 | "format": "table", 64 | "group": [], 65 | "metricColumn": 
"none", 66 | "rawQuery": true, 67 | "rawSql": "-- grafana ignore\nselect application_name,\n extract(epoch from age(now(), xact_start)) as age,\n query from pg_stat_activity\n where query not like '%grafana ignore%'\n and state='active'\norder by query_start desc", 68 | "refId": "A", 69 | "select": [ 70 | [ 71 | { 72 | "params": ["value"], 73 | "type": "column" 74 | } 75 | ] 76 | ], 77 | "timeColumn": "time", 78 | "where": [ 79 | { 80 | "name": "$__timeFilter", 81 | "params": [], 82 | "type": "macro" 83 | } 84 | ] 85 | } 86 | ], 87 | "timeFrom": null, 88 | "timeShift": null, 89 | "title": "Active Queries", 90 | "transform": "table", 91 | "type": "table" 92 | }, 93 | { 94 | "columns": [], 95 | "datasource": "postgres", 96 | "fontSize": "100%", 97 | "gridPos": { 98 | "h": 5, 99 | "w": 13, 100 | "x": 0, 101 | "y": 8 102 | }, 103 | "id": 9, 104 | "links": [], 105 | "pageSize": null, 106 | "pluginVersion": "6.2.1", 107 | "scroll": true, 108 | "showHeader": true, 109 | "sort": { 110 | "col": 1, 111 | "desc": false 112 | }, 113 | "styles": [ 114 | { 115 | "alias": "", 116 | "align": "auto", 117 | "dateFormat": "YYYY-MM-DD", 118 | "pattern": "last_autovacuum", 119 | "type": "date" 120 | }, 121 | { 122 | "alias": "tx left", 123 | "align": "auto", 124 | "colorMode": "row", 125 | "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 126 | "decimals": null, 127 | "pattern": "tx_before_wraparound_vacuum", 128 | "thresholds": ["20000000", "40000000"], 129 | "type": "number", 130 | "unit": "short" 131 | }, 132 | { 133 | "alias": "", 134 | "align": "auto", 135 | "colorMode": null, 136 | "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 137 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 138 | "decimals": 2, 139 | "mappingType": 1, 140 | "pattern": "xid_age", 141 | "thresholds": [], 142 | "type": "number", 143 | "unit": "short" 144 | } 145 | ], 146 | "targets": [ 147 | { 148 | "format": "table", 149 | "group": 
[], 150 | "metricColumn": "none", 151 | "rawQuery": true, 152 | "rawSql": "-- grafana ignore\nSELECT\n relname as table,\n least((SELECT setting::int FROM pg_settings WHERE name = 'autovacuum_freeze_max_age') - age(relfrozenxid), \n (SELECT setting::int FROM pg_settings WHERE name = 'autovacuum_multixact_freeze_max_age') - mxid_age(relminmxid))\n tx_before_wraparound_vacuum,\n pg_stat_get_last_autovacuum_time(c.oid) AS last_autovacuum,\n age(relfrozenxid) AS xid_age,\n mxid_age(relminmxid) AS mxid_age\nFROM\n pg_class c\nWHERE\n c.relname in ('ethereum_blocks', 'eth_call_cache','subgraph_deployment')\n and c.relfrozenxid != 0\n", 153 | "refId": "A", 154 | "select": [ 155 | [ 156 | { 157 | "params": ["value"], 158 | "type": "column" 159 | } 160 | ] 161 | ], 162 | "timeColumn": "time", 163 | "where": [ 164 | { 165 | "name": "$__timeFilter", 166 | "params": [], 167 | "type": "macro" 168 | } 169 | ] 170 | } 171 | ], 172 | "timeFrom": null, 173 | "timeShift": null, 174 | "title": "Transactions until wraparound vacuum", 175 | "transform": "table", 176 | "type": "table" 177 | }, 178 | { 179 | "columns": [], 180 | "datasource": "postgres", 181 | "fontSize": "100%", 182 | "gridPos": { 183 | "h": 3, 184 | "w": 11, 185 | "x": 13, 186 | "y": 8 187 | }, 188 | "id": 11, 189 | "pageSize": null, 190 | "showHeader": true, 191 | "sort": { 192 | "col": 0, 193 | "desc": true 194 | }, 195 | "styles": [ 196 | { 197 | "alias": "Tables needing vacuum", 198 | "align": "auto", 199 | "colorMode": null, 200 | "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 201 | "decimals": 0, 202 | "pattern": "tables_needing_vacuum", 203 | "thresholds": [], 204 | "type": "number", 205 | "unit": "none" 206 | }, 207 | { 208 | "alias": "Txns past", 209 | "align": "auto", 210 | "colorMode": null, 211 | "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 212 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 213 | "decimals": 0, 214 | 
"mappingType": 1, 215 | "pattern": "txns_past", 216 | "thresholds": [], 217 | "type": "number", 218 | "unit": "locale" 219 | }, 220 | { 221 | "alias": "Last autovacuum", 222 | "align": "auto", 223 | "colorMode": null, 224 | "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 225 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 226 | "decimals": 2, 227 | "mappingType": 1, 228 | "pattern": "last_autovacuum", 229 | "thresholds": [], 230 | "type": "date", 231 | "unit": "short" 232 | } 233 | ], 234 | "targets": [ 235 | { 236 | "format": "table", 237 | "group": [], 238 | "metricColumn": "none", 239 | "rawQuery": true, 240 | "rawSql": "-- grafana ignore\nselect count(*) as tables_needing_vacuum,\n -min(tx_before_wraparound_vacuum) as txns_past,\n min(last_autovacuum) as last_autovacuum\n from (\n select oid::regclass::text AS table,\n least(\n (select setting::int\n from pg_settings\n where name = 'autovacuum_freeze_max_age') - age(relfrozenxid),\n (select setting::int\n from pg_settings\n where name = 'autovacuum_multixact_freeze_max_age')\n - mxid_age(relminmxid)) as tx_before_wraparound_vacuum,\n pg_stat_get_last_autovacuum_time(oid) AS last_autovacuum,\n age(relfrozenxid) AS xid_age,\n mxid_age(relminmxid) AS mxid_age\n from pg_class\n where relfrozenxid != 0\n and oid > 16384\n and relkind='r') a where a.tx_before_wraparound_vacuum < 0;\n", 241 | "refId": "A", 242 | "select": [ 243 | [ 244 | { 245 | "params": ["value"], 246 | "type": "column" 247 | } 248 | ] 249 | ], 250 | "timeColumn": "time", 251 | "where": [ 252 | { 253 | "name": "$__timeFilter", 254 | "params": [], 255 | "type": "macro" 256 | } 257 | ] 258 | } 259 | ], 260 | "timeFrom": null, 261 | "timeShift": null, 262 | "title": "Autovacuum pressure", 263 | "transform": "table", 264 | "type": "table" 265 | }, 266 | { 267 | "columns": [], 268 | "datasource": "postgres", 269 | "fontSize": "100%", 270 | "gridPos": { 271 | "h": 9, 272 | "w": 24, 273 | "x": 0, 274 | "y": 13 275 | }, 
276 | "id": 7, 277 | "links": [], 278 | "pageSize": null, 279 | "scroll": true, 280 | "showHeader": true, 281 | "sort": { 282 | "col": 4, 283 | "desc": false 284 | }, 285 | "styles": [ 286 | { 287 | "alias": "Granted", 288 | "align": "auto", 289 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 290 | "decimals": 0, 291 | "mappingType": 1, 292 | "pattern": "granted", 293 | "preserveFormat": false, 294 | "sanitize": false, 295 | "thresholds": [""], 296 | "type": "string", 297 | "unit": "none", 298 | "valueMaps": [ 299 | { 300 | "text": "✓", 301 | "value": "true" 302 | }, 303 | { 304 | "text": "—", 305 | "value": "false" 306 | } 307 | ] 308 | }, 309 | { 310 | "alias": "Age", 311 | "align": "auto", 312 | "colorMode": null, 313 | "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 314 | "decimals": 2, 315 | "pattern": "age", 316 | "thresholds": [], 317 | "type": "number", 318 | "unit": "s" 319 | } 320 | ], 321 | "targets": [ 322 | { 323 | "format": "table", 324 | "group": [], 325 | "metricColumn": "none", 326 | "rawQuery": true, 327 | "rawSql": "-- grafana ignore\nSELECT a.application_name,\n coalesce(extract(epoch from age(now(), a.xact_start)), 0) as age,\n l.relation::regclass,\n l.mode,\n l.GRANTED::varchar,\n l.locktype \"Target\"\nFROM pg_stat_activity a\nJOIN pg_locks l ON l.pid = a.pid\nwhere pg_backend_pid() != a.pid\norder by granted asc, age desc;", 328 | "refId": "A", 329 | "select": [ 330 | [ 331 | { 332 | "params": ["value"], 333 | "type": "column" 334 | } 335 | ] 336 | ], 337 | "timeColumn": "time", 338 | "where": [ 339 | { 340 | "name": "$__timeFilter", 341 | "params": [], 342 | "type": "macro" 343 | } 344 | ] 345 | } 346 | ], 347 | "timeFrom": null, 348 | "timeShift": null, 349 | "title": "Active locks", 350 | "transform": "table", 351 | "type": "table" 352 | }, 353 | { 354 | "columns": [], 355 | "datasource": "postgres", 356 | "fontSize": "100%", 357 | "gridPos": { 358 | "h": 8, 359 | "w": 24, 360 | "x": 0, 361 | "y": 22 
362 | }, 363 | "id": 5, 364 | "links": [], 365 | "pageSize": null, 366 | "scroll": true, 367 | "showHeader": true, 368 | "sort": { 369 | "col": 4, 370 | "desc": false 371 | }, 372 | "styles": [ 373 | { 374 | "alias": "Age", 375 | "align": "auto", 376 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 377 | "pattern": "age", 378 | "type": "number", 379 | "unit": "s" 380 | }, 381 | { 382 | "alias": "", 383 | "align": "auto", 384 | "colorMode": null, 385 | "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], 386 | "decimals": 2, 387 | "pattern": "/.*/", 388 | "thresholds": [], 389 | "type": "number", 390 | "unit": "short" 391 | } 392 | ], 393 | "targets": [ 394 | { 395 | "format": "table", 396 | "group": [], 397 | "metricColumn": "none", 398 | "rawQuery": true, 399 | "rawSql": "-- grafana ignore\nselect client_addr,\n application_name,\n usename,\n state,\n extract(epoch from age(now(), xact_start)) as age\n from pg_stat_activity\n where query not like '%grafana ignore%'\n and state like '%idle in transaction%'\norder by query_start desc", 400 | "refId": "A", 401 | "select": [ 402 | [ 403 | { 404 | "params": ["value"], 405 | "type": "column" 406 | } 407 | ] 408 | ], 409 | "timeColumn": "time", 410 | "where": [ 411 | { 412 | "name": "$__timeFilter", 413 | "params": [], 414 | "type": "macro" 415 | } 416 | ] 417 | } 418 | ], 419 | "timeFrom": null, 420 | "timeShift": null, 421 | "title": "Idle transactions", 422 | "transform": "table", 423 | "type": "table" 424 | } 425 | ], 426 | "schemaVersion": 22, 427 | "style": "dark", 428 | "tags": [], 429 | "templating": { 430 | "list": [] 431 | }, 432 | "time": { 433 | "from": "now-6h", 434 | "to": "now" 435 | }, 436 | "timepicker": { 437 | "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"], 438 | "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] 439 | }, 440 | "timezone": "", 441 | "title": "Postgres Statistics", 442 | "uid": "Mo6FxoiWz", 443 
| "variables": { 444 | "list": [] 445 | }, 446 | "version": 4 447 | } 448 | -------------------------------------------------------------------------------- /grafana/provisioning/datasources/postgres.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | datasources: 3 | - access: proxy 4 | editable: true 5 | name: postgres 6 | orgId: 1 7 | type: postgres 8 | url: $postgres_host 9 | user: $postgres_user 10 | database: $postgres_db 11 | secureJsonData: 12 | password: $postgres_pass 13 | jsonData: 14 | sslmode: disable 15 | postgresVersion: 906 16 | -------------------------------------------------------------------------------- /grafana/provisioning/datasources/prometeus.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | datasources: 3 | - access: proxy 4 | editable: true 5 | name: prometheus 6 | orgId: 1 7 | type: prometheus 8 | url: http://prometheus:9090/ 9 | version: 1 -------------------------------------------------------------------------------- /helpers/aws/README.md: -------------------------------------------------------------------------------- 1 | # Prometheus on EC2 & ECS: 2 | 3 | Some helpers for anyone configuring Prometheus on ECS and AWS EC2. 
4 | 5 | To get started on AWS ECS and EC2: 6 | 7 | *For EC2/ECS nodes*: 8 | - Import the ECS task definitions for cadvisor and node-exporter, create a service/task for each, and run them on every host you want to monitor 9 | - Any host that has the "Monitoring: On" tag will automatically be added to the targets 10 | - Expose ports 9100 and 9191 to your Prometheus host 11 | 12 | *For Prometheus host*: 13 | 14 | - Copy the prometheus.yml configuration present here into the base Prometheus configuration to enable EC2 service discovery 15 | - `docker compose up -d` 16 | 17 | **Note**: 18 | Set query.staleness-delta to 1m to make metrics more real-time 19 | 20 | 21 | ### TODO 22 | - Add alerting rules based on ECS 23 | -------------------------------------------------------------------------------- /helpers/aws/cadvisor_ecs_task_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "family": "cadvisor", 3 | "containerDefinitions": [ 4 | { 5 | "name": "cadvisor", 6 | "image": "google/cadvisor", 7 | "cpu": 10, 8 | "memory": 300, 9 | "portMappings": [ 10 | { 11 | "containerPort": 9191, 12 | "hostPort": 9191 13 | } 14 | ], 15 | "essential": true, 16 | "privileged": true, 17 | "mountPoints": [ 18 | { 19 | "sourceVolume": "root", 20 | "containerPath": "/rootfs", 21 | "readOnly": true 22 | }, 23 | { 24 | "sourceVolume": "var_run", 25 | "containerPath": "/var/run", 26 | "readOnly": false 27 | }, 28 | { 29 | "sourceVolume": "sys", 30 | "containerPath": "/sys", 31 | "readOnly": true 32 | }, 33 | { 34 | "sourceVolume": "var_lib_docker", 35 | "containerPath": "/var/lib/docker", 36 | "readOnly": true 37 | }, 38 | { 39 | "sourceVolume": "cgroup", 40 | "containerPath": "/cgroup", 41 | "readOnly": true 42 | } 43 | ] 44 | } 45 | ], 46 | "volumes": [ 47 | { 48 | "name": "root", 49 | "host": { 50 | "sourcePath": "/" 51 | } 52 | }, 53 | { 54 | "name": "var_run", 55 | "host": { 56 | "sourcePath": "/var/run" 57 | } 58 | }, 59 | { 60 | "name": "sys", 61 | 
"host": { 62 | "sourcePath": "/sys" 63 | } 64 | }, 65 | { 66 | "name": "var_lib_docker", 67 | "host": { 68 | "sourcePath": "/var/lib/docker/" 69 | } 70 | }, 71 | { 72 | "name": "cgroup", 73 | "host": { 74 | "sourcePath": "/cgroup" 75 | } 76 | } 77 | ] 78 | } -------------------------------------------------------------------------------- /helpers/aws/node_exporter_task_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "family": "prometheus", 3 | "containerDefinitions": [ 4 | { 5 | "portMappings": [ 6 | { 7 | "hostPort": 9100, 8 | "containerPort": 9100, 9 | "protocol": "tcp" 10 | } 11 | ], 12 | "essential": true, 13 | "name": "node_exporter", 14 | "image": "prom/node-exporter", 15 | "cpu": 0, 16 | "privileged": null, 17 | "memoryReservation": 150 18 | } 19 | ], 20 | "volumes": [], 21 | "networkMode": "host" 22 | } 23 | -------------------------------------------------------------------------------- /helpers/aws/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | 5 | # Attach these labels to any time series or alerts when communicating with 6 | # external systems (federation, remote storage, Alertmanager). 7 | external_labels: 8 | monitor: 'docker-host-alpha' 9 | 10 | # Load and evaluate the rules in these files every 'evaluation_interval' seconds. 11 | rule_files: 12 | - "targets.rules" 13 | - "hosts.rules" 14 | - "containers.rules" 15 | 16 | # Scrape configurations: static targets first, followed by sample EC2 service-discovery jobs.
17 | scrape_configs: 18 | - job_name: 'nodeexporter' 19 | scrape_interval: 5s 20 | static_configs: 21 | - targets: ['nodeexporter:9100'] 22 | 23 | - job_name: 'cadvisor' 24 | scrape_interval: 5s 25 | static_configs: 26 | - targets: ['cadvisor:8080'] 27 | 28 | - job_name: 'prometheus' 29 | scrape_interval: 10s 30 | static_configs: 31 | - targets: ['localhost:9090'] 32 | 33 | 34 | # Sample scrape configuration for AWS EC2 (job names must be unique, hence the -ec2 suffix) 35 | - job_name: 'nodeexporter-ec2' 36 | ec2_sd_configs: 37 | - region: us-east-1 38 | port: 9100 39 | relabel_configs: 40 | # Only keep instances whose EC2 tag "Monitoring" is set to "On" 41 | - source_labels: [__meta_ec2_tag_Monitoring] 42 | regex: On 43 | action: keep 44 | 45 | - job_name: 'cadvisor-ec2' 46 | ec2_sd_configs: 47 | - region: us-east-1 48 | port: 9191 # must match the hostPort of the cadvisor ECS task definition 49 | relabel_configs: 50 | # Only keep instances whose EC2 tag "Monitoring" is set to "On" 51 | - source_labels: [__meta_ec2_tag_Monitoring] 52 | regex: On 53 | action: keep 54 | -------------------------------------------------------------------------------- /prometheus/alert.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: targets 3 | rules: 4 | - alert: monitor_service_down 5 | expr: up == 0 6 | for: 30s 7 | labels: 8 | severity: critical 9 | annotations: 10 | summary: "Monitor service non-operational" 11 | description: "Service {{ $labels.instance }} is down." 12 | 13 | - name: host 14 | rules: 15 | - alert: high_cpu_load 16 | expr: node_load1 > 1.5 17 | for: 30s 18 | labels: 19 | severity: warning 20 | annotations: 21 | summary: "Server under high load" 22 | description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." 
23 | 24 | - alert: high_memory_load # evaluated per instance so the instance/job labels used in the description are preserved 25 | expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 85 26 | for: 30s 27 | labels: 28 | severity: warning 29 | annotations: 30 | summary: "Server memory is almost full" 31 | description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." 32 | 33 | - alert: high_storage_load 34 | expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85 35 | for: 30s 36 | labels: 37 | severity: warning 38 | annotations: 39 | summary: "Server storage is almost full" 40 | description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." 41 | 42 | - name: containers 43 | rules: 44 | - alert: graphnode_down 45 | expr: absent((time() - container_last_seen{name="graph-node"}) < 10) 46 | for: 30s 47 | labels: 48 | severity: critical 49 | annotations: 50 | summary: "Graph Node down" 51 | description: "Graph Node container is down for more than 30 seconds." 52 | 53 | - alert: graphnode_high_cpu 54 | expr: sum(rate(container_cpu_usage_seconds_total{name="graph-node"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10 55 | for: 30s 56 | labels: 57 | severity: warning 58 | annotations: 59 | summary: "Graph Node high CPU usage" 60 | description: "Graph Node CPU usage is {{ humanize $value}}%." 61 | 62 | - alert: graphnode_high_memory 63 | expr: sum(container_memory_usage_bytes{name="graph-node"}) > 1200000000 64 | for: 30s 65 | labels: 66 | severity: warning 67 | annotations: 68 | summary: "Graph Node high memory usage" 69 | description: "Graph Node memory consumption is at {{ humanize $value}}." 
70 | 71 | - alert: postgres_down 72 | expr: absent((time() - container_last_seen{name="postgres"}) < 10) 73 | for: 30s 74 | labels: 75 | severity: critical 76 | annotations: 77 | summary: "Postgres down" 78 | description: "Postgres container is down for more than 30 seconds." 79 | 80 | - alert: postgres_high_cpu 81 | expr: sum(rate(container_cpu_usage_seconds_total{name="postgres"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10 82 | for: 30s 83 | labels: 84 | severity: warning 85 | annotations: 86 | summary: "Postgres high CPU usage" 87 | description: "Postgres CPU usage is {{ humanize $value}}%." 88 | 89 | - alert: postgres_high_memory 90 | expr: sum(container_memory_usage_bytes{name="postgres"}) > 1200000000 91 | for: 30s 92 | labels: 93 | severity: warning 94 | annotations: 95 | summary: "Postgres high memory usage" 96 | description: "Postgres memory consumption is at {{ humanize $value}}." 97 | 98 | - alert: nginx_down 99 | expr: absent((time() - container_last_seen{name="nginx-proxy"}) < 10) 100 | for: 30s 101 | labels: 102 | severity: critical 103 | annotations: 104 | summary: "Nginx down" 105 | description: "Nginx container is down for more than 30 seconds." 106 | 107 | - alert: nginx_high_cpu 108 | expr: sum(rate(container_cpu_usage_seconds_total{name="nginx-proxy"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10 109 | for: 30s 110 | labels: 111 | severity: warning 112 | annotations: 113 | summary: "Nginx high CPU usage" 114 | description: "Nginx CPU usage is {{ humanize $value}}%." 115 | 116 | - alert: nginx_high_memory 117 | expr: sum(container_memory_usage_bytes{name="nginx-proxy"}) > 1200000000 118 | for: 30s 119 | labels: 120 | severity: warning 121 | annotations: 122 | summary: "Nginx high memory usage" 123 | description: "Nginx memory consumption is at {{ humanize $value}}." 
124 | 125 | - alert: caddy_down 126 | expr: absent((time() - container_last_seen{name="caddy"}) < 10) 127 | for: 30s 128 | labels: 129 | severity: critical 130 | annotations: 131 | summary: "Caddy down" 132 | description: "Caddy container is down for more than 30 seconds." 133 | 134 | - alert: caddy_high_cpu 135 | expr: sum(rate(container_cpu_usage_seconds_total{name="caddy"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10 136 | for: 30s 137 | labels: 138 | severity: warning 139 | annotations: 140 | summary: "Caddy high CPU usage" 141 | description: "Caddy CPU usage is {{ humanize $value}}%." 142 | 143 | - alert: caddy_high_memory 144 | expr: sum(container_memory_usage_bytes{name="caddy"}) > 1200000000 145 | for: 30s 146 | labels: 147 | severity: warning 148 | annotations: 149 | summary: "Caddy high memory usage" 150 | description: "Caddy memory consumption is at {{ humanize $value}}." -------------------------------------------------------------------------------- /prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | 5 | # Attach these labels to any time series or alerts when communicating with 6 | # external systems (federation, remote storage, Alertmanager). 7 | external_labels: 8 | monitor: 'docker-host-alpha' 9 | 10 | # Load and evaluate rules in this file every 'evaluation_interval' seconds. 11 | rule_files: 12 | - "alert.rules" 13 | 14 | # A scrape configuration containing exactly one endpoint to scrape. 
15 | scrape_configs: 16 | - job_name: 'nodeexporter' 17 | scrape_interval: 5s 18 | static_configs: 19 | - targets: ['nodeexporter:9100'] 20 | 21 | - job_name: 'cadvisor' 22 | scrape_interval: 5s 23 | static_configs: 24 | - targets: ['cadvisor:8080'] 25 | 26 | - job_name: 'prometheus' 27 | scrape_interval: 10s 28 | static_configs: 29 | - targets: ['localhost:9090'] 30 | 31 | - job_name: 'pushgateway' 32 | scrape_interval: 10s 33 | honor_labels: true 34 | static_configs: 35 | - targets: ['pushgateway:9091'] 36 | 37 | - job_name: 'graph-node' 38 | scrape_interval: 5s 39 | static_configs: 40 | - targets: ['graph-node:8040'] 41 | 42 | - job_name: 'grafana' 43 | scrape_interval: 5s 44 | static_configs: 45 | - targets: ['grafana:3000'] 46 | 47 | alerting: 48 | alertmanagers: 49 | - scheme: http 50 | static_configs: 51 | - targets: 52 | - 'alertmanager:9093' 53 | 54 | # - job_name: 'nginx' 55 | # scrape_interval: 10s 56 | # static_configs: 57 | # - targets: ['nginxexporter:9113'] 58 | 59 | # - job_name: 'aspnetcore' 60 | # scrape_interval: 10s 61 | # static_configs: 62 | # - targets: ['eventlog-proxy:5000', 'eventlog:5000'] 63 | -------------------------------------------------------------------------------- /screens/Grafana_Docker_Containers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/butterfly-academy/graphprotocol-infrastructure/5a92f9b92949019c7a149f70908fe5caee95f425/screens/Grafana_Docker_Containers.png -------------------------------------------------------------------------------- /screens/Grafana_Docker_Host.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/butterfly-academy/graphprotocol-infrastructure/5a92f9b92949019c7a149f70908fe5caee95f425/screens/Grafana_Docker_Host.png -------------------------------------------------------------------------------- /screens/Grafana_Prometheus.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/butterfly-academy/graphprotocol-infrastructure/5a92f9b92949019c7a149f70908fe5caee95f425/screens/Grafana_Prometheus.png -------------------------------------------------------------------------------- /screens/Slack_Notifications.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/butterfly-academy/graphprotocol-infrastructure/5a92f9b92949019c7a149f70908fe5caee95f425/screens/Slack_Notifications.png --------------------------------------------------------------------------------