├── .gitignore ├── grafana ├── docs │ ├── img │ │ ├── grafana-login.png │ │ ├── grafana-logs.png │ │ ├── grafana-alerts.png │ │ ├── grafana-explore.png │ │ ├── grafana-welcome.png │ │ ├── pagerduty-alert.png │ │ ├── grafana-dashboard.png │ │ └── grafana-logs-celestia-appd.png │ └── grafana.md ├── conf │ ├── grafana │ │ ├── datasources │ │ │ ├── loki.yaml │ │ │ └── prometheus.yml │ │ ├── dashboards │ │ │ ├── dashboard.yaml │ │ │ ├── process_exporter.json │ │ │ ├── celestia-da.json │ │ │ └── celestia.json │ │ └── grafana.ini │ ├── loki │ │ ├── rules │ │ │ └── celestia.rules.yaml │ │ └── loki.yaml │ ├── promtail.yaml │ ├── prometheus │ │ ├── rules │ │ │ ├── celestia-node.rules.yaml │ │ │ ├── celestia-appd.rules.yaml │ │ │ └── alert.rules.yaml │ │ └── prometheus.yml.tpl │ ├── otel-collector-config.yaml.tpl │ └── alertmanager.yaml.tpl ├── start.sh ├── README.md └── docker-compose.yml └── blobstreamx-installer ├── .env.local.example ├── install-docker.sh ├── .env.tpl ├── README.md └── install.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .env.local -------------------------------------------------------------------------------- /grafana/docs/img/grafana-login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/P-OPSTeam/celestia-tools/HEAD/grafana/docs/img/grafana-login.png -------------------------------------------------------------------------------- /grafana/docs/img/grafana-logs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/P-OPSTeam/celestia-tools/HEAD/grafana/docs/img/grafana-logs.png -------------------------------------------------------------------------------- /grafana/docs/img/grafana-alerts.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/P-OPSTeam/celestia-tools/HEAD/grafana/docs/img/grafana-alerts.png -------------------------------------------------------------------------------- /grafana/docs/img/grafana-explore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/P-OPSTeam/celestia-tools/HEAD/grafana/docs/img/grafana-explore.png -------------------------------------------------------------------------------- /grafana/docs/img/grafana-welcome.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/P-OPSTeam/celestia-tools/HEAD/grafana/docs/img/grafana-welcome.png -------------------------------------------------------------------------------- /grafana/docs/img/pagerduty-alert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/P-OPSTeam/celestia-tools/HEAD/grafana/docs/img/pagerduty-alert.png -------------------------------------------------------------------------------- /grafana/docs/img/grafana-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/P-OPSTeam/celestia-tools/HEAD/grafana/docs/img/grafana-dashboard.png -------------------------------------------------------------------------------- /grafana/docs/img/grafana-logs-celestia-appd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/P-OPSTeam/celestia-tools/HEAD/grafana/docs/img/grafana-logs-celestia-appd.png -------------------------------------------------------------------------------- /grafana/conf/grafana/datasources/loki.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: Loki 5 | type: loki 6 | access: proxy 7 | url: http://loki:3100 8 | jsonData: 9 | maxLines: 1000 
-------------------------------------------------------------------------------- /grafana/conf/loki/rules/celestia.rules.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: celestia-appd 3 | rules: 4 | - alert: celestia-appd_Log_ERR_detected 5 | expr: count_over_time ({unit="celestia-appd.service"} |~ "ERR.*"[1m]) >= 1 6 | for: 0m 7 | labels: 8 | severity: warning 9 | category: logs 10 | annotations: 11 | title: "celestia-appd logged an ERR" 12 | description: "celestia-appd logged an ERR" 13 | logurl: "https://pops.one" 14 | -------------------------------------------------------------------------------- /blobstreamx-installer/.env.local.example: -------------------------------------------------------------------------------- 1 | # Relayer private key 2 | PRIVATE_KEY= 3 | # The chain ID you are relaying to 4 | CHAINID= 5 | # RPC for the chain you are relaying to 6 | RPC_URL= 7 | # Celestia RPC for the chain you are relaying to 8 | TENDERMINT_RPC_URL= 9 | # Address of Blobstream X contract : https://docs.celestia.org/developers/blobstream#deployed-contracts 10 | CONTRACT_ADDRESS= 11 | 12 | # verify the below the below and update accordingly from the verifier verifier-buildXX.tar.gz once install.sh is done 13 | WRAPPER_BINARY="./artifacts/verifier-build" -------------------------------------------------------------------------------- /blobstreamx-installer/install-docker.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # install docker / docker-compose 4 | sudo apt update 5 | sudo apt install -y ca-certificates curl gnupg lsb-release 6 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg 7 | echo \ 8 | "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \ 9 | $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 10 | sudo apt update 11 | sudo apt install -y docker-compose docker-ce docker-ce-cli containerd.io 12 | sudo usermod -aG docker $USER #you need to logout and login back after that 13 | echo "you need to logout and for usermod setting to take affect" -------------------------------------------------------------------------------- /grafana/conf/promtail.yaml: -------------------------------------------------------------------------------- 1 | server: 2 | http_listen_port: 9080 3 | grpc_listen_port: 0 4 | 5 | positions: 6 | filename: /tmp/positions.yaml 7 | 8 | clients: 9 | - url: http://loki:3100/loki/api/v1/push 10 | 11 | scrape_configs: 12 | - job_name: system 13 | static_configs: 14 | - targets: 15 | - localhost 16 | labels: 17 | job: varlogs 18 | __path__: /var/log/*log 19 | 20 | - job_name: dockerlogs 21 | docker_sd_configs: 22 | - host: unix:///var/run/docker.sock 23 | refresh_interval: 5s 24 | relabel_configs: 25 | - source_labels: ['__meta_docker_container_name'] 26 | regex: '/(.*)' 27 | target_label: 'container' 28 | 29 | - job_name: journal 30 | journal: 31 | max_age: 12h 32 | labels: 33 | job: systemd-journal 34 | path: /var/log/journal 35 | relabel_configs: 36 | - source_labels: ['__journal__systemd_unit'] 37 | target_label: 'unit' -------------------------------------------------------------------------------- /grafana/conf/grafana/dashboards/dashboard.yaml: -------------------------------------------------------------------------------- 1 | 
apiVersion: 1 2 | 3 | providers: 4 | # an unique provider name 5 | - name: 'pops' 6 | # org id. will default to orgId 1 if not specified 7 | orgId: 1 8 | # name of the dashboard folder. Required 9 | folder: '' 10 | # folder UID. will be automatically generated if not specified 11 | folderUid: '' 12 | # provider type. Required 13 | type: file 14 | # disable dashboard deletion 15 | disableDeletion: false 16 | # enable dashboard editing 17 | editable: true 18 | # how often Grafana will scan for changed dashboards 19 | updateIntervalSeconds: 10 20 | # allow updating provisioned dashboards from the UI 21 | allowUiUpdates: true 22 | options: 23 | # path to dashboard files on disk. Required 24 | path: /etc/grafana/provisioning/dashboards -------------------------------------------------------------------------------- /grafana/conf/prometheus/rules/celestia-node.rules.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: celestia-node 3 | rules: 4 | - alert: celestia_node_service_not_running 5 | expr: node_systemd_unit_state{name=~"celestia-(bridge|light|full).*", state="active"} == 0 6 | for: 1m 7 | labels: 8 | severity: warning 9 | annotations: 10 | summary: "Celestia Node service is not running (instance {{ $labels.instance }})" 11 | description: "Celestia Node service is not running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 12 | 13 | - alert: celestia_node_stop_syncing 14 | expr: delta(celestia_total_synced_headers[5m]) == 0 or delta(celestia_hdr_store_head_height_gauge[5m]) == 0 15 | for: 1m 16 | labels: 17 | severity: warning 18 | annotations: 19 | summary: "celestia node {{ $labels.instance }} is stuck" 20 | description: "celestia node {{ $labels.instance }} is stuck" 21 | -------------------------------------------------------------------------------- /grafana/conf/prometheus/prometheus.yml.tpl: -------------------------------------------------------------------------------- 1 | global: 2 | 
scrape_interval: 10s 3 | scrape_timeout: 3s 4 | evaluation_interval: 5s 5 | 6 | # Rules and alerts are read from the specified file(s) 7 | rule_files: 8 | - rules/*.rules.yaml 9 | 10 | # Alerting specifies settings related to the Alertmanager 11 | alerting: 12 | alertmanagers: 13 | - static_configs: 14 | - targets: 15 | # Alertmanager's default port is 9093 16 | - alertmanager:9093 17 | 18 | scrape_configs: 19 | - job_name: celestia-appd 20 | static_configs: 21 | - targets: ['PUBLIC_IP:26660', 'process-exporter:9256'] 22 | 23 | - job_name: prometheus 24 | static_configs: 25 | - targets: ['localhost:9090','cadvisor:8080','node-exporter:9100'] 26 | 27 | - job_name: otel-collector 28 | static_configs: 29 | - targets: ['otel-collector:8888'] 30 | 31 | - job_name: celestia-da-node-metrics 32 | static_configs: 33 | - targets: ['otel-collector:OTEL_PROMETHEUS_EXPORTER'] 34 | 35 | -------------------------------------------------------------------------------- /grafana/conf/otel-collector-config.yaml.tpl: -------------------------------------------------------------------------------- 1 | receivers: 2 | otlp: 3 | protocols: 4 | grpc: 5 | http: 6 | prometheus: 7 | config: 8 | scrape_configs: 9 | - job_name: 'otel-collector' 10 | scrape_interval: 10s 11 | static_configs: 12 | - targets: ['127.0.0.1:8888'] 13 | exporters: 14 | otlphttp: 15 | endpoint: https://otel.celestia.observer 16 | prometheus: 17 | endpoint: "0.0.0.0:OTEL_PROMETHEUS_EXPORTER" 18 | namespace: celestia 19 | send_timestamps: true 20 | metric_expiration: 180m 21 | enable_open_metrics: true 22 | resource_to_telemetry_conversion: 23 | enabled: true 24 | processors: 25 | batch: 26 | memory_limiter: 27 | # 80% of maximum memory up to 2G 28 | limit_mib: 1500 29 | # 25% of limit up to 2G 30 | spike_limit_mib: 512 31 | check_interval: 5s 32 | service: 33 | pipelines: 34 | metrics: 35 | receivers: [otlp] 36 | exporters: [otlphttp, prometheus] 37 | 
-------------------------------------------------------------------------------- /blobstreamx-installer/.env.tpl: -------------------------------------------------------------------------------- 1 | # Ethereum config 2 | PRIVATE_KEY=<> 3 | RPC_URL=<> 4 | 5 | # Tendermint config. Accepts comma separated list of RPC URLs for failover. 6 | TENDERMINT_RPC_URL=<> 7 | 8 | # Operator script config 9 | SUCCINCT_RPC_URL=local 10 | SUCCINCT_API_KEY= 11 | CHAIN_ID=<> 12 | CONTRACT_ADDRESS=<> 13 | NEXT_HEADER_FUNCTION_ID=<<0xNEXT_HEADER_FUNCTION_ID>> 14 | HEADER_RANGE_FUNCTION_ID=<<0xHEADER_RANGE_FUNCTION_ID>> 15 | 16 | # Optional from here on. Only add to `.env` if you want to do local proving. 17 | # Set both to true if you want to do local proving and relaying. 18 | LOCAL_PROVE_MODE=true 19 | LOCAL_RELAY_MODE=true 20 | # Add the path to each binary (ex. PROVE_BINARY_0x6d...=blobstream-artifacts/header_range) 21 | PROVE_BINARY_<<0xNEXT_HEADER_FUNCTION_ID>>="<>" 22 | PROVE_BINARY_<<0xHEADER_RANGE_FUNCTION_ID>>="<>" 23 | # actually a folder to the binary 24 | WRAPPER_BINARY="<>" -------------------------------------------------------------------------------- /grafana/conf/alertmanager.yaml.tpl: -------------------------------------------------------------------------------- 1 | #see here for a more detail config file : 2 | # https://github.com/prometheus/alertmanager/blob/main/doc/examples/simple.yml 3 | global: 4 | # The smarthost and SMTP sender used for mail notifications. 5 | smtp_smarthost: 'smtp.example.com:8825' 6 | smtp_from: 'no-reply@example.com' 7 | smtp_require_tls: false 8 | # The root route on which each incoming alert enters. 9 | route: 10 | # The labels by which incoming alerts are grouped together. For example, 11 | # multiple alerts coming in for cluster=A and alertname=LatencyHigh would 12 | # be batched into a single group. 13 | # 14 | # To aggregate by all possible labels use '...' as the sole label name. 
15 | # This effectively disables aggregation entirely, passing through all 16 | # alerts as-is. This is unlikely to be what you want, unless you have 17 | # a very low alert volume or your upstream notification system performs 18 | # its own grouping. Example: group_by: [...] 19 | receiver: 'PagerDuty' 20 | group_by: [...] 21 | repeat_interval: 4h 22 | 23 | routes: 24 | - receiver: "PagerDuty" 25 | 26 | receivers: 27 | - name: 'PagerDuty' 28 | pagerduty_configs: 29 | - service_key: 'PD_SERVICE_KEY' -------------------------------------------------------------------------------- /grafana/conf/loki/loki.yaml: -------------------------------------------------------------------------------- 1 | auth_enabled: false 2 | 3 | server: 4 | http_listen_port: 3100 5 | 6 | common: 7 | path_prefix: /mnt/loki 8 | storage: 9 | filesystem: 10 | chunks_directory: /mnt/data/chunks 11 | rules_directory: /mnt/data/rules 12 | replication_factor: 1 13 | ring: 14 | instance_addr: 127.0.0.1 15 | kvstore: 16 | store: inmemory 17 | 18 | storage_config: 19 | boltdb: 20 | directory: /mnt/data/index 21 | 22 | filesystem: 23 | directory: /mnt/data/chunks 24 | 25 | limits_config: 26 | enforce_metric_name: false 27 | reject_old_samples: true 28 | reject_old_samples_max_age: 168h 29 | ingestion_rate_mb: 64 30 | ingestion_burst_size_mb: 64 31 | per_stream_rate_limit: 64 32 | per_stream_rate_limit_burst: 64MB 33 | 34 | chunk_store_config: 35 | max_look_back_period: 0s 36 | 37 | table_manager: 38 | retention_deletes_enabled: true 39 | retention_period: 30d 40 | 41 | schema_config: 42 | configs: 43 | - from: 2020-10-24 44 | store: boltdb-shipper 45 | object_store: filesystem 46 | schema: v11 47 | index: 48 | prefix: index_ 49 | period: 24h 50 | 51 | ruler: 52 | storage: 53 | type: local 54 | local: 55 | directory: /etc/loki/rules 56 | rule_path: /tmp/ 57 | alertmanager_url: http://alertmanager:9093 58 | ring: 59 | kvstore: 60 | store: inmemory 61 | enable_api: true 62 | enable_alertmanager_v2: true 
-------------------------------------------------------------------------------- /grafana/start.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | #### Fill up all the variable below 4 | PD_INTEGRATION_KEY="" 5 | ADMIN_USER="admin" #change if you want 6 | ADMIN_PASSWORD="useyourownpassword" 7 | 8 | # port with celestia DA node metrics exposed by otel 9 | OTEL_PROMETHEUS_EXPORTER=8889 10 | OTEL_GRPC_PORT=4317 11 | OTEL_HTTP_PORT=4318 12 | 13 | #### no more variable to change/set 14 | 15 | cp conf/prometheus/prometheus.yml.tpl conf/prometheus/prometheus.yml 16 | sed -i "s/PUBLIC_IP/$(curl -s ifconfig.me)/g" conf/prometheus/prometheus.yml 17 | sed -i "s/OTEL_PROMETHEUS_EXPORTER/${OTEL_PROMETHEUS_EXPORTER}/g" conf/prometheus/prometheus.yml 18 | 19 | cp conf/alertmanager.yaml.tpl conf/alertmanager.yaml 20 | sed -i "s/PD_SERVICE_KEY/${PD_INTEGRATION_KEY}/g" conf/alertmanager.yaml 21 | 22 | cp conf/otel-collector-config.yaml.tpl conf/otel-collector-config.yaml 23 | sed -i "s/OTEL_PROMETHEUS_EXPORTER/${OTEL_PROMETHEUS_EXPORTER}/g" conf/otel-collector-config.yaml 24 | 25 | OTEL_GRPC_PORT=${OTEL_GRPC_PORT} \ 26 | OTEL_HTTP_PORT=${OTEL_HTTP_PORT} \ 27 | USERID=$(id -u $USER) \ 28 | USERGP=$(id -g $USER) \ 29 | ADMIN_USER=${ADMIN_USER} \ 30 | ADMIN_PASSWORD=${ADMIN_PASSWORD} \ 31 | GF_USERS_ALLOW_SIGN_UP=false \ 32 | PROMETHEUS_CONFIG="./data/prometheus.yml" \ 33 | GRAFANA_CONFIG="./data/grafana.ini" \ 34 | OTEL_PROMETHEUS_EXPORTER=${OTEL_PROMETHEUS_EXPORTER} \ 35 | docker-compose up -d --remove-orphans --build "$@" 36 | 37 | sudo chown -R $USER:$USER data -------------------------------------------------------------------------------- /grafana/conf/grafana/datasources/prometheus.yml: -------------------------------------------------------------------------------- 1 | # config file version 2 | apiVersion: 1 3 | 4 | # list of datasources that should be deleted from the database 5 | #deleteDatasources: 6 | # - name: 
Prometheus 7 | # orgId: 1 8 | 9 | # list of datasources to insert/update depending 10 | # whats available in the database 11 | datasources: 12 | # name of the datasource. Required 13 | - name: Prometheus 14 | # datasource type. Required 15 | type: prometheus 16 | # access mode. direct or proxy. Required 17 | access: proxy 18 | # org id. will default to orgId 1 if not specified 19 | orgId: 1 20 | # url 21 | url: http://prometheus:9090 22 | # database password, if used 23 | password: 24 | # database user, if used 25 | user: 26 | # database name, if used 27 | database: 28 | # enable/disable basic auth 29 | basicAuth: false 30 | # basic auth username, if used 31 | basicAuthUser: 32 | # basic auth password, if used 33 | basicAuthPassword: 34 | # enable/disable with credentials headers 35 | withCredentials: 36 | # mark as default datasource. Max one per org 37 | isDefault: true 38 | # fields that will be converted to json and stored in json_data 39 | jsonData: 40 | graphiteVersion: "1.1" 41 | tlsAuth: false 42 | tlsAuthWithCACert: false 43 | # json object of data that will be encrypted. 44 | secureJsonData: 45 | tlsCACert: "..." 46 | tlsClientCert: "..." 47 | tlsClientKey: "..." 48 | version: 1 49 | # allow users to edit datasources from the UI. 
50 | editable: true -------------------------------------------------------------------------------- /grafana/conf/prometheus/rules/celestia-appd.rules.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: celestia-appd 3 | rules: 4 | - alert: NodeStuck 5 | expr: delta(tendermint_consensus_height[5m]) == 0 6 | for: 1m 7 | labels: 8 | severity: warning 9 | annotations: 10 | summary: "celestia-appd {{ $labels.instance }} is stuck" 11 | description: "celestia-appd {{ $labels.instance }} is stuck" 12 | 13 | - alert: lowP2Pconnection 14 | expr: tendermint_p2p_peers < 2 15 | for: 5m 16 | labels: 17 | severity: warning 18 | annotations: 19 | summary: "celestia-appd {{ $labels.instance }} is having less than 2 peers connected for 5min" 20 | description: "celestia-appd {{ $labels.instance }} is currently connected to {{ $value }}" 21 | 22 | - alert: FastSyncing 23 | expr: tendermint_consensus_fast_syncing == 1 24 | for: 1m 25 | labels: 26 | severity: info 27 | annotations: 28 | summary: "celestia-appd {{ $labels.instance }} is currently fast syncing for 1m" 29 | description: "celestia-appd {{ $labels.instance }} is currently fast syncing for 1m" 30 | 31 | - alert: StateSyncing 32 | expr: tendermint_consensus_state_syncing == 1 33 | for: 1m 34 | labels: 35 | severity: info 36 | annotations: 37 | summary: "celestia-appd {{ $labels.instance }} is currently state syncing for 1m" 38 | description: "celestia-appd {{ $labels.instance }} is currently state syncing" 39 | 40 | - alert: UnusualBlocktime 41 | expr: 60 / (delta(tendermint_consensus_block_interval_seconds_count[30m])/30) > 24 42 | for: 1m 43 | labels: 44 | severity: info 45 | annotations: 46 | summary: "celestia-appd {{ $labels.instance }} or network block time is more than 24s" 47 | description: "celestia-appd {{ $labels.instance }} or network block time is currently at {{ $value }}" 48 | 49 | - alert: celestia_appd_is_not_running 50 | expr: 
node_systemd_unit_state{name=~"celestia-appd.*", state="active"} == 0 51 | for: 1m 52 | labels: 53 | severity: warning 54 | annotations: 55 | summary: "Celestia Appd service is not running (instance {{ $labels.instance }})" 56 | description: "Celestia Appd service is not running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" -------------------------------------------------------------------------------- /grafana/README.md: -------------------------------------------------------------------------------- 1 | # Monitoring stack for Celestia Validator 2 | 3 | ## Description 4 | This Monitoring stack is made of : 5 | - grafana for the viewing of the graph 6 | - node_exporter to monitor your host (and the celestia services) 7 | - prometheus to capture the metrics and make it available for Grafana 8 | - loki to display logs 9 | - promtail to send logs to loki 10 | - alertmanager integrated with pagerduty to receive alert 11 | - local otel collector configured to forward the DA node metrics to CELESTIA_OTEL and expose the DA node prometheus metrics (:25660) 12 | 13 | ## Prereq 14 | 15 | - celestia-appd and celestia needs to be installed with systemd so logs are available in journalctl. 
Name of the unit file must match the official doc : https://docs.celestia.org/nodes/systemd/ 16 | - celestia-appd needs to be configured to allow prometheus telemetry 17 | - celestia node needs to be configured with metrics : `-metrics.tls=false --metrics --metrics.endpoint 127.0.0.1:4318` 18 | - install docker - sudo/root privilege 19 | 20 | ```bash 21 | # install docker / docker-compose 22 | sudo apt update 23 | sudo apt install -y ca-certificates curl gnupg lsb-release 24 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg 25 | echo \ 26 | "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \ 27 | $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 28 | sudo apt update 29 | sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin 30 | sudo usermod -aG docker $USER #you need to logout and login back after that 31 | ``` 32 | 33 | ## Installing the stack 34 | 35 | ### Clone the repo 36 | 37 | ```bash 38 | git clone https://github.com/P-OPSTeam/celestia-tools 39 | cd celestia-tools/grafana 40 | ``` 41 | 42 | ### Update start.sh 43 | 44 | - update the admin/password of your grafana 45 | - Next, If you wanna be alerted, you will need to create an account on pagerduty and get your integration key https://support.pagerduty.com/docs/services-and-integrations 46 | 47 | > alertmanager will fail to start if the PD integration key is not filled up 48 | 49 | 50 | ### Start the stack 51 | 52 | ```bash 53 | bash start.sh 54 | ``` 55 | 56 | ### Check the documentation 57 | 58 | - [Grafana documentation](docs/grafana.md) 59 | 60 | ## TODO 61 | 62 | - [ ] TBD 63 | 64 | 65 | -------------------------------------------------------------------------------- /grafana/docs/grafana.md: 
-------------------------------------------------------------------------------- 1 | # Grafana 2 | 3 | ## What it is ? 4 | 5 | Grafana open source software enables you to query, visualize, alert on, and explore your metrics, logs, and traces wherever they are stored. Extracted from and feel free to read https://grafana.com/docs/grafana/latest/introduction/ for more information 6 | 7 | ## Login to the grafana page 8 | 9 | Obtain the IP of your grafana stack (usually your validator ip), basically the one you used to install the stack and access grafana by browsing http://grafana_stack_ip:3000 10 | 11 | Grafana will then load and will present you with the login page 12 | 13 | ![Grafana login screen](img/grafana-login.png?raw=true "Grafana login screen") 14 | 15 | Enter the username and password as defined entered in start.sh 16 | 17 | You will then be on the Grafana welcome page 18 | 19 | ![Grafana welcome](img/grafana-welcome.png?raw=true "Grafana welcome") 20 | 21 | ## Open a dashboard 22 | 23 | Click on for the `4 squares` on the left menu, then `browse`, then `General`, and `Cosmos/Tendermint Network dashboard` 24 | 25 | You will then be able to see your validator dashboard : 26 | 27 | ![Validator dashboard](img/grafana-dashboard.png?raw=true "Validator dashboard") 28 | 29 | Feel free to explore the other dashboard and ask us any questions. 
You can reach out to us on discord https://discord.gg/jRAmy7uS8v or telegram https://t.me/POPS_Team_Validator 30 | 31 | 32 | ## Accessing your validator logs (loki) 33 | 34 | Click on the `compass` on the left menu, then `Explorer`, on top click `Prometheus` and change it to `Loki` 35 | 36 | ![Grafana explore](img/grafana-explore.png?raw=true "Grafana explore") 37 | 38 | You should now be able to use the `builder` to select a label (ie `unit`) and the value ie `celestia-appd` then click on the `Run Query` to see the logs 39 | 40 | ![Grafana celestia-appd logs](img/grafana-logs-celestia-appd.png?raw=true "Grafana celestia-appd logs") 41 | 42 | Below an example with the logs of the loki container 43 | 44 | ![Grafana logs](img/grafana-logs.png?raw=true "Grafana logs") 45 | 46 | > Note that logs rely on another component of the stack called promtail and is used to send the logs to loki. Freel free to checkout `conf/promtail.yaml` configuration file 47 | 48 | ## What about the alerts ? 49 | 50 | Click on the bell on the left menu then click on `Alert rules`. You'll see the loki and prometheus rules 51 | 52 | ![Grafana Alert](img/grafana-alerts.png?raw=true "Grafana Alert") 53 | 54 | > Free to navigate into all the defined rules which you can find the same under the `conf/prometheus/rules` and `conf/loki/rules` folder 55 | -------------------------------------------------------------------------------- /blobstreamx-installer/README.md: -------------------------------------------------------------------------------- 1 | # P-OPS BlobstreamX installer 2 | 3 | Thanks to the succinct team to help out with the blobstream installation, here is the blobstreamx installer script. 
4 | 5 | # Prereq 6 | 7 | Tested and installed with the below 8 | 9 | ## Hardware 10 | 11 | AWS instance r6x.8xlarge (or 32vcpu 256GiB Ram) + 150 GB ebs gp2 12 | 13 | ## OS 14 | 15 | ubuntu 22 16 | 17 | ## Software 18 | 19 | docker, rust 20 | 21 | ## Relay 22 | your relayer address will need to be whitelisted 23 | 24 | # Install steps 25 | 26 | # Info gathering 27 | 28 | ## Succinct Explorer Release ID 29 | Get the explorer release ID of both header range/next header release by looking at the URL column in the the table there : https://hackmd.io/@succinctlabs/HJE7XRrup#Download-Blobstream-X-Plonky2x-Circuits 30 | 31 | > ie 33 in https://alpha.succinct.xyz/celestia/blobstreamx/releases/33 32 | 33 | ## Verifier build 34 | Confirm the verifier build tar.gz https://hackmd.io/@succinctlabs/HJE7XRrup#Download-Verifier-Groth16-Wrapper-Circuit 35 | 36 | > ie verifier-build13.tar.gz 37 | 38 | ## Install 39 | 40 | ### Clone repo 41 | 42 | ```bash 43 | git clone https://github.com/P-OPSTeam/celestia-tools.git 44 | cd celestia-tools/blobstreamx-installer 45 | ``` 46 | 47 | ### Copy and edit .env.local 48 | 49 | ```bash 50 | cp .env.local.example .env.local 51 | ``` 52 | use your favorite editor to edit .env.local 53 | 54 | > .env.local variable will be sourced by the script 55 | 56 | ### Install docker 57 | 58 | ```bash 59 | bash install-docker.sh 60 | ``` 61 | 62 | logout, login back to your instance and come back to the installer directory 63 | 64 | ```bash 65 | cd celestia-tools/blobstreamx-installer 66 | ``` 67 | 68 | ### Install blobstreamx 69 | 70 | ```bash 71 | bash install.sh 72 | ``` 73 | 74 | > as of April 7: bash install.sh verifier-build13.tar.gz 36 35 75 | 76 | ### check your ~/blobstreamx/.env is good 77 | 78 | you should have something similar to the below for the arbitrum sepolia 79 | 80 | ``` 81 | # Relayer private key 82 | PRIVATE_KEY= 83 | # Chainid of the chain the proof are supposed to be submitted to 84 | CHAINID=421614 85 | 
RPC_URL=https://arbitrum-sepolia.blockpi.network/v1/rpc/public 86 | TENDERMINT_RPC_URL=https://rpc-mocha.pops.one 87 | CONTRACT_ADDRESS=0xc3e209eb245Fd59c8586777b499d6A665DF3ABD2 88 | 89 | # update the below accorindgly to the expected binary downloaded from the release 90 | #PROVE_BINARY_NEXT_HEADER="./artifacts/next-header-mocha/next_header_mocha" 91 | #PROVE_BINARY_HEADER_RANGE="./artifacts/header-range-mocha/header_range_mocha" 92 | WRAPPER_BINARY="./artifacts/verifier-build" 93 | ``` 94 | 95 | ### run the prover/relayer 96 | 97 | ```bash 98 | cargo run --bin blobstreamx --release 99 | ``` -------------------------------------------------------------------------------- /grafana/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.3' 2 | 3 | services: 4 | prometheus: 5 | image: prom/prometheus:v2.47.0 6 | container_name: prometheus 7 | user: ${USERID:-1000}:${USERGP:-1000} 8 | command: 9 | - --log.level=warn 10 | - --config.file=/etc/prometheus/prometheus.yml 11 | - --storage.tsdb.path=/prometheus 12 | - --storage.tsdb.retention.time=360d 13 | - --web.console.libraries=/usr/share/prometheus/console_libraries 14 | - --web.console.templates=/usr/share/prometheus/consoles 15 | ports: 16 | - "19090:9090" 17 | depends_on: 18 | - alertmanager 19 | volumes: 20 | - ${PROMETHEUS_CONFIG_PATH:-./conf/prometheus/}:/etc/prometheus/:ro 21 | - ${XDG_DATA_HOME:-./data}/prometheus:/prometheus 22 | restart: unless-stopped 23 | 24 | alertmanager: 25 | image: prom/alertmanager:v0.26.0 26 | container_name: alertmanager 27 | volumes: 28 | - ./conf/alertmanager.yaml:/etc/alertmanager/alertmanager.yaml:ro 29 | - ${XDG_DATA_HOME:-./data}/alertmanager:/data 30 | command: 31 | - --config.file=/etc/alertmanager/alertmanager.yaml 32 | - --storage.path=/data 33 | - --log.level=debug 34 | restart: unless-stopped 35 | user: ${USERID:-1000}:${USERGP:-1000} 36 | ports: 37 | - "9093:9093" 38 | 39 | grafana: 40 | image: 
grafana/grafana:10.1.4 41 | user: ${USERID:-1000}:${USERGP:-1000} 42 | container_name: grafana 43 | ports: 44 | - 3000:3000 45 | environment: 46 | - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin} 47 | - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} 48 | volumes: 49 | - ${GRAFANA_CONFIG_PATH:-./conf/grafana/grafana.ini}:/etc/grafana/grafana.ini 50 | - ./conf/grafana/datasources:/etc/grafana/provisioning/datasources 51 | - ./conf/grafana/dashboards:/etc/grafana/provisioning/dashboards 52 | - ./conf/grafana/notifiers:/etc/grafana/provisioning/notifiers 53 | - ${XDG_DATA_HOME:-./data}/grafana:/var/lib/grafana 54 | restart: unless-stopped 55 | 56 | cadvisor: 57 | image: gcr.io/cadvisor/cadvisor:v0.47.1 58 | container_name: cadvisor 59 | restart: unless-stopped 60 | volumes: 61 | - /:/rootfs:ro 62 | - /var/run:/var/run:rw 63 | - /sys:/sys:ro 64 | - /var/lib/docker/:/var/lib/docker:ro 65 | expose: 66 | - 8080 67 | 68 | node-exporter: 69 | image: prom/node-exporter:v1.6.1 70 | container_name: node_exporter 71 | restart: unless-stopped 72 | command: 73 | - '--path.rootfs=/host' 74 | - '--collector.systemd' 75 | pid: host 76 | volumes: 77 | - '/:/host:ro,rslave' 78 | expose: 79 | - 9100 80 | 81 | loki: 82 | container_name: loki 83 | image: grafana/loki:2.9.1 84 | ports: 85 | - 3100:3100 86 | user: ${USERID:-1000}:${USERGP:-1000} 87 | volumes: 88 | - ./conf/loki/loki.yaml:/etc/loki/loki.yaml:ro 89 | - ${XDG_DATA_HOME:-./data}/loki:/mnt 90 | - ./conf/loki/rules:/etc/loki/rules/fake:ro 91 | command: -config.file=/etc/loki/loki.yaml 92 | 93 | promtail: 94 | container_name: promtail 95 | image: grafana/promtail:2.9.1 96 | volumes: 97 | - /var/log:/var/log:ro 98 | - ./conf/promtail.yaml:/etc/promtail/promtail.yaml:ro 99 | - /var/run/docker.sock:/var/run/docker.sock:ro 100 | command: -config.file=/etc/promtail/promtail.yaml 101 | 102 | otel-collector: 103 | container_name: otel-collector 104 | image: otel/opentelemetry-collector:0.86.0 105 | command: 
["--config=/etc/otel-collector-config.yaml"] 106 | volumes: 107 | - ./conf/otel-collector-config.yaml:/etc/otel-collector-config.yaml 108 | ports: 109 | - ${OTEL_GRPC_PORT:-4317}:${OTEL_GRPC_PORT:-4317} 110 | - ${OTEL_HTTP_PORT:-4318}:${OTEL_HTTP_PORT:-4318} 111 | - ${OTEL_PROMETHEUS_EXPORTER:-8889}:${OTEL_PROMETHEUS_EXPORTER:-8889} 112 | -------------------------------------------------------------------------------- /blobstreamx-installer/install.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # script to install blobstreamX prover/relayer 4 | 5 | if [[ -z $1 || -z $2 || -z $3 ]]; then 6 | echo "./install.sh VERIFIER_BUILD HEADER_RANGE_EXPLORER_RELEASE_ID NEXT_HEADER_EXPLORER_RELEASE_ID" 7 | exit 1 8 | fi 9 | 10 | # ie verifier-build13.tar.gz 11 | VERIFIER_BUILD=$1 12 | # ie 33 in https://alpha.succinct.xyz/celestia/blobstreamx/releases/33 13 | HEADER_RANGE_EXPLORER_RELEASE_ID=$2 14 | NEXT_HEADER_EXPLORER_RELEASE_ID=$3 15 | 16 | source $HOME/celestia-tools/blobstreamx-installer/.env.local 17 | 18 | echo "Update apt and make sure required packages are installed" 19 | sudo apt update 20 | sudo apt install -y aria2 jq tree git build-essential libssl-dev pkg-config 21 | 22 | echo 23 | echo "Get HEADER_RANGE info from succinct explorer https://alpha.succinct.xyz/celestia/blobstreamx/releases/$HEADER_RANGE_EXPLORER_RELEASE_ID" 24 | header_info_json=$(curl -s https://alpha.succinct.xyz/api/projects/celestia/blobstreamx/releases/$HEADER_RANGE_EXPLORER_RELEASE_ID) 25 | 26 | hr_release_id=$(echo $header_info_json | jq -r .id) 27 | hr_most_recent_function_id=$(echo "$header_info_json" | jq -r --arg CHAINID "$CHAINID" '.edges.deployments | map(select(.chain_id == ($CHAINID | tonumber))) | max_by(.updated_at) | .function_id') 28 | 29 | echo 30 | echo "Get NEXT_HEADER info from succinct explorer https://alpha.succinct.xyz/celestia/blobstreamx/releases/$NEXT_HEADER_EXPLORER_RELEASE_ID" 31 | next_header_json=$(curl -s 
https://alpha.succinct.xyz/api/projects/celestia/blobstreamx/releases/$NEXT_HEADER_EXPLORER_RELEASE_ID) 32 | 33 | nh_release_id=$(echo $next_header_json | jq -r .id) 34 | nh_most_recent_function_id=$(echo "$next_header_json" | jq -r --arg CHAINID "$CHAINID" '.edges.deployments | map(select(.chain_id == ($CHAINID | tonumber))) | max_by(.updated_at) | .function_id') 35 | 36 | echo "info obtained from succinct explorer:" 37 | echo "- header range release id : $hr_release_id" 38 | echo "- header range function id : $hr_most_recent_function_id" 39 | echo "- next header release id : $nh_release_id" 40 | echo "- next header function id : $nh_most_recent_function_id" 41 | 42 | if [[ -z $hr_release_id || -z $hr_most_recent_function_id || -z $nh_release_id || -z $nh_most_recent_function_id ]]; then 43 | echo "none of the 4 information above should be empty" 44 | exit 1 45 | fi 46 | 47 | echo 48 | echo "clone blobstream project" 49 | cd $HOME 50 | git clone https://github.com/succinctlabs/blobstreamx 51 | cd blobstreamx; mkdir artifacts; cd artifacts; 52 | 53 | echo 54 | echo "Download Verifier : $VERIFIER_BUILD" 55 | aria2c -s14 -x14 -k100M https://public-circuits.s3.amazonaws.com/$VERIFIER_BUILD 56 | echo 57 | echo "Install Verifier : $VERIFIER_BUILD" 58 | tar xzf $VERIFIER_BUILD 59 | 60 | echo 61 | echo "Download header range circuits files from $hr_release_id" 62 | aria2c -s14 -x14 -k100M https://public-blobstreamx-circuits.s3.amazonaws.com/${hr_release_id}.tar.gz 63 | echo 64 | echo "Install header range" 65 | tar xzf ${hr_release_id}.tar.gz 66 | echo 67 | echo "Determine header range folder and binary name" 68 | hr_folder_name=$(find . -maxdepth 1 -type d \( -name "*range*" \)) 69 | hr_binary_name=$(basename $(find "$hr_folder_name" -type f \( -name "*range*" \) ! 
-name "._*")) 70 | chmod +x ${hr_folder_name}/${hr_binary_name} 71 | 72 | echo 73 | echo "Download next header circuits files from $nh_release_id" 74 | aria2c -s14 -x14 -k100M https://public-blobstreamx-circuits.s3.amazonaws.com/${nh_release_id}.tar.gz 75 | echo 76 | echo "Install next header" 77 | tar xzf ${nh_release_id}.tar.gz 78 | echo 79 | echo "Determine next header folder and binary name" 80 | nh_folder_name=$(find . -maxdepth 1 -type d \( -name "*next*" \)) 81 | nh_binary_name=$(basename $(find "$nh_folder_name" -type f \( -name "*next*" \) ! -name "._*")) 82 | chmod +x ${nh_folder_name}/${nh_binary_name} 83 | 84 | echo 85 | echo "Update .env file" 86 | 87 | cp $HOME/celestia-tools/blobstreamx-installer/.env.tpl $HOME/blobstreamx/.env 88 | 89 | sed -i "s|<>|${PRIVATE_KEY}|g" $HOME/blobstreamx/.env 90 | sed -i "s|<>|${RPC_URL}|g" $HOME/blobstreamx/.env 91 | sed -i "s|<>|${TENDERMINT_RPC_URL}|g" $HOME/blobstreamx/.env 92 | sed -i "s|<>|${CHAINID}|g" $HOME/blobstreamx/.env 93 | sed -i "s|<>|${CONTRACT_ADDRESS}|g" $HOME/blobstreamx/.env 94 | sed -i "s|<<0xNEXT_HEADER_FUNCTION_ID>>|${nh_most_recent_function_id}|g" $HOME/blobstreamx/.env 95 | sed -i "s|<<0xHEADER_RANGE_FUNCTION_ID>>|${hr_most_recent_function_id}|g" $HOME/blobstreamx/.env 96 | sed -i "s|<>|\./artifacts/${nh_folder_name}/${nh_binary_name}|g" $HOME/blobstreamx/.env 97 | sed -i "s|<>|\./artifacts/${hr_folder_name}/${hr_binary_name}|g" $HOME/blobstreamx/.env 98 | sed -i "s|<>|${WRAPPER_BINARY}|g" $HOME/blobstreamx/.env 99 | 100 | # docker needs to be installed before hand bash install-docker.sh 101 | echo 102 | echo "pull latest succinct-local-prover image" 103 | docker pull succinctlabs/succinct-local-prover 104 | 105 | echo 106 | echo "Install rust" 107 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 108 | 109 | 110 | source ~/.profile 111 | 112 | echo "check $HOME/blobstreamx/.env see if all ok" 113 | echo "then start proving and/or relaying with cargo run --bin 
blobstreamx --release" -------------------------------------------------------------------------------- /grafana/conf/grafana/dashboards/process_exporter.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "DS_PROMETHEUS", 5 | "label": "prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "panel", 15 | "id": "graph", 16 | "name": "Graph", 17 | "version": "" 18 | }, 19 | { 20 | "type": "grafana", 21 | "id": "grafana", 22 | "name": "Grafana", 23 | "version": "3.1.1" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "prometheus", 28 | "name": "Prometheus", 29 | "version": "1.0.0" 30 | } 31 | ], 32 | "id": null, 33 | "title": "Processes monitoring", 34 | "tags": [ 35 | "OS" 36 | ], 37 | "style": "dark", 38 | "timezone": "browser", 39 | "editable": true, 40 | "hideControls": false, 41 | "sharedCrosshair": true, 42 | "rows": [ 43 | { 44 | "collapse": false, 45 | "editable": true, 46 | "height": "250px", 47 | "panels": [ 48 | { 49 | "aliasColors": {}, 50 | "bars": false, 51 | "datasource": "Prometheus", 52 | "editable": true, 53 | "error": false, 54 | "fill": 1, 55 | "grid": { 56 | "threshold1": null, 57 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 58 | "threshold2": null, 59 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 60 | }, 61 | "id": 1, 62 | "isNew": true, 63 | "legend": { 64 | "avg": false, 65 | "current": false, 66 | "max": false, 67 | "min": false, 68 | "show": true, 69 | "total": false, 70 | "values": false 71 | }, 72 | "lines": true, 73 | "linewidth": 2, 74 | "links": [], 75 | "nullPointMode": "null", 76 | "percentage": false, 77 | "pointradius": 5, 78 | "points": false, 79 | "renderer": "flot", 80 | "seriesOverrides": [], 81 | "span": 6, 82 | "stack": false, 83 | "steppedLine": false, 84 | "targets": [ 85 | { 86 | "expr": 
"namedprocess_namegroup_num_procs{groupname=~\"$processes\"}", 87 | "intervalFactor": 2, 88 | "legendFormat": "{{groupname}}", 89 | "metric": "process_namegroup_num_procs", 90 | "refId": "A", 91 | "step": 10 92 | } 93 | ], 94 | "timeFrom": null, 95 | "timeShift": null, 96 | "title": "num processes", 97 | "tooltip": { 98 | "msResolution": false, 99 | "shared": true, 100 | "sort": 0, 101 | "value_type": "individual" 102 | }, 103 | "type": "graph", 104 | "xaxis": { 105 | "show": true 106 | }, 107 | "yaxes": [ 108 | { 109 | "format": "short", 110 | "label": null, 111 | "logBase": 1, 112 | "max": null, 113 | "min": null, 114 | "show": true 115 | }, 116 | { 117 | "format": "short", 118 | "label": null, 119 | "logBase": 1, 120 | "max": null, 121 | "min": null, 122 | "show": true 123 | } 124 | ] 125 | }, 126 | { 127 | "aliasColors": {}, 128 | "bars": false, 129 | "datasource": "Prometheus", 130 | "editable": true, 131 | "error": false, 132 | "fill": 1, 133 | "grid": { 134 | "threshold1": null, 135 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 136 | "threshold2": null, 137 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 138 | }, 139 | "id": 2, 140 | "isNew": true, 141 | "legend": { 142 | "avg": false, 143 | "current": false, 144 | "max": false, 145 | "min": false, 146 | "show": true, 147 | "total": false, 148 | "values": false 149 | }, 150 | "lines": true, 151 | "linewidth": 2, 152 | "links": [], 153 | "nullPointMode": "null", 154 | "percentage": false, 155 | "pointradius": 5, 156 | "points": false, 157 | "renderer": "flot", 158 | "seriesOverrides": [], 159 | "span": 6, 160 | "stack": false, 161 | "steppedLine": false, 162 | "targets": [ 163 | { 164 | "expr": "rate(namedprocess_namegroup_cpu_seconds_total{groupname=~\"$processes\"}[$interval])", 165 | "intervalFactor": 2, 166 | "legendFormat": "{{groupname}}", 167 | "metric": "process_namegroup_cpu_seconds_total", 168 | "refId": "A", 169 | "step": 10 170 | } 171 | ], 172 | "timeFrom": null, 173 | "timeShift": null, 174 
| "title": "cpu", 175 | "tooltip": { 176 | "msResolution": false, 177 | "shared": true, 178 | "sort": 0, 179 | "value_type": "cumulative" 180 | }, 181 | "type": "graph", 182 | "xaxis": { 183 | "show": true 184 | }, 185 | "yaxes": [ 186 | { 187 | "format": "s", 188 | "label": null, 189 | "logBase": 1, 190 | "max": null, 191 | "min": 0, 192 | "show": true 193 | }, 194 | { 195 | "format": "short", 196 | "label": null, 197 | "logBase": 1, 198 | "max": null, 199 | "min": null, 200 | "show": true 201 | } 202 | ] 203 | }, 204 | { 205 | "aliasColors": {}, 206 | "bars": false, 207 | "datasource": "Prometheus", 208 | "editable": true, 209 | "error": false, 210 | "fill": 1, 211 | "grid": { 212 | "threshold1": null, 213 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 214 | "threshold2": null, 215 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 216 | }, 217 | "id": 3, 218 | "isNew": true, 219 | "legend": { 220 | "avg": false, 221 | "current": false, 222 | "max": false, 223 | "min": false, 224 | "show": true, 225 | "total": false, 226 | "values": false 227 | }, 228 | "lines": true, 229 | "linewidth": 2, 230 | "links": [], 231 | "nullPointMode": "null", 232 | "percentage": false, 233 | "pointradius": 5, 234 | "points": false, 235 | "renderer": "flot", 236 | "seriesOverrides": [], 237 | "span": 6, 238 | "stack": false, 239 | "steppedLine": false, 240 | "targets": [ 241 | { 242 | "expr": "rate(namedprocess_namegroup_read_bytes_total{groupname=~\"$processes\"}[$interval])", 243 | "intervalFactor": 2, 244 | "legendFormat": "{{groupname}}", 245 | "metric": "namedprocess_namegroup_read_bytes_total", 246 | "refId": "A", 247 | "step": 10 248 | } 249 | ], 250 | "timeFrom": null, 251 | "timeShift": null, 252 | "title": "read bytes", 253 | "tooltip": { 254 | "msResolution": false, 255 | "shared": true, 256 | "sort": 0, 257 | "value_type": "individual" 258 | }, 259 | "type": "graph", 260 | "xaxis": { 261 | "show": true 262 | }, 263 | "yaxes": [ 264 | { 265 | "format": "Bps", 266 | "label": 
null, 267 | "logBase": 1, 268 | "max": null, 269 | "min": 0, 270 | "show": true 271 | }, 272 | { 273 | "format": "short", 274 | "label": null, 275 | "logBase": 1, 276 | "max": null, 277 | "min": null, 278 | "show": true 279 | } 280 | ] 281 | }, 282 | { 283 | "aliasColors": {}, 284 | "bars": false, 285 | "datasource": "Prometheus", 286 | "editable": true, 287 | "error": false, 288 | "fill": 1, 289 | "grid": { 290 | "threshold1": null, 291 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 292 | "threshold2": null, 293 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 294 | }, 295 | "id": 4, 296 | "isNew": true, 297 | "legend": { 298 | "avg": false, 299 | "current": false, 300 | "max": false, 301 | "min": false, 302 | "show": true, 303 | "total": false, 304 | "values": false 305 | }, 306 | "lines": true, 307 | "linewidth": 2, 308 | "links": [], 309 | "nullPointMode": "null", 310 | "percentage": false, 311 | "pointradius": 5, 312 | "points": false, 313 | "renderer": "flot", 314 | "seriesOverrides": [], 315 | "span": 6, 316 | "stack": false, 317 | "steppedLine": false, 318 | "targets": [ 319 | { 320 | "expr": "rate(namedprocess_namegroup_write_bytes_total{groupname=~\"$processes\"}[$interval])", 321 | "intervalFactor": 2, 322 | "legendFormat": "{{groupname}}", 323 | "metric": "namedprocess_namegroup_read_bytes_total", 324 | "refId": "A", 325 | "step": 10 326 | } 327 | ], 328 | "timeFrom": null, 329 | "timeShift": null, 330 | "title": "write bytes", 331 | "tooltip": { 332 | "msResolution": false, 333 | "shared": true, 334 | "sort": 0, 335 | "value_type": "individual" 336 | }, 337 | "type": "graph", 338 | "xaxis": { 339 | "show": true 340 | }, 341 | "yaxes": [ 342 | { 343 | "format": "Bps", 344 | "label": null, 345 | "logBase": 1, 346 | "max": null, 347 | "min": 0, 348 | "show": true 349 | }, 350 | { 351 | "format": "short", 352 | "label": null, 353 | "logBase": 1, 354 | "max": null, 355 | "min": null, 356 | "show": true 357 | } 358 | ] 359 | }, 360 | { 361 | "aliasColors": 
{}, 362 | "bars": false, 363 | "datasource": "Prometheus", 364 | "editable": true, 365 | "error": false, 366 | "fill": 1, 367 | "grid": { 368 | "threshold1": null, 369 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 370 | "threshold2": null, 371 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 372 | }, 373 | "id": 5, 374 | "isNew": true, 375 | "legend": { 376 | "avg": false, 377 | "current": false, 378 | "max": false, 379 | "min": false, 380 | "show": true, 381 | "total": false, 382 | "values": false 383 | }, 384 | "lines": true, 385 | "linewidth": 2, 386 | "links": [], 387 | "nullPointMode": "null", 388 | "percentage": false, 389 | "pointradius": 5, 390 | "points": false, 391 | "renderer": "flot", 392 | "seriesOverrides": [], 393 | "span": 6, 394 | "stack": false, 395 | "steppedLine": false, 396 | "targets": [ 397 | { 398 | "expr": "namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"resident\"}", 399 | "intervalFactor": 2, 400 | "legendFormat": "{{groupname}}", 401 | "metric": "namedprocess_namegroup_memory_bytes", 402 | "refId": "A", 403 | "step": 10 404 | } 405 | ], 406 | "timeFrom": null, 407 | "timeShift": null, 408 | "title": "resident memory", 409 | "tooltip": { 410 | "msResolution": false, 411 | "shared": true, 412 | "sort": 0, 413 | "value_type": "individual" 414 | }, 415 | "type": "graph", 416 | "xaxis": { 417 | "show": true 418 | }, 419 | "yaxes": [ 420 | { 421 | "format": "bytes", 422 | "label": null, 423 | "logBase": 1, 424 | "max": null, 425 | "min": 0, 426 | "show": true 427 | }, 428 | { 429 | "format": "short", 430 | "label": null, 431 | "logBase": 1, 432 | "max": null, 433 | "min": null, 434 | "show": true 435 | } 436 | ] 437 | }, 438 | { 439 | "aliasColors": {}, 440 | "bars": false, 441 | "datasource": "Prometheus", 442 | "editable": true, 443 | "error": false, 444 | "fill": 1, 445 | "grid": { 446 | "threshold1": null, 447 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 448 | "threshold2": null, 449 | "threshold2Color": 
"rgba(234, 112, 112, 0.22)" 450 | }, 451 | "id": 6, 452 | "isNew": true, 453 | "legend": { 454 | "avg": false, 455 | "current": false, 456 | "max": false, 457 | "min": false, 458 | "show": true, 459 | "total": false, 460 | "values": false 461 | }, 462 | "lines": true, 463 | "linewidth": 2, 464 | "links": [], 465 | "nullPointMode": "null", 466 | "percentage": false, 467 | "pointradius": 5, 468 | "points": false, 469 | "renderer": "flot", 470 | "seriesOverrides": [], 471 | "span": 6, 472 | "stack": false, 473 | "steppedLine": false, 474 | "targets": [ 475 | { 476 | "expr": "namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"virtual\"}", 477 | "intervalFactor": 2, 478 | "legendFormat": "{{groupname}}", 479 | "metric": "namedprocess_namegroup_memory_bytes", 480 | "refId": "A", 481 | "step": 10 482 | } 483 | ], 484 | "timeFrom": null, 485 | "timeShift": null, 486 | "title": "virtual memory", 487 | "tooltip": { 488 | "msResolution": false, 489 | "shared": true, 490 | "sort": 0, 491 | "value_type": "individual" 492 | }, 493 | "type": "graph", 494 | "xaxis": { 495 | "show": true 496 | }, 497 | "yaxes": [ 498 | { 499 | "format": "bytes", 500 | "label": null, 501 | "logBase": 1, 502 | "max": null, 503 | "min": 0, 504 | "show": true 505 | }, 506 | { 507 | "format": "short", 508 | "label": null, 509 | "logBase": 1, 510 | "max": null, 511 | "min": null, 512 | "show": true 513 | } 514 | ] 515 | } 516 | ], 517 | "title": "Row" 518 | } 519 | ], 520 | "time": { 521 | "from": "now-1h", 522 | "to": "now" 523 | }, 524 | "timepicker": { 525 | "refresh_intervals": [ 526 | "5s", 527 | "10s", 528 | "30s", 529 | "1m", 530 | "5m", 531 | "15m", 532 | "30m", 533 | "1h", 534 | "2h", 535 | "1d" 536 | ], 537 | "time_options": [ 538 | "5m", 539 | "15m", 540 | "1h", 541 | "6h", 542 | "12h", 543 | "24h", 544 | "2d", 545 | "7d", 546 | "30d" 547 | ] 548 | }, 549 | "templating": { 550 | "list": [ 551 | { 552 | "current": { 553 | "selected": false, 554 | "text": "10m", 555 | 
"value": "10m" 556 | }, 557 | "datasource": null, 558 | "hide": 0, 559 | "includeAll": false, 560 | "multi": false, 561 | "name": "interval", 562 | "options": [ 563 | { 564 | "selected": false, 565 | "text": "1m", 566 | "value": "1m" 567 | }, 568 | { 569 | "selected": false, 570 | "text": "5m", 571 | "value": "5m" 572 | }, 573 | { 574 | "selected": true, 575 | "text": "10m", 576 | "value": "10m" 577 | }, 578 | { 579 | "selected": false, 580 | "text": "30m", 581 | "value": "30m" 582 | }, 583 | { 584 | "selected": false, 585 | "text": "1h", 586 | "value": "1h" 587 | }, 588 | { 589 | "selected": false, 590 | "text": "6h", 591 | "value": "6h" 592 | }, 593 | { 594 | "selected": false, 595 | "text": "12h", 596 | "value": "12h" 597 | }, 598 | { 599 | "selected": false, 600 | "text": "1d", 601 | "value": "1d" 602 | }, 603 | { 604 | "selected": false, 605 | "text": "7d", 606 | "value": "7d" 607 | }, 608 | { 609 | "selected": false, 610 | "text": "14d", 611 | "value": "14d" 612 | }, 613 | { 614 | "selected": false, 615 | "text": "30d", 616 | "value": "30d" 617 | } 618 | ], 619 | "query": "1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", 620 | "refresh": 0, 621 | "type": "interval" 622 | }, 623 | { 624 | "allValue": ".+", 625 | "current": {}, 626 | "datasource": "Prometheus", 627 | "hide": 0, 628 | "includeAll": true, 629 | "multi": true, 630 | "name": "processes", 631 | "options": [], 632 | "query": "label_values(namedprocess_namegroup_cpu_seconds_total,groupname)", 633 | "refresh": 1, 634 | "type": "query" 635 | } 636 | ] 637 | }, 638 | "annotations": { 639 | "list": [] 640 | }, 641 | "refresh": "10s", 642 | "schemaVersion": 12, 643 | "version": 1, 644 | "links": [ 645 | { 646 | "asDropdown": true, 647 | "icon": "external link", 648 | "includeVars": true, 649 | "keepTime": true, 650 | "tags": [ 651 | "OS" 652 | ], 653 | "title": "OS", 654 | "type": "dashboards" 655 | } 656 | ], 657 | "gnetId": 249, 658 | "description": "Process metrics exported by 
https://github.com/ncabatoff/process-exporter." 659 | } -------------------------------------------------------------------------------- /grafana/conf/prometheus/rules/alert.rules.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: prometheus 3 | rules: 4 | - alert: PrometheusJobMissing 5 | expr: absent(up{job="prometheus"}) 6 | for: 0m 7 | labels: 8 | severity: warning 9 | annotations: 10 | summary: Prometheus job missing (instance {{ $labels.instance }}) 11 | description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 12 | 13 | - alert: PrometheusTargetMissing 14 | expr: up == 0 15 | for: 10m 16 | labels: 17 | severity: critical 18 | annotations: 19 | summary: Prometheus target missing (instance {{ $labels.instance }}) 20 | description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 21 | 22 | - alert: PrometheusAlertmanagerConfigurationReloadFailure 23 | expr: alertmanager_config_last_reload_successful != 1 24 | for: 0m 25 | labels: 26 | severity: warning 27 | annotations: 28 | summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) 29 | description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 30 | 31 | - alert: PrometheusNotConnectedToAlertmanager 32 | expr: prometheus_notifications_alertmanagers_discovered < 1 33 | for: 0m 34 | labels: 35 | severity: critical 36 | annotations: 37 | summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) 38 | description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 39 | 40 | - name: Hosts 41 | rules: 42 | - alert: HostOutOfMemory 43 | expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 44 | for: 2m 45 | labels: 46 | severity: warning 47 | annotations: 48 | summary: Host out of 
memory (instance {{ $labels.instance }}) 49 | description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 50 | 51 | - alert: HostMemoryUnderMemoryPressure 52 | expr: rate(node_vmstat_pgmajfault[1m]) > 1000 53 | for: 2m 54 | labels: 55 | severity: warning 56 | annotations: 57 | summary: Host memory under memory pressure (instance {{ $labels.instance }}) 58 | description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 59 | 60 | - alert: HostUnusualNetworkThroughputIn 61 | expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 62 | for: 5m 63 | labels: 64 | severity: warning 65 | annotations: 66 | summary: Host unusual network throughput in (instance {{ $labels.instance }}) 67 | description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 68 | 69 | - alert: HostUnusualNetworkThroughputOut 70 | expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100 71 | for: 5m 72 | labels: 73 | severity: warning 74 | annotations: 75 | summary: Host unusual network throughput out (instance {{ $labels.instance }}) 76 | description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 77 | 78 | - alert: HostUnusualDiskReadRate 79 | expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 80 | for: 5m 81 | labels: 82 | severity: warning 83 | annotations: 84 | summary: Host unusual disk read rate (instance {{ $labels.instance }}) 85 | description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 86 | 87 | - alert: HostUnusualDiskWriteRate 88 | expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 150 89 | for: 5m 90 | labels: 91 | severity: warning 92 | 
annotations: 93 | summary: Host unusual disk write rate (instance {{ $labels.instance }}) 94 | description: "Disk is probably writing too much data (> 150 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 95 | 96 | # Please add ignored mountpoints in node_exporter parameters like 97 | # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". 98 | # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 99 | - alert: HostOutOfDiskSpace 100 | expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 101 | for: 2m 102 | labels: 103 | severity: warning 104 | annotations: 105 | summary: Host out of disk space (instance {{ $labels.instance }}) 106 | description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 107 | 108 | # Please add ignored mountpoints in node_exporter parameters like 109 | # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". 110 | # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 
111 | - alert: HostDiskWillFillIn24Hours 112 | expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 113 | for: 2m 114 | labels: 115 | severity: warning 116 | annotations: 117 | summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) 118 | description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 119 | 120 | - alert: HostOutOfInodes 121 | expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 122 | for: 2m 123 | labels: 124 | severity: warning 125 | annotations: 126 | summary: Host out of inodes (instance {{ $labels.instance }}) 127 | description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 128 | 129 | - alert: HostInodesWillFillIn24Hours 130 | expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 131 | for: 2m 132 | labels: 133 | severity: warning 134 | annotations: 135 | summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) 136 | description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 137 | 138 | - alert: HostUnusualDiskReadLatency 139 | expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.3 and rate(node_disk_reads_completed_total[1m]) > 0 
140 | for: 2m 141 | labels: 142 | severity: warning 143 | annotations: 144 | summary: Host unusual disk read latency (instance {{ $labels.instance }}) 145 | description: "Disk latency is growing (read operations > 300ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 146 | 147 | - alert: HostUnusualDiskWriteLatency 148 | expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.3 and rate(node_disk_writes_completed_total[1m]) > 0 149 | for: 5m 150 | labels: 151 | severity: warning 152 | annotations: 153 | summary: Host unusual disk write latency (instance {{ $labels.instance }}) 154 | description: "Disk latency is growing (write operations > 300ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 155 | 156 | - alert: HostHighCpuLoad 157 | expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 95 158 | for: 0m 159 | labels: 160 | severity: warning 161 | annotations: 162 | summary: Host high CPU load (instance {{ $labels.instance }}) 163 | description: "CPU load is > 95%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 164 | 165 | - alert: HostCpuStealNoisyNeighbor 166 | expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 30 167 | for: 0m 168 | labels: 169 | severity: warning 170 | annotations: 171 | summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) 172 | description: "CPU steal is > 30%. 
A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 173 | 174 | - alert: HostSwapIsFillingUp 175 | expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80 176 | for: 2m 177 | labels: 178 | severity: warning 179 | annotations: 180 | summary: Host swap is filling up (instance {{ $labels.instance }}) 181 | description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 182 | 183 | - alert: HostSystemdServiceCrashed 184 | expr: node_systemd_unit_state{state="failed"} == 1 185 | for: 0m 186 | labels: 187 | severity: warning 188 | annotations: 189 | summary: Host systemd service crashed (instance {{ $labels.instance }}) 190 | description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 191 | 192 | - alert: HostPhysicalComponentTooHot 193 | expr: node_hwmon_temp_celsius > 80 194 | for: 5m 195 | labels: 196 | severity: warning 197 | annotations: 198 | summary: Host physical component too hot (instance {{ $labels.instance }}) 199 | description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 200 | 201 | - alert: HostNodeOvertemperatureAlarm 202 | expr: node_hwmon_temp_crit_alarm_celsius == 1 203 | for: 0m 204 | labels: 205 | severity: critical 206 | annotations: 207 | summary: Host node overtemperature alarm (instance {{ $labels.instance }}) 208 | description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 209 | 210 | - alert: HostRaidArrayGotInactive 211 | expr: node_md_state{state="inactive"} > 0 212 | for: 0m 213 | labels: 214 | severity: critical 215 | annotations: 216 | summary: Host RAID array got inactive (instance {{ $labels.instance }}) 217 | description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. 
Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 218 | 219 | - alert: HostRaidDiskFailure 220 | expr: node_md_disks{state="failed"} > 0 221 | for: 2m 222 | labels: 223 | severity: warning 224 | annotations: 225 | summary: Host RAID disk failure (instance {{ $labels.instance }}) 226 | description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 227 | 228 | - alert: HostOomKillDetected 229 | expr: increase(node_vmstat_oom_kill[1m]) > 0 230 | for: 0m 231 | labels: 232 | severity: warning 233 | annotations: 234 | summary: Host OOM kill detected (instance {{ $labels.instance }}) 235 | description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 236 | 237 | - alert: HostEdacCorrectableErrorsDetected 238 | expr: increase(node_edac_correctable_errors_total[1m]) > 0 239 | for: 0m 240 | labels: 241 | severity: info 242 | annotations: 243 | summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) 244 | description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 245 | 246 | - alert: HostEdacUncorrectableErrorsDetected 247 | expr: node_edac_uncorrectable_errors_total > 0 248 | for: 0m 249 | labels: 250 | severity: warning 251 | annotations: 252 | summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) 253 | description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 254 | 255 | - alert: HostNetworkReceiveErrors 256 | expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 257 | for: 2m 258 | 
labels: 259 | severity: warning 260 | annotations: 261 | summary: Host Network Receive Errors (instance {{ $labels.instance }}) 262 | description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 263 | 264 | - alert: HostNetworkTransmitErrors 265 | expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 266 | for: 2m 267 | labels: 268 | severity: warning 269 | annotations: 270 | summary: Host Network Transmit Errors (instance {{ $labels.instance }}) 271 | description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 272 | 273 | - alert: HostNetworkInterfaceSaturated 274 | expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000 275 | for: 1m 276 | labels: 277 | severity: warning 278 | annotations: 279 | summary: Host Network Interface Saturated (instance {{ $labels.instance }}) 280 | description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 281 | 282 | - alert: HostNetworkBondDegraded 283 | expr: (node_bonding_active - node_bonding_slaves) != 0 284 | for: 2m 285 | labels: 286 | severity: warning 287 | annotations: 288 | summary: Host Network Bond Degraded (instance {{ $labels.instance }}) 289 | description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 290 | 291 | - alert: HostConntrackLimit 292 | expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 293 | for: 5m 294 | labels: 295 | severity: warning 296 | 
annotations: 297 | summary: Host conntrack limit (instance {{ $labels.instance }}) 298 | description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 299 | 300 | - alert: HostClockSkew 301 | expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) 302 | for: 2m 303 | labels: 304 | severity: warning 305 | annotations: 306 | summary: Host clock skew (instance {{ $labels.instance }}) 307 | description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 308 | 309 | - alert: HostClockNotSynchronising 310 | expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16 311 | for: 2m 312 | labels: 313 | severity: warning 314 | annotations: 315 | summary: Host clock not synchronising (instance {{ $labels.instance }}) 316 | description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 317 | 318 | - alert: HostRequiresReboot 319 | expr: node_reboot_required > 0 320 | for: 4h 321 | labels: 322 | severity: info 323 | annotations: 324 | summary: Host requires reboot (instance {{ $labels.instance }}) 325 | description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 326 | 327 | - name: Containers 328 | rules: 329 | ### Container Cadvisor 330 | # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. 
331 | #- alert: ContainerKilled 332 | # expr: time() - container_last_seen > 60 333 | # for: 0m 334 | # labels: 335 | # severity: warning 336 | # annotations: 337 | # summary: Container killed (instance {{ $labels.instance }}) 338 | # description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 339 | 340 | # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. 341 | - alert: ContainerAbsent 342 | expr: absent(container_last_seen) 343 | for: 5m 344 | labels: 345 | severity: warning 346 | annotations: 347 | summary: Container absent (instance {{ $labels.instance }}) 348 | description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 349 | 350 | - alert: ContainerCpuUsage 351 | expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80 352 | for: 2m 353 | labels: 354 | severity: warning 355 | annotations: 356 | summary: Container CPU usage (instance {{ $labels.instance }}) 357 | description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 358 | 359 | # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d 360 | - alert: ContainerMemoryUsage 361 | expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80 362 | for: 2m 363 | labels: 364 | severity: warning 365 | annotations: 366 | summary: Container Memory usage (instance {{ $labels.instance }}) 367 | description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 368 | 369 | #- alert: ContainerVolumeUsage 370 | # expr: (1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 95 371 | # for: 2m 372 | # labels: 373 | # severity: warning 374 | # annotations: 375 | # summary: Container Volume usage (instance {{ 
$labels.instance }}) 376 | # description: "Container Volume usage is above 95%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 377 | 378 | - alert: ContainerVolumeIoUsage 379 | expr: (sum(container_fs_io_current{name!=""}) BY (instance, name) * 100) > 80 380 | for: 2m 381 | labels: 382 | severity: warning 383 | annotations: 384 | summary: Container Volume IO usage (instance {{ $labels.instance }}) 385 | description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 386 | 387 | - alert: ContainerHighThrottleRate 388 | expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1 389 | for: 2m 390 | labels: 391 | severity: warning 392 | annotations: 393 | summary: Container high throttle rate (instance {{ $labels.instance }}) 394 | description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 395 | -------------------------------------------------------------------------------- /grafana/conf/grafana/dashboards/celestia-da.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "grafana", 8 | "uid": "-- Grafana --" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "description": "", 25 | "editable": true, 26 | "fiscalYearStartMonth": 0, 27 | "graphTooltip": 0, 28 | "id": 5, 29 | "links": [], 30 | "liveNow": false, 31 | "panels": [ 32 | { 33 | "datasource": { 34 | "type": "prometheus", 35 | "uid": "PBFA97CFB590B2093" 36 | }, 37 | "gridPos": { 38 | "h": 4, 39 | "w": 17, 40 | "x": 0, 41 | "y": 0 42 | }, 43 | "id": 2, 44 | "options": { 45 | "code": { 46 | "language": "plaintext", 47 | "showLineNumbers": false, 48 | "showMiniMap": false 49 | }, 
50 | "content": "\r\n\r\n \r\n \r\n \r\n\r\n
\r\n \r\n
\r\n", 51 | "mode": "html" 52 | }, 53 | "pluginVersion": "9.4.7", 54 | "targets": [ 55 | { 56 | "datasource": { 57 | "type": "prometheus", 58 | "uid": "PBFA97CFB590B2093" 59 | }, 60 | "refId": "A" 61 | } 62 | ], 63 | "type": "text" 64 | }, 65 | { 66 | "datasource": { 67 | "type": "prometheus", 68 | "uid": "PBFA97CFB590B2093" 69 | }, 70 | "description": "celestia_head", 71 | "fieldConfig": { 72 | "defaults": { 73 | "color": { 74 | "mode": "thresholds" 75 | }, 76 | "mappings": [], 77 | "thresholds": { 78 | "mode": "absolute", 79 | "steps": [ 80 | { 81 | "color": "green", 82 | "value": null 83 | } 84 | ] 85 | } 86 | }, 87 | "overrides": [] 88 | }, 89 | "gridPos": { 90 | "h": 4, 91 | "w": 3, 92 | "x": 0, 93 | "y": 4 94 | }, 95 | "id": 16, 96 | "options": { 97 | "colorMode": "value", 98 | "graphMode": "none", 99 | "justifyMode": "auto", 100 | "orientation": "auto", 101 | "reduceOptions": { 102 | "calcs": [ 103 | "lastNotNull" 104 | ], 105 | "fields": "", 106 | "values": false 107 | }, 108 | "textMode": "auto" 109 | }, 110 | "pluginVersion": "9.4.7", 111 | "targets": [ 112 | { 113 | "datasource": { 114 | "type": "prometheus", 115 | "uid": "PBFA97CFB590B2093" 116 | }, 117 | "editorMode": "code", 118 | "expr": "celestia_head", 119 | "legendFormat": "__auto", 120 | "range": true, 121 | "refId": "A" 122 | } 123 | ], 124 | "title": "celestia_head", 125 | "type": "stat" 126 | }, 127 | { 128 | "datasource": { 129 | "type": "prometheus", 130 | "uid": "PBFA97CFB590B2093" 131 | }, 132 | "description": "Celestia head", 133 | "fieldConfig": { 134 | "defaults": { 135 | "color": { 136 | "mode": "palette-classic" 137 | }, 138 | "custom": { 139 | "axisCenteredZero": false, 140 | "axisColorMode": "text", 141 | "axisLabel": "", 142 | "axisPlacement": "auto", 143 | "barAlignment": 0, 144 | "drawStyle": "line", 145 | "fillOpacity": 0, 146 | "gradientMode": "none", 147 | "hideFrom": { 148 | "legend": false, 149 | "tooltip": false, 150 | "viz": false 151 | }, 152 | "lineInterpolation": 
"linear", 153 | "lineWidth": 1, 154 | "pointSize": 5, 155 | "scaleDistribution": { 156 | "type": "linear" 157 | }, 158 | "showPoints": "auto", 159 | "spanNulls": false, 160 | "stacking": { 161 | "group": "A", 162 | "mode": "none" 163 | }, 164 | "thresholdsStyle": { 165 | "mode": "off" 166 | } 167 | }, 168 | "mappings": [], 169 | "thresholds": { 170 | "mode": "absolute", 171 | "steps": [ 172 | { 173 | "color": "green", 174 | "value": null 175 | }, 176 | { 177 | "color": "red", 178 | "value": 80 179 | } 180 | ] 181 | } 182 | }, 183 | "overrides": [] 184 | }, 185 | "gridPos": { 186 | "h": 6, 187 | "w": 14, 188 | "x": 3, 189 | "y": 4 190 | }, 191 | "id": 20, 192 | "options": { 193 | "legend": { 194 | "calcs": [], 195 | "displayMode": "list", 196 | "placement": "bottom", 197 | "showLegend": true 198 | }, 199 | "tooltip": { 200 | "mode": "single", 201 | "sort": "none" 202 | } 203 | }, 204 | "targets": [ 205 | { 206 | "datasource": { 207 | "type": "prometheus", 208 | "uid": "PBFA97CFB590B2093" 209 | }, 210 | "editorMode": "code", 211 | "expr": "celestia_head", 212 | "legendFormat": "Celestia head", 213 | "range": true, 214 | "refId": "A" 215 | } 216 | ], 217 | "title": "Celestia head", 218 | "type": "timeseries" 219 | }, 220 | { 221 | "datasource": { 222 | "type": "prometheus", 223 | "uid": "PBFA97CFB590B2093" 224 | }, 225 | "description": "total time the node has been running", 226 | "fieldConfig": { 227 | "defaults": { 228 | "color": { 229 | "mode": "thresholds" 230 | }, 231 | "mappings": [], 232 | "thresholds": { 233 | "mode": "absolute", 234 | "steps": [ 235 | { 236 | "color": "green", 237 | "value": null 238 | } 239 | ] 240 | }, 241 | "unit": "s" 242 | }, 243 | "overrides": [] 244 | }, 245 | "gridPos": { 246 | "h": 4, 247 | "w": 3, 248 | "x": 0, 249 | "y": 8 250 | }, 251 | "id": 6, 252 | "options": { 253 | "colorMode": "value", 254 | "graphMode": "none", 255 | "justifyMode": "auto", 256 | "orientation": "auto", 257 | "reduceOptions": { 258 | "calcs": [ 259 | 
"lastNotNull" 260 | ], 261 | "fields": "", 262 | "values": false 263 | }, 264 | "textMode": "auto" 265 | }, 266 | "pluginVersion": "9.4.7", 267 | "targets": [ 268 | { 269 | "datasource": { 270 | "type": "prometheus", 271 | "uid": "PBFA97CFB590B2093" 272 | }, 273 | "editorMode": "code", 274 | "expr": "celestia_node_runtime_counter_in_seconds", 275 | "legendFormat": "__auto", 276 | "range": true, 277 | "refId": "A" 278 | } 279 | ], 280 | "title": "Node uptime", 281 | "type": "stat" 282 | }, 283 | { 284 | "datasource": { 285 | "type": "prometheus", 286 | "uid": "PBFA97CFB590B2093" 287 | }, 288 | "description": "", 289 | "fieldConfig": { 290 | "defaults": { 291 | "color": { 292 | "mode": "palette-classic" 293 | }, 294 | "custom": { 295 | "axisCenteredZero": false, 296 | "axisColorMode": "text", 297 | "axisLabel": "", 298 | "axisPlacement": "auto", 299 | "barAlignment": 0, 300 | "drawStyle": "line", 301 | "fillOpacity": 0, 302 | "gradientMode": "none", 303 | "hideFrom": { 304 | "legend": false, 305 | "tooltip": false, 306 | "viz": false 307 | }, 308 | "lineInterpolation": "linear", 309 | "lineWidth": 1, 310 | "pointSize": 5, 311 | "scaleDistribution": { 312 | "type": "linear" 313 | }, 314 | "showPoints": "auto", 315 | "spanNulls": false, 316 | "stacking": { 317 | "group": "A", 318 | "mode": "none" 319 | }, 320 | "thresholdsStyle": { 321 | "mode": "off" 322 | } 323 | }, 324 | "mappings": [], 325 | "thresholds": { 326 | "mode": "absolute", 327 | "steps": [ 328 | { 329 | "color": "green", 330 | "value": null 331 | }, 332 | { 333 | "color": "red", 334 | "value": 80 335 | } 336 | ] 337 | } 338 | }, 339 | "overrides": [] 340 | }, 341 | "gridPos": { 342 | "h": 6, 343 | "w": 14, 344 | "x": 3, 345 | "y": 10 346 | }, 347 | "id": 22, 348 | "options": { 349 | "legend": { 350 | "calcs": [], 351 | "displayMode": "list", 352 | "placement": "bottom", 353 | "showLegend": true 354 | }, 355 | "tooltip": { 356 | "mode": "single", 357 | "sort": "none" 358 | } 359 | }, 360 | "targets": [ 361 
| { 362 | "datasource": { 363 | "type": "prometheus", 364 | "uid": "PBFA97CFB590B2093" 365 | }, 366 | "editorMode": "code", 367 | "expr": "celestia_total_synced_headers", 368 | "legendFormat": "Total synced header", 369 | "range": true, 370 | "refId": "A" 371 | } 372 | ], 373 | "title": "Total synced headers", 374 | "type": "timeseries" 375 | }, 376 | { 377 | "datasource": { 378 | "type": "prometheus", 379 | "uid": "PBFA97CFB590B2093" 380 | }, 381 | "description": "Total synced header count", 382 | "fieldConfig": { 383 | "defaults": { 384 | "color": { 385 | "mode": "thresholds" 386 | }, 387 | "mappings": [], 388 | "thresholds": { 389 | "mode": "absolute", 390 | "steps": [ 391 | { 392 | "color": "green", 393 | "value": null 394 | } 395 | ] 396 | } 397 | }, 398 | "overrides": [] 399 | }, 400 | "gridPos": { 401 | "h": 4, 402 | "w": 3, 403 | "x": 0, 404 | "y": 12 405 | }, 406 | "id": 14, 407 | "options": { 408 | "colorMode": "value", 409 | "graphMode": "none", 410 | "justifyMode": "auto", 411 | "orientation": "auto", 412 | "reduceOptions": { 413 | "calcs": [ 414 | "lastNotNull" 415 | ], 416 | "fields": "", 417 | "values": false 418 | }, 419 | "textMode": "auto" 420 | }, 421 | "pluginVersion": "9.4.7", 422 | "targets": [ 423 | { 424 | "datasource": { 425 | "type": "prometheus", 426 | "uid": "PBFA97CFB590B2093" 427 | }, 428 | "editorMode": "code", 429 | "expr": "celestia_total_synced_headers", 430 | "legendFormat": "__auto", 431 | "range": true, 432 | "refId": "A" 433 | } 434 | ], 435 | "title": "Synced header count", 436 | "type": "stat" 437 | }, 438 | { 439 | "datasource": { 440 | "type": "prometheus", 441 | "uid": "PBFA97CFB590B2093" 442 | }, 443 | "fieldConfig": { 444 | "defaults": { 445 | "color": { 446 | "mode": "thresholds" 447 | }, 448 | "mappings": [], 449 | "thresholds": { 450 | "mode": "absolute", 451 | "steps": [ 452 | { 453 | "color": "green", 454 | "value": null 455 | } 456 | ] 457 | } 458 | }, 459 | "overrides": [] 460 | }, 461 | "gridPos": { 462 | "h": 
10, 463 | "w": 8, 464 | "x": 0, 465 | "y": 16 466 | }, 467 | "id": 8, 468 | "options": { 469 | "colorMode": "value", 470 | "graphMode": "none", 471 | "justifyMode": "auto", 472 | "orientation": "auto", 473 | "reduceOptions": { 474 | "calcs": [ 475 | "lastNotNull" 476 | ], 477 | "fields": "", 478 | "values": false 479 | }, 480 | "textMode": "auto" 481 | }, 482 | "pluginVersion": "9.4.7", 483 | "targets": [ 484 | { 485 | "datasource": { 486 | "type": "prometheus", 487 | "uid": "PBFA97CFB590B2093" 488 | }, 489 | "editorMode": "code", 490 | "expr": "celestia_das_head_updated_counter", 491 | "hide": false, 492 | "legendFormat": "amount of times DAS'er advanced network head", 493 | "range": true, 494 | "refId": "B" 495 | }, 496 | { 497 | "datasource": { 498 | "type": "prometheus", 499 | "uid": "PBFA97CFB590B2093" 500 | }, 501 | "editorMode": "code", 502 | "expr": "celestia_das_network_head", 503 | "hide": false, 504 | "legendFormat": "most recent network head", 505 | "range": true, 506 | "refId": "C" 507 | }, 508 | { 509 | "datasource": { 510 | "type": "prometheus", 511 | "uid": "PBFA97CFB590B2093" 512 | }, 513 | "editorMode": "code", 514 | "expr": "celestia_das_sampled_chain_head", 515 | "hide": false, 516 | "legendFormat": "height of the sampled chain", 517 | "range": true, 518 | "refId": "D" 519 | }, 520 | { 521 | "datasource": { 522 | "type": "prometheus", 523 | "uid": "PBFA97CFB590B2093" 524 | }, 525 | "editorMode": "code", 526 | "expr": "celestia_das_total_sampled_headers", 527 | "hide": false, 528 | "legendFormat": "total sampled headers", 529 | "range": true, 530 | "refId": "E" 531 | } 532 | ], 533 | "title": "DAS", 534 | "type": "stat" 535 | }, 536 | { 537 | "datasource": { 538 | "type": "prometheus", 539 | "uid": "PBFA97CFB590B2093" 540 | }, 541 | "description": "Last das sample", 542 | "fieldConfig": { 543 | "defaults": { 544 | "color": { 545 | "mode": "thresholds" 546 | }, 547 | "mappings": [], 548 | "thresholds": { 549 | "mode": "absolute", 550 | "steps": [ 
551 | { 552 | "color": "green", 553 | "value": null 554 | } 555 | ] 556 | }, 557 | "unit": "dateTimeAsLocal" 558 | }, 559 | "overrides": [] 560 | }, 561 | "gridPos": { 562 | "h": 4, 563 | "w": 3, 564 | "x": 8, 565 | "y": 16 566 | }, 567 | "id": 10, 568 | "options": { 569 | "colorMode": "value", 570 | "graphMode": "none", 571 | "justifyMode": "auto", 572 | "orientation": "auto", 573 | "reduceOptions": { 574 | "calcs": [], 575 | "fields": "", 576 | "values": false 577 | }, 578 | "textMode": "auto" 579 | }, 580 | "pluginVersion": "9.4.7", 581 | "targets": [ 582 | { 583 | "datasource": { 584 | "type": "prometheus", 585 | "uid": "PBFA97CFB590B2093" 586 | }, 587 | "editorMode": "code", 588 | "expr": "celestia_das_latest_sampled_ts * 1000", 589 | "legendFormat": "__auto", 590 | "range": true, 591 | "refId": "A" 592 | } 593 | ], 594 | "title": "Last das sampled", 595 | "type": "stat" 596 | }, 597 | { 598 | "datasource": { 599 | "type": "prometheus", 600 | "uid": "PBFA97CFB590B2093" 601 | }, 602 | "description": "Total count of submitted PayForBlob transactions", 603 | "fieldConfig": { 604 | "defaults": { 605 | "color": { 606 | "mode": "thresholds" 607 | }, 608 | "mappings": [], 609 | "thresholds": { 610 | "mode": "absolute", 611 | "steps": [ 612 | { 613 | "color": "green", 614 | "value": null 615 | } 616 | ] 617 | } 618 | }, 619 | "overrides": [] 620 | }, 621 | "gridPos": { 622 | "h": 4, 623 | "w": 3, 624 | "x": 11, 625 | "y": 16 626 | }, 627 | "id": 4, 628 | "options": { 629 | "colorMode": "value", 630 | "graphMode": "none", 631 | "justifyMode": "auto", 632 | "orientation": "auto", 633 | "reduceOptions": { 634 | "calcs": [ 635 | "lastNotNull" 636 | ], 637 | "fields": "", 638 | "values": false 639 | }, 640 | "textMode": "auto" 641 | }, 642 | "pluginVersion": "9.4.7", 643 | "targets": [ 644 | { 645 | "datasource": { 646 | "type": "prometheus", 647 | "uid": "PBFA97CFB590B2093" 648 | }, 649 | "editorMode": "code", 650 | "expr": "celestia_pfb_count", 651 | "legendFormat": 
"__auto", 652 | "range": true, 653 | "refId": "A" 654 | } 655 | ], 656 | "title": "PFB count", 657 | "type": "stat" 658 | }, 659 | { 660 | "datasource": { 661 | "type": "prometheus", 662 | "uid": "PBFA97CFB590B2093" 663 | }, 664 | "description": "Last PayForBlob date and time", 665 | "fieldConfig": { 666 | "defaults": { 667 | "color": { 668 | "mode": "thresholds" 669 | }, 670 | "mappings": [ 671 | { 672 | "options": { 673 | "0": { 674 | "index": 0, 675 | "text": "NA" 676 | } 677 | }, 678 | "type": "value" 679 | } 680 | ], 681 | "thresholds": { 682 | "mode": "absolute", 683 | "steps": [ 684 | { 685 | "color": "green", 686 | "value": null 687 | } 688 | ] 689 | } 690 | }, 691 | "overrides": [] 692 | }, 693 | "gridPos": { 694 | "h": 4, 695 | "w": 3, 696 | "x": 14, 697 | "y": 16 698 | }, 699 | "id": 18, 700 | "options": { 701 | "colorMode": "value", 702 | "graphMode": "none", 703 | "justifyMode": "auto", 704 | "orientation": "auto", 705 | "reduceOptions": { 706 | "calcs": [ 707 | "lastNotNull" 708 | ], 709 | "fields": "", 710 | "values": false 711 | }, 712 | "textMode": "auto" 713 | }, 714 | "pluginVersion": "9.4.7", 715 | "targets": [ 716 | { 717 | "datasource": { 718 | "type": "prometheus", 719 | "uid": "PBFA97CFB590B2093" 720 | }, 721 | "editorMode": "code", 722 | "expr": "celestia_last_pfb_timestamp", 723 | "legendFormat": "__auto", 724 | "range": true, 725 | "refId": "A" 726 | } 727 | ], 728 | "title": "Last PayForBlob ", 729 | "type": "stat" 730 | }, 731 | { 732 | "datasource": { 733 | "type": "prometheus", 734 | "uid": "PBFA97CFB590B2093" 735 | }, 736 | "description": "Sampled header counter for different width height", 737 | "fieldConfig": { 738 | "defaults": { 739 | "color": { 740 | "mode": "thresholds" 741 | }, 742 | "mappings": [], 743 | "thresholds": { 744 | "mode": "absolute", 745 | "steps": [ 746 | { 747 | "color": "green" 748 | } 749 | ] 750 | } 751 | }, 752 | "overrides": [] 753 | }, 754 | "gridPos": { 755 | "h": 6, 756 | "w": 9, 757 | "x": 8, 758 | "y": 
20 759 | }, 760 | "id": 12, 761 | "options": { 762 | "colorMode": "value", 763 | "graphMode": "none", 764 | "justifyMode": "auto", 765 | "orientation": "auto", 766 | "reduceOptions": { 767 | "calcs": [ 768 | "lastNotNull" 769 | ], 770 | "fields": "", 771 | "values": false 772 | }, 773 | "textMode": "auto" 774 | }, 775 | "pluginVersion": "9.4.7", 776 | "targets": [ 777 | { 778 | "datasource": { 779 | "type": "prometheus", 780 | "uid": "PBFA97CFB590B2093" 781 | }, 782 | "editorMode": "code", 783 | "expr": "celestia_das_sampled_headers_counter", 784 | "legendFormat": "header width of {{header_width}}", 785 | "range": true, 786 | "refId": "A" 787 | } 788 | ], 789 | "title": "Sampled header counter", 790 | "type": "stat" 791 | }, 792 | { 793 | "datasource": { 794 | "type": "loki", 795 | "uid": "P8E80F9AEF21F6940" 796 | }, 797 | "gridPos": { 798 | "h": 9, 799 | "w": 17, 800 | "x": 0, 801 | "y": 26 802 | }, 803 | "id": 24, 804 | "options": { 805 | "dedupStrategy": "none", 806 | "enableLogDetails": true, 807 | "prettifyLogMessage": false, 808 | "showCommonLabels": false, 809 | "showLabels": false, 810 | "showTime": false, 811 | "sortOrder": "Descending", 812 | "wrapLogMessage": false 813 | }, 814 | "pluginVersion": "9.4.7", 815 | "targets": [ 816 | { 817 | "datasource": { 818 | "type": "loki", 819 | "uid": "P8E80F9AEF21F6940" 820 | }, 821 | "editorMode": "code", 822 | "expr": "{unit=~\"celestia-[lbf].*.service\"} |= ``", 823 | "queryType": "range", 824 | "refId": "A" 825 | } 826 | ], 827 | "title": "DA node logs", 828 | "type": "logs" 829 | } 830 | ], 831 | "refresh": "", 832 | "revision": 1, 833 | "schemaVersion": 38, 834 | "style": "dark", 835 | "tags": [ 836 | "celestia", 837 | "da" 838 | ], 839 | "templating": { 840 | "list": [] 841 | }, 842 | "time": { 843 | "from": "now-6h", 844 | "to": "now" 845 | }, 846 | "timepicker": {}, 847 | "timezone": "", 848 | "title": "Celestia DA node", 849 | "uid": "sQBDU4L4z", 850 | "version": 6, 851 | "weekStart": "" 852 | } 
-------------------------------------------------------------------------------- /grafana/conf/grafana/grafana.ini: -------------------------------------------------------------------------------- 1 | app_mode = production 2 | 3 | #################################### Logging ########################## 4 | [log] 5 | # Either "console", "file", "syslog". Default is console and file 6 | # Use space to separate multiple modes, e.g. "console file" 7 | ;mode = console file 8 | 9 | # Either "debug", "info", "warn", "error", "critical", default is "info" 10 | level = warn 11 | 12 | # optional settings to set different levels for specific loggers. Ex filters = sqlstore:debug 13 | ;filters = 14 | 15 | #################################### Paths #################################### 16 | [paths] 17 | # Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used) 18 | ;data = /var/lib/grafana 19 | 20 | # Temporary files in `data` directory older than given duration will be removed 21 | ;temp_data_lifetime = 24h 22 | 23 | # Directory where grafana can store logs 24 | ;logs = /var/log/grafana 25 | 26 | # Directory where grafana will automatically scan and look for plugins 27 | ;plugins = /var/lib/grafana/plugins 28 | 29 | # folder that contains provisioning config files that grafana will apply on startup and while running. 
30 | ;provisioning = conf/provisioning 31 | 32 | #################################### Server #################################### 33 | [server] 34 | # Protocol (http, https, h2, socket) 35 | ;protocol = http 36 | 37 | # The ip address to bind to, empty will bind to all interfaces 38 | ;http_addr = 39 | 40 | # The http port to use 41 | ;http_port = 3000 42 | 43 | # The public facing domain name used to access grafana from a browser 44 | ;domain = localhost 45 | 46 | # Redirect to correct domain if host header does not match domain 47 | # Prevents DNS rebinding attacks 48 | ;enforce_domain = false 49 | 50 | # The full public facing url you use in browser, used for redirects and emails 51 | # If you use reverse proxy and sub path specify full url (with sub path) 52 | ;root_url = %(protocol)s://%(domain)s:%(http_port)s/ 53 | 54 | # Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons. 55 | ;serve_from_sub_path = false 56 | 57 | # Log web requests 58 | ;router_logging = false 59 | 60 | # the path relative working path 61 | ;static_root_path = public 62 | 63 | # enable gzip 64 | ;enable_gzip = false 65 | 66 | # https certs & key file 67 | ;cert_file = 68 | ;cert_key = 69 | 70 | # Unix socket path 71 | ;socket = 72 | 73 | #################################### Database #################################### 74 | [database] 75 | # You can configure the database connection by specifying type, host, name, user and password 76 | # as separate properties or as one string using the url properties. 77 | 78 | # Either "mysql", "postgres" or "sqlite3", it's your choice 79 | ;type = sqlite3 80 | ;host = 127.0.0.1:3306 81 | ;name = grafana 82 | ;user = root 83 | # If the password contains # or ; you have to wrap it with triple quotes. 
Ex """#password;""" 84 | ;password = 85 | 86 | # Use either URL or the previous fields to configure the database 87 | # Example: mysql://user:secret@host:port/database 88 | ;url = 89 | 90 | # For "postgres" only, either "disable", "require" or "verify-full" 91 | ;ssl_mode = disable 92 | 93 | ;ca_cert_path = 94 | ;client_key_path = 95 | ;client_cert_path = 96 | ;server_cert_name = 97 | 98 | # For "sqlite3" only, path relative to data_path setting 99 | ;path = grafana.db 100 | 101 | # Max idle conn setting default is 2 102 | ;max_idle_conn = 2 103 | 104 | # Max conn setting default is 0 (mean not set) 105 | ;max_open_conn = 106 | 107 | # Connection Max Lifetime default is 14400 (means 14400 seconds or 4 hours) 108 | ;conn_max_lifetime = 14400 109 | 110 | # Set to true to log the sql calls and execution times. 111 | ;log_queries = 112 | 113 | # For "sqlite3" only. cache mode setting used for connecting to the database. (private, shared) 114 | ;cache_mode = private 115 | 116 | #################################### Cache server ############################# 117 | [remote_cache] 118 | # Either "redis", "memcached" or "database" default is "database" 119 | ;type = database 120 | 121 | # cache connectionstring options 122 | # database: will use Grafana primary database. 123 | # redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=0,ssl=false`. Only addr is required. ssl may be 'true', 'false', or 'insecure'. 124 | # memcache: 127.0.0.1:11211 125 | ;connstr = 126 | 127 | #################################### Data proxy ########################### 128 | [dataproxy] 129 | 130 | # This enables data proxy logging, default is false 131 | ;logging = false 132 | 133 | # How long the data proxy waits before timing out, default is 30 seconds. 134 | # This setting also applies to core backend HTTP data sources where query requests use an HTTP client with timeout set. 
135 | ;timeout = 30 136 | 137 | # If enabled and user is not anonymous, data proxy will add X-Grafana-User header with username into the request, default is false. 138 | ;send_user_header = false 139 | 140 | #################################### Analytics #################################### 141 | [analytics] 142 | # Server reporting, sends usage counters to stats.grafana.org every 24 hours. 143 | # No ip addresses are being tracked, only simple counters to track 144 | # running instances, dashboard and error counts. It is very helpful to us. 145 | # Change this option to false to disable reporting. 146 | ;reporting_enabled = true 147 | 148 | # Set to false to disable all checks to https://grafana.net 149 | # for new versions (grafana itself and plugins), check is used 150 | # in some UI views to notify that grafana or plugin update exists 151 | # This option does not cause any auto updates, nor send any information 152 | # only a GET request to http://grafana.com to get latest versions 153 | ;check_for_updates = true 154 | 155 | # Google Analytics universal tracking code, only enabled if you specify an id here 156 | ;google_analytics_ua_id = 157 | 158 | # Google Tag Manager ID, only enabled if you specify an id here 159 | ;google_tag_manager_id = 160 | 161 | #################################### Security #################################### 162 | [security] 163 | # disable creation of admin user on first start of grafana 164 | ;disable_initial_admin_creation = false 165 | 166 | # default admin user, created on startup 167 | ;admin_user = admin 168 | 169 | # default admin password, can be changed before first start of grafana, or in profile settings 170 | ;admin_password = admin 171 | 172 | # used for signing 173 | ;secret_key = SW2YcwTIb9zpOOhoPsMm 174 | 175 | # disable gravatar profile images 176 | ;disable_gravatar = false 177 | 178 | # data source proxy whitelist (ip_or_domain:port separated by spaces) 179 | ;data_source_proxy_whitelist = 180 | 181 | # disable 
protection against brute force login attempts 182 | ;disable_brute_force_login_protection = false 183 | 184 | # set to true if you host Grafana behind HTTPS. default is false. 185 | ;cookie_secure = false 186 | 187 | # set cookie SameSite attribute. defaults to `lax`. can be set to "lax", "strict", "none" and "disabled" 188 | ;cookie_samesite = lax 189 | 190 | # set to true if you want to allow browsers to render Grafana in a ,