├── docs ├── alertmanager.png ├── grafana_datasource.png ├── prometheus_targets.png ├── grafana_cluster_overview.png ├── grafana_import_dashboard.png ├── grafana_prometheus_stats.png ├── grafana_datasource_dashboard.png └── index.md ├── manifests ├── prometheus-core-configmap │ ├── prometheus-record.rules │ ├── prometheus-alert.rules │ └── prometheus.yml ├── grafana-import-dashboards-configmap │ ├── prometheus-datasource.json │ ├── grafana-net-162-dashboard.json │ ├── grafana-net-193-dashboard.json │ ├── grafana-net-2-dashboard.json │ └── grafana-net-315-dashboard.json ├── kube-metrics-service.yaml ├── grafana-core-service.yaml ├── prometheus-alert-service.yaml ├── prometheus-core-service.yaml ├── prometheus-node-exporter-service.yaml ├── kube-metrics-deployment.yaml ├── prometheus-node-exporter-daemonset.yaml ├── prometheus-alert-deployment.yaml ├── prometheus-core-deployment.yaml ├── grafana-core-deployment.yaml ├── grafana-import-dashboards-job.yaml ├── prometheus-alert-configmap │ └── alertmanager.yml ├── prometheus-alert-configmap.yaml ├── prometheus-core-configmap.yaml └── grafana-import-dashboards-configmap.yaml └── README.md /docs/alertmanager.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/alertmanager.png -------------------------------------------------------------------------------- /docs/grafana_datasource.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/grafana_datasource.png -------------------------------------------------------------------------------- /docs/prometheus_targets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/prometheus_targets.png -------------------------------------------------------------------------------- 
/docs/grafana_cluster_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/grafana_cluster_overview.png -------------------------------------------------------------------------------- /docs/grafana_import_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/grafana_import_dashboard.png -------------------------------------------------------------------------------- /docs/grafana_prometheus_stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/grafana_prometheus_stats.png -------------------------------------------------------------------------------- /docs/grafana_datasource_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/grafana_datasource_dashboard.png -------------------------------------------------------------------------------- /manifests/prometheus-core-configmap/prometheus-record.rules: -------------------------------------------------------------------------------- 1 | 2 | instance:fd_utilization = process_open_fds / process_max_fds 3 | -------------------------------------------------------------------------------- /manifests/grafana-import-dashboards-configmap/prometheus-datasource.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "prometheus", 3 | "type": "prometheus", 4 | "url": "http://prometheus:9090", 5 | "access": "proxy", 6 | "basicAuth": false 7 | } 8 | -------------------------------------------------------------------------------- /manifests/kube-metrics-service.yaml: -------------------------------------------------------------------------------- 1 | 
kind: Service 2 | apiVersion: v1 3 | metadata: 4 | name: kube-state-metrics 5 | annotations: 6 | prometheus.io/scrape: 'true' 7 | labels: 8 | app: kube-state-metrics 9 | spec: 10 | type: NodePort 11 | selector: 12 | app: kube-state-metrics 13 | ports: 14 | - name: kube-state-metrics 15 | port: 8080 16 | -------------------------------------------------------------------------------- /manifests/grafana-core-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: grafana 5 | labels: 6 | app: grafana 7 | component: core 8 | # annotations: 9 | # prometheus.io/scrape: 'true' 10 | spec: 11 | type: NodePort 12 | ports: 13 | - port: 3000 14 | nodePort: 31000 15 | selector: 16 | app: grafana 17 | component: core 18 | -------------------------------------------------------------------------------- /manifests/prometheus-alert-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: alertmanager 5 | labels: 6 | app: prometheus 7 | component: alert 8 | annotations: 9 | prometheus.io/scrape: 'true' 10 | spec: 11 | selector: 12 | app: prometheus 13 | component: alert 14 | type: NodePort 15 | ports: 16 | - port: 9093 17 | nodePort: 30093 18 | protocol: TCP 19 | -------------------------------------------------------------------------------- /manifests/prometheus-core-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: prometheus 5 | labels: 6 | app: prometheus 7 | component: core 8 | annotations: 9 | prometheus.io/scrape: 'true' 10 | spec: 11 | type: NodePort 12 | ports: 13 | - port: 9090 14 | nodePort: 30900 15 | protocol: TCP 16 | name: webui 17 | selector: 18 | app: prometheus 19 | component: core 20 | 
-------------------------------------------------------------------------------- /manifests/prometheus-node-exporter-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | prometheus.io/scrape: 'true' 6 | name: prometheus-node-exporter 7 | labels: 8 | app: prometheus 9 | component: node-exporter 10 | spec: 11 | clusterIP: None 12 | ports: 13 | - name: prometheus-node-exporter 14 | port: 9100 15 | protocol: TCP 16 | selector: 17 | app: prometheus 18 | component: node-exporter 19 | type: ClusterIP 20 | -------------------------------------------------------------------------------- /manifests/kube-metrics-deployment.yaml: -------------------------------------------------------------------------------- 1 | kind: Deployment 2 | apiVersion: extensions/v1beta1 3 | metadata: 4 | name: kube-state-metrics-deployment 5 | spec: 6 | replicas: 1 7 | template: 8 | metadata: 9 | labels: 10 | app: kube-state-metrics 11 | spec: 12 | containers: 13 | - name: kube-state-metrics 14 | image: gcr.io/google_containers/kube-state-metrics:v0.2.0 15 | ports: 16 | - name: exporter 17 | containerPort: 8080 18 | resources: 19 | requests: 20 | cpu: 10m 21 | memory: 10Mi 22 | limits: 23 | cpu: 100m 24 | memory: 50Mi 25 | -------------------------------------------------------------------------------- /manifests/prometheus-node-exporter-daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: DaemonSet 3 | metadata: 4 | name: prometheus-node-exporter 5 | labels: 6 | app: prometheus 7 | component: node-exporter 8 | spec: 9 | template: 10 | metadata: 11 | name: prometheus-node-exporter 12 | labels: 13 | app: prometheus 14 | component: node-exporter 15 | spec: 16 | containers: 17 | - image: prom/node-exporter:0.12.0 18 | name: prometheus-node-exporter 19 | ports: 20 | - name: prom-node-exp 21 | #^ must be 
an IANA_SVC_NAME (at most 15 characters, ..) 22 | containerPort: 9100 23 | hostPort: 9100 24 | hostNetwork: true 25 | hostPID: true 26 | -------------------------------------------------------------------------------- /manifests/prometheus-alert-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: prometheus-alert 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: prometheus 10 | component: alert 11 | template: 12 | metadata: 13 | name: alertmanager 14 | labels: 15 | app: prometheus 16 | component: alert 17 | spec: 18 | containers: 19 | - name: alertmanager 20 | image: prom/alertmanager:v0.4.2 21 | args: 22 | - '-config.file=/etc/alertmanager/alertmanager.yml' 23 | - '-storage.path=/alertmanager' 24 | ports: 25 | - name: alertmanager 26 | containerPort: 9093 27 | volumeMounts: 28 | - name: config-volume 29 | mountPath: /etc/alertmanager 30 | - name: alertmanager 31 | mountPath: /alertmanager # must match -storage.path above, otherwise the emptyDir volume is never written to 32 | volumes: 33 | - name: config-volume 34 | configMap: 35 | name: prometheus-alert 36 | - name: alertmanager 37 | emptyDir: {} 38 | -------------------------------------------------------------------------------- /manifests/prometheus-core-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: prometheus-core 5 | labels: 6 | app: prometheus 7 | component: core 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: prometheus 13 | component: core 14 | template: 15 | metadata: 16 | name: prometheus-main 17 | labels: 18 | app: prometheus 19 | component: core 20 | spec: 21 | containers: 22 | - name: prometheus 23 | image: prom/prometheus:v1.2.3 24 | args: 25 | - '-storage.local.retention=6h' 26 | - '-storage.local.memory-chunks=500000' 27 | - '-config.file=/etc/prometheus/prometheus.yml' 28 | - 
'-alertmanager.url=http://alertmanager:9093' 29 | ports: 30 | - name: webui 31 | containerPort: 9090 32 | volumeMounts: 33 | - name: config-volume 34 | mountPath: /etc/prometheus 35 | - name: data-volume 36 | mountPath: /prometheus 37 | # nodeSelector: 38 | # kubernetes.io/hostname: 192.168.1.110 39 | volumes: 40 | - name: config-volume 41 | configMap: 42 | name: prometheus-core 43 | - name: data-volume 44 | hostPath: 45 | # directory location on host 46 | path: /prometheusData 47 | -------------------------------------------------------------------------------- /manifests/grafana-core-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: grafana-core 5 | labels: 6 | app: grafana 7 | component: core 8 | spec: 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: grafana 14 | component: core 15 | spec: 16 | containers: 17 | - image: xiaoping378/grafana-cn:3.1.1 18 | name: grafana-core 19 | # env: 20 | resources: 21 | # keep request = limit to keep this container in guaranteed class 22 | limits: 23 | cpu: 100m 24 | memory: 100Mi 25 | requests: 26 | cpu: 100m 27 | memory: 100Mi 28 | ports: 29 | - name: grafana 30 | containerPort: 3000 31 | env: 32 | # This variable is required to setup templates in Grafana. 33 | # The following env variables are required to make Grafana accessible via 34 | # the kubernetes api-server proxy. On production clusters, we recommend 35 | # removing these env variables, setup auth for grafana, and expose the grafana 36 | # service using a LoadBalancer or a public IP. 
37 | - name: GF_AUTH_BASIC_ENABLED 38 | value: "false" 39 | - name: GF_AUTH_ANONYMOUS_ENABLED 40 | value: "true" 41 | - name: GF_AUTH_ANONYMOUS_ORG_ROLE 42 | value: Admin 43 | # - name: GF_SERVER_ROOT_URL 44 | # value: /api/v1/proxy/namespaces/monitoring/services/grafana/ 45 | volumeMounts: 46 | - name: grafana-persistent-storage 47 | mountPath: /var 48 | # nodeSelector: 49 | # kubernetes.io/hostname: 192.168.1.110 50 | volumes: 51 | - name: grafana-persistent-storage 52 | hostPath: 53 | path: /grafanaData 54 | -------------------------------------------------------------------------------- /manifests/grafana-import-dashboards-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: grafana-import-dashboards 5 | labels: 6 | app: grafana 7 | component: import-dashboards 8 | spec: 9 | template: 10 | metadata: 11 | name: grafana-import-dashboards 12 | labels: 13 | app: grafana 14 | component: import-dashboards 15 | spec: 16 | containers: 17 | - name: grafana-import-dashboards 18 | image: docker 19 | imagePullPolicy: IfNotPresent 20 | command: ["/bin/sh", "-c"] 21 | workingDir: /opt/grafana-import-dashboards 22 | args: 23 | # FIXME use kubernetes probe instead of "until curl" 24 | - > 25 | until $(curl --silent --fail --show-error --output /dev/null http://grafana:3000/api/datasources); do 26 | printf '.' 
; sleep 1 ; 27 | done ; 28 | for file in *-datasource.json ; do 29 | if [ -e "$file" ] ; then 30 | echo "importing $file" && 31 | curl --silent --fail --show-error \ 32 | --request POST http://grafana:3000/api/datasources \ 33 | --header "Content-Type: application/json" \ 34 | --data-binary "@$file" ; 35 | echo "" ; 36 | fi 37 | done ; 38 | for file in *-dashboard.json ; do 39 | if [ -e "$file" ] ; then 40 | echo "importing $file" && 41 | curl --silent --fail --show-error \ 42 | --request POST http://grafana:3000/api/dashboards/import \ 43 | --header "Content-Type: application/json" \ 44 | --data-binary "@$file" ; 45 | echo "" ; 46 | fi 47 | done 48 | 49 | volumeMounts: 50 | - name: config-volume 51 | mountPath: /opt/grafana-import-dashboards 52 | restartPolicy: Never 53 | volumes: 54 | - name: config-volume 55 | configMap: 56 | name: grafana-import-dashboards 57 | -------------------------------------------------------------------------------- /manifests/prometheus-alert-configmap/alertmanager.yml: -------------------------------------------------------------------------------- 1 | global: 2 | # ResolveTimeout is the time after which an alert is declared resolved 3 | # if it has not been updated. 4 | resolve_timeout: 5m 5 | 6 | # The smarthost and SMTP sender used for mail notifications. 7 | smtp_smarthost: 'smtp.zmail300.cn:25' 8 | smtp_from: 'xuxiaoping@300.cn' 9 | smtp_auth_username: 'xuxiaoping@300.cn' 10 | smtp_auth_password: '*******' 11 | # require_tls, don't work in global section. should put it into email config. 12 | # refer to https://github.com/prometheus/alertmanager/issues/193 13 | # smtp_require_tls: false 14 | 15 | # The API URL to use for Slack notifications. 16 | slack_api_url: 'global.slack_api_url' 17 | 18 | # # The auth token for Hipchat. 19 | # hipchat_auth_token: '1234556789' 20 | # 21 | # # Alternative host for Hipchat. 22 | # hipchat_url: 'https://hipchat.foobar.org/' 23 | 24 | # # The directory from which notification templates are read. 
25 | # templates: 26 | # - '/etc/alertmanager/template/*.tmpl' 27 | 28 | # The root route on which each incoming alert enters. 29 | route: 30 | 31 | # The labels by which incoming alerts are grouped together. For example, 32 | # multiple alerts coming in for cluster=A and alertname=LatencyHigh would 33 | # be batched into a single group. 34 | 35 | group_by: ['alertname', 'cluster', 'service'] 36 | 37 | # When a new group of alerts is created by an incoming alert, wait at 38 | # least 'group_wait' to send the initial notification. 39 | # This way ensures that you get multiple alerts for the same group that start 40 | # firing shortly after another are batched together on the first 41 | # notification. 42 | 43 | group_wait: 30s 44 | 45 | # When the first notification was sent, wait 'group_interval' to send a batch 46 | # of new alerts that started firing for that group. 47 | 48 | group_interval: 5m 49 | 50 | # If an alert has successfully been sent, wait 'repeat_interval' to 51 | # resend them. 52 | 53 | repeat_interval: 3h 54 | 55 | # A default receiver 56 | 57 | # If an alert isn't caught by a route, send it to default. 58 | receiver: default 59 | 60 | # All the above attributes are inherited by all child routes and can 61 | # overwritten on each. 62 | 63 | # The child route trees. 64 | routes: 65 | # Send severity=slack alerts to slack. 66 | - match: 67 | severity: slack 68 | receiver: slack_alert 69 | - match: 70 | severity: email 71 | receiver: email_alert 72 | 73 | receivers: 74 | - name: 'default' 75 | slack_configs: 76 | - channel: '#alerts' 77 | 78 | - name: 'slack_alert' 79 | slack_configs: 80 | - channel: '#alerts' 81 | 82 | # # Whether or not to notify about resolved alerts. 83 | # send_resolved: true 84 | # 85 | # # The Slack webhook URL. 86 | # [ api_url: | default = global.slack_api_url ] 87 | # 88 | # # The channel or user to send notifications to. 89 | # channel: 90 | # 91 | # # API request data as defined by the Slack webhook API. 
92 | # [ color: | default = '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' ] 93 | # [ username: | default = '{{ template "slack.default.username" . }}' 94 | # [ title: | default = '{{ template "slack.default.title" . }}' ] 95 | # [ title_link: | default = '{{ template "slack.default.titlelink" . }}' ] 96 | # [ icon_emoji: ] 97 | # [ pretext: | default = '{{ template "slack.default.pretext" . }}' ] 98 | # [ text: | default = '{{ template "slack.default.text" . }}' ] 99 | # [ fallback: | default = '{{ template "slack.default.fallback" . }}' ] 100 | 101 | - name: 'email_alert' 102 | email_configs: 103 | - to: 'xiaoping378@163.com' 104 | # Will override the global section, have a higher priority. 105 | smarthost: 'smtp.zmail300.cn:25' 106 | from: 'xuxiaoping2@300.cn' 107 | auth_username: 'xuxiaoping2@300.cn' 108 | auth_password: '********' 109 | require_tls: false 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kubernetes Setup for Prometheus and Grafana 2 | 3 | ## Quick start 4 | 5 | To quickly start all the things just do this: 6 | ```bash 7 | kubectl create namespace monitoring 8 | kubectl --namespace monitoring create \ 9 | --filename https://raw.githubusercontent.com/xiaoping378/k8s-monitor/master/manifests-all.yaml 10 | ``` 11 | 12 | To shut down all components again: 13 | ```bash 14 | kubectl delete namespace monitoring 15 | ``` 16 | 本项目依赖的所有docker镜像已打包放在百度云上,所有镜像均可以在docker hub上找到 [下载镜像tar包](https://pan.baidu.com/s/1hskbi6o) 17 | 18 | ## More Details 19 | 20 | Alternatively follow these steps to get a feeling for the different components of this setup: 21 | 22 | ```bash 23 | kubectl create --filename manifests/prometheus-core-configmap.yaml 24 | # kubectl get configmaps 25 | # kubectl delete configmaps/prometheus 26 | 27 | kubectl create --filename manifests/prometheus-core-service.yaml 28 | # kubectl get 
services/prometheus 29 | # minikube service prometheus 30 | 31 | kubectl create --filename manifests/prometheus-core-deployment.yaml 32 | # kubectl get --all-namespaces --output wide pods 33 | # kubectl logs prometheus-2556266794-sd260 34 | # kubectl delete pods/prometheus-2556266794-sd260 35 | 36 | kubectl create --filename manifests/prometheus-node-exporter-service.yaml 37 | kubectl create --filename manifests/prometheus-node-exporter-daemonset.yaml 38 | 39 | # create Alertmanager 40 | kubectl create --filename manifests/prometheus-alert-configmap.yaml 41 | kubectl create --filename manifests/prometheus-alert-service.yaml 42 | kubectl create --filename manifests/prometheus-alert-deployment.yaml 43 | 44 | kubectl create --filename manifests/grafana-core-service.yaml 45 | # kubectl get services/grafana 46 | # minikube service grafana 47 | 48 | kubectl create --filename manifests/grafana-core-deployment.yaml 49 | # kubectl get --all-namespaces --output wide pods 50 | ``` 51 | 52 | See grafana.net for some example [dashboards](https://grafana.net/dashboards) and [plugins](https://grafana.net/plugins). 53 | 54 | - Configure [Prometheus](https://grafana.net/plugins/prometheus) data source for Grafana.
55 | `Grafana UI / Data Sources / Add data source` 56 | - `Name`: `prometheus` 57 | - `Type`: `Prometheus` 58 | - `Url`: `http://prometheus:9090` 59 | - `Add` 60 | 61 | - Import [Prometheus Stats](https://grafana.net/dashboards/2):
62 | `Grafana UI / Dashboards / Import` 63 | - `Grafana.net Dashboard`: `https://grafana.net/dashboards/2` 64 | - `Load` 65 | - `Prometheus`: `prometheus` 66 | - `Save & Open` 67 | 68 | - Import [Kubernetes cluster monitoring](https://grafana.net/dashboards/162):
69 | `Grafana UI / Dashboards / Import` 70 | - `Grafana.net Dashboard`: `https://grafana.net/dashboards/162` 71 | - `Load` 72 | - `Prometheus`: `prometheus` 73 | - `Save & Open` 74 | 75 | Instead of manually configuring the datasource and dashboards you can run the following job. It uses the API to configure Grafana to a state similar to when you manually go through the steps described above. 76 | 77 | ```bash 78 | kubectl create --filename manifests/grafana-import-dashboards-job.yaml 79 | ``` 80 | 81 | 82 | ## Create one single manifest file 83 | 84 | ```bash 85 | target="./manifests-all.yaml" 86 | rm "$target" 87 | printf -- "# Derived from ./manifests/*.yaml\n---\n" >> "$target" 88 | for file in ./manifests/*.yaml ; do 89 | if [ -e "$file" ] ; then 90 | cat "$file" >> "$target" 91 | printf -- "---\n" >> "$target" 92 | fi 93 | done 94 | ``` 95 | 96 | ## create configmap file 97 | 98 | ```bash 99 | kubectl create configmap prometheus-core --from-file=manifests/prometheus-core-configmap --output yaml --dry-run > manifests/prometheus-core-configmap.yaml 100 | kubectl create configmap grafana-import-dashboards --from-file=manifests/grafana-import-dashboards-configmap --output yaml --dry-run > manifests/grafana-import-dashboards-configmap.yaml 101 | kubectl create configmap prometheus-alert --from-file=manifests/prometheus-alert-configmap --output yaml --dry-run > manifests/prometheus-alert-configmap.yaml 102 | ``` 103 | 104 | ## Credits 105 | 106 | Based on 107 | ``` 108 | https://github.com/giantswarm/kubernetes-prometheus 109 | ``` 110 | -------------------------------------------------------------------------------- /manifests/prometheus-alert-configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | alertmanager.yml: | 4 | global: 5 | # ResolveTimeout is the time after which an alert is declared resolved 6 | # if it has not been updated. 
7 | resolve_timeout: 5m 8 | 9 | # The smarthost and SMTP sender used for mail notifications. 10 | smtp_smarthost: 'smtp.zmail300.cn:25' 11 | smtp_from: 'xuxiaoping@300.cn' 12 | smtp_auth_username: 'xuxiaoping@300.cn' 13 | smtp_auth_password: '*******' 14 | # require_tls, don't work in global section. should put it into email config. 15 | # refer to https://github.com/prometheus/alertmanager/issues/193 16 | # smtp_require_tls: false 17 | 18 | # The API URL to use for Slack notifications. 19 | slack_api_url: 'global.slack_api_url' 20 | 21 | # # The auth token for Hipchat. 22 | # hipchat_auth_token: '1234556789' 23 | # 24 | # # Alternative host for Hipchat. 25 | # hipchat_url: 'https://hipchat.foobar.org/' 26 | 27 | # # The directory from which notification templates are read. 28 | # templates: 29 | # - '/etc/alertmanager/template/*.tmpl' 30 | 31 | # The root route on which each incoming alert enters. 32 | route: 33 | 34 | # The labels by which incoming alerts are grouped together. For example, 35 | # multiple alerts coming in for cluster=A and alertname=LatencyHigh would 36 | # be batched into a single group. 37 | 38 | group_by: ['alertname', 'cluster', 'service'] 39 | 40 | # When a new group of alerts is created by an incoming alert, wait at 41 | # least 'group_wait' to send the initial notification. 42 | # This way ensures that you get multiple alerts for the same group that start 43 | # firing shortly after another are batched together on the first 44 | # notification. 45 | 46 | group_wait: 30s 47 | 48 | # When the first notification was sent, wait 'group_interval' to send a batch 49 | # of new alerts that started firing for that group. 50 | 51 | group_interval: 5m 52 | 53 | # If an alert has successfully been sent, wait 'repeat_interval' to 54 | # resend them. 55 | 56 | repeat_interval: 3h 57 | 58 | # A default receiver 59 | 60 | # If an alert isn't caught by a route, send it to default. 
61 | receiver: default 62 | 63 | # All the above attributes are inherited by all child routes and can 64 | # overwritten on each. 65 | 66 | # The child route trees. 67 | routes: 68 | # Send severity=slack alerts to slack. 69 | - match: 70 | severity: slack 71 | receiver: slack_alert 72 | - match: 73 | severity: email 74 | receiver: email_alert 75 | 76 | receivers: 77 | - name: 'default' 78 | slack_configs: 79 | - channel: '#alerts' 80 | 81 | - name: 'slack_alert' 82 | slack_configs: 83 | - channel: '#alerts' 84 | 85 | # # Whether or not to notify about resolved alerts. 86 | # send_resolved: true 87 | # 88 | # # The Slack webhook URL. 89 | # [ api_url: | default = global.slack_api_url ] 90 | # 91 | # # The channel or user to send notifications to. 92 | # channel: 93 | # 94 | # # API request data as defined by the Slack webhook API. 95 | # [ color: | default = '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' ] 96 | # [ username: | default = '{{ template "slack.default.username" . }}' 97 | # [ title: | default = '{{ template "slack.default.title" . }}' ] 98 | # [ title_link: | default = '{{ template "slack.default.titlelink" . }}' ] 99 | # [ icon_emoji: ] 100 | # [ pretext: | default = '{{ template "slack.default.pretext" . }}' ] 101 | # [ text: | default = '{{ template "slack.default.text" . }}' ] 102 | # [ fallback: | default = '{{ template "slack.default.fallback" . }}' ] 103 | 104 | - name: 'email_alert' 105 | email_configs: 106 | - to: 'xiaoping378@163.com' 107 | # Will override the global section, have a higher priority. 
108 | smarthost: 'smtp.zmail300.cn:25' 109 | from: 'xuxiaoping2@300.cn' 110 | auth_username: 'xuxiaoping2@300.cn' 111 | auth_password: '********' 112 | require_tls: false 113 | kind: ConfigMap 114 | metadata: 115 | creationTimestamp: null 116 | name: prometheus-alert 117 | -------------------------------------------------------------------------------- /manifests/prometheus-core-configmap/prometheus-alert.rules: -------------------------------------------------------------------------------- 1 | # Alert for any instance that is unreachable for >5 minutes. 2 | ALERT InstanceDown 3 | IF up == 0 4 | FOR 5m 5 | LABELS { severity = "email" } 6 | ANNOTATIONS { 7 | summary = "Instance {{ $labels.instance }} down", 8 | description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.", 9 | } 10 | 11 | ALERT node_cpu_threshold_exceeded 12 | IF 100 * node_load5 > 90 13 | LABELS { severity = "email" } 14 | ANNOTATIONS { 15 | summary = "Instance {{ $labels.instance }} CPU usage is dangerously high", 16 | description = "This device's CPU usage has exceeded the threshold with a value of {{ $value }}.", 17 | } 18 | 19 | ALERT node_memory_threshold_exceeded 20 | IF (node_memory_MemFree+node_memory_Buffers+node_memory_Cached) / node_memory_MemTotal < 0.1 21 | LABELS { severity = "email" } 22 | ANNOTATIONS { 23 | summary = "Instance {{ $labels.instance }} MEM usage is dangerously high", 24 | description = "This device's MEM usage has exceeded the threshold with a value of {{ $value }}.", 25 | } 26 | 27 | ALERT node_fs_threshold_exceeded 28 | IF node_filesystem_free{fstype="rootfs"} / node_filesystem_size{fstype="rootfs"} < 0.2 29 | LABELS { severity = "email" } 30 | ANNOTATIONS { 31 | summary = "Node filesystem usage is high", 32 | description = "Node {{ $labels.instance }}'s filesystem {{ $labels.mountpoint }} has less than 20% disk space remaining." 
33 | } 34 | 35 | ALERT container_cpu_threshold_exceeded 36 | IF rate(container_cpu_user_seconds_total{image!=""}[5m]) * 100 > 90 37 | LABELS { severity = "email" } 38 | ANNOTATIONS { 39 | summary = "Instance {{ $labels.kubernetes_container_name }} CPU usage is dangerously high", 40 | description = "This device's CPU usage has exceeded the threshold with a value of {{ $value }}.", 41 | } 42 | 43 | ALERT FdExhaustionClose 44 | IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 45 | FOR 10m 46 | LABELS { severity = "email" } 47 | ANNOTATIONS { 48 | summary = "file descriptors soon exhausted", 49 | description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", 50 | } 51 | 52 | ALERT ContainerReboot 53 | IF increase(container_last_seen{container_label_io_kubernetes_container_hash!=""}[30s]) < 25 54 | LABELS { severity = "email" } 55 | ANNOTATIONS { 56 | summary = "Container reboot", 57 | description = "{{ $labels.container_label_io_kubernetes_pod_name }}刚刚发生重启, 已经重启过{{ $labels.container_label_io_kubernetes_container_restartCount }}次." 
58 | } 59 | # Annotation templates must use $labels (plural); $label is undefined and breaks template expansion. 60 | ALERT PodRestartingTooMuch 61 | IF rate(kube_pod_container_status_restarts[1m]) > 1/(5*60) 62 | FOR 1h 63 | LABELS { severity="warning" } 64 | ANNOTATIONS { 65 | summary = "Pod {{$labels.namespace}}/{{$labels.pod}} restarting too much.", 66 | description = "Pod {{$labels.namespace}}/{{$labels.pod}} restarting too much.", 67 | } 68 | 69 | ALERT PodSlowToLaunch 70 | IF rate(kubelet_pod_start_latency_microseconds{quantile="0.99"}[1m]) > 5 71 | FOR 1h 72 | LABELS { severity="warning" } 73 | ANNOTATIONS { 74 | summary = "Pods are slow to launch.", 75 | description = "Pods are taking longer than 5 milliseconds to launch.", 76 | } 77 | 78 | 79 | # #etcd monitor 80 | # ALERT HighNumberOfFailedHTTPRequests 81 | # IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) 82 | # / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 83 | # FOR 10m 84 | # LABELS { 85 | # severity = "warning" 86 | # } 87 | # ANNOTATIONS { 88 | # summary = "a high number of HTTP requests are failing", 89 | # description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", 90 | # } 91 | 92 | # ALERT HTTPRequestsSlow 93 | # IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 94 | # FOR 10m 95 | # LABELS { 96 | # severity = "warning" 97 | # } 98 | # ANNOTATIONS { 99 | # summary = "slow HTTP requests", 100 | # description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", 101 | # } 102 | 103 | # ALERT etcdNoLeader 104 | # IF etcd_server_has_leader{job="etcd"} == 0 105 | # FOR 1m 106 | # LABELS { 107 | # severity = "critical" 108 | # } 109 | # ANNOTATIONS { 110 | # summary = "etcd node has no leader", 111 | # description = "etcd node {{ $labels.instance }} has no leader", 112 | # } 113 | 114 | # ALERT InsufficientPeers 115 | # IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) 116 | # FOR 3m 117 | # LABELS { 118 | # 
severity = "critical" 119 | # } 120 | # ANNOTATIONS { 121 | # summary = "etcd cluster small", 122 | # description = "If one more etcd peer goes down the cluster will be unavailable", 123 | # } 124 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Monitoring with Prometheus and Grafana" 3 | description = "Recipe to spin up a monitoring setup with Prometheus and Grafana on Kubernetes." 4 | date = "2016-09-29" 5 | type = "page" 6 | weight = 100 7 | categories = ["recipes"] 8 | +++ 9 | 10 | # Monitoring with Prometheus and Grafana 11 | 12 | [Prometheus](https://prometheus.io/) is an open-source monitoring solution that includes the gathering of metrics, their storage in an internal time series database as well as querying and alerting based on that data. 13 | 14 | It offers a lot of integrations incl. Docker, Kubernetes, etc. 15 | 16 | Prometheus can also visualize your data. However, in this recipe we include another open-source tool, [Grafana](http://grafana.org/), for the visualization part, as it offers a more powerful and flexible way to generate visuals and dashboards. 17 | 18 | If you just want to get Prometheus and Grafana up and running you can deploy the whole recipe with a single command instead of going through all steps detailed out below: 19 | 20 | ```bash 21 | kubectl create --filename manifests/ 22 | ``` 23 | 24 | ## Deploying Prometheus 25 | 26 | First, we need to create the configuration for our Prometheus. For this we use a Config Map, which we later mount into our Prometheus pod to configure it. This way we can change the configuration without having to redeploy Prometheus itself. 27 | 28 | `kubectl create --filename manifests/prometheus-core-configmap.yaml` 29 | 30 | Then, we create a service to be able to access Prometheus. 
31 | 32 | `kubectl create --filename manifests/prometheus-core-service.yaml` 33 | 34 | Finally, we can deploy Prometheus itself. 35 | 36 | `kubectl create --filename manifests/prometheus-core-deployment.yaml` 37 | 38 | Further, we need the Prometheus Node Exporter deployed to each node. For this we use a Daemon Set and a fronting service for Prometheus to be able to access the node exporters. 39 | 40 | ``` 41 | kubectl create --filename manifests/prometheus-node-exporter-service.yaml 42 | kubectl create --filename manifests/prometheus-node-exporter-daemonset.yaml 43 | ``` 44 | 45 | Wait a bit for all the pods to come up. Then Prometheus should be ready and running. We can check the Prometheus targets at https://mycluster.k8s.gigantic.io/api/v1/proxy/namespaces/default/services/prometheus/targets 46 | 47 | ![Prometheus Targets](prometheus_targets.png) 48 | 49 | ## Deploying Alertmanager 50 | First, we need to create the configuration for our Alertmanager. For this we use a Config Map, which we later mount into our Alertmanager pod to configure it. This way we can change the configuration without having to redeploy Alertmanager itself. 51 | 52 | `kubectl create --filename manifests/prometheus-alert-configmap.yaml` 53 | 54 | Then, we create a service to be able to access Alertmanager. 55 | 56 | `kubectl create --filename manifests/prometheus-alert-service.yaml` 57 | 58 | Finally, we can deploy Alertmanager itself. 59 | 60 | `kubectl create --filename manifests/prometheus-alert-deployment.yaml` 61 | 62 | 63 | Wait a bit for all the pods to come up. Then Alertmanager should be ready and running. We can open the Alertmanager UI at 64 | https://mycluster.k8s.gigantic.io/api/v1/proxy/namespaces/default/services/alertmanager/ 65 | 66 | ![Alertmanager](alertmanager.png) 67 | 68 | 69 | ## Deploying Grafana 70 | 71 | Now that we have Prometheus up and running we can deploy Grafana to have a nicer frontend for our metrics.
72 | 73 | Again, we create a service to be able to access Grafana and a deployment to manage the pods. 74 | 75 | ``` 76 | kubectl create --filename manifests/grafana-core-service.yaml 77 | kubectl create --filename manifests/grafana-core-deployment.yaml 78 | ``` 79 | 80 | Wait a bit for Grafana to come up. Then you can access Grafana at https://mycluster.k8s.gigantic.io/api/v1/proxy/namespaces/default/services/grafana/ 81 | 82 | ## Setting Up Grafana 83 | 84 | TLDR: If you don't want to go through all the manual steps below you can let the following job use the API to configure Grafana to a similar state. 85 | 86 | ```bash 87 | kubectl create --filename manifests/grafana-import-dashboards-job.yaml 88 | ``` 89 | 90 | Once we're in Grafana we need to first configure [Prometheus](https://grafana.net/plugins/prometheus) as a data source. 91 | 92 | - `Grafana UI / Data Sources / Add data source` 93 | - `Name`: `prometheus` 94 | - `Type`: `Prometheus` 95 | - `Url`: `http://prometheus:9090` 96 | - `Add` 97 | 98 | ![Grafana Datasource](grafana_datasource.png) 99 | 100 | Then go to the Dashboards tab and import the [Prometheus Stats dashboard](https://grafana.net/dashboards/2), which shows the status of Prometheus itself. 101 | 102 | ![Grafana Datasource Dashboard](grafana_datasource_dashboard.png) 103 | 104 | You can check it out to see how your Prometheus is doing. 105 | 106 | ![Grafana Datasource Dashboard](grafana_prometheus_stats.png) 107 | 108 | Last, but not least, we can import a sample [Kubernetes cluster monitoring dashboard](https://grafana.net/dashboards/162), to get a first overview over our cluster metrics. 109 | 110 | - `Grafana UI / Dashboards / Import` 111 | - `Grafana.net Dashboard`: `https://grafana.net/dashboards/162` 112 | - `Load` 113 | - `Prometheus`: `prometheus` 114 | - `Save & Open` 115 | 116 | ![Grafana Import Dashboard](grafana_import_dashboard.png) 117 | 118 | Voilà. You have a nice first dashboard with metrics of your Kubernetes cluster.
119 | 120 | ![Grafana Import Dashboard](grafana_cluster_overview.png) 121 | 122 | ## Next Steps 123 | 124 | Next, you should get into the [Grafana](http://docs.grafana.org/) and [Prometheus](https://prometheus.io/docs/introduction/overview/) documentation to get to know the tools and either build your own dashboards or extend the samples from above. 125 | 126 | You can also check out grafana.net for some more example [dashboards](https://grafana.net/dashboards) and [plugins](https://grafana.net/plugins). 127 | 128 | More Alertmanager documentation can be found [here](https://prometheus.io/docs/alerting/overview/). 129 | -------------------------------------------------------------------------------- /manifests/prometheus-core-configmap/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 10s 3 | scrape_timeout: 10s 4 | evaluation_interval: 10s 5 | 6 | rule_files: 7 | - '/etc/prometheus/prometheus-alert.rules' 8 | 9 | scrape_configs: 10 | # # The job name is added as a label `job=` to any timeseries scraped from this config. 11 | # - job_name: 'prometheus' 12 | 13 | # # Override the global default and scrape targets from this job every 5 seconds. 14 | # scrape_interval: 5s 15 | 16 | # # metrics_path defaults to '/metrics' 17 | # # scheme defaults to 'http'. 18 | 19 | # static_configs: 20 | # - targets: ['localhost:9090'] 21 | 22 | - job_name: 'kubernetes-cluster' 23 | 24 | # Default to scraping over https. If required, just disable this or change to 25 | # `http`. 26 | scheme: https 27 | 28 | # This TLS & bearer token file config is used to connect to the actual scrape 29 | # endpoints for cluster components. This is separate to discovery auth 30 | # configuration (`in_cluster` below) because discovery & scraping are two 31 | # separate concerns in Prometheus.
32 | tls_config: 33 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 34 | # If your node certificates are self-signed or use a different CA to the 35 | # master CA, then disable certificate verification below. Note that 36 | # certificate verification is an integral part of a secure infrastructure 37 | # so this should only be disabled in a controlled environment. You can 38 | # disable certificate verification by uncommenting the line below. 39 | # 40 | # insecure_skip_verify: true 41 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 42 | 43 | kubernetes_sd_configs: 44 | - api_servers: 45 | - 'https://kubernetes.default.svc' 46 | in_cluster: true 47 | role: apiserver 48 | 49 | - job_name: 'kubernetes-nodes' 50 | 51 | # Nodes are scraped over plain http here (the relabeling below rewrites port 10250 to 10255). Change to 52 | # `https` and drop that port rewrite if your nodes must be scraped over TLS. 53 | scheme: http 54 | 55 | # This TLS & bearer token file config is used to connect to the actual scrape 56 | # endpoints for cluster components. This is separate to discovery auth 57 | # configuration (`in_cluster` below) because discovery & scraping are two 58 | # separate concerns in Prometheus. 59 | tls_config: 60 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 61 | # If your node certificates are self-signed or use a different CA to the 62 | # master CA, then disable certificate verification below. Note that 63 | # certificate verification is an integral part of a secure infrastructure 64 | # so this should only be disabled in a controlled environment. You can 65 | # disable certificate verification by uncommenting the line below.
66 | # 67 | # insecure_skip_verify: true 68 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 69 | 70 | kubernetes_sd_configs: 71 | - api_servers: 72 | - 'https://kubernetes.default.svc' 73 | in_cluster: true 74 | role: node 75 | 76 | relabel_configs: 77 | - action: labelmap 78 | regex: __meta_kubernetes_node_label_(.+) 79 | - source_labels: [__meta_kubernetes_role] 80 | action: replace 81 | target_label: kubernetes_role 82 | - source_labels: [__address__] 83 | regex: '(.*):10250' 84 | replacement: '${1}:10255' 85 | target_label: __address__ 86 | # Scrape config for service endpoints. 87 | # 88 | # The relabeling allows the actual service scrape endpoint to be configured 89 | # via the following annotations: 90 | # 91 | # * `prometheus.io/scrape`: Only scrape services that have a value of `true` 92 | # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need 93 | # to set this to `https` & most likely set the `tls_config` of the scrape config. 94 | # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. 95 | # * `prometheus.io/port`: If the metrics are exposed on a different port to the 96 | # service then set this appropriately. 97 | - job_name: 'kubernetes-service-endpoints' 98 | 99 | kubernetes_sd_configs: 100 | - api_servers: 101 | - 'https://kubernetes.default.svc' 102 | in_cluster: true 103 | role: endpoint 104 | 105 | relabel_configs: 106 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] 107 | action: keep 108 | regex: true 109 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] 110 | action: replace 111 | target_label: __scheme__ 112 | regex: (https?) 
113 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] 114 | action: replace 115 | target_label: __metrics_path__ 116 | regex: (.+) 117 | - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] 118 | action: replace 119 | target_label: __address__ 120 | regex: (.+)(?::\d+);(\d+) 121 | replacement: $1:$2 122 | - action: labelmap 123 | regex: __meta_kubernetes_service_label_(.+) 124 | - source_labels: [__meta_kubernetes_service_namespace] 125 | action: replace 126 | target_label: kubernetes_namespace 127 | - source_labels: [__meta_kubernetes_service_name] 128 | action: replace 129 | target_label: kubernetes_name 130 | 131 | # Example scrape config for probing services via the Blackbox Exporter. 132 | # 133 | # The relabeling allows the actual service scrape endpoint to be configured 134 | # via the following annotations: 135 | # 136 | # * `prometheus.io/probe`: Only probe services that have a value of `true` 137 | - job_name: 'kubernetes-services' 138 | 139 | metrics_path: /probe 140 | params: 141 | module: [http_2xx] 142 | 143 | kubernetes_sd_configs: 144 | - api_servers: 145 | - 'https://kubernetes.default.svc' 146 | in_cluster: true 147 | role: service 148 | 149 | relabel_configs: 150 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] 151 | action: keep 152 | regex: true 153 | - source_labels: [__address__] 154 | target_label: __param_target 155 | - target_label: __address__ 156 | replacement: blackbox 157 | - source_labels: [__param_target] 158 | target_label: instance 159 | - action: labelmap 160 | regex: __meta_kubernetes_service_label_(.+) 161 | - source_labels: [__meta_kubernetes_service_namespace] 162 | target_label: kubernetes_namespace 163 | - source_labels: [__meta_kubernetes_service_name] 164 | target_label: kubernetes_name 165 | 166 | # Example scrape config for pods 167 | # 168 | # The relabeling allows the actual pod scrape endpoint to be configured via the 169 | # 
following annotations: 170 | # 171 | # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` 172 | # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. 173 | # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. 174 | - job_name: 'kubernetes-pods' 175 | 176 | kubernetes_sd_configs: 177 | - api_servers: 178 | - 'https://kubernetes.default.svc' 179 | in_cluster: true 180 | role: pod 181 | 182 | relabel_configs: 183 | - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] 184 | action: keep 185 | regex: true 186 | - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] 187 | action: replace 188 | target_label: __metrics_path__ 189 | regex: (.+) 190 | - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] 191 | action: replace 192 | regex: (.+):(?:\d+);(\d+) 193 | replacement: ${1}:${2} 194 | target_label: __address__ 195 | - action: labelmap 196 | regex: __meta_kubernetes_pod_label_(.+) 197 | - source_labels: [__meta_kubernetes_pod_namespace] 198 | action: replace 199 | target_label: kubernetes_namespace 200 | - source_labels: [__meta_kubernetes_pod_name] 201 | action: replace 202 | target_label: kubernetes_pod_name 203 | -------------------------------------------------------------------------------- /manifests/grafana-import-dashboards-configmap/grafana-net-162-dashboard.json: -------------------------------------------------------------------------------- 1 | 
{"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Kubernetes cluster monitoring (via Prometheus)","tags":["kubernetes"],"style":"dark","timezone":"browser","editable":true,"hideControls":true,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":4,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(sum(node_memory_MemTotal) - sum(node_memory_MemFree+node_memory_Buffers+node_memory_Cached) ) / sum(node_memory_MemTotal) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster memory usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 
0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(sum by (io_kubernetes_container_name)( rate(container_cpu_usage_seconds_total{image!=\"\"}[1m] ) )) / count(node_cpu{mode=\"system\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster CPU usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(sum(node_filesystem_size{device=\"rootfs\"}) - sum(node_filesystem_free{device=\"rootfs\"}) 
) / sum(node_filesystem_size{device=\"rootfs\"}) * 100","interval":"10s","intervalFactor":1,"metric":"","refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster Filesystem usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"title":"Row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":3,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by (io_kubernetes_container_name)( rate(container_cpu_usage_seconds_total{image!=\"\"}[1m] ) )","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_container_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percent","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":2,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sort_desc(sum(container_memory_usage_bytes{image!=\"\"}) by (io_kubernetes_container_name, image))","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_container_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod memory usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":8,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sort_desc(sum by (kubernetes_pod_name) (rate (container_network_receive_bytes_total{name!=\"\", kubernetes_pod_name=~\"op-.*\"}[1m]) ))","interval":"10s","intervalFactor":1,"legendFormat":"{{ kubernetes_pod_name }}","metric":"network","refId":"A","step":10},{"expr":"sort_desc(sum by 
(kubernetes_pod_name) (rate (container_network_transmit_bytes_total{name!=\"\", kubernetes_pod_name=~\"op-.*\"}[1m]) ))","interval":"10s","intervalFactor":1,"legendFormat":"{{ kubernetes_pod_name }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod Network i/o","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New row"}],"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":46,"links":[],"gnetId":162,"description":"Monitor a Kubernetes cluster using Prometheus TSDB. Shows overall cluster CPU / Memory / Disk usage as well as individual pod statistics. 
"},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} 2 | -------------------------------------------------------------------------------- /manifests/grafana-import-dashboards-configmap/grafana-net-193-dashboard.json: -------------------------------------------------------------------------------- 1 | {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"prometheus with cAdvisor as a target","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Docker monitoring","description":"Docker monitoring with Prometheus and cAdvisor","tags":["docker"],"style":"dark","timezone":"browser","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"50","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 
193)","show":false},"targets":[{"expr":"count(container_last_seen{image!=\"\"})","intervalFactor":2,"legendFormat":"","metric":"container_last_seen","refId":"A","step":240}],"thresholds":"","title":"Running containers","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"mbytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":5,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(container_memory_usage_bytes{image!=\"\"})/1024/1024","intervalFactor":2,"legendFormat":"","metric":"container_memory_usage_bytes","refId":"A","step":240}],"thresholds":"","title":"Total Memory Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to 
text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(rate(container_cpu_user_seconds_total{image!=\"\"}[5m]) * 100)","intervalFactor":2,"legendFormat":"","metric":"container_memory_usage_bytes","refId":"A","step":240}],"thresholds":"","title":"Total CPU Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":2,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(container_cpu_user_seconds_total{image!=\"\"}[5m]) * 100","intervalFactor":2,"legendFormat":"{{name}}","metric":"cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"CPU 
Usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percent","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"Row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":1,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"container_memory_usage_bytes{image!=\"\"}","hide":false,"intervalFactor":2,"legendFormat":"{{name}}","metric":"container_memory_usage_bytes","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Memory Usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":"","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":3,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"irate(container_network_receive_bytes_total{image!=\"\"}[5m])","intervalFactor":2,"legendFormat":"{{name}}","metric":"container_network_receive_bytes_total","refId":"A","step":20}],"timeFrom":null,"timeShift":null,"title":"Network Rx","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":4,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"irate(container_network_transmit_bytes_total{image!=\"\"}[5m])","intervalFactor":2,"legendFormat":"{{name}}","refId":"A","step":20}],"timeFrom":null,"timeShift":null,"title":"Network Tx","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New 
row"}],"time":{"from":"now-3h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":26,"links":[],"gnetId":193},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} 2 | -------------------------------------------------------------------------------- /manifests/grafana-import-dashboards-configmap/grafana-net-2-dashboard.json: -------------------------------------------------------------------------------- 1 | {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"text","name":"Text","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Prometheus Stats","tags":[],"style":"dark","timezone":"browser","editable":true,"hideControls":true,"sharedCrosshair":false,"rows":[{"collapse":false,"editable":true,"height":178,"panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","decimals":1,"editable":true,"error":false,"format":"s","id":5,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(time() - 
process_start_time_seconds{job=\"prometheus\"})","intervalFactor":2,"refId":"A","step":4}],"thresholds":"","title":"Uptime","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","id":6,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"targets":[{"expr":"prometheus_local_storage_memory_series","intervalFactor":2,"refId":"A","step":4}],"thresholds":"1,5","title":"Local Storage Memory Series","type":"singlestat","valueFontSize":"70%","valueMaps":[],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","id":7,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 
0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"targets":[{"expr":"prometheus_local_storage_indexing_queue_length","intervalFactor":2,"refId":"A","step":4}],"thresholds":"500,4000","title":"Internal Storage Queue Length","type":"singlestat","valueFontSize":"70%","valueMaps":[{"op":"=","text":"Empty","value":"0"}],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"content":"\"Prometheus\nPrometheus\n\n

You're using Prometheus, an open-source systems monitoring and alerting toolkit originally built at SoundCloud. For more information, check out the Grafana and Prometheus projects.

","editable":true,"error":false,"id":9,"links":[],"mode":"html","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":227,"panels":[{"aliasColors":{"prometheus":"#C15C17","{instance=\"localhost:9090\",job=\"prometheus\"}":"#C15C17"},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":3,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(prometheus_local_storage_ingested_samples_total[5m])","interval":"","intervalFactor":2,"legendFormat":"{{job}}","metric":"","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Samples ingested (rate-5m)","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Samples Ingested\nThis graph displays the count of samples ingested by the Prometheus server, as measured over the last 5 minutes, per time series in the range vector. When troubleshooting an issue on IRC or Github, this is often the first stat requested by the Prometheus team. 
","editable":true,"error":false,"id":8,"links":[],"mode":"markdown","span":2.995914043583536,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{"prometheus":"#F9BA8F","{instance=\"localhost:9090\",interval=\"5s\",job=\"prometheus\"}":"#F9BA8F"},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":2,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":5,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(prometheus_target_interval_length_seconds_count[5m])","intervalFactor":2,"legendFormat":"{{job}}","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Target Scrapes (last 5m)","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":14,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":4,"stack":false,"steppedLine":false,"targets":[{"expr":"prometheus_target_interval_length_seconds{quantile!=\"0.01\", 
quantile!=\"0.05\"}","interval":"","intervalFactor":2,"legendFormat":"{{quantile}} ({{interval}})","metric":"","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Scrape Duration","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Scrapes\nPrometheus scrapes metrics from instrumented jobs, either directly or via an intermediary push gateway for short-lived jobs. Target scrapes will show how frequently targets are scraped, as measured over the last 5 minutes, per time series in the range vector. Scrape Duration will show how long the scrapes are taking, with percentiles available as series. ","editable":true,"error":false,"id":11,"links":[],"mode":"markdown","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":null,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":12,"legend":{"alignAsTable":false,"avg":false,"current":false,"hideEmpty":true,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"prometheus_evaluator_duration_milliseconds{quantile!=\"0.01\", quantile!=\"0.05\"}","interval":"","intervalFactor":2,"legendFormat":"{{quantile}}","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Rule Eval 
Duration","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"percentunit","label":""},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Rule Evaluation Duration\nThis graph panel plots the duration for all evaluations to execute. The 50th percentile, 90th percentile and 99th percentile are shown as three separate series to help identify outliers that may be skewing the data.","editable":true,"error":false,"id":15,"links":[],"mode":"markdown","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"}],"time":{"from":"now-5m","to":"now"},"timepicker":{"now":true,"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":false,"schemaVersion":12,"version":0,"links":[{"icon":"info","tags":[],"targetBlank":true,"title":"Grafana Docs","tooltip":"","type":"link","url":"http://www.grafana.org/docs"},{"icon":"info","tags":[],"targetBlank":true,"title":"Prometheus Docs","type":"link","url":"http://prometheus.io/docs/introduction/overview/"}],"gnetId":2,"description":"The official, pre-built Prometheus Stats Dashboard."},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} 2 | -------------------------------------------------------------------------------- /manifests/prometheus-core-configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | prometheus-alert.rules: "# Alert for any instance that is unreachable for >5 minutes.\nALERT 4 | InstanceDown\n IF up == 0\n FOR 5m\n LABELS { severity = \"email\" }\n ANNOTATIONS 5 | {\n summary = \"Instance {{ $labels.instance }} down\",\n description = 
6 | \"{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 7 | 5 minutes.\",\n }\n\nALERT node_cpu_threshold_exceeded \n IF 100 * node_load5 8 | > 90\n LABELS { severity = \"email\" }\n ANNOTATIONS {\n summary = \"Instance 9 | {{ $labels.instance }} CPU usage is dangerously high\",\n description = \"This 10 | device's CPU usage has exceeded the threshold with a value of {{ $value }}.\",\n 11 | \ }\n\nALERT node_memory_threshold_exceeded \n IF (node_memory_MemFree+node_memory_Buffers+node_memory_Cached) 12 | / node_memory_MemTotal < 0.1\n LABELS { severity = \"email\" }\n ANNOTATIONS 13 | {\n summary = \"Instance {{ $labels.instance }} MEM usage is dangerously high\",\n 14 | \ description = \"This device's MEM usage has exceeded the threshold with a 15 | value of {{ $value }}.\",\n }\n\nALERT node_fs_threshold_exceeded\n IF node_filesystem_free{fstype=\"rootfs\"} 16 | / node_filesystem_size{fstype=\"rootfs\"} < 0.2\n LABELS { severity = \"email\" 17 | }\n ANNOTATIONS {\n summary = \"Node filesystem usage is high\",\n description 18 | = \"Node {{ $labels.instance }}'s filesystem {{ $labels.mountpoint }} has less 19 | than 20% disk space remaining.\"\n }\n\nALERT container_cpu_threshold_exceeded 20 | \ \n IF rate(container_cpu_user_seconds_total{image!=\"\"}[5m]) * 100 > 90\n 21 | \ LABELS { severity = \"email\" }\n ANNOTATIONS {\n summary = \"Instance {{ 22 | $labels.kubernetes_container_name }} CPU usage is dangerously high\",\n description 23 | = \"This device's CPU usage has exceeded the threshold with a value of {{ $value 24 | }}.\",\n }\n\nALERT FdExhaustionClose\n IF predict_linear(instance:fd_utilization[1h], 25 | 3600 * 4) > 1\n FOR 10m\n LABELS { severity = \"email\" }\n ANNOTATIONS {\n 26 | \ summary = \"file descriptors soon exhausted\",\n description = \"{{ $labels.job 27 | }} instance {{ $labels.instance }} will exhaust in file descriptors soon\",\n 28 | \ }\n\nALERT ContainerReboot\n IF 
increase(container_last_seen{container_label_io_kubernetes_container_hash!=\"\"}[30s]) 29 | < 25\n LABELS { severity = \"email\" }\n ANNOTATIONS {\n summary = \"Container 30 | reboot\",\n description = \"{{ $labels.container_label_io_kubernetes_pod_name 31 | }}刚刚发生重启, 已经重启过{{ $labels.container_label_io_kubernetes_container_restartCount 32 | }}次.\"\n }\n\nALERT PodRestartingTooMuch\n IF rate(kube_pod_container_status_restarts[1m]) 33 | > 1/(5*60)\n FOR 1h\n LABELS { severity=\"warning\" }\n ANNOTATIONS 34 | {\n summary = \"Pod {{$labels.namespace}}/{{$labels.pod}} restarting too much.\",\n 35 | \ description = \"Pod {{$labels.namespace}}/{{$labels.pod}} restarting too much.\",\n 36 | \ }\n\nALERT PodSlowToLaunch\n IF rate(kubelet_pod_start_latency_microseconds{quantile=\"0.99\"}[1m]) 37 | > 5\n FOR 1h\n LABELS { severity=\"warning\" }\n ANNOTATIONS {\n 38 | \ summary = \"Pods are slow to launch.\",\n description = \"Pods are taking 39 | longer than 5 milliseconds to launch.\",\n }\n\n\n# #etcd monitor\n# ALERT HighNumberOfFailedHTTPRequests\n# 40 | IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd\"}[5m]))\n# / sum 41 | by(method) (rate(etcd_http_received_total{job=\"etcd\"}[5m])) > 0.01\n# FOR 10m\n# 42 | LABELS {\n# severity = \"warning\"\n# }\n# ANNOTATIONS {\n# summary = \"a 43 | high number of HTTP requests are failing\",\n# description = \"{{ $value }}% 44 | of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance 45 | }}\",\n# }\n\n# ALERT HTTPRequestsSlow\n# IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) 46 | > 0.15\n# FOR 10m\n# LABELS {\n# severity = \"warning\"\n# }\n# ANNOTATIONS 47 | {\n# summary = \"slow HTTP requests\",\n# description = \"on etcd instance 48 | {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow\",\n# }\n\n# 49 | ALERT etcdNoLeader\n# IF etcd_server_has_leader{job=\"etcd\"} == 0\n# FOR 1m\n# 50 | LABELS {\n# severity = \"critical\"\n# }\n# ANNOTATIONS 
{\n# summary = \"etcd 51 | node has no leader\",\n# description = \"etcd node {{ $labels.instance }} has 52 | no leader\",\n# }\n\n# ALERT InsufficientPeers\n# IF count(up{job=\"etcd\"} == 53 | 0) > (count(up{job=\"etcd\"}) / 2 - 1)\n# FOR 3m\n# LABELS {\n# severity = \"critical\"\n# 54 | }\n# ANNOTATIONS {\n# summary = \"etcd cluster small\",\n# description = \"If 55 | one more etcd peer goes down the cluster will be unavailable\",\n# }\n" 56 | prometheus-record.rules: |2 57 | 58 | instance:fd_utilization = process_open_fds / process_max_fds 59 | prometheus.yml: | 60 | global: 61 | scrape_interval: 10s 62 | scrape_timeout: 10s 63 | evaluation_interval: 10s 64 | 65 | rule_files: 66 | - '/etc/prometheus/prometheus-alert.rules' 67 | 68 | scrape_configs: 69 | # # The job name is added as a label `job=` to any timeseries scraped from this config. 70 | # - job_name: 'prometheus' 71 | 72 | # # Override the global default and scrape targets from this job every 5 seconds. 73 | # scrape_interval: 5s 74 | 75 | # # metrics_path defaults to '/metrics' 76 | # # scheme defaults to 'http'. 77 | 78 | # static_configs: 79 | # - targets: ['localhost:9090'] 80 | 81 | - job_name: 'kubernetes-cluster' 82 | 83 | # Default to scraping over https. If required, just disable this or change to 84 | # `http`. 85 | scheme: https 86 | 87 | # This TLS & bearer token file config is used to connect to the actual scrape 88 | # endpoints for cluster components. This is separate to discovery auth 89 | # configuration (`in_cluster` below) because discovery & scraping are two 90 | # separate concerns in Prometheus. 91 | tls_config: 92 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 93 | # If your node certificates are self-signed or use a different CA to the 94 | # master CA, then disable certificate verification below. Note that 95 | # certificate verification is an integral part of a secure infrastructure 96 | # so this should only be disabled in a controlled environment. 
You can 97 | # disable certificate verification by uncommenting the line below. 98 | # 99 | # insecure_skip_verify: true 100 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 101 | 102 | kubernetes_sd_configs: 103 | - api_servers: 104 | - 'https://kubernetes.default.svc' 105 | in_cluster: true 106 | role: apiserver 107 | 108 | - job_name: 'kubernetes-nodes' 109 | 110 | # Default to scraping over https. If required, just disable this or change to 111 | # `http`. 112 | scheme: http 113 | 114 | # This TLS & bearer token file config is used to connect to the actual scrape 115 | # endpoints for cluster components. This is separate to discovery auth 116 | # configuration (`in_cluster` below) because discovery & scraping are two 117 | # separate concerns in Prometheus. 118 | tls_config: 119 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 120 | # If your node certificates are self-signed or use a different CA to the 121 | # master CA, then disable certificate verification below. Note that 122 | # certificate verification is an integral part of a secure infrastructure 123 | # so this should only be disabled in a controlled environment. You can 124 | # disable certificate verification by uncommenting the line below. 125 | # 126 | # insecure_skip_verify: true 127 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 128 | 129 | kubernetes_sd_configs: 130 | - api_servers: 131 | - 'https://kubernetes.default.svc' 132 | in_cluster: true 133 | role: node 134 | 135 | relabel_configs: 136 | - action: labelmap 137 | regex: __meta_kubernetes_node_label_(.+) 138 | - source_labels: [__meta_kubernetes_role] 139 | action: replace 140 | target_label: kubernetes_role 141 | - source_labels: [__address__] 142 | regex: '(.*):10250' 143 | replacement: '${1}:10255' 144 | target_label: __address__ 145 | # Scrape config for service endpoints. 
146 | # 147 | # The relabeling allows the actual service scrape endpoint to be configured 148 | # via the following annotations: 149 | # 150 | # * `prometheus.io/scrape`: Only scrape services that have a value of `true` 151 | # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need 152 | # to set this to `https` & most likely set the `tls_config` of the scrape config. 153 | # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. 154 | # * `prometheus.io/port`: If the metrics are exposed on a different port to the 155 | # service then set this appropriately. 156 | - job_name: 'kubernetes-service-endpoints' 157 | 158 | kubernetes_sd_configs: 159 | - api_servers: 160 | - 'https://kubernetes.default.svc' 161 | in_cluster: true 162 | role: endpoint 163 | 164 | relabel_configs: 165 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] 166 | action: keep 167 | regex: true 168 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] 169 | action: replace 170 | target_label: __scheme__ 171 | regex: (https?) 172 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] 173 | action: replace 174 | target_label: __metrics_path__ 175 | regex: (.+) 176 | - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] 177 | action: replace 178 | target_label: __address__ 179 | regex: (.+)(?::\d+);(\d+) 180 | replacement: $1:$2 181 | - action: labelmap 182 | regex: __meta_kubernetes_service_label_(.+) 183 | - source_labels: [__meta_kubernetes_service_namespace] 184 | action: replace 185 | target_label: kubernetes_namespace 186 | - source_labels: [__meta_kubernetes_service_name] 187 | action: replace 188 | target_label: kubernetes_name 189 | 190 | # Example scrape config for probing services via the Blackbox Exporter. 
191 | # 192 | # The relabeling allows the actual service scrape endpoint to be configured 193 | # via the following annotations: 194 | # 195 | # * `prometheus.io/probe`: Only probe services that have a value of `true` 196 | - job_name: 'kubernetes-services' 197 | 198 | metrics_path: /probe 199 | params: 200 | module: [http_2xx] 201 | 202 | kubernetes_sd_configs: 203 | - api_servers: 204 | - 'https://kubernetes.default.svc' 205 | in_cluster: true 206 | role: service 207 | 208 | relabel_configs: 209 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] 210 | action: keep 211 | regex: true 212 | - source_labels: [__address__] 213 | target_label: __param_target 214 | - target_label: __address__ 215 | replacement: blackbox 216 | - source_labels: [__param_target] 217 | target_label: instance 218 | - action: labelmap 219 | regex: __meta_kubernetes_service_label_(.+) 220 | - source_labels: [__meta_kubernetes_service_namespace] 221 | target_label: kubernetes_namespace 222 | - source_labels: [__meta_kubernetes_service_name] 223 | target_label: kubernetes_name 224 | 225 | # Example scrape config for pods 226 | # 227 | # The relabeling allows the actual pod scrape endpoint to be configured via the 228 | # following annotations: 229 | # 230 | # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` 231 | # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. 232 | # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. 
233 | - job_name: 'kubernetes-pods' 234 | 235 | kubernetes_sd_configs: 236 | - api_servers: 237 | - 'https://kubernetes.default.svc' 238 | in_cluster: true 239 | role: pod 240 | 241 | relabel_configs: 242 | - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] 243 | action: keep 244 | regex: true 245 | - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] 246 | action: replace 247 | target_label: __metrics_path__ 248 | regex: (.+) 249 | - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] 250 | action: replace 251 | regex: (.+):(?:\d+);(\d+) 252 | replacement: ${1}:${2} 253 | target_label: __address__ 254 | - action: labelmap 255 | regex: __meta_kubernetes_pod_label_(.+) 256 | - source_labels: [__meta_kubernetes_pod_namespace] 257 | action: replace 258 | target_label: kubernetes_namespace 259 | - source_labels: [__meta_kubernetes_pod_name] 260 | action: replace 261 | target_label: kubernetes_pod_name 262 | kind: ConfigMap 263 | metadata: 264 | creationTimestamp: null 265 | name: prometheus-core 266 | -------------------------------------------------------------------------------- /manifests/grafana-import-dashboards-configmap/grafana-net-315-dashboard.json: -------------------------------------------------------------------------------- 1 | {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Kubernetes resources usage monitoring (via Prometheus)","description":"Monitors Kubernetes cluster using Prometheus. 
Shows overall cluster CPU / Memory / Filesystem usage as well as individual pod, containers, systemd services statistics. Uses cAdvisor metrics only.","tags":["kubernetes"],"style":"dark","timezone":"browser","editable":true,"hideControls":false,"sharedCrosshair":false,"rows":[{"title":"Network I/O pressure","height":"200px","editable":true,"collapse":false,"panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)","thresholdLine":false},"id":32,"isNew":true,"legend":{"alignAsTable":false,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":false,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"legendFormat":"Received","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"legendFormat":"Sent","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Network I/O 
pressure","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":false}],"height":"200px","transparent":false}]},{"collapse":false,"editable":true,"height":"250px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":4,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster memory usage","transparent":false,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster CPU usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum 
(container_fs_usage_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (container_fs_limit_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"legendFormat":"","metric":"","refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster filesystem usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":9,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"20%","prefix":"","prefixFontSize":"20%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":10,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":11,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":" cores","postfixFontSize":"30%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (rate 
(container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":12,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":" cores","postfixFontSize":"30%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":13,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to 
text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_usage_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":14,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_limit_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"showTitle":false,"title":"Total 
usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":17,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_pod_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"showTitle":false,"title":"Pods CPU usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"height":"","id":23,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (systemd_service_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"{{ systemd_service_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"System services CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"System services CPU usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":24,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"hideEmpty":false,"hideZero":false,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":null,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate 
(container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",io_kubernetes_container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"container_cpu","refId":"A","step":10},{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"container_cpu","refId":"B","step":10},{"expr":"sum (rate (container_cpu_usage_seconds_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","interval":"10s","intervalFactor":1,"legendFormat":"rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"container_cpu","refId":"C","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers CPU usage"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":20,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"{{ id }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percent","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"repeat":null,"showTitle":false,"title":"All processes CPU usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":25,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by 
(io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_pod_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Pods memory usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":26,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (systemd_service_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ systemd_service_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"System services memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"System services memory 
usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":27,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",io_kubernetes_container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}) by (io_kubernetes_container_name, io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10},{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, name, image)","interval":"10s","intervalFactor":1,"legendFormat":"docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"container_memory_usage:sort_desc","refId":"B","step":10},{"expr":"sum (container_memory_working_set_bytes{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, rkt_container_name)","interval":"10s","intervalFactor":1,"legendFormat":"rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"container_memory_usage:sort_desc","refId":"C","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers memory 
usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers memory usage"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":28,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"{{ id }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"All processes memory usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":16,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"-> {{ io_kubernetes_pod_name }}","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"<- {{ io_kubernetes_pod_name }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods network I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Pods network I/O"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":30,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"network","refId":"B","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"network","refId":"D","step":10},{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"network","refId":"C","step":10},{"expr":"sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) 
by (kubernetes_io_hostname, rkt_container_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"network","refId":"E","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"network","refId":"F","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers network I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers network I/O"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":29,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"-> {{ id }}","metric":"network","refId":"A","step":10},{"expr":"- sum (rate 
(container_network_transmit_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"<- {{ id }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes network I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"All processes network I/O"}],"time":{"from":"now-30m","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"multi":false,"name":"Node","options":[],"query":"label_values(kubernetes_io_hostname)","refresh":1,"type":"query"}]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":18,"links":[],"gnetId":315},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} 2 | -------------------------------------------------------------------------------- /manifests/grafana-import-dashboards-configmap.yaml: -------------------------------------------------------------------------------- 1 | # based on the content created by: 2 | # kubectl create configmap grafana-import-dashboards --from-file=manifests/grafana-import-dashboards-configmap --output yaml 3 | 4 | apiVersion: v1 5 | kind: ConfigMap 6 | metadata: 7 | name: grafana-import-dashboards 8 | labels: 9 | app: grafana 10 | component: import-dashboards 11 | data: 12 | grafana-net-2-dashboard.json: | 13 | 
{"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"text","name":"Text","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Prometheus Stats","tags":[],"style":"dark","timezone":"browser","editable":true,"hideControls":true,"sharedCrosshair":false,"rows":[{"collapse":false,"editable":true,"height":178,"panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","decimals":1,"editable":true,"error":false,"format":"s","id":5,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(time() - process_start_time_seconds{job=\"prometheus\"})","intervalFactor":2,"refId":"A","step":4}],"thresholds":"","title":"Uptime","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","id":6,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"targets":[{"expr":"prometheus_local_storage_memory_series","intervalFactor":2,"refId":"A","step":4}],"thresholds":"1,5","title":"Local Storage Memory Series","type":"singlestat","valueFontSize":"70%","valueMaps":[],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","id":7,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"targets":[{"expr":"prometheus_local_storage_indexing_queue_length","intervalFactor":2,"refId":"A","step":4}],"thresholds":"500,4000","title":"Interal Storage Queue Length","type":"singlestat","valueFontSize":"70%","valueMaps":[{"op":"=","text":"Empty","value":"0"}],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"content":"\"Prometheus\nPrometheus\n\n

You're using Prometheus, an open-source systems monitoring and alerting toolkit originally built at SoundCloud. For more information, check out the Grafana and Prometheus projects.

","editable":true,"error":false,"id":9,"links":[],"mode":"html","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":227,"panels":[{"aliasColors":{"prometheus":"#C15C17","{instance=\"localhost:9090\",job=\"prometheus\"}":"#C15C17"},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":3,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(prometheus_local_storage_ingested_samples_total[5m])","interval":"","intervalFactor":2,"legendFormat":"{{job}}","metric":"","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Samples ingested (rate-5m)","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Samples Ingested\nThis graph displays the count of samples ingested by the Prometheus server, as measured over the last 5 minutes, per time series in the range vector. When troubleshooting an issue on IRC or Github, this is often the first stat requested by the Prometheus team. 
","editable":true,"error":false,"id":8,"links":[],"mode":"markdown","span":2.995914043583536,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{"prometheus":"#F9BA8F","{instance=\"localhost:9090\",interval=\"5s\",job=\"prometheus\"}":"#F9BA8F"},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":2,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":5,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(prometheus_target_interval_length_seconds_count[5m])","intervalFactor":2,"legendFormat":"{{job}}","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Target Scrapes (last 5m)","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":14,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":4,"stack":false,"steppedLine":false,"targets":[{"expr":"prometheus_target_interval_length_seconds{quantile!=\"0.01\", 
quantile!=\"0.05\"}","interval":"","intervalFactor":2,"legendFormat":"{{quantile}} ({{interval}})","metric":"","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Scrape Duration","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Scrapes\nPrometheus scrapes metrics from instrumented jobs, either directly or via an intermediary push gateway for short-lived jobs. Target scrapes will show how frequently targets are scraped, as measured over the last 5 minutes, per time series in the range vector. Scrape Duration will show how long the scrapes are taking, with percentiles available as series. ","editable":true,"error":false,"id":11,"links":[],"mode":"markdown","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":null,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":12,"legend":{"alignAsTable":false,"avg":false,"current":false,"hideEmpty":true,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"prometheus_evaluator_duration_milliseconds{quantile!=\"0.01\", quantile!=\"0.05\"}","interval":"","intervalFactor":2,"legendFormat":"{{quantile}}","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Rule Eval 
Duration","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"percentunit","label":""},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Rule Evaluation Duration\nThis graph panel plots the duration for all evaluations to execute. The 50th percentile, 90th percentile and 99th percentile are shown as three separate series to help identify outliers that may be skewing the data.","editable":true,"error":false,"id":15,"links":[],"mode":"markdown","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"}],"time":{"from":"now-5m","to":"now"},"timepicker":{"now":true,"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":false,"schemaVersion":12,"version":0,"links":[{"icon":"info","tags":[],"targetBlank":true,"title":"Grafana Docs","tooltip":"","type":"link","url":"http://www.grafana.org/docs"},{"icon":"info","tags":[],"targetBlank":true,"title":"Prometheus Docs","type":"link","url":"http://prometheus.io/docs/introduction/overview/"}],"gnetId":2,"description":"The official, pre-built Prometheus Stats Dashboard."},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} 14 | grafana-net-162-dashboard.json: | 15 | 
{"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Kubernetes cluster monitoring (via Prometheus)","tags":["kubernetes"],"style":"dark","timezone":"browser","editable":true,"hideControls":true,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":4,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(sum(node_memory_MemTotal) - sum(node_memory_MemFree+node_memory_Buffers+node_memory_Cached) ) / sum(node_memory_MemTotal) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster memory usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 
0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(sum by (io_kubernetes_container_name)( rate(container_cpu_usage_seconds_total{image!=\"\"}[1m] ) )) / count(node_cpu{mode=\"system\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster CPU usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(sum(node_filesystem_size{device=\"rootfs\"}) - sum(node_filesystem_free{device=\"rootfs\"}) 
) / sum(node_filesystem_size{device=\"rootfs\"}) * 100","interval":"10s","intervalFactor":1,"metric":"","refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster Filesystem usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"title":"Row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":3,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by (io_kubernetes_container_name)( rate(container_cpu_usage_seconds_total{image!=\"\"}[1m] ) )","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_container_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percent","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":2,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sort_desc(sum(container_memory_usage_bytes{image!=\"\"}) by (io_kubernetes_container_name, image))","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_container_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod memory usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":8,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sort_desc(sum by (kubernetes_pod_name) (rate (container_network_receive_bytes_total{name!=\"\", kubernetes_pod_name=~\"op-.*\"}[1m]) ))","interval":"10s","intervalFactor":1,"legendFormat":"{{ kubernetes_pod_name }}","metric":"network","refId":"A","step":10},{"expr":"sort_desc(sum by 
(kubernetes_pod_name) (rate (container_network_transmit_bytes_total{name!=\"\", kubernetes_pod_name=~\"op-.*\"}[1m]) ))","interval":"10s","intervalFactor":1,"legendFormat":"{{ kubernetes_pod_name }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod Network i/o","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New row"}],"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":46,"links":[],"gnetId":162,"description":"Monitor a Kubernetes cluster using Prometheus TSDB. Shows overall cluster CPU / Memory / Disk usage as well as individual pod statistics. 
"},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} 16 | grafana-net-193-dashboard.json: | 17 | {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"prometheus with cAdvisor as a target","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Docker monitoring","description":"Docker monitoring with Prometheus and cAdvisor","tags":["docker"],"style":"dark","timezone":"browser","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"50","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"count(container_last_seen{image!=\"\"})","intervalFactor":2,"legendFormat":"","metric":"container_last_seen","refId":"A","step":240}],"thresholds":"","title":"Running 
containers","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"mbytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":5,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(container_memory_usage_bytes{image!=\"\"})/1024/1024","intervalFactor":2,"legendFormat":"","metric":"container_memory_usage_bytes","refId":"A","step":240}],"thresholds":"","title":"Total Memory Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to 
text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(rate(container_cpu_user_seconds_total{image!=\"\"}[5m]) * 100)","intervalFactor":2,"legendFormat":"","metric":"container_memory_usage_bytes","refId":"A","step":240}],"thresholds":"","title":"Total CPU Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":2,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(container_cpu_user_seconds_total{image!=\"\"}[5m]) * 100","intervalFactor":2,"legendFormat":"{{name}}","metric":"cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"CPU 
Usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percent","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"Row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":1,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"container_memory_usage_bytes{image!=\"\"}","hide":false,"intervalFactor":2,"legendFormat":"{{name}}","metric":"container_memory_usage_bytes","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Memory Usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":"","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":3,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"irate(container_network_receive_bytes_total{image!=\"\"}[5m])","intervalFactor":2,"legendFormat":"{{name}}","metric":"container_network_receive_bytes_total","refId":"A","step":20}],"timeFrom":null,"timeShift":null,"title":"Network Rx","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":4,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"irate(container_network_transmit_bytes_total{image!=\"\"}[5m])","intervalFactor":2,"legendFormat":"{{name}}","refId":"A","step":20}],"timeFrom":null,"timeShift":null,"title":"Network Tx","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New 
row"}],"time":{"from":"now-3h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":26,"links":[],"gnetId":193},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} 18 | grafana-net-315-dashboard.json: | 19 | {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Kubernetes resources usage monitoring (via Prometheus)","description":"Monitors Kubernetes cluster using Prometheus. Shows overall cluster CPU / Memory / Filesystem usage as well as individual pod, containers, systemd services statistics. 
Uses cAdvisor metrics only.","tags":["kubernetes"],"style":"dark","timezone":"browser","editable":true,"hideControls":false,"sharedCrosshair":false,"rows":[{"title":"Network I/O pressure","height":"200px","editable":true,"collapse":false,"panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)","thresholdLine":false},"id":32,"isNew":true,"legend":{"alignAsTable":false,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":false,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"legendFormat":"Received","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"legendFormat":"Sent","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Network I/O pressure","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":false}],"height":"200px","transparent":false}]},{"collapse":false,"editable":true,"height":"250px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":4,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster memory usage","transparent":false,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (rate 
(container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster CPU usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_usage_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (container_fs_limit_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"legendFormat":"","metric":"","refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster filesystem usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":9,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"20%","prefix":"","prefixFontSize":"20%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":10,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum 
(machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":11,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":" cores","postfixFontSize":"30%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":12,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to 
text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":" cores","postfixFontSize":"30%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":13,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_usage_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":14,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_limit_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"showTitle":false,"title":"Total usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":17,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ 
io_kubernetes_pod_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"showTitle":false,"title":"Pods CPU usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":23,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (systemd_service_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"{{ systemd_service_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"System services CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"System services CPU 
usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":24,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"hideEmpty":false,"hideZero":false,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":null,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",io_kubernetes_container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"container_cpu","refId":"A","step":10},{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"container_cpu","refId":"B","step":10},{"expr":"sum (rate (container_cpu_usage_seconds_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","interval":"10s","intervalFactor":1,"legendFormat":"rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"container_cpu","refId":"C","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers CPU 
usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers CPU usage"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":20,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"{{ id }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"repeat":null,"showTitle":false,"title":"All processes CPU usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 
112, 112, 0.22)"},"id":25,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_pod_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Pods memory usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":26,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by 
(systemd_service_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ systemd_service_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"System services memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"System services memory usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":27,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",io_kubernetes_container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}) by (io_kubernetes_container_name, io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10},{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, name, image)","interval":"10s","intervalFactor":1,"legendFormat":"docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name 
}})","metric":"container_memory_usage:sort_desc","refId":"B","step":10},{"expr":"sum (container_memory_working_set_bytes{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, rkt_container_name)","interval":"10s","intervalFactor":1,"legendFormat":"rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"container_memory_usage:sort_desc","refId":"C","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers memory usage"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":28,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"{{ id }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes memory 
usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"All processes memory usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":16,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"-> {{ io_kubernetes_pod_name }}","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"<- {{ io_kubernetes_pod_name }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods network 
I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Pods network I/O"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":30,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"network","refId":"B","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"network","refId":"D","step":10},{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, 
image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"network","refId":"C","step":10},{"expr":"sum (rate (container_network_receive_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"network","refId":"E","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"network","refId":"F","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers network I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers network I/O"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":29,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"-> {{ id }}","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"<- {{ id }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes network I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"All processes network I/O"}],"time":{"from":"now-30m","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"multi":false,"name":"Node","options":[],"query":"label_values(kubernetes_io_hostname)","refresh":1,"type":"query"}]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":18,"links":[],"gnetId":315},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} 20 | prometheus-datasource.json: | 21 | { 22 | "name": "prometheus", 23 | 
"type": "prometheus", 24 | "url": "http://prometheus:9090", 25 | "access": "proxy", 26 | "basicAuth": false 27 | } 28 | --------------------------------------------------------------------------------