├── docs
    ├── alertmanager.png
    ├── grafana_datasource.png
    ├── prometheus_targets.png
    ├── grafana_cluster_overview.png
    ├── grafana_import_dashboard.png
    ├── grafana_prometheus_stats.png
    ├── grafana_datasource_dashboard.png
    └── index.md
├── manifests
    ├── prometheus-core-configmap
    │   ├── prometheus-record.rules
    │   ├── prometheus-alert.rules
    │   └── prometheus.yml
    ├── grafana-import-dashboards-configmap
    │   ├── prometheus-datasource.json
    │   ├── grafana-net-162-dashboard.json
    │   ├── grafana-net-193-dashboard.json
    │   ├── grafana-net-2-dashboard.json
    │   └── grafana-net-315-dashboard.json
    ├── kube-metrics-service.yaml
    ├── grafana-core-service.yaml
    ├── prometheus-alert-service.yaml
    ├── prometheus-core-service.yaml
    ├── prometheus-node-exporter-service.yaml
    ├── kube-metrics-deployment.yaml
    ├── prometheus-node-exporter-daemonset.yaml
    ├── prometheus-alert-deployment.yaml
    ├── prometheus-core-deployment.yaml
    ├── grafana-core-deployment.yaml
    ├── grafana-import-dashboards-job.yaml
    ├── prometheus-alert-configmap
    │   └── alertmanager.yml
    ├── prometheus-alert-configmap.yaml
    ├── prometheus-core-configmap.yaml
    └── grafana-import-dashboards-configmap.yaml
└── README.md


/docs/alertmanager.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/alertmanager.png


--------------------------------------------------------------------------------
/docs/grafana_datasource.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/grafana_datasource.png


--------------------------------------------------------------------------------
/docs/prometheus_targets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/prometheus_targets.png


--------------------------------------------------------------------------------
/docs/grafana_cluster_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/grafana_cluster_overview.png


--------------------------------------------------------------------------------
/docs/grafana_import_dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/grafana_import_dashboard.png


--------------------------------------------------------------------------------
/docs/grafana_prometheus_stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/grafana_prometheus_stats.png


--------------------------------------------------------------------------------
/docs/grafana_datasource_dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoping378/k8s-monitor/HEAD/docs/grafana_datasource_dashboard.png


--------------------------------------------------------------------------------
/manifests/prometheus-core-configmap/prometheus-record.rules:
--------------------------------------------------------------------------------
1 | 
2 | instance:fd_utilization = process_open_fds / process_max_fds
3 | 


--------------------------------------------------------------------------------
/manifests/grafana-import-dashboards-configmap/prometheus-datasource.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "prometheus",
3 |   "type": "prometheus",
4 |   "url": "http://prometheus:9090",
5 |   "access": "proxy",
6 |   "basicAuth": false
7 | }
8 | 


--------------------------------------------------------------------------------
/manifests/kube-metrics-service.yaml:
--------------------------------------------------------------------------------
 1 | kind: Service
 2 | apiVersion: v1
 3 | metadata:
 4 |   name: kube-state-metrics
 5 |   annotations:
 6 |     prometheus.io/scrape: 'true'
 7 |   labels:
 8 |     app: kube-state-metrics
 9 | spec:
10 |   type: NodePort
11 |   selector:
12 |     app: kube-state-metrics
13 |   ports:
14 |   - name: kube-state-metrics
15 |     port: 8080
16 | 


--------------------------------------------------------------------------------
/manifests/grafana-core-service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: grafana
 5 |   labels:
 6 |     app: grafana
 7 |     component: core
 8 |   # annotations:
 9 |   #   prometheus.io/scrape: 'true'
10 | spec:
11 |   type: NodePort
12 |   ports:
13 |     - port: 3000
14 |       nodePort: 31000
15 |   selector:
16 |     app: grafana
17 |     component: core
18 | 


--------------------------------------------------------------------------------
/manifests/prometheus-alert-service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: alertmanager
 5 |   labels:
 6 |     app: prometheus
 7 |     component: alert
 8 |   annotations:
 9 |     prometheus.io/scrape: 'true'
10 | spec:
11 |   selector:
12 |     app: prometheus
13 |     component: alert
14 |   type: NodePort
15 |   ports:
16 |   - port: 9093
17 |     nodePort: 30093
18 |     protocol: TCP
19 | 


--------------------------------------------------------------------------------
/manifests/prometheus-core-service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: prometheus
 5 |   labels:
 6 |     app: prometheus
 7 |     component: core
 8 |   annotations:
 9 |     prometheus.io/scrape: 'true'
10 | spec:
11 |   type: NodePort
12 |   ports:
13 |     - port: 9090
14 |       nodePort: 30900
15 |       protocol: TCP
16 |       name: webui
17 |   selector:
18 |     app: prometheus
19 |     component: core
20 | 


--------------------------------------------------------------------------------
/manifests/prometheus-node-exporter-service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   annotations:
 5 |     prometheus.io/scrape: 'true'
 6 |   name: prometheus-node-exporter
 7 |   labels:
 8 |     app: prometheus
 9 |     component: node-exporter
10 | spec:
11 |   clusterIP: None
12 |   ports:
13 |     - name: prometheus-node-exporter
14 |       port: 9100
15 |       protocol: TCP
16 |   selector:
17 |     app: prometheus
18 |     component: node-exporter
19 |   type: ClusterIP
20 | 


--------------------------------------------------------------------------------
/manifests/kube-metrics-deployment.yaml:
--------------------------------------------------------------------------------
 1 | kind: Deployment
 2 | apiVersion: extensions/v1beta1
 3 | metadata:
 4 |   name: kube-state-metrics-deployment
 5 | spec:
 6 |   replicas: 1
 7 |   template:
 8 |     metadata:
 9 |       labels:
10 |         app: kube-state-metrics
11 |     spec:
12 |       containers:
13 |       - name: kube-state-metrics
14 |         image: gcr.io/google_containers/kube-state-metrics:v0.2.0
15 |         ports:
16 |         - name: exporter
17 |           containerPort: 8080
18 |         resources:
19 |           requests:
20 |             cpu: 10m
21 |             memory: 10Mi
22 |           limits:
23 |             cpu: 100m
24 |             memory: 50Mi
25 | 


--------------------------------------------------------------------------------
/manifests/prometheus-node-exporter-daemonset.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: extensions/v1beta1
 2 | kind: DaemonSet
 3 | metadata:
 4 |   name: prometheus-node-exporter
 5 |   labels:
 6 |     app: prometheus
 7 |     component: node-exporter
 8 | spec:
 9 |   template:
10 |     metadata:
11 |       name: prometheus-node-exporter
12 |       labels:
13 |         app: prometheus
14 |         component: node-exporter
15 |     spec:
16 |       containers:
17 |       - image: prom/node-exporter:0.12.0
18 |         name: prometheus-node-exporter
19 |         ports:
20 |         - name: prom-node-exp
21 |           #^ must be an IANA_SVC_NAME (at most 15 characters, ..)
22 |           containerPort: 9100
23 |           hostPort: 9100
24 |       hostNetwork: true
25 |       hostPID: true
26 | 


--------------------------------------------------------------------------------
/manifests/prometheus-alert-deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: extensions/v1beta1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: prometheus-alert
 5 | spec:
 6 |   replicas: 1
 7 |   selector:
 8 |     matchLabels:
 9 |       app: prometheus
10 |       component: alert
11 |   template:
12 |     metadata:
13 |       name: alertmanager
14 |       labels:
15 |         app: prometheus  # app/component must match selector.matchLabels above
16 |         component: alert
17 |     spec:
18 |       containers:
19 |       - name: alertmanager
20 |         image: prom/alertmanager:v0.4.2
21 |         args:
22 |           - '-config.file=/etc/alertmanager/alertmanager.yml'  # provided by config-volume
23 |           - '-storage.path=/alertmanager'  # backed by the "alertmanager" volume mount below
24 |         ports:
25 |         - name: alertmanager
26 |           containerPort: 9093
27 |         volumeMounts:
28 |         - name: config-volume
29 |           mountPath: /etc/alertmanager
30 |         - name: alertmanager
31 |           mountPath: /alertmanager  # must equal -storage.path, otherwise the volume is unused
32 |       volumes:
33 |       - name: config-volume
34 |         configMap:
35 |           name: prometheus-alert
36 |       - name: alertmanager
37 |         emptyDir: {}  # scratch storage; contents are lost when the pod is removed
38 | 


--------------------------------------------------------------------------------
/manifests/prometheus-core-deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: extensions/v1beta1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: prometheus-core
 5 |   labels:
 6 |     app: prometheus
 7 |     component: core
 8 | spec:
 9 |   replicas: 1
10 |   selector:
11 |     matchLabels:
12 |       app: prometheus
13 |       component: core
14 |   template:
15 |     metadata:
16 |       name: prometheus-main
17 |       labels:
18 |         app: prometheus  # app/component must match selector.matchLabels above
19 |         component: core
20 |     spec:
21 |       containers:
22 |       - name: prometheus
23 |         image: prom/prometheus:v1.2.3
24 |         args:
25 |           - '-storage.local.retention=6h'  # local sample retention period
26 |           - '-storage.local.memory-chunks=500000'
27 |           - '-config.file=/etc/prometheus/prometheus.yml'  # file comes from config-volume
28 |           - '-alertmanager.url=http://alertmanager:9093'  # host/port of the alertmanager Service
29 |         ports:
30 |         - name: webui
31 |           containerPort: 9090
32 |         volumeMounts:
33 |         - name: config-volume
34 |           mountPath: /etc/prometheus
35 |         - name: data-volume
36 |           mountPath: /prometheus
37 |       # nodeSelector:
38 |       #   kubernetes.io/hostname: 192.168.1.110
39 |       volumes:
40 |       - name: config-volume
41 |         configMap:
42 |           name: prometheus-core
43 |       - name: data-volume
44 |         hostPath:
45 |           # Directory on the node running the pod. NOTE(review): data does not follow the pod to another node; consider enabling the nodeSelector above.
46 |           path: /prometheusData
47 | 


--------------------------------------------------------------------------------
/manifests/grafana-core-deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: extensions/v1beta1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: grafana-core
 5 |   labels:
 6 |     app: grafana
 7 |     component: core
 8 | spec:
 9 |   replicas: 1
10 |   template:
11 |     metadata:
12 |       labels:
13 |         app: grafana  # app/component are what the grafana Service selects on
14 |         component: core
15 |     spec:
16 |       containers:
17 |         - image: xiaoping378/grafana-cn:3.1.1
18 |           name: grafana-core
19 |           # (env is declared further down, after ports)
20 |           resources:
21 |             # keep request = limit to keep this container in guaranteed class
22 |             limits:
23 |               cpu: 100m
24 |               memory: 100Mi
25 |             requests:
26 |               cpu: 100m
27 |               memory: 100Mi
28 |           ports:
29 |             - name: grafana
30 |               containerPort: 3000
31 |           env:
32 |             # The variables below turn off basic auth and give anonymous users the
33 |             # Admin role, which makes Grafana usable without login, e.g. via
34 |             # the kubernetes api-server proxy. On production clusters, we recommend
35 |             # removing these env variables, setting up auth for grafana, and exposing
36 |             # the grafana service using a LoadBalancer or a public IP.
37 |             - name: GF_AUTH_BASIC_ENABLED
38 |               value: "false"
39 |             - name: GF_AUTH_ANONYMOUS_ENABLED
40 |               value: "true"
41 |             - name: GF_AUTH_ANONYMOUS_ORG_ROLE
42 |               value: Admin
43 |             # - name: GF_SERVER_ROOT_URL
44 |             #   value: /api/v1/proxy/namespaces/monitoring/services/grafana/
45 |           volumeMounts:
46 |           - name: grafana-persistent-storage
47 |             mountPath: /var  # NOTE(review): covers the whole /var of the image, not only Grafana's data dir; confirm intended
48 |       # nodeSelector:
49 |       #   kubernetes.io/hostname: 192.168.1.110
50 |       volumes:
51 |       - name: grafana-persistent-storage
52 |         hostPath:
53 |           path: /grafanaData  # directory on the node that runs the pod
54 | 


--------------------------------------------------------------------------------
/manifests/grafana-import-dashboards-job.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: batch/v1
 2 | kind: Job
 3 | metadata:
 4 |   name: grafana-import-dashboards
 5 |   labels:
 6 |     app: grafana
 7 |     component: import-dashboards
 8 | spec:
 9 |   template:
10 |     metadata:
11 |       name: grafana-import-dashboards
12 |       labels:
13 |         app: grafana
14 |         component: import-dashboards
15 |     spec:
16 |       containers:
17 |       - name: grafana-import-dashboards
18 |         image: docker
19 |         imagePullPolicy: IfNotPresent
20 |         command: ["/bin/sh", "-c"]
21 |         workingDir: /opt/grafana-import-dashboards
22 |         args:
23 |           # FIXME use kubernetes probe instead of "until curl"
24 |           - >
25 |             until $(curl --silent --fail --show-error --output /dev/null http://grafana:3000/api/datasources); do
26 |               printf '.' ; sleep 1 ;
27 |             done ;
28 |             for file in *-datasource.json ; do
29 |               if [ -e "$file" ] ; then
30 |                 echo "importing $file" &&
31 |                 curl --silent --fail --show-error \
32 |                   --request POST http://grafana:3000/api/datasources \
33 |                   --header "Content-Type: application/json" \
34 |                   --data-binary "@$file" ;
35 |                 echo "" ;
36 |               fi
37 |             done ;
38 |             for file in *-dashboard.json ; do
39 |               if [ -e "$file" ] ; then
40 |                 echo "importing $file" &&
41 |                 curl --silent --fail --show-error \
42 |                   --request POST http://grafana:3000/api/dashboards/import \
43 |                   --header "Content-Type: application/json" \
44 |                   --data-binary "@$file" ;
45 |                 echo "" ;
46 |               fi
47 |             done
48 | 
49 |         volumeMounts:
50 |         - name: config-volume
51 |           mountPath: /opt/grafana-import-dashboards
52 |       restartPolicy: Never
53 |       volumes:
54 |       - name: config-volume
55 |         configMap:
56 |           name: grafana-import-dashboards
57 | 


--------------------------------------------------------------------------------
/manifests/prometheus-alert-configmap/alertmanager.yml:
--------------------------------------------------------------------------------
  1 | global:
  2 |   # ResolveTimeout is the time after which an alert is declared resolved
  3 |   # if it has not been updated.
  4 |   resolve_timeout: 5m
  5 | 
  6 |   # The smarthost and SMTP sender used for mail notifications.
  7 |   smtp_smarthost: 'smtp.zmail300.cn:25'
  8 |   smtp_from: 'xuxiaoping@300.cn'
  9 |   smtp_auth_username: 'xuxiaoping@300.cn'
 10 |   smtp_auth_password: '*******'
 11 |   # require_tls does not work in the global section; set it in the email config instead.
 12 |   # refer to https://github.com/prometheus/alertmanager/issues/193
 13 |   # smtp_require_tls: false
 14 | 
 15 |   # The API URL to use for Slack notifications.
 16 |   slack_api_url: 'global.slack_api_url'
 17 | 
 18 |   # # The auth token for Hipchat.
 19 |   # hipchat_auth_token: '1234556789'
 20 |   #
 21 |   # # Alternative host for Hipchat.
 22 |   # hipchat_url: 'https://hipchat.foobar.org/'
 23 | 
 24 | # # The directory from which notification templates are read.
 25 | # templates:
 26 | # - '/etc/alertmanager/template/*.tmpl'
 27 | 
 28 | # The root route on which each incoming alert enters.
 29 | route:
 30 | 
 31 |   # The labels by which incoming alerts are grouped together. For example,
 32 |   # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
 33 |   # be batched into a single group.
 34 | 
 35 |   group_by: ['alertname', 'cluster', 'service']
 36 | 
 37 |   # When a new group of alerts is created by an incoming alert, wait at
 38 |   # least 'group_wait' to send the initial notification.
 39 |   # This way ensures that you get multiple alerts for the same group that start
 40 |   # firing shortly after another are batched together on the first
 41 |   # notification.
 42 | 
 43 |   group_wait: 30s
 44 | 
 45 |   # When the first notification was sent, wait 'group_interval' to send a batch
 46 |   # of new alerts that started firing for that group.
 47 | 
 48 |   group_interval: 5m
 49 | 
 50 |   # If an alert has successfully been sent, wait 'repeat_interval' to
 51 |   # resend them.
 52 | 
 53 |   repeat_interval: 3h
 54 | 
 55 |   # A default receiver
 56 | 
 57 |   # If an alert isn't caught by a route, send it to default.
 58 |   receiver: default
 59 | 
 60 |   # All the above attributes are inherited by all child routes and can be
 61 |   # overwritten on each.
 62 | 
 63 |   # The child route trees.
 64 |   routes:
 65 |   # Send severity=slack alerts to slack.
 66 |   - match:
 67 |       severity: slack
 68 |     receiver: slack_alert
 69 |   - match:
 70 |       severity: email
 71 |     receiver: email_alert
 72 | 
 73 | receivers:
 74 | - name: 'default'
 75 |   slack_configs:
 76 |   - channel: '#alerts'
 77 | 
 78 | - name: 'slack_alert'
 79 |   slack_configs:
 80 |   - channel: '#alerts'
 81 | 
 82 |     # # Whether or not to notify about resolved alerts.
 83 |     # send_resolved: true
 84 |     #
 85 |     # # The Slack webhook URL.
 86 |     # [ api_url: <string> | default = global.slack_api_url ]
 87 |     #
 88 |     # # The channel or user to send notifications to.
 89 |     # channel: <tmpl_string>
 90 |     #
 91 |     # # API request data as defined by the Slack webhook API.
 92 |     # [ color: <tmpl_string> | default = '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' ]
 93 |     # [ username: <tmpl_string> | default = '{{ template "slack.default.username" . }}'
 94 |     # [ title: <tmpl_string> | default = '{{ template "slack.default.title" . }}' ]
 95 |     # [ title_link: <tmpl_string> | default = '{{ template "slack.default.titlelink" . }}' ]
 96 |     # [ icon_emoji: <tmpl_string> ]
 97 |     # [ pretext: <tmpl_string> | default = '{{ template "slack.default.pretext" . }}' ]
 98 |     # [ text: <tmpl_string> | default = '{{ template "slack.default.text" . }}' ]
 99 |     # [ fallback: <tmpl_string> | default = '{{ template "slack.default.fallback" . }}' ]
100 | 
101 | - name: 'email_alert'
102 |   email_configs:
103 |   - to: 'xiaoping378@163.com'
104 |     # Will override the global section, have a higher priority.
105 |     smarthost: 'smtp.zmail300.cn:25'
106 |     from: 'xuxiaoping2@300.cn'
107 |     auth_username: 'xuxiaoping2@300.cn'
108 |     auth_password: '********'
109 |     require_tls: false
110 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Kubernetes Setup for Prometheus and Grafana
  2 | 
  3 | ## Quick start
  4 | 
  5 | To quickly start all the things just do this:
  6 | ```bash
  7 | kubectl create namespace monitoring
  8 | kubectl --namespace monitoring create \
  9 |   --filename https://raw.githubusercontent.com/xiaoping378/k8s-monitor/master/manifests-all.yaml
 10 | ```
 11 | 
 12 | To shut down all components again:
 13 | ```bash
 14 | kubectl delete namespace monitoring
 15 | ```
 16 | 本项目依赖的所有docker镜像已打包放在百度云上,所有镜像均可以在docker hub上找到 [下载镜像tar包](https://pan.baidu.com/s/1hskbi6o)
 17 | 
 18 | ## More Details
 19 | 
 20 | Alternatively follow these steps to get a feeling for the different components of this setup:
 21 | 
 22 | ```bash
 23 | kubectl create --filename manifests/prometheus-core-configmap.yaml
 24 | # kubectl get configmaps
 25 | # kubectl delete configmaps/prometheus-core
 26 | 
 27 | kubectl create --filename manifests/prometheus-core-service.yaml
 28 | # kubectl get services/prometheus
 29 | # minikube service prometheus
 30 | 
 31 | kubectl create --filename manifests/prometheus-core-deployment.yaml
 32 | # kubectl get --all-namespaces --output wide pods
 33 | # kubectl logs prometheus-2556266794-sd260
 34 | # kubectl delete pods/prometheus-2556266794-sd260
 35 | 
 36 | kubectl create --filename manifests/prometheus-node-exporter-service.yaml
 37 | kubectl create --filename manifests/prometheus-node-exporter-daemonset.yaml
 38 | 
 39 | # create Alertmanager
 40 | kubectl create --filename manifests/prometheus-alert-configmap.yaml
 41 | kubectl create --filename manifests/prometheus-alert-service.yaml
 42 | kubectl create --filename manifests/prometheus-alert-deployment.yaml
 43 | 
 44 | kubectl create --filename manifests/grafana-core-service.yaml
 45 | # kubectl get services/grafana
 46 | # minikube service grafana
 47 | 
 48 | kubectl create --filename manifests/grafana-core-deployment.yaml
 49 | # kubectl get --all-namespaces --output wide pods
 50 | ```
 51 | 
 52 | See grafana.net for some example [dashboards](https://grafana.net/dashboards) and [plugins](https://grafana.net/plugins).
 53 | 
 54 | - Configure [Prometheus](https://grafana.net/plugins/prometheus) data source for Grafana.<br/>
 55 | `Grafana UI / Data Sources / Add data source`
 56 |   - `Name`: `prometheus`
 57 |   - `Type`: `Prometheus`
 58 |   - `Url`: `http://prometheus:9090`
 59 |   - `Add`
 60 | 
 61 | - Import [Prometheus Stats](https://grafana.net/dashboards/2):<br/>
 62 |   `Grafana UI / Dashboards / Import`
 63 |   - `Grafana.net Dashboard`: `https://grafana.net/dashboards/2`
 64 |   - `Load`
 65 |   - `Prometheus`: `prometheus`
 66 |   - `Save & Open`
 67 | 
 68 | - Import [Kubernetes cluster monitoring](https://grafana.net/dashboards/162):<br/>
 69 |   `Grafana UI / Dashboards / Import`
 70 |   - `Grafana.net Dashboard`: `https://grafana.net/dashboards/162`
 71 |   - `Load`
 72 |   - `Prometheus`: `prometheus`
 73 |   - `Save & Open`
 74 | 
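 75 | Instead of manually configuring the datasource and dashboards you can run the following job. It uses the API to configure Grafana to a state similar to when you manually go through the steps described above. The job mounts the `grafana-import-dashboards` ConfigMap, so create `manifests/grafana-import-dashboards-configmap.yaml` before starting it.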
 76 | 
 77 | ```bash
 78 | kubectl create --filename manifests/grafana-import-dashboards-job.yaml
 79 | ```
 80 | 
 81 | 
 82 | ## Create one single manifest file
 83 | 
 84 | ```bash
 85 | target="./manifests-all.yaml"
 86 | rm -f "$target"  # -f: do not fail on the first run, when the file does not exist yet
 87 | printf -- "# Derived from ./manifests/*.yaml\n---\n" >> "$target"
 88 | for file in ./manifests/*.yaml ; do
 89 |   if [ -e "$file" ] ; then  # skips the unexpanded pattern when nothing matches
 90 |      cat "$file" >> "$target"
 91 |      printf -- "---\n" >> "$target"  # YAML document separator between manifests
 92 |   fi
 93 | done
 94 | ```
 95 | 
 96 | ## create configmap file
 97 | 
 98 | ```bash
 99 | kubectl create configmap prometheus-core --from-file=manifests/prometheus-core-configmap --output yaml --dry-run > manifests/prometheus-core-configmap.yaml
100 | kubectl create configmap grafana-import-dashboards --from-file=manifests/grafana-import-dashboards-configmap --output yaml --dry-run > manifests/grafana-import-dashboards-configmap.yaml
101 | kubectl create configmap prometheus-alert --from-file=manifests/prometheus-alert-configmap --output yaml --dry-run > manifests/prometheus-alert-configmap.yaml
102 | ```
103 | 
104 | ## Credits
105 | 
106 | Based on
107 | ```
108 | https://github.com/giantswarm/kubernetes-prometheus
109 | ```
110 | 


--------------------------------------------------------------------------------
/manifests/prometheus-alert-configmap.yaml:
--------------------------------------------------------------------------------
  1 | apiVersion: v1
  2 | data:
  3 |   alertmanager.yml: |
  4 |     global:
  5 |       # ResolveTimeout is the time after which an alert is declared resolved
  6 |       # if it has not been updated.
  7 |       resolve_timeout: 5m
  8 | 
  9 |       # The smarthost and SMTP sender used for mail notifications.
 10 |       smtp_smarthost: 'smtp.zmail300.cn:25'
 11 |       smtp_from: 'xuxiaoping@300.cn'
 12 |       smtp_auth_username: 'xuxiaoping@300.cn'
 13 |       smtp_auth_password: '*******'
 14 |       # require_tls does not work in the global section; set it in the email config instead.
 15 |       # refer to https://github.com/prometheus/alertmanager/issues/193
 16 |       # smtp_require_tls: false
 17 | 
 18 |       # The API URL to use for Slack notifications.
 19 |       slack_api_url: 'global.slack_api_url'
 20 | 
 21 |       # # The auth token for Hipchat.
 22 |       # hipchat_auth_token: '1234556789'
 23 |       #
 24 |       # # Alternative host for Hipchat.
 25 |       # hipchat_url: 'https://hipchat.foobar.org/'
 26 | 
 27 |     # # The directory from which notification templates are read.
 28 |     # templates:
 29 |     # - '/etc/alertmanager/template/*.tmpl'
 30 | 
 31 |     # The root route on which each incoming alert enters.
 32 |     route:
 33 | 
 34 |       # The labels by which incoming alerts are grouped together. For example,
 35 |       # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
 36 |       # be batched into a single group.
 37 | 
 38 |       group_by: ['alertname', 'cluster', 'service']
 39 | 
 40 |       # When a new group of alerts is created by an incoming alert, wait at
 41 |       # least 'group_wait' to send the initial notification.
 42 |       # This way ensures that you get multiple alerts for the same group that start
 43 |       # firing shortly after another are batched together on the first
 44 |       # notification.
 45 | 
 46 |       group_wait: 30s
 47 | 
 48 |       # When the first notification was sent, wait 'group_interval' to send a batch
 49 |       # of new alerts that started firing for that group.
 50 | 
 51 |       group_interval: 5m
 52 | 
 53 |       # If an alert has successfully been sent, wait 'repeat_interval' to
 54 |       # resend them.
 55 | 
 56 |       repeat_interval: 3h
 57 | 
 58 |       # A default receiver
 59 | 
 60 |       # If an alert isn't caught by a route, send it to default.
 61 |       receiver: default
 62 | 
 63 |       # All the above attributes are inherited by all child routes and can be
 64 |       # overwritten on each.
 65 | 
 66 |       # The child route trees.
 67 |       routes:
 68 |       # Send severity=slack alerts to slack.
 69 |       - match:
 70 |           severity: slack
 71 |         receiver: slack_alert
 72 |       - match:
 73 |           severity: email
 74 |         receiver: email_alert
 75 | 
 76 |     receivers:
 77 |     - name: 'default'
 78 |       slack_configs:
 79 |       - channel: '#alerts'
 80 | 
 81 |     - name: 'slack_alert'
 82 |       slack_configs:
 83 |       - channel: '#alerts'
 84 | 
 85 |         # # Whether or not to notify about resolved alerts.
 86 |         # send_resolved: true
 87 |         #
 88 |         # # The Slack webhook URL.
 89 |         # [ api_url: <string> | default = global.slack_api_url ]
 90 |         #
 91 |         # # The channel or user to send notifications to.
 92 |         # channel: <tmpl_string>
 93 |         #
 94 |         # # API request data as defined by the Slack webhook API.
 95 |         # [ color: <tmpl_string> | default = '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' ]
 96 |         # [ username: <tmpl_string> | default = '{{ template "slack.default.username" . }}'
 97 |         # [ title: <tmpl_string> | default = '{{ template "slack.default.title" . }}' ]
 98 |         # [ title_link: <tmpl_string> | default = '{{ template "slack.default.titlelink" . }}' ]
 99 |         # [ icon_emoji: <tmpl_string> ]
100 |         # [ pretext: <tmpl_string> | default = '{{ template "slack.default.pretext" . }}' ]
101 |         # [ text: <tmpl_string> | default = '{{ template "slack.default.text" . }}' ]
102 |         # [ fallback: <tmpl_string> | default = '{{ template "slack.default.fallback" . }}' ]
103 | 
104 |     - name: 'email_alert'
105 |       email_configs:
106 |       - to: 'xiaoping378@163.com'
107 |         # Will override the global section, have a higher priority.
108 |         smarthost: 'smtp.zmail300.cn:25'
109 |         from: 'xuxiaoping2@300.cn'
110 |         auth_username: 'xuxiaoping2@300.cn'
111 |         auth_password: '********'
112 |         require_tls: false
113 | kind: ConfigMap
114 | metadata:
115 |   creationTimestamp: null
116 |   name: prometheus-alert
117 | 


--------------------------------------------------------------------------------
/manifests/prometheus-core-configmap/prometheus-alert.rules:
--------------------------------------------------------------------------------
  1 | # Alert for any instance that is unreachable for >5 minutes.
  2 | ALERT InstanceDown
  3 |   IF up == 0
  4 |   FOR 5m
  5 |   LABELS { severity = "email" }
  6 |   ANNOTATIONS {
  7 |     summary = "Instance {{ $labels.instance }} down",
  8 |     description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
  9 |   }
 10 | 
 11 | ALERT node_cpu_threshold_exceeded  
 12 |   IF 100 * node_load5 > 90
 13 |   LABELS { severity = "email" }
 14 |   ANNOTATIONS {
 15 |     summary = "Instance {{ $labels.instance }} CPU usage is dangerously high",
 16 |     description = "This device's CPU usage has exceeded the threshold with a value of {{ $value }}.",
 17 |   }
 18 | 
 19 | ALERT node_memory_threshold_exceeded  
 20 |   IF (node_memory_MemFree+node_memory_Buffers+node_memory_Cached) / node_memory_MemTotal < 0.1
 21 |   LABELS { severity = "email" }
 22 |   ANNOTATIONS {
 23 |     summary = "Instance {{ $labels.instance }} MEM usage is dangerously high",
 24 |     description = "This device's MEM usage has exceeded the threshold with a value of {{ $value }}.",
 25 |   }
 26 | 
 27 | ALERT node_fs_threshold_exceeded
 28 |   IF node_filesystem_free{fstype="rootfs"} / node_filesystem_size{fstype="rootfs"} < 0.2
 29 |   LABELS { severity = "email" }
 30 |   ANNOTATIONS {
 31 |     summary = "Node filesystem usage is high",
 32 |     description = "Node {{ $labels.instance }}'s filesystem {{ $labels.mountpoint }} has less than 20% disk space remaining."
 33 |   }
 34 | 
 35 | ALERT container_cpu_threshold_exceeded  
 36 |   IF rate(container_cpu_user_seconds_total{image!=""}[5m]) * 100 > 90
 37 |   LABELS { severity = "email" }
 38 |   ANNOTATIONS {
 39 |     summary = "Instance {{ $labels.kubernetes_container_name }} CPU usage is dangerously high",
 40 |     description = "This device's CPU usage has exceeded the threshold with a value of {{ $value }}.",
 41 |   }
 42 | 
 43 | ALERT FdExhaustionClose
 44 |   IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
 45 |   FOR 10m
 46 |   LABELS { severity = "email" }
 47 |   ANNOTATIONS {
 48 |     summary = "file descriptors soon exhausted",
 49 |     description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
 50 |   }
 51 | 
 52 | ALERT ContainerReboot
 53 |   IF increase(container_last_seen{container_label_io_kubernetes_container_hash!=""}[30s]) < 25
 54 |   LABELS { severity = "email" }
 55 |   ANNOTATIONS {
 56 |     summary = "Container reboot",
 57 |     description = "{{ $labels.container_label_io_kubernetes_pod_name }}刚刚发生重启， 已经重启过{{ $labels.container_label_io_kubernetes_container_restartCount }}次."
 58 |   }
 59 | 
 60 | ALERT PodRestartingTooMuch
 61 |   IF          rate(kube_pod_container_status_restarts[1m]) > 1/(5*60)
 62 |   FOR         1h
 63 |   LABELS      { severity="warning" }
 64 |   ANNOTATIONS {
 65 |     summary = "Pod {{$labels.namespace}}/{{$labels.pod}} restarting too much.",
 66 |     description = "Pod {{$labels.namespace}}/{{$labels.pod}} restarting too much.",
 67 |   }
 68 | 
 69 | ALERT PodSlowToLaunch
 70 |   IF          rate(kubelet_pod_start_latency_microseconds{quantile="0.99"}[1m]) > 5
 71 |   FOR         1h
 72 |   LABELS      { severity="warning" }
 73 |   ANNOTATIONS {
 74 |     summary = "Pods are slow to launch.",
 75 |     description = "Pods are taking longer than 5 milliseconds to launch.",
 76 |   }
 77 | 
 78 | 
 79 | # #etcd monitor
 80 | # ALERT HighNumberOfFailedHTTPRequests
 81 | # IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
 82 | #   / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
 83 | # FOR 10m
 84 | # LABELS {
 85 | #   severity = "warning"
 86 | # }
 87 | # ANNOTATIONS {
 88 | #   summary = "a high number of HTTP requests are failing",
 89 | #   description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
 90 | # }
 91 | 
 92 | # ALERT HTTPRequestsSlow
 93 | # IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
 94 | # FOR 10m
 95 | # LABELS {
 96 | #   severity = "warning"
 97 | # }
 98 | # ANNOTATIONS {
 99 | #   summary = "slow HTTP requests",
100 | #   description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
101 | # }
102 | 
103 | # ALERT etcdNoLeader
104 | # IF etcd_server_has_leader{job="etcd"} == 0
105 | # FOR 1m
106 | # LABELS {
107 | #   severity = "critical"
108 | # }
109 | # ANNOTATIONS {
110 | #   summary = "etcd node has no leader",
111 | #   description = "etcd node {{ $labels.instance }} has no leader",
112 | # }
113 | 
114 | # ALERT InsufficientPeers
115 | # IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
116 | # FOR 3m
117 | # LABELS {
118 | #   severity = "critical"
119 | # }
120 | # ANNOTATIONS {
121 | #   summary = "etcd cluster small",
122 | #   description = "If one more etcd peer goes down the cluster will be unavailable",
123 | # }
124 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
  1 | +++
  2 | title = "Monitoring with Prometheus and Grafana"
  3 | description = "Recipe to spin up a monitoring setup with Prometheus and Grafana on Kubernetes."
  4 | date = "2016-09-29"
  5 | type = "page"
  6 | weight = 100
  7 | categories = ["recipes"]
  8 | +++
  9 | 
 10 | # Monitoring with Prometheus and Grafana
 11 | 
 12 | [Prometheus](https://prometheus.io/) is an open-source monitoring solution that includes the gathering of metrics, their storage in an internal time series database as well as querying and alerting based on that data.
 13 | 
 14 | It offers a lot of integrations incl. Docker, Kubernetes, etc.
 15 | 
 16 | Prometheus can also visualize your data. However, in this recipe we include another open-source tool, [Grafana](http://grafana.org/), for the visualization part, as it offers a more powerful and flexible way to generate visuals and dashboards.
 17 | 
 18 | If you just want to get Prometheus and Grafana up and running you can deploy the whole recipe with a single command instead of going through all steps detailed out below:
 19 | 
 20 | ```bash
 21 | kubectl create --filename manifests/
 22 | ```
 23 | 
 24 | ## Deploying Prometheus
 25 | 
 26 | First, we need to create the configuration for our Prometheus. For this we use a Config Map, which we later mount into our Prometheus pod to configure it. This way we can change the configuration without having to redeploy Prometheus itself.
 27 | 
 28 | `kubectl create --filename manifests/prometheus-core-configmap.yaml`
 29 | 
 30 | Then, we create a service to be able to access Prometheus.
 31 | 
 32 | `kubectl create --filename manifests/prometheus-core-service.yaml`
 33 | 
 34 | Finally, we can deploy Prometheus itself.
 35 | 
 36 | `kubectl create --filename manifests/prometheus-core-deployment.yaml`
 37 | 
 38 | Further, we need the Prometheus Node Exporter deployed to each node. For this we use a Daemon Set and a fronting service for Prometheus to be able to access the node exporters.
 39 | 
 40 | ```
 41 | kubectl create --filename manifests/prometheus-node-exporter-service.yaml
 42 | kubectl create --filename manifests/prometheus-node-exporter-daemonset.yaml
 43 | ```
 44 | 
 45 | Wait a bit for all the pods to come up. Then Prometheus should be ready and running. We can check the Prometheus targets at https://mycluster.k8s.gigantic.io/api/v1/proxy/namespaces/default/services/prometheus/targets
 46 | 
 47 | ![Prometheus Targets](prometheus_targets.png)
 48 | 
 49 | ## Deploying Alertmanager
 50 | First, we need to create the configuration for our Alertmanager. For this we use a Config Map, which we later mount into our Alertmanager pod to configure it. This way we can change the configuration without having to redeploy Alertmanager itself.
 51 | 
 52 | `kubectl create --filename manifests/prometheus-alert-configmap.yaml`
 53 | 
 54 | Then, we create a service to be able to access Alertmanager.
 55 | 
 56 | `kubectl create --filename manifests/prometheus-alert-service.yaml`
 57 | 
 58 | Finally, we can deploy Alertmanager itself.
 59 | 
 60 | `kubectl create --filename manifests/prometheus-alert-deployment.yaml`
 61 | 
 62 | 
 63 | Wait a bit for all the pods to come up. Then Alertmanager should be ready and running. We can open the Alertmanager UI at
 64 | https://mycluster.k8s.gigantic.io/api/v1/proxy/namespaces/default/services/alertmanager/
 65 | 
 66 | ![Alertmanager](alertmanager.png)
 67 | 
 68 | 
 69 | ## Deploying Grafana
 70 | 
 71 | Now that we have Prometheus up and running we can deploy Grafana to have a nicer frontend for our metrics.
 72 | 
 73 | Again, we create a service to be able to access Grafana and a deployment to manage the pods.
 74 | 
 75 | ```
 76 | kubectl create --filename manifests/grafana-core-service.yaml
 77 | kubectl create --filename manifests/grafana-core-deployment.yaml
 78 | ```
 79 | 
 80 | Wait a bit for Grafana to come up. Then you can access Grafana at https://mycluster.k8s.gigantic.io/api/v1/proxy/namespaces/default/services/grafana/
 81 | 
 82 | ## Setting Up Grafana
 83 | 
 84 | TLDR: If you don't want to go through all the manual steps below you can let the following job use the API to configure Grafana to a similar state.
 85 | 
 86 | ```bash
 87 | kubectl create --filename manifests/grafana-import-dashboards-job.yaml
 88 | ```
 89 | 
 90 | Once we're in Grafana we need to first configure [Prometheus](https://grafana.net/plugins/prometheus) as a data source.
 91 | 
 92 | - `Grafana UI / Data Sources / Add data source`
 93 | 	- `Name`: `prometheus`
 94 | 	- `Type`: `Prometheus`
 95 | 	- `Url`: `http://prometheus:9090`
 96 | 	- `Add`
 97 | 
 98 | ![Grafana Datasource](grafana_datasource.png)
 99 | 
100 | Then go to the Dashboards tab and import the [Prometheus Stats dashboard](https://grafana.net/dashboards/2), which shows the status of Prometheus itself.
101 | 
102 | ![Grafana Datasource Dashboard](grafana_datasource_dashboard.png)
103 | 
104 | You can check it out to see how your Prometheus is doing.
105 | 
106 | ![Grafana Datasource Dashboard](grafana_prometheus_stats.png)
107 | 
108 | Last, but not least, we can import a sample [Kubernetes cluster monitoring dashboard](https://grafana.net/dashboards/162) to get a first overview of our cluster metrics.
109 | 
110 | -  `Grafana UI / Dashboards / Import`
111 | 	- `Grafana.net Dashboard`: `https://grafana.net/dashboards/162`
112 | 	- `Load`
113 | 	- `Prometheus`: `prometheus`
114 | 	- `Save & Open`
115 | 
116 | ![Grafana Import Dashboard](grafana_import_dashboard.png)
117 | 
118 | Voilà. You have a nice first dashboard with metrics of your Kubernetes cluster.
119 | 
120 | ![Grafana Import Dashboard](grafana_cluster_overview.png)
121 | 
122 | ## Next Steps
123 | 
124 | Next, you should read the [Grafana](http://docs.grafana.org/) and [Prometheus](https://prometheus.io/docs/introduction/overview/) documentation to get to know the tools and either build your own dashboards or extend the samples from above.
125 | 
126 | You can also check out grafana.net for some more example [dashboards](https://grafana.net/dashboards) and [plugins](https://grafana.net/plugins).
127 | 
128 | More Alertmanager documentation can be found [here](https://prometheus.io/docs/alerting/overview/).
129 | 


--------------------------------------------------------------------------------
/manifests/prometheus-core-configmap/prometheus.yml:
--------------------------------------------------------------------------------
  1 | global:
  2 |   scrape_interval: 10s
  3 |   scrape_timeout: 10s
  4 |   evaluation_interval: 10s
  5 | 
  6 | rule_files:
  7 |   - '/etc/prometheus/prometheus-alert.rules'
  8 | 
  9 | scrape_configs:
 10 |   # # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
 11 |   # - job_name: 'prometheus'
 12 | 
 13 |   #   # Override the global default and scrape targets from this job every 5 seconds.
 14 |   #   scrape_interval: 5s
 15 | 
 16 |   #   # metrics_path defaults to '/metrics'
 17 |   #   # scheme defaults to 'http'.
 18 | 
 19 |   #   static_configs:
 20 |   #     - targets: ['localhost:9090']
 21 | 
 22 |   - job_name: 'kubernetes-cluster'
 23 | 
 24 |     # Default to scraping over https. If required, just disable this or change to
 25 |     # `http`.
 26 |     scheme: https
 27 | 
 28 |     # This TLS & bearer token file config is used to connect to the actual scrape
 29 |     # endpoints for cluster components. This is separate to discovery auth
 30 |     # configuration (`in_cluster` below) because discovery & scraping are two
 31 |     # separate concerns in Prometheus.
 32 |     tls_config:
 33 |       ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 34 |       # If your node certificates are self-signed or use a different CA to the
 35 |       # master CA, then disable certificate verification below. Note that
 36 |       # certificate verification is an integral part of a secure infrastructure
 37 |       # so this should only be disabled in a controlled environment. You can
 38 |       # disable certificate verification by uncommenting the line below.
 39 |       #
 40 |       # insecure_skip_verify: true
 41 |     bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
 42 | 
 43 |     kubernetes_sd_configs:
 44 |     - api_servers:
 45 |       - 'https://kubernetes.default.svc'
 46 |       in_cluster: true
 47 |       role: apiserver
 48 | 
 49 |   - job_name: 'kubernetes-nodes'
 50 | 
 51 |     # Scrape over plain http: the relabeling below rewrites the node address from
 52 |     # port 10250 to 10255. Change to `https` if you scrape a TLS-secured port instead.
 53 |     scheme: http
 54 | 
 55 |     # This TLS & bearer token file config is used to connect to the actual scrape
 56 |     # endpoints for cluster components. This is separate to discovery auth
 57 |     # configuration (`in_cluster` below) because discovery & scraping are two
 58 |     # separate concerns in Prometheus.
 59 |     tls_config:
 60 |       ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 61 |       # If your node certificates are self-signed or use a different CA to the
 62 |       # master CA, then disable certificate verification below. Note that
 63 |       # certificate verification is an integral part of a secure infrastructure
 64 |       # so this should only be disabled in a controlled environment. You can
 65 |       # disable certificate verification by uncommenting the line below.
 66 |       #
 67 |       # insecure_skip_verify: true
 68 |     bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
 69 | 
 70 |     kubernetes_sd_configs:
 71 |     - api_servers:
 72 |       - 'https://kubernetes.default.svc'
 73 |       in_cluster: true
 74 |       role: node
 75 | 
 76 |     relabel_configs:
 77 |     - action: labelmap
 78 |       regex: __meta_kubernetes_node_label_(.+)
 79 |     - source_labels: [__meta_kubernetes_role]
 80 |       action: replace
 81 |       target_label: kubernetes_role
 82 |     - source_labels: [__address__]
 83 |       regex: '(.*):10250'
 84 |       replacement: '${1}:10255'
 85 |       target_label: __address__
 86 |   # Scrape config for service endpoints.
 87 |   #
 88 |   # The relabeling allows the actual service scrape endpoint to be configured
 89 |   # via the following annotations:
 90 |   #
 91 |   # * `prometheus.io/scrape`: Only scrape services that have a value of `true`
 92 |   # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
 93 |   # to set this to `https` & most likely set the `tls_config` of the scrape config.
 94 |   # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
 95 |   # * `prometheus.io/port`: If the metrics are exposed on a different port to the
 96 |   # service then set this appropriately.
 97 |   - job_name: 'kubernetes-service-endpoints'
 98 | 
 99 |     kubernetes_sd_configs:
100 |     - api_servers:
101 |       - 'https://kubernetes.default.svc'
102 |       in_cluster: true
103 |       role: endpoint
104 | 
105 |     relabel_configs:
106 |     - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
107 |       action: keep
108 |       regex: true
109 |     - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
110 |       action: replace
111 |       target_label: __scheme__
112 |       regex: (https?)
113 |     - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
114 |       action: replace
115 |       target_label: __metrics_path__
116 |       regex: (.+)
117 |     - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
118 |       action: replace
119 |       target_label: __address__
120 |       regex: (.+)(?::\d+);(\d+)
121 |       replacement: $1:$2
122 |     - action: labelmap
123 |       regex: __meta_kubernetes_service_label_(.+)
124 |     - source_labels: [__meta_kubernetes_service_namespace]
125 |       action: replace
126 |       target_label: kubernetes_namespace
127 |     - source_labels: [__meta_kubernetes_service_name]
128 |       action: replace
129 |       target_label: kubernetes_name
130 | 
131 |   # Example scrape config for probing services via the Blackbox Exporter.
132 |   #
133 |   # The relabeling allows the actual service scrape endpoint to be configured
134 |   # via the following annotations:
135 |   #
136 |   # * `prometheus.io/probe`: Only probe services that have a value of `true`
137 |   - job_name: 'kubernetes-services'
138 | 
139 |     metrics_path: /probe
140 |     params:
141 |       module: [http_2xx]
142 | 
143 |     kubernetes_sd_configs:
144 |     - api_servers:
145 |       - 'https://kubernetes.default.svc'
146 |       in_cluster: true
147 |       role: service
148 | 
149 |     relabel_configs:
150 |     - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
151 |       action: keep
152 |       regex: true
153 |     - source_labels: [__address__]
154 |       target_label: __param_target
155 |     - target_label: __address__
156 |       replacement: blackbox
157 |     - source_labels: [__param_target]
158 |       target_label: instance
159 |     - action: labelmap
160 |       regex: __meta_kubernetes_service_label_(.+)
161 |     - source_labels: [__meta_kubernetes_service_namespace]
162 |       target_label: kubernetes_namespace
163 |     - source_labels: [__meta_kubernetes_service_name]
164 |       target_label: kubernetes_name
165 | 
166 |   # Example scrape config for pods
167 |   #
168 |   # The relabeling allows the actual pod scrape endpoint to be configured via the
169 |   # following annotations:
170 |   #
171 |   # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
172 |   # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
173 |   # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
174 |   - job_name: 'kubernetes-pods'
175 | 
176 |     kubernetes_sd_configs:
177 |     - api_servers:
178 |       - 'https://kubernetes.default.svc'
179 |       in_cluster: true
180 |       role: pod
181 | 
182 |     relabel_configs:
183 |     - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
184 |       action: keep
185 |       regex: true
186 |     - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
187 |       action: replace
188 |       target_label: __metrics_path__
189 |       regex: (.+)
190 |     - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
191 |       action: replace
192 |       regex: (.+):(?:\d+);(\d+)
193 |       replacement: ${1}:${2}
194 |       target_label: __address__
195 |     - action: labelmap
196 |       regex: __meta_kubernetes_pod_label_(.+)
197 |     - source_labels: [__meta_kubernetes_pod_namespace]
198 |       action: replace
199 |       target_label: kubernetes_namespace
200 |     - source_labels: [__meta_kubernetes_pod_name]
201 |       action: replace
202 |       target_label: kubernetes_pod_name
203 | 


--------------------------------------------------------------------------------
/manifests/grafana-import-dashboards-configmap/grafana-net-162-dashboard.json:
--------------------------------------------------------------------------------
1 | {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Kubernetes cluster monitoring (via Prometheus)","tags":["kubernetes"],"style":"dark","timezone":"browser","editable":true,"hideControls":true,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":4,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(sum(node_memory_MemTotal) - sum(node_memory_MemFree+node_memory_Buffers+node_memory_Cached) ) / sum(node_memory_MemTotal) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster memory usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 
0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(sum by (io_kubernetes_container_name)( rate(container_cpu_usage_seconds_total{image!=\"\"}[1m] ) )) / count(node_cpu{mode=\"system\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster CPU usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(sum(node_filesystem_size{device=\"rootfs\"}) - sum(node_filesystem_free{device=\"rootfs\"}) 
) / sum(node_filesystem_size{device=\"rootfs\"}) * 100","interval":"10s","intervalFactor":1,"metric":"","refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster Filesystem usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"title":"Row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":3,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by (io_kubernetes_container_name)( rate(container_cpu_usage_seconds_total{image!=\"\"}[1m] ) )","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_container_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percent","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":2,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sort_desc(sum(container_memory_usage_bytes{image!=\"\"}) by (io_kubernetes_container_name, image))","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_container_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod memory usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":8,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sort_desc(sum by (kubernetes_pod_name) (rate (container_network_receive_bytes_total{name!=\"\", kubernetes_pod_name=~\"op-.*\"}[1m]) ))","interval":"10s","intervalFactor":1,"legendFormat":"{{ kubernetes_pod_name }}","metric":"network","refId":"A","step":10},{"expr":"sort_desc(sum by 
(kubernetes_pod_name) (rate (container_network_transmit_bytes_total{name!=\"\", kubernetes_pod_name=~\"op-.*\"}[1m]) ))","interval":"10s","intervalFactor":1,"legendFormat":"{{ kubernetes_pod_name }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod Network i/o","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New row"}],"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":46,"links":[],"gnetId":162,"description":"Monitor a Kubernetes cluster using Prometheus TSDB.  Shows overall cluster CPU / Memory / Disk usage as well as individual pod statistics. "},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}
2 | 


--------------------------------------------------------------------------------
/manifests/grafana-import-dashboards-configmap/grafana-net-193-dashboard.json:
--------------------------------------------------------------------------------
1 | {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"prometheus with cAdvisor as a target","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Docker monitoring","description":"Docker monitoring with Prometheus and cAdvisor","tags":["docker"],"style":"dark","timezone":"browser","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"50","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"count(container_last_seen{image!=\"\"})","intervalFactor":2,"legendFormat":"","metric":"container_last_seen","refId":"A","step":240}],"thresholds":"","title":"Running containers","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 
0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"mbytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":5,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(container_memory_usage_bytes{image!=\"\"})/1024/1024","intervalFactor":2,"legendFormat":"","metric":"container_memory_usage_bytes","refId":"A","step":240}],"thresholds":"","title":"Total Memory Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(rate(container_cpu_user_seconds_total{image!=\"\"}[5m]) * 
100)","intervalFactor":2,"legendFormat":"","metric":"container_memory_usage_bytes","refId":"A","step":240}],"thresholds":"","title":"Total CPU Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":2,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(container_cpu_user_seconds_total{image!=\"\"}[5m]) * 100","intervalFactor":2,"legendFormat":"{{name}}","metric":"cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"CPU Usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percent","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"Row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":1,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"container_memory_usage_bytes{image!=\"\"}","hide":false,"intervalFactor":2,"legendFormat":"{{name}}","metric":"container_memory_usage_bytes","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Memory Usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":"","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":3,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"irate(container_network_receive_bytes_total{image!=\"\"}[5m])","intervalFactor":2,"legendFormat":"{{name}}","metric":"container_network_receive_bytes_total","refId":"A","step":20}],"timeFrom":null,"timeShift":null,"title":"Network 
Rx","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":4,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"irate(container_network_transmit_bytes_total{image!=\"\"}[5m])","intervalFactor":2,"legendFormat":"{{name}}","refId":"A","step":20}],"timeFrom":null,"timeShift":null,"title":"Network Tx","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New row"}],"time":{"from":"now-3h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":26,"links":[],"gnetId":193},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}
2 | 


--------------------------------------------------------------------------------
/manifests/grafana-import-dashboards-configmap/grafana-net-2-dashboard.json:
--------------------------------------------------------------------------------
1 | {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"text","name":"Text","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Prometheus Stats","tags":[],"style":"dark","timezone":"browser","editable":true,"hideControls":true,"sharedCrosshair":false,"rows":[{"collapse":false,"editable":true,"height":178,"panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","decimals":1,"editable":true,"error":false,"format":"s","id":5,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(time() - process_start_time_seconds{job=\"prometheus\"})","intervalFactor":2,"refId":"A","step":4}],"thresholds":"","title":"Uptime","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","id":6,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"targets":[{"expr":"prometheus_local_storage_memory_series","intervalFactor":2,"refId":"A","step":4}],"thresholds":"1,5","title":"Local Storage Memory Series","type":"singlestat","valueFontSize":"70%","valueMaps":[],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","id":7,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"targets":[{"expr":"prometheus_local_storage_indexing_queue_length","intervalFactor":2,"refId":"A","step":4}],"thresholds":"500,4000","title":"Interal Storage Queue Length","type":"singlestat","valueFontSize":"70%","valueMaps":[{"op":"=","text":"Empty","value":"0"}],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"content":"<img 
src=\"http://prometheus.io/assets/prometheus_logo_grey.svg\" alt=\"Prometheus logo\" style=\"height: 40px;\">\n<span style=\"font-family: 'Open Sans', 'Helvetica Neue', Helvetica; font-size: 25px;vertical-align: text-top;color: #bbbfc2;margin-left: 10px;\">Prometheus</span>\n\n<p style=\"margin-top: 10px;\">You're using Prometheus, an open-source systems monitoring and alerting toolkit originally built at SoundCloud. For more information, check out the <a href=\"http://www.grafana.org/\">Grafana</a> and <a href=\"http://prometheus.io/\">Prometheus</a> projects.</p>","editable":true,"error":false,"id":9,"links":[],"mode":"html","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":227,"panels":[{"aliasColors":{"prometheus":"#C15C17","{instance=\"localhost:9090\",job=\"prometheus\"}":"#C15C17"},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":3,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(prometheus_local_storage_ingested_samples_total[5m])","interval":"","intervalFactor":2,"legendFormat":"{{job}}","metric":"","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Samples ingested (rate-5m)","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Samples Ingested\nThis graph displays the count of 
samples ingested by the Prometheus server, as measured over the last 5 minutes, per time series in the range vector. When troubleshooting an issue on IRC or Github, this is often the first stat requested by the Prometheus team. ","editable":true,"error":false,"id":8,"links":[],"mode":"markdown","span":2.995914043583536,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{"prometheus":"#F9BA8F","{instance=\"localhost:9090\",interval=\"5s\",job=\"prometheus\"}":"#F9BA8F"},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":2,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":5,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(prometheus_target_interval_length_seconds_count[5m])","intervalFactor":2,"legendFormat":"{{job}}","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Target Scrapes (last 5m)","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":14,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":4,"stack":false,"steppedLine":false,"targets":[{"expr":"prometheus_target_interval_length_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}","interval":"","intervalFactor":2,"legendFormat":"{{quantile}} ({{interval}})","metric":"","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Scrape Duration","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Scrapes\nPrometheus scrapes metrics from instrumented jobs, either directly or via an intermediary push gateway for short-lived jobs. Target scrapes will show how frequently targets are scraped, as measured over the last 5 minutes, per time series in the range vector. Scrape Duration will show how long the scrapes are taking, with percentiles available as series. 
","editable":true,"error":false,"id":11,"links":[],"mode":"markdown","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":null,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":12,"legend":{"alignAsTable":false,"avg":false,"current":false,"hideEmpty":true,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"prometheus_evaluator_duration_milliseconds{quantile!=\"0.01\", quantile!=\"0.05\"}","interval":"","intervalFactor":2,"legendFormat":"{{quantile}}","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Rule Eval Duration","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"percentunit","label":""},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Rule Evaluation Duration\nThis graph panel plots the duration for all evaluations to execute. 
The 50th percentile, 90th percentile and 99th percentile are shown as three separate series to help identify outliers that may be skewing the data.","editable":true,"error":false,"id":15,"links":[],"mode":"markdown","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"}],"time":{"from":"now-5m","to":"now"},"timepicker":{"now":true,"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":false,"schemaVersion":12,"version":0,"links":[{"icon":"info","tags":[],"targetBlank":true,"title":"Grafana Docs","tooltip":"","type":"link","url":"http://www.grafana.org/docs"},{"icon":"info","tags":[],"targetBlank":true,"title":"Prometheus Docs","type":"link","url":"http://prometheus.io/docs/introduction/overview/"}],"gnetId":2,"description":"The  official, pre-built Prometheus Stats Dashboard."},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}
2 | 


--------------------------------------------------------------------------------
/manifests/prometheus-core-configmap.yaml:
--------------------------------------------------------------------------------
  1 | apiVersion: v1
  2 | data:
  3 |   prometheus-alert.rules: "# Alert for any instance that is unreachable for >5 minutes.\nALERT
  4 |     InstanceDown\n  IF up == 0\n  FOR 5m\n  LABELS { severity = \"email\" }\n  ANNOTATIONS
  5 |     {\n    summary = \"Instance {{ $labels.instance }} down\",\n    description =
  6 |     \"{{ $labels.instance }} of job {{ $labels.job }} has been down for more than
  7 |     5 minutes.\",\n  }\n\nALERT node_cpu_threshold_exceeded  \n  IF 100 * node_load5
  8 |     > 90\n  LABELS { severity = \"email\" }\n  ANNOTATIONS {\n    summary = \"Instance
  9 |     {{ $labels.instance }} CPU usage is dangerously high\",\n    description = \"This
 10 |     device's CPU usage has exceeded the threshold with a value of {{ $value }}.\",\n
 11 |     \ }\n\nALERT node_memory_threshold_exceeded  \n  IF (node_memory_MemFree+node_memory_Buffers+node_memory_Cached)
 12 |     / node_memory_MemTotal < 0.1\n  LABELS { severity = \"email\" }\n  ANNOTATIONS
 13 |     {\n    summary = \"Instance {{ $labels.instance }} MEM usage is dangerously high\",\n
 14 |     \   description = \"This device's MEM usage has exceeded the threshold with a
 15 |     value of {{ $value }}.\",\n  }\n\nALERT node_fs_threshold_exceeded\n  IF node_filesystem_free{fstype=\"rootfs\"}
 16 |     / node_filesystem_size{fstype=\"rootfs\"} < 0.2\n  LABELS { severity = \"email\"
 17 |     }\n  ANNOTATIONS {\n    summary = \"Node filesystem usage is high\",\n    description
 18 |     = \"Node {{ $labels.instance }}'s filesystem {{ $labels.mountpoint }} has less
 19 |     than 20% disk space remaining.\"\n  }\n\nALERT container_cpu_threshold_exceeded
 20 |     \ \n  IF rate(container_cpu_user_seconds_total{image!=\"\"}[5m]) * 100 > 90\n
 21 |     \ LABELS { severity = \"email\" }\n  ANNOTATIONS {\n    summary = \"Instance {{
 22 |     $labels.kubernetes_container_name }} CPU usage is dangerously high\",\n    description
 23 |     = \"This device's CPU usage has exceeded the threshold with a value of {{ $value
 24 |     }}.\",\n  }\n\nALERT FdExhaustionClose\n  IF predict_linear(instance:fd_utilization[1h],
 25 |     3600 * 4) > 1\n  FOR 10m\n  LABELS { severity = \"email\" }\n  ANNOTATIONS {\n
 26 |     \   summary = \"file descriptors soon exhausted\",\n    description = \"{{ $labels.job
 27 |     }} instance {{ $labels.instance }} will exhaust in file descriptors soon\",\n
 28 |     \ }\n\nALERT ContainerReboot\n  IF increase(container_last_seen{container_label_io_kubernetes_container_hash!=\"\"}[30s])
 29 |     < 25\n  LABELS { severity = \"email\" }\n  ANNOTATIONS {\n    summary = \"Container
 30 |     reboot\",\n    description = \"{{ $labels.container_label_io_kubernetes_pod_name
 31 |     }}刚刚发生重启， 已经重启过{{ $labels.container_label_io_kubernetes_container_restartCount
 32 |     }}次.\"\n  }\n\nALERT PodRestartingTooMuch\n  IF          rate(kube_pod_container_status_restarts[1m])
 33 |     > 1/(5*60)\n  FOR         1h\n  LABELS      { severity=\"warning\" }\n  ANNOTATIONS
 34 |     {\n    summary = \"Pod {{$labels.namespace}}/{{$labels.pod}} restarting too much.\",\n
 35 |     \   description = \"Pod {{$labels.namespace}}/{{$labels.pod}} restarting too much.\",\n
 36 |     \ }\n\nALERT PodSlowToLaunch\n  IF          rate(kubelet_pod_start_latency_microseconds{quantile=\"0.99\"}[1m])
 37 |     > 5\n  FOR         1h\n  LABELS      { severity=\"warning\" }\n  ANNOTATIONS {\n
 38 |     \   summary = \"Pods are slow to launch.\",\n    description = \"Pods are taking
 39 |     longer than 5 milliseconds to launch.\",\n  }\n\n\n# #etcd monitor\n# ALERT HighNumberOfFailedHTTPRequests\n#
 40 |     IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd\"}[5m]))\n#   / sum
 41 |     by(method) (rate(etcd_http_received_total{job=\"etcd\"}[5m])) > 0.01\n# FOR 10m\n#
 42 |     LABELS {\n#   severity = \"warning\"\n# }\n# ANNOTATIONS {\n#   summary = \"a
 43 |     high number of HTTP requests are failing\",\n#   description = \"{{ $value }}%
 44 |     of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance
 45 |     }}\",\n# }\n\n# ALERT HTTPRequestsSlow\n# IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
 46 |     > 0.15\n# FOR 10m\n# LABELS {\n#   severity = \"warning\"\n# }\n# ANNOTATIONS
 47 |     {\n#   summary = \"slow HTTP requests\",\n#   description = \"on etcd instance
 48 |     {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow\",\n# }\n\n#
 49 |     ALERT etcdNoLeader\n# IF etcd_server_has_leader{job=\"etcd\"} == 0\n# FOR 1m\n#
 50 |     LABELS {\n#   severity = \"critical\"\n# }\n# ANNOTATIONS {\n#   summary = \"etcd
 51 |     node has no leader\",\n#   description = \"etcd node {{ $labels.instance }} has
 52 |     no leader\",\n# }\n\n# ALERT InsufficientPeers\n# IF count(up{job=\"etcd\"} ==
 53 |     0) > (count(up{job=\"etcd\"}) / 2 - 1)\n# FOR 3m\n# LABELS {\n#   severity = \"critical\"\n#
 54 |     }\n# ANNOTATIONS {\n#   summary = \"etcd cluster small\",\n#   description = \"If
 55 |     one more etcd peer goes down the cluster will be unavailable\",\n# }\n"
 56 |   prometheus-record.rules: |2
 57 | 
 58 |     instance:fd_utilization = process_open_fds / process_max_fds
 59 |   prometheus.yml: |
 60 |     global:
 61 |       scrape_interval: 10s
 62 |       scrape_timeout: 10s
 63 |       evaluation_interval: 10s
 64 | 
 65 |     rule_files:
 66 |       - '/etc/prometheus/prometheus-alert.rules'
 67 | 
 68 |     scrape_configs:
 69 |       # # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
 70 |       # - job_name: 'prometheus'
 71 | 
 72 |       #   # Override the global default and scrape targets from this job every 5 seconds.
 73 |       #   scrape_interval: 5s
 74 | 
 75 |       #   # metrics_path defaults to '/metrics'
 76 |       #   # scheme defaults to 'http'.
 77 | 
 78 |       #   static_configs:
 79 |       #     - targets: ['localhost:9090']
 80 | 
 81 |       - job_name: 'kubernetes-cluster'
 82 | 
 83 |         # Default to scraping over https. If required, just disable this or change to
 84 |         # `http`.
 85 |         scheme: https
 86 | 
 87 |         # This TLS & bearer token file config is used to connect to the actual scrape
 88 |         # endpoints for cluster components. This is separate to discovery auth
 89 |         # configuration (`in_cluster` below) because discovery & scraping are two
 90 |         # separate concerns in Prometheus.
 91 |         tls_config:
 92 |           ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 93 |           # If your node certificates are self-signed or use a different CA to the
 94 |           # master CA, then disable certificate verification below. Note that
 95 |           # certificate verification is an integral part of a secure infrastructure
 96 |           # so this should only be disabled in a controlled environment. You can
 97 |           # disable certificate verification by uncommenting the line below.
 98 |           #
 99 |           # insecure_skip_verify: true
100 |         bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
101 | 
102 |         kubernetes_sd_configs:
103 |         - api_servers:
104 |           - 'https://kubernetes.default.svc'
105 |           in_cluster: true
106 |           role: apiserver
107 | 
108 |       - job_name: 'kubernetes-nodes'
109 | 
110 |         # Nodes are scraped over plain http; the relabeling below rewrites port
111 |         # 10250 to 10255. Change to `https` if required.
112 |         scheme: http
113 | 
114 |         # This TLS & bearer token file config is used to connect to the actual scrape
115 |         # endpoints for cluster components. This is separate to discovery auth
116 |         # configuration (`in_cluster` below) because discovery & scraping are two
117 |         # separate concerns in Prometheus.
118 |         tls_config:
119 |           ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
120 |           # If your node certificates are self-signed or use a different CA to the
121 |           # master CA, then disable certificate verification below. Note that
122 |           # certificate verification is an integral part of a secure infrastructure
123 |           # so this should only be disabled in a controlled environment. You can
124 |           # disable certificate verification by uncommenting the line below.
125 |           #
126 |           # insecure_skip_verify: true
127 |         bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
128 | 
129 |         kubernetes_sd_configs:
130 |         - api_servers:
131 |           - 'https://kubernetes.default.svc'
132 |           in_cluster: true
133 |           role: node
134 | 
135 |         relabel_configs:
136 |         - action: labelmap
137 |           regex: __meta_kubernetes_node_label_(.+)
138 |         - source_labels: [__meta_kubernetes_role]
139 |           action: replace
140 |           target_label: kubernetes_role
141 |         - source_labels: [__address__]
142 |           regex: '(.*):10250'
143 |           replacement: '${1}:10255'
144 |           target_label: __address__
145 |       # Scrape config for service endpoints.
146 |       #
147 |       # The relabeling allows the actual service scrape endpoint to be configured
148 |       # via the following annotations:
149 |       #
150 |       # * `prometheus.io/scrape`: Only scrape services that have a value of `true`
151 |       # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
152 |       # to set this to `https` & most likely set the `tls_config` of the scrape config.
153 |       # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
154 |       # * `prometheus.io/port`: If the metrics are exposed on a different port to the
155 |       # service then set this appropriately.
156 |       - job_name: 'kubernetes-service-endpoints'
157 | 
158 |         kubernetes_sd_configs:
159 |         - api_servers:
160 |           - 'https://kubernetes.default.svc'
161 |           in_cluster: true
162 |           role: endpoint
163 | 
164 |         relabel_configs:
165 |         - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
166 |           action: keep
167 |           regex: true
168 |         - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
169 |           action: replace
170 |           target_label: __scheme__
171 |           regex: (https?)
172 |         - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
173 |           action: replace
174 |           target_label: __metrics_path__
175 |           regex: (.+)
176 |         - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
177 |           action: replace
178 |           target_label: __address__
179 |           regex: (.+)(?::\d+);(\d+)
180 |           replacement: $1:$2
181 |         - action: labelmap
182 |           regex: __meta_kubernetes_service_label_(.+)
183 |         - source_labels: [__meta_kubernetes_service_namespace]
184 |           action: replace
185 |           target_label: kubernetes_namespace
186 |         - source_labels: [__meta_kubernetes_service_name]
187 |           action: replace
188 |           target_label: kubernetes_name
189 | 
190 |       # Example scrape config for probing services via the Blackbox Exporter.
191 |       #
192 |       # The relabeling allows the actual service scrape endpoint to be configured
193 |       # via the following annotations:
194 |       #
195 |       # * `prometheus.io/probe`: Only probe services that have a value of `true`
196 |       - job_name: 'kubernetes-services'
197 | 
198 |         metrics_path: /probe
199 |         params:
200 |           module: [http_2xx]
201 | 
202 |         kubernetes_sd_configs:
203 |         - api_servers:
204 |           - 'https://kubernetes.default.svc'
205 |           in_cluster: true
206 |           role: service
207 | 
208 |         relabel_configs:
209 |         - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
210 |           action: keep
211 |           regex: true
212 |         - source_labels: [__address__]
213 |           target_label: __param_target
214 |         - target_label: __address__
215 |           replacement: blackbox
216 |         - source_labels: [__param_target]
217 |           target_label: instance
218 |         - action: labelmap
219 |           regex: __meta_kubernetes_service_label_(.+)
220 |         - source_labels: [__meta_kubernetes_service_namespace]
221 |           target_label: kubernetes_namespace
222 |         - source_labels: [__meta_kubernetes_service_name]
223 |           target_label: kubernetes_name
224 | 
225 |       # Example scrape config for pods
226 |       #
227 |       # The relabeling allows the actual pod scrape endpoint to be configured via the
228 |       # following annotations:
229 |       #
230 |       # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
231 |       # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
232 |       # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
233 |       - job_name: 'kubernetes-pods'
234 | 
235 |         kubernetes_sd_configs:
236 |         - api_servers:
237 |           - 'https://kubernetes.default.svc'
238 |           in_cluster: true
239 |           role: pod
240 | 
241 |         relabel_configs:
242 |         - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
243 |           action: keep
244 |           regex: true
245 |         - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
246 |           action: replace
247 |           target_label: __metrics_path__
248 |           regex: (.+)
249 |         - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
250 |           action: replace
251 |           regex: (.+):(?:\d+);(\d+)
252 |           replacement: ${1}:${2}
253 |           target_label: __address__
254 |         - action: labelmap
255 |           regex: __meta_kubernetes_pod_label_(.+)
256 |         - source_labels: [__meta_kubernetes_pod_namespace]
257 |           action: replace
258 |           target_label: kubernetes_namespace
259 |         - source_labels: [__meta_kubernetes_pod_name]
260 |           action: replace
261 |           target_label: kubernetes_pod_name
262 | kind: ConfigMap
263 | metadata:
264 |   creationTimestamp: null
265 |   name: prometheus-core
266 | 


--------------------------------------------------------------------------------
/manifests/grafana-import-dashboards-configmap/grafana-net-315-dashboard.json:
--------------------------------------------------------------------------------
1 | {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Kubernetes resources usage monitoring (via Prometheus)","description":"Monitors Kubernetes cluster using Prometheus. Shows overall cluster CPU / Memory / Filesystem usage as well as individual pod, containers, systemd services statistics. Uses cAdvisor metrics only.","tags":["kubernetes"],"style":"dark","timezone":"browser","editable":true,"hideControls":false,"sharedCrosshair":false,"rows":[{"title":"Network I/O pressure","height":"200px","editable":true,"collapse":false,"panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)","thresholdLine":false},"id":32,"isNew":true,"legend":{"alignAsTable":false,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":false,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"legendFormat":"Received","metric":"network","refId":"A","step":10},{"expr":"- sum (rate 
(container_network_transmit_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"legendFormat":"Sent","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Network I/O pressure","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":false}],"height":"200px","transparent":false}]},{"collapse":false,"editable":true,"height":"250px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":4,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster memory usage","transparent":false,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 
0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster CPU usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum 
(container_fs_usage_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (container_fs_limit_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"legendFormat":"","metric":"","refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster filesystem usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":9,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"20%","prefix":"","prefixFontSize":"20%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":10,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":11,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":" cores","postfixFontSize":"30%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (rate 
(container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":12,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":" cores","postfixFontSize":"30%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":13,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to 
text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_usage_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":14,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_limit_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"showTitle":false,"title":"Total 
usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":17,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_pod_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"showTitle":false,"title":"Pods CPU usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"height":"","id":23,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (systemd_service_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"{{ systemd_service_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"System services CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"System services CPU usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":24,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"hideEmpty":false,"hideZero":false,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":null,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate 
(container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",io_kubernetes_container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"container_cpu","refId":"A","step":10},{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"container_cpu","refId":"B","step":10},{"expr":"sum (rate (container_cpu_usage_seconds_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","interval":"10s","intervalFactor":1,"legendFormat":"rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"container_cpu","refId":"C","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers CPU usage"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":20,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"{{ id }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percent","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"repeat":null,"showTitle":false,"title":"All processes CPU usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":25,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by 
(io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_pod_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Pods memory usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":26,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (systemd_service_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ systemd_service_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"System services memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"System services memory 
usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":27,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",io_kubernetes_container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}) by (io_kubernetes_container_name, io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10},{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, name, image)","interval":"10s","intervalFactor":1,"legendFormat":"docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"container_memory_usage:sort_desc","refId":"B","step":10},{"expr":"sum (container_memory_working_set_bytes{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, rkt_container_name)","interval":"10s","intervalFactor":1,"legendFormat":"rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"container_memory_usage:sort_desc","refId":"C","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers memory 
usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers memory usage"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":28,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"{{ id }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"All processes memory usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":16,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"-> {{ io_kubernetes_pod_name }}","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"<- {{ io_kubernetes_pod_name }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods network I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Pods network I/O"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":30,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"network","refId":"B","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"network","refId":"D","step":10},{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"network","refId":"C","step":10},{"expr":"sum (rate (container_network_receive_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) 
by (kubernetes_io_hostname, rkt_container_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"network","refId":"E","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"network","refId":"F","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers network I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers network I/O"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":29,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"-> {{ id }}","metric":"network","refId":"A","step":10},{"expr":"- sum (rate 
(container_network_transmit_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"<- {{ id }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes network I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"All processes network I/O"}],"time":{"from":"now-30m","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"multi":false,"name":"Node","options":[],"query":"label_values(kubernetes_io_hostname)","refresh":1,"type":"query"}]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":18,"links":[],"gnetId":315},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}
2 | 


--------------------------------------------------------------------------------
/manifests/grafana-import-dashboards-configmap.yaml:
--------------------------------------------------------------------------------
 1 | # based on the content created by:
 2 | # kubectl create configmap grafana-import-dashboards --from-file=manifests/grafana-import-dashboards-configmap --output yaml
 3 | 
 4 | apiVersion: v1
 5 | kind: ConfigMap
 6 | metadata:
 7 |   name: grafana-import-dashboards
 8 |   labels:
 9 |     app: grafana
10 |     component: import-dashboards
11 | data:
12 |   grafana-net-2-dashboard.json: |
13 |     {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"text","name":"Text","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Prometheus Stats","tags":[],"style":"dark","timezone":"browser","editable":true,"hideControls":true,"sharedCrosshair":false,"rows":[{"collapse":false,"editable":true,"height":178,"panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","decimals":1,"editable":true,"error":false,"format":"s","id":5,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(time() - process_start_time_seconds{job=\"prometheus\"})","intervalFactor":2,"refId":"A","step":4}],"thresholds":"","title":"Uptime","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","id":6,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"targets":[{"expr":"prometheus_local_storage_memory_series","intervalFactor":2,"refId":"A","step":4}],"thresholds":"1,5","title":"Local Storage Memory Series","type":"singlestat","valueFontSize":"70%","valueMaps":[],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","id":7,"interval":null,"links":[],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"targets":[{"expr":"prometheus_local_storage_indexing_queue_length","intervalFactor":2,"refId":"A","step":4}],"thresholds":"500,4000","title":"Internal Storage Queue Length","type":"singlestat","valueFontSize":"70%","valueMaps":[{"op":"=","text":"Empty","value":"0"}],"valueName":"current","mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false}},{"content":"<img 
src=\"http://prometheus.io/assets/prometheus_logo_grey.svg\" alt=\"Prometheus logo\" style=\"height: 40px;\">\n<span style=\"font-family: 'Open Sans', 'Helvetica Neue', Helvetica; font-size: 25px;vertical-align: text-top;color: #bbbfc2;margin-left: 10px;\">Prometheus</span>\n\n<p style=\"margin-top: 10px;\">You're using Prometheus, an open-source systems monitoring and alerting toolkit originally built at SoundCloud. For more information, check out the <a href=\"http://www.grafana.org/\">Grafana</a> and <a href=\"http://prometheus.io/\">Prometheus</a> projects.</p>","editable":true,"error":false,"id":9,"links":[],"mode":"html","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":227,"panels":[{"aliasColors":{"prometheus":"#C15C17","{instance=\"localhost:9090\",job=\"prometheus\"}":"#C15C17"},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":3,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(prometheus_local_storage_ingested_samples_total[5m])","interval":"","intervalFactor":2,"legendFormat":"{{job}}","metric":"","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Samples ingested (rate-5m)","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Samples Ingested\nThis graph displays the count of 
samples ingested by the Prometheus server, as measured over the last 5 minutes, per time series in the range vector. When troubleshooting an issue on IRC or GitHub, this is often the first stat requested by the Prometheus team. ","editable":true,"error":false,"id":8,"links":[],"mode":"markdown","span":2.995914043583536,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{"prometheus":"#F9BA8F","{instance=\"localhost:9090\",interval=\"5s\",job=\"prometheus\"}":"#F9BA8F"},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":2,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":5,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(prometheus_target_interval_length_seconds_count[5m])","intervalFactor":2,"legendFormat":"{{job}}","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Target Scrapes (last 5m)","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":14,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":4,"stack":false,"steppedLine":false,"targets":[{"expr":"prometheus_target_interval_length_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}","interval":"","intervalFactor":2,"legendFormat":"{{quantile}} ({{interval}})","metric":"","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Scrape Duration","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"short"},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Scrapes\nPrometheus scrapes metrics from instrumented jobs, either directly or via an intermediary push gateway for short-lived jobs. Target scrapes will show how frequently targets are scraped, as measured over the last 5 minutes, per time series in the range vector. Scrape Duration will show how long the scrapes are taking, with percentiles available as series. 
","editable":true,"error":false,"id":11,"links":[],"mode":"markdown","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":null,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":12,"legend":{"alignAsTable":false,"avg":false,"current":false,"hideEmpty":true,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"prometheus_evaluator_duration_milliseconds{quantile!=\"0.01\", quantile!=\"0.05\"}","interval":"","intervalFactor":2,"legendFormat":"{{quantile}}","refId":"A","step":2}],"timeFrom":null,"timeShift":null,"title":"Rule Eval Duration","tooltip":{"shared":true,"value_type":"cumulative","ordering":"alphabetical","msResolution":false},"type":"graph","yaxes":[{"show":true,"min":null,"max":null,"logBase":1,"format":"percentunit","label":""},{"show":true,"min":null,"max":null,"logBase":1,"format":"short"}],"xaxis":{"show":true}},{"content":"#### Rule Evaluation Duration\nThis graph panel plots the duration for all evaluations to execute. 
The 50th percentile, 90th percentile and 99th percentile are shown as three separate series to help identify outliers that may be skewing the data.","editable":true,"error":false,"id":15,"links":[],"mode":"markdown","span":3,"style":{},"title":"","transparent":true,"type":"text"}],"title":"New row"}],"time":{"from":"now-5m","to":"now"},"timepicker":{"now":true,"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":false,"schemaVersion":12,"version":0,"links":[{"icon":"info","tags":[],"targetBlank":true,"title":"Grafana Docs","tooltip":"","type":"link","url":"http://www.grafana.org/docs"},{"icon":"info","tags":[],"targetBlank":true,"title":"Prometheus Docs","type":"link","url":"http://prometheus.io/docs/introduction/overview/"}],"gnetId":2,"description":"The official, pre-built Prometheus Stats Dashboard."},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}
14 |   grafana-net-162-dashboard.json: |
15 |     {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Kubernetes cluster monitoring (via Prometheus)","tags":["kubernetes"],"style":"dark","timezone":"browser","editable":true,"hideControls":true,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":4,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(sum(node_memory_MemTotal) - sum(node_memory_MemFree+node_memory_Buffers+node_memory_Cached) ) / sum(node_memory_MemTotal) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster memory usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 
40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(sum by (io_kubernetes_container_name)( rate(container_cpu_usage_seconds_total{image!=\"\"}[1m] ) )) / count(node_cpu{mode=\"system\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster CPU usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"(sum(node_filesystem_size{device=\"rootfs\"}) - 
sum(node_filesystem_free{device=\"rootfs\"}) ) / sum(node_filesystem_size{device=\"rootfs\"}) * 100","interval":"10s","intervalFactor":1,"metric":"","refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster Filesystem usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"title":"Row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":3,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by (io_kubernetes_container_name)( rate(container_cpu_usage_seconds_total{image!=\"\"}[1m] ) )","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_container_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percent","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":2,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sort_desc(sum(container_memory_usage_bytes{image!=\"\"}) by (io_kubernetes_container_name, image))","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_container_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod memory usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":8,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sort_desc(sum by (kubernetes_pod_name) (rate (container_network_receive_bytes_total{name!=\"\", kubernetes_pod_name=~\"op-.*\"}[1m]) ))","interval":"10s","intervalFactor":1,"legendFormat":"{{ kubernetes_pod_name }}","metric":"network","refId":"A","step":10},{"expr":"sort_desc(sum by 
(kubernetes_pod_name) (rate (container_network_transmit_bytes_total{name!=\"\", kubernetes_pod_name=~\"op-.*\"}[1m]) ))","interval":"10s","intervalFactor":1,"legendFormat":"{{ kubernetes_pod_name }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Pod Network i/o","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New row"}],"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":46,"links":[],"gnetId":162,"description":"Monitor a Kubernetes cluster using Prometheus TSDB.  Shows overall cluster CPU / Memory / Disk usage as well as individual pod statistics. "},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}
16 |   grafana-net-193-dashboard.json: |
17 |     {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"prometheus with cAdvisor as a target","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.0"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Docker monitoring","description":"Docker monitoring with Prometheus and cAdvisor","tags":["docker"],"style":"dark","timezone":"browser","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"50","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"count(container_last_seen{image!=\"\"})","intervalFactor":2,"legendFormat":"","metric":"container_last_seen","refId":"A","step":240}],"thresholds":"","title":"Running containers","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 
54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"mbytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":5,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(container_memory_usage_bytes{image!=\"\"})/1024/1024","intervalFactor":2,"legendFormat":"","metric":"container_memory_usage_bytes","refId":"A","step":240}],"thresholds":"","title":"Total Memory Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"20","id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum(rate(container_cpu_user_seconds_total{image!=\"\"}[5m]) * 
100)","intervalFactor":2,"legendFormat":"","metric":"container_memory_usage_bytes","refId":"A","step":240}],"thresholds":"","title":"Total CPU Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":2,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(container_cpu_user_seconds_total{image!=\"\"}[5m]) * 100","intervalFactor":2,"legendFormat":"{{name}}","metric":"cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"CPU Usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percent","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"Row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":1,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"container_memory_usage_bytes{image!=\"\"}","hide":false,"intervalFactor":2,"legendFormat":"{{name}}","metric":"container_memory_usage_bytes","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Memory Usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":"","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":3,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"irate(container_network_receive_bytes_total{image!=\"\"}[5m])","intervalFactor":2,"legendFormat":"{{name}}","metric":"container_network_receive_bytes_total","refId":"A","step":20}],"timeFrom":null,"timeShift":null,"title":"Network 
Rx","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":4,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"irate(container_network_transmit_bytes_total{image!=\"\"}[5m])","intervalFactor":2,"legendFormat":"{{name}}","refId":"A","step":20}],"timeFrom":null,"timeShift":null,"title":"Network Tx","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"title":"New row"}],"time":{"from":"now-3h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":26,"links":[],"gnetId":193},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}
18 |   grafana-net-315-dashboard.json: |
19 |     {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Kubernetes resources usage monitoring (via Prometheus)","description":"Monitors Kubernetes cluster using Prometheus. Shows overall cluster CPU / Memory / Filesystem usage as well as individual pod, containers, systemd services statistics. Uses cAdvisor metrics only.","tags":["kubernetes"],"style":"dark","timezone":"browser","editable":true,"hideControls":false,"sharedCrosshair":false,"rows":[{"title":"Network I/O pressure","height":"200px","editable":true,"collapse":false,"panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)","thresholdLine":false},"id":32,"isNew":true,"legend":{"alignAsTable":false,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":false,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"legendFormat":"Received","metric":"network","refId":"A","step":10},{"expr":"- sum (rate 
(container_network_transmit_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"legendFormat":"Sent","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Network I/O pressure","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":false}],"height":"200px","transparent":false}]},{"collapse":false,"editable":true,"height":"250px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":4,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster memory usage","transparent":false,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 
0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster CPU usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum 
(container_fs_usage_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (container_fs_limit_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"legendFormat":"","metric":"","refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster filesystem usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":9,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"20%","prefix":"","prefixFontSize":"20%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":10,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":11,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":" cores","postfixFontSize":"30%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (rate 
(container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":12,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":" cores","postfixFontSize":"30%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":13,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to 
text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_usage_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":14,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_limit_bytes{device=\"/dev/vda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"showTitle":false,"title":"Total 
usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":17,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_pod_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"showTitle":false,"title":"Pods CPU usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"height":"","id":23,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (systemd_service_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"{{ systemd_service_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"System services CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"System services CPU usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":24,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"hideEmpty":false,"hideZero":false,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":null,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate 
(container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",io_kubernetes_container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"container_cpu","refId":"A","step":10},{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"container_cpu","refId":"B","step":10},{"expr":"sum (rate (container_cpu_usage_seconds_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","interval":"10s","intervalFactor":1,"legendFormat":"rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"container_cpu","refId":"C","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers CPU usage"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":20,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"{{ id }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes CPU usage","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percent","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"repeat":null,"showTitle":false,"title":"All processes CPU usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":25,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by 
(io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ io_kubernetes_pod_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Pods memory usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":26,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (systemd_service_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ systemd_service_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"System services memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"System services memory 
usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":27,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",io_kubernetes_container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}) by (io_kubernetes_container_name, io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10},{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, name, image)","interval":"10s","intervalFactor":1,"legendFormat":"docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"container_memory_usage:sort_desc","refId":"B","step":10},{"expr":"sum (container_memory_working_set_bytes{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, rkt_container_name)","interval":"10s","intervalFactor":1,"legendFormat":"rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"container_memory_usage:sort_desc","refId":"C","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers memory 
usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers memory usage"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":28,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (container_memory_working_set_bytes{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"{{ id }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"All processes memory usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":16,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"-> {{ io_kubernetes_pod_name }}","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"<- {{ io_kubernetes_pod_name }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods network I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Pods network I/O"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":30,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"network","refId":"B","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (io_kubernetes_container_name, io_kubernetes_pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- pod: {{ io_kubernetes_pod_name }} | {{ io_kubernetes_container_name }}","metric":"network","refId":"D","step":10},{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"network","refId":"C","step":10},{"expr":"sum (rate (container_network_receive_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) 
by (kubernetes_io_hostname, rkt_container_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"network","refId":"E","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"network","refId":"F","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers network I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers network I/O"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":29,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"-> {{ id }}","metric":"network","refId":"A","step":10},{"expr":"- sum (rate 
(container_network_transmit_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"<- {{ id }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes network I/O","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"All processes network I/O"}],"time":{"from":"now-30m","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"multi":false,"name":"Node","options":[],"query":"label_values(kubernetes_io_hostname)","refresh":1,"type":"query"}]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":18,"links":[],"gnetId":315},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}
20 |   prometheus-datasource.json: |
21 |     {
22 |       "name": "prometheus",
23 |       "type": "prometheus",
24 |       "url": "http://prometheus:9090",
25 |       "access": "proxy",
26 |       "basicAuth": false
27 |     }
28 | 


--------------------------------------------------------------------------------