├── .gitignore ├── .gitlab-ci.yml ├── Dockerfile ├── app.py ├── docker-compose.yml ├── kubernetes ├── README.md ├── checkpoints │ ├── 10 │ │ ├── deployment.yml │ │ └── secrets.yml │ ├── 11 │ │ ├── deployment.yml │ │ └── redis.yml │ ├── 12 │ │ ├── deployment.yml │ │ └── service-monitor.yml │ ├── 03 │ │ └── deployment.yml │ ├── 04 │ │ └── service.yml │ ├── 05 │ │ └── ingress.yml │ ├── 06 │ │ └── deployment.yml │ ├── 07 │ │ └── deployment.yml │ ├── 08 │ │ └── redis.yml │ └── 09 │ │ └── redis.yml ├── final │ ├── deployment.yml │ ├── ingress.yml │ ├── redis.yml │ ├── secret.yml │ ├── service-monitor.yml │ └── service.yml └── prometheus │ ├── deploy │ ├── manifests │ ├── alertmanager │ │ ├── alertmanager-config.yaml │ │ ├── alertmanager-service.yaml │ │ └── alertmanager.yaml │ ├── grafana │ │ ├── grafana-configs.yaml │ │ ├── grafana-credentials.yaml │ │ ├── grafana-dashboards.yaml │ │ ├── grafana-deployment.yaml │ │ └── grafana-service.yaml │ ├── kube-state-metrics │ │ ├── kube-state-metrics-cluster-role-binding.yaml │ │ ├── kube-state-metrics-cluster-role.yaml │ │ ├── kube-state-metrics-deployment.yaml │ │ ├── kube-state-metrics-role-binding.yaml │ │ ├── kube-state-metrics-role.yaml │ │ ├── kube-state-metrics-service-account.yaml │ │ └── kube-state-metrics-service.yaml │ ├── node-exporter │ │ ├── node-exporter-cluster-role-binding.yaml │ │ ├── node-exporter-cluster-role.yaml │ │ ├── node-exporter-daemonset.yaml │ │ ├── node-exporter-service-account.yaml │ │ └── node-exporter-service.yaml │ ├── prometheus-operator │ │ ├── prometheus-operator-cluster-role-binding.yaml │ │ ├── prometheus-operator-cluster-role.yaml │ │ ├── prometheus-operator-service-account.yaml │ │ ├── prometheus-operator-service.yaml │ │ └── prometheus-operator.yaml │ └── prometheus │ │ ├── prometheus-k8s-roles.yaml │ │ ├── prometheus-k8s-rules.yaml │ │ ├── prometheus-k8s-service-monitor-alertmanager.yaml │ │ ├── prometheus-k8s-service-monitor-apiserver.yaml │ │ ├── prometheus-k8s-service-monitor-kube-controller-manager.yaml │ │ ├── prometheus-k8s-service-monitor-kube-scheduler.yaml │ │ ├── prometheus-k8s-service-monitor-kube-state-metrics.yaml │ │ ├── prometheus-k8s-service-monitor-kubelet.yaml │ │ ├── prometheus-k8s-service-monitor-node-exporter.yaml │ │ ├── prometheus-k8s-service-monitor-prometheus-operator.yaml │ │ ├── prometheus-k8s-service-monitor-prometheus.yaml │ │ ├── prometheus-k8s-service.yaml │ │ ├── prometheus-k8s-serviceaccount.yml │ │ └── prometheus-k8s.yaml │ └── teardown ├── requirements.txt └── runner.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | env/ 3 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - deploy 3 | 4 | build docker image: 5 | stage: deploy 6 | dependencies: [] 7 | image: docker 8 | script: 9 | - IMAGE=$CI_REGISTRY_IMAGE:${CI_COMMIT_REF_SLUG/master/latest} 10 | - docker build -t $IMAGE . 
11 | - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY 12 | - docker push $IMAGE 13 | only: 14 | - tags@ops/kube-workshop 15 | - branches@ops/kube-workshop 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6.3-slim 2 | 3 | WORKDIR /opt/demo 4 | 5 | COPY requirements.txt ./ 6 | 7 | RUN set -x \ 8 | && apt-get update \ 9 | && apt-get install -y --no-install-recommends build-essential libev-dev git \ 10 | && pip install --no-cache-dir -r requirements.txt \ 11 | && apt-get purge -y --auto-remove build-essential 12 | 13 | ADD app.py ./ 14 | ADD runner.py ./ 15 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import logging 3 | import os 4 | import hmac 5 | 6 | from flask import Flask, request, Response 7 | from flask_redis import FlaskRedis 8 | from redis.exceptions import RedisError 9 | from prometheus_client import Counter, generate_latest 10 | 11 | HOST_COUNTER = 'host_counts' 12 | COUNT = Counter('request_count', 'App request count', ['method', 'endpoint', 'http_status']) 13 | 14 | logger = logging.getLogger(__name__) 15 | app = Flask(__name__) 16 | 17 | app.config['REDIS_URL'] = os.environ['REDIS_URL'] 18 | redis_store = FlaskRedis(app) 19 | 20 | 21 | @app.after_request 22 | def after_request(response): 23 | COUNT.labels(request.method, request.endpoint, response.status_code).inc() 24 | return response 25 | 26 | 27 | @app.route('/') 28 | def index_page(): 29 | logger.info('GET /') 30 | pipe = redis_store.pipeline() 31 | pipe.hincrby(HOST_COUNTER, socket.gethostname()) 32 | pipe.hgetall(HOST_COUNTER) 33 | result = pipe.execute() 34 | return '\n'.join(['{}: {}'.format(k.decode(), v.decode()) for k, v in result[1].items()]) 35 | 36 | 37 | @app.route('/reset') 38 | def reset(): 39 | auth = request.authorization 40 | if not auth or not hmac.compare_digest('{}:{}'.format(auth.username, auth.password), os.environ.get('AUTH', '')): 41 | return Response('Not authenticated', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'}) 42 | redis_store.delete(HOST_COUNTER) 43 | return 'ok' 44 | 45 | 46 | @app.route('/health') 47 | def health_check(): 48 | try: 49 | redis_store.ping() 50 | except RedisError: 51 | return 'Redis not available', 500 52 | return 'ok' 53 | 54 | 55 | @app.route('/metrics') 56 | def metrics(): 57 | return generate_latest() 58 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | 4 | redis: 5 | image: redis 6 | ports: 7 | - "127.0.0.1:6379:6379" 8 | 9 | -------------------------------------------------------------------------------- /kubernetes/README.md: -------------------------------------------------------------------------------- 1 | Kubernetes workshop 2 | =================== 3 | 4 | ## 1. 
Making sure everyone is ready and `kubectl` is working 5 | 6 | 7 | Goals: 8 | 9 | - make sure everyone is ready to start the workshop and has all the prerequisites installed 10 | - explain `minikube` and `kubectl` 11 | 12 | Bash aliases you can add to your `~/.bash_profile`: 13 | ``` 14 | # k alias to kubectl 15 | alias k='kubectl' 16 | # autocomplete for k (alias to kubectl) 17 | source <(kubectl completion bash | sed 's/kubectl/k/g') 18 | ``` 19 | 20 | Links: 21 | 22 | - https://kubernetes.io/docs/tasks/tools/install-kubectl/ 23 | - installation [https://github.com/kubernetes/minikube/blob/v0.25.0/README.md](https://github.com/kubernetes/minikube/blob/v0.25.0/README.md) 24 | - minikube download [https://github.com/kubernetes/minikube/releases](https://github.com/kubernetes/minikube/releases) 25 | - minikube drivers [https://github.com/kubernetes/minikube/blob/master/docs/drivers.md](https://github.com/kubernetes/minikube/blob/master/docs/drivers.md) 26 | 27 | ## 2. What is a container and what is a pod? 28 | 29 | Goals: 30 | 31 | - explain basic concepts 32 | 33 | Links: 34 | 35 | - container or a single image [https://kubernetes.io/docs/concepts/containers/images/](https://kubernetes.io/docs/concepts/containers/images/) 36 | - understanding pods https://kubernetes.io/docs/concepts/workloads/pods/pod-overview/#understanding-pods 37 | 38 | ## 3. Creating your first deployment 39 | 40 | Goals: 41 | 42 | - write a deployment spec file 43 | - create and inspect the deployment 44 | - interact with the deployment remotely using `kubectl` 45 | 46 | Create deployment.yml 47 | 48 | ``` 49 | --- 50 | apiVersion: apps/v1beta2 51 | kind: Deployment 52 | metadata: 53 | name: flask-demo 54 | labels: 55 | app: flask-demo 56 | spec: 57 | replicas: 3 58 | selector: 59 | matchLabels: 60 | app: flask-demo 61 | template: 62 | metadata: 63 | labels: 64 | app: flask-demo 65 | spec: 66 | containers: 67 | - name: flask-demo 68 | image: exponea/flask-demo:1.0 69 | command: ["python", "runner.py"] 70 | 71 | ``` 72 | 73 | Useful commands: 74 | 75 | ``` 76 | kubectl explain deployment 77 | kubectl apply -f deployment.yml 78 | kubectl get pods 79 | kubectl describe pod <pod-name> 80 | kubectl logs <pod-name> 81 | kubectl exec <pod-name> -- ps aux 82 | kubectl delete pod <pod-name> 83 | kubectl port-forward <pod-name> 9090:80 # expose a pod port to your local machine 84 | ``` 85 | 86 | Links: 87 | 88 | - creating a deployment [https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#creating-a-deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#creating-a-deployment) 89 | 90 | ## 4. 
Service resource 91 | 92 | Goals: 93 | 94 | - create a service 95 | - use label selectors to expose a pod externally 96 | - headless service, node port and loadbalancer 97 | 98 | Create service.yml 99 | 100 | ``` 101 | --- 102 | apiVersion: v1 103 | kind: Service 104 | metadata: 105 | name: flask-demo 106 | labels: 107 | app: flask-demo 108 | spec: 109 | ports: 110 | - port: 80 111 | name: http 112 | targetPort: 80 113 | selector: 114 | app: flask-demo 115 | type: NodePort # add this only if you are using GKE cluster 116 | ``` 117 | 118 | Useful commands: 119 | 120 | ``` 121 | kubectl explain service 122 | kubectl get service 123 | kubectl describe service 124 | kubectl get endpoint 125 | kubectl describe endpoint 126 | ``` 127 | 128 | Check service with curl: `kubectl run curl --image=tutum/curl -it --rm` 129 | 130 | Links: 131 | 132 | - defining a service [https://kubernetes.io/docs/concepts/services-networking/service/#defining-a-service](https://kubernetes.io/docs/concepts/services-networking/service/#defining-a-service) 133 | 134 | ## 5. Ingress 135 | 136 | Goals: 137 | 138 | - explain why do we need ingress 139 | - differences between cloud providers and minikube 140 | - setup ingress and loadbalancer in minikube 141 | 142 | Create ingress.yml 143 | 144 | ``` 145 | --- 146 | apiVersion: extensions/v1beta1 147 | kind: Ingress 148 | metadata: 149 | name: flask-demo 150 | spec: 151 | backend: 152 | serviceName: flask-demo 153 | servicePort: 80 154 | ``` 155 | 156 | Useful commands: 157 | 158 | ``` 159 | minikube addons enable ingress # setup nginx lb in minikube 160 | minikube service list # list all services with urls in local cluster 161 | minikube ip 162 | ``` 163 | 164 | To simulate dns, add IP from `minikube ip` to /etc/hosts and you can access http://flask.demo from your browser. 165 | 166 | ``` 167 | echo $(minikube ip) flask.demo | sudo tee -a /etc/hosts 168 | ``` 169 | 170 | Links: 171 | 172 | - [https://kubernetes.io/docs/concepts/services-networking/ingress/](https://kubernetes.io/docs/concepts/services-networking/ingress/) 173 | 174 | ## 6. Resources 175 | 176 | Goals: 177 | 178 | - explain resource management 179 | - setup resource consumption and limits for our deployment 180 | 181 | Commands: 182 | ``` 183 | kubectl explain deployment.spec.template.spec.containers.resources 184 | ``` 185 | 186 | Similar to probes, add to `deployment.yml` under `spec.template.spec.containers` this block: 187 | ``` 188 | resources: 189 | limits: 190 | cpu: "1" 191 | memory: 128Mi 192 | requests: 193 | cpu: "50m" 194 | memory: 32Mi 195 | ``` 196 | 197 | Links: 198 | 199 | - [https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/) 200 | 201 | ## 7. Rolling update, scaling 202 | 203 | Goals: 204 | 205 | - show rolling update deployment and scaling 206 | - migrate app to version `2.0` 207 | - mention various deployment strategies 208 | 209 | Modify deployment.yml, add following lines under `spec` section: 210 | 211 | ``` 212 | strategy: 213 | type: RollingUpdate 214 | rollingUpdate: 215 | maxSurge: 1 216 | maxUnavailable: 0 217 | ``` 218 | 219 | * deployment is rolled out only if you change `spec.template` 220 | * deployment will stop if it encounter some error, can’t pull image, health check etc. 
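To see this rolling update in action, one option (a sketch; the deployment and container names follow the earlier examples, and `2.0` is the tag this section migrates to) is to bump the image tag from the command line and watch or revert the rollout:

```
# change the image tag in the pod template; this triggers a new rollout
kubectl set image deployment/flask-demo flask-demo=exponea/flask-demo:2.0
# inspect previous revisions and roll back if the new version misbehaves
kubectl rollout history deployment/flask-demo
kubectl rollout undo deployment/flask-demo
```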
221 | 222 | Useful commands: 223 | 224 | ``` 225 | kubectl rollout status deployment/flask-demo 226 | kubectl scale deployment flask-demo --replicas=5 227 | ``` 228 | 229 | Extra: 230 | 231 | Try out how the horizontal pod autoscaler works: 232 | ``` 233 | minikube addons enable heapster # enable heapster metrics 234 | kubectl autoscale deployment flask-demo --cpu-percent=50 --min=1 --max=10 235 | kubectl get hpa 236 | ``` 237 | and generate a high load on the flask-demo container. 238 | 239 | Links: 240 | 241 | - [https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#updating-a-deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#updating-a-deployment) 242 | - [https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) 243 | 244 | ## 8. Config map 245 | 246 | Goals: 247 | 248 | - create ConfigMaps to store application configuration data 249 | - prepare the config for the Redis instance we will use in our application 250 | 251 | Create redis.yml 252 | ``` 253 | --- 254 | apiVersion: v1 255 | kind: ConfigMap 256 | metadata: 257 | name: redis-conf 258 | data: 259 | redis.conf: |+ 260 | appendonly yes 261 | protected-mode no 262 | bind 0.0.0.0 263 | port 6379 264 | dir /var/lib/redis 265 | ``` 266 | 267 | Commands: 268 | ``` 269 | kubectl explain configmap 270 | kubectl create configmap redis-conf --from-file redis.conf # create directly from file 271 | ``` 272 | 273 | Extra: 274 | 275 | Add a `hostPath` volume `/data` to our deployment. 276 | 277 | Links: 278 | 279 | - [https://kubernetes.io/docs/tasks/configure-pod-container/configure-pod-configmap/#create-a-configmap](https://kubernetes.io/docs/tasks/configure-pod-container/configure-pod-configmap/#create-a-configmap) 280 | - [https://kubernetes.io/docs/concepts/storage/volumes/#emptydir](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) 281 | - creating volume [https://kubernetes.io/docs/concepts/storage/volumes/#hostpath](https://kubernetes.io/docs/concepts/storage/volumes/#hostpath) 282 | 283 | ## 9. 
Stateful set 284 | 285 | Goals: 286 | 287 | - create StatefulSet to provision redis database (stateful application) 288 | - explain ordinal index, stable network id, stable storage, deployment and scaling guarantees 289 | 290 | Append to redis.yml 291 | 292 | ``` 293 | --- 294 | apiVersion: v1 295 | kind: Service 296 | metadata: 297 | labels: 298 | app: redis 299 | name: redis 300 | spec: 301 | ports: 302 | - name: redis 303 | protocol: TCP 304 | port: 6379 305 | targetPort: 6379 306 | selector: 307 | app: redis 308 | 309 | --- 310 | apiVersion: apps/v1beta1 311 | kind: StatefulSet 312 | metadata: 313 | name: redis 314 | spec: 315 | serviceName: redis 316 | replicas: 1 317 | template: 318 | metadata: 319 | labels: 320 | app: redis 321 | spec: 322 | terminationGracePeriodSeconds: 10 323 | containers: 324 | - name: redis 325 | image: redis:4.0.8-alpine 326 | command: 327 | - redis-server 328 | args: 329 | - /etc/redis/redis.conf 330 | resources: 331 | requests: 332 | cpu: 100m 333 | memory: 100Mi 334 | ports: 335 | - containerPort: 6379 336 | name: redis 337 | volumeMounts: 338 | - name: redis-data 339 | mountPath: /var/lib/redis 340 | - name: redis-conf 341 | mountPath: /etc/redis 342 | volumes: 343 | - name: redis-conf 344 | configMap: 345 | name: redis-conf 346 | items: 347 | - key: redis.conf 348 | path: redis.conf 349 | volumeClaimTemplates: 350 | - metadata: 351 | name: redis-data 352 | spec: 353 | accessModes: ["ReadWriteOnce"] 354 | resources: 355 | requests: 356 | storage: 100Mi 357 | ``` 358 | 359 | Extra: 360 | 361 | Create Job resource that will create some file inside `/data` volume. 362 | 363 | Links: 364 | 365 | - stateful sets [https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) 366 | - persistent volumes [https://kubernetes.io/docs/concepts/storage/persistent-volumes/](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) 367 | - running job [https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#running-an-example-job](https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#running-an-example-job) 368 | 369 | ## 10. Managing app configuration and secret 370 | 371 | Goals: 372 | 373 | - explain secrets 374 | - setup environment variables required in new app release 375 | - use secrets to store env variable with password 376 | - migrate app to version `3.0` 377 | 378 | Setup environment variable in deployment.yml. Add following code under `spec.template.spec.containers`: 379 | 380 | ``` 381 | env: 382 | - name: REDIS_URL 383 | value: redis://redis:6379/0 384 | ``` 385 | 386 | Create secrets.yml containing password: 387 | ``` 388 | --- 389 | apiVersion: v1 390 | kind: Secret 391 | metadata: 392 | name: flask-demo 393 | type: Opaque 394 | data: 395 | AUTH: YWRtaW46YWRtaW4= 396 | ``` 397 | 398 | 399 | Secrets must be Base64 encoded string: 400 | ``` 401 | echo -n admin:admin | base64 402 | ``` 403 | 404 | Again update deployment.yml and add extra to `spec.template.spec.containers`: 405 | ``` 406 | envFrom: 407 | - secretRef: 408 | name: flask-demo 409 | ``` 410 | 411 | Deploy app version `3.0`. 412 | 413 | Extra: 414 | 415 | Create CronJob resource that will write current date to some file inside `/data` volume every minute. 
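A minimal sketch of such a CronJob (the name, the busybox image and the `hostPath` volume are illustrative assumptions; `batch/v1beta1` matches the era of the other API versions in this workshop):

```
---
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: date-writer            # illustrative name
spec:
  schedule: "* * * * *"        # every minute
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
          - name: date-writer
            image: busybox
            command: ["sh", "-c", "date >> /data/date.txt"]
            volumeMounts:
            - name: data
              mountPath: /data
          volumes:
          - name: data
            hostPath:          # reuses the /data hostPath from the previous extra task
              path: /data
```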
416 | 417 | Links: 418 | 419 | - environment variables [https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container](https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container) 420 | - secrets [https://kubernetes.io/docs/concepts/configuration/secret/#creating-your-own-secrets](https://kubernetes.io/docs/concepts/configuration/secret/#creating-your-own-secrets) 421 | - creating cron job [https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#creating-a-cron-job](https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#creating-a-cron-job) 422 | 423 | ## 11. Monitoring and Health Checks 424 | 425 | Goals: 426 | 427 | - create pods with readiness and liveness probes 428 | - troubleshoot failing readiness and liveness probes 429 | - kubectl explain `spec.template.spec.containers.readinessProbe` (what is the difference between the httpGet, exec and tcpSocket probe types?) 430 | 431 | Modify deployment.yml and add the following code under the `spec.template.spec.containers` section: 432 | ``` 433 | readinessProbe: 434 | httpGet: 435 | path: /health 436 | port: 80 437 | initialDelaySeconds: 5 438 | periodSeconds: 3 439 | ``` 440 | 441 | The same with redis.yml: 442 | ``` 443 | livenessProbe: 444 | exec: 445 | command: 446 | - sh 447 | - -c 448 | - "redis-cli -h $(hostname) ping" 449 | initialDelaySeconds: 15 450 | timeoutSeconds: 5 451 | ``` 452 | 453 | Extra: 454 | 455 | It is your responsibility to know your application and write sensible endpoints for the liveness and readiness probes. Sometimes, however, you are not the developer of the application, and you only know that it should not start before something else is running... like Redis. 456 | 457 | Use the previous knowledge plus `deployment.spec.template.spec.initContainers` to make sure Redis is running before our deployment can be started. 458 | 459 | Links: 460 | 461 | - [https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/) 462 | - [https://kubernetes.io/docs/concepts/workloads/pods/init-containers/](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/) 463 | 464 | ## 12. Prometheus + Grafana 465 | 466 | Goals: 467 | 468 | - set up Prometheus and Grafana 469 | - migrate app to version `4.0` 470 | - set up a service monitor 471 | - show example dashboards and graphs 472 | 473 | 474 | Set up auth: 475 | ``` 476 | kubectl create clusterrolebinding <your-name>-cluster-admin-binding --clusterrole=cluster-admin --user=<your-name>@mail.com 477 | ``` 478 | 479 | To set up Prometheus with Grafana, run this command: 480 | ``` 481 | kubernetes/prometheus/deploy 482 | ``` 483 | 484 | Deploy app version `4.0` with a `/metrics` endpoint for monitoring. 485 | 486 | Set up the service monitor for the app with: 487 | ``` 488 | kubectl apply -f service-monitor.yml --namespace=monitoring 489 | ``` 490 | 491 | Example Grafana metrics: 492 | ``` 493 | Requests per second: 494 | sum(rate(request_count[1m])) by (http_status, method, exported_endpoint) 495 | 496 | Memory usage: 497 | process_resident_memory_bytes{job="flask-demo"} 498 | 499 | CPU usage: 500 | rate(process_cpu_seconds_total{job="flask-demo"}[3m]) * 100 501 | 502 | Open file descriptors: 503 | process_open_fds{job="flask-demo"} 504 | ``` 505 | 506 | ## 13. Kubernetes dashboard 507 | 508 | Run `minikube dashboard`. 
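Besides the dashboard, the monitoring UIs from section 12 can be reached through their NodePort services (a sketch; the ports are the `nodePort` values from `grafana-service.yaml` and `alertmanager-service.yaml` in this repository):

```
minikube dashboard       # opens the Kubernetes dashboard in your browser
minikube service list    # lists every service in the cluster with its URL
echo "Grafana:      http://$(minikube ip):30902"
echo "Alertmanager: http://$(minikube ip):30903"
```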
509 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/03/deployment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1beta2 3 | kind: Deployment 4 | metadata: 5 | name: flask-demo 6 | labels: 7 | app: flask-demo 8 | spec: 9 | replicas: 3 10 | selector: 11 | matchLabels: 12 | app: flask-demo 13 | template: 14 | metadata: 15 | labels: 16 | app: flask-demo 17 | spec: 18 | containers: 19 | - name: flask-demo 20 | image: exponea/flask-demo:1.0 21 | command: ["python", "runner.py"] 22 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/04/service.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: flask-demo 6 | labels: 7 | app: flask-demo 8 | spec: 9 | ports: 10 | - port: 80 11 | name: http 12 | targetPort: 80 13 | selector: 14 | app: flask-demo 15 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/05/ingress.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: extensions/v1beta1 3 | kind: Ingress 4 | metadata: 5 | name: flask-demo 6 | spec: 7 | rules: 8 | - host: flask.demo 9 | http: 10 | paths: 11 | - backend: 12 | serviceName: flask-demo 13 | servicePort: 80 14 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/06/deployment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1beta2 3 | kind: Deployment 4 | metadata: 5 | name: flask-demo 6 | labels: 7 | app: flask-demo 8 | spec: 9 | replicas: 3 10 | selector: 11 | matchLabels: 12 | app: flask-demo 13 | template: 14 | metadata: 15 | labels: 16 | app: flask-demo 17 | spec: 18 | containers: 19 | - name: flask-demo 20 | image: exponea/flask-demo:2.0 21 | command: ["python", "runner.py"] 22 | resources: 23 | limits: 24 | cpu: "1" 25 | memory: 128Mi 26 | requests: 27 | cpu: "50m" 28 | memory: 32Mi 29 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/07/deployment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1beta2 3 | kind: Deployment 4 | metadata: 5 | name: flask-demo 6 | labels: 7 | app: flask-demo 8 | spec: 9 | replicas: 3 10 | strategy: 11 | type: RollingUpdate 12 | rollingUpdate: 13 | maxSurge: 1 14 | maxUnavailable: 0 15 | selector: 16 | matchLabels: 17 | app: flask-demo 18 | template: 19 | metadata: 20 | labels: 21 | app: flask-demo 22 | spec: 23 | containers: 24 | - name: flask-demo 25 | image: exponea/flask-demo:2.0 26 | command: ["python", "runner.py"] 27 | resources: 28 | limits: 29 | cpu: "1" 30 | memory: 128Mi 31 | requests: 32 | cpu: "50m" 33 | memory: 32Mi 34 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/08/redis.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: redis-conf 6 | data: 7 | redis.conf: |+ 8 | appendonly yes 9 | protected-mode no 10 | bind 0.0.0.0 11 | port 6379 12 | dir /var/lib/redis 13 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/09/redis.yml: 
-------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: redis-conf 6 | data: 7 | redis.conf: |+ 8 | appendonly yes 9 | protected-mode no 10 | bind 0.0.0.0 11 | port 6379 12 | dir /var/lib/redis 13 | --- 14 | apiVersion: v1 15 | kind: Service 16 | metadata: 17 | labels: 18 | app: redis 19 | name: redis 20 | spec: 21 | ports: 22 | - name: redis 23 | protocol: TCP 24 | port: 6379 25 | targetPort: 6379 26 | selector: 27 | app: redis 28 | 29 | --- 30 | apiVersion: apps/v1beta1 31 | kind: StatefulSet 32 | metadata: 33 | name: redis 34 | spec: 35 | serviceName: redis 36 | replicas: 1 37 | template: 38 | metadata: 39 | labels: 40 | app: redis 41 | spec: 42 | terminationGracePeriodSeconds: 10 43 | containers: 44 | - name: redis 45 | image: redis:4.0.8-alpine 46 | command: 47 | - redis-server 48 | args: 49 | - /etc/redis/redis.conf 50 | resources: 51 | requests: 52 | cpu: 100m 53 | memory: 100Mi 54 | ports: 55 | - containerPort: 6379 56 | name: redis 57 | volumeMounts: 58 | - name: redis-data 59 | mountPath: /var/lib/redis 60 | - name: redis-conf 61 | mountPath: /etc/redis 62 | volumes: 63 | - name: redis-conf 64 | configMap: 65 | name: redis-conf 66 | items: 67 | - key: redis.conf 68 | path: redis.conf 69 | volumeClaimTemplates: 70 | - metadata: 71 | name: redis-data 72 | spec: 73 | accessModes: ["ReadWriteOnce"] 74 | resources: 75 | requests: 76 | storage: 100Mi 77 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/10/deployment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1beta2 3 | kind: Deployment 4 | metadata: 5 | name: flask-demo 6 | labels: 7 | app: flask-demo 8 | spec: 9 | replicas: 3 10 | strategy: 11 | type: RollingUpdate 12 | rollingUpdate: 13 | maxSurge: 1 14 | maxUnavailable: 0 15 | selector: 16 | matchLabels: 17 | app: flask-demo 18 | template: 19 | metadata: 20 | labels: 21 | app: flask-demo 22 | spec: 23 | containers: 24 | - name: flask-demo 25 | image: exponea/flask-demo:3.0 26 | command: ["python", "runner.py"] 27 | resources: 28 | limits: 29 | cpu: "1" 30 | memory: 128Mi 31 | requests: 32 | cpu: "50m" 33 | memory: 32Mi 34 | env: 35 | - name: REDIS_URL 36 | value: redis://redis:6379/0 37 | envFrom: 38 | - secretRef: 39 | name: flask-demo 40 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/10/secrets.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Secret 4 | metadata: 5 | name: flask-demo 6 | type: Opaque 7 | data: 8 | AUTH: YWRtaW46YWRtaW4= 9 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/11/deployment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1beta2 3 | kind: Deployment 4 | metadata: 5 | name: flask-demo 6 | labels: 7 | app: flask-demo 8 | spec: 9 | replicas: 3 10 | strategy: 11 | type: RollingUpdate 12 | rollingUpdate: 13 | maxSurge: 1 14 | maxUnavailable: 0 15 | selector: 16 | matchLabels: 17 | app: flask-demo 18 | template: 19 | metadata: 20 | labels: 21 | app: flask-demo 22 | spec: 23 | containers: 24 | - name: flask-demo 25 | image: exponea/flask-demo:3.0 26 | command: ["python", "runner.py"] 27 | resources: 28 | limits: 29 | cpu: "1" 30 | memory: 128Mi 31 | requests: 32 | cpu: "50m" 33 | 
memory: 32Mi 34 | env: 35 | - name: REDIS_URL 36 | value: redis://redis:6379/0 37 | envFrom: 38 | - secretRef: 39 | name: flask-demo 40 | readinessProbe: 41 | httpGet: 42 | path: /health 43 | port: 80 44 | initialDelaySeconds: 5 45 | periodSeconds: 3 46 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/11/redis.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: redis-conf 6 | data: 7 | redis.conf: |+ 8 | appendonly yes 9 | protected-mode no 10 | bind 0.0.0.0 11 | port 6379 12 | dir /var/lib/redis 13 | --- 14 | apiVersion: v1 15 | kind: Service 16 | metadata: 17 | labels: 18 | app: redis 19 | name: redis 20 | spec: 21 | ports: 22 | - name: redis 23 | protocol: TCP 24 | port: 6379 25 | targetPort: 6379 26 | selector: 27 | app: redis 28 | 29 | --- 30 | apiVersion: apps/v1beta1 31 | kind: StatefulSet 32 | metadata: 33 | name: redis 34 | spec: 35 | serviceName: redis 36 | replicas: 1 37 | template: 38 | metadata: 39 | labels: 40 | app: redis 41 | spec: 42 | terminationGracePeriodSeconds: 10 43 | containers: 44 | - name: redis 45 | image: redis:4.0.8-alpine 46 | command: 47 | - redis-server 48 | args: 49 | - /etc/redis/redis.conf 50 | resources: 51 | requests: 52 | cpu: 100m 53 | memory: 100Mi 54 | ports: 55 | - containerPort: 6379 56 | name: redis 57 | volumeMounts: 58 | - name: redis-data 59 | mountPath: /var/lib/redis 60 | - name: redis-conf 61 | mountPath: /etc/redis 62 | livenessProbe: 63 | exec: 64 | command: 65 | - sh 66 | - -c 67 | - "redis-cli -h $(hostname) ping" 68 | initialDelaySeconds: 15 69 | timeoutSeconds: 5 70 | volumes: 71 | - name: redis-conf 72 | configMap: 73 | name: redis-conf 74 | items: 75 | - key: redis.conf 76 | path: redis.conf 77 | volumeClaimTemplates: 78 | - metadata: 79 | name: redis-data 80 | spec: 81 | accessModes: ["ReadWriteOnce"] 82 | resources: 83 | requests: 84 | storage: 100Mi 85 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/12/deployment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1beta2 3 | kind: Deployment 4 | metadata: 5 | name: flask-demo 6 | labels: 7 | app: flask-demo 8 | spec: 9 | replicas: 3 10 | strategy: 11 | type: RollingUpdate 12 | rollingUpdate: 13 | maxSurge: 1 14 | maxUnavailable: 0 15 | selector: 16 | matchLabels: 17 | app: flask-demo 18 | template: 19 | metadata: 20 | labels: 21 | app: flask-demo 22 | spec: 23 | containers: 24 | - name: flask-demo 25 | image: exponea/flask-demo:4.0 26 | command: ["python", "runner.py"] 27 | resources: 28 | limits: 29 | cpu: "1" 30 | memory: 128Mi 31 | requests: 32 | cpu: "50m" 33 | memory: 32Mi 34 | env: 35 | - name: REDIS_URL 36 | value: redis://redis:6379/0 37 | envFrom: 38 | - secretRef: 39 | name: flask-demo 40 | readinessProbe: 41 | httpGet: 42 | path: /health 43 | port: 80 44 | initialDelaySeconds: 5 45 | periodSeconds: 3 46 | -------------------------------------------------------------------------------- /kubernetes/checkpoints/12/service-monitor.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | name: flask-demo 6 | namespace: monitoring 7 | labels: 8 | monitoring: enabled 9 | spec: 10 | selector: 11 | matchLabels: 12 | app: flask-demo 13 | namespaceSelector: 14 | matchNames: 15 | - 
default 16 | endpoints: 17 | - port: http 18 | interval: 10s 19 | -------------------------------------------------------------------------------- /kubernetes/final/deployment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1beta2 3 | kind: Deployment 4 | metadata: 5 | name: flask-demo 6 | labels: 7 | app: flask-demo 8 | spec: 9 | replicas: 3 10 | strategy: 11 | type: RollingUpdate 12 | rollingUpdate: 13 | maxSurge: 1 14 | maxUnavailable: 0 15 | selector: 16 | matchLabels: 17 | app: flask-demo 18 | template: 19 | metadata: 20 | labels: 21 | app: flask-demo 22 | spec: 23 | containers: 24 | - name: flask-demo 25 | image: exponea/flask-demo:4.0 26 | command: ["python", "runner.py"] 27 | env: 28 | - name: REDIS_URL 29 | value: redis://redis:6379/0 30 | envFrom: 31 | - secretRef: 32 | name: flask-demo 33 | readinessProbe: 34 | httpGet: 35 | path: /health 36 | port: 80 37 | initialDelaySeconds: 5 38 | periodSeconds: 3 39 | resources: 40 | limits: 41 | cpu: "1" 42 | memory: 128Mi 43 | requests: 44 | cpu: "50m" 45 | memory: 32Mi 46 | -------------------------------------------------------------------------------- /kubernetes/final/ingress.yml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Ingress 3 | metadata: 4 | name: flask-demo 5 | spec: 6 | backend: 7 | serviceName: flask-demo 8 | servicePort: 80 9 | -------------------------------------------------------------------------------- /kubernetes/final/redis.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: redis-conf 6 | data: 7 | redis.conf: |+ 8 | appendonly yes 9 | protected-mode no 10 | bind 0.0.0.0 11 | port 6379 12 | dir /var/lib/redis 13 | --- 14 | apiVersion: v1 15 | kind: Service 16 | metadata: 17 | labels: 18 | app: redis 19 | name: redis 20 | spec: 21 | ports: 22 | - name: redis 23 | protocol: TCP 24 | port: 6379 25 | targetPort: 6379 26 | selector: 27 | app: redis 28 | 29 | --- 30 | apiVersion: apps/v1beta1 31 | kind: StatefulSet 32 | metadata: 33 | name: redis 34 | spec: 35 | serviceName: redis 36 | replicas: 1 37 | template: 38 | metadata: 39 | labels: 40 | app: redis 41 | spec: 42 | terminationGracePeriodSeconds: 10 43 | containers: 44 | - name: redis 45 | image: redis:4.0.8-alpine 46 | command: 47 | - redis-server 48 | args: 49 | - /etc/redis/redis.conf 50 | resources: 51 | requests: 52 | cpu: 100m 53 | memory: 100Mi 54 | ports: 55 | - containerPort: 6379 56 | name: redis 57 | volumeMounts: 58 | - name: redis-data 59 | mountPath: /var/lib/redis 60 | - name: redis-conf 61 | mountPath: /etc/redis 62 | livenessProbe: 63 | exec: 64 | command: 65 | - sh 66 | - -c 67 | - "redis-cli -h $(hostname) ping" 68 | initialDelaySeconds: 15 69 | timeoutSeconds: 5 70 | volumes: 71 | - name: redis-conf 72 | configMap: 73 | name: redis-conf 74 | items: 75 | - key: redis.conf 76 | path: redis.conf 77 | volumeClaimTemplates: 78 | - metadata: 79 | name: redis-data 80 | spec: 81 | accessModes: ["ReadWriteOnce"] 82 | resources: 83 | requests: 84 | storage: 100Mi 85 | -------------------------------------------------------------------------------- /kubernetes/final/secret.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Secret 4 | metadata: 5 | name: flask-demo 6 | type: Opaque 7 | data: 8 | AUTH: YWRtaW46YWRtaW4= 9 | 
-------------------------------------------------------------------------------- /kubernetes/final/service-monitor.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | name: flask-demo 6 | namespace: monitoring 7 | labels: 8 | monitoring: enabled 9 | spec: 10 | selector: 11 | matchLabels: 12 | app: flask-demo 13 | namespaceSelector: 14 | any: true 15 | endpoints: 16 | - port: http 17 | interval: 10s 18 | -------------------------------------------------------------------------------- /kubernetes/final/service.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: flask-demo 6 | labels: 7 | app: flask-demo 8 | spec: 9 | ports: 10 | - port: 80 11 | name: http 12 | targetPort: 80 13 | selector: 14 | app: flask-demo 15 | type: NodePort # add this only if you are using GKE cluster 16 | -------------------------------------------------------------------------------- /kubernetes/prometheus/deploy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ -z "${KUBECONFIG}" ]; then 4 | export KUBECONFIG=~/.kube/config 5 | fi 6 | 7 | # CAUTION - setting NAMESPACE will deploy most components to the given namespace 8 | # however some are hardcoded to 'monitoring'. Only use if you have reviewed all manifests. 9 | 10 | if [ -z "${NAMESPACE}" ]; then 11 | NAMESPACE=monitoring 12 | fi 13 | 14 | kubectl create namespace "$NAMESPACE" 15 | 16 | kctl() { 17 | kubectl --namespace "$NAMESPACE" "$@" 18 | } 19 | 20 | kctl apply -f manifests/prometheus-operator 21 | 22 | # Wait for CRDs to be ready. 23 | printf "Waiting for Operator to register custom resource definitions..." 24 | until kctl get customresourcedefinitions servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done 25 | until kctl get customresourcedefinitions prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done 26 | until kctl get customresourcedefinitions alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done 27 | until kctl get servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done 28 | until kctl get prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done 29 | until kctl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done 30 | echo "done!" 31 | 32 | kctl apply -f manifests/node-exporter 33 | kctl apply -f manifests/kube-state-metrics 34 | kctl apply -f manifests/grafana/grafana-credentials.yaml 35 | kctl apply -f manifests/grafana/grafana-configs.yaml 36 | kctl apply -f manifests/grafana 37 | find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! 
-name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \; 38 | kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml 39 | kctl apply -f manifests/alertmanager/ 40 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/alertmanager/alertmanager-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: alertmanager-main 5 | data: 6 | alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== 7 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/alertmanager/alertmanager-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | alertmanager: main 6 | name: alertmanager-main 7 | spec: 8 | type: NodePort 9 | ports: 10 | - name: web 11 | nodePort: 30903 12 | port: 9093 13 | protocol: TCP 14 | targetPort: web 15 | selector: 16 | alertmanager: main 17 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/alertmanager/alertmanager.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Alertmanager 3 | metadata: 4 | name: main 5 | labels: 6 | alertmanager: main 7 | spec: 8 | replicas: 1 9 | version: v0.16.0 10 | baseImage: quay.io/prometheus/alertmanager 11 | securityContext: 12 | fsGroup: 2000 13 | runAsNonRoot: true 14 | runAsUser: 1000 15 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/grafana/grafana-configs.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: grafana-datasources 6 | data: 7 | prometheus.yaml: |- 8 | { 9 | "apiVersion": 1, 10 | "datasources": [ 11 | { 12 | "access": "proxy", 13 | "basicAuth": false, 14 | "name": "prometheus", 15 | "type": "prometheus", 16 | "url": "http://prometheus-k8s.monitoring.svc:9090" 17 | }, 18 | { 19 | "access": "proxy", 20 | "basicAuth": false, 21 | "name": "alertmanager", 22 | "type": "camptocamp-prometheus-alertmanager-datasource", 23 | "url": "http://alertmanager-main.monitoring.svc:9093", 24 | "jsonData": { 25 | "severity_critical": "3", 26 | "severity_high": "2", 27 | "severity_warning": "1", 28 | "severity_info": "0" 29 | } 30 | }, 31 | ] 32 | } 33 | --- 34 | apiVersion: v1 35 | kind: ConfigMap 36 | metadata: 37 | name: grafana-dashboards 38 | data: 39 | dashboards.yaml: |- 40 | [ 41 | { 42 | "folder": "", 43 | "name": "0", 44 | "options": { 45 | "path": "/grafana-dashboard-definitions/0" 46 | }, 47 | "org_id": 1, 48 | "type": "file" 49 | } 50 | ] 51 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/grafana/grafana-credentials.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: grafana-credentials 5 | data: 6 | user: YWRtaW4= 7 | 
password: YWRtaW4= 8 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/grafana/grafana-deployment.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1beta1 3 | kind: Deployment 4 | metadata: 5 | name: grafana 6 | spec: 7 | replicas: 1 8 | template: 9 | metadata: 10 | labels: 11 | app: grafana 12 | spec: 13 | securityContext: 14 | runAsNonRoot: true 15 | runAsUser: 65534 16 | initContainers: 17 | - name: grafana-plugin-installer 18 | image: grafana/grafana:6.0.0 19 | command: ["grafana-cli", "--pluginsDir", "/data", "plugins", "install", "camptocamp-prometheus-alertmanager-datasource"] 20 | volumeMounts: 21 | - name: grafana-storage 22 | mountPath: /data 23 | containers: 24 | - name: grafana 25 | image: grafana/grafana:6.2.2 26 | env: 27 | - name: GF_PATHS_DATA 28 | value: /data 29 | - name: GF_PATHS_PLUGINS 30 | value: /data 31 | - name: GF_AUTH_BASIC_ENABLED 32 | value: "true" 33 | - name: GF_AUTH_ANONYMOUS_ENABLED 34 | value: "true" 35 | - name: GF_SECURITY_ADMIN_USER 36 | valueFrom: 37 | secretKeyRef: 38 | name: grafana-credentials 39 | key: user 40 | - name: GF_SECURITY_ADMIN_PASSWORD 41 | valueFrom: 42 | secretKeyRef: 43 | name: grafana-credentials 44 | key: password 45 | ports: 46 | - name: http 47 | containerPort: 3000 48 | resources: 49 | requests: 50 | memory: 100Mi 51 | cpu: 100m 52 | limits: 53 | memory: 1Gi 54 | cpu: 1 55 | readinessProbe: 56 | httpGet: 57 | path: /api/health 58 | port: 3000 59 | volumeMounts: 60 | - name: grafana-storage 61 | mountPath: /data 62 | - name: grafana-dashboard-definitions-0 63 | mountPath: /grafana-dashboard-definitions/0 64 | - name: grafana-dashboards 65 | mountPath: /etc/grafana/provisioning/dashboards 66 | readOnly: false 67 | - name: grafana-datasources 68 | mountPath: /etc/grafana/provisioning/datasources 69 | readOnly: false 70 | volumes: 71 | - name: grafana-storage 72 | emptyDir: {} 73 | - name: grafana-datasources 74 | configMap: 75 | name: grafana-datasources 76 | - name: grafana-dashboards 77 | configMap: 78 | name: grafana-dashboards 79 | - name: grafana-dashboard-definitions-0 80 | configMap: 81 | name: grafana-dashboard-definitions-0 82 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/grafana/grafana-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: grafana 5 | labels: 6 | app: grafana 7 | spec: 8 | type: NodePort 9 | ports: 10 | - port: 3000 11 | protocol: TCP 12 | nodePort: 30902 13 | targetPort: web 14 | selector: 15 | app: grafana 16 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: kube-state-metrics 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: kube-state-metrics 9 | subjects: 10 | - kind: ServiceAccount 11 | name: kube-state-metrics 12 | namespace: monitoring 13 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: kube-state-metrics 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - configmaps 10 | - secrets 11 | - nodes 12 | - pods 13 | - services 14 | - resourcequotas 15 | - replicationcontrollers 16 | - limitranges 17 | - persistentvolumeclaims 18 | - persistentvolumes 19 | - namespaces 20 | - endpoints 21 | verbs: 22 | - list 23 | - watch 24 | - apiGroups: 25 | - extensions 26 | resources: 27 | - daemonsets 28 | - deployments 29 | - replicasets 30 | verbs: 31 | - list 32 | - watch 33 | - apiGroups: 34 | - apps 35 | resources: 36 | - statefulsets 37 | - daemonsets 38 | - deployments 39 | - replicasets 40 | verbs: 41 | - list 42 | - watch 43 | - apiGroups: 44 | - batch 45 | resources: 46 | - cronjobs 47 | - jobs 48 | verbs: 49 | - list 50 | - watch 51 | - apiGroups: 52 | - autoscaling 53 | resources: 54 | - horizontalpodautoscalers 55 | verbs: 56 | - list 57 | - watch 58 | - apiGroups: 59 | - authentication.k8s.io 60 | resources: 61 | - tokenreviews 62 | verbs: 63 | - create 64 | - apiGroups: 65 | - authorization.k8s.io 66 | resources: 67 | - subjectaccessreviews 68 | verbs: 69 | - create 70 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta2 2 | kind: Deployment 3 | metadata: 4 | name: kube-state-metrics 5 | namespace: monitoring 6 | labels: 7 | app: kube-state-metrics 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: kube-state-metrics 13 | template: 14 | metadata: 15 | labels: 16 | app: kube-state-metrics 17 | spec: 18 | serviceAccountName: kube-state-metrics 19 | securityContext: 20 | runAsNonRoot: true 21 | runAsUser: 65534 22 | containers: 23 | - name: kube-rbac-proxy-main 24 | image: quay.io/coreos/kube-rbac-proxy:v0.4.1 25 | args: 26 | - --logtostderr 27 | - --secure-listen-address=:8443 28 | - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 29 | - --upstream=http://127.0.0.1:8081/ 30 | ports: 31 | - name: https-main 32 | containerPort: 8443 33 | resources: 34 | limits: 35 | cpu: 1 36 | memory: 40Mi 37 | requests: 38 | cpu: 10m 39 | memory: 20Mi 40 | - name: kube-rbac-proxy-self 41 | image: quay.io/coreos/kube-rbac-proxy:v0.4.1 42 | args: 43 | - --logtostderr 44 | - --secure-listen-address=:9443 45 | - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 46 | - --upstream=http://127.0.0.1:8082/ 47 | ports: 48 | - name: https-self 49 | containerPort: 9443 50 | resources: 51 | limits: 52 | cpu: 1 53 | memory: 40Mi 54 | requests: 55 | cpu: 10m 56 | memory: 20Mi 57 | - name: kube-state-metrics 58 | image: quay.io/coreos/kube-state-metrics:v1.7.1 59 | args: 60 | - "--host=127.0.0.1" 61 | - "--port=8081" 62 | - "--telemetry-host=127.0.0.1" 63 | - "--telemetry-port=8082" 64 | resources: 65 | limits: 66 | cpu: 1 67 | memory: 100Mi 68 | requests: 69 | cpu: 100m 70 | memory: 30Mi 71 | - name: addon-resizer 72 | image: gcr.io/google-containers/addon-resizer-amd64:2.1 73 | 
resources: 74 | limits: 75 | cpu: 1 76 | memory: 60Mi 77 | requests: 78 | cpu: 10m 79 | memory: 30Mi 80 | env: 81 | - name: MY_POD_NAME 82 | valueFrom: 83 | fieldRef: 84 | apiVersion: v1 85 | fieldPath: metadata.name 86 | - name: MY_POD_NAMESPACE 87 | valueFrom: 88 | fieldRef: 89 | apiVersion: v1 90 | fieldPath: metadata.namespace 91 | command: 92 | - /pod_nanny 93 | - --container=kube-state-metrics 94 | - --cpu=100m 95 | - --extra-cpu=5m 96 | - --memory=150Mi 97 | - --extra-memory=5Mi 98 | - --acceptance-offset=5 99 | - --deployment=kube-state-metrics 100 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1beta1 2 | kind: RoleBinding 3 | metadata: 4 | name: kube-state-metrics 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: Role 8 | name: kube-state-metrics-resizer 9 | subjects: 10 | - kind: ServiceAccount 11 | name: kube-state-metrics 12 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/kube-state-metrics/kube-state-metrics-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: kube-state-metrics 5 | namespace: monitoring 6 | rules: 7 | - apiGroups: 8 | - "" 9 | resources: 10 | - pods 11 | verbs: 12 | - get 13 | - apiGroups: 14 | - extensions 15 | resourceNames: 16 | - kube-state-metrics 17 | resources: 18 | - deployments 19 | verbs: 20 | - get 21 | - update 22 | - apiGroups: 23 | - apps 24 | resourceNames: 25 | - kube-state-metrics 26 | resources: 27 | - deployments 28 | verbs: 29 | - get 30 | - update 31 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/kube-state-metrics/kube-state-metrics-service-account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: kube-state-metrics 5 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/kube-state-metrics/kube-state-metrics-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: kube-state-metrics 5 | labels: 6 | app: kube-state-metrics 7 | k8s-app: kube-state-metrics 8 | spec: 9 | clusterIP: None 10 | ports: 11 | - name: https-main 12 | port: 8443 13 | targetPort: https-main 14 | protocol: TCP 15 | - name: https-self 16 | port: 9443 17 | targetPort: https-self 18 | protocol: TCP 19 | selector: 20 | app: kube-state-metrics 21 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/node-exporter/node-exporter-cluster-role-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: node-exporter 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: node-exporter 9 | subjects: 10 | - kind: ServiceAccount 11 | name: node-exporter 12 | namespace: monitoring 13 | -------------------------------------------------------------------------------- 
/kubernetes/prometheus/manifests/node-exporter/node-exporter-cluster-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: node-exporter 5 | rules: 6 | - apiGroups: 7 | - authentication.k8s.io 8 | resources: 9 | - tokenreviews 10 | verbs: 11 | - create 12 | - apiGroups: 13 | - authorization.k8s.io 14 | resources: 15 | - subjectaccessreviews 16 | verbs: 17 | - create 18 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/node-exporter/node-exporter-daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: DaemonSet 3 | metadata: 4 | name: node-exporter 5 | labels: 6 | app: node-exporter 7 | spec: 8 | updateStrategy: 9 | rollingUpdate: 10 | maxUnavailable: 1 11 | type: RollingUpdate 12 | template: 13 | metadata: 14 | labels: 15 | app: node-exporter 16 | name: node-exporter 17 | spec: 18 | serviceAccountName: node-exporter 19 | securityContext: 20 | runAsNonRoot: true 21 | runAsUser: 65534 22 | hostNetwork: true 23 | hostPID: true 24 | containers: 25 | - image: quay.io/prometheus/node-exporter:v0.18.1 26 | args: 27 | - --web.listen-address=127.0.0.1:9100 28 | - --path.procfs=/host/proc 29 | - --path.sysfs=/host/sys 30 | - --path.rootfs=/host/root 31 | - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) 32 | - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ 33 | name: node-exporter 34 | resources: 35 | requests: 36 | memory: 30Mi 37 | cpu: 10m 38 | limits: 39 | memory: 128Mi 40 | cpu: 1 41 | volumeMounts: 42 | - name: proc 43 | readOnly: true 44 | mountPath: /host/proc 45 | - name: sys 46 | readOnly: true 47 | mountPath: /host/sys 48 | - name: root 49 | mountPath: /host/root 50 | mountPropagation: HostToContainer 51 | readOnly: true 52 | - name: kube-rbac-proxy 53 | image: quay.io/coreos/kube-rbac-proxy:v0.4.1 54 | args: 55 | - --logtostderr 56 | - --secure-listen-address=$(IP):9100 57 | - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 58 | - --upstream=http://127.0.0.1:9100/ 59 | env: 60 | - name: IP 61 | valueFrom: 62 | fieldRef: 63 | fieldPath: status.podIP 64 | ports: 65 | - containerPort: 9100 66 | hostPort: 9100 67 | name: https 68 | resources: 69 | requests: 70 | memory: 20Mi 71 | cpu: 10m 72 | limits: 73 | memory: 40Mi 74 | cpu: 200m 75 | tolerations: 76 | - effect: NoSchedule 77 | operator: Exists 78 | volumes: 79 | - name: proc 80 | hostPath: 81 | path: /proc 82 | - name: sys 83 | hostPath: 84 | path: /sys 85 | - name: root 86 | hostPath: 87 | path: / 88 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/node-exporter/node-exporter-service-account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: node-exporter 5 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/node-exporter/node-exporter-service.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: node-exporter 5 | labels: 6 | app: node-exporter 7 | k8s-app: node-exporter 8 | spec: 9 | clusterIP: None 10 | ports: 11 | - name: https 12 | port: 9100 13 | targetPort: 9100 14 | selector: 15 | app: node-exporter 16 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1beta1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: prometheus-operator 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: prometheus-operator 9 | subjects: 10 | - kind: ServiceAccount 11 | name: prometheus-operator 12 | namespace: monitoring -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: prometheus-operator 5 | rules: 6 | - apiGroups: 7 | - apiextensions.k8s.io 8 | resources: 9 | - customresourcedefinitions 10 | verbs: 11 | - '*' 12 | - apiGroups: 13 | - monitoring.coreos.com 14 | resources: 15 | - alertmanagers 16 | - prometheuses 17 | - prometheuses/finalizers 18 | - alertmanagers/finalizers 19 | - servicemonitors 20 | - prometheusrules 21 | verbs: 22 | - '*' 23 | - apiGroups: 24 | - apps 25 | resources: 26 | - statefulsets 27 | verbs: 28 | - '*' 29 | - apiGroups: 30 | - "" 31 | resources: 32 | - configmaps 33 | - secrets 34 | verbs: 35 | - '*' 36 | - apiGroups: 37 | - "" 38 | resources: 39 | - pods 40 | verbs: 41 | - list 42 | - delete 43 | - apiGroups: 44 | - "" 45 | resources: 46 | - services 47 | - services/finalizers 48 | - endpoints 49 | verbs: 50 | - get 51 | - create 52 | - update 53 | - delete 54 | - apiGroups: 55 | - "" 56 | resources: 57 | - nodes 58 | verbs: 59 | - list 60 | - watch 61 | - apiGroups: 62 | - "" 63 | resources: 64 | - namespaces 65 | verbs: 66 | - get 67 | - list 68 | - watch 69 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus-operator/prometheus-operator-service-account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: prometheus-operator 5 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus-operator/prometheus-operator-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: prometheus-operator 5 | labels: 6 | k8s-app: prometheus-operator 7 | spec: 8 | clusterIP: None 9 | ports: 10 | - name: http 11 | port: 8080 12 | targetPort: http 13 | selector: 14 | k8s-app: prometheus-operator 15 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus-operator/prometheus-operator.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1beta2 3 | kind: Deployment 4 | metadata: 5 | name: 
prometheus-operator 6 | labels: 7 | k8s-app: prometheus-operator 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | k8s-app: prometheus-operator 13 | template: 14 | metadata: 15 | labels: 16 | k8s-app: prometheus-operator 17 | spec: 18 | containers: 19 | - args: 20 | - --kubelet-service=kube-system/kubelet 21 | - --logtostderr=true 22 | - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 23 | - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.29.0 24 | image: quay.io/coreos/prometheus-operator:v0.29.0 25 | name: prometheus-operator 26 | ports: 27 | - containerPort: 8080 28 | name: http 29 | resources: 30 | limits: 31 | cpu: 200m 32 | memory: 200Mi 33 | requests: 34 | cpu: 100m 35 | memory: 100Mi 36 | securityContext: 37 | allowPrivilegeEscalation: false 38 | readOnlyRootFilesystem: true 39 | securityContext: 40 | runAsNonRoot: true 41 | runAsUser: 65534 42 | serviceAccountName: prometheus-operator 43 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-roles.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1beta1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: prometheus-k8s 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: ClusterRole 9 | name: prometheus-k8s 10 | subjects: 11 | - kind: ServiceAccount 12 | name: prometheus-k8s 13 | namespace: monitoring 14 | --- 15 | apiVersion: rbac.authorization.k8s.io/v1 16 | kind: ClusterRole 17 | metadata: 18 | name: prometheus-k8s 19 | rules: 20 | - apiGroups: 21 | - "" 22 | resources: 23 | - nodes/metrics 24 | verbs: 25 | - get 26 | - nonResourceURLs: 27 | - /metrics 28 | verbs: 29 | - get 30 | 31 | - apiGroups: [""] 32 | resources: 33 | - nodes 34 | - services 35 | - endpoints 36 | - pods 37 | verbs: ["get", "list", "watch"] 38 | - apiGroups: [""] 39 | resources: 40 | - configmaps 41 | verbs: ["get"] 42 | 43 | --- 44 | apiVersion: v1 45 | kind: ServiceAccount 46 | metadata: 47 | name: prometheus-k8s 48 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-rules.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | name: prometheus-k8s-rules-system 5 | labels: 6 | role: prometheus-rulefiles 7 | prometheus: k8s 8 | alertmanager-rules-group: common 9 | spec: 10 | groups: 11 | - name: kubernetes-absent 12 | rules: 13 | - alert: AlertmanagerDown 14 | annotations: 15 | description: Alertmanager has disappeared from Prometheus target discovery. 16 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown 17 | expr: | 18 | absent(up{job="alertmanager-main"} == 1) 19 | for: 15m 20 | labels: 21 | severity: '3' 22 | - alert: KubeAPIDown 23 | annotations: 24 | description: KubeAPI has disappeared from Prometheus target discovery. 25 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown 26 | expr: | 27 | absent(up{job="apiserver"} == 1) 28 | for: 15m 29 | labels: 30 | severity: '3' 31 | - alert: KubeStateMetricsDown 32 | annotations: 33 | description: KubeStateMetrics has disappeared from Prometheus target discovery. 
34 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown 35 | expr: | 36 | absent(up{job="kube-state-metrics"} == 1) 37 | for: 15m 38 | labels: 39 | severity: '3' 40 | - alert: KubeletDown 41 | annotations: 42 | description: Kubelet has disappeared from Prometheus target discovery. 43 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown 44 | expr: | 45 | absent(up{job="kubelet"} == 1) 46 | for: 15m 47 | labels: 48 | severity: '3' 49 | - alert: NodeExporterDown 50 | annotations: 51 | description: NodeExporter has disappeared from Prometheus target discovery. 52 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown 53 | expr: | 54 | absent(up{job="node-exporter"} == 1) 55 | for: 15m 56 | labels: 57 | severity: '3' 58 | - alert: PrometheusDown 59 | annotations: 60 | description: Prometheus has disappeared from Prometheus target discovery. 61 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown 62 | expr: | 63 | absent(up{job="prometheus-k8s"} == 1) 64 | for: 15m 65 | labels: 66 | severity: '3' 67 | - alert: PrometheusOperatorDown 68 | annotations: 69 | description: PrometheusOperator has disappeared from Prometheus target discovery. 70 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown 71 | expr: | 72 | absent(up{job="prometheus-operator"} == 1) 73 | for: 15m 74 | labels: 75 | severity: '3' 76 | 77 | - name: kubernetes-apps 78 | rules: 79 | - alert: KubePodCrashLooping 80 | annotations: 81 | description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container 82 | }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.' 83 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping 84 | expr: | 85 | rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * on (namespace,pod) group_left(label_team) kube_pod_labels * 60 * 5 > 0 86 | for: 1h 87 | labels: 88 | severity: '3' 89 | - alert: KubePodNotReady 90 | annotations: 91 | description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready 92 | state for longer than an hour.' 93 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready 94 | expr: | 95 | sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on (namespace,pod) group_left(label_team) kube_pod_labels > 0 96 | for: 1h 97 | labels: 98 | severity: '3' 99 | - alert: KubeDeploymentGenerationMismatch 100 | annotations: 101 | description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment 102 | }} does not match, this indicates that the Deployment has failed but has 103 | not been rolled back.' 
104 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch 105 | expr: | 106 | (kube_deployment_status_observed_generation{job="kube-state-metrics"} 107 | != 108 | kube_deployment_metadata_generation{job="kube-state-metrics"}) 109 | * on(namespace,deployment) group_left(label_team) kube_deployment_labels 110 | for: 15m 111 | labels: 112 | severity: '3' 113 | - alert: KubeDeploymentReplicasMismatch 114 | annotations: 115 | description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not 116 | matched the expected number of replicas for longer than an hour.' 117 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch 118 | expr: | 119 | (kube_deployment_spec_replicas{job="kube-state-metrics"} 120 | != 121 | kube_deployment_status_replicas_available{job="kube-state-metrics"}) 122 | * on(namespace,deployment) group_left(label_team) kube_deployment_labels 123 | for: 1h 124 | labels: 125 | severity: '3' 126 | - alert: KubeStatefulSetReplicasMismatch 127 | annotations: 128 | description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has 129 | not matched the expected number of replicas for longer than 15 minutes.' 130 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch 131 | expr: | 132 | (kube_statefulset_status_replicas_ready{job="kube-state-metrics"} 133 | != 134 | kube_statefulset_status_replicas{job="kube-state-metrics"}) 135 | * on(namespace,statefulset) group_left(label_team) kube_statefulset_labels 136 | for: 15m 137 | labels: 138 | severity: '3' 139 | - alert: KubeStatefulSetGenerationMismatch 140 | annotations: 141 | description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset 142 | }} does not match, this indicates that the StatefulSet has failed but has 143 | not been rolled back.' 144 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch 145 | expr: | 146 | (kube_statefulset_status_observed_generation{job="kube-state-metrics"} 147 | != 148 | kube_statefulset_metadata_generation{job="kube-state-metrics"}) 149 | * on(namespace,statefulset) group_left(label_team) kube_statefulset_labels 150 | for: 15m 151 | labels: 152 | severity: '3' 153 | - alert: KubeStatefulSetUpdateNotRolledOut 154 | annotations: 155 | description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update 156 | has not been rolled out.' 157 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout 158 | expr: | 159 | max without (revision) ( 160 | kube_statefulset_status_current_revision{job="kube-state-metrics"} 161 | unless 162 | kube_statefulset_status_update_revision{job="kube-state-metrics"} 163 | ) 164 | * 165 | ( 166 | kube_statefulset_replicas{job="kube-state-metrics"} 167 | != 168 | kube_statefulset_status_replicas_updated{job="kube-state-metrics"} 169 | ) * on(namespace,statefulset) group_left(label_team) kube_statefulset_labels 170 | for: 15m 171 | labels: 172 | severity: '3' 173 | - alert: KubeDaemonSetRolloutStuck 174 | annotations: 175 | description: 'Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace 176 | }}/{{ $labels.daemonset }} are scheduled and ready.' 
177 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck 178 | expr: | 179 | (kube_daemonset_status_number_ready{job="kube-state-metrics"} 180 | / 181 | kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) 182 | * on(namespace,daemonset) group_left(label_team) kube_daemonset_labels 183 | * 100 < 100 184 | for: 15m 185 | labels: 186 | severity: '3' 187 | - alert: KubeDaemonSetNotScheduled 188 | annotations: 189 | description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset 190 | }} are not scheduled.' 191 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled 192 | expr: | 193 | (kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} 194 | - 195 | kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"}) 196 | * on(namespace,daemonset) group_left(label_team) kube_daemonset_labels > 0 197 | for: 10m 198 | labels: 199 | severity: '2' 200 | - alert: KubeDaemonSetMisScheduled 201 | annotations: 202 | description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset 203 | }} are running where they are not supposed to run.' 204 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled 205 | expr: | 206 | kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} 207 | * on(namespace,daemonset) group_left(label_team) kube_daemonset_labels > 0 208 | for: 10m 209 | labels: 210 | severity: '2' 211 | - alert: PodFrequentlyRestarting 212 | expr: | 213 | increase(kube_pod_container_status_restarts_total[1h]) 214 | * on (namespace,pod) group_left(label_team) kube_pod_labels > 5 215 | for: 10m 216 | labels: 217 | severity: '2' 218 | annotations: 219 | description: 'Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value | printf "%.0f"}} 220 | times within the last hour' 221 | summary: Pod is restarting frequently 222 | - alert: PodFrequentlyRestarting 223 | expr: | 224 | increase(kube_pod_container_status_restarts_total[1h]) 225 | * on (namespace,pod) group_left(label_team) kube_pod_labels > 10 226 | for: 10m 227 | labels: 228 | severity: '3' 229 | annotations: 230 | description: 'Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value | printf "%.0f"}} 231 | times within the last hour' 232 | summary: Pod is restarting frequently 233 | 234 | - name: kubernetes-resources 235 | rules: 236 | - alert: KubeCPUOvercommit 237 | annotations: 238 | description: Cluster has overcommitted CPU resource requests for Pods and cannot 239 | tolerate node failure. 240 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit 241 | expr: | 242 | sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) 243 | / 244 | sum(node:node_num_cpu:sum) 245 | > 246 | (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) 247 | for: 5m 248 | labels: 249 | severity: '2' 250 | - alert: KubeMemOvercommit 251 | annotations: 252 | description: Cluster has overcommitted memory resource requests for Pods and cannot 253 | tolerate node failure. 
254 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit 255 | expr: | 256 | sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) 257 | / 258 | sum(node_memory_MemTotal) 259 | > 260 | (count(node:node_num_cpu:sum)-1) 261 | / 262 | count(node:node_num_cpu:sum) 263 | for: 5m 264 | labels: 265 | severity: '2' 266 | - alert: KubeCPUOvercommit 267 | annotations: 268 | description: Cluster has overcommitted CPU resource requests for Namespaces. 269 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit 270 | expr: | 271 | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) 272 | / 273 | sum(node:node_num_cpu:sum) 274 | > 1.5 275 | for: 5m 276 | labels: 277 | severity: '2' 278 | - alert: KubeMemOvercommit 279 | annotations: 280 | description: Cluster has overcommitted memory resource requests for Namespaces. 281 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit 282 | expr: | 283 | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) 284 | / 285 | sum(node_memory_MemTotal{job="node-exporter"}) 286 | > 1.5 287 | for: 5m 288 | labels: 289 | severity: '2' 290 | - alert: KubeQuotaExceeded 291 | annotations: 292 | description: 'Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value 293 | }}% of its {{ $labels.resource }} quota.' 294 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded 295 | expr: | 296 | 100 * kube_resourcequota{job="kube-state-metrics", type="used"} 297 | / ignoring(instance, job, type) 298 | (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) 299 | > 90 300 | for: 15m 301 | labels: 302 | severity: '2' 303 | - name: kubernetes-storage 304 | rules: 305 | - alert: KubePersistentVolumeUsageWarn10 306 | annotations: 307 | description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim 308 | }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value 309 | }}% free.' 310 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical 311 | expr: | 312 | 100 * 313 | (kubelet_volume_stats_available_bytes{job="kubelet"} 314 | / 315 | kubelet_volume_stats_capacity_bytes{job="kubelet"}) 316 | * on(namespace,persistentvolumeclaim) group_left(label_team) kube_persistentvolumeclaim_labels 317 | < 10 318 | for: 1m 319 | labels: 320 | severity: '2' 321 | - alert: KubePersistentVolumeUsageCritical5 322 | annotations: 323 | description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim 324 | }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value 325 | }}% free.' 
326 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical 327 | expr: | 328 | 100 * 329 | (kubelet_volume_stats_available_bytes{job="kubelet"} 330 | / 331 | kubelet_volume_stats_capacity_bytes{job="kubelet"}) 332 | * on(namespace,persistentvolumeclaim) group_left(label_team) kube_persistentvolumeclaim_labels 333 | < 5 334 | for: 1m 335 | labels: 336 | severity: '3' 337 | - alert: KubePersistentVolumeUsageCritical3 338 | annotations: 339 | description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim 340 | }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value 341 | }}% free.' 342 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical 343 | expr: | 344 | 100 * 345 | (kubelet_volume_stats_available_bytes{job="kubelet"} 346 | / 347 | kubelet_volume_stats_capacity_bytes{job="kubelet"}) 348 | * on(namespace,persistentvolumeclaim) group_left(label_team) kube_persistentvolumeclaim_labels 349 | < 3 350 | for: 1m 351 | labels: 352 | severity: '3' 353 | - alert: KubePersistentVolumeFullInFourDays 354 | annotations: 355 | description: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim 356 | }} in Namespace {{ $labels.namespace }} is expected to fill up within four 357 | days. Currently {{ $value }} bytes are available.' 358 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays 359 | expr: | 360 | (( 361 | kubelet_volume_stats_used_bytes{job="kubelet"} 362 | / 363 | kubelet_volume_stats_capacity_bytes{job="kubelet"} 364 | ) > 0.85 365 | and 366 | predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600)) 367 | * on(namespace,persistentvolumeclaim) group_left(label_team) kube_persistentvolumeclaim_labels 368 | < 0 369 | for: 5m 370 | labels: 371 | severity: '2' 372 | 373 | - name: kubernetes-system 374 | rules: 375 | - alert: KubeNodeNotReady 376 | annotations: 377 | description: '{{ $labels.node }} has been unready for more than an hour.' 378 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready 379 | expr: | 380 | kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 381 | for: 1h 382 | labels: 383 | severity: '2' 384 | - alert: KubeVersionMismatch 385 | annotations: 386 | description: 'There are {{ $value }} different versions of Kubernetes components 387 | running.' 388 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch 389 | expr: | 390 | count(count(kubernetes_build_info{job!="kube-dns", job!="apiserver"}) by (gitVersion)) > 1 391 | for: 1h 392 | labels: 393 | severity: '2' 394 | - alert: KubeClientErrors 395 | annotations: 396 | description: 'Kubernetes API server client "{{ $labels.job }}/{{ $labels.instance 397 | }}" is experiencing {{ printf "%0.0f" $value }}% errors.' 
398 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors 399 | expr: | 400 | (sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job) 401 | / 402 | sum(rate(rest_client_requests_total[5m])) by (instance, job)) 403 | * 100 > 1 404 | for: 15m 405 | labels: 406 | severity: '2' 407 | - alert: KubeClientErrors 408 | annotations: 409 | description: 'Kubernetes API server client "{{ $labels.job }}/{{ $labels.instance 410 | }}" is experiencing {{ printf "%0.0f" $value }} errors / second.' 411 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors 412 | expr: | 413 | sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 414 | for: 15m 415 | labels: 416 | severity: '2' 417 | - alert: KubeletTooManyPods 418 | annotations: 419 | description: 'Kubelet {{ $labels.node }} is running {{ $value }} Pods, close 420 | to the limit of 110.' 421 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods 422 | expr: | 423 | kubelet_running_pod_count{job="kubelet", node!~"gke-app-imf.*"} > 110 * 0.9 424 | for: 15m 425 | labels: 426 | severity: '2' 427 | - alert: KubeletTooManyPods 428 | annotations: 429 | description: 'Kubelet {{ $labels.node }} is running {{ $value }} Pods, close 430 | to the limit of 200.' 431 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods 432 | expr: | 433 | kubelet_running_pod_count{job="kubelet", node=~"gke-app-imf.*"} > 200 * 0.9 434 | for: 15m 435 | labels: 436 | severity: '2' 437 | - alert: KubeAPILatencyHigh 438 | annotations: 439 | description: 'The API server has a 99th percentile latency of {{ $value }} seconds 440 | for {{ $labels.verb }} {{ $labels.resource }}.' 441 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh 442 | expr: | 443 | cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 444 | for: 10m 445 | labels: 446 | severity: '2' 447 | - alert: KubeAPILatencyHigh 448 | annotations: 449 | description: 'The API server has a 99th percentile latency of {{ $value }} seconds 450 | for {{ $labels.verb }} {{ $labels.resource }}.' 451 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh 452 | expr: | 453 | cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 454 | for: 10m 455 | labels: 456 | severity: '3' 457 | - alert: KubeAPIErrorsHigh 458 | annotations: 459 | description: 'API server is returning errors for {{ $value }}% of requests.' 460 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh 461 | expr: | 462 | sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) 463 | / 464 | sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10 465 | for: 10m 466 | labels: 467 | severity: '3' 468 | - alert: KubeAPIErrorsHigh 469 | annotations: 470 | description: 'API server is returning errors for {{ $value }}% of requests.' 
471 | runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh 472 | expr: | 473 | sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) 474 | / 475 | sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5 476 | for: 10m 477 | labels: 478 | severity: '2' 479 | 480 | - name: alertmanager.rules 481 | rules: 482 | - alert: AlertmanagerConfigInconsistent 483 | annotations: 484 | description: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` 485 | are out of sync.' 486 | expr: | 487 | count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 488 | for: 5m 489 | labels: 490 | severity: '2' 491 | - alert: AlertmanagerFailedReload 492 | annotations: 493 | description: 'Reloading Alertmanagers configuration has failed for {{ $labels.namespace 494 | }}/{{ $labels.pod}}.' 495 | expr: | 496 | alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0 497 | for: 10m 498 | labels: 499 | severity: '3' 500 | 501 | - name: general.rules 502 | rules: 503 | # Majority of targets should be pods, but they don't have to be (for example nodes). 504 | # If targets are pods, then we try to route them to appropriate teams, other targets are routed to infra team 505 | - alert: TargetDown 506 | annotations: 507 | description: '{{ $value }}% of the {{ $labels.namespace }}/{{ $labels.job }} pods are down.' 508 | expr: | 509 | 100 * 510 | (count(up{pod!=""} * on(namespace,pod) group_left(label_team) kube_pod_labels == 0) by (namespace,job,label_team) 511 | / 512 | count(up{pod!=""} * on(namespace,pod) group_left(label_team) kube_pod_labels) by (namespace,job,label_team)) 513 | > 10 514 | for: 10m 515 | labels: 516 | severity: '2' 517 | - alert: TargetDown 518 | annotations: 519 | description: '{{ $value }}% of the {{ $labels.namespace}}/{{ $labels.job }} targets are down.' 520 | expr: 100 * (count(up{pod=""} == 0) BY (job, namespace) / count(up{pod=""}) BY (job, namespace)) > 10 521 | for: 10m 522 | labels: 523 | severity: '2' 524 | - alert: DeadMansSwitch 525 | annotations: 526 | description: This is a DeadMansSwitch meant to ensure that the entire alerting 527 | pipeline is functional. 
528 | expr: vector(1) 529 | labels: 530 | severity: '0' 531 | - alert: ProbeFailed 532 | annotations: 533 | description: 'Failed probe: {{ $labels.target }}' 534 | expr: probe_success == 0 535 | for: 1m 536 | labels: 537 | severity: '3' 538 | 539 | - name: prometheus.rules 540 | rules: 541 | - alert: PrometheusConfigReloadFailed 542 | annotations: 543 | description: 'Reloading Prometheus configuration has failed for {{$labels.namespace}}/{{$labels.pod}}' 544 | summary: Reloading Promehteus' configuration failed 545 | expr: | 546 | prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0 547 | for: 10m 548 | labels: 549 | severity: '1' 550 | - alert: PrometheusNotificationQueueRunningFull 551 | annotations: 552 | description: 'Prometheus alert notification queue is running full for {{$labels.namespace}}/{{ 553 | $labels.pod}}' 554 | summary: Prometheus alert notification queue is running full 555 | expr: | 556 | predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"} 557 | labels: 558 | severity: '1' 559 | - alert: PrometheusErrorSendingAlerts 560 | annotations: 561 | description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ 562 | $labels.pod}} to Alertmanager {{$labels.Alertmanager}}' 563 | summary: Errors while sending alert from Prometheus 564 | expr: | 565 | rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01 566 | for: 10m 567 | labels: 568 | severity: '2' 569 | - alert: PrometheusErrorSendingAlerts 570 | annotations: 571 | description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ 572 | $labels.pod}} to Alertmanager {{$labels.Alertmanager}}' 573 | summary: Errors while sending alerts from Prometheus 574 | expr: | 575 | rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03 576 | for: 10m 577 | labels: 578 | severity: '3' 579 | - alert: PrometheusNotConnectedToAlertmanagers 580 | annotations: 581 | description: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected 582 | to any Alertmanagers' 583 | summary: Prometheus is not connected to any Alertmanagers 584 | expr: | 585 | prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1 586 | for: 10m 587 | labels: 588 | severity: '1' 589 | - alert: PrometheusTSDBReloadsFailing 590 | annotations: 591 | description: '{{$labels.job}} at {{$labels.instance}} had {{$value | printf "%.0f" | humanize}} 592 | reload failures over the last four hours.' 593 | summary: Prometheus has issues reloading data blocks from disk 594 | expr: | 595 | increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0 596 | for: 12h 597 | labels: 598 | severity: '1' 599 | - alert: PrometheusTSDBCompactionsFailing 600 | annotations: 601 | description: '{{$labels.job}} at {{$labels.instance}} had {{$value | printf "%.0f" | humanize}} 602 | compaction failures over the last four hours.' 603 | summary: Prometheus has issues compacting sample blocks 604 | expr: | 605 | increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0 606 | for: 12h 607 | labels: 608 | severity: '1' 609 | - alert: PrometheusTSDBWALCorruptions 610 | annotations: 611 | description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead 612 | log (WAL).' 
613 | summary: Prometheus write-ahead log is corrupted 614 | expr: | 615 | tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0 616 | for: 4h 617 | labels: 618 | severity: '1' 619 | - alert: PrometheusNotIngestingSamples 620 | annotations: 621 | description: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isnt ingesting 622 | samples.' 623 | summary: Prometheus isn't ingesting samples 624 | expr: | 625 | rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0 626 | for: 10m 627 | labels: 628 | severity: '1' 629 | - alert: PrometheusTargetScrapesDuplicate 630 | annotations: 631 | description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected 632 | due to duplicate timestamps but different values' 633 | summary: Prometheus has many samples rejected 634 | expr: | 635 | increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0 636 | for: 10m 637 | labels: 638 | severity: '1' 639 | 640 | - name: prometheus-operator 641 | rules: 642 | - alert: PrometheusOperatorReconcileErrors 643 | annotations: 644 | description: 'Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace 645 | }} Namespace.' 646 | expr: | 647 | rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 648 | for: 10m 649 | labels: 650 | severity: '1' 651 | - alert: PrometheusOperatorNodeLookupErrors 652 | annotations: 653 | description: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.' 654 | expr: | 655 | rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1 656 | for: 10m 657 | labels: 658 | severity: '1' 659 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: alertmanager 5 | labels: 6 | monitoring: enabled 7 | spec: 8 | selector: 9 | matchLabels: 10 | alertmanager: main 11 | namespaceSelector: 12 | matchNames: 13 | - monitoring 14 | endpoints: 15 | - port: web 16 | interval: 30s 17 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: kube-apiserver 5 | labels: 6 | monitoring: enabled 7 | spec: 8 | jobLabel: component 9 | selector: 10 | matchLabels: 11 | component: apiserver 12 | provider: kubernetes 13 | namespaceSelector: 14 | matchNames: 15 | - default 16 | endpoints: 17 | - port: https 18 | interval: 30s 19 | scheme: https 20 | tlsConfig: 21 | caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 22 | serverName: kubernetes 23 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 24 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: kube-controller-manager 5 | labels: 6 | monitoring: enable 7 | spec: 8 | jobLabel: k8s-app 9 | 
endpoints: 10 | - port: http-metrics 11 | interval: 30s 12 | selector: 13 | matchLabels: 14 | k8s-app: kube-controller-manager 15 | namespaceSelector: 16 | matchNames: 17 | - kube-system 18 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: kube-scheduler 5 | labels: 6 | monitoring: enabled 7 | spec: 8 | jobLabel: k8s-app 9 | endpoints: 10 | - port: http-metrics 11 | interval: 30s 12 | selector: 13 | matchLabels: 14 | k8s-app: kube-scheduler 15 | namespaceSelector: 16 | matchNames: 17 | - kube-system 18 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: kube-state-metrics 5 | labels: 6 | monitoring: enabled 7 | spec: 8 | jobLabel: k8s-app 9 | selector: 10 | matchLabels: 11 | k8s-app: kube-state-metrics 12 | namespaceSelector: 13 | matchNames: 14 | - monitoring 15 | endpoints: 16 | - port: https-main 17 | scheme: https 18 | interval: 30s 19 | honorLabels: true 20 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 21 | tlsConfig: 22 | insecureSkipVerify: true 23 | - port: https-self 24 | scheme: https 25 | interval: 30s 26 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 27 | tlsConfig: 28 | insecureSkipVerify: true 29 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: kubelet 5 | labels: 6 | monitoring: enabled 7 | spec: 8 | jobLabel: k8s-app 9 | endpoints: 10 | - port: https-metrics 11 | scheme: https 12 | interval: 30s 13 | tlsConfig: 14 | insecureSkipVerify: true 15 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 16 | - port: https-metrics 17 | scheme: https 18 | path: /metrics/cadvisor 19 | interval: 30s 20 | honorLabels: true 21 | tlsConfig: 22 | insecureSkipVerify: true 23 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 24 | selector: 25 | matchLabels: 26 | k8s-app: kubelet 27 | namespaceSelector: 28 | matchNames: 29 | - kube-system 30 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: node-exporter 5 | labels: 6 | monitoring: enabled 7 | spec: 8 | jobLabel: k8s-app 9 | selector: 10 | matchLabels: 11 | k8s-app: node-exporter 12 | namespaceSelector: 13 | matchNames: 14 | - monitoring 15 | endpoints: 16 | - port: https 17 | scheme: https 18 | interval: 30s 19 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 20 | tlsConfig: 21 | insecureSkipVerify: true 22 | 
-------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: prometheus-operator 5 | labels: 6 | monitoring: enabled 7 | spec: 8 | endpoints: 9 | - port: http 10 | selector: 11 | matchLabels: 12 | k8s-app: prometheus-operator 13 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: prometheus 5 | labels: 6 | monitoring: enabled 7 | spec: 8 | selector: 9 | matchLabels: 10 | prometheus: k8s 11 | namespaceSelector: 12 | matchNames: 13 | - monitoring 14 | endpoints: 15 | - port: web 16 | interval: 30s 17 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: prometheus-k8s 5 | labels: 6 | prometheus: k8s 7 | spec: 8 | type: NodePort 9 | sessionAffinity: ClientIP 10 | ports: 11 | - name: http 12 | nodePort: 30900 13 | port: 9090 14 | targetPort: 9090 15 | selector: 16 | prometheus: k8s 17 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s-serviceaccount.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: prometheus-k8s 5 | -------------------------------------------------------------------------------- /kubernetes/prometheus/manifests/prometheus/prometheus-k8s.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: Prometheus 4 | metadata: 5 | name: k8s 6 | labels: 7 | prometheus: k8s 8 | spec: 9 | replicas: 1 10 | version: v2.7.1 11 | serviceAccountName: prometheus-k8s 12 | serviceMonitorSelector: 13 | matchLabels: 14 | monitoring: enabled 15 | ruleSelector: 16 | matchLabels: 17 | role: prometheus-rulefiles 18 | prometheus: k8s 19 | resources: 20 | requests: 21 | memory: 1Gi 22 | cpu: 100m 23 | securityContext: 24 | fsGroup: 2000 25 | runAsNonRoot: true 26 | runAsUser: 1000 27 | alerting: 28 | alertmanagers: 29 | - namespace: monitoring 30 | name: alertmanager-main 31 | port: web 32 | storage: 33 | volumeClaimTemplate: 34 | metadata: 35 | name: prometheus-k8s-db 36 | spec: 37 | accessModes: ["ReadWriteOnce"] 38 | storageClassName: "fast" 39 | resources: 40 | requests: 41 | storage: 10Gi 42 | --- 43 | kind: StorageClass 44 | apiVersion: storage.k8s.io/v1 45 | metadata: 46 | name: fast 47 | provisioner: kubernetes.io/gce-pd 48 | parameters: 49 | type: pd-ssd 50 | allowVolumeExpansion: true 51 | -------------------------------------------------------------------------------- /kubernetes/prometheus/teardown: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ -z "${KUBECONFIG}" ]; then 4 | export KUBECONFIG=~/.kube/config 5 | fi 6 | 7 | # 
CAUTION - NAMESPACE must match its value when deploy script was run. 8 | # Some resources are always deployed to the monitoring namespace. 9 | 10 | if [ -z "${NAMESPACE}" ]; then 11 | NAMESPACE=monitoring 12 | fi 13 | 14 | kctl() { 15 | kubectl --namespace "$NAMESPACE" "$@" 16 | } 17 | 18 | kctl delete -f manifests/node-exporter 19 | kctl delete -f manifests/kube-state-metrics 20 | kctl delete -f manifests/grafana 21 | find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \; 22 | kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml 23 | kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml 24 | kctl delete -f manifests/alertmanager 25 | 26 | # Hack: wait a bit to let the controller delete the deployed Prometheus server. 27 | sleep 5 28 | 29 | kctl delete -f manifests/prometheus-operator 30 | kctl delete svc/alertmanager-operated 31 | kctl delete svc/prometheus-operated 32 | 33 | kubectl --namespace=kube-system delete -f manifests/k8s/kubeadm/ 34 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.12.2 2 | git+https://github.com/jonashaag/bjoern.git@d2cfd7be26eba77d42aad683b375892cfedb04d7#egg=bjoern 3 | Flask-Redis==0.3.0 4 | prometheus-client==0.1.1 5 | -------------------------------------------------------------------------------- /runner.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import bjoern 3 | import logging 4 | import sys 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def run_bjoern(args): 10 | from app import app 11 | logger.info('Starting bjoern web server') 12 | bjoern.run(app, args.host, args.port) 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--host', dest='host', default='0.0.0.0') 18 | parser.add_argument('--port', dest='port', type=int, default=80) 19 | 20 | args = parser.parse_args() 21 | logging.basicConfig(stream=sys.stderr, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 22 | run_bjoern(args) 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | --------------------------------------------------------------------------------
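A minimal local-run sketch for the demo app (this is an editor-added example, not part of the repository; it assumes the Flask app reads its Redis connection string from a REDIS_URL environment variable and that a Redis instance is reachable on localhost):

    # bjoern builds from source and needs the libev development headers (e.g. apt-get install libev-dev)
    pip install -r requirements.txt
    # assumed local Redis endpoint; adjust host/port/db to your environment
    export REDIS_URL=redis://127.0.0.1:6379/0
    # runner.py defaults to 0.0.0.0:80; 8080 here is only an illustrative unprivileged port
    python runner.py --host 127.0.0.1 --port 8080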