88 |
89 | According to the [official documentation](https://helm.sh), **Helm is a package manager for Kubernetes** to
90 | package YAML files and distribute them in public and private repositories.
91 |
92 | Helm charts:
93 |
94 | * A bundle of YAML files
95 | * Create your own Helm charts with Helm
96 | * Push them to a Helm repository (or any other repository out there)
97 | * Download and use existing ones
98 |
99 | Helm has some interesting features that make it awesome, such as:
100 |
101 | * Sharing Helm Charts
102 | * Templating Engine
103 | * Same application across different environments
104 |
105 | ### Sharing Helm Charts
106 |
107 |
108 |
109 | There are already a lot of charts out there (e.g. database apps, monitoring apps)
110 |
111 | To search between those:
112 |
113 | * `helm search `
114 | * Search provides the ability to search for Helm charts in the various places
115 | they can be stored including the **[Artifact Hub](https://artifacthub.io)** and **repositories you have added**.
116 | * [Artifact Hub](https://artifacthub.io)
117 | * The [Artifact Hub](https://artifacthub.io) is a `CNCF` project to make discovering distributed cloud native
118 | artifacts easy. This includes Helm charts.
119 |
120 | * There are two types of registries:
121 | 1) Public Registries (e.g. [Artifact Hub](https://artifacthub.io))
122 | 2) Private Registries (Share in organization)
123 |
124 |
125 |
126 | ### Templating Engine
127 |
128 |
129 |
130 | Let's say we have a bunch of microservices. In our scenario:
131 |
132 | * The Deployment and Service configurations are almost the same!
133 | * Most values are the same!
134 | * Wouldn't it be nice if we could define one file for the Deployment and another for the Service and just replace the values?!
135 | * [x] Helm does that!
136 |
137 | ---
138 |
139 | 1) Define a common blueprint
140 | 2) Dynamic values are replaced by placeholders (this syntax: `{{ .Values.XXX }}`)
141 |
142 | * values.yaml
143 | ```yaml
144 | name: nginx-app
145 | container:
146 |   name: nginx-app-container
147 |   image: nginx
148 |   port: 80
149 | ```
150 | * Template YAML config
151 | ```yaml
152 | apiVersion: v1
153 | kind: Pod
154 | metadata:
155 |   name: {{ .Values.name }}
156 | spec:
157 |   containers:
158 |     - name: {{ .Values.container.name }}
159 |       image: {{ .Values.container.image }}
160 |       ports:
161 |         - containerPort: {{ .Values.container.port }}
162 | ```
163 | * Values are defined either in a `yaml` file or with the `--set` flag
164 | * Now instead of **many YAML files** we have **just 1 YAML file**
165 | * This is extremely powerful! e.g. for **CI/CD**
166 | * In your build you can replace the values on the fly
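
For instance, assuming the chart above lives in a local directory called `my-chart` (a hypothetical name), you could
render the templates or override single values like this:

```shell
# Render the templates locally, using the defaults from values.yaml
helm template my-release ./my-chart

# Override a single value on the fly, e.g. from a CI/CD pipeline
helm template my-release ./my-chart --set container.image=nginx:1.25
```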
167 |
168 |
173 |
174 | These Helm features (sharing Helm charts and the templating engine) are super helpful for CI/CD pipelines and
175 | DevOps best practices.
176 |
177 | Let's say you have 3 environments:
178 |
179 | * Development
180 | * Staging
181 | * Production
182 |
183 | Now you can manage all those three environments by one single Chart!
184 |
185 |
186 |
187 |
188 | ## Helm Chart Structure
189 |
190 |
191 |
192 | * Directory structure:
193 | ```text
194 | mychart
195 | |-- Chart.yaml
196 | |-- charts
197 | |-- templates
198 | | |-- NOTES.txt # Optional file
199 | | |-- _helpers.tpl # Optional file
200 | | |-- deployment.yaml
201 | | |-- ingress.yaml
202 | | `-- service.yaml
203 | `-- values.yaml
204 | ```
205 | * Top level `mychart` folder: Name of the chart
206 | * `Chart.yaml`: Meta information about the chart
207 | * `charts` folder: Chart dependencies
208 | * `templates` folder: The actual template files
209 | * `values.yaml`: Default values for the template files
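
If you want to experiment with this layout, Helm can scaffold it for you:

```shell
helm create mychart   # generates Chart.yaml, values.yaml, charts/ and templates/ with example manifests
```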
210 |
211 | #### README and NOTES.txt
212 |
213 | * At the top-level of the chart, it's a good idea to have a `README`
214 | * It will be viewable with e.g. `helm show readme bitnami/mongodb`
215 | * In the `templates/` directory, we can also have a `NOTES.txt` file
216 | * When the chart is installed (or upgraded), `NOTES.txt` is processed too (i.e. its `{{ ... }}` tags are evaluated)
217 | * It gets displayed after the **installation** or **upgrades**
218 | * It's a great place to generate messages to tell the user:
219 | * how to connect to the release they just deployed
220 | * any passwords or other things that were generated for them
221 |
222 | ---
223 |
224 | * `helm install <release-name> <chart>`
225 | * Template files will be filled with the values from `values.yaml`
226 |
227 |
228 |
229 | ## Value injection
230 |
231 |
232 |
233 | Let's say we have:
234 |
235 | * `values.yaml`
236 | ```yaml
237 | imageName: nginx
238 | port: 80
239 | version: 1.0.0
240 | ```
241 | * `my-values.yaml`
242 | ```yaml
243 | version: 2.0.0
244 | ```
245 | * Install the chart
246 | ```shell
247 | helm install <release-name> <chart> --values my-values.yaml
248 | ```
249 | * Result would be:
250 | ```yaml
251 | imageName: nginx # From `values.yaml`
252 | port: 80 # From `values.yaml`
253 | version: 2.0.0 # From `my-values.yaml` -- Override value
254 | ```
255 | * You can achieve the exact same goal without having `my-values.yaml` file:
256 | ```shell
257 | helm install <release-name> <chart> --set version=2.0.0
258 | ```
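
To double-check which values a release actually ended up with, you can ask Helm afterwards (`<release-name>` is whatever
name you passed to `helm install`):

```shell
helm get values <release-name>        # only the values you supplied (my-values.yaml / --set)
helm get values <release-name> --all  # the merged result, including the chart's defaults
```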
259 |
260 |
261 |
262 | ## Release Management
263 |
264 |
265 |
266 | ### Helm version 2 vs. 3
267 |
268 | * Helm 3 was released [November 13, 2019](https://helm.sh/blog/helm-3-released)
269 | * Charts remain compatible between Helm 2 and Helm 3
270 | * The CLI is very similar (with minor changes to some commands)
271 | * The main difference is that Helm 2 uses `tiller`, a server-side component
272 | * Helm 3 doesn't use `tiller` at all, making it simpler (yay!)
273 |
274 | * Helm version 2 comes in two parts:
275 | * CLIENT (Helm CLI)
276 | * SERVER (Tiller -- must already be deployed in the Kubernetes cluster)
277 | * The CLIENT (Helm CLI) sends requests to the SERVER (Tiller)
278 | * Tiller then stores a copy of the configuration
279 | * Keeping track of all chart executions
280 | ```text
281 | | Revision | Request |
282 | |----------|--------------------|
283 | | 1 | Installed Chart | -- `helm install `
284 | | 2 | Upgrade to v 1.0.0 | -- `helm upgrade `
285 | | 3 | Rolled back to 1 | -- `helm rollback `
286 | ```
287 |
288 | * Changes are applied to existing Deployment instead of creating a new one
289 | * Handling rollbacks
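
With Helm 3 you can inspect and use this revision history directly from the CLI, for example:

```shell
helm history <release-name>      # list all revisions of a release
helm rollback <release-name> 1   # roll back to revision 1
helm status <release-name>       # show the state of the current revision
```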
290 |
291 | ---
292 |
293 | ### Downside of Tiller
294 |
295 | * Tiller has too much power inside the K8s cluster
296 | * Create
297 | * Update
298 | * Delete
299 | * Security Issues
300 |
301 | ---
302 |
303 | ### Helm version 3
304 |
305 | * Helm solves the security concerns by removing Tiller in Helm 3
306 | * Removal of Tiller:
307 | * Replaces `client/server` with `client/library` architecture (Helm binary only)
308 | * Security is now on a per-user basis (delegated to Kubernetes cluster security)
309 | * Releases are now stored as in-cluster secrets and the release object metadata has changed
310 | * Releases are persisted on a release namespace basis and not in the Tiller namespace anymore
311 | * [And so many more!](https://helm.sh/docs/topics/v2_v3_migration)
312 |
313 | ---
314 |
315 | ### With or without Tiller
316 |
317 | With Helm 3:
318 |
319 | * The helm CLI communicates directly with the Kubernetes API
320 | * It creates resources (deployments, services...) with our credentials
321 |
322 | With Helm 2:
323 |
324 | * The helm CLI communicates with tiller, telling tiller what to do
325 | * Tiller then communicates with the Kubernetes API, using its own credentials
326 |
327 |
328 |
329 | ## Differences between charts and packages
330 |
331 |
332 |
333 | * A package (`deb`, `rpm`, etc.) contains binaries, libraries, etc.
334 | * A chart contains YAML manifests
335 | (the binaries, libraries, etc. are in the images referenced by the chart)
336 | * On most distributions, a package can only be installed **once**
337 | (installing another version replaces the installed one)
338 | * A chart can be installed **multiple times**
339 | * Each installation is called a **release**
340 | * This allows us to install e.g. 10 instances of MongoDB
341 | (with potentially different versions and configurations)
342 |
343 | ---
344 |
345 | Wait a minute ...
346 |
347 | But, on my Debian system, I have Python `2` and Python `3`.
348 |
349 | Also, I have multiple versions of the Postgres database engine!
350 |
351 | Yes!
352 |
353 | But they have different package names:
354 |
355 | * `python2.7`, `python3.8`
356 | * `postgresql-10`, `postgresql-11`
357 |
358 |
363 |
364 | * A repository (or `repo` in short) is a **collection of charts**
365 | * It's just a bunch of files
366 | (they can be hosted by a static HTTP server, or on a local directory)
367 | * We can add "repos" to Helm, giving them a nickname
368 | * The nickname is used when referring to charts on that repo
369 | (for instance, if we try to install `hello/world`, that means the chart `world` on the repo `hello`; and that repo `hello`
370 | might be something like `https://blahblah.hello.io/charts`)
371 |
372 | ### Managing repositories
373 |
374 |
375 |
376 | * Let's check what repositories we have, and add the `bitnami` repo (the `bitnami` repo, maintained by VMware, contains a
377 | set of useful charts)
378 |
379 | * List our repos:
380 | ```shell
381 | helm repo list
382 | ```
383 |
384 | * Add the `bitnami` repo:
385 | ```shell
386 | helm repo add bitnami https://charts.bitnami.com/bitnami
387 | ```
388 |
389 | * Adding a repo can take a few seconds (it downloads the list of charts from the repo).
390 |
391 |
396 |
397 | * We can search available charts with `helm search` command
398 | * We need to specify where to search (`hub` or `repo`)
399 | * `hub`: Search for charts in the Artifact Hub or your own hub instance
400 | * `repo`: Search repositories for a keyword in charts
401 | * Search all charts in the repo that we added earlier:
402 | ```shell
403 | helm search repo bitnami
404 | ```
405 | * Search for a specific chart (`mongodb` chart in our case)
406 | ```shell
407 | helm search repo bitnami/mongodb
408 | ```
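
To search the [Artifact Hub](https://artifacthub.io) instead of the repos you added locally, use `hub`:

```shell
helm search hub mongodb
```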
409 |
410 |
416 |
417 | * We can inspect a chart with `helm show`
418 | * Look at the README for `mongodb`
419 | ```shell
420 | helm show readme bitnami/mongodb
421 | ```
422 | * Look at the values and their defaults:
423 | ```shell
424 | helm show values bitnami/mongodb
425 | ```
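
A common workflow is to dump those default values into a file, edit only what you need, and pass the edited file back at
install time:

```shell
helm show values bitnami/mongodb > mongodb-values.yaml
```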
426 |
427 |
430 |
431 | # Demo: Install a Stateful App on K8s using Helm
432 |
433 | ## Overview
434 |
435 |
436 |
437 | In this demo project, we will deploy a MongoDB application with its visualizer, called MongoExpress.
438 |
439 | Then we install an Ingress Controller and create an Ingress Rule to reach our MongoExpress through the browser...
440 |
441 | 2 ways to deploy StatefulSet:
442 |
443 | 1) Create all the configuration files yourself
444 | * StatefulSet Config File
445 | * Other configuration files
446 | 2) Use bundle of those config files
447 | * Helm
448 |
449 |
450 |
451 | ## Add bitnami repo
452 |
453 |
454 |
455 | First, we need to add the [bitnami](https://github.com/bitnami/charts) repository to Helm...
456 |
457 | Bitnami: Popular applications, provided by [Bitnami](https://bitnami.com), ready to launch on Kubernetes
458 | using [Kubernetes Helm](https://github.com/helm/helm).
459 |
460 | * Add the bitnami repo
461 | ```shell
462 | helm repo add bitnami https://charts.bitnami.com/bitnami
463 | ```
464 | * Update the helm repos
465 | ```shell
466 | helm repo update
467 | ```
468 | * See all the charts in this repo
469 | ```shell
470 | helm search repo bitnami
471 | ```
472 | * See the MongoDB charts
473 | ```shell
474 | helm search repo bitnami/mongo
475 | ```
476 |
477 |
478 |
479 | ## Install MongoDB
480 |
481 |
482 |
483 | * See the [MongoDB chart page](https://github.com/bitnami/charts/tree/master/bitnami/mongodb)
484 |
485 | * Based on [the docs](https://github.com/bitnami/charts/tree/master/bitnami/mongodb), create a `mongo-values.yaml`:
486 | ```yaml
487 | architecture: replicaset # MongoDB's architecture (standalone or replicaset)
488 | replicaCount: 3 # Number of MongoDB nodes (only when architecture=replicaset)
489 | persistence:
490 |   storageClass: "aws-ebs" # PVC Storage Class for MongoDB data volume
491 | auth:
492 |   rootPassword: secret-key-pwd # MongoDB root password
493 | ```
494 | * Install the chart
495 | ```shell
496 | helm install mongodb bitnami/mongodb --values mongo-values.yaml
497 | ```
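
Once the install finishes, a quick way to verify what was created (names will vary with your release name) is:

```shell
helm ls            # the MongoDB release should show up here
kubectl get pods   # one pod per replica (the chart may also create an arbiter pod)
kubectl get pvc    # one PersistentVolumeClaim per data-bearing replica
```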
505 |
506 | ## Install MongoExpress
507 |
508 |
1425 |
1426 | So far, we have 2 YAML files: `deployment.yaml` and `service.yaml`.
1427 |
1428 | But notice the `deployment.yaml` file is not complete yet! Each deployment has its own environment variables, and we have to
1429 | add them to complete the chart. The problem is that we can't hardcode the `env` section like we used to, because each
1430 | deployment has a different set of environment variables! How can we achieve that?
1431 |
1432 | The answer is `range`!
1433 |
1434 | * Range:
1435 | * Provides a "for"-style loop
1436 | * To iterate through or "range over" a list
1437 | * Check the [Docs](https://helm.sh/docs/chart_template_guide/control_structures/#looping-with-the-range-action)
1438 |
1439 | ---
1440 |
1441 | * Add the env to the existing `deployment.yaml` file:
1442 | ```yaml
1443 | [...]
1444 | env:
1445 | {{- range .Values.containerEnvVars }}
1446 |   - name: {{ .key }}
1447 |     value: {{ .value | quote }}
1448 | {{- end }}
1449 | ```
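
For this loop to work, the values file has to provide a list named `containerEnvVars` whose items have `key` and `value`
fields. A minimal sketch (the variable names are just examples):

```yaml
containerEnvVars:
  - key: MONGODB_USERNAME
    value: admin
  - key: MONGODB_SERVER
    value: mongodb-service
```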
1450 |
1451 | What is this `|` symbol? --> It is a Pipeline.
1452 |
1453 | * Pipeline
1454 | * Same concept as UNIX
1455 | * Tool for chaining together template commands
1456 | * [Docs](https://helm.sh/docs/chart_template_guide/functions_and_pipelines/#pipelines)
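
A few illustrative pipelines (`quote`, `default` and `upper` are built-in template functions; `.Values.imageTag` is just
an example value):

```yaml
# Wrap the rendered value in quotes
value: {{ .Values.port | quote }}
# Fall back to "latest" if no tag is set, then upper-case it
tag: {{ .Values.imageTag | default "latest" | upper }}
```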
1457 |
1458 |
1563 |
1564 | # Helmfile
1565 |
1566 | Please review the Demo: Create Helm Chart for an Online Boutique before starting this section. This section is a
1567 | continuation of the previous one.
1568 |
1569 | ## What is a Helmfile?
1570 |
1571 |
1572 |
1573 | Helmfile is a **declarative** specification for deploying Helm charts that adds functionality to Helm.
1574 |
1575 | * Declarative: Define the **desired** state!
1576 | * Declare a definition of an entire K8s cluster
1577 | * Change specification depending on application or type of environment
1578 | * Example of a `helmfile.yaml`:
1579 | ```yaml
1580 | releases:
1581 |   - chart: emailservice
1582 |     name: boutique
1583 |     values:
1584 |       - ./values/emailservice.yaml
1585 | ```
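
With a `helmfile.yaml` in place, the whole desired state can be applied (or torn down) with a couple of commands:

```shell
helmfile sync      # install/upgrade every release declared in helmfile.yaml
helmfile diff      # preview what would change (requires the helm-diff plugin)
helmfile destroy   # remove all the declared releases
```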
1586 |
1587 |
1695 |
1696 | Now you have learned:
1697 |
1698 | 1) How to use one shared Helm chart for all microservices
1699 | 2) How to deploy declaratively using Helmfile
1700 |
1701 | There is only one final question: Where to host the Helm charts?
1702 |
1703 | ---
1704 |
1705 | Where to host the Helm charts?
1706 |
1707 | ✅ Host it in a Git Repository
1708 |
1709 | 2 Options:
1710 |
1711 | 1) **WITH** your application code
1712 | 2) **SEPARATE** Git Repository for Helm Charts
1713 |
1714 |
1716 |
--------------------------------------------------------------------------------
/Helm/images/helm-demo-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alifiroozi80/CKA/e095fd6549cd89e2472bdd633bca83af446f5b40/Helm/images/helm-demo-1.png
--------------------------------------------------------------------------------
/Helm/images/logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Ali
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Operators/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
118 |
119 | ## Extending the Kubernetes API
120 |
121 |
122 |
123 | There are multiple ways to extend the Kubernetes API.
124 |
125 | For instance:
126 |
127 | * Custom Resource Definitions (CRDs)
128 | * Aggregation Layer
129 | * etc.
130 |
131 | We are going to cover:
132 |
133 | * Custom Resource Definitions (CRDs)
134 |
135 |
136 |
137 | ## Revisiting the API server
138 |
139 |
140 |
141 | * The Kubernetes API server is a central point of the control plane (everything connects to it: `controller manager`,
142 | `scheduler`, `kubelet`)
143 | * Almost everything in Kubernetes is materialized by a resource
144 | * Resources have a type (or "kind")
145 | * We can see existing types with `kubectl api-resources`
146 | * We can list resources of a given type with `kubectl get <kind>`
147 |
148 |
149 |
150 |
151 |
152 | ## Creating new types
153 |
154 |
155 |
156 | * We can create new types with Custom Resource Definitions (CRDs)
157 | * CRDs are created **dynamically** (without recompiling or restarting the API server)
158 | * **CRDs themselves are resources**:
159 | * we can create a new type with `kubectl create` and some YAML
160 | * we can see all our custom types with `kubectl get crds`
161 | * After we create a CRD, the new type works just like built-in types
162 |
163 |
215 |
216 | ## Syntax of a CRD YAML file
217 |
218 |
219 |
220 | ### Validation
221 |
222 |
223 |
224 | * By default, a CRD is "free form" (we can put pretty much anything we want in it)
225 | * When creating a CRD, we should provide an
226 | `OpenAPI v3 schema` ([Example](https://github.com/amaizfinance/redis-operator/blob/master/deploy/crds/k8s_v1alpha1_redis_crd.yaml#L34))
227 | * The API server will then validate resources created/edited with this schema
228 | * If we need a stronger validation, we can use a Validating Admission Webhook:
229 | * Run
230 | an [admission webhook server](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#write-an-admission-webhook-server)
231 | to receive validation requests
232 | * Register the webhook by creating
233 | a [ValidatingWebhookConfiguration](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#configure-admission-webhooks-on-the-fly)
234 | * Each time the API server receives a request matching the configuration,
235 | * The request is sent to our server for validation
236 |
237 |
238 |
239 | ### Presentation
240 |
241 |
242 |
243 | * By default, `kubectl get mycustomresource` won't display much information (just the name and age of each resource)
244 | * When creating a CRD, we can specify additional columns to
245 | print ([Example](https://github.com/amaizfinance/redis-operator/blob/master/deploy/crds/k8s_v1alpha1_redis_crd.yaml#L6)
246 | , [Docs](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#additional-printer-columns))
247 | * By default, `kubectl describe mycustomresource` will also be generic
248 | * `kubectl describe` can show events related to our custom resources (for that, we need to create Event resources, and
249 | fill the `involvedObject` field)
250 | * For scalable resources, we can define a `scale` sub-resource
251 | * This will enable the use of `kubectl scale` and other scaling-related operations
252 |
253 |
254 |
255 | ### Versioning
256 |
257 |
258 |
259 | * As our operator evolves over time, we may have to change the CRD (add, remove, change fields)
260 | * Like every other resource in
261 | Kubernetes, [custom resources are versioned](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definition-versioning)
262 | * When creating a CRD, we need to specify a list of versions
263 | * Versions can be marked as `stored` and/or `served`
264 |
265 | #### Stored version
266 |
267 |
268 |
269 | * Exactly one version has to be marked as the `stored` version
270 | * As the name implies, it is the one that will be stored in etcd
271 | * Resources in storage are never converted automatically (we need to read and re-write them ourselves)
272 | * Yes, this means that we can have different versions in etcd at any time
273 | * Our code needs to handle all the versions that still exist in storage
274 |
275 |
276 |
277 | #### Served versions
278 |
279 |
280 |
281 | * By default, the Kubernetes API will serve resources "as-is" (using their stored version)
282 | * It will assume that all versions are compatible storage-wise (i.e. that the spec and fields are compatible between
283 | versions)
284 |
285 |
292 |
293 | * This YAML file defines a resource using the CRD that we just created
294 | ```yaml
295 | apiVersion: crd.example/v1
296 | kind: Coffee
297 | metadata:
298 |   name: arabica
299 | spec:
300 |   taste: strong
301 | ```
302 | * Apply it
303 | ```shell
304 | kubectl apply -f coffee.yaml   # assuming you saved the YAML above as coffee.yaml
305 | ```
306 | * View the coffee beans that we just created:
307 | ```shell
308 | kubectl get coffees
309 | ```
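
For reference, the CRD behind this example would look roughly like the following sketch (the exact schema is up to you):

```yaml
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: coffees.crd.example   # must be <plural>.<group>
spec:
  group: crd.example
  scope: Namespaced
  names:
    kind: Coffee
    singular: coffee
    plural: coffees
  versions:
    - name: v1
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          type: object
          properties:
            spec:
              type: object
              properties:
                taste:
                  type: string
```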
310 |
311 |
312 |
313 | ## What can we do with CRDs?
314 |
315 |
316 |
317 | There are many possibilities!
318 |
319 | * `Operators` encapsulate complex sets of resources (e.g.: a PostgreSQL replicated cluster; an etcd cluster...
320 | see [OperatorHub](https://operatorhub.io) to find more)
321 | * Custom use-cases: `gitkube`
322 | * Gitkube is a tool used to deploy applications on a Kubernetes cluster by simply running a `git push`. In other
323 | words, you can go from your source code to a deployed application on a Kubernetes cluster.
324 | * Creates a new custom type, `Remote`, exposing a `git`+`ssh` server
325 | * Deploy by pushing YAML or Helm charts to that remote
326 | * Replacing built-in types with CRDs (see
327 | this [lightning talk by Tim Hockin](https://www.youtube.com/watch?v=ji0FWzFwNhA))
328 |
329 |
330 |
331 | ## Some Notes on CRDs
332 |
333 |
334 |
335 | * When creating a CRD, we should pass an **OpenAPI v3 schema** (which will be used to validate resources)
336 | * Generally, when creating a CRD, we also want to run a **controller** (otherwise nothing will happen when we create
337 | resources of that type)
338 | * The controller will typically **watch** our custom resources (and **take action when they are created/updated**)
339 |
340 |
341 |
342 | ## (Ab)using the API server
343 |
344 |
345 |
346 | * If we need to store something "safely" (as `key:value` in `etcd`), we can use CRDs
347 | * This gives us primitives to read/write/list objects
348 | * The Kubernetes API server can run on its own (without the scheduler, controller manager, and kubelet)
349 | * By loading CRDs, we can have it manage totally different objects (unrelated to containers, clusters, etc.)
350 |
351 |
355 |
356 | # K8s Operator
357 |
358 | ## What are operators?
359 |
360 |
361 |
362 | According to [CoreOS](https://cloud.redhat.com/blog/introducing-operators-putting-operational-knowledge-into-software),
363 | an operator represents human operational knowledge in software to manage an application reliably.
364 |
365 | Remember that Operators are mainly used for **Stateful** or **complex** Applications.
366 |
367 | Examples:
368 |
369 | * Deploying and configuring replication with MySQL, PostgreSQL ...
370 | * Setting up Elasticsearch, Kafka, RabbitMQ, Zookeeper ...
371 | * Reacting to failures when intervention is needed
372 | * Scaling up and down these systems
373 |
374 | ### What are they made from?
375 |
376 |
377 |
378 | * Operators combine two things:
379 | * Custom Resource Definitions (CRD)
380 | * Controller code watching the corresponding resources and acting upon them
381 | * A given operator can define one or multiple CRDs
382 | * The controller code (control loop) typically runs within the cluster (running as a Deployment with 1 replica is a
383 | common scenario)
384 | * But it could also run elsewhere (nothing mandates that the code run on the cluster, as long as it has API access)
385 |
386 |
387 |
388 |
389 | ## Why use operators?
390 |
391 |
392 |
393 | * Kubernetes gives us Deployments, StatefulSets, Services ...
394 | * These mechanisms give us building blocks to deploy applications
395 | * They work great for services that are made of `N` identical containers (like stateless ones)
396 | * They also work great for some stateful applications like Consul, etcd ... (with the help of persistent volumes)
397 | * They're not enough for complex services:
398 | * where different containers have different roles
399 | * where extra steps have to be taken when scaling or replacing containers
400 |
401 | ### State(ful/less) Applications with/out Operator
402 |
403 |
404 |
405 | First, remember that Operators are mainly used for **Stateful** or **complex** Applications.
406 |
407 | So, we are going to first talk about:
408 |
409 | 1) Stateless Applications on K8s
410 | 2) Stateful Applications **without** Operator
411 | 3) Stateful Applications **with** Operator
412 |
413 | ---
414 |
415 | #### Stateless applications on K8s
416 |
417 | Stateless Application: A Stateless application is one which depends on **no persistent storage**. The only thing your
418 | cluster is responsible for is the code, and other static content, being hosted on it. That's it, no changing databases,
419 | no writes and no leftover files when the pod is deleted.
420 |
421 | So, it means when you deploy a stateless application:
422 |
423 | * No control is necessary after the app is deployed
424 | * Everything such as Updates or Scaling Down/Up is up to Kubernetes (Via **Control loop**)
425 | * Control loop: **OBSERVE**, **CHECK DIFFERENCE**, **TAKE ACTION**
426 |
427 | Kubernetes:
428 |
429 | * Recreates dead Pods
430 | * Restarts updated Pods
431 |
432 | ---
433 |
434 | #### Stateful Applications **without** Operator
435 |
436 | Stateful Application: Stateful applications **save data to persistent disk storage** for use by the server, by clients,
437 | and by other applications. An example of a stateful application is a database or key-value store to which data is saved
438 | and retrieved by other applications.
439 |
440 | Let's say we have a MySQL database with three replicas.
441 |
442 | * All three replicas are different
443 | * Each replica has its state and identity
444 | * Order is important
445 | * This process differs for each application (MySQL, PostgreSQL, etc.)
446 | * So, no standard solution
447 |
448 | Typically, that's why Stateful applications require:
449 |
450 | * Manual intervention
451 | * Humans who operate these applications
452 |
453 | However, in Kubernetes, this could be a problem because manually updating components is kind of against the Kubernetes
454 | concept, that is, Automation, Self-Healing, etc.
455 |
456 | That's why many decide to host Stateful applications outside the K8s cluster, but we need some applications inside the
457 | K8s cluster, such as Prometheus monitoring, ETCD, etc.
458 |
459 | So, how to manage Stateful applications?
460 |
461 | ✅Operator
462 |
463 | ---
464 |
465 | #### Stateful Applications **with** Operator
466 |
467 | Operator: An application-specific controller that extends the functionality of the Kubernetes API to create, configure,
468 | and **manage instances of complex applications on behalf of a Kubernetes user**.
469 |
470 | So, the **Operator replaces the human Operator with a software operator**
471 |
472 | * It means the Operator takes care of:
473 | * How to deploy the app?
474 | * How to create a cluster of replicas?
475 | * How to recover?
476 | * etc
477 |
478 | So, Tasks are **automated** and **reusable**
479 |
480 | This will end up having only one standard automated process.
481 |
482 | It means: more complex/environments => more benefits
483 |
484 | ##### How does this work?
485 |
486 | Under the hood, it uses the:
487 |
488 | * Control loop mechanism
489 | * Control loop: **OBSERVE**, **CHECK DIFFERENCE**, **TAKE ACTION**
490 | * CRD
491 | * CRD: Custom Resource Definition
492 | * Custom K8s component (extends K8s API)
493 | * Domain/app-specific knowledge
494 | * Automates entire lifecycle of the app it operates
495 |
496 | ---
497 |
498 | #### Summary
499 |
500 | * Kubernetes manages the complete lifecycle of **Stateless** apps
501 | * No business logic is necessary to:
502 | * Create
503 | * Update
504 | * Delete
505 |
506 |
507 | * K8s can't automate the processes natively for Stateful apps
508 | * Operators
509 | * Prometheus-Operator
510 | * MySQL-Operator
511 | * PostgreSQL-Operator
512 |
513 |
518 |
519 | * Systems with primary/secondary replication, Examples: MariaDB, MySQL, PostgreSQL, Redis ...
520 | * Systems where different groups of nodes have different roles, Examples: ElasticSearch, MongoDB ...
521 | * Systems with complex dependencies (that are themselves managed with operators), Examples: Flink or Kafka, which both
522 | depend on Zookeeper
523 | * Representing and managing external resources (Example: [AWS Service Operator](https://operatorhub.io/?keyword=AWS))
524 | * Managing complex cluster add-ons (Example: [Istio operator](https://operatorhub.io/operator/istio))
525 | * Deploying and managing our applications' lifecycles (more on that later)
526 |
527 |
528 |
529 |
530 | ## How operators work
531 |
532 |
533 |
534 | * An operator creates one or more CRDs (i.e., it creates new "Kinds" of resources on our cluster)
535 | * The operator also runs a controller that will watch its resources
536 | * Each time we create/update/delete a resource, the controller is notified
537 |
538 |
547 |
548 | * We will set up a Prometheus in the K8s cluster in this Demo.
549 | * You can deploy pretty much anything you like, but I chose to deploy Prometheus because we have to go through this in
550 | our next lesson (Monitor K8s cluster with Prometheus)
551 |
552 | * What is Prometheus?
553 | Prometheus is an open-source technology that provides monitoring and alerting functionality for cloud-native
554 | environments, including Kubernetes. It can collect and store metrics as time-series data, recording information with a
555 | timestamp. It can also collect and record labels, which are optional key-value pairs.
556 | * Is Prometheus a DevOps tool?
557 | Prometheus is a tool that every DevOps professional should be familiar with.
558 |
559 | ---
560 |
561 | For deploying it, we have three options:
562 |
563 | 1) Create all configuration YAML files by ourselves and execute them in the proper order
564 | * Inefficient ❌
565 | * Lot of effort ❌
566 | 2) Using an Operator
567 | * Manages the combination of all components as one unit ✅
568 | 3) Using Helm chart to deploy Operator
569 | * Most efficient ✅✅
570 | * Maintained by Helm community
571 | * Helm: Initial Setup
572 | * Operator: Manage Setup
573 |
574 | ---
575 |
576 | ### Prometheus Architecture
577 |
578 | This is not the place to discuss Prometheus in depth. We cover Prometheus in detail in our next lesson, but for the demo's
579 | sake, we are going through it very briefly.
580 |
581 | Prometheus has three main components, and we have to set up these three.
582 |
583 | * Prometheus Server: processes and stores metrics data
584 | * Alertmanager: sends alerts
585 | * A UI to visualize the scraped data (e.g. Grafana)
586 |
587 |
588 |
589 |
590 |
591 | ### Setup with Helm Chart
592 |
593 |
594 |
595 | **Note:** If you are unfamiliar with Helm, I have a [Helm tutorial](https://github.com/alifiroozi80/CKA/tree/main/Helm)
596 | about it.
597 |
598 | Be sure to check it out, but essentially Helm is a package manager for Kubernetes.
599 |
600 | With that in mind, let's jump into the demo.
601 |
602 | We will use the [Prometheus Chart](https://github.com/prometheus-community/helm-charts)
603 |
604 | * Add the repo
605 | ```shell
606 | helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
607 | ```
608 | * Install Prometheus
609 | ```shell
610 | helm install prometheus prometheus-community/kube-prometheus-stack
611 | ```
612 |
613 | And that's it,
614 | We are done with Helm in this section!
615 |
616 |
617 |
618 | ### Understand the created Prometheus components
619 |
620 |
621 |
622 | And that's it!
623 | Now we have a fully managed Prometheus server.
624 |
625 | If you check your current cluster state with `kubectl get all` you'll see a bunch of `Deployments`, `DaemonSets`, etc.
626 |
627 | They are all created and managed by Prometheus Operator!
628 |
629 | Now you'll hopefully see the benefits of Operators in the K8s cluster.
630 |
631 | **Note:** What are these Components?
632 |
633 | This is not the place to talk about Prometheus; we will deep dive into Prometheus in our following lessons. Our goal
634 | here is only to show the benefit of Operators, not to learn Prometheus.
635 |
636 |
637 |
638 | ### Access Grafana
639 |
640 |
641 |
642 | We want to access our newly created Grafana dashboard.
643 |
644 | For that, we just quickly use the `kubectl port-forward` command.
645 |
646 | Using Kubectl port forward lets you **quickly access your Kubernetes clusters directly from your local computer**.
647 |
648 | ```shell
649 | kubectl port-forward deployment/prometheus-grafana 3000
650 | ```
651 |
652 | ---
653 |
654 | Head over to [localhost:3000](http://127.0.0.1:3000), and now you should see the Grafana dashboard.
655 | But first, we have to log in to it.
656 |
657 | We will have to use a default username and password. However, we can create new users and update passwords later. Use
658 | the default credentials for now:
659 |
660 | * username: admin
661 | * password: prom-operator
662 |
663 |
683 |
684 | * We will set up a Splunk Enterprise application in our cluster for our second Demo.
685 | * Here we don't use helm to manage our YAML files. We download and apply them by ourselves.
686 | * What is Splunk?
687 |
688 | Splunk Enterprise collects data from any source, including metrics, logs, clickstreams, sensors, network traffic,
689 | web servers, custom applications, hypervisors, containers, social media, and cloud
690 | services. ([Splunk Enterprise](https://www.splunk.com/en_us/products/splunk-enterprise.html))
691 | * Is Splunk used in DevOps?
692 |
693 | **Log and application lifecycle analytics:** Splunk is a leading log management tool ideal for DevOps.
694 | * But, here is not the place to talk about Splunk. There are tons of tutorials on that. Here, we deploy it in our
695 | cluster to see and interact with Operators in action.
696 |
697 |
721 |
722 | The `Standalone` **custom resource** is used to create a single instance deployment of Splunk Enterprise.
723 |
724 | * Run this command to create a deployment named `s1`:
725 | ```shell
726 | cat <
744 |
745 | ### Access Splunk Dashboard
746 |
747 |
748 |
749 | We use the `kubectl port-forward` to open port `8000` for Splunk Web access:
750 |
751 | ```shell
752 | kubectl port-forward -n splunk-operator splunk-s1-standalone-0 8000
753 | ```
754 |
755 | ---
756 |
757 | The first time you log in, you should use the default username and password that were created.
758 |
759 | * Username: `admin`
760 | * Password:
761 |
762 | 1) Get the encoded password value from the Secret's YAML
763 | ```shell
764 | kubectl get secrets splunk-s1-standalone-secret-v1 -n splunk-operator -o yaml | grep password
765 | ```
766 | 2) Decode the value
767 | ```shell
768 | echo 'VALUE-FROM-ABOVE-COMMAND' | base64 --decode | tr -d "\n"
769 | ```
770 |
771 |
772 |
773 |
774 | ## What does it take to write an operator?
775 |
776 |
777 |
778 | * Writing a quick-and-dirty operator, or a POC/MVP, is easy
779 | * Writing a robust operator is hard
780 | * We will describe the general idea
781 | * We will list a few tools that can help us
782 |
783 | ### Top-down vs. bottom-up
784 |
785 |
786 |
787 | * Both approaches are possible
788 | * Let's see what they entail, and their respective pros and cons
789 |
790 | #### Top-down approach
791 |
792 |
793 |
794 | * Start with **high-level design** (see next section)
795 | * Pros:
796 | * Can yield cleaner design that will be more robust
797 | * Cons:
798 | * Must be able to anticipate all the events that might happen
799 | * Design will be better only to the extent of what we anticipated
800 | * Hard to anticipate if we don't have production experience
801 |
802 | ---
803 |
804 | #### High-level design
805 |
806 | * What are we solving? (e.g.: geographic databases backed by PostGIS with Redis caches)
807 | * What are our use-cases, stories? (e.g.: adding/resizing caches and read replicas; load balancing queries)
808 | * What kind of outage do we want to address? (e.g.: loss of individual node, pod, volume)
809 | * What are our non-features, the things we don't want to address? (e.g.: loss of datacenter/zone; differentiating
810 | between read and write queries; cache invalidation; upgrading to newer major versions of Redis, PostGIS, PostgreSQL)
811 |
812 | #### Low-level design
813 |
814 | * What Custom Resource Definitions do we need? (one, many?)
815 | * How will we store configuration information? (part of the CRD spec fields, annotations, other?)
816 | * Do we need to store state? If so, where?
817 | * State that is small and doesn't change much can be stored via the Kubernetes API (e.g.: leader information,
818 | configuration, credentials)
819 | * Things that are big and/or change a lot should go elsewhere (e.g.: metrics, bigger configuration file like GeoIP)
820 |
821 | #### What can we store via the Kubernetes API?
822 |
823 | * The API server stores most Kubernetes resources in etcd
824 | * Etcd is designed for **reliability**, not for **performance**
825 | * If our storage needs exceed what etcd can offer, we need to use something else:
826 | * Either directly
827 | * Or by extending the API server (for instance by using the aggregation layer,
828 | like [metrics server](https://github.com/kubernetes-sigs/metrics-server) does)
829 |
830 |
831 |
832 | #### Bottom-up approach
833 |
834 |
835 |
836 | * Start with existing Kubernetes resources (`Deployment`, `statefulSet`, etc.)
837 | * Run the system in production
838 | * Add scripts, automation, to facilitate day-to-day operations
839 | * **Turn the scripts into an operator**
840 | * Pros: simpler to get started; reflects actual use-cases
841 | * Cons: can result in convoluted designs requiring extensive refactor
842 |
843 |
844 |
845 |
846 | ### General idea
847 |
848 |
849 |
850 | * Our operator will watch its CRDs and associated resources
851 | * Drawing state diagrams and finite state automata helps a lot
852 | * It's OK if some transitions lead to a big catch-all "human intervention"
853 | * Over time, we will learn about new failure modes and add to these diagrams
854 | * It's OK to start with CRD creation / deletion and prevent any modification (that's the easy POC/MVP we were talking
855 | about)
856 | * `Presentation` and `validation` will help our users (see the Presentation and Validation sections above)
857 |
858 |
879 |
880 | * Remember that the operator itself must be resilient (e.g.: the node running it can fail)
881 | * Our operator must be able to restart and recover gracefully
882 | * Do not store state locally (unless we can reconstruct that state when we restart)
883 | * As indicated earlier, we can use the Kubernetes API to store data:
884 | * In the custom resources themselves
885 | * In other resources' annotations
886 |
887 |
888 |
889 | ## Beyond CRDs
890 |
891 |
892 |
893 | * CRDs cannot use custom storage (e.g. for time series data)
894 | * CRDs cannot support arbitrary sub-resources (like logs or exec for Pods)
895 | * CRDs cannot support protobuf (for faster, more efficient communication)
896 | * If we need these things, we can use
897 | the [aggregation layer](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/apiserver-aggregation)
898 | instead
899 | * The aggregation layer proxies all requests below a specific path to another server (this is used e.g. by the metrics
900 | server)
901 | * [This documentation page](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/#choosing-a-method-for-adding-custom-resources)
902 | compares the features of CRDs and API aggregation
903 |
904 |
133 |
134 | ---
135 |
136 | # Introduction to Prometheus
137 |
138 | ## What is Prometheus
139 |
140 |
141 |
142 | According to [docs](https://prometheus.io/docs/introduction/overview):
143 |
144 | * Prometheus is an open-source monitoring system including:
145 | * Multiple service discovery backends to figure out which metrics to collect
146 | * A scraper to collect these metrics
147 | * An efficient time series database to store these metrics
148 | * A specific query language (PromQL) to query these time series
149 | * An alert manager to notify us according to metrics values or trends
150 |
151 | ---
152 |
153 | ### Why use Prometheus?
154 |
155 | Today DevOps is a complex process. Therefore, we need more automation!
156 |
157 | Let's say we have a couple of servers and a bunch of containers on them, and those containers talk to each other.
158 |
159 | Now imagine your application suddenly is down!
160 |
161 | * Error?
162 | * Overloaded?
163 | * Enough resources?
164 | * Response latency?
165 |
166 | You want to know if this issue is
167 |
168 | * On Hardware
169 | * On Application
170 |
171 | How do you know what went wrong?
172 |
173 | * Backend running?
174 | * Any exception?
175 | * Auth-Service running?
176 | * Why did the Auth-Service crash?
177 |
178 | ---
179 |
180 | ### Use cases for using Prometheus monitoring
181 |
182 | * Constantly monitor all the services
183 | * Alert when something crashes
184 | * Identify problems early, before they escalate
185 |
186 |
191 |
192 |
193 |
194 | ---
195 |
196 | * Main component: **Prometheus Server**
197 | * Does the actual monitoring work
198 | * It has 3 components inside it
199 |
200 | 1) HTTP Server (Accepts PromQL queries from the Prometheus web UI, Grafana, etc.)
201 | 2) Storage (Stores metrics data (more on that later) in a Time-Series database)
202 | 3) Retrieval (Pulls metrics data from Applications, Servers, Services, etc.)
203 |
204 | ---
205 |
206 | * **What** does Prometheus monitor?
207 | * Linux/Windows Servers
208 | * Single Application
209 | * Services like Database
210 |
211 | * **Which units** are monitored of those targets?
212 | * CPU State
213 | * Memory/Disk Space Usage
214 | * Request Count
215 | * Exception Count
216 | * Request Duration
217 |
218 | ---
219 |
220 | ### Metrics
221 |
222 | * Format: **Human-Readable** text-based
223 | * Metrics entries: **TYPE** and **HELP** attributes
224 | * **Help**
225 | * Description of what the metric is
226 | * **TYPE**: 3 metrics types
227 | * **Counter**: How many times did X happen?
228 | * **Gauge**: What is the current value of X now?
229 | * **Histogram**: How long or How big?
230 | * Example of a metric
231 | ```text
232 | # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
233 | # TYPE go_gc_duration_seconds summary
234 | go_gc_duration_seconds{quantile="0"} 5.2344e-05
235 | go_gc_duration_seconds{quantile="0.25"} 6.0619e-05
236 | go_gc_duration_seconds{quantile="0.5"} 6.6437e-05
237 | go_gc_duration_seconds{quantile="0.75"} 8.0573e-05
238 | go_gc_duration_seconds{quantile="1"} 0.00264025
239 | go_gc_duration_seconds_sum 766.689340367
240 | go_gc_duration_seconds_count 1.0138386e+07
241 | # HELP go_goroutines Number of goroutines that currently exist.
242 | # TYPE go_goroutines gauge
243 | go_goroutines 15
244 | ```
245 |
246 | ### What kind of metrics can we collect?
247 |
248 | * Node metrics (related to physical or virtual machines)
249 | * Container metrics (resource usage per container)
250 | * Databases, message queues, load balancers, etc. (check out
251 | this [list of exporters](https://prometheus.io/docs/instrumenting/exporters)!)
252 | * Instrumentation (=deluxe `print` for our code)
253 | * Business metrics (customers served, revenue, ...)
254 |
255 | * Node Metrics
256 | * CPU, RAM, disk usage on the whole node
257 | * Total number of processes running, and their states
258 | * Number of open files, sockets, and their states
259 | * I/O activity (disk, network), per operation or volume
260 | * Physical/hardware (when applicable): temperature, fan speed...
261 | * ...and much more!
262 | * Container Metrics
263 | * Similar to node metrics, but not totally identical
264 | * RAM breakdown will be different
265 | * active vs inactive memory
266 | * some memory is shared between containers, and specially accounted for
267 | * I/O activity is also harder to track
268 | * async writes can cause deferred "charges"
269 | * some page-ins are also shared between containers
270 | * Application Metrics
271 | * Arbitrary metrics related to your application and business
272 | * System performance: request latency, error rate...
273 | * Volume information: number of rows in database, message queue size...
274 | * Business data: inventory, items sold, revenue...
275 |
276 | ### Collecting Metrics data from Targets
277 |
278 | * The Prometheus server will scrape URLs like `HOSTADDRESS/metrics` at regular intervals (by default: every minute; can
279 | be more/less frequent)
280 | * The list of URLs to scrape (the scrape targets) is defined in configuration
281 | * Pulls from HTTP endpoint
282 | * Must be in the correct format
283 |
284 | ---
285 |
286 | ### Target endpoints and exporters
287 |
288 | * Some applications expose a `/metrics` endpoint **by default**
289 | * Many services need another component (an **Exporter**)
290 |
291 | e.g., you want to monitor a Linux server?
292 |
293 | * Download the Node Exporter (see the sketch below)
294 | * Untar and execute it
295 | * It converts the server's metrics into the Prometheus format
296 | * It exposes a `/metrics` endpoint
297 | * Configure Prometheus to scrape this endpoint
298 | * **NOTE:** Exporters are available as **Docker images** too!
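
A minimal sketch of those steps (the version and download URL are illustrative; check the Node Exporter releases page for
the current one):

```shell
# Download and unpack the Node Exporter
wget https://github.com/prometheus/node_exporter/releases/download/v1.8.1/node_exporter-1.8.1.linux-amd64.tar.gz
tar xvfz node_exporter-1.8.1.linux-amd64.tar.gz

# Run it; metrics are now served on http://localhost:9100/metrics
./node_exporter-1.8.1.linux-amd64/node_exporter
```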
299 |
300 |
301 |
302 | ## Pull Mechanism
303 |
304 |
305 |
306 | * Prometheus uses a **Pull** mechanism to scrape the data from applications (instead of the **Push** mechanism that others like
307 | Amazon CloudWatch use)
308 | * Multiple Prometheus instances can pull metrics data
309 | * Better detection/insight into whether a service is up & running
310 | * Push system of other monitoring systems, e.g. Amazon CloudWatch:
311 | * Applications/Servers **push** to a centralized collection platform
312 | * High load of network traffic
313 | * Monitoring can become your bottleneck
314 | * Need to install additional software or tools to push metrics
315 |
316 | ---
317 |
318 | #### Pushgateway
319 |
320 | What if our target only **runs for a short time?**
321 |
322 | * The Prometheus Pushgateway allows you to push time series from short-lived service-level batch jobs to an intermediary
323 | job which Prometheus can scrape.
324 | * In other words: The Pushgateway is an intermediary service which allows you to push metrics from jobs which cannot be
325 | scraped.
326 | * [WHEN TO USE THE PUSHGATEWAY](https://prometheus.io/docs/practices/pushing/#when-to-use-the-pushgateway)
327 |
328 |
329 |
330 | ## Configuring Prometheus
331 |
332 |
333 |
334 | * How does Prometheus know what to scrape and when?
335 | * [x] With a simple YAML file!
336 |
337 | * In that YAML file, we define **which targets?** and **at what interval?**
338 | * Prometheus uses **Service Discovery** to find those target endpoints
339 | * Here is an example:
340 | ```yaml
341 | # How often Prometheus will scrape its targets
342 | global:
343 |   scrape_interval: 15s
344 |   evaluation_interval: 15s
345 |
346 | # Rules for aggregating metric values or creating alerts when a condition is met
347 | rule_files:
348 |   # - "first.rules"
349 |   # - "second.rules"
350 |
351 | # What resources Prometheus monitors
352 | scrape_configs:
353 |   - job_name: prometheus # Prometheus has its own /metrics endpoint
354 |     static_configs:
355 |       - targets: [ 'localhost:9090' ]
356 |
357 |   # Define your own jobs
358 |   - job_name: node_exporter
359 |     metrics_path: "/metrics" # Default value for each job
360 |     scheme: "http" # Default value for each job
361 |     scrape_interval: 1m
362 |     scrape_timeout: 1m
363 |     static_configs:
364 |       - targets: [ 'localhost:9100' ]
365 | ```
366 |
367 |
368 |
369 | ## Alert Manager
370 |
371 |
372 |
373 | * **How** does Prometheus trigger the alerts?
374 | * **Who** receives the alerts?
375 |
376 | Prometheus has a component called **Alertmanager** that is responsible for reading the alert rules from the config file.
377 |
378 | The [Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager) handles alerts sent by client applications
379 | such as the Prometheus server. It takes care of deduplicating, grouping, and routing them to the correct receiver
380 | integration such as email, PagerDuty, or OpsGenie. It also takes care of silencing and inhibition of alerts.
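
For example, a minimal alerting rule (loaded through `rule_files` in the Prometheus configuration) could look like the
sketch below; when it fires, Prometheus sends the alert to the Alertmanager, which routes it to a receiver such as email
or PagerDuty:

```yaml
groups:
  - name: example
    rules:
      - alert: InstanceDown
        expr: up == 0   # the target failed its last scrapes
        for: 5m         # the condition must hold for 5 minutes before the alert fires
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} is down"
```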
381 |
382 |
383 |
384 | ## Data Storage
385 |
386 |
387 |
388 | * **Where** does Prometheus store the data?
389 | * [x] [Disk usage](https://prometheus.io/docs/prometheus/latest/storage)
390 |
391 | Prometheus stores its on-disk time series data under the directory specified by the flag `--storage.tsdb.path`. The
392 | default path is `./data` (relative to the working directory), which is good to try something out quickly but most likely
393 | not what you want for actual operations.
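
For example, when running the Prometheus binary yourself, you would typically point it at an explicit data directory:

```shell
./prometheus \
  --config.file=prometheus.yml \
  --storage.tsdb.path=/var/lib/prometheus/data
```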
394 |
395 | So, once you collect the metrics, Prometheus allows you to query those data by accepting PromQL queries
396 |
397 |
398 |
399 | ## PromQL Query Language
400 |
401 |
402 |
403 | You can:
404 |
405 | * Query targets directly through `Prometheus Web UI`
406 | * Or use more powerful visualization tools e.g. `Grafana`
407 |
408 | They both use `PromQL` to get the data out of Prometheus
409 |
410 | Example Queries
411 |
412 | * `http_requests_total{status!~"4.."}`
413 | * Query all HTTP status codes except `4xx` ones
414 | * `rate(http_requests_total[5m])[30m:]`
415 | * Returns the `5min` rate of `http_requests_total` metric for the past `30mins`
416 | * `sum by (instance) (irate(container_cpu_usage_seconds_total{pod_name=~"xxx.*"}[5m]))`
417 | * Returns the cumulated CPU usage of `xxx` pods for each node
418 |
419 | ---
420 |
421 | * We won't learn PromQL in this repo
422 | * We are going to cover the basics to get an idea of what is possible, though
423 | * We are going to break down one of the queries above (building it one step at a time)
424 | * We'll learn more about PromQL in the Prometheus Alertmanager section
425 |
426 | ### Step 1 - Graphing one metric across all tags
427 |
428 | * This query will show us CPU usage across all containers: `container_cpu_usage_seconds_total`
429 | * The suffix of the metrics name tells us:
430 | * The unit (seconds of CPU)
431 | * That it's the total used since the container creation
432 | * Since it's a "total," it is an increasing quantity (we need to compute the derivative if we want e.g. CPU % over time)
433 | * We see that the metrics retrieved have `tags` attached to them
434 |
435 | ### Step 2 - Selecting metrics with tags
436 |
437 | * This query will show us only metrics for `xxx` containers: `container_cpu_usage_seconds_total{pod_name=~"xxx.*"}`
438 | * The `=~` operator allows regex matching
439 | * We select all the pods with a name starting with `xxx` (it would be better to use labels to select pods; more on that
440 | later)
441 | * The result is a smaller set of containers
442 |
443 | ### Step 3 - Transforming counters in rates
444 |
445 | * This query will show us CPU usage % instead of total seconds
446 | used: `100*irate(container_cpu_usage_seconds_total{pod_name=~"xxx.*"}[5m])`
447 | * The [irate](https://prometheus.io/docs/prometheus/latest/querying/functions/#irate) operator computes the "per-second
448 | instant rate of increase"
449 | * `rate` is similar but allows decreasing counters and negative values
450 | * With `irate`, if a counter goes back to zero, we don't get a negative spike
451 | * The `[5m]` tells how far to look back if there is a gap in the data
452 | * And we multiply with `100*` to get CPU % usage
453 |
454 | ### Step 4 - Aggregation operators
455 |
456 | * This query sums the CPU usage per node:
457 | ```shell
458 | sum by (instance) (
459 | irate(container_cpu_usage_seconds_total{pod_name=~"xxx.*"}[5m])
460 | )
461 | ```
462 | * `instance` corresponds to the node on which the container is running
463 | * `sum by (instance) (...)` computes the sum for each instance
464 | * Note: all the other tags are collapsed (in other words, the resulting graph only shows the `instance` tag)
465 | * PromQL supports many
466 | more [aggregation operators](https://prometheus.io/docs/prometheus/latest/querying/operators/#aggregation-operators)
467 |
468 |
469 |
470 | ## Wrap Up
471 |
472 |
473 |
474 | ### Prometheus Characteristics
475 |
476 | * Reliable
477 | * Stand-alone and Self-containing
478 | * Works even if other parts of the infrastructure are broken
479 | * No extensive set-up needed
480 | * Less complex
481 |
482 | ---
483 |
484 | ### Scale Prometheus using Prometheus Federation
485 |
486 | * Scalable cloud apps need monitoring that scales with them
487 | * [Prometheus Federation](https://logz.io/blog/devops/prometheus-architecture-at-scale)
488 | allows a Prometheus server to **scrape data from other Prometheus servers**
489 |
490 |
491 |
492 | ---
493 |
494 | ### Prometheus with Docker and Kubernetes
495 |
496 | * Fully compatible
497 | * Prometheus components are available as Docker images
498 | * Can easily be deployed in Container Environments like Kubernetes
499 | * **Monitoring of K8s cluster Node Resources out-of-the box!**
500 |
501 |
509 |
510 | For deploying it, we have three options:
511 |
512 | 1) Create all configuration YAML files by ourselves and execute them in the proper order
513 | * Inefficient ❌
514 | * Lot of effort ❌
515 | 2) Using an Operator
516 | * Manages the combination of all components as one unit ✅
517 | 3) Using Helm chart to deploy Operator
518 | * Most efficient ✅✅
519 | * Maintained by Helm community
520 | * Helm: Initial Setup
521 | * Operator: Manage Setup
522 |
523 | We will use option 3 (Helm chart to deploy Operator) here.
524 |
525 | NOTE: we have covered [Helm](https://github.com/alifiroozi80/CKA/tree/main/Helm)
526 | and [Operators](https://github.com/alifiroozi80/CKA/tree/main/Operators) previously in this repo. Here we don't need to
527 | know much about Operators. However, we should know a little bit about Helm.
528 |
529 | We will use the [Prometheus Chart](https://github.com/prometheus-community/helm-charts)
530 |
531 | * Add the repo
532 | ```shell
533 | helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
534 | ```
535 | * Create `monitoring` Namespace
536 | ```shell
537 | kubectl create ns monitoring
538 | ```
539 | * Install Prometheus in its own Namespace (`monitoring`)
540 | ```shell
541 | helm install prometheus prometheus-community/kube-prometheus-stack -n monitoring
542 | ```
543 | * Get everything in `monitoring` Namespace
544 | ```shell
545 | kubectl get all -n monitoring
546 | ```
547 |
548 |
593 |
594 | # Data Visualization with Prometheus UI
595 |
596 | ## Decide what we want to monitor?
597 |
598 |
599 |
600 | ### What is your goal? What do you want to monitor?
601 |
602 | * We want to notice when something **unexpected** happens
603 | * **Observe** any **anomalies**
604 | * CPU spikes
605 | * High Load
606 | * Insufficient Storage
607 | * Unauthorized Requests
608 | * Analyze and **react** accordingly
609 |
610 | ### How do we get this information?
611 |
612 | * Visibility of monitoring data
613 | * What data do we have available?
614 | * Cluster Nodes?
615 | * CPU
616 | * RAM
617 | * Applications?
618 | * Numbers of Requests
619 | * Kubernetes Components
620 | * App Availability
621 |
622 |
623 |
624 | ## Prometheus UI
625 |
626 |
627 |
628 | So, we have to monitor that information somehow.
629 |
630 | For that, by default, we have **Prometheus Web UI**.
631 |
632 | Let's see that in action and become familiar with it.
633 |
634 | * See the Prometheus Web UI service.
635 | ```shell
636 | kubectl get svc -n monitoring prometheus-kube-prometheus-prometheus
637 | ```
638 | * Port-forward it to `localhost:9090`
639 | ```shell
640 | kubectl port-forward -n monitoring service/prometheus-kube-prometheus-prometheus 9090:9090 &
641 | ```
642 | * **NOTE:** What is `&` in Bash? A single `&` at the end of a command means that the command should be **run in the
643 | background**
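
Because the port-forward now runs as a background job, normal shell job control applies if you want to stop it later:

```shell
# List the background jobs of the current shell
jobs

# Bring the port-forward back to the foreground (stop it with Ctrl+C) ...
fg %1

# ... or kill it while it keeps running in the background
kill %1
```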
644 |
645 | ---
646 |
647 | As you can see, it's a straightforward UI.
648 |
649 |
650 |
651 | So, what targets is Prometheus monitoring?
652 |
653 | You can see all targets if you go to the `Status/Targets`.
654 |
655 |
656 |
These targets are configured out of the box; **you need to add any additional "target" you want to monitor yourself.**
658 |
659 | If you expand one of these targets (`show more` button), you'll see a bunch of columns, one of which is `Endpoint`.
660 | That is the endpoint exposed **inside** the cluster.
661 |
662 |
663 |
664 | ---
665 |
666 | Let's jump back to the first page.
667 |
* Here, a long input field allows you to execute PromQL queries, the same queries we learned a little while ago in
  the PromQL Query Language section.
670 |
**Note:** Remember that executing queries here is `Low Level`, which is for `Debugging`.
672 |
673 | ---
674 |
675 | Also, if we head over to `Status/Configuration` and `Status/Runtime & Build Information`, you can see the Prometheus
676 | configuration in **read-only** mode.
677 |
678 | I want to discuss the concept of `job`s in Prometheus in the `Status/Configuration`.
679 |
680 | Under the `scrape_configs`, you can see the list of jobs.
681 |
682 |
683 |
684 | But what are these jobs?
685 |
686 | * If you go to `Status/Targets` and expand one of the targets with two (or more) processes running (which says `2/2`),
687 | you can see two `Endpoints` here.
688 | * Each of these `Endpoints` is called **Instance**, an address where you can scrape metrics.
689 | * Instance: An endpoint you can scrape.
* A collection of those Instances that scrape the same application is called a **Job**.
691 | * Job: Collection of Instances with the same purpose.
692 |
693 | You can see a bunch of labels in the `Labels` column. One of them is `job`, which holds the **job name**, and you can
694 | see this label (e.g. `job="apiserver"`) is matched **on both Instances**.
695 |
* Now, if you look at this target name, e.g., `prometheus-kube-prometheus-apiserver/0`, it matches its `job_name`
697 | in `Status/Configuration`.
698 | * That means if we execute a query, e.g., `apiserver_request_total`, we'll get a bunch of metrics, and each metric
699 | contains a lot of labels, and one of them is always the `job` name.
700 | * And you can also filter metrics by its job name: `apiserver_request_total{job="apiserver"}`
701 | * And every metric also has an `Instance` label that represents the `Endpoint`/`Instance` from which that metric is
702 | scraped.
703 | * And here for `Apiserver`, we have two Instances, and again, we can filter based on one of these
704 | Instances: `apiserver_request_total{instance="192.168.126.249:443"}`.
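
Putting that together, here are a few example queries you can paste into the Prometheus UI (the Instance address is the
one from this example; yours will differ):

```shell
# All request metrics of the 'apiserver' job
apiserver_request_total{job="apiserver"}

# Only the metrics scraped from one specific Instance
apiserver_request_total{instance="192.168.126.249:443"}

# Request rate per Instance over the last 5 minutes
sum by (instance) (rate(apiserver_request_total{job="apiserver"}[5m]))
```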
705 |
706 | ---
707 |
Again, remember that the Prometheus UI is not the place to see graphs and visualize anomalies. We have a great data
visualizer called **Grafana**, and we will learn it in our next section!
711 |
712 |
720 |
721 | We learned that we need a proper data visualization tool that accesses the metrics from the Prometheus server like
722 | [Grafana](https://grafana.com).
723 |
724 | What is Grafana?
725 |
726 | Grafana is an [open-source](https://github.com/grafana/grafana) interactive data visualization platform developed by
727 | Grafana Labs. It allows users to see
728 | their data via charts and graphs that are unified into one dashboard (or multiple dashboards!) for more straightforward
729 | interpretation and understanding.
730 |
731 | Let's access Grafana UI!
732 |
733 | ---
734 |
735 | * First, let's see the Grafana service.
736 | ```shell
737 | kubectl get service/prometheus-grafana -n monitoring
738 | ```
739 | * Then we port forward it to `localhost:8080`.
740 | ```shell
kubectl port-forward service/prometheus-grafana 8080:80 -n monitoring &
742 | ```
743 | * Now, if you head to [localhost:8080](http://localhost:8080), we'll see the Grafana dashboard. But first, we have to
744 | log in to it.
745 |
746 | * We will have to use a default username and password. However, we can create new users and update passwords later. Use
747 | the default credentials for now:
748 | * username: `admin`
749 | * password: `prom-operator`
750 |
751 | ---
752 |
Congratulations, you are in the Grafana UI!
754 |
755 |
756 |
757 | Before moving any further, if you don't have anything in your cluster, let's deploy a
758 | simple [online shop deployment](https://github.com/alifiroozi80/CKA/tree/main/Helm#Demo-Online-Boutique-overview).
759 |
Because we need something in our cluster to be monitored by Prometheus (besides Prometheus itself!)
761 |
This online shop is an example from Google to demonstrate the concept of Microservices, and we've covered it in our
Helm section (See [here](https://github.com/alifiroozi80/CKA/tree/main/Helm) if you're interested)
764 |
765 | Now, let's access the Grafana Dashboards.
766 |
Grafana Dashboards: A dashboard is a set of one or more panels.
768 |
If you click on the `Dashboards` button on the left or head over
770 | to [localhost:8080/dashboards](http://localhost:8080/dashboards), you'll see a bunch of default dashboards made by the
771 | Grafana team.
772 |
773 |
774 |
775 | In most cases, we'll use these default dashboards.
776 |
777 | These dashboards are grouped and organized in folders. Right now, we have one folder, the `General` folder, and inside
778 | that folder, we have a bunch of dashboards.
779 |
780 | So, let's see some of these dashboards that are interesting for us.
781 |
782 | ---
783 |
784 | ##### 1) `Kubernetes / Compute Resources / Cluster`
785 |
This is a Grafana dashboard! As you can see, it contains multiple rows that you can collapse.
787 |
788 |
789 |
790 | * Grafana Dashboards
791 | * A dashboard is a set of one or more panels
792 | * You can create your Dashboards (More on that later)
793 | * Organized into one or more rows
794 | * A row is a logical divider within a dashboard
795 | * Rows are used to group panels together
796 |
797 | Each row has multiple `Panels`.
798 |
799 | * Panel
800 | * The **primary visualization building block** in Grafana
801 | * Composed of a query and a visualization
802 | * Each panel has a query editor **specific to the data source** selected in the panel
803 | * Can be moved and resized within a dashboard
804 |
805 | * Structure Summary
806 | * Folders
807 | * Dashboards
808 | * Rows
809 | * Panels
810 |
811 | If you expand the `CPU` row, you'll see the CPU usage diagram.
812 |
813 | It is excellent, but what if an anomaly happens, and I want to see precisely which Pod(s) caused that?
814 |
815 | For that, we have another Dashboard called: `Kubernetes / Compute Resources / Node (Pods)`
816 |
817 | ---
818 |
819 | ##### 2) `Kubernetes / Compute Resources / Node (Pods)`
820 |
821 | Now, head back to Dashboards and go inside the `Kubernetes / Compute Resources / Node (Pods)` dashboard.
822 |
823 | Again, you'll see multiple rows and panels, two rows for CPU and two rows for Memory usage.
824 |
And if you expand the `CPU Usage` and `CPU Quota` rows, you'll see every pod in detail and how much CPU it consumes,
in both `Table` and `Graph` view.
827 |
828 |
829 |
830 | You can switch between Nodes and select the ones you want to monitor.
831 |
832 | On the top right, you also have a Time-Frame selection.
By default, you always see the data from the last hour, but you can always change that and, e.g., see data from
yesterday till now.
835 |
836 | ---
837 |
838 | Another exciting thing that you should know is if you click inside one of these panels, a menu will pop up, and if you
839 | click on `Edit`, you'll see the Graph with its corresponding PromQL query.
840 |
841 |
842 |
These queries are legit PromQL queries; if you copy and paste them into the Prometheus UI, you'll see the result and
the Graph! (But not as beautiful as in Grafana!)
845 | e.g. `sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster="$cluster", node=~"$node"}) by (pod)`
846 |
847 |
848 |
849 |
* Again, as a DevOps Engineer:
851 | * In most cases, you don't need deep knowledge of PromQL
852 | * Just basic PromQL queries
853 | * We'll learn more about PromQL in the Prometheus Alertmanager section
854 |
855 |
856 |
857 | ## Create your dashboard
858 |
859 |
860 |
861 | We can create our dashboard in `Dashboards/New dashboard` or head over
862 | to [localhost:8080/dashboard/new](http://localhost:8080/dashboard/new).
863 |
However, we should know PromQL for that, or we can use the metric browser, which lets us select between metrics (the
same metrics you saw in the Prometheus UI's metrics explorer).
866 |
867 |
868 |
869 | Then apply the changes and see your beautiful Graph!
870 |
871 |
876 |
877 | And while we are here, let's review another important dashboard, `Node Exporter / Nodes`.
This dashboard is great for viewing the resource consumption of the Cluster Nodes.
879 |
880 |
881 |
882 |
883 |
884 | ## Test Anomaly
885 |
886 |
887 |
Let's cause an anomaly in our cluster and see it on the Grafana dashboards.
889 |
Here we want to curl the front page of our online shop 10,000 times (or any other deployment in our cluster, e.g.,
Nginx, etc.)
892 |
893 | ---
894 |
895 | * First, create a pod
896 | ```shell
897 | kubectl run curl-test --image radial/busyboxplus:curl -it --rm
898 | ```
899 |
900 | Now, you are inside that Pod.
901 |
* Let's create a simple shell script
903 | ```shell
904 | # $ vi test.sh
905 | for i in $(seq 1 10000);
906 | do
907 | curl http://ADDRESS-OF-YOUR-SVC > test.txt
908 | done
909 | ```
910 |
911 | * Make it executable
912 | ```shell
913 | chmod +x test.sh
914 | ```
915 |
916 | * And Run it
917 | ```shell
918 | ./test.sh
919 | ```
920 |
921 | ---
922 |
923 | After it is finished, let's see two important dashboards:
924 |
925 | 1) `Kubernetes / Compute Resources / Cluster`
926 | 2) `Kubernetes / Compute Resources / Node (Pods)`
927 |
928 | You should see minor changes to your dashboards. Of course, it is not a big deal because we didn't push hard.
929 |
930 | If you want to see some significant changes, you should curl more or do something else big enough.
931 |
932 |
937 |
Grafana manages users, teams, and even multiple organizations by itself!
939 |
940 | For managing and inviting other users, click on `Configuration/Users` or head over
941 | to [localhost:8080/org/users](http://localhost:8080/org/users)
942 |
943 |
944 |
945 | ---
946 |
Grafana also supports many different data sources.
948 |
949 | See the complete list [here](https://grafana.com/docs/grafana/latest/datasources/#supported-data-sources)
950 |
951 | If you click on `Configuration/Data sources` or head over
to [localhost:8080/datasources](http://localhost:8080/datasources), you can see that, by default, Prometheus is there.
953 |
954 |
955 |
956 | You can also add another data source by clicking on the `Add data source` button or head over
957 | to [localhost:8080/datasources/new](http://localhost:8080/datasources/new).
958 |
959 | Multiple data sources exist, such as most cloud providers, actual databases, etc.
960 |
961 | ---
962 |
963 | Also, if you click on the `Explore` button or head over to [localhost:8080/explore](http://localhost:8080/explore),
964 | based on the data source you have configured, queries will be different.
965 |
966 | If you have multiple data sources, you can select one of them to query based on that.
967 |
PromQL is the query language for Prometheus. If you've added, e.g., a PostgreSQL or MySQL database, you can select it
and search with the SQL query language.
970 |
971 |
972 |
973 |
981 |
982 | We've learned about Grafana and its awesome dashboards and features.
983 |
984 | * But, in the real world,
985 | * People won't wait in front of the screen for anomalies
986 | * You want to **get notified** when something happens (via Email, Slack message, etc.)
987 | * Then you will check the Dashboards
988 |
* We want to configure our Monitoring Stack to notify us whenever something unexpected happens, e.g.
990 | * CPU Usage is more than 50%
991 | * Pod can't start
992 | * App not accessible
993 |
994 | ---
995 |
996 | Alerting with Prometheus is separated into two parts.
997 |
998 | 1) Define what we want to be notified about (**Alert Rules in Prometheus Server**)
999 |
1000 | * E.g.
1001 | * Send notification when CPU usage is above 50%
1002 | * Send notification when Pod can not restart
1003 |
1004 | 2) Send notification (**Configure Alertmanager**)
1005 |
1006 | * Alertmanager sends out the email/slack/etc. notification
1007 |
1008 | ---
1009 |
Which Alert Rule do we want to configure?
1011 |
1012 | * In Dashboard, you can see average CPU usage
1013 | * E.g. 20%-40% max
1014 | * If it exceeds 50%, we should trigger an alert
1015 | * Alert: when CPU > 50%
1016 |
1017 |
1022 |
1023 | We have some Alert Rules already out of the box, so in this section, we want to look at them and see what they look
1024 | like.
1025 |
1026 | To see the Alert Rules already configured in Prometheus UI, click on `Alerts` or head over
1027 | to [localhost:9090/alerts](http://localhost:9090/alerts).
1028 |
1029 |
1030 |
As you can see, a bunch of rules have been grouped (e.g. `alertmanager.rules`, `etcd`, `config-reloaders`, etc.).
1032 |
* We have three states for each rule:
1034 | * Green: `Inactive` or condition not met
1035 | * Red: `Firing`. Condition is met
1036 | * Firing: meaning that Alert is sent to Alertmanager
1037 | * Yellow: Elements that are active but not firing yet, are in the `Pending` state
1038 |
1039 | ---
1040 |
1041 | Let's open a rule and go through it!
1042 |
1043 |
1044 |
1045 | As you can see, it is very straightforward.
1046 |
1047 | * `name`: Is the Rule Name
1048 | * `expr`: The PromQL query expression to be executed (More on that later)
* `for`: If the condition is met, how long should Prometheus wait before sending an Alert? (It causes Prometheus to
  check that the alert continues to be active for that duration, e.g., 10 minutes, before firing it)
1051 | * `labels`: Allows specifying a set of additional labels to be attached to the alert.
1052 | You can Group rules based on labels (e.g., send `critical` rules to slack and `warning` rules to email or even, e.g.,
1053 | send namespace `dev` rules to slack and application `xxx` to Webhook URL)
1054 | * `annotations`: Specifies a set of information labels for more extended additional information
1055 | * `description`: The body of the error
1056 | * `runbook_url`: The explanation of the error
1057 | * `summary`: What is the problem?
1058 |
1059 | ---
1060 |
Let's talk a little more about `expr`.
1062 |
1063 | This is a standard PromQL query; if you paste it into the Prometheus UI search bar, you'll see the result and details!
1064 |
1065 |
1066 |
1067 | But I don't know the PromQL language, you may say!
1068 | You have [Prometheus Docs](https://prometheus.io/docs/prometheus/latest/querying/functions/) that explain every PromQL
1069 | function in detail!
1070 |
1071 | e.g., here, `max_over_time` is the maximum value of all points in the specified interval. And that explains `[5m]` at
1072 | the end of the query!
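
For example, the `expr` can be run on its own in the Prometheus UI (assuming the rule you opened is
`AlertmanagerFailedReload`, the same one we'll copy as a starting point in the next section); it simply returns the
highest value that gauge had over the last five minutes:

```shell
max_over_time(alertmanager_config_last_reload_successful{namespace="monitoring"}[5m])
```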
1073 |
1074 | See, it is easy!
1075 |
1076 |
1078 |
1079 | # Create your Alert Rule
1080 |
Till now, we've seen the existing Alert Rules, but it's time to create our own Alert Rules for things that we
specifically care about, e.g., when CPU usage is higher than 50% or when a Pod cannot start.
1083 |
1084 | ## Create your 1st Rule (`HostHighCpuLoad`)
1085 |
1086 |
1087 |
1088 | Alright! It's time for the FUN part!
1089 |
Create an `alert-rules.yaml` file. Then we want a starting point. For that, let's copy and paste one of the existing
1091 | Alert Rules we've checked because its syntax is pretty much the same!
1092 |
1093 | ```yaml
1094 | name: AlertmanagerFailedReload
1095 | expr: max_over_time(alertmanager_config_last_reload_successful{job="prometheus-kube-prometheus-alertmanager",namespace="monitoring"}[5m]) == 0
1096 | for: 10m
1097 | labels:
1098 | severity: critical
1099 | annotations:
1100 | description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
1101 | runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
1102 | summary: Reloading an Alertmanager configuration has failed.
1103 | ```
1104 |
1105 | We are going to change it, and we are going to do it line by line.
1106 |
1107 | * Change the name to `HostHighCpuLoad`.
1108 | * Then for `expr` paste this:
1109 | ```shell
1110 | 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 50
1111 | ```
1112 |
1113 | What the hell is that?!
1114 |
1115 | Well, let's break it down into small pieces.
1116 |
1117 | ---
1118 |
First, let's start with `node_cpu_seconds_total`.
1120 | What is it? This counter metric counts the number of seconds the CPU has been running in a particular mode.
1121 |
1122 | If you execute this query in Prometheus UI, you'll see a bunch of output.
1123 |
1124 |
1125 |
1126 | Each of these outputs has a `mode` label.
1127 | We want to grab the `idle` mode because `mode="idle"` means the CPU is NOT being used.
1128 |
1129 | So, let's grab only the ones who have `mode="idle"`
1130 |
1131 | ---
1132 |
1133 | Now I have fewer outputs, but they are not entirely readable. It is just a number. I want them in percentage!
1134 | So we use the [rate](https://prometheus.io/docs/prometheus/latest/querying/functions/#rate) function to calculate its
1135 | rate and then multiply the result by 100 to get it in percentage.
1136 |
1137 | ```shell
1138 | (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100
1139 | ```
1140 |
1141 |
1142 |
**NOTE:** The higher this number is, the less CPU your host is using
1144 |
1145 | ---
1146 |
1147 | Now, I want to get only one answer per Node.
E.g., if I have a Three-Node cluster, I want to get three answers, one for each Node, and I want them in
percentage.
1150 |
1151 | For that, I have to use the `instance` label.
1152 |
If you've noticed, the value of the `instance` label is the IP address of a Node. So, I average the result by the
`instance` label:
1155 |
1156 | ```shell
1157 | avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100
1158 | ```
1159 |
1160 | **NOTE:** Here, I have a One-Node Cluster, so my `instance` is just one; if you have more than one Node, you'll see more
1161 | outputs.
1162 |
1163 |
1164 |
**NOTE:** The higher this number is, the less CPU your host is using.
1166 |
Then I subtract that value from 100 to get the percentage of CPU actually being used.
1168 |
1169 | ```shell
1170 | 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)
1171 | ```
1172 |
1173 | ---
1174 |
1175 | The final thing to do is set a condition.
1176 |
So, I use the `>` symbol to set a condition.
1178 | If the value is greater than 50, the condition will be met!
1179 |
1180 | ---
1181 |
1182 | * We set the `for` to `2m` because 50% is not that much!
1183 | * Then we set `severity` to `warning` because, again, 50% is not a big deal
1184 | * Also, we added another label, `namespace: monitoring`, because we will use it later
1185 |
For `annotations`, we don't need `runbook_url`, but you can create a page somewhere (GitHub, for instance) and point
`runbook_url` to that page.
1188 |
* Add a short `summary` of the problem
1190 |
* We want to give the full detail in the description, so for the value, we use the `{{ $value }}` syntax, and to know
  which Node is affected, we use `{{ $labels.instance }}` (`labels.` because `instance` is a label; you saw it when you
  executed the query before!)
1194 |
1195 | Also, `\n` means a new line.
1196 |
1197 | ---
1198 |
1199 | Here is the complete YAML file:
1200 |
1201 | ```yaml
1202 | name: HostHighCpuLoad
1203 | expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 50
1204 | for: 2m
1205 | labels:
1206 | severity: warning
1207 | namespace: monitoring
1208 | annotations:
1209 | description: "CPU load on a host is over 50%\n Instance = {{ $labels.instance }}\n Value = {{ $value }}"
1210 | summary: "Host CPU load is high"
1211 | ```
1212 |
1213 | That's it for this section!
1214 |
1215 |
1220 |
1221 | Alright, Now we have written a rule and want to apply it.
1222 |
1223 | But How can we do that?!
1224 |
You may say we can edit `/etc/prometheus/rules/prometheus-prometheus-kube-prometheus-prometheus-rulefiles-0/*.yaml`
directly and re-apply it. (We see that path in Prometheus UI, under `Status/Configuration`
or [localhost:9090/config](http://localhost:9090/config)).
1228 |
1229 | However, it is not an efficient way.
1230 |
1231 | We've installed the Prometheus in our cluster via Prometheus Operator!
1232 |
1233 | So, it is super easy and efficient if we use it for managing our custom rules!
1234 |
Operators manage custom Kubernetes components (defined by `CRD`s).
1236 |
* The Prometheus Operator extends the Kubernetes API
* We create custom K8s resources
* The Operator takes our custom K8s resource and tells Prometheus to reload the alert rules.
1240 |
So, here we want to create a simple custom resource (defined by a `CRD`) based on
the [docs](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/monitoring-apis-index.html)
1243 |
If you want to know more about Operators, CRDs, etc., I have discussed them in this repo.
See [here](https://github.com/alifiroozi80/CKA/tree/main/Operators)
1246 |
1247 | ---
1248 |
1249 | The first steps are easy!
1250 |
1251 | First, we define the `apiVersion`, `kind`, and `metadata`.
1252 | You can find the `apiVersion` and `kind` with the `kubectl api-resources` command or via
1253 | the [docs](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/prometheusrule-monitoring-coreos-com-v1.html)
1254 |
1255 | ```yaml
1256 | apiVersion: monitoring.coreos.com/v1
1257 | kind: PrometheusRule
1258 | metadata:
1259 | name: custom-rules # The rules name
1260 | namespace: monitoring # The same Namespace that Prometheus has been installed in it
1261 | ```
1262 |
1263 | ---
1264 |
1265 | Then, we want to write the `spec` section.
1266 |
1267 | To do so, we have to go with
the [official docs](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/prometheusrule-monitoring-coreos-com-v1.html).
1270 |
1271 | On the left side, you can see all the CRDs you can create.
1272 |
Click on `PrometheusRule`.
1274 |
As you
can [see](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/prometheusrule-monitoring-coreos-com-v1.html#spec),
under the `spec`, we have a required attribute, which is `groups`.

That attribute is an array and takes two parameters, `name` and `rules`
(see [here](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/prometheusrule-monitoring-coreos-com-v1.html#spec-groups-2)).
1282 |
1283 | * `name`: As you saw in Prometheus UI, under the `Alerts` section
1284 | or [localhost:9090/alerts](http://localhost:9090/alerts), rules are grouped, and each group has a name. This name is
1285 | defined under the `name` key.
1286 | * `rules`: We define our rules here
1287 |
1288 | ---
1289 |
1290 | The `rules` attribute is also an array and takes some keys.
1291 |
See [here](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/prometheusrule-monitoring-coreos-com-v1.html#spec-groups-rules-2).
1294 |
1295 | Surprise!!
1296 |
1297 | These keys are the same as the file we've already written! (except the `alert` key!)
1298 |
So, put the `name` of the rule in `alert`, and paste the rest here!
1300 |
1301 | Eventually, your file should be something like this:
1302 |
1303 | ```yaml
1304 | apiVersion: monitoring.coreos.com/v1
1305 | kind: PrometheusRule
1306 | metadata:
1307 | name: custom-rules
1308 | namespace: monitoring
1309 | spec:
1310 | groups:
1311 | - name: custom.rules
1312 | rules:
1313 | - alert: HostHighCpuLoad
1314 | expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 50
1315 | for: 2m
1316 | labels:
1317 | severity: warning
1318 | namespace: monitoring
1319 | annotations:
1320 | description: "CPU load on a host is over 50%\n Instance = {{ $labels.instance }}\n Value = {{ $value }}"
1321 | summary: "Host CPU load is high"
1322 | ```
1323 |
1324 | And that's it!
1325 |
We've just created a custom resource, and the Prometheus Operator will take care of it!
1327 |
1328 | Congrats!
1329 |
1330 |
1335 |
1336 | Let's create another Alert Rule that notifies us whenever a Pod in the cluster restarts over three times.
1337 |
1338 | The syntax is exactly like before. Add another `- alert` under `.spec.groups.rules`
1339 |
1340 | Before we go further, let's break down the `expr` for this rule.
1341 |
1342 | If you execute this query below in Prometheus UI, you'll see all the Pods in your cluster, which shows **how many times
1343 | they have been restarted**.
1344 |
1345 | ```shell
1346 | kube_pod_container_status_restarts_total
1347 | ```
1348 |
1349 | We want to see those which have been restarted over three times, so the query would be like this:
1350 |
1351 | ```shell
1352 | kube_pod_container_status_restarts_total > 3
1353 | ```
1354 |
1355 | ---
1356 |
1357 | So, the alert rule would be something like this:
1358 |
1359 | ```yaml
- alert: KubernetesPodCrashLooping
  expr: kube_pod_container_status_restarts_total > 3
  for: 0m # Notify us immediately
  labels:
    severity: critical
    namespace: monitoring
  annotations:
    description: "The {{ $labels.pod }} Pod has crash-looped {{ $value }} time(s)\n"
    summary: "A pod is crash looping"
1369 | ```
1370 |
1371 |
1372 |
1373 | ## Apply Alert Rules
1374 |
1375 |
1376 |
1377 | Finally, we can apply our custom rules file.
1378 |
But before that, we should add some `labels` to this file so that the Prometheus Operator picks it up automatically.
1380 |
1381 | * How did I know about labels?
1382 | ```shell
1383 | kubectl get pod -n monitoring --show-labels
1384 | ```
1385 |
1386 | Let's see the whole part once again before applying:
1387 |
1388 | ```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: custom-rules
  namespace: monitoring
  labels:
    app: kube-prometheus-stack
    release: prometheus
spec:
  groups:
    - name: custom.rules
      rules:
        - alert: HostHighCpuLoad
          expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 50
          for: 2m
          labels:
            severity: warning
            namespace: monitoring
          annotations:
            description: "CPU load on a host is over 50%\n Instance = {{ $labels.instance }}\n Value = {{ $value }}"
            summary: "Host CPU load is high"
        - alert: KubernetesPodCrashLooping
          expr: kube_pod_container_status_restarts_total > 3
          for: 0m
          labels:
            severity: critical
            namespace: monitoring
          annotations:
            description: "The {{ $labels.pod }} Pod has crash-looped {{ $value }} time(s)\n"
            summary: "A pod is crash looping"
1419 | ```
1420 |
1421 | * Apply the file
1422 | ```shell
1423 | kubectl apply -f alert-rules.yaml
1424 | ```
1425 | * See all the Alert Rules in the cluster
1426 | ```shell
1427 | kubectl get promrule -n monitoring
1428 | ```
1429 | * **NOTE:** How did I know about `promrule`? The `kubectl api-resources` command!
1430 | * Check if Prometheus Operator saw our changes (`config-reloader` container).
1431 | ```shell
1432 | kubectl logs pod/prometheus-prometheus-kube-prometheus-prometheus-0 -n monitoring -c config-reloader
1433 | ```
1434 | ```shell
1435 | [...]
1436 | level=info ts=2022-10-08T05:13:15.633414355Z caller=reloader.go:375 msg="Reload triggered" cfg_in=/etc/prometheus/config/prometheus.yaml.gz cfg_out=/etc/prometheus/config_out/prometheus.env.yaml watched_dirs=/etc/prometheus/rules/prometheus-prometheus-kube-prometheus-prometheus-rulefiles-0
1437 | [...]
1438 | ```
1439 | * Check if Prometheus Operator saw our changes (`prometheus` container).
1440 | ```shell
1441 | kubectl logs pod/prometheus-prometheus-kube-prometheus-prometheus-0 -n monitoring -c prometheus
1442 | ```
1443 | ```shell
1444 | [...]
1445 | ts=2022-10-08T05:13:15.633Z caller=main.go:1214 level=info msg="Completed loading of configuration file" filename=/etc/prometheus/config_out/prometheus.env.yaml totalDuration=84.012008ms db_storage=885ns remote_storage=2.204µs web_handler=425ns query_engine=841ns scrape=93.763µs scrape_sd=1.173801ms notify=28.174µs notify_sd=250.415µs rules=77.932673ms tracing=4.527µs
1446 | [...]
1447 | ```
1448 |
1449 | ---
1450 |
1451 | Everything seems OK!
1452 |
1453 | Now, after a while, you should see our rules in Prometheus UI under the `Alerts` or head over
1454 | to [localhost:9090/alerts](http://localhost:9090/alerts)
1455 |
1456 | * **NOTE:** It takes a couple of minutes to see it in Prometheus UI. Be patient.
1457 |
1458 |
1459 |
1460 |
1461 |
1462 | ## Test Alert Rule
1463 |
1464 |
1465 |
1466 | Alright, it's time to test our Rules!
1467 |
1468 | We'll test the `HostHighCpuLoad` rule here.
1469 |
1470 | First, in Grafana, open the `Kubernetes / Compute Resources / Cluster` dashboard.
1471 |
If you look at the `CPU Utilisation` panel in the first row, you'll see your CPU usage. For me, it is almost 15%.
1473 |
1474 |
1475 |
We want to generate some CPU stress!
1477 |
1478 | For that, we have lots of tools and ways, but here we will
1479 | use [this image](https://hub.docker.com/r/containerstack/cpustress) ([source code](https://github.com/containerstack/docker-cpustress))
1480 |
1481 | ---
1482 |
1483 | Run this in your cluster:
1484 |
1485 | ```shell
1486 | kubectl run cpu-test --image=containerstack/cpustress -- --cpu 4 --timeout 60s --metrics-brief
1487 | ```
1488 |
This command will create a pod that generates CPU stress for 1 minute.
1490 |
* **NOTE:** If that command doesn't push your CPU usage above 50%, you should increase `--cpu 4`. E.g., in my case, I
  had to set it to 10
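
While the stress test runs, you can also keep an eye on the Nodes from the command line (note that `kubectl top` needs
the metrics-server add-on), and clean the test pod up afterwards:

```shell
# Watch Node CPU/memory while the stress test is running (requires metrics-server)
kubectl top nodes

# The stress container exits after the timeout; remove the pod when you're done
kubectl delete pod cpu-test
```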
1493 |
1494 | ---
1495 |
1496 | Aha!
1497 | After a while, you should see the `CPU Utilisation` number go up!
1498 |
1499 |
1500 |
1501 | So, the Alert Rule will immediately go into the `Pending` state
1502 |
1503 |
1504 |
1505 | And after two minutes, Alert Rule will go into the `Firing` state
1506 |
1507 |
1508 |
1509 |
1521 |
* Till now, we've written and seen Alert Rules that, after a condition is met, enter the `Firing` state.
1523 | * As we already know, `Firing` means sending an alert to the Alertmanager.
1524 | * Alertmanager is another application, and it is not activated by default.
1525 | * That's why we did not receive alerts till now.
1526 | * But we are going to activate it and receive the alerts.
1527 | * Alertmanager is the last piece in the pipeline.
1528 | * The Alertmanager dispatches notifications about the alert
1529 | * Takes care of **deduplicating**, **grouping**, and **routing** them to the correct receiver integration
1530 | * Here, we are going to configure the Alertmanager to dispatch **notification via email**
1531 |
1532 |
1537 |
Notice that the Prometheus Server and the Alertmanager are two separate components, and each has its own configuration.
1539 |
1540 | You can see the Prometheus Server configuration by clicking `Status/Configuration` or heading over
1541 | to [localhost:9090/config](http://localhost:9090/config)
1542 |
1543 | Let's see the Alertmanager Configuration.
1544 |
1545 | ---
1546 |
1547 | First, port-forward the Alertmanager Service to `localhost:9093`
1548 |
1549 | ```shell
1550 | kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-alertmanager 9093:9093 &
1551 | ```
1552 |
1553 | Then, open the browser and head over to [localhost:9093](http://localhost:9093).
1554 |
1555 |
1556 |
Tada! This is the Alertmanager UI!
1558 | As you can see, it is a very straightforward UI.
1559 |
1560 | You can see the Alertmanager configuration by clicking on `Status` or heading over
1561 | to [localhost:9093/#/status](http://localhost:9093/#/status).
1562 |
1563 |
1564 |
1565 | Let's talk a little about this configuration.
1566 |
1567 | ---
1568 |
1569 | First, as you see, it has five sections. You can see the complete explanation and details about each one of them in
1570 | the [official documentation](https://prometheus.io/docs/alerting/latest/configuration), but I will explain the
1571 | important ones very briefly.
1572 |
1573 | 1) `global`: Global parameters are valid in all other configuration contexts.
1574 | 2) `route`: Which alerts to which receivers?
1575 | 3) `inhibit_rules`: A list of inhibition rules.
4) `receivers`: A list of notification receivers. These are the notification integrations, e.g., Email, Slack, etc.
1577 | 5) `templates`: Files from which custom notification template definitions are read. The last component may use a
1578 | wildcard matcher, e.g. `templates/*.tmpl`.
1579 |
1580 | * As you see, the `receivers` section is empty. That's why we didn't receive notifications.
1581 |
1582 | Among all of those, I want to talk a little about `route`:
1583 |
1584 | ```yaml
1585 | route:
1586 | ###
1587 | # Top-Level Route
1588 | receiver: "null"
1589 | group_by:
1590 | - namespace
1591 | continue: false
1592 | ###
1593 |
1594 | ##
1595 | # Specific Alert
1596 | routes:
1597 | - receiver: "null"
1598 | matchers: # A list of matchers that an alert has to fulfill to match the node.
1599 | - alertname=~"InfoInhibitor|Watchdog"
1600 | continue: false
1601 | ##
1602 |
1603 | # Send notifications for a group of alerts
1604 | group_wait: 30s
1605 | group_interval: 5m
1606 |
1607 | repeat_interval: 12h # How long to wait before sending the notification again?
1608 | ```
1609 |
1610 | * Top-Level Route
1611 | * Every alert enters the routing tree at the top-level route
1612 | * Configuration applying to the **All alerts**
1613 |
1614 | According to [official documentation](https://prometheus.io/docs/alerting/latest/configuration/#route):
1615 |
1616 | A route block defines a node in a routing tree and its children. Its optional configuration parameters are inherited
1617 | from its parent node if not set.
1618 |
1619 | Every alert enters the routing tree at the configured top-level route, which must match all alerts (i.e. not have any
1620 | configured matchers). It then traverses the child nodes. If `continue` is set to `false`, it stops after the first
1621 | matching child. If `continue` is `true` on a matching node, the alert will continue matching against subsequent
1622 | siblings. If an alert does not match any children of a node (no matching child nodes, or none exist), the alert is
1623 | handled based on the configuration parameters of the current node.
1624 |
1625 |
1703 |
1704 | Let's create the `AlertmanagerConfig` component!
1705 |
1706 | Again, we'll use
1707 | the [docs](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/alertmanagerconfig-monitoring-coreos-com-v1beta1.html)
1708 | here.
1709 |
1710 | This is the starting point:
1711 |
1712 | ```yaml
1713 | apiVersion: monitoring.coreos.com/v1alpha1
1714 | kind: AlertmanagerConfig
1715 | metadata:
1716 | name: alert-manager-config
1717 | namespace: monitoring
1718 | spec:
1719 | ```
1720 |
1721 | How did we know about that starting point?
1722 |
1723 | You can use the docs or this command:
1724 |
1725 | ```shell
1726 | kubectl api-resources | grep alertmanagerconfigs
1727 | ```
1728 |
1729 | Now, it's time to write the `spec` part.
1730 |
1731 | ---
1732 |
1733 | * As you see,
1734 | the [.spec](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/alertmanagerconfig-monitoring-coreos-com-v1beta1.html#spec)
1735 | has multiple keys, and one of them
is [receivers](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/alertmanagerconfig-monitoring-coreos-com-v1beta1.html#spec-receivers).
* `receivers` is an **array** of objects, and the `name` attribute is required.
* We put `'email'` for the `name` key. Then we want to configure this email receiver, so we
  use the [emailConfigs](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/alertmanagerconfig-monitoring-coreos-com-v1beta1.html#spec-receivers-emailconfigs)
  attribute, which is an **array**.
1742 |
I'll explain the ones we will use in this file.
1744 |
* `to`: Which email address should receive the alerts?
* `from`: From which email address do we want to send the alerts?
1747 | * `smarthost`: SMTP host
1748 | * [What is an SMTP server?](https://sendgrid.com/blog/what-is-an-smtp-server)
1749 | * We'll use Gmail SMTP host address and port, but for other mail providers, google them (e.g., SMTP Gmail address)
1750 | * `authIdentity`: The identity to use for authentication.
1751 | * `authUsername`: The username to use for authentication.
1752 | * `authPassword`: The **secret** key contains the password to use for authentication.
1753 |
1754 | * For `authPassword`, I don't want to hard-code my email password.
1755 | * So, we will create a secret, and inside that secret, we'll put our email password and then reference it here.
1756 | * As you see,
1757 | the [authPassword](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/alertmanagerconfig-monitoring-coreos-com-v1beta1.html#spec-receivers-emailconfigs-authpassword)
1758 | has two keys: `name` and `key`.
1759 | * `name`: The name of the secret
1760 | * `key`: The key of the secret to select from.
1761 | * Let's put `gmail-auth` for `name` and `password` for `key` values for now.
1762 |
1763 | Up to this point, our file looks like this:
1764 |
1765 | ```yaml
1766 | apiVersion: monitoring.coreos.com/v1alpha1
1767 | kind: AlertmanagerConfig
1768 | metadata:
1769 | name: alert-manager-config
1770 | namespace: monitoring
1771 | spec:
1772 | receivers:
1773 | - name: 'email'
1774 | emailConfigs:
1775 | - to: 'test@gmail.com'
1776 | from: 'test@gmail.com'
1777 | smarthost: 'smtp.gmail.com:587'
1778 | authIdentity: 'test@gmail.com'
1779 | authUsername: 'test@gmail.com'
1780 | authPassword:
1781 | name: gmail-auth
1782 | key: password
1783 | ```
1784 |
The next step is to create that Secret with exactly that name and key.
1786 |
1787 | ---
1788 |
1789 | Create a YAML file, call it whatever you want, then in that file, paste this:
1790 |
1791 | ```yaml
1792 | apiVersion: v1
1793 | kind: Secret
1794 | metadata:
1795 | name: gmail-auth
1796 | namespace: monitoring
1797 | type: Opaque
1798 | data:
1799 | password: BASE64-ENCODED-VALUE-OF-YOUR-PASSWORD
1800 | ```
1801 |
1802 | * **NOTE:** If you aren't familiar with Kubernetes Secrets,
1803 | see [here](https://github.com/alifiroozi80/CKA/tree/main/CKA#configmap--secret-1)
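
To get that base64 value (or to skip writing the Secret YAML by hand entirely), something like this works;
`your-app-password` is just a placeholder:

```shell
# Base64-encode the password for the 'data.password' field
echo -n 'your-app-password' | base64

# Or let kubectl generate the whole Secret manifest for you
kubectl create secret generic gmail-auth \
  --from-literal=password='your-app-password' \
  -n monitoring --dry-run=client -o yaml
```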
1804 |
1805 | Before applying it, there is one little note you should consider.
1806 |
* If your email account doesn't have two-step verification, you should head over
  to [myaccount.google.com/lesssecureapps](https://myaccount.google.com/lesssecureapps) and allow less secure apps. That
  allows other applications (such as Alertmanager) to use your email.
1810 | * If your email account has two-step verification enabled, you should go to your settings and create an **Application
1811 | Password** for Alertmanager. e.g., for creating an Application Password on Gmail,
1812 | check [here](https://support.google.com/mail/answer/185833?hl=en).
1813 | * Other email providers have such steps. Google them.
1814 |
1815 | ---
1816 |
1817 | Let's write
1818 | the [route](https://docs.openshift.com/container-platform/4.11/rest_api/monitoring_apis/alertmanagerconfig-monitoring-coreos-com-v1beta1.html#spec-route)
1819 | section.
1820 |
1821 | As you see, it has lots of attributes.
1822 |
1823 | * `receiver`: Name of the receiver for this route. It should be listed in the `receivers` field. (We already did that!)
1824 | * `routes`: Child routes. Under the `routes`, you can override all the `route` attributes we've seen.
1825 | * `matchers[]`: Matcher defines how to match on alert labels.
1826 |
1827 | ```yaml
1828 | route:
1829 | receiver: 'email' # Name of the 'receiver' for this route. You can overwrite that for specific rules under 'routes'.
1830 | repeatInterval: '30m' # How long to wait before repeating the last notification.
1831 | routes: # Child routes
1832 | - matchers:
1833 | - name: 'alertname'
1834 | matchType: '='
1835 | value: 'HostHighCpuLoad'
1836 | repeatInterval: '10m' # Overwrite the 'repeatInterval' for this specific rule.
1837 | - matchers:
1838 | - name: 'alertname'
1839 | matchType: '='
1840 | value: 'KubernetesPodCrashLooping'
1841 | ```
1842 |
1843 | In the next section, we'll apply these files and go through them.
1844 |
1845 |
1994 |
1995 | * So far, we've learned
1996 | * **Monitor** Kubernetes Components
1997 | * **Monitor** Resource Consumption on the Nodes
1998 | * **Monitor** Prometheus itself
1999 |
2000 | * But how about
2001 | * **Monitor** third-party applications (e.g. Redis)
2002 | * **Monitor** our application (e.g., online shop)
2003 |
2004 | E.g., in monitoring the Redis database (which is a part of our online shop), how can we watch:
2005 |
2006 | * Too much load?
2007 | * Too many connections?
2008 | * Is Redis down?
* Note that we want to monitor the Redis Pod not only on the Kubernetes level but also on the **Application level**
2010 | * How do we monitor third-party apps with Prometheus?
2011 | * [x] Prometheus **Exporters**
2012 |
2013 | ---
2014 |
2015 | ### What are exporters?
2016 |
2017 | According to [metricfire](https://www.metricfire.com/blog/first-contact-with-prometheus), An exporter comprises
2018 | software features that produce metrics data and an HTTP server that exposes the generated metrics via a given endpoint.
2019 | Metrics are exposed according to a specific format that the Prometheus server can read and ingest (scraping).
2020 |
2021 | What the hell does that mean?!!
2022 |
2023 | In simple words:
2024 |
1) The Exporter **gets metrics** data **from the service**
2) The Exporter **translates** these service-specific metrics into Prometheus-understandable metrics
3) The Exporter **exposes** these translated metrics under the **`/metrics` endpoint**
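
In practice, this means you can simply curl an exporter's `/metrics` endpoint from inside the cluster and get
plain-text metrics back. For example, the Redis exporter we'll deploy later in this section listens on port `9121`
(the service name and `redis` Namespace below are the ones used later):

```shell
# Fetch the first few translated metrics from the exporter's /metrics endpoint
curl -s http://redis-exporter-prometheus-redis-exporter.redis:9121/metrics | head -n 20
```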
2028 |
2029 |
2030 |
2031 | * We need to **tell Prometheus about this new Exporter**
2032 | * For that, **ServiceMonitor** (custom K8s resource) needs to be deployed
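
By the way, the kube-prometheus-stack chart has already created ServiceMonitors for its own components, so you can
peek at them to see what such a resource looks like:

```shell
# List the existing ServiceMonitors (one per scraped component)
kubectl get servicemonitors -n monitoring
```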
2033 |
2034 |
2270 |
2271 | Alright, it's time to write some rules for our Redis application.
2272 |
2273 | The syntax is the same. Here is a starting point:
2274 |
2275 | ```yaml
2276 | apiVersion: monitoring.coreos.com/v1
2277 | kind: PrometheusRule
2278 | metadata:
2279 | name: redis-rules
2280 | namespace: monitoring
2281 | labels:
2282 | app: kube-prometheus-stack
2283 | release: prometheus
2284 | spec:
2285 | groups:
2286 | - name: redis.rules
2287 | rules:
2288 | ```
2289 |
2290 | * For the `rules` section, you can write PromQL queries as before or
2291 | use [Awesome Prometheus alerts](https://awesome-prometheus-alerts.grep.to).
* The [Awesome Prometheus alerts](https://awesome-prometheus-alerts.grep.to) is
  an [open-source](https://github.com/samber/awesome-prometheus-alerts) project that is basically a **collection of
  Prometheus alerting rules**.
2295 | * It is a fantastic project, and I love it a lot. (Remember that I said you don't need that deep PromQL knowledge?
2296 | That's why!)
2297 | * Under the `Databases and brokers`, you'll find the Redis.
2298 | * We'll need the [Redis down](https://awesome-prometheus-alerts.grep.to/rules#rule-redis-1-1)
2299 | and [Redis too many connections](https://awesome-prometheus-alerts.grep.to/rules#rule-redis-1-10) rules.
2300 | * Just copy and paste them under the `rules` same as before.
2301 |
2302 | Your file now should look like this:
2303 |
2304 | ```yaml
2305 | apiVersion: monitoring.coreos.com/v1
2306 | kind: PrometheusRule
2307 | metadata:
2308 | name: redis-rules
2309 | namespace: monitoring
2310 | labels:
2311 | app: kube-prometheus-stack
2312 | release: prometheus
2313 | spec:
2314 | groups:
2315 | - name: redis.rules
2316 | rules:
2317 | - alert: RedisDown
2318 | expr: redis_up == 0
2319 | for: 0m
2320 | labels:
2321 | severity: critical
2322 | annotations:
2323 | summary: Redis down (instance {{ $labels.instance }})
2324 | description: "Redis instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
2325 | - alert: RedisTooManyConnections
2326 | expr: redis_connected_clients > 100
2327 | for: 2m
2328 | labels:
2329 | severity: warning
2330 | annotations:
2331 | summary: Redis too many connections (instance {{ $labels.instance }})
2332 | description: "Redis instance has too many connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
2333 | ```
2334 |
2335 | Apply the file, and after a few seconds, you'll see the rules under the `Alerts` in Prometheus UI. (Or head over
2336 | to [localhost:9090/alerts](http://localhost:9090/alerts)).
2337 |
2338 |
2339 |
2340 |
2341 |
2342 | ### Trigger the 'Redis is Down'
2343 |
2344 |
2345 |
* Let's test one of our Rules (`RedisDown`).
* We can edit the YAML file, change the `replicas` to `0`, and re-apply the file (or scale it down with `kubectl`, as
  shown below).
* Or delete the Redis service.
* After a while (30 seconds), the alert will be in the `Firing` state.
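
For example, to scale Redis down without editing the YAML file (the Deployment name `redis-cart` comes from the
online-shop demo; adjust the name and Namespace to your setup):

```shell
# Scale the online shop's Redis Deployment down to zero replicas
kubectl scale deployment redis-cart --replicas=0

# Bring it back up once you've seen the alert fire
kubectl scale deployment redis-cart --replicas=1
```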
2350 |
2351 |
2352 |
2353 | ---
2354 |
Why did it take 30 seconds for the alert to fire?
2356 |
2357 | If you look at the Redis Exporter chart's
2358 | default [values.yaml](https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-redis-exporter/values.yaml#L92)
2359 | , you'll see that the `interval` attribute is set to `30s`.
2360 |
It means Prometheus scrapes the Redis Exporter's data every 30 seconds.
2362 |
2363 | That's why.
2364 |
2365 |
2370 |
2371 | * Wouldn't it be nice if we had a dashboard for the Redis application too?
2372 | * Well, we can, and we will!
2373 | * We are going to:
2374 | * Create a Grafana dashboard with Redis metrics ourselves
2375 | * Use **existing Redis dashboard**
2376 |
2377 | ---
2378 |
2379 | * If you head over to [Grafana Dashboards](https://grafana.com/grafana/dashboards) page, you'll see a bunch of
2380 | pre-created dashboards almost for everything!
2381 | * That's super awesome! (Remember, I said you don't need to have deep knowledge of PromQL).
2382 | * You can search for a dashboard and use it in your Grafana Dashboards.
2383 |
2384 | Every dashboard in Grafana has a Dashboard ID and a Dashboard JSON file.
2385 |
2386 |
2387 |
2388 | We'll use this ID or JSON file to import the dashboard to our Dashboards.
2389 |
2390 | ---
2391 |
2392 | If we look at the Redis Exporter [GitHub page](https://github.com/oliver006/redis_exporter), you'll see
2393 | the [Grafana](https://github.com/oliver006/redis_exporter#what-it-looks-like) section for Grafana Dashboard.
2394 |
As you see, the author put the Dashboard ID (`763`)
and the [Dashboard JSON file](https://github.com/oliver006/redis_exporter/blob/master/contrib/grafana_prometheus_redis_dashboard.json)
there.
2398 |
2399 | So, let's add it.
2400 |
2401 | ---
2402 |
2403 | In Grafana UI, click on `Dashboards/Import` or head over
2404 | to [localhost:8080/dashboard/import](http://localhost:8080/dashboard/import).
2405 |
2406 |
2407 |
2408 | Paste the Dashboard ID here, or upload the JSON file, and click on the `Load` button.
2409 |
2410 | * Select `Prometheus` as the data source
* Select the folder our dashboard should be in (here, `General`)
2412 |
2413 |
2414 |
2415 | Click on the `Import` button.
2416 |
2417 | ---
2418 |
First, you may see nothing. That's probably because the `instance` is wrong.
2420 |
2421 | To select the correct one:
2422 |
2423 | 1) See the endpoint (or describe the service):
2424 | ```shell
2425 | $ kubectl -n redis get ep/redis-exporter-prometheus-redis-exporter
2426 | NAME ENDPOINTS
2427 | redis-exporter-prometheus-redis-exporter 10.42.0.188:9121 <--- Here it is!
2428 | ```
2429 |
2430 | 2) Select this endpoint as the instance
2431 |
2432 |
2433 |
2434 | That's it.
2435 |
Now you know how to fully monitor a third-party application.
2437 |
2438 | Congrats 🍻
2439 |
2440 |
2453 |
2454 | Till now, we've learned:
2455 |
2456 | * [X] **Monitor** Resource Consumption on the Nodes
2457 | * [X] **Monitor** Kubernetes components
* [X] **Monitor** Prometheus itself
2459 | * [X] **Monitor** third-party applications (Redis)
2460 | * [ ] **Monitor** Own Application (NodeJS)
2461 |
2462 | It's time to monitor our applications.
2463 |
2464 | * No Exporter available for our application
2465 | * We have to define the metrics
2466 |
2467 | Solution?
2468 |
2469 | * Client Libraries
2470 |
2471 | What are Client Libraries?
2472 |
2473 | * Choose a Prometheus client library that **matches** the **language in which your application is written**
2474 | * Abstract interface to expose your metrics
2475 | * Libraries implement the Prometheus metric types
2476 | * Counter
2477 | * Gauge
2478 | * Histogram
2479 | * Summary
2480 |
2481 | ---
2482 |
2483 | Steps to Monitor Own Application
2484 |
2485 | 1) **Expose metrics** for our NodeJS application using **NodeJS client library**
2486 | 2) **Deploy** NodeJS app in the cluster
2487 | 3) Configure Prometheus to **scrape new target (ServiceMonitor)**
2488 | 4) **Visualize** scraped metrics in Grafana Dashboard
2489 |
2490 | ---
2491 |
* **REMEMBER:** This (exposing metrics for applications) is the **Developers'** responsibility, not yours (DevOps Engineer)!
  As a DevOps Engineer, it's your responsibility to deploy and monitor their applications (via metrics), NOT to implement
  metrics in Developer applications.
2495 |
2496 |
2497 |
2498 | ### Expose Metrics
2499 |
2500 |
2501 |
2502 | * OK, for the sake of this demo, let's keep it clean and simple.
2503 | * We will use a sample [NodeJS App](https://github.com/alifiroozi80/nodejs-prometheus) in this demo.
2504 |
2505 | ---
2506 |
* Look at [Prometheus Client Libraries](https://prometheus.io/docs/instrumenting/clientlibs/#client-libraries). You'll
  see a bunch of libraries, one for each language.
* Click on NodeJS, which redirects you to [Prometheus client for node.js](https://github.com/siimon/prom-client).
2510 | * We use this page as documentation.
2511 |
2512 | ---
2513 |
2514 | Here is the `server.js` file:
2515 |
2516 | ```js
2517 | const express = require('express');
2518 | const app = express();
2519 | const client = require('prom-client');
2520 |
2521 |
2522 | // --------------- Default Metrics ---------------
2523 | const collectDefaultMetrics = client.collectDefaultMetrics;
2524 | // Probe every 5th second.
2525 | collectDefaultMetrics({timeout: 5000});
2526 |
2527 |
2528 | // --------------- Custom Metrics ---------------
2529 | const httpRequestsTotal = new client.Counter({
2530 | name: 'http_request_operations_total',
2531 | help: 'Total number of Http requests'
2532 | })
2533 |
2534 | const httpRequestDurationSeconds = new client.Histogram({
2535 | name: 'http_request_duration_seconds',
2536 | help: 'Duration of Http requests in seconds',
2537 | buckets: [0.1, 0.5, 2, 5, 10]
2538 | })
2539 |
2540 |
2541 | // --------------- Default Metrics ---------------
2542 | app.get('/metrics', async (req, res) => {
2543 | res.set('Content-Type', client.register.contentType)
2544 | res.end(await client.register.metrics())
2545 | })
2546 |
2547 | app.get('/', function (req, res) {
2548 | // Simulate sleep for a random number of milliseconds
2549 | var start = new Date()
2550 | var simulateTime = Math.floor(Math.random() * (10000 - 500 + 1) + 500)
2551 |
2552 | setTimeout(function (argument) {
2553 | // Simulate execution time
2554 | var end = new Date() - start
2555 | httpRequestDurationSeconds.observe(end / 1000); //convert to seconds
2556 | }, simulateTime)
2557 |
2558 | httpRequestsTotal.inc();
    res.send('Hello World!');
2560 | });
2561 |
2562 |
2563 | // --------------- Start the App ---------------
2564 | app.listen(3000, function () {
2565 | console.log("app listening on port 3000!");
2566 | });
2567 | ```
2568 |
What the hell is that?!
2570 |
Well, as I said earlier, writing this file is the **Developer's** responsibility, NOT yours (DevOps Engineer). But for
the sake of the demo, let's walk through this file very briefly.
2573 |
2574 | * [Default Metrics](https://github.com/siimon/prom-client#default-metrics)
2575 | * Prometheus itself recommends default metrics
2576 | * In addition, NodeJS [custom metrics](https://github.com/siimon/prom-client#custom-metrics) are included
2577 | * [Custom Metrics](https://github.com/siimon/prom-client#custom-metrics)
2578 | * All metric types have two mandatory parameters: `name` and `help`
2579 |
2580 | * `httpRequestsTotal`
2581 | * It is a [Counter](https://github.com/siimon/prom-client#counter) type
2582 | * A cumulative metric, whose **value can only increase**
2583 | * Resets to `0` when the process restarts
2584 | * `httpRequestDurationSeconds`
2585 | * It is [Histogram](https://github.com/siimon/prom-client#histogram) type
2586 | * Sample observations and counts them in configurable buckets
2587 | * Track sizes and frequency of events
2588 |
2589 | Now, if you start the app and head over to [localhost:3000/metrics](http://localhost:3000/metrics), you'll see the
2590 | exposed metrics.
2591 |
2592 |
2593 |
2594 |
2595 |
2596 | ### Build Docker Image & Push it to a Repo
2597 |
2598 |
2599 |
2600 | Alright, we've seen the NodeJS app locally.
2601 |
2602 | Now it's time to build and push it to a registry to use in the K8s cluster.
2603 |
2604 | Here is the `Dockerfile`:
2605 |
2606 | ```Dockerfile
2607 | FROM node:13-alpine
2608 |
2609 | RUN mkdir -p /usr/app
2610 |
2611 | COPY package.json /usr/app/
2612 | COPY app /usr/app/
2613 |
2614 | WORKDIR /usr/app
2615 |
2616 | EXPOSE 3000
2617 |
2618 | RUN npm install
2619 |
2620 | CMD ["node", "server.js"]
2621 | ```
2622 |
2623 | * Build it
2624 | ```shell
2625 | docker build -t alifiroozizamani/nodejs-demo:1.0.0 .
2626 | ```
2627 | * Push it
2628 | ```shell
2629 | docker push alifiroozizamani/nodejs-demo:1.0.0
2630 | ```
2631 | * See it on the [DockerHub](https://hub.docker.com/r/alifiroozizamani/nodejs-demo)
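
Optionally, you can sanity-check the image locally before deploying it to the cluster:

```shell
# Run the image locally and expose port 3000
docker run -d --name nodejs-demo -p 3000:3000 alifiroozizamani/nodejs-demo:1.0.0

# Both the app and its metrics endpoint should respond
curl http://localhost:3000
curl http://localhost:3000/metrics | head
```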
2632 |
2633 | It's now ready to be deployed in a K8s cluster.
2634 |
2635 |
2766 |
Alright. It's time to create a Dashboard for our NodeJS App in Grafana.
2768 |
2769 | First, In Grafana UI, click on `Dashboards/New Dashboard` or head over
2770 | to [localhost:8080/dashboard/new](http://localhost:8080/dashboard/new).
2771 |
2772 | Click on `Add a new panel`.
2773 |
2774 | Here we should provide a PromQL query.
2775 |
2776 | The first query we want to see is `http_request_operations_total`.
2777 |
2778 | We combine it with [rate](https://prometheus.io/docs/prometheus/latest/querying/functions/#rate) function.
2779 |
2780 | So, the query would be like `rate(http_request_operations_total[2m])`.
2781 |
2782 |
2783 |
2784 | The second query we want to see is `http_request_duration_seconds_sum`.
2785 |
2786 | We combine it with [rate](https://prometheus.io/docs/prometheus/latest/querying/functions/#rate) function.
2787 |
2788 | So, the query would be like `rate(http_request_duration_seconds_sum[2m])`.
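
As a side note, a handy pattern with Histogram metrics is to divide the rate of the `_sum` series by the rate of the
`_count` series to get the average request duration over the window; with the metric defined in `server.js` above,
that would be:

```shell
rate(http_request_duration_seconds_sum[2m]) / rate(http_request_duration_seconds_count[2m])
```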
2789 |
2790 |
2791 |
2792 | By the way, you can see these queries in Prometheus UI too.
2793 |
2794 | Finally, save the dashboard.
2795 |
2796 |
2797 |
2798 | You can use this command to curl your Node app for a while and see a better view of your Dashboards:
2799 |
2800 | ```shell
2801 | while true; do curl http://SERVICEIP:3000; done;
2802 | ```
2803 |
2804 | That's it.
2805 |
I hope you guys have enjoyed it as much as I did 🍻
2807 |
2808 |
14 | Become an EXPERT in Kubernetes from scratch!
15 |
16 |
17 | Report Bug
18 | ·
19 | Request Feature
20 |
21 |
22 |
23 | ---
24 |
25 | If you want to become a Certified Kubernetes Administrator, or you want to become an EXPERT in Kubernetes, learn
26 | Kubernetes from scratch and understand everything, this repo is a good choice.
27 |
In this repo, you'll be prepared to pass the exam. You'll also learn about Kubernetes concepts that are not in the
exam, such as `Kubernetes on AWS (EKS)`, `Operators`, `Helm`, `Microservices`, `Prometheus`, and, most importantly,
how to `install and set up a K8s cluster` from scratch. (In the exam, you won't have to install a cluster, but
knowing how to do it gives you a **huge advantage!**)
32 |
With this repo and the other sources mentioned below, you'll be prepared to pass the exam and be confident about your
Kubernetes knowledge!
35 |
36 | Have Fun!
37 |
38 | * Comprehensive
39 | * Hands-On Demos
40 | * Zero-To-Expert
41 |
42 | ---
43 |
44 |
45 |
## Table of Contents
47 |
48 | This is the starting point.
49 |
**NOTE:** The order matters here!
51 |
52 | If you are a beginner and want to start from scratch, you should follow along with the order of this table.
53 |
| Index | Name | Link | Website | Description |
55 | |-------|------------|--------------------------------------------------------------------------------|--------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------|
56 | | 1 | Kubernetes | [Start the Tutorial](https://github.com/alifiroozi80/CKA/tree/main/CKA) | [kubernetes.io](https://kubernetes.io) | Become a Certified Kubernetes Administrator, an EXPERT in K8s from scratch |
57 | | 2 | Helm | [Start the Tutorial](https://github.com/alifiroozi80/CKA/tree/main/Helm) | [helm.sh](https://helm.sh) | Become a master in Kubernetes package manager |
58 | | 3 | Operator | [Start the Tutorial](https://github.com/alifiroozi80/CKA/tree/main/Operators) | [kubernetes.io/docs](https://kubernetes.io/docs/concepts/extend-kubernetes/operator) | Extend the functionality of the Kubernetes API |
59 | | 4 | Prometheus | [Start the Tutorial](https://github.com/alifiroozi80/CKA/tree/main/Prometheus) | [prometheus.io](https://prometheus.io) | The Prometheus monitoring system and time series database |
60 | | 5 | EKS | SOON | [aws.amazon.com](https://aws.amazon.com/eks) | Amazon EKS is a managed Kubernetes service to run Kubernetes in the AWS cloud and on-premises data centers. |
61 |
62 | ---
63 |
64 |
65 |
66 | ## Roadmap
67 |
68 | - [x] Add real-world examples
69 | - [x] Add exam tips
70 | - [x] Add `Helm` - Package Manager of Kubernetes
71 | - [x] Add `Operators`
72 | - [x] Add monitoring - `Prometheus`
73 | - [ ] Kubernetes on AWS - `EKS`
74 |
75 | See the [open issues](https://github.com/alifiroozi80/CKA/issues) for a complete list of proposed features (and known
76 | issues).
77 |
78 | ---
79 |
80 |
81 |
82 | ## Contributing
83 |
84 | Any contributions you make are **greatly appreciated**.
85 |
86 | If you have a suggestion to improve this, please fork the repo and create a pull request. You can also open an issue
87 | with the tag "enhancement."
88 |
89 | 1) Fork the Project
90 | 2) Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
91 | 3) Commit your changes (`git commit -m 'Add some AmazingFeature'`)
92 | 4) Push to the Branch (`git push origin feature/AmazingFeature`)
93 | 5) Open a Pull Request
94 |
95 | ---
96 |
97 |
98 |
99 | ## License
100 |
This project is licensed under the MIT License. See [LICENSE](https://github.com/alifiroozi80/CKA/blob/main/LICENSE)
for more information.
103 |
104 | ---
105 |
106 |
107 |
108 | ## Acknowledgments
109 |
110 | * [TechWorld with Nana](https://www.techworld-with-nana.com)
111 |
In this repo, I used some fantastic images and great examples from
[TechWorld with Nana](https://www.techworld-with-nana.com) to help explain the concepts better.
114 |
Nana is one of my GREATEST teachers, and I learned a lot from her. She is also one of the best DevOps mentors out
there, so be sure to check out her [YouTube channel](https://www.youtube.com/c/TechWorldwithNana) alongside your
journey!
117 |
118 | * [Bret Fisher](https://www.bretfisher.com)
119 |
Bret is one of the best DevOps mentors out there, so be sure to check out
his [YouTube channel](https://www.youtube.com/BretFisherDockerandDevOps) and his excellent examples
on [GitHub](https://github.com/BretFisher#my-examples-and-templates-new-stuff-on-top) alongside your journey!
123 |
I also love his [podcast](https://www.bretfisher.com/podcast/): he invites great guests, and they discuss great
topics. To stay up to date and learn much more, give it a listen.
126 |
127 | * [Container Training](https://github.com/jpetazzo/container.training)
128 |
Jérôme is a DevOps engineer who maintains a great repo about container training.
For a better understanding of some concepts, I used a few of the examples from his repo here.
131 |
132 | Be sure to check his repo as well.
133 |
134 | ---
135 |
136 | ## ❤ Show your support
137 |
138 | Give a ⭐️ if this project helped you!
139 |