├── .gitignore.save ├── helm-charts ├── duckdb-bundle │ ├── .python-version │ └── deployment │ │ └── deployment_duckdb_bundle.yaml └── mlflow-server │ ├── charts │ └── postgresql-11.9.6.tgz │ ├── templates │ ├── serviceaccount.yaml │ ├── service.yaml │ ├── tests │ │ └── test-connection.yaml │ ├── hpa.yaml │ ├── NOTES.txt │ ├── secret-env.yaml │ ├── _helpers.tpl │ ├── ingress.yaml │ └── deployment.yaml │ ├── .helmignore │ ├── Chart.yaml │ ├── README.md │ └── values.yaml ├── dags ├── test.py ├── example │ ├── Dockerfile │ └── example_pod.py ├── python_print.yaml └── basic │ ├── runners │ ├── csvtopqt.yaml │ └── accountbypartition.yaml │ └── first_pyspark_pipeline.py ├── docs ├── images │ ├── minio2.png │ ├── gitsync.png │ ├── airflow_job.png │ ├── lens_nodes.png │ ├── miniologin.png │ └── airflow_conn.png ├── 02-setting-up-minio.md ├── 04-setting-up-airflow.md ├── 03-setting-up-apachespark-k8s.md ├── 01-setting-up-cluster.md └── 05-cronjob-podcleaner.md ├── examples └── spark │ ├── test.yaml │ ├── wordcount.yaml │ └── pi.yaml ├── pyspark_jobs ├── sessionbuilder.py ├── basictransformation.py ├── csvtoparquet.py └── accountbypartition.py ├── Dockerfile.spark ├── rbac ├── prefect-rbac.yaml ├── airflow-role.yaml └── spark-rbac.yaml ├── .github ├── helm-docs.sh └── workflows │ ├── releasechart.yaml │ └── continuousintegration.yaml ├── helm_values ├── finaL_out.yml ├── sparkoperator_values.yaml ├── minio_values.yaml └── airflow_values.yaml ├── ok.yml ├── .gitignore └── README.md /.gitignore.save: -------------------------------------------------------------------------------- 1 | .idea 2 | .idea/* 3 | -------------------------------------------------------------------------------- /helm-charts/duckdb-bundle/.python-version: -------------------------------------------------------------------------------- 1 | 3.8.6 2 | -------------------------------------------------------------------------------- /dags/test.py: -------------------------------------------------------------------------------- 1 | 2 | if __name__ == '__main__': 3 | print("ABC") 4 | print("*"*100) -------------------------------------------------------------------------------- /docs/images/minio2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhishek-ch/data-machinelearning-the-boring-way/HEAD/docs/images/minio2.png -------------------------------------------------------------------------------- /docs/images/gitsync.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhishek-ch/data-machinelearning-the-boring-way/HEAD/docs/images/gitsync.png -------------------------------------------------------------------------------- /docs/images/airflow_job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhishek-ch/data-machinelearning-the-boring-way/HEAD/docs/images/airflow_job.png -------------------------------------------------------------------------------- /docs/images/lens_nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhishek-ch/data-machinelearning-the-boring-way/HEAD/docs/images/lens_nodes.png -------------------------------------------------------------------------------- /docs/images/miniologin.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhishek-ch/data-machinelearning-the-boring-way/HEAD/docs/images/miniologin.png -------------------------------------------------------------------------------- /docs/images/airflow_conn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhishek-ch/data-machinelearning-the-boring-way/HEAD/docs/images/airflow_conn.png -------------------------------------------------------------------------------- /helm-charts/mlflow-server/charts/postgresql-11.9.6.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhishek-ch/data-machinelearning-the-boring-way/HEAD/helm-charts/mlflow-server/charts/postgresql-11.9.6.tgz -------------------------------------------------------------------------------- /dags/example/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim-buster 2 | 3 | WORKDIR /app 4 | 5 | # COPY requirements.txt requirements.txt 6 | # RUN pip3 install -r requirements.txt 7 | 8 | COPY . . 9 | 10 | CMD ["python3", "-m", "test"] -------------------------------------------------------------------------------- /dags/python_print.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: abc-python-cli 5 | spec: 6 | containers: 7 | - name: abc-python-cli 8 | image: python-docker:test 9 | command: ["python3", "-m", "test"] 10 | imagePullPolicy: IfNotPresent 11 | restartPolicy: Always 12 | -------------------------------------------------------------------------------- /helm-charts/mlflow-server/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "mlflow-server.serviceAccountName" . }} 6 | labels: 7 | {{- include "mlflow-server.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml .
| nindent 4 }} 11 | {{- end }} 12 | {{- end }} 13 | -------------------------------------------------------------------------------- /examples/spark/test.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: abc-python-cli 5 | spec: 6 | containers: 7 | - name: abc-python-cli 8 | image: pysparkexample:aws 9 | command: ["tail", "-f", "/dev/null"] 10 | imagePullPolicy: IfNotPresent 11 | resources: 12 | requests: 13 | cpu: 1000m 14 | memory: "196Mi" 15 | limits: 16 | cpu: 2000m 17 | memory: "1096Mi" 18 | restartPolicy: Always -------------------------------------------------------------------------------- /pyspark_jobs/sessionbuilder.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | import logging 3 | 4 | 5 | def get_new_session(app_name): 6 | spark = SparkSession.builder.appName(app_name).getOrCreate() 7 | spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") 8 | 9 | spark.sparkContext.setLogLevel("INFO") 10 | log4j = spark._jvm.org.apache.log4j.Logger 11 | _logger = log4j.getLogger(__name__) 12 | 13 | return (spark, _logger) 14 | -------------------------------------------------------------------------------- /helm-charts/mlflow-server/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm-charts/mlflow-server/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "mlflow-server.fullname" . }} 5 | labels: 6 | {{- include "mlflow-server.labels" . | nindent 4 }} 7 | spec: 8 | type: {{ .Values.service.type }} 9 | ports: 10 | - port: {{ .Values.service.port }} 11 | targetPort: http 12 | protocol: TCP 13 | name: http 14 | selector: 15 | {{- include "mlflow-server.selectorLabels" . | nindent 4 }} 16 | -------------------------------------------------------------------------------- /helm-charts/mlflow-server/templates/tests/test-connection.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: "{{ include "mlflow-server.fullname" . }}-test-connection" 5 | labels: 6 | {{- include "mlflow-server.labels" . | nindent 4 }} 7 | annotations: 8 | "helm.sh/hook": test 9 | spec: 10 | containers: 11 | - name: wget 12 | image: busybox 13 | command: ['wget'] 14 | args: ['{{ include "mlflow-server.fullname" . 
}}:{{ .Values.service.port }}'] 15 | restartPolicy: Never 16 | -------------------------------------------------------------------------------- /Dockerfile.spark: -------------------------------------------------------------------------------- 1 | FROM pysparkexample:aws 2 | 3 | ARG spark_uid=185 4 | # ADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar /opt/spark/jars/ 5 | # ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar /opt/spark/jars/ 6 | 7 | 8 | WORKDIR /app 9 | USER 0 10 | 11 | RUN mkdir -p /app/logs 12 | RUN mkdir -p /airflow/xcom/ 13 | COPY pyspark_jobs/ /app/pyspark_jobs 14 | RUN zip -r pyspark_jobs.zip pyspark_jobs && \ 15 | chown -R ${spark_uid} /app && chown -R ${spark_uid} /airflow 16 | 17 | USER ${spark_uid} -------------------------------------------------------------------------------- /pyspark_jobs/basictransformation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import * 4 | from pyspark.sql.types import * 5 | from sessionbuilder import * 6 | 7 | def main(source_path: str, destination_path: str): 8 | pass 9 | 10 | if __name__ == '__main__': 11 | source_path = sys.argv[1] 12 | destination_path = sys.argv[2] 13 | print( 14 | f'User Input Source Path {source_path} Write Path {destination_path}') 15 | spark, _logger = get_new_session("Basic Transformation") 16 | 17 | main(source_path=source_path, destination_path=destination_path) 18 | spark.stop() -------------------------------------------------------------------------------- /rbac/prefect-rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | namespace: default 5 | name: prefect-agent-rbac 6 | rules: 7 | - apiGroups: ['batch', 'extensions'] 8 | resources: ['jobs'] 9 | verbs: ['*'] 10 | - apiGroups: [''] 11 | resources: ['events', 'pods'] 12 | verbs: ['*'] 13 | --- 14 | apiVersion: rbac.authorization.k8s.io/v1 15 | kind: RoleBinding 16 | metadata: 17 | namespace: default 18 | name: prefect-agent-rbac 19 | subjects: 20 | - kind: ServiceAccount 21 | name: default 22 | roleRef: 23 | kind: Role 24 | name: prefect-agent-rbac 25 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /.github/helm-docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Running Helm Docs" 4 | CHART_DIRS="$(git diff --find-renames --name-only "$(git rev-parse --abbrev-ref HEAD)" remotes/origin/main -- 'helm-charts' | grep '[cC]hart.yaml' | sed -e 's#/[Cc]hart.yaml##g')" 5 | HELM_DOCS_VERSION="1.12.0" 6 | 7 | curl --silent --show-error --fail --location --output /tmp/helm-docs.tar.gz https://github.com/norwoodj/helm-docs/releases/download/v"${HELM_DOCS_VERSION}"/helm-docs_"${HELM_DOCS_VERSION}"_Linux_x86_64.tar.gz 8 | tar -xf /tmp/helm-docs.tar.gz helm-docs 9 | 10 | for CHART_DIR in ${CHART_DIRS}; do 11 | ./helm-docs -c ${CHART_DIR} 12 | git diff --exit-code 13 | done -------------------------------------------------------------------------------- /rbac/airflow-role.yaml: -------------------------------------------------------------------------------- 1 | # Role for spark-on-k8s-operator to create resources on cluster 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: spark-cluster-cr 6 | labels: 7 |
rbac.authorization.kubeflow.org/aggregate-to-kubeflow-edit: "true" 8 | rules: 9 | - apiGroups: 10 | - sparkoperator.k8s.io 11 | resources: 12 | - sparkapplications 13 | verbs: 14 | - '*' 15 | --- 16 | # Allow airflow-worker service account access for spark-on-k8s 17 | apiVersion: rbac.authorization.k8s.io/v1 18 | kind: ClusterRoleBinding 19 | metadata: 20 | name: airflow-spark-crb 21 | roleRef: 22 | apiGroup: rbac.authorization.k8s.io 23 | kind: ClusterRole 24 | name: spark-cluster-cr 25 | subjects: 26 | - kind: ServiceAccount 27 | name: airflow-worker 28 | namespace: default -------------------------------------------------------------------------------- /helm-charts/duckdb-bundle/deployment/deployment_duckdb_bundle.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: duckdb-bundle 5 | spec: 6 | selector: 7 | matchLabels: 8 | app.kubernetes.io/name: app-duckdb-bundle 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app.kubernetes.io/name: app-duckdb-bundle 14 | spec: 15 | containers: 16 | - image: buntha/duckdb-bundle:0.0.3 17 | imagePullPolicy: Always 18 | name: duckdb-bundle 19 | ports: 20 | - containerPort: 8088 21 | --- 22 | apiVersion: v1 23 | kind: Service 24 | metadata: 25 | name: duckdb-bundle 26 | spec: 27 | ports: 28 | - port: 8088 29 | targetPort: 8088 30 | protocol: TCP 31 | type: NodePort 32 | selector: 33 | app.kubernetes.io/name: app-duckdb-bundle -------------------------------------------------------------------------------- /rbac/spark-rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: spark 5 | namespace: spark 6 | --- 7 | # Role for spark-on-k8s-operator to create resources on cluster 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | name: spark-cluster-cr 12 | labels: 13 | rbac.authorization.kubeflow.org/aggregate-to-kubeflow-edit: "true" 14 | rules: 15 | - apiGroups: 16 | - sparkoperator.k8s.io 17 | resources: 18 | - sparkapplications 19 | verbs: 20 | - '*' 21 | --- 22 | apiVersion: rbac.authorization.k8s.io/v1 23 | kind: ClusterRoleBinding 24 | metadata: 25 | name: spark-role 26 | namespace: spark 27 | roleRef: 28 | apiGroup: rbac.authorization.k8s.io 29 | kind: ClusterRole 30 | name: edit 31 | subjects: 32 | - kind: ServiceAccount 33 | name: spark 34 | namespace: spark 35 | - kind: ServiceAccount 36 | name: spark 37 | namespace: default 38 | 39 | # kubectl create clusterrolebinding spark-role --clusterrole=cluster-admin --serviceaccount=default:spark --namespace=default -------------------------------------------------------------------------------- /helm-charts/mlflow-server/templates/hpa.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.autoscaling.enabled }} 2 | apiVersion: autoscaling/v2beta1 3 | kind: HorizontalPodAutoscaler 4 | metadata: 5 | name: {{ include "mlflow-server.fullname" . }} 6 | labels: 7 | {{- include "mlflow-server.labels" . | nindent 4 }} 8 | spec: 9 | scaleTargetRef: 10 | apiVersion: apps/v1 11 | kind: Deployment 12 | name: {{ include "mlflow-server.fullname" . 
}} 13 | minReplicas: {{ .Values.autoscaling.minReplicas }} 14 | maxReplicas: {{ .Values.autoscaling.maxReplicas }} 15 | metrics: 16 | {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} 17 | - type: Resource 18 | resource: 19 | name: cpu 20 | targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} 21 | {{- end }} 22 | {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} 23 | - type: Resource 24 | resource: 25 | name: memory 26 | targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} 27 | {{- end }} 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /.github/workflows/releasechart.yaml: -------------------------------------------------------------------------------- 1 | name: Release Charts 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | release: 10 | # depending on default permission settings for your org (contents being read-only or read-write for workloads), you will have to add permissions 11 | # see: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token 12 | permissions: 13 | contents: write 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v1 18 | # with: 19 | # fetch-depth: 0 20 | 21 | - name: Configure Git 22 | run: | 23 | git config user.name "$GITHUB_ACTOR" 24 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com" 25 | 26 | - name: Install Helm 27 | uses: azure/setup-helm@v3 28 | with: 29 | version: v3.10.0 30 | 31 | - name: Run chart-releaser 32 | uses: helm/chart-releaser-action@v1.4.1 33 | env: 34 | CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" 35 | with: 36 | charts_dir: helm-charts 37 | # charts_repo_url: https://abhishek-ch.github.io/data-machinelearning-the-boring-way -------------------------------------------------------------------------------- /helm_values/finaL_out.yml: -------------------------------------------------------------------------------- 1 | Release "airflow" has been upgraded. Happy Helming! 2 | NAME: airflow 3 | LAST DEPLOYED: Sun May 15 12:02:58 2022 4 | NAMESPACE: default 5 | STATUS: deployed 6 | REVISION: 8 7 | TEST SUITE: None 8 | NOTES: 9 | Thank you for installing Apache Airflow 2.2.4! 10 | 11 | Your release is named airflow. 12 | You can now access your dashboard(s) by executing the following command(s) and visiting the corresponding port at localhost in your browser: 13 | 14 | Airflow Webserver: kubectl port-forward svc/airflow-webserver 8080:8080 --namespace default 15 | Default Webserver (Airflow UI) Login credentials: 16 | username: admin 17 | password: admin 18 | Default Postgres connection credentials: 19 | username: postgres 20 | password: postgres 21 | port: 5432 22 | 23 | You can get Fernet Key value by running the following: 24 | 25 | echo Fernet Key: $(kubectl get secret --namespace default airflow-fernet-key -o jsonpath="{.data.fernet-key}" | base64 --decode) 26 | 27 | WARNING: 28 | Kubernetes workers task logs may not persist unless you configure log persistence or remote logging! 
29 | Logging options can be found at: https://airflow.apache.org/docs/helm-chart/stable/manage-logs.html 30 | (This warning can be ignored if logging is configured with environment variables or secrets backend) 31 | -------------------------------------------------------------------------------- /docs/02-setting-up-minio.md: -------------------------------------------------------------------------------- 1 | # MinIO 2 | 3 | MinIO offers high-performance, S3 compatible object storage. 4 | MinIO is an object storage server that implements the same public API as Amazon S3, so applications that can be configured to talk to Amazon S3 can also be configured to talk to MinIO. 5 | 6 | https://github.com/minio/minio 7 | 8 | ## Install MinIO 9 | `helm upgrade --install minio minio/minio -f minio_values.yaml -n default` 10 | 11 | * Set the `accessKey` and `secretKey` inside the users key. These act as the AWS key & secret and carry admin access to the buckets 12 | * Create 2 default buckets, `airflow-logs` for Airflow logs and `test-files` for normal data processing 13 | * Change the PVC size and resources based on the available resources. 14 | * Port forward on 9001 15 | * Create an Airflow connection named s3_conn for remote logging 16 | * Add Extra 17 | `{"aws_access_key_id": "developer", "aws_secret_access_key": "software123", "host": "http://minio:9000"}` 18 | `host` acts as the `--endpoint_url`; point it to the MinIO service HTTP address 19 | * Port forward the MinIO console service: `kubectl port-forward services/minio-console 9001:9001 -n default` 20 | 21 | ### Login with the helm-provided credentials `developer / software123` 22 | 23 | ![](images/miniologin.png) 24 | 25 | 26 | ### After Login Screen 27 | 28 | ![](images/minio2.png) -------------------------------------------------------------------------------- /pyspark_jobs/csvtoparquet.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession, DataFrame 3 | from pyspark.sql.functions import * 4 | from pyspark.sql.types import * 5 | from sessionbuilder import * 6 | 7 | 8 | def main(source_path: str, destination_path: str) -> DataFrame: 9 | _logger.info(f'Reading from CSV Path {source_path}') 10 | dataframe = spark.read.format("csv").option( 11 | "header", "true").load(source_path) 12 | dataframe_cast = dataframe.withColumn("account_created_at", to_date(col("account_created_at"),"yyyy-MM-dd HH:mm:ss")) \ 13 | .withColumn("account_created_at_interpolated", to_date(col("account_created_at_interpolated"),"yyyy-MM-dd HH:mm:ss")) 14 | _logger.info(f'TESTING ABC...') 15 | _logger.info(f'CSV As DataFrame {dataframe_cast.show()}') 16 | dataframe_cast.printSchema() 17 | 18 | dataframe_cast.coalesce(1) \ 19 | .write \ 20 | .format("parquet") \ 21 | .save(destination_path, mode="overwrite") 22 | 23 | return dataframe 24 | 25 | 26 | if __name__ == '__main__': 27 | source_path = sys.argv[1] 28 | destination_path = sys.argv[2] 29 | print( 30 | f'User Input Source Path {source_path} Write Path {destination_path}') 31 | spark, _logger = get_new_session("CSV To Parquet") 32 | 33 | main(source_path=source_path, destination_path=destination_path) 34 | spark.stop() 35 | -------------------------------------------------------------------------------- /examples/spark/wordcount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: spark-word-count 5 | spec: 6 | sparkConf: 7 |
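    # The spark.hadoop.fs.s3a.* entries below wire Spark's S3A connector to the in-cluster MinIO
    # service (endpoint http://minio:9000, path-style access, SSL disabled) and reuse the
    # developer/software123 credentials from the MinIO setup; keep them in sync with the users
    # configured via helm_values/minio_values.yaml.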
"spark.ui.port": "4045" 8 | "spark.eventLog.enabled": "false" 9 | "spark.hadoop.fs.s3a.access.key": "developer" 10 | "spark.hadoop.fs.s3a.secret.key": "software123" 11 | "spark.hadoop.fs.s3a.endpoint": "http://minio:9000" 12 | "spark.hadoop.fs.s3n.impl": "org.apache.hadoop.fs.s3n.S3AFileSystem" 13 | "spark.hadoop.fs.s3n.fast.upload": "true" 14 | "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2" 15 | "spark.hadoop.fs.s3a.connection.ssl.enabled": "false" 16 | "spark.hadoop.fs.s3a.path.style.access": "true" 17 | type: Python 18 | pythonVersion: "3" 19 | mode: cluster 20 | image: "pysparkexample:aws2" 21 | imagePullPolicy: Never 22 | mainApplicationFile: local:///opt/spark/examples/src/main/python/wordcount.py 23 | arguments: 24 | - s3a://test-files/sparktemp/files.txt 25 | sparkVersion: "3.1.1" 26 | restartPolicy: 27 | type: OnFailure 28 | onFailureRetries: 1 29 | onFailureRetryInterval: 1 30 | onSubmissionFailureRetries: 1 31 | onSubmissionFailureRetryInterval: 1 32 | driver: 33 | cores: 1 34 | coreLimit: "1200m" 35 | memory: "512m" 36 | labels: 37 | version: 3.1.1 38 | serviceAccount: spark 39 | executor: 40 | cores: 1 41 | instances: 1 42 | memory: "512m" 43 | labels: 44 | version: 3.1.1 -------------------------------------------------------------------------------- /ok.yml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: user 5 | namespace: default 6 | spec: 7 | sparkConf: 8 | "spark.ui.port": "4045" 9 | "spark.eventLog.enabled": "false" 10 | "spark.hadoop.fs.s3a.access.key": "abhishek" 11 | "spark.hadoop.fs.s3a.secret.key": "choudhary123" 12 | "spark.hadoop.fs.s3a.endpoint": "http://minio:9000" 13 | "spark.hadoop.fs.s3n.impl": "org.apache.hadoop.fs.s3n.S3AFileSystem" 14 | "spark.hadoop.fs.s3n.fast.upload": "true" 15 | "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2" 16 | "spark.hadoop.fs.s3a.connection.ssl.enabled": "false" 17 | "spark.hadoop.fs.s3a.path.style.access": "true" 18 | type: Python 19 | pythonVersion: "3" 20 | mode: cluster 21 | image: "pysparkexample:aws2" 22 | imagePullPolicy: Never 23 | mainApplicationFile: local:///app/pyspark_jobs/accountbypartition.py 24 | arguments: 25 | - s3a://test-files/sample/output/users 26 | - s3a://test-files/sample/output/users_account" 27 | sparkVersion: "3.1.1" 28 | restartPolicy: 29 | type: OnFailure 30 | onFailureRetries: 0 31 | onFailureRetryInterval: 1 32 | onSubmissionFailureRetries: 0 33 | onSubmissionFailureRetryInterval: 1 34 | driver: 35 | coreRequest: "500m" 36 | coreLimit: "1000m" 37 | memory: "512m" 38 | labels: 39 | version: 3.1.1 40 | serviceAccount: spark 41 | executor: 42 | coreRequest: "250m" 43 | instances: 2 44 | memory: "512m" 45 | labels: 46 | version: 3.1.1 -------------------------------------------------------------------------------- /pyspark_jobs/accountbypartition.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql.functions import * 3 | from pyspark.sql.types import * 4 | from sessionbuilder import * 5 | import json 6 | from pathlib import Path 7 | 8 | 9 | def main(source_path: str, destination_path: str) -> bool: 10 | dataframe = spark.read.parquet(source_path) \ 11 | .withColumn("year", year("account_created_at")) \ 12 | .withColumn("month", month("account_created_at")) \ 13 | .withColumn("day", dayofmonth("account_created_at")) 14 | 15 | dataframe.coalesce(1) \ 16 | .write \ 17 
| .format("parquet") \ 18 | .partitionBy("year", "month", "day") \ 19 | .save(destination_path, mode="overwrite") 20 | 21 | return True 22 | 23 | 24 | def push_to_xcom(destination_path: str) -> None: 25 | # Path("/airflow/xcom/").mkdir(parents=True, exist_ok=True) 26 | with open("/airflow/xcom/return.json", 'w') as file: 27 | logging.info(f'XCom Push {destination_path}') 28 | json.dump(destination_path, file) 29 | 30 | data = json.load(open("/airflow/xcom/return.json")) 31 | logging.info(data) 32 | 33 | 34 | if __name__ == '__main__': 35 | source_path = sys.argv[1] 36 | destination_path = sys.argv[2] 37 | 38 | spark, _logger = get_new_session("CSV To Parquet") 39 | push_to_xcom(destination_path) 40 | _logger.info( 41 | f'Parquet Source Path {source_path} Write Path {destination_path}') 42 | 43 | if main(source_path, destination_path): 44 | push_to_xcom(destination_path) 45 | 46 | spark.stop() 47 | -------------------------------------------------------------------------------- /.github/workflows/continuousintegration.yaml: -------------------------------------------------------------------------------- 1 | name: Validate Chart 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: [synchronize, opened, reopened, ready_for_review] 7 | 8 | jobs: 9 | lint-docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v1 14 | - name: Run helm-docs 15 | run: .github/helm-docs.sh 16 | install-chart: 17 | name: install-chart 18 | runs-on: ubuntu-latest 19 | needs: 20 | - lint-docs 21 | strategy: 22 | matrix: 23 | k8s: 24 | - v1.22.7 25 | - v1.23.6 26 | - v1.24.0 27 | steps: 28 | - name: Checkout 29 | uses: actions/checkout@v1 30 | - name: Create kind ${{ matrix.k8s }} cluster 31 | uses: helm/kind-action@v1.2.0 32 | with: 33 | version: v0.13.0 34 | node_image: kindest/node:${{ matrix.k8s }} 35 | - name: Set up chart-testing 36 | uses: helm/chart-testing-action@v2.0.1 37 | - name: Add bitnami repo 38 | run: helm repo add bitnami https://charts.bitnami.com/bitnami 39 | - name: Run chart-testing (list-changed) 40 | id: list-changed 41 | run: | 42 | changed=$(ct list-changed --chart-dirs=helm-charts --target-branch=main) 43 | if [[ -n "$changed" ]]; then 44 | echo "::set-output name=changed::true" 45 | fi 46 | - name: Run chart-testing (lint and install) 47 | run: ct lint-and-install --chart-dirs=helm-charts --target-branch=main --helm-extra-args="--timeout=15m" --debug=true -------------------------------------------------------------------------------- /dags/basic/runners/csvtopqt.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: csv-to-parquet.{{ ds }}.{{ task_instance.try_number }} 5 | namespace: default 6 | spec: 7 | sparkConf: 8 | "spark.ui.port": "4045" 9 | "spark.eventLog.enabled": "false" 10 | "spark.hadoop.fs.s3a.access.key": "developer" 11 | "spark.hadoop.fs.s3a.secret.key": "software123" 12 | "spark.hadoop.fs.s3a.endpoint": "http://minio:9000" 13 | "spark.hadoop.fs.s3n.impl": "org.apache.hadoop.fs.s3n.S3AFileSystem" 14 | "spark.hadoop.fs.s3n.fast.upload": "true" 15 | "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2" 16 | "spark.hadoop.fs.s3a.connection.ssl.enabled": "false" 17 | "spark.hadoop.fs.s3a.path.style.access": "true" 18 | type: Python 19 | pythonVersion: "3" 20 | mode: cluster 21 | image: "pysparkexample:aws2" 22 | imagePullPolicy: Never 23 | mainApplicationFile: 
local:///app/pyspark_jobs/csvtoparquet.py 24 | arguments: 25 | - {{ dag_run.conf.source }} 26 | - {{ dag_run.conf.destination }} 27 | sparkVersion: "3.1.1" 28 | restartPolicy: 29 | type: OnFailure 30 | onFailureRetries: 0 31 | onFailureRetryInterval: 1 32 | onSubmissionFailureRetries: 0 33 | onSubmissionFailureRetryInterval: 1 34 | driver: 35 | coreRequest: "500m" 36 | coreLimit: "1000m" 37 | memory: "512m" 38 | labels: 39 | version: 3.1.1 40 | serviceAccount: spark 41 | executor: 42 | coreRequest: "250m" 43 | instances: 2 44 | memory: "512m" 45 | labels: 46 | version: 3.1.1 -------------------------------------------------------------------------------- /docs/04-setting-up-airflow.md: -------------------------------------------------------------------------------- 1 | # Setting up Airflow in Kubernetes 2 | 3 | ## Create RBAC for role and services 4 | 5 | * Go to the rbac dir and run `kubectl apply -f airflow-role.yaml`. This gives Airflow access to run Spark jobs and grants several privileges on the default namespace 6 | * Follow the docs to learn more about the Airflow Helm chart; for this setup we need the following commands 7 | ``` 8 | helm repo add apache-airflow https://airflow.apache.org 9 | helm upgrade --install airflow apache-airflow/airflow --namespace airflow --create-namespace 10 | ``` 11 | * Go to the helm_values dir and install `helm upgrade --install airflow apache-airflow/airflow -n default -f airflow_values.yaml` 12 | * Run the Airflow Spark Workflow `Basic_Transformation` 13 | 14 | ![](images/airflow_job.png) 15 | * Trigger the job with conf `{"source":"s3a://test-files/sample/users.csv","destination":"s3a://test-files/sample/output/users"}` 16 | * This will generate parquet partition output in the path `s3a://test-files/sample/output/users_account/year=*/month=*/day=*` 17 | 18 | _Users can use any CSV file with a column named `account_created_at` of __date__ datatype_ 19 | 20 | ## Troubleshooting 21 | 22 | * Check whether the airflow-triggerer & airflow-scheduler* pods are green. The usual problem is git-sync. 23 | In case of error, please regenerate the SSH key and approve it on your GitHub fork/project. All sidecar pods must be initiated correctly. 24 | ![](images/gitsync.png) 25 | * Check the memory allocation in the Airflow values.yaml file 26 | * Make sure to create the s3_conn & kubernetes_default connections from the _Airflow UI_ 27 | ![](images/airflow_conn.png) -------------------------------------------------------------------------------- /helm-charts/mlflow-server/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: mlflow-server 3 | version: 0.1.9 4 | appVersion: "2.1.1" 5 | description: A Helm chart for MLflow on Kubernetes 6 | 7 | # A chart can be either an 'application' or a 'library' chart. 8 | # 9 | # Application charts are a collection of templates that can be packaged into versioned archives 10 | # to be deployed. 11 | # 12 | # Library charts provide useful utilities or functions for the chart developer. They're included as 13 | # a dependency of application charts to inject those utilities and functions into the rendering 14 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 15 | type: application 16 | 17 | # This is the chart version. This version number should be incremented each time you make changes 18 | # to the chart and its templates, including the app version.
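# For example, a fix that only touches the templates would bump version 0.1.9 -> 0.1.10, while
# pointing appVersion at a newer MLflow release should also be accompanied by a chart version bump.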
19 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 20 | 21 | 22 | home: https://mlflow.org/ 23 | icon: https://mlflow.org/images/MLflow-logo-final-white-TM.png 24 | sources: 25 | - https://github.com/abhishek-ch/data-machinelearning-the-boring-way/tree/main/helm-charts 26 | - https://mlflow.org/ 27 | maintainers: 28 | - name: ABC 29 | email: abhishek.create@gmail.com 30 | url: https://github.com/abhishek-ch 31 | dependencies: 32 | - name: postgresql 33 | version: 11.9.6 34 | repository: https://charts.bitnami.com/bitnami 35 | condition: postgresql.enabled 36 | keywords: 37 | - mlflow 38 | - machinelearning 39 | - mlops 40 | - "model tracking" 41 | - "model versioning" 42 | - "mlflow projects" 43 | - "model registry" -------------------------------------------------------------------------------- /dags/basic/runners/accountbypartition.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: userpartitions-by-account.{{ ds }}.{{ task_instance.try_number }} 5 | namespace: default 6 | spec: 7 | sparkConf: 8 | "spark.ui.port": "4045" 9 | "spark.eventLog.enabled": "false" 10 | "spark.hadoop.fs.s3a.access.key": "developer" 11 | "spark.hadoop.fs.s3a.secret.key": "software123" 12 | "spark.hadoop.fs.s3a.endpoint": "http://minio:9000" 13 | "spark.hadoop.fs.s3n.impl": "org.apache.hadoop.fs.s3n.S3AFileSystem" 14 | "spark.hadoop.fs.s3n.fast.upload": "true" 15 | "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2" 16 | "spark.hadoop.fs.s3a.connection.ssl.enabled": "false" 17 | "spark.hadoop.fs.s3a.path.style.access": "true" 18 | type: Python 19 | pythonVersion: "3" 20 | mode: cluster 21 | image: "pysparkexample:aws2" 22 | imagePullPolicy: Never 23 | mainApplicationFile: local:///app/pyspark_jobs/accountbypartition.py 24 | arguments: 25 | - {{ dag_run.conf.destination }} 26 | - "{{ dag_run.conf.destination }}_account" 27 | sparkVersion: "3.1.1" 28 | restartPolicy: 29 | type: OnFailure 30 | onFailureRetries: 0 31 | onFailureRetryInterval: 1 32 | onSubmissionFailureRetries: 0 33 | onSubmissionFailureRetryInterval: 1 34 | driver: 35 | coreRequest: "500m" 36 | coreLimit: "1000m" 37 | memory: "512m" 38 | labels: 39 | version: 3.1.1 40 | serviceAccount: spark 41 | executor: 42 | coreRequest: "250m" 43 | instances: 2 44 | memory: "512m" 45 | labels: 46 | version: 3.1.1 -------------------------------------------------------------------------------- /examples/spark/pi.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | # Support for Python is experimental, and requires building SNAPSHOT image of Apache Spark, 17 | # with `imagePullPolicy` set to Always 18 | 19 | apiVersion: "sparkoperator.k8s.io/v1beta2" 20 | kind: SparkApplication 21 | metadata: 22 | name: pyspark-pi 23 | spec: 24 | type: Python 25 | pythonVersion: "3" 26 | mode: cluster 27 | image: "gcr.io/spark-operator/spark-py:v3.1.1" 28 | imagePullPolicy: IfNotPresent 29 | mainApplicationFile: local:///opt/spark/examples/src/main/python/pi.py 30 | sparkVersion: "3.1.1" 31 | restartPolicy: 32 | type: OnFailure 33 | onFailureRetries: 3 34 | onFailureRetryInterval: 10 35 | onSubmissionFailureRetries: 5 36 | onSubmissionFailureRetryInterval: 20 37 | driver: 38 | cores: 1 39 | coreLimit: "1200m" 40 | memory: "512m" 41 | labels: 42 | version: 3.1.1 43 | serviceAccount: spark 44 | executor: 45 | cores: 1 46 | instances: 1 47 | memory: "512m" 48 | labels: 49 | version: 3.1.1 -------------------------------------------------------------------------------- /helm-charts/mlflow-server/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1. Get the application URL by running these commands: 2 | {{- if .Values.ingress.enabled }} 3 | {{- range $host := .Values.ingress.hosts }} 4 | {{- range .paths }} 5 | http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} 6 | {{- end }} 7 | {{- end }} 8 | {{- else if contains "NodePort" .Values.service.type }} 9 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "mlflow-server.fullname" . }}) 10 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 11 | echo http://$NODE_IP:$NODE_PORT 12 | {{- else if contains "LoadBalancer" .Values.service.type }} 13 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 14 | You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "mlflow-server.fullname" . }}' 15 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "mlflow-server.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") 16 | echo http://$SERVICE_IP:{{ .Values.service.port }} 17 | {{- else if contains "ClusterIP" .Values.service.type }} 18 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "mlflow-server.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") 19 | export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") 20 | echo "Visit http://127.0.0.1:8080 to use your application" 21 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT 22 | {{- end }} 23 | -------------------------------------------------------------------------------- /helm-charts/mlflow-server/templates/secret-env.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 
5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | apiVersion: v1 18 | kind: Secret 19 | metadata: 20 | name: {{ template "mlflow-server.fullname" . }}-env 21 | labels: 22 | app: {{ template "mlflow-server.fullname" . }} 23 | chart: {{ template "mlflow-server.chart" . }} 24 | release: "{{ .Release.Name }}" 25 | heritage: "{{ .Release.Service }}" 26 | namespace: {{ .Release.Namespace }} 27 | type: Opaque 28 | stringData: 29 | DB_HOST: {{ tpl .Values.database.host . | quote }} 30 | DB_PORT: {{ .Values.database.port | quote }} 31 | DB_USER: {{ .Values.database.db_user | quote }} 32 | DB_PASS: {{ .Values.database.db_pass | quote }} 33 | DB_NAME: {{ .Values.database.db_name | quote }} 34 | DATABASE_URL: "postgresql+psycopg2://{{ .Values.database.user }}:{{ .Values.database.password }}@{{ .Values.database.host }}:{{ .Values.database.port }}/{{ .Values.database.name }}" 35 | {{- if .Values.extraSecretEnv }} 36 | {{- range $key, $value := .Values.extraSecretEnv }} 37 | {{ $key }}: {{ $value | quote }} 38 | {{- end }} 39 | {{- end }} -------------------------------------------------------------------------------- /docs/03-setting-up-apachespark-k8s.md: -------------------------------------------------------------------------------- 1 | # Setting up Apache Spark in Kubernetes 2 | 3 | # SPARK ON k8s TUTORIAL 4 | [Check Youtube Video For Setting up Spark](https://www.youtube.com/watch?v=1CGGTMvy67c) 5 | 6 | ## Setting up Spark Using Helm 7 | 8 | * Go to `rbac/spark-rbac.yaml` 9 | 10 | _[RBAC](https://kubernetes.io/docs/reference/access-authn-authz/rbac/) is Role Based Access Control to define user access privileges. 11 | K8s RBAC is REST based and maps HTTP verbs to permissions_ 12 | > A RoleBinding grants permissions within a specific namespace whereas a ClusterRoleBinding grants that access cluster-wide 13 | 14 | 15 | * Go to `helm_values/sparkoperator_values.yaml`. Read [more](https://github.com/GoogleCloudPlatform/spark-on-k8s-operator). 16 | 17 | ### Exploring sparkoperator_values.yaml 18 | 1. Spark createRole and createClusterRole are set to `true` 19 | 2. For now, we didn't enable monitoring using Grafana or an external service, so metrics & podMonitor are set to `false` 20 | 3. __resources__ depends entirely on the available system/Docker capacity; change it accordingly 21 | ``` 22 | resources: 23 | limits: 24 | cpu: 2000m 25 | memory: 8000Mi 26 | requests: 27 | cpu: 200m 28 | memory: 100Mi 29 | ``` 30 | * Execute the Spark Operator helm install 31 | ``` 32 | $ helm repo add spark-operator https://googlecloudplatform.github.io/spark-on-k8s-operator 33 | 34 | $ helm install spark-operator spark-operator/spark-operator -n default -f sparkoperator_values.yaml --create-namespace 35 | ``` 36 | 37 | 38 | __Spark will create all pods inside _spark_ namespace only__ 39 | 40 | ## Test 1 41 | * Test the application by running `kubectl apply -f examples/spark/pi.yaml -n default`.
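Assuming the driver pod keeps the operator's default `<application-name>-driver` naming (here `pyspark-pi-driver`, from the `pyspark-pi` metadata in `pi.yaml`), one way to follow the run is:
```
$ kubectl get sparkapplications pyspark-pi -n default

$ kubectl logs -f pyspark-pi-driver -n default
```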
Check the logs, a pi value will be logged if passed 42 | 43 | ## Test 2 44 | * Login to __minio__ and choose `test-files` bucket 45 | * Upload any temporary file in the bucket 46 | * Go to the director ../examples/spark 47 | * Execute the spark wordcount job `kubectl apply -f examples/spark/wordcount.yaml -n default` 48 | Spark should be able to read from minio which works like AWS s3 -------------------------------------------------------------------------------- /helm-charts/mlflow-server/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "mlflow-server.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "mlflow-server.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "mlflow-server.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "mlflow-server.labels" -}} 37 | helm.sh/chart: {{ include "mlflow-server.chart" . }} 38 | {{ include "mlflow-server.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "mlflow-server.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "mlflow-server.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "mlflow-server.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "mlflow-server.fullname" .) 
.Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} -------------------------------------------------------------------------------- /dags/example/example_pod.py: -------------------------------------------------------------------------------- 1 | import time 2 | from airflow import DAG 3 | from airflow.models import Variable 4 | from airflow.operators.python import PythonOperator 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator 7 | from airflow.utils.dates import datetime 8 | from airflow.configuration import conf 9 | 10 | default_args = { 11 | 'owner': 'abc', 12 | 'team': 'ABC Exmple', 13 | 'depends_on_past': False, 14 | 'start_date': datetime.utcnow(), 15 | 'email': ['abhishek.create@gmail.com'], 16 | 'email_on_failure': False, 17 | 'email_on_retry': False, 18 | 'max_active_runs': 1, 19 | } 20 | 21 | # {"table_name":"example_abc","input_file":"s3://developement-purpose/user/abhishek_choudhary/samples_output/username-password-partition/part=2019,NA"} 22 | 23 | with DAG( 24 | 'Pod_Example', 25 | default_args=default_args, 26 | description='Test ABC', 27 | schedule_interval=None, 28 | concurrency=10, 29 | tags=['abc', 'test','example'], 30 | ) as dag: 31 | 32 | start = DummyOperator(task_id='start', dag=dag, trigger_rule='all_success') 33 | 34 | test = KubernetesPodOperator( 35 | namespace='default', 36 | image="nginx:1.14.2", 37 | image_pull_policy="IfNotPresent", 38 | name="nginx_run", 39 | task_id="nginx_run", 40 | dag=dag, 41 | is_delete_operator_pod=False, 42 | in_cluster=True, 43 | startup_timeout_seconds=600, 44 | get_logs=True 45 | ) 46 | 47 | 48 | pytest = KubernetesPodOperator( 49 | namespace='default', 50 | image="python-docker:test", 51 | image_pull_policy="IfNotPresent", 52 | name="Pytest_Abc", 53 | task_id="Python_test", 54 | cmds=["/bin/sh", "-c", 55 | f"python -u test.py"], 56 | dag=dag, 57 | is_delete_operator_pod=False, 58 | in_cluster=True, 59 | startup_timeout_seconds=600, 60 | get_logs=True 61 | ) 62 | 63 | end = DummyOperator(task_id='end', dag=dag, trigger_rule='all_success') 64 | 65 | start >> test 66 | start >> pytest 67 | pytest >> end 68 | 69 | -------------------------------------------------------------------------------- /helm-charts/mlflow-server/templates/ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ingress.enabled -}} 2 | {{- $fullName := include "mlflow-server.fullname" . -}} 3 | {{- $svcPort := .Values.service.port -}} 4 | {{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} 5 | {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} 6 | {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} 7 | {{- end }} 8 | {{- end }} 9 | {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} 10 | apiVersion: networking.k8s.io/v1 11 | {{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} 12 | apiVersion: networking.k8s.io/v1beta1 13 | {{- else -}} 14 | apiVersion: extensions/v1beta1 15 | {{- end }} 16 | kind: Ingress 17 | metadata: 18 | name: {{ $fullName }} 19 | labels: 20 | {{- include "mlflow-server.labels" . | nindent 4 }} 21 | {{- with .Values.ingress.annotations }} 22 | annotations: 23 | {{- toYaml . 
| nindent 4 }} 24 | {{- end }} 25 | spec: 26 | {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} 27 | ingressClassName: {{ .Values.ingress.className }} 28 | {{- end }} 29 | {{- if .Values.ingress.tls }} 30 | tls: 31 | {{- range .Values.ingress.tls }} 32 | - hosts: 33 | {{- range .hosts }} 34 | - {{ . | quote }} 35 | {{- end }} 36 | secretName: {{ .secretName }} 37 | {{- end }} 38 | {{- end }} 39 | rules: 40 | {{- range .Values.ingress.hosts }} 41 | - host: {{ .host | quote }} 42 | http: 43 | paths: 44 | {{- range .paths }} 45 | - path: {{ .path }} 46 | {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} 47 | pathType: {{ .pathType }} 48 | {{- end }} 49 | backend: 50 | {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} 51 | service: 52 | name: {{ $fullName }} 53 | port: 54 | number: {{ $svcPort }} 55 | {{- else }} 56 | serviceName: {{ $fullName }} 57 | servicePort: {{ $svcPort }} 58 | {{- end }} 59 | {{- end }} 60 | {{- end }} 61 | {{- end }} 62 | -------------------------------------------------------------------------------- /docs/01-setting-up-cluster.md: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | 3 | Setting up a kind cluster, for example __abc__ 4 | 5 | > _kind usually prefixes the desired name with `kind-`, so if you are creating a cluster with name __abc__ the context name will be __kind-abc___ 6 | 7 | ## Install & Set Kind Cluster 8 | ``` 9 | $ kind create cluster --name abc 10 | 11 | $ kubectl config use-context kind-abc 12 | ``` 13 | 14 | ## Validating the kind Cluster 15 | `kind get clusters` 16 | 17 | > Finding the node names for a given cluster 18 | `kubectl get nodes -n default` 19 | 20 | > Checking the kind Cluster Context 21 | `kubectl cluster-info --context kind-abc` 22 | 23 | 24 | 25 | 26 | ## Loading The Desired images 27 | 28 | __kind recommends always loading Docker images locally and never using the `Always` image pull policy__ 29 | > NOTE: The Kubernetes default pull policy is IfNotPresent unless the image tag is :latest or omitted (and implicitly :latest) in which case the default policy is Always. IfNotPresent causes the Kubelet to skip pulling an image if it already exists. If you want those images loaded into node to work as expected, please: 30 | 31 | For Lab 1, you need the following Docker images available locally.
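The public images can be pulled ahead of time; a minimal sketch (tags copied from the load commands that follow; `pysparkexample:aws` is a locally built image rather than one pulled from a registry, and `Dockerfile.spark` in this repo layers the `pyspark_jobs` package on top of it):
```
docker pull gcr.io/spark-operator/spark-operator:3.1.1
docker pull docker.io/apache/airflow:2.2.4
docker pull docker.io/bitnami/postgresql:11.12.0-debian-10-r44
docker pull quay.io/minio/minio:RELEASE.2022-04-26T01-20-24Z
docker pull quay.io/minio/mc:RELEASE.2022-04-16T21-11-21Z
docker pull k8s.gcr.io/git-sync/git-sync:v3.4.0
```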
Once the images are available locally, load them into the kind cluster named `abc`: 32 | 33 | ``` 34 | kind load docker-image --name abc gcr.io/spark-operator/spark-operator:3.1.1 35 | kind load docker-image --name abc docker.io/apache/airflow:2.2.4 36 | kind load docker-image --name abc docker.io/apache/airflow-statsd-exporter-2021.04.28-v0.17.0 37 | kind load docker-image --name abc docker.io/bitnami/postgresql:11.12.0-debian-10-r44 38 | kind load docker-image --name abc gcr.io/spark-operator/spark-operator:3.1.1 39 | kind load docker-image --name abc quay.io/minio/mc:RELEASE.2022-04-16T21-11-21Z 40 | kind load docker-image --name abc quay.io/minio/minio:RELEASE.2022-04-26T01-20-24Z 41 | kind load docker-image --name abc pysparkexample:aws 42 | kind load docker-image --name abc k8s.gcr.io/git-sync/git-sync:v3.4.0 43 | ``` 44 | ### Verify all loaded Images 45 | 46 | `docker exec -it abc-control-plane crictl images` 47 | 48 | ## Validating the Cluster using Lens 49 | 50 | Launch Lens and validate the cluster; it will show no pods yet, only the nodes 51 | 52 | ![Nodes](images/lens_nodes.png) 53 | 54 | # Troubleshooting 55 | 56 | * If the cluster is not launching, check the Docker memory/CPU allocation 57 | * If port forwarding from Lens doesn't work properly, use the command line instead 58 | -------------------------------------------------------------------------------- /docs/05-cronjob-podcleaner.md: -------------------------------------------------------------------------------- 1 | # Kubernetes: Pod Cleaner 2 | 3 | _Cleaning all (or any) pods older than n days_ 4 | 5 | Cleaning pods is a pretty simple job, just run `kubectl delete pod --field-selector=status.phase==Succeeded`, but when the Kubernetes cluster is big and there are too many contributors, even deletion needs engineering! 6 | 7 | ## Pod Cleaner 8 | 9 | A very simple utility to delete pods. It's designed as a CronJob 10 | 11 | #### Example 12 | Delete all __Succeeded__ or __Failed__ pods @ 11:00 am every day 13 | 14 | ```yaml 15 | apiVersion: batch/v1 16 | kind: CronJob 17 | metadata: 18 | name: podcleaner 19 | namespace: default 20 | labels: 21 | app: podcleaner 22 | spec: 23 | schedule: "0 11 * * *" 24 | failedJobsHistoryLimit: 5 25 | successfulJobsHistoryLimit: 10 26 | jobTemplate: 27 | spec: 28 | template: 29 | spec: 30 | restartPolicy: OnFailure 31 | containers: 32 | - name: podcleaner 33 | imagePullPolicy: IfNotPresent 34 | image: buntha/podcleaner:0.2 35 | env: 36 | - name: MAX_DAYS 37 | value: "2" 38 | - name: POD_STATUS 39 | value: "Succeeded, Failed" 40 | - name: K8S_CONFIG 41 | value: "incluster" 42 | - name: NAMESPACE 43 | value: "default" 44 | ``` 45 | 46 | ##### Parameters 47 | * __MAX_DAYS__: Number of days since the pod reached the given state 48 | * __POD_STATUS__: Succeeded, Failed, Running 49 | * __NAMESPACE__: Pass the desired namespace or all 50 | * __K8S_CONFIG__: `incluster` or anything else 51 | 52 | 53 | ## Reference 54 | 55 | This tiny project is fully inspired by https://github.com/dignajar/clean-pods, but due to Kubernetes upgrades that project no longer works, so I needed to adapt it and create a new library. 56 | 57 | 58 | # Service account for Kubernetes 59 | Service account for the namespace demo with enough permissions to list and delete pods.
60 | 61 | Manifest service-account.yaml 62 | ```yaml 63 | --- 64 | apiVersion: v1 65 | kind: ServiceAccount 66 | metadata: 67 | name: demo-user 68 | namespace: demo 69 | 70 | --- 71 | kind: Role 72 | apiVersion: rbac.authorization.k8s.io/v1 73 | metadata: 74 | name: demo-user-role 75 | namespace: demo 76 | rules: 77 | - apiGroups: [""] 78 | resources: ["pods","pods/exec","pods/log"] 79 | verbs: ["*"] 80 | 81 | --- 82 | kind: RoleBinding 83 | apiVersion: rbac.authorization.k8s.io/v1 84 | metadata: 85 | name: demo-user 86 | namespace: demo 87 | subjects: 88 | - kind: ServiceAccount 89 | name: demo-user 90 | namespace: demo 91 | roleRef: 92 | apiGroup: rbac.authorization.k8s.io 93 | kind: Role 94 | name: demo-user-role 95 | ``` -------------------------------------------------------------------------------- /dags/basic/first_pyspark_pipeline.py: -------------------------------------------------------------------------------- 1 | import time 2 | from airflow import DAG 3 | from airflow.models import Variable 4 | from airflow.operators.python import PythonOperator 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator 7 | from airflow.utils.dates import datetime 8 | from airflow.configuration import conf 9 | from airflow.operators.bash import BashOperator 10 | from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator 11 | from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor 12 | from airflow.operators.python import BranchPythonOperator 13 | 14 | default_args = { 15 | 'owner': 'abc', 16 | 'team': 'dataengineering', 17 | 'depends_on_past': False, 18 | 'start_date': datetime.utcnow(), 19 | 'email': ['somebody@foo.com'], 20 | 'email_on_failure': False, 21 | 'email_on_retry': False, 22 | 'max_active_runs': 1, 23 | } 24 | 25 | with DAG( 26 | 'Basic_Transformation', 27 | default_args=default_args, 28 | description='Basics Transformation to work with CSV & parquet using PySpark', 29 | schedule_interval=None, 30 | concurrency=10, 31 | tags=['basic', 'pyspark'], 32 | ) as dag: 33 | 34 | start = DummyOperator(task_id='start', dag=dag, trigger_rule='all_success') 35 | 36 | def print_variables(**context): 37 | SOURCE = context["dag_run"].conf["source"] 38 | DESTINATION = context["dag_run"].conf["destination"] 39 | 40 | print('Source Path {0}'.format(SOURCE)) 41 | print('Destination Path {0}'.format(DESTINATION)) 42 | 43 | print_variable = PythonOperator( 44 | task_id='print_the_context', 45 | python_callable=print_variables, 46 | ) 47 | 48 | csv_to_parquet = SparkKubernetesOperator( 49 | task_id="csv_to_parquet", 50 | namespace="default", 51 | application_file="runners/csvtopqt.yaml", 52 | do_xcom_push=True 53 | ) 54 | 55 | csv_to_parquet_sensor = SparkKubernetesSensor( 56 | task_id="csv_to_parquet_sensor", 57 | namespace="default", 58 | attach_log=True, 59 | application_name="{{ ti.xcom_pull(task_ids='csv_to_parquet')['metadata']['name'] }}", 60 | kubernetes_conn_id="kubernetes_default" 61 | ) 62 | 63 | process_data = SparkKubernetesOperator( 64 | task_id="process_data", 65 | namespace="default", 66 | application_file="runners/accountbypartition.yaml", 67 | do_xcom_push=True 68 | ) 69 | 70 | process_data_sensor = SparkKubernetesSensor( 71 | task_id="process_data_sensor", 72 | namespace="default", 73 | attach_log=True, 74 | application_name="{{ ti.xcom_pull(task_ids='process_data')['metadata']['name'] }}", 75 | 
kubernetes_conn_id="kubernetes_default" 76 | ) 77 | 78 | end = DummyOperator(task_id='end', dag=dag, trigger_rule='all_success') 79 | 80 | start >> print_variable >> csv_to_parquet >> csv_to_parquet_sensor 81 | csv_to_parquet_sensor >> process_data >> process_data_sensor >> end -------------------------------------------------------------------------------- /helm-charts/mlflow-server/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ include "mlflow-server.fullname" . }} 5 | labels: 6 | {{- include "mlflow-server.labels" . | nindent 4 }} 7 | spec: 8 | {{- if not .Values.autoscaling.enabled }} 9 | replicas: {{ .Values.replicaCount }} 10 | {{- end }} 11 | selector: 12 | matchLabels: 13 | {{- include "mlflow-server.selectorLabels" . | nindent 6 }} 14 | template: 15 | metadata: 16 | {{- with .Values.podAnnotations }} 17 | annotations: 18 | {{- toYaml . | nindent 8 }} 19 | {{- end }} 20 | labels: 21 | {{- include "mlflow-server.selectorLabels" . | nindent 8 }} 22 | spec: 23 | {{- with .Values.imagePullSecrets }} 24 | imagePullSecrets: 25 | {{- toYaml . | nindent 8 }} 26 | {{- end }} 27 | {{- if or (.Values.serviceAccount.create) (.Values.serviceAccountName) }} 28 | serviceAccountName: {{ template "mlflow-server.serviceAccountName" . }} 29 | {{- end }} 30 | securityContext: 31 | {{- toYaml .Values.podSecurityContext | nindent 8 }} 32 | {{- if .Values.initContainers }} 33 | initContainers: 34 | {{- tpl (toYaml .Values.initContainers) . | nindent 6 }} 35 | {{- end }} 36 | containers: 37 | - name: {{ .Chart.Name }} 38 | securityContext: 39 | {{- toYaml .Values.securityContext | nindent 12 }} 40 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" 41 | imagePullPolicy: {{ .Values.image.pullPolicy }} 42 | env: 43 | - name: "MLFLOW_PORT" 44 | value: {{ .Values.service.port | quote}} 45 | - name: "ARTIFACT_STORE" 46 | value: {{ .Values.artifact.path | quote}} 47 | {{- range $key, $value := .Values.extraEnv }} 48 | - name: {{ $key | quote}} 49 | value: {{ $value | quote }} 50 | {{- end }} 51 | {{- if .Values.extraEnvRaw }} 52 | {{- toYaml .Values.extraEnvRaw | nindent 12 }} 53 | {{- end }} 54 | envFrom: 55 | - secretRef: 56 | name: {{ tpl .Values.envFromSecret . | quote }} 57 | {{- range .Values.envFromSecrets }} 58 | - secretRef: 59 | name: {{ tpl . $ | quote }} 60 | {{- end }} 61 | ports: 62 | - name: http 63 | containerPort: {{ .Values.service.port }} 64 | protocol: TCP 65 | livenessProbe: 66 | httpGet: 67 | path: / 68 | port: http 69 | readinessProbe: 70 | httpGet: 71 | path: / 72 | port: http 73 | resources: 74 | {{- toYaml .Values.resources | nindent 12 }} 75 | {{- with .Values.nodeSelector }} 76 | nodeSelector: 77 | {{- toYaml . | nindent 8 }} 78 | {{- end }} 79 | {{- with .Values.affinity }} 80 | affinity: 81 | {{- toYaml . | nindent 8 }} 82 | {{- end }} 83 | {{- with .Values.tolerations }} 84 | tolerations: 85 | {{- toYaml . 
| nindent 8 }} 86 | {{- end }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .idea/* 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | credentials 12 | credentials/ 13 | credentials/* 14 | /credentials 15 | /credentials/* 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | cover/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | db.sqlite3-journal 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | .pybuilder/ 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | # For a library or package, you might want to ignore these files since the code is 95 | # intended to run in multiple environments; otherwise, check them in: 96 | # .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # poetry 106 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 107 | # This is especially recommended for binary packages to ensure reproducibility, and is more 108 | # commonly ignored for libraries. 109 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 110 | #poetry.lock 111 | 112 | # pdm 113 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 114 | #pdm.lock 115 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 116 | # in version control. 117 | # https://pdm.fming.dev/#use-with-ide 118 | .pdm.toml 119 | 120 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 | 
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 | 
127 | # SageMath parsed files
128 | *.sage.py
129 | 
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 | 
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 | 
143 | # Rope project settings
144 | .ropeproject
145 | 
146 | # mkdocs documentation
147 | /site
148 | 
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 | 
154 | # Pyre type checker
155 | .pyre/
156 | 
157 | # pytype static type analyzer
158 | .pytype/
159 | 
160 | # Cython debug symbols
161 | cython_debug/
162 | 
163 | # PyCharm
164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | # and can be added to the global gitignore or merged into this file. For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | .DS_Store
170 | */.DS_Store
171 | .DS_Store/*
172 | 
173 | 
174 | # Ignore all charts Dir
175 | */charts/
176 | */*/charts/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data & Machine Learning - The Boring Way
2 | 
3 | This tutorial walks you through setting up and building a Data Engineering & Machine Learning Platform.
4 | The tutorial is designed to explore many different technologies for similar problems, without bias.
5 | 
6 | __This is not a Production Ready Setup__
7 | 
8 | ## Target Audience
9 | Data Engineers, Machine Learning Engineers, Data Scientists, SREs, Infrastructure Engineers, Data Analysts, Data Analytics Engineers
10 | 
11 | # Expected Technologies & Workflow
12 | 
13 | ## Data Engineering & Analytics
14 | - [X] Kubernetes Kind Installation [link](/docs/01-setting-up-cluster.md)
15 | - [X] [MinIO](https://min.io/) Integrate object storage on top of Kubernetes and use the MinIO interface to simulate S3 [link](/docs/02-setting-up-minio.md)
16 | - [X] [Apache Airflow](https://airflow.apache.org/) on top of Kubernetes & running an end-to-end Airflow workflow using the Kubernetes Executor [link](docs/04-setting-up-airflow.md)
17 | - [X] [Apache Spark](https://spark.apache.org/) Deploy Apache Spark on Kubernetes and run an example [link](/docs/03-setting-up-apachespark-k8s.md)
18 | - [ ] [Prefect](https://www.prefect.io/) Setup & Running an end to end Workflow
19 | - [ ] [Dagster](https://dagster.io/) Setup & Running an end to end Workflow
20 | - [ ] Set up an ETL job running end-2-end on Apache Airflow. This job contains Spark & Python Operators
21 | - [ ] [Apache Hive](https://cwiki.apache.org/confluence/display/hive/design) Setting up Hive & Hive Metastore
22 | - [ ] Deploy Trino & Open Source [Presto](https://prestodb.io/) and run data analytics queries.
23 | - [ ] Integrate [Superset](https://superset.apache.org/) & [Metabase](https://www.metabase.com/) to run visualization. Integrate Presto with the visualization system.
24 | - [ ] Open Table Format using [Delta](https://docs.delta.io/latest/quick-start.html)
25 | - [ ] Open Table Format using [Apache Iceberg](https://iceberg.apache.org/)
26 | - [ ] Open Table Format using [Apache Hudi](https://hudi.apache.org/)
27 | - [ ] Metadata Management using [Amundsen](https://www.amundsen.io/)
28 | - [ ] Metadata Management using [Datahub](https://datahubproject.io/)
29 | - [ ] Setting up [Apache Kafka](https://kafka.apache.org/), a distributed event streaming platform
30 | - [ ] Using Spark Structured Streaming to run an end-2-end pipeline over realtime data sources
31 | - [ ] Using [Apache Flink](https://flink.apache.org/) to run an end-2-end pipeline over realtime data sources
32 | - [ ] [Redpanda](https://redpanda.com/), a streaming data platform, to run a similar workflow
33 | - [ ] [Airbyte](https://airbyte.com/) Data Integration platform
34 | - [ ] [Talend](https://www.talend.com/products/data-integration/) UI based Data Integration
35 | - [ ] [DBT](https://www.getdbt.com/) SQL pipelines to compare with Spark and other tech
36 | - [ ] [Debezium](https://debezium.io/) Change Data Capture using Debezium to sync multiple databases
37 | 
38 | ## Monitoring & Observability
39 | - [ ] [Grafana](https://grafana.com/) Setting up Grafana for monitoring components. Start with monitoring Pods
40 | - [ ] [FluentD](https://www.fluentd.org/) Log metrics from pods & integrate them with the monitoring layer
41 | - [ ] Setting up a full Monitoring and Alerting Platform & integrate monitoring across other technologies
42 | - [ ] Setting up an Observability system
43 | 
44 | ## Machine Learning
45 | - [ ] Setup [Ray](https://www.ray.io/) for Data Transformations
46 | - [ ] Use [Scikit-learn](https://scikit-learn.org/) for an example ML training
47 | - [ ] Setup [Argo Pipeline](https://argoproj.github.io/) for deploying ML Jobs
48 | - [ ] Setup [Flyte](https://flyte.org/) Orchestrator for pythonic Deployment
49 | - [ ] Use [PyTorch Lightning](https://www.pytorchlightning.ai/) for running ML training
50 | - [ ] Use TensorFlow for running ML training
51 | - [ ] Setup ML End-2-End Workflow on Flyte
52 | - [ ] Deploy [MLFlow](https://www.mlflow.org/docs/latest/index.html) for ML Model Tracking & Experimentation
53 | - [ ] Deploy [BentoML](https://www.bentoml.com/) for deploying ML Models
54 | - [ ] Deploy [Seldon Core](https://github.com/SeldonIO/seldon-core) for ML Model Management
55 | - [ ] Integrate MLflow with Seldon Core
56 | 
57 | ## Prerequisites
58 | * 🐳 Docker Installed
59 | * [kubectl](https://kubernetes.io/docs/tasks/tools/) Installed, The Kubernetes command-line tool, kubectl, allows you to run commands against Kubernetes clusters
60 | * [Lens](https://k8slens.dev/) Installed, UI for Kubernetes.
61 | _This is optional, kubectl is enough for getting all relevant stats from kubernetes cluster_ 62 | * [Helm](https://helm.sh/) The package manager for Kubernetes 63 | 64 | ## Lab Basic Setup 65 | * [Setting Up Kind](https://kind.sigs.k8s.io/docs/user/quick-start/) 66 | * Deleting older Pods [PodCleaner](/docs/05-cronjob-podcleaner.md) -------------------------------------------------------------------------------- /helm-charts/mlflow-server/README.md: -------------------------------------------------------------------------------- 1 | # mlflow-server 2 | 3 | [![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/mlflowserver)](https://artifacthub.io/packages/search?repo=mlflowserver) 4 | 5 | ![Version: 0.1.8](https://img.shields.io/badge/Version-0.1.8-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.1.1](https://img.shields.io/badge/AppVersion-2.1.1-informational?style=flat-square) 6 | 7 | A Helm chart for MLFlow On Kubernetes 8 | 9 | **Homepage:** 10 | 11 | ## Maintainers 12 | 13 | | Name | Email | Url | 14 | | ---- | ------ | --- | 15 | | ABC | | | 16 | 17 | ## Source Code 18 | 19 | * 20 | * 21 | 22 | ## Requirements 23 | 24 | | Repository | Name | Version | 25 | |------------|------|---------| 26 | | https://charts.bitnami.com/bitnami | postgresql | 11.9.6 | 27 | 28 | ## Values 29 | 30 | | Key | Type | Default | Description | 31 | |-----|------|---------|-------------| 32 | | affinity | object | `{}` | | 33 | | artifact.path | string | `"/tmp/abc"` | | 34 | | autoscaling.enabled | bool | `false` | | 35 | | autoscaling.maxReplicas | int | `100` | | 36 | | autoscaling.minReplicas | int | `1` | | 37 | | autoscaling.targetCPUUtilizationPercentage | int | `80` | | 38 | | database.db_name | string | `"mlflow"` | | 39 | | database.db_pass | string | `"mlflow"` | | 40 | | database.db_user | string | `"mlflow"` | | 41 | | database.host | string | `"abc-postgresql"` | | 42 | | database.name | string | `"mlflow"` | | 43 | | database.password | string | `"mlflow"` | | 44 | | database.port | string | `"5432"` | | 45 | | database.user | string | `"mlflow"` | | 46 | | database.valueFromSecretKey | string | `nil` | | 47 | | database.valueFromSecretName | string | `nil` | | 48 | | envFromSecret | string | `"{{ template \"mlflow-server.fullname\" . }}-env"` | | 49 | | envFromSecret | string | `"{{ template \"mlflow-server.fullname\" . }}-env"` | | 50 | | extraEnv | object | `{}` | | 51 | | extraEnvRaw | list | `[]` | | 52 | | fullnameOverride | string | `""` | | 53 | | image.pullPolicy | string | `"IfNotPresent"` | | 54 | | image.repository | string | `"mlflow"` | | 55 | | image.tag | string | `"2.1.1"` | | 56 | | imagePullSecrets | list | `[]` | | 57 | | ingress.annotations | object | `{}` | | 58 | | ingress.className | string | `""` | | 59 | | ingress.enabled | bool | `false` | | 60 | | ingress.hosts[0].host | string | `"chart-example.local"` | | 61 | | ingress.hosts[0].paths[0].path | string | `"/"` | | 62 | | ingress.hosts[0].paths[0].pathType | string | `"ImplementationSpecific"` | | 63 | | ingress.tls | list | `[]` | | 64 | | initContainers[0].command[0] | string | `"/bin/sh"` | | 65 | | initContainers[0].command[1] | string | `"-c"` | | 66 | | initContainers[0].command[2] | string | `"dockerize -wait \"tcp://$DB_HOST:$DB_PORT\" -timeout 120s"` | | 67 | | initContainers[0].envFrom[0].secretRef.name | string | `"{{ tpl .Values.envFromSecret . 
}}"` | | 68 | | initContainers[0].image | string | `"{{ .Values.initImage.repository }}:{{ .Values.initImage.tag }}"` | | 69 | | initContainers[0].imagePullPolicy | string | `"{{ .Values.initImage.pullPolicy }}"` | | 70 | | initContainers[0].name | string | `"wait-for-postgres"` | | 71 | | initImage.pullPolicy | string | `"IfNotPresent"` | | 72 | | initImage.repository | string | `"jwilder/dockerize"` | | 73 | | initImage.tag | string | `"latest"` | | 74 | | nameOverride | string | `""` | | 75 | | nodeSelector | object | `{}` | | 76 | | podAnnotations | object | `{}` | | 77 | | podSecurityContext | object | `{}` | | 78 | | postgresql.auth.database | string | `"mlflow"` | | 79 | | postgresql.auth.existingSecret | string | `nil` | | 80 | | postgresql.auth.password | string | `"mlflow"` | | 81 | | postgresql.auth.username | string | `"mlflow"` | | 82 | | postgresql.enabled | bool | `true` | | 83 | | postgresql.primary.persistence.accessModes[0] | string | `"ReadWriteOnce"` | | 84 | | postgresql.primary.persistence.enabled | bool | `true` | | 85 | | postgresql.primary.service.ports.postgresql | string | `"5432"` | | 86 | | replicaCount | int | `1` | | 87 | | resources | object | `{}` | | 88 | | securityContext | object | `{}` | | 89 | | service.port | int | `5000` | | 90 | | service.type | string | `"ClusterIP"` | | 91 | | serviceAccount.annotations | object | `{}` | | 92 | | serviceAccount.create | bool | `true` | | 93 | | serviceAccount.name | string | `""` | | 94 | | serviceAccountName | string | `"mlflow"` | | 95 | | tolerations | list | `[]` | | 96 | 97 | ---------------------------------------------- 98 | Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) 99 | 100 | _generated using `helm-docs --dry-run`_ -------------------------------------------------------------------------------- /helm-charts/mlflow-server/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for mlflow-server. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | # helm upgrade --install --values mlflow-server/values.yaml abc mlflow-server/ -n datascience 5 | # helm template mlflow-server/values.yaml abc mlflow-server/ -n default > abc.yaml 6 | 7 | replicaCount: 1 8 | 9 | image: 10 | repository: buntha/mlflow 11 | pullPolicy: IfNotPresent 12 | # Overrides the image tag whose default is the chart appVersion. 13 | tag: "2.1.1" 14 | 15 | initImage: 16 | repository: jwilder/dockerize 17 | tag: latest 18 | pullPolicy: IfNotPresent 19 | 20 | imagePullSecrets: [] 21 | nameOverride: "" 22 | fullnameOverride: "" 23 | 24 | serviceAccountName: 'mlflow' 25 | serviceAccount: 26 | # Specifies whether a service account should be created 27 | create: true 28 | # Annotations to add to the service account 29 | annotations: {} 30 | # The name of the service account to use. 31 | # If not set and create is true, a name is generated using the fullname template 32 | name: "" 33 | 34 | podAnnotations: {} 35 | 36 | podSecurityContext: {} 37 | # fsGroup: 2000 38 | 39 | envFromSecret: '{{ template "mlflow-server.fullname" . 
}}-env' 40 | 41 | securityContext: {} 42 | # capabilities: 43 | # drop: 44 | # - ALL 45 | # readOnlyRootFilesystem: true 46 | # runAsNonRoot: true 47 | # runAsUser: 1000 48 | 49 | 50 | ## Extra environment variables that will be passed into pods 51 | ## 52 | extraEnv: 53 | {} 54 | 55 | ## Extra environment variables in RAW format that will be passed into pods 56 | ## 57 | extraEnvRaw: 58 | [] 59 | # Load DB password from other secret (e.g. for zalando operator) 60 | # - name: DB_PASS 61 | # valueFrom: 62 | # secretKeyRef: 63 | # name: superset.superset-postgres.credentials.postgresql.acid.zalan.do 64 | # key: password 65 | 66 | 67 | ## The name of the secret which we will use to populate env vars in deployed pods 68 | ## This can be useful for secret keys, etc. 69 | ## 70 | envFromSecret: '{{ template "mlflow-server.fullname" . }}-env' 71 | # This can be a list of template strings 72 | # envFromSecrets: [] 73 | 74 | artifact: 75 | path: "/tmp/abc" 76 | 77 | # You need to change below configuration incase bringing own PostgresSQL instance and also set postgresql.enabled:false 78 | database: 79 | name: mlflow 80 | # You need to change below configuration incase bringing own PostgresSQL instance and also set postgresql.enabled:false 81 | host: 'abc-postgresql' 82 | port: "5432" 83 | db_user: mlflow 84 | db_pass: mlflow 85 | db_name: mlflow 86 | user: mlflow 87 | password: mlflow 88 | # If user provides secret ref 89 | valueFromSecretName: ~ # Secret file holding the database credentials 90 | valueFromSecretKey: ~ # Key inside the secret file 91 | 92 | service: 93 | type: ClusterIP 94 | port: 5000 95 | 96 | 97 | initContainers: 98 | - name: wait-for-postgres 99 | image: "{{ .Values.initImage.repository }}:{{ .Values.initImage.tag }}" 100 | imagePullPolicy: "{{ .Values.initImage.pullPolicy }}" 101 | envFrom: 102 | - secretRef: 103 | name: "{{ tpl .Values.envFromSecret . }}" 104 | command: 105 | - /bin/sh 106 | - -c 107 | - dockerize -wait "tcp://$DB_HOST:$DB_PORT" -timeout 120s 108 | 109 | ingress: 110 | enabled: false 111 | className: "" 112 | annotations: {} 113 | # kubernetes.io/ingress.class: nginx 114 | # kubernetes.io/tls-acme: "true" 115 | hosts: 116 | - host: chart-example.local 117 | paths: 118 | - path: / 119 | pathType: ImplementationSpecific 120 | tls: [] 121 | # - secretName: chart-example-tls 122 | # hosts: 123 | # - chart-example.local 124 | 125 | resources: {} 126 | # We usually recommend not to specify default resources and to leave this as a conscious 127 | # choice for the user. This also increases chances charts run on environments with little 128 | # resources, such as Minikube. If you do want to specify resources, uncomment the following 129 | # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 130 | # limits: 131 | # cpu: 100m 132 | # memory: 128Mi 133 | # requests: 134 | # cpu: 100m 135 | # memory: 128Mi 136 | 137 | 138 | ## 139 | ## Configuration values for the postgresql dependency. 140 | ## ref: https://github.com/kubernetes/charts/blob/master/stable/postgresql/README.md 141 | postgresql: 142 | ## 143 | ## Use the PostgreSQL chart dependency. 144 | ## Set to false if bringing your own PostgreSQL. 145 | enabled: true 146 | 147 | ## Authentication parameters 148 | auth: 149 | ## The name of an existing secret that contains the postgres password. 150 | existingSecret: 151 | ## PostgreSQL name for a custom user to create 152 | username: mlflow 153 | ## PostgreSQL password for the custom user to create. 
Ignored if `auth.existingSecret` with key `password` is provided 154 | password: mlflow 155 | ## PostgreSQL name for a custom database to create 156 | database: mlflow 157 | 158 | 159 | ## PostgreSQL Primary parameters 160 | primary: 161 | ## 162 | ## Persistent Volume Storage configuration. 163 | ## ref: https://kubernetes.io/docs/user-guide/persistent-volumes 164 | persistence: 165 | ## 166 | ## Enable PostgreSQL persistence using Persistent Volume Claims. 167 | enabled: true 168 | ## 169 | ## Persistant class 170 | # storageClass: classname 171 | ## 172 | ## Access modes: 173 | accessModes: 174 | - ReadWriteOnce 175 | ## PostgreSQL port 176 | service: 177 | ports: 178 | postgresql: "5432" 179 | 180 | autoscaling: 181 | enabled: false 182 | minReplicas: 1 183 | maxReplicas: 100 184 | targetCPUUtilizationPercentage: 80 185 | # targetMemoryUtilizationPercentage: 80 186 | 187 | nodeSelector: {} 188 | 189 | tolerations: [] 190 | 191 | affinity: {} 192 | -------------------------------------------------------------------------------- /helm_values/sparkoperator_values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for spark-operator. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | # replicaCount -- Desired number of pods, leaderElection will be enabled 6 | # if this is greater than 1 7 | replicaCount: 1 8 | 9 | image: 10 | # -- Image repository 11 | repository: gcr.io/spark-operator/spark-operator 12 | # -- Image pull policy 13 | pullPolicy: Never 14 | # -- if set, override the image tag whose default is the chart appVersion. 15 | tag: 3.1.1 16 | 17 | # -- Image pull secrets 18 | imagePullSecrets: [] 19 | 20 | # -- String to partially override `spark-operator.fullname` template (will maintain the release name) 21 | nameOverride: "" 22 | 23 | # -- String to override release name 24 | fullnameOverride: "" 25 | 26 | rbac: 27 | # -- **DEPRECATED** use `createRole` and `createClusterRole` 28 | create: false 29 | # -- Create and use RBAC `Role` resources 30 | createRole: true 31 | # -- Create and use RBAC `ClusterRole` resources 32 | createClusterRole: true 33 | 34 | serviceAccounts: 35 | spark: 36 | # -- Create a service account for spark apps 37 | create: true 38 | # -- Optional name for the spark service account 39 | name: "spark" 40 | # -- Optional annotations for the spark service account 41 | annotations: {} 42 | sparkoperator: 43 | # -- Create a service account for the operator 44 | create: true 45 | # -- Optional name for the operator service account 46 | name: "spark-operator" 47 | # -- Optional annotations for the operator service account 48 | annotations: {} 49 | 50 | # -- Set this if running spark jobs in a different namespace than the operator 51 | sparkJobNamespace: "" 52 | 53 | # -- Operator concurrency, higher values might increase memory usage 54 | controllerThreads: 10 55 | 56 | # -- Operator resync interval. Note that the operator will respond to events (e.g. create, update) 57 | # unrelated to this setting 58 | resyncInterval: 10 59 | 60 | uiService: 61 | # -- Enable UI service creation for Spark application 62 | enable: true 63 | 64 | # -- Ingress URL format. 65 | # Requires the UI service to be enabled by setting `uiService.enable` to true. 
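# -- (Illustrative example added for clarity, not used in this lab) a typical value relies on the
# operator's template variables, e.g. something like:
#   ingressUrlFormat: "{{$appName}}.spark.example.local"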
66 | ingressUrlFormat: "" 67 | 68 | # -- Set higher levels for more verbose logging 69 | logLevel: 2 70 | 71 | # podSecurityContext -- Pod security context 72 | podSecurityContext: {} 73 | 74 | # securityContext -- Operator container security context 75 | securityContext: {} 76 | 77 | webhook: 78 | # -- Enable webhook server 79 | enable: false 80 | # -- Webhook service port 81 | port: 8080 82 | # -- The webhook server will only operate on namespaces with this label, specified in the form key1=value1,key2=value2. 83 | # Empty string (default) will operate on all namespaces 84 | namespaceSelector: "" 85 | # -- The annotations applied to init job, required to restore certs deleted by the cleanup job during upgrade 86 | initAnnotations: 87 | "helm.sh/hook": pre-install, pre-upgrade 88 | "helm.sh/hook-weight": "50" 89 | # -- The annotations applied to the cleanup job, required for helm lifecycle hooks 90 | cleanupAnnotations: 91 | "helm.sh/hook": pre-delete, pre-upgrade 92 | "helm.sh/hook-delete-policy": hook-succeeded 93 | # -- Webhook Timeout in seconds 94 | timeout: 30 95 | 96 | metrics: 97 | # -- Enable prometheus metric scraping 98 | enable: false 99 | # -- Metrics port 100 | port: 10254 101 | # -- Metrics port name 102 | portName: metrics 103 | # -- Metrics serving endpoint 104 | endpoint: /metrics 105 | # -- Metric prefix, will be added to all exported metrics 106 | prefix: "" 107 | 108 | # -- Prometheus pod monitor for operator's pod. 109 | podMonitor: 110 | # -- If enabled, a pod monitor for operator's pod will be submitted. Note that prometheus metrics should be enabled as well. 111 | enable: false 112 | # -- Pod monitor labels 113 | labels: {} 114 | # -- The label to use to retrieve the job name from 115 | jobLabel: spark-operator-podmonitor 116 | # -- Prometheus metrics endpoint properties. `metrics.portName` will be used as a port 117 | podMetricsEndpoint: 118 | scheme: http 119 | interval: 5s 120 | 121 | # nodeSelector -- Node labels for pod assignment 122 | nodeSelector: {} 123 | 124 | # tolerations -- List of node taints to tolerate 125 | tolerations: [] 126 | 127 | # affinity -- Affinity for pod assignment 128 | affinity: {} 129 | 130 | # podAnnotations -- Additional annotations to add to the pod 131 | podAnnotations: {} 132 | 133 | # podLabels -- Additional labels to add to the pod 134 | podLabels: {} 135 | 136 | # resources -- Pod resource requests and limits 137 | # Note, that each job submission will spawn a JVM within the Spark Operator Pod using "/usr/local/openjdk-11/bin/java -Xmx128m". 138 | # Kubernetes may kill these Java processes at will to enforce resource limits. When that happens, you will see the following error: 139 | # 'failed to run spark-submit for SparkApplication [...]: signal: killed' - when this happens, you may want to increase memory limits. 140 | resources: 141 | limits: 142 | cpu: 2000m 143 | memory: 8000Mi 144 | requests: 145 | cpu: 200m 146 | memory: 100Mi 147 | 148 | batchScheduler: 149 | # -- Enable batch scheduler for spark jobs scheduling. If enabled, users can specify batch scheduler name in spark application 150 | enable: false 151 | 152 | resourceQuotaEnforcement: 153 | # -- Whether to enable the ResourceQuota enforcement for SparkApplication resources. 154 | # Requires the webhook to be enabled by setting `webhook.enable` to true. 155 | # Ref: https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/blob/master/docs/user-guide.md#enabling-resource-quota-enforcement. 
156 | enable: false 157 | 158 | leaderElection: 159 | # -- Leader election lock name. 160 | # Ref: https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/blob/master/docs/user-guide.md#enabling-leader-election-for-high-availability. 161 | lockName: "spark-operator-lock" 162 | # -- Optionally store the lock in another namespace. Defaults to operator's namespace 163 | lockNamespace: "" 164 | 165 | istio: 166 | # -- When using `istio`, spark jobs need to run without a sidecar to properly terminate 167 | enabled: false 168 | 169 | # labelSelectorFilter -- A comma-separated list of key=value, or key labels to filter resources during watch and list based on the specified labels. 170 | labelSelectorFilter: "" -------------------------------------------------------------------------------- /helm_values/minio_values.yaml: -------------------------------------------------------------------------------- 1 | ## Provide a name in place of minio for `app:` labels 2 | ## 3 | nameOverride: "" 4 | 5 | ## Provide a name to substitute for the full names of resources 6 | ## 7 | fullnameOverride: "" 8 | 9 | ## set kubernetes cluster domain where minio is running 10 | ## 11 | clusterDomain: cluster.local 12 | 13 | ## Set default image, imageTag, and imagePullPolicy. mode is used to indicate the 14 | ## 15 | image: 16 | repository: quay.io/minio/minio 17 | tag: RELEASE.2022-04-26T01-20-24Z 18 | pullPolicy: IfNotPresent 19 | 20 | imagePullSecrets: [] 21 | # - name: "image-pull-secret" 22 | 23 | ## Set default image, imageTag, and imagePullPolicy for the `mc` (the minio 24 | ## client used to create a default bucket). 25 | ## 26 | mcImage: 27 | repository: quay.io/minio/mc 28 | tag: RELEASE.2022-04-16T21-11-21Z 29 | pullPolicy: IfNotPresent 30 | 31 | ## minio mode, i.e. standalone or distributed or gateway. 32 | mode: standalone ## other supported values are "standalone", "gateway" 33 | 34 | ## Additional labels to include with deployment or statefulset 35 | additionalLabels: [] 36 | 37 | ## Additional annotations to include with deployment or statefulset 38 | additionalAnnotations: [] 39 | 40 | ## Typically the deployment/statefulset includes checksums of secrets/config, 41 | ## So that when these change on a subsequent helm install, the deployment/statefulset 42 | ## is restarted. This can result in unnecessary restarts under GitOps tooling such as 43 | ## flux, so set to "true" to disable this behaviour. 44 | ignoreChartChecksums: false 45 | 46 | ## Additional arguments to pass to minio binary 47 | extraArgs: [] 48 | 49 | ## Port number for MinIO S3 API Access 50 | minioAPIPort: "9000" 51 | 52 | ## Port number for MinIO Browser COnsole Access 53 | minioConsolePort: "9001" 54 | 55 | ## Update strategy for Deployments 56 | DeploymentUpdate: 57 | type: RollingUpdate 58 | maxUnavailable: 0 59 | maxSurge: 100% 60 | 61 | ## Update strategy for StatefulSets 62 | StatefulSetUpdate: 63 | updateStrategy: RollingUpdate 64 | 65 | ## Pod priority settings 66 | ## ref: https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/ 67 | ## 68 | priorityClassName: "" 69 | 70 | ## Set default rootUser, rootPassword 71 | ## AccessKey and secretKey is generated when not set 72 | ## Distributed MinIO ref: https://docs.minio.io/docs/distributed-minio-quickstart-guide 73 | ## 74 | rootUser: "" 75 | rootPassword: "" 76 | 77 | ## Use existing Secret that store following variables: 78 | ## 79 | ## | Chart var | .data. 
in Secret | 80 | ## |:----------------------|:-------------------------| 81 | ## | rootUser | rootUser | 82 | ## | rootPassword | rootPassword | 83 | ## 84 | ## All mentioned variables will be ignored in values file. 85 | ## .data.rootUser and .data.rootPassword are mandatory, 86 | ## others depend on enabled status of corresponding sections. 87 | existingSecret: "" 88 | 89 | ## Directory on the MinIO pof 90 | certsPath: "/etc/minio/certs/" 91 | configPathmc: "/etc/minio/mc/" 92 | 93 | ## Path where PV would be mounted on the MinIO Pod 94 | mountPath: "/export" 95 | ## Override the root directory which the minio server should serve from. 96 | ## If left empty, it defaults to the value of {{ .Values.mountPath }} 97 | ## If defined, it must be a sub-directory of the path specified in {{ .Values.mountPath }} 98 | ## 99 | bucketRoot: "" 100 | 101 | # Number of drives attached to a node 102 | drivesPerNode: 1 103 | # Number of MinIO containers running 104 | replicas: 16 105 | # Number of expanded MinIO clusters 106 | pools: 1 107 | 108 | # Deploy if 'mode == gateway' - 4 replicas. 109 | gateway: 110 | type: "nas" # currently only "nas" are supported. 111 | replicas: 4 112 | 113 | ## TLS Settings for MinIO 114 | tls: 115 | enabled: false 116 | ## Create a secret with private.key and public.crt files and pass that here. Ref: https://github.com/minio/minio/tree/master/docs/tls/kubernetes#2-create-kubernetes-secret 117 | certSecret: "" 118 | publicCrt: public.crt 119 | privateKey: private.key 120 | 121 | ## Trusted Certificates Settings for MinIO. Ref: https://docs.minio.io/docs/how-to-secure-access-to-minio-server-with-tls#install-certificates-from-third-party-cas 122 | ## Bundle multiple trusted certificates into one secret and pass that here. Ref: https://github.com/minio/minio/tree/master/docs/tls/kubernetes#2-create-kubernetes-secret 123 | ## When using self-signed certificates, remember to include MinIO's own certificate in the bundle with key public.crt. 124 | ## If certSecret is left empty and tls is enabled, this chart installs the public certificate from .Values.tls.certSecret. 125 | trustedCertsSecret: "" 126 | 127 | ## Enable persistence using Persistent Volume Claims 128 | ## ref: http://kubernetes.io/docs/user-guide/persistent-volumes/ 129 | ## 130 | persistence: 131 | enabled: true 132 | annotations: {} 133 | 134 | ## A manually managed Persistent Volume and Claim 135 | ## Requires persistence.enabled: true 136 | ## If defined, PVC must be created manually before volume will be bound 137 | existingClaim: "" 138 | 139 | ## minio data Persistent Volume Storage Class 140 | ## If defined, storageClassName: 141 | ## If set to "-", storageClassName: "", which disables dynamic provisioning 142 | ## If undefined (the default) or set to null, no storageClassName spec is 143 | ## set, choosing the default provisioner. (gp2 on AWS, standard on 144 | ## GKE, AWS & OpenStack) 145 | ## 146 | ## Storage class of PV to bind. By default it looks for standard storage class. 147 | ## If the PV uses a different storage class, specify that here. 148 | storageClass: "" 149 | VolumeName: "" 150 | accessMode: ReadWriteOnce 151 | size: 1Gi 152 | 153 | ## If subPath is set mount a sub folder of a volume instead of the root of the volume. 154 | ## This is especially handy for volume plugins that don't natively support sub mounting (like glusterfs). 155 | ## 156 | subPath: "" 157 | 158 | ## Expose the MinIO service to be accessed from outside the cluster (LoadBalancer service). 
159 | ## or access it from within the cluster (ClusterIP service). Set the service type and the port to serve it. 160 | ## ref: http://kubernetes.io/docs/user-guide/services/ 161 | ## 162 | service: 163 | type: ClusterIP 164 | clusterIP: ~ 165 | ## Make sure to match it to minioAPIPort 166 | port: "9000" 167 | nodePort: 32000 168 | 169 | ## Configure Ingress based on the documentation here: https://kubernetes.io/docs/concepts/services-networking/ingress/ 170 | ## 171 | 172 | ingress: 173 | enabled: true 174 | # ingressClassName: "" 175 | labels: {} 176 | # node-role.kubernetes.io/ingress: platform 177 | 178 | annotations: {} 179 | # kubernetes.io/ingress.class: nginx 180 | # kubernetes.io/tls-acme: "true" 181 | # kubernetes.io/ingress.allow-http: "false" 182 | # kubernetes.io/ingress.global-static-ip-name: "" 183 | # nginx.ingress.kubernetes.io/secure-backends: "true" 184 | # nginx.ingress.kubernetes.io/backend-protocol: "HTTPS" 185 | # nginx.ingress.kubernetes.io/whitelist-source-range: 0.0.0.0/0 186 | path: / 187 | hosts: 188 | - minio-example.local 189 | tls: [] 190 | # - secretName: chart-example-tls 191 | # hosts: 192 | # - chart-example.local 193 | 194 | consoleService: 195 | type: ClusterIP 196 | clusterIP: ~ 197 | ## Make sure to match it to minioConsolePort 198 | port: "9001" 199 | nodePort: 32001 200 | 201 | consoleIngress: 202 | enabled: false 203 | # ingressClassName: "" 204 | labels: {} 205 | # node-role.kubernetes.io/ingress: platform 206 | 207 | annotations: {} 208 | # kubernetes.io/ingress.class: nginx 209 | # kubernetes.io/tls-acme: "true" 210 | # kubernetes.io/ingress.allow-http: "false" 211 | # kubernetes.io/ingress.global-static-ip-name: "" 212 | # nginx.ingress.kubernetes.io/secure-backends: "true" 213 | # nginx.ingress.kubernetes.io/backend-protocol: "HTTPS" 214 | # nginx.ingress.kubernetes.io/whitelist-source-range: 0.0.0.0/0 215 | path: / 216 | hosts: 217 | - console.minio-example.local 218 | tls: [] 219 | # - secretName: chart-example-tls 220 | # hosts: 221 | # - chart-example.local 222 | 223 | ## Node labels for pod assignment 224 | ## Ref: https://kubernetes.io/docs/user-guide/node-selection/ 225 | ## 226 | nodeSelector: {} 227 | tolerations: [] 228 | affinity: {} 229 | 230 | ## Add stateful containers to have security context, if enabled MinIO will run as this 231 | ## user and group NOTE: securityContext is only enabled if persistence.enabled=true 232 | securityContext: 233 | enabled: true 234 | runAsUser: 1000 235 | runAsGroup: 1000 236 | fsGroup: 1000 237 | fsGroupChangePolicy: "OnRootMismatch" 238 | 239 | # Additational pod annotations 240 | podAnnotations: {} 241 | 242 | # Additional pod labels 243 | podLabels: {} 244 | 245 | ## Configure resource requests and limits 246 | ## ref: http://kubernetes.io/docs/user-guide/compute-resources/ 247 | ## 248 | resources: 249 | requests: 250 | memory: 2Gi 251 | 252 | ## List of policies to be created after minio install 253 | ## 254 | ## In addition to default policies [readonly|readwrite|writeonly|consoleAdmin|diagnostics] 255 | ## you can define additional policies with custom supported actions and resources 256 | policies: [] 257 | ## writeexamplepolicy policy grants creation or deletion of buckets with name 258 | ## starting with example. In addition, grants objects write permissions on buckets starting with 259 | ## example. 
260 | # - name: writeexamplepolicy 261 | # statements: 262 | # - resources: 263 | # - 'arn:aws:s3:::example*/*' 264 | # actions: 265 | # - "s3:AbortMultipartUpload" 266 | # - "s3:GetObject" 267 | # - "s3:DeleteObject" 268 | # - "s3:PutObject" 269 | # - "s3:ListMultipartUploadParts" 270 | # - resources: 271 | # - 'arn:aws:s3:::example*' 272 | # actions: 273 | # - "s3:CreateBucket" 274 | # - "s3:DeleteBucket" 275 | # - "s3:GetBucketLocation" 276 | # - "s3:ListBucket" 277 | # - "s3:ListBucketMultipartUploads" 278 | ## readonlyexamplepolicy policy grants access to buckets with name starting with example. 279 | ## In addition, grants objects read permissions on buckets starting with example. 280 | # - name: readonlyexamplepolicy 281 | # statements: 282 | # - resources: 283 | # - 'arn:aws:s3:::example*/*' 284 | # actions: 285 | # - "s3:GetObject" 286 | # - resources: 287 | # - 'arn:aws:s3:::example*' 288 | # actions: 289 | # - "s3:GetBucketLocation" 290 | # - "s3:ListBucket" 291 | # - "s3:ListBucketMultipartUploads" 292 | ## Additional Annotations for the Kubernetes Job makePolicyJob 293 | makePolicyJob: 294 | podAnnotations: 295 | annotations: 296 | securityContext: 297 | enabled: false 298 | runAsUser: 1000 299 | runAsGroup: 1000 300 | fsGroup: 1000 301 | resources: 302 | requests: 303 | memory: 128Mi 304 | nodeSelector: {} 305 | tolerations: [] 306 | affinity: {} 307 | 308 | ## List of users to be created after minio install 309 | ## 310 | users: 311 | ## Username, password and policy to be assigned to the user 312 | ## Default policies are [readonly|readwrite|writeonly|consoleAdmin|diagnostics] 313 | ## Add new policies as explained here https://docs.min.io/docs/minio-multi-user-quickstart-guide.html 314 | ## NOTE: this will fail if LDAP is enabled in your MinIO deployment 315 | ## make sure to disable this if you are using LDAP. 
316 | - accessKey: developer 317 | secretKey: software123 318 | policy: consoleAdmin 319 | # Or you can refer to specific secret 320 | #- accessKey: externalSecret 321 | # existingSecret: my-secret 322 | # existingSecretKey: password 323 | # policy: readonly 324 | 325 | 326 | ## Additional Annotations for the Kubernetes Job makeUserJob 327 | makeUserJob: 328 | podAnnotations: 329 | annotations: 330 | securityContext: 331 | enabled: false 332 | runAsUser: 1000 333 | runAsGroup: 1000 334 | fsGroup: 1000 335 | resources: 336 | requests: 337 | memory: 128Mi 338 | nodeSelector: {} 339 | tolerations: [] 340 | affinity: {} 341 | 342 | ## List of buckets to be created after minio install 343 | ## 344 | buckets: 345 | - name: airflow-logs 346 | policy: none 347 | - name: test-files 348 | policy: none 349 | # # Name of the bucket 350 | # - name: bucket1 351 | # # Policy to be set on the 352 | # # bucket [none|download|upload|public] 353 | # policy: none 354 | # # Purge if bucket exists already 355 | # purge: false 356 | # # set versioning for 357 | # # bucket [true|false] 358 | # versioning: false 359 | # - name: bucket2 360 | # policy: none 361 | # purge: false 362 | # versioning: true 363 | 364 | ## Additional Annotations for the Kubernetes Job makeBucketJob 365 | makeBucketJob: 366 | podAnnotations: 367 | annotations: 368 | securityContext: 369 | enabled: false 370 | runAsUser: 1000 371 | runAsGroup: 1000 372 | fsGroup: 1000 373 | resources: 374 | requests: 375 | memory: 128Mi 376 | nodeSelector: {} 377 | tolerations: [] 378 | affinity: {} 379 | 380 | ## List of command to run after minio install 381 | ## NOTE: the mc command TARGET is always "myminio" 382 | customCommands: 383 | # - command: "admin policy set myminio consoleAdmin group='cn=ops,cn=groups,dc=example,dc=com'" 384 | 385 | ## Additional Annotations for the Kubernetes Job customCommandJob 386 | customCommandJob: 387 | podAnnotations: 388 | annotations: 389 | securityContext: 390 | enabled: false 391 | runAsUser: 1000 392 | runAsGroup: 1000 393 | fsGroup: 1000 394 | resources: 395 | requests: 396 | memory: 128Mi 397 | nodeSelector: {} 398 | tolerations: [] 399 | affinity: {} 400 | 401 | ## Use this field to add environment variables relevant to MinIO server. These fields will be passed on to MinIO container(s) 402 | ## when Chart is deployed 403 | environment: 404 | ## Please refer for comprehensive list https://docs.min.io/minio/baremetal/reference/minio-server/minio-server.html 405 | ## MINIO_SUBNET_LICENSE: "License key obtained from https://subnet.min.io" 406 | ## MINIO_BROWSER: "off" 407 | 408 | ## The name of a secret in the same kubernetes namespace which contain secret values 409 | ## This can be useful for LDAP password, etc 410 | ## The key in the secret must be 'config.env' 411 | ## 412 | # extraSecret: minio-extraenv 413 | 414 | networkPolicy: 415 | enabled: false 416 | allowExternal: true 417 | 418 | ## PodDisruptionBudget settings 419 | ## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/ 420 | ## 421 | podDisruptionBudget: 422 | enabled: false 423 | maxUnavailable: 1 424 | 425 | ## Specify the service account to use for the MinIO pods. If 'create' is set to 'false' 426 | ## and 'name' is left unspecified, the account 'default' will be used. 427 | serviceAccount: 428 | create: true 429 | ## The name of the service account to use. If 'create' is 'true', a service account with that name 430 | ## will be created. 
431 | name: "minio-sa" 432 | 433 | metrics: 434 | serviceMonitor: 435 | enabled: false 436 | public: true 437 | additionalLabels: {} 438 | relabelConfigs: {} 439 | # namespace: monitoring 440 | # interval: 30s 441 | # scrapeTimeout: 10s 442 | 443 | ## ETCD settings: https://github.com/minio/minio/blob/master/docs/sts/etcd.md 444 | ## Define endpoints to enable this section. 445 | etcd: 446 | endpoints: [] 447 | pathPrefix: "" 448 | corednsPathPrefix: "" 449 | clientCert: "" 450 | clientCertKey: "" 451 | 452 | -------------------------------------------------------------------------------- /helm_values/airflow_values.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | --- 18 | # Default values for airflow. 19 | # This is a YAML-formatted file. 20 | # Declare variables to be passed into your templates. 21 | 22 | # Provide a name to substitute for the full names of resources 23 | fullnameOverride: "" 24 | 25 | # Provide a name to substitute for the name of the chart 26 | nameOverride: "" 27 | 28 | # Provide a Kubernetes version (used for API Version selection) to override the auto-detected version 29 | kubeVersionOverride: "" 30 | 31 | # User and group of airflow user 32 | uid: 50000 33 | gid: 0 34 | 35 | # Default security context for airflow 36 | securityContext: {} 37 | # runAsUser: 50000 38 | # fsGroup: 0 39 | # runAsGroup: 0 40 | 41 | # Airflow home directory 42 | # Used for mount paths 43 | airflowHome: /opt/airflow 44 | 45 | # Default airflow repository -- overrides all the specific images below 46 | defaultAirflowRepository: apache/airflow 47 | 48 | # Default airflow tag to deploy 49 | defaultAirflowTag: "2.2.4" 50 | 51 | # Airflow version (Used to make some decisions based on Airflow Version being deployed) 52 | airflowVersion: "2.2.4" 53 | 54 | # Images 55 | images: 56 | airflow: 57 | repository: ~ 58 | tag: ~ 59 | pullPolicy: IfNotPresent 60 | # To avoid images with user code, you can turn this to 'true' and 61 | # all the 'run-airflow-migrations' and 'wait-for-airflow-migrations' containers/jobs 62 | # will use the images from 'defaultAirflowRepository:defaultAirflowTag' values 63 | # to run and wait for DB migrations . 
64 | useDefaultImageForMigration: false 65 | # timeout (in seconds) for airflow-migrations to complete 66 | migrationsWaitTimeout: 60 67 | pod_template: 68 | repository: ~ 69 | tag: ~ 70 | pullPolicy: IfNotPresent 71 | flower: 72 | repository: ~ 73 | tag: ~ 74 | pullPolicy: IfNotPresent 75 | statsd: 76 | repository: apache/airflow 77 | tag: airflow-statsd-exporter-2021.04.28-v0.17.0 78 | pullPolicy: IfNotPresent 79 | redis: 80 | repository: redis 81 | tag: 6-bullseye 82 | pullPolicy: IfNotPresent 83 | pgbouncer: 84 | repository: apache/airflow 85 | tag: airflow-pgbouncer-2021.04.28-1.14.0 86 | pullPolicy: IfNotPresent 87 | pgbouncerExporter: 88 | repository: apache/airflow 89 | tag: airflow-pgbouncer-exporter-2021.09.22-0.12.0 90 | pullPolicy: IfNotPresent 91 | gitSync: 92 | repository: k8s.gcr.io/git-sync/git-sync 93 | tag: v3.4.0 94 | pullPolicy: IfNotPresent 95 | 96 | # Select certain nodes for airflow pods. 97 | nodeSelector: {} 98 | affinity: {} 99 | tolerations: [] 100 | 101 | # Add common labels to all objects and pods defined in this chart. 102 | labels: {} 103 | 104 | # Ingress configuration 105 | ingress: 106 | # Enable ingress resource 107 | enabled: false 108 | 109 | # Configs for the Ingress of the web Service 110 | web: 111 | # Annotations for the web Ingress 112 | annotations: {} 113 | 114 | # The path for the web Ingress 115 | path: "/" 116 | 117 | # The pathType for the above path (used only with Kubernetes v1.19 and above) 118 | pathType: "ImplementationSpecific" 119 | 120 | # The hostname for the web Ingress (Deprecated - renamed to `ingress.web.hosts`) 121 | host: "" 122 | 123 | # The hostnames or hosts configuration for the web Ingress 124 | hosts: [] 125 | # - name: "" 126 | # # configs for web Ingress TLS 127 | # tls: 128 | # # Enable TLS termination for the web Ingress 129 | # enabled: false 130 | # # the name of a pre-created Secret containing a TLS private key and certificate 131 | # secretName: "" 132 | 133 | # The Ingress Class for the web Ingress (used only with Kubernetes v1.19 and above) 134 | ingressClassName: "" 135 | 136 | # configs for web Ingress TLS (Deprecated - renamed to `ingress.web.hosts[*].tls`) 137 | tls: 138 | # Enable TLS termination for the web Ingress 139 | enabled: false 140 | # the name of a pre-created Secret containing a TLS private key and certificate 141 | secretName: "" 142 | 143 | # HTTP paths to add to the web Ingress before the default path 144 | precedingPaths: [] 145 | 146 | # Http paths to add to the web Ingress after the default path 147 | succeedingPaths: [] 148 | 149 | # Configs for the Ingress of the flower Service 150 | flower: 151 | # Annotations for the flower Ingress 152 | annotations: {} 153 | 154 | # The path for the flower Ingress 155 | path: "/" 156 | 157 | # The pathType for the above path (used only with Kubernetes v1.19 and above) 158 | pathType: "ImplementationSpecific" 159 | 160 | # The hostname for the flower Ingress (Deprecated - renamed to `ingress.flower.hosts`) 161 | host: "" 162 | 163 | # The hostnames or hosts configuration for the flower Ingress 164 | hosts: [] 165 | # - name: "" 166 | # tls: 167 | # # Enable TLS termination for the flower Ingress 168 | # enabled: false 169 | # # the name of a pre-created Secret containing a TLS private key and certificate 170 | # secretName: "" 171 | 172 | # The Ingress Class for the flower Ingress (used only with Kubernetes v1.19 and above) 173 | ingressClassName: "" 174 | 175 | # configs for flower Ingress TLS (Deprecated - renamed to `ingress.flower.hosts[*].tls`) 176 
| tls: 177 | # Enable TLS termination for the flower Ingress 178 | enabled: false 179 | # the name of a pre-created Secret containing a TLS private key and certificate 180 | secretName: "" 181 | 182 | # Network policy configuration 183 | networkPolicies: 184 | # Enabled network policies 185 | enabled: false 186 | 187 | # Extra annotations to apply to all 188 | # Airflow pods 189 | airflowPodAnnotations: {} 190 | 191 | # Extra annotations to apply to 192 | # main Airflow configmap 193 | airflowConfigAnnotations: {} 194 | 195 | # `airflow_local_settings` file as a string (can be templated). 196 | airflowLocalSettings: |- 197 | {{- if semverCompare ">=2.2.0" .Values.airflowVersion }} 198 | {{- if not (or .Values.webserverSecretKey .Values.webserverSecretKeySecretName) }} 199 | from airflow.www.utils import UIAlert 200 | 201 | DASHBOARD_UIALERTS = [ 202 | UIAlert( 203 | 'Usage of a dynamic webserver secret key detected. We recommend a static webserver secret key instead.' 204 | ' See the ' 206 | 'Helm Chart Production Guide for more details.', 207 | category="warning", 208 | roles=["Admin"], 209 | html=True, 210 | ) 211 | ] 212 | {{- end }} 213 | {{- end }} 214 | 215 | # Enable RBAC (default on most clusters these days) 216 | rbac: 217 | # Specifies whether RBAC resources should be created 218 | create: true 219 | createSCCRoleBinding: false 220 | 221 | # Airflow executor 222 | # Options: LocalExecutor, CeleryExecutor, KubernetesExecutor, CeleryKubernetesExecutor 223 | executor: "KubernetesExecutor" 224 | 225 | # If this is true and using LocalExecutor/KubernetesExecutor/CeleryKubernetesExecutor, the scheduler's 226 | # service account will have access to communicate with the api-server and launch pods. 227 | # If this is true and using CeleryExecutor/KubernetesExecutor/CeleryKubernetesExecutor, the workers 228 | # will be able to launch pods. 229 | allowPodLaunching: true 230 | 231 | # Environment variables for all airflow containers 232 | env: 233 | - name: AIRFLOW__CORE__LOAD_EXAMPLES 234 | value: 'True' 235 | - name: AIRFLOW__KUBERNETES__DELETE_WORKER_PODS_ON_FAILURE 236 | value: 'False' 237 | - name: AIRFLOW__KUBERNETES__DELETE_WORKER_PODS 238 | value: 'False' 239 | - name: AIRFLOW__KUBERNETES__MULTI_NAMESPACE_MODE 240 | value: 'True' 241 | - name: AIRFLOW__LOGGING__REMOTE_LOGGING 242 | value: 'True' 243 | - name: AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID 244 | value: "s3_conn" 245 | - name: AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER 246 | value: "s3://airflow-logs" 247 | - name: AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL 248 | value: '60' 249 | - name: AIRFLOW__KUBERNETES__VERIFY_SSL 250 | value: 'False' 251 | 252 | # Secrets for all airflow containers 253 | secret: [] 254 | # - envName: "" 255 | # secretName: "" 256 | # secretKey: "" 257 | 258 | # Enables selected built-in secrets that are set via environment variables by default. 259 | # Those secrets are provided by the Helm Chart secrets by default but in some cases you 260 | # might want to provide some of those variables with _CMD or _SECRET variable, and you should 261 | # in this case disable setting of those variables by setting the relevant configuration to false. 
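# For example (sketch only, not required for this lab): to manage the metadata DB connection
# yourself, you could set AIRFLOW__CORE__SQL_ALCHEMY_CONN: false below and provide
# AIRFLOW__CORE__SQL_ALCHEMY_CONN_CMD (or the _SECRET variant) via extraEnv / extraEnvFrom instead.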
262 | enableBuiltInSecretEnvVars: 263 | AIRFLOW__CORE__FERNET_KEY: true 264 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: true 265 | AIRFLOW_CONN_AIRFLOW_DB: true 266 | AIRFLOW__WEBSERVER__SECRET_KEY: true 267 | AIRFLOW__CELERY__CELERY_RESULT_BACKEND: true 268 | AIRFLOW__CELERY__RESULT_BACKEND: true 269 | AIRFLOW__CELERY__BROKER_URL: true 270 | AIRFLOW__ELASTICSEARCH__HOST: true 271 | AIRFLOW__ELASTICSEARCH__ELASTICSEARCH_HOST: true 272 | 273 | # Extra secrets that will be managed by the chart 274 | # (You can use them with extraEnv or extraEnvFrom or some of the extraVolumes values). 275 | # The format is "key/value" where 276 | # * key (can be templated) is the name of the secret that will be created 277 | # * value: an object with the standard 'data' or 'stringData' key (or both). 278 | # The value associated with those keys must be a string (can be templated) 279 | extraSecrets: 280 | airflow-git-ssh-secret: 281 | data: | 282 | gitSshKey: 'LS0tLS1CRUdJTiBPUEVOU1NIIFBSSVZBVEUgS0VZLS0tLS0KYjNCbGJuTnphQzFyWlhrdGRqRUFBQUFBQkc1dmJtVUFBQUFFYm05dVpRQUFBQUFBQUFBQkFBQUNGd0FBQUFkemMyZ3RjbgpOaEFBQUFBd0VBQVFBQUFnRUF2V2NaS1IxKy94dHExOFR2NXZqNU4yTlhlNEtLNkxpYVhlSlJWanpwTXlDSVJTcCtVMjUxCldzVkxHeWprMU1CUDZnZVZzeXRTVDZ3a2RJYzBqcHJ2Qm94UkNNR1h3RDFyUEhURU1jMy9sSnBZRHpmZjliTjRZcHFTRnAKRjJVVTIrTHRJR2dqZGFUbk1MMG1ybjZ6WW1WbmZHclNsVnNhbXV5VjNEcGJzRnpaRGN1bStmR3dya2dvY2I4S1BNZGxUNgpjclpyK0QzbzZTcmZlWSt2eURxb0EzV0JYaWViSHdGNEJvT3VYN0NFSk5lV1hQV0FWOFJuOGlXbzhRWDJhK0FncEVZQ0dCCndYS2RtY1IyR1VqbXZsZzFtSUhsb3ZOa0JGbHJGZFViYU5ra1FhbHc2Zm9JaUhYVmQ3QkN3a09FM1psZU1NMFdWbENxME4KTnkxaVZCTzNWMFY3eU00RDhMVHNwS0diaGp1Qk1hdGZzMksveldVNnN5ekFoVVE0eU56bklOemMvVzV3RkdTY0ZjeFVpLwpCMklNTmZkaHEvUVk4anVUdE9sVzhRSllxa2N2Vkx4Umx1ZnVHLzZJKzhMSDlpUFMrc3JMNnE2SnNObVIwYWt1YmkyM1ZxCndKYURBbEluVXFCN3NVdE5rdTlHQk40U0NKN0RDam8rQ3dRc0FMdE1MRmQ5RUZ0U0ZhT0E0MVpONzZIQkp3RXJxbXZlODEKNVgrTDhaSkUwcTZxandZcGZVQ1JvaXdpSCsyZGl0UGM4TTUzNHpQcC9qZSsrS1FFUmVIMVJzVm5hZkN2b3ZMV3pZRFl2eApzNHQ0aHkxRGhFeVVNYWRNZ2ZJQjlzNllYNUZWS3MyQXFqcEtxdXVEenhaV2RXdlZGamNsMkpxM0Y0REFJVjQzMmw5cytTCjBBQUFkUTFlU3NpOVhrcklzQUFBQUhjM05vTFhKellRQUFBZ0VBdldjWktSMSsveHRxMThUdjV2ajVOMk5YZTRLSzZMaWEKWGVKUlZqenBNeUNJUlNwK1UyNTFXc1ZMR3lqazFNQlA2Z2VWc3l0U1Q2d2tkSWMwanBydkJveFJDTUdYd0QxclBIVEVNYwozL2xKcFlEemZmOWJONFlwcVNGcEYyVVUyK0x0SUdnamRhVG5NTDBtcm42elltVm5mR3JTbFZzYW11eVYzRHBic0Z6WkRjCnVtK2ZHd3JrZ29jYjhLUE1kbFQ2Y3JacitEM282U3JmZVkrdnlEcW9BM1dCWGllYkh3RjRCb091WDdDRUpOZVdYUFdBVjgKUm44aVdvOFFYMmErQWdwRVlDR0J3WEtkbWNSMkdVam12bGcxbUlIbG92TmtCRmxyRmRVYmFOa2tRYWx3NmZvSWlIWFZkNwpCQ3drT0UzWmxlTU0wV1ZsQ3EwTk55MWlWQk8zVjBWN3lNNEQ4TFRzcEtHYmhqdUJNYXRmczJLL3pXVTZzeXpBaFVRNHlOCnpuSU56Yy9XNXdGR1NjRmN4VWkvQjJJTU5mZGhxL1FZOGp1VHRPbFc4UUpZcWtjdlZMeFJsdWZ1Ry82SSs4TEg5aVBTK3MKckw2cTZKc05tUjBha3ViaTIzVnF3SmFEQWxJblVxQjdzVXROa3U5R0JONFNDSjdEQ2pvK0N3UXNBTHRNTEZkOUVGdFNGYQpPQTQxWk43NkhCSndFcnFtdmU4MTVYK0w4WkpFMHE2cWp3WXBmVUNSb2l3aUgrMmRpdFBjOE01MzR6UHAvamUrK0tRRVJlCkgxUnNWbmFmQ3ZvdkxXellEWXZ4czR0NGh5MURoRXlVTWFkTWdmSUI5czZZWDVGVktzMkFxanBLcXV1RHp4WldkV3ZWRmoKY2wySnEzRjREQUlWNDMybDlzK1MwQUFBQURBUUFCQUFBQ0FIQXpxMHdZRlN1N2FrZjRlc1l4cURHUHdEdUMwaGViaEpjcApnMVc3TDhzQjhyaEd2QjlneUZURm5BQ3k4UnkzOEorTGJydDlabjJjVk54VFhHYytrV3NucExtek5kckVoOHFBd2RsZlY1CkVJL2pIRnYvOG1HcU1QUG92d0VFK0o1cWlGOHlVZzlYYVQ0T1QrUC9XdlZuYlM3ZmVYRFJUNm1IamgrdzByQmYwSlhEbDEKOXFObWlJaHEwYnAwenFNckpGTGdqb1c0NTBsNkJIbFhEYU5vWEMvd1pIc0txM0pxeEZmWG1tMUZmSUwyMURTdG15SmpUOQpDN1RxRzZEb2xKZFF6Y0RGR01Fb3pRUlN4a09rVkUxZXBDRjhralUrU25ucEh1OElGL3k4NWlXZjR5dzQydVU4S1FHR2dZClhqeWNvZEdSZC81ME9xOWdaUHZTL3ZVZjJQUHRQTG9zTkFNM2hZeThHcnFHd3I5RjJ4cTZaVjZ3WVdrVERWdzNsNVdXQjEKVjN6
VXk5TEZGRU05TitVVTFXTHF2VzBsMXltM1VCeFQzRmlJblcraTlzRFhRQlpabWpOUGJVQzRmNHJzVURraDBPNGpRRwpjRFZvL2U3UU5meVR2eDBBdi9yOHFISUdVOUR1bmNEYWhUOWVBK0NNemtRQ0hmK0hTWDlRdko4TzR3UFhrMDNac3FDMW1aCmtkYkpyRm9MdjZ2dDU3Z0hyY3lacFRHUnEvV0NTUmlZWHRWd1dVRDVERStKNmRLcXlxZUJIQ0NYUTZpN25UWEdxTG9NNGUKdUdvQU84Uk1BNjBVK3ZDSnJ5SitoVWROQlJLRy9IbEhyVXRZYzVhK1hNOE5DeUMzYXdXK3lLZUt5bmcxSzlISjlQOU5HSwpnNUpYUzA3MEhiUnJZcXlkWmhBQUFCQVFEaldIN0VndEdQYndCV1YyWHpPdlVQdzBPUUxqNk5WYlVaSDZtdkhsenhZUkpVCnVNd0hqWEpNZjdFV3YwbHdZV3RHNXMzeFc3aGNTR0dFSXEyM0tTVi9IUFcvV0hxNlh2Z1ROdDI1dnpLQXFCdWdFM25PclgKbm0vSTNSUjN0NnVWN0pzYUo4dlVwTHJTbE5UazFIT05wUjV6dDJtTENTMW54L1FIUC9JZFpvR0NpWU1lRE50R1gvRDBXRAppNG5jMVF0d0thSktycUUyeE4xWmdZb05hNVJZYVlXUVhvV25uRXVMeHVoTHI2MS9VZUFVcXFuNjU0YnA5TWllNnJuazlwCjZVRVhDMGNybE1QZXI2dVo5UzM0VGhIdHFGS3hNNEwvbGJwank5Z3RSOWhQS2xOVHRENWlsY1I5ZHBlYTZvNS9EdWtnZE4Kb3NwRXMzMGpkaGRPUm5rRkFBQUJBUURrV3ZuNlZhRE1aRTEwM3p2K1Y0aFRoSkdXOUhUTU84SmdDRUNJMFgvZjhCelNCTApITFZGNHZBeG42bnFmMjUydFpkSWRpMndWWjJaWnd4N3h5ei9nK2IrLzJnV0JaVGtZcXFYUVU3SXFRbVRnZXIweW4yWjZjCmJpbUN5T09XSGIvYkhYR0pMT2I3eDUyekIxY0wySURubnFwT2htSEliSDdhcVJzNW1IN2FQRmJ4MTlyMWRuSmhCc2hNMXgKanNpd3JLYjk0SmQ5N1lnSUI5czZNZmU4bDBoNFFQVlFWbklGZzV4VG1xZENzWGVhWXMzb25SZUVVYms0am9oODJvWTVBMQpqSjkrUTRxZGxMWTNPODhMM1RTNUVsVGdadEJaWVdtSnpMdmpreHpPWkhRUm5kaVpkNWo2cjZnM3QwT3lJNFJxU3BOT3ByClovaFJnS3d6ZFhXM0NEQUFBQkFRRFVWTzMvRzFvZVl0djlsR3RiVFMwQlliSVdFU0RhMUF0bzBla0VzNzFmVUJiR1RhTGUKNjdmLzNSWkpObDRZdFBDUVlvc1hoZDNOYUlYL1NleXFKTTlPcHZzUVdKY0VTZ29oSjNlUWNuMWNhRVZCOG9yUUoveHhZdgp3WE01WFRUZFZKWnV2TUQ1WnB6T2ZyN3ZMMXVWN1FTL0lwaUhzMXNzdDd5L2NPWTB0ZFJzZ3hOM056NHZSSWxMTlZ6MHoxCjZkL2xoLzBwenN5V3NVaXlKNzllYUNkckVuZUVxbHMrMFczVllrWGl1bHA2SkZoTm5mMFo5VmNERlBUUUx2dFNhNUp5MUsKdms4cGVFUUV4R2hjOFkyWG9iZlRQR0xVWFl0eDJrN21RYWhwSDdLSW44QWFUaU1ZRlQ3ekY1azg2UmdXWlIxMzJFaGRFSQpLdWNTOEFMNXVHQ1BBQUFBR1dGaWFHbHphR1ZyTG1OeVpXRjBaVUJuYldGcGJDNWpiMjBCCi0tLS0tRU5EIE9QRU5TU0ggUFJJVkFURSBLRVktLS0tLQo=' 283 | # eg: 284 | # extraSecrets: 285 | # '{{ .Release.Name }}-airflow-connections': 286 | # type: 'Opaque' 287 | # data: | 288 | # AIRFLOW_CONN_GCP: 'base64_encoded_gcp_conn_string' 289 | # AIRFLOW_CONN_AWS: 'base64_encoded_aws_conn_string' 290 | # stringData: | 291 | # AIRFLOW_CONN_OTHER: 'other_conn' 292 | # '{{ .Release.Name }}-other-secret-name-suffix': 293 | # data: | 294 | # ... 295 | 296 | # Extra ConfigMaps that will be managed by the chart 297 | # (You can use them with extraEnv or extraEnvFrom or some of the extraVolumes values). 298 | # The format is "key/value" where 299 | # * key (can be templated) is the name of the configmap that will be created 300 | # * value: an object with the standard 'data' key. 301 | # The value associated with this keys must be a string (can be templated) 302 | extraConfigMaps: {} 303 | # eg: 304 | # extraConfigMaps: 305 | # '{{ .Release.Name }}-airflow-variables': 306 | # data: | 307 | # AIRFLOW_VAR_HELLO_MESSAGE: "Hi!" 308 | # AIRFLOW_VAR_KUBERNETES_NAMESPACE: "{{ .Release.Namespace }}" 309 | 310 | # Extra env 'items' that will be added to the definition of airflow containers 311 | # a string is expected (can be templated). 312 | # TODO: difference from `env`? This is a templated string. Probably should template `env` and remove this. 313 | # Get config from https://airflow.apache.org/docs/apache-airflow/stable/configurations-ref.html 314 | extraEnv: ~ 315 | 316 | 317 | 318 | 319 | # Extra envFrom 'items' that will be added to the definition of airflow containers 320 | # A string is expected (can be templated). 
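# eg (illustrative, for the `extraEnv` value above; a templated string is expected,
# mirroring the example shipped with the upstream chart):
# extraEnv: |
#   - name: AIRFLOW__CORE__LOAD_EXAMPLES
#     value: 'True'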
321 | extraEnvFrom: ~ 322 | # eg: 323 | # extraEnvFrom: | 324 | # - secretRef: 325 | # name: '{{ .Release.Name }}-airflow-connections' 326 | # - configMapRef: 327 | # name: '{{ .Release.Name }}-airflow-variables' 328 | 329 | # Airflow database & redis config 330 | data: 331 | # If secret names are provided, use those secrets 332 | metadataSecretName: ~ 333 | resultBackendSecretName: ~ 334 | brokerUrlSecretName: ~ 335 | 336 | # Otherwise pass connection values in 337 | metadataConnection: 338 | user: postgres 339 | pass: postgres 340 | protocol: postgresql 341 | host: ~ 342 | port: 5432 343 | db: postgres 344 | sslmode: disable 345 | # resultBackendConnection defaults to the same database as metadataConnection 346 | resultBackendConnection: ~ 347 | # or, you can use a different database 348 | # resultBackendConnection: 349 | # user: postgres 350 | # pass: postgres 351 | # protocol: postgresql 352 | # host: ~ 353 | # port: 5432 354 | # db: postgres 355 | # sslmode: disable 356 | # Note: brokerUrl can only be set during install, not upgrade 357 | brokerUrl: ~ 358 | 359 | # Fernet key settings 360 | # Note: fernetKey can only be set during install, not upgrade 361 | fernetKey: ~ 362 | fernetKeySecretName: ~ 363 | 364 | # Flask secret key for Airflow Webserver: `[webserver] secret_key` in airflow.cfg 365 | webserverSecretKey: "asdsdjhhjbas78sadnmbjcas&&jkasbjkjda" 366 | webserverSecretKeySecretName: ~ 367 | 368 | # In order to use kerberos you need to create secret containing the keytab file 369 | # The secret name should follow naming convention of the application where resources are 370 | # name {{ .Release-name }}-. In case of the keytab file, the postfix is "kerberos-keytab" 371 | # So if your release is named "my-release" the name of the secret should be "my-release-kerberos-keytab" 372 | # 373 | # The Keytab content should be available in the "kerberos.keytab" key of the secret. 374 | # 375 | # apiVersion: v1 376 | # kind: Secret 377 | # data: 378 | # kerberos.keytab: 379 | # type: Opaque 380 | # 381 | # 382 | # If you have such keytab file you can do it with similar 383 | # 384 | # kubectl create secret generic {{ .Release.name }}-kerberos-keytab --from-file=kerberos.keytab 385 | # 386 | # 387 | # Alternatively, instead of manually creating the secret, it is possible to specify 388 | # kerberos.keytabBase64Content parameter. This parameter should contain base64 encoded keytab. 389 | # 390 | 391 | kerberos: 392 | enabled: false 393 | ccacheMountPath: /var/kerberos-ccache 394 | ccacheFileName: cache 395 | configPath: /etc/krb5.conf 396 | keytabBase64Content: ~ 397 | keytabPath: /etc/airflow.keytab 398 | principal: airflow@FOO.COM 399 | reinitFrequency: 3600 400 | config: | 401 | # This is an example config showing how you can use templating and how "example" config 402 | # might look like. It works with the test kerberos server that we are using during integration 403 | # testing at Apache Airflow (see `scripts/ci/docker-compose/integration-kerberos.yml` but in 404 | # order to make it production-ready you must replace it with your own configuration that 405 | # Matches your kerberos deployment. Administrators of your Kerberos instance should 406 | # provide the right configuration. 407 | 408 | [logging] 409 | default = "FILE:{{ template "airflow_logs_no_quote" . }}/kerberos_libs.log" 410 | kdc = "FILE:{{ template "airflow_logs_no_quote" . }}/kerberos_kdc.log" 411 | admin_server = "FILE:{{ template "airflow_logs_no_quote" . 
}}/kadmind.log" 412 | 413 | [libdefaults] 414 | default_realm = FOO.COM 415 | ticket_lifetime = 10h 416 | renew_lifetime = 7d 417 | forwardable = true 418 | 419 | [realms] 420 | FOO.COM = { 421 | kdc = kdc-server.foo.com 422 | admin_server = admin_server.foo.com 423 | } 424 | 425 | # Airflow Worker Config 426 | workers: 427 | # Number of airflow celery workers in StatefulSet 428 | replicas: 1 429 | 430 | # Command to use when running Airflow workers (templated). 431 | command: ~ 432 | # Args to use when running Airflow workers (templated). 433 | args: 434 | - "bash" 435 | - "-c" 436 | # The format below is necessary to get `helm lint` happy 437 | - |- 438 | exec \ 439 | airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "celery worker" "worker" }} 440 | 441 | # Update Strategy when worker is deployed as a StatefulSet 442 | updateStrategy: ~ 443 | # Update Strategy when worker is deployed as a Deployment 444 | strategy: 445 | rollingUpdate: 446 | maxSurge: "100%" 447 | maxUnavailable: "50%" 448 | 449 | # When not set, the values defined in the global securityContext will be used 450 | securityContext: {} 451 | # runAsUser: 50000 452 | # fsGroup: 0 453 | # runAsGroup: 0 454 | 455 | # Create ServiceAccount 456 | serviceAccount: 457 | # Specifies whether a ServiceAccount should be created 458 | create: true 459 | # The name of the ServiceAccount to use. 460 | # If not set and create is true, a name is generated using the release name 461 | name: ~ 462 | 463 | # Annotations to add to worker kubernetes service account. 464 | annotations: {} 465 | 466 | # Allow KEDA autoscaling. 467 | # Persistence.enabled must be set to false to use KEDA. 468 | keda: 469 | enabled: false 470 | namespaceLabels: {} 471 | 472 | # How often KEDA polls the airflow DB to report new scale requests to the HPA 473 | pollingInterval: 5 474 | 475 | # How many seconds KEDA will wait before scaling to zero. 476 | # Note that HPA has a separate cooldown period for scale-downs 477 | cooldownPeriod: 30 478 | 479 | # Minimum number of workers created by keda 480 | minReplicaCount: 0 481 | 482 | # Maximum number of workers created by keda 483 | maxReplicaCount: 10 484 | 485 | persistence: 486 | # Enable persistent volumes 487 | enabled: true 488 | # Volume size for worker StatefulSet 489 | size: 500Mi 490 | # If using a custom storageClass, pass name ref to all statefulSets here 491 | storageClassName: 492 | # Execute init container to chown log directory. 493 | # This is currently only needed in kind, due to usage 494 | # of local-path provisioner. 495 | fixPermissions: false 496 | 497 | kerberosSidecar: 498 | # Enable kerberos sidecar 499 | enabled: false 500 | resources: {} 501 | # limits: 502 | # cpu: 100m 503 | # memory: 128Mi 504 | # requests: 505 | # cpu: 100m 506 | # memory: 128Mi 507 | 508 | resources: {} 509 | # limits: 510 | # cpu: 100m 511 | # memory: 128Mi 512 | # requests: 513 | # cpu: 100m 514 | # memory: 128Mi 515 | 516 | # Grace period for tasks to finish after SIGTERM is sent from kubernetes 517 | terminationGracePeriodSeconds: 600 518 | 519 | # This setting tells kubernetes that its ok to evict 520 | # when it wants to scale a node down. 521 | safeToEvict: true 522 | 523 | # Launch additional containers into worker. 524 | # Note: If used with KubernetesExecutor, you are responsible for signaling sidecars to exit when the main 525 | # container finishes so Airflow can continue the worker shutdown process! 526 | extraContainers: [] 527 | # Add additional init containers into workers. 
528 | extraInitContainers: [] 529 | 530 | # Mount additional volumes into worker. 531 | extraVolumes: [] 532 | extraVolumeMounts: [] 533 | 534 | # Select certain nodes for airflow worker pods. 535 | nodeSelector: {} 536 | priorityClassName: ~ 537 | affinity: {} 538 | # default worker affinity is: 539 | # podAntiAffinity: 540 | # preferredDuringSchedulingIgnoredDuringExecution: 541 | # - podAffinityTerm: 542 | # labelSelector: 543 | # matchLabels: 544 | # component: worker 545 | # topologyKey: kubernetes.io/hostname 546 | # weight: 100 547 | tolerations: [] 548 | # hostAliases to use in worker pods. 549 | # See: 550 | # https://kubernetes.io/docs/concepts/services-networking/add-entries-to-pod-etc-hosts-with-host-aliases/ 551 | hostAliases: [] 552 | # - ip: "127.0.0.2" 553 | # hostnames: 554 | # - "test.hostname.one" 555 | # - ip: "127.0.0.3" 556 | # hostnames: 557 | # - "test.hostname.two" 558 | 559 | podAnnotations: {} 560 | 561 | logGroomerSidecar: 562 | # Command to use when running the Airflow worker log groomer sidecar (templated). 563 | command: ~ 564 | # Args to use when running the Airflow worker log groomer sidecar (templated). 565 | args: ["bash", "/clean-logs"] 566 | # Number of days to retain logs 567 | retentionDays: 15 568 | resources: {} 569 | # limits: 570 | # cpu: 100m 571 | # memory: 128Mi 572 | # requests: 573 | # cpu: 100m 574 | # memory: 128Mi 575 | 576 | # Airflow scheduler settings 577 | scheduler: 578 | # If the scheduler stops heartbeating for 5 minutes (5*60s) kill the 579 | # scheduler and let Kubernetes restart it 580 | livenessProbe: 581 | initialDelaySeconds: 10 582 | timeoutSeconds: 20 583 | failureThreshold: 5 584 | periodSeconds: 60 585 | command: 586 | - sh 587 | - -c 588 | - | 589 | CONNECTION_CHECK_MAX_COUNT=0 exec /entrypoint python -Wignore -c " 590 | import os 591 | os.environ['AIRFLOW__CORE__LOGGING_LEVEL'] = 'ERROR' 592 | os.environ['AIRFLOW__LOGGING__LOGGING_LEVEL'] = 'ERROR' 593 | from airflow.jobs.scheduler_job import SchedulerJob 594 | from airflow.utils.db import create_session 595 | from airflow.utils.net import get_hostname 596 | import sys 597 | with create_session() as session: 598 | job = session.query(SchedulerJob).filter_by(hostname=get_hostname()).order_by( 599 | SchedulerJob.latest_heartbeat.desc()).limit(1).first() 600 | sys.exit(0 if job.is_alive() else 1)" 601 | 602 | # Airflow 2.0 allows users to run multiple schedulers, 603 | # However this feature is only recommended for MySQL 8+ and Postgres 604 | replicas: 1 605 | 606 | # Command to use when running the Airflow scheduler (templated). 607 | command: ~ 608 | # Args to use when running the Airflow scheduler (templated). 609 | args: ["bash", "-c", "exec airflow scheduler"] 610 | 611 | # Update Strategy when scheduler is deployed as a StatefulSet 612 | # (when using LocalExecutor and workers.persistence) 613 | updateStrategy: ~ 614 | # Update Strategy when scheduler is deployed as a Deployment 615 | # (when not using LocalExecutor and workers.persistence) 616 | strategy: ~ 617 | 618 | # When not set, the values defined in the global securityContext will be used 619 | securityContext: {} 620 | # runAsUser: 50000 621 | # fsGroup: 0 622 | # runAsGroup: 0 623 | 624 | # Create ServiceAccount 625 | serviceAccount: 626 | # Specifies whether a ServiceAccount should be created 627 | create: true 628 | # The name of the ServiceAccount to use. 
629 | # If not set and create is true, a name is generated using the release name 630 | name: ~ 631 | 632 | # Annotations to add to scheduler kubernetes service account. 633 | annotations: {} 634 | 635 | # Scheduler pod disruption budget 636 | podDisruptionBudget: 637 | enabled: false 638 | 639 | # PDB configuration 640 | config: 641 | maxUnavailable: 1 642 | 643 | resources: {} 644 | # limits: 645 | # cpu: 100m 646 | # memory: 128Mi 647 | # requests: 648 | # cpu: 100m 649 | # memory: 128Mi 650 | 651 | # This setting tells kubernetes that its ok to evict 652 | # when it wants to scale a node down. 653 | safeToEvict: true 654 | 655 | # Launch additional containers into scheduler. 656 | extraContainers: [] 657 | # Add additional init containers into scheduler. 658 | extraInitContainers: [] 659 | 660 | # Mount additional volumes into scheduler. 661 | extraVolumes: [] 662 | extraVolumeMounts: [] 663 | 664 | # Select certain nodes for airflow scheduler pods. 665 | nodeSelector: {} 666 | affinity: {} 667 | # default scheduler affinity is: 668 | # podAntiAffinity: 669 | # preferredDuringSchedulingIgnoredDuringExecution: 670 | # - podAffinityTerm: 671 | # labelSelector: 672 | # matchLabels: 673 | # component: scheduler 674 | # topologyKey: kubernetes.io/hostname 675 | # weight: 100 676 | tolerations: [] 677 | 678 | priorityClassName: ~ 679 | 680 | podAnnotations: {} 681 | 682 | logGroomerSidecar: 683 | # Whether to deploy the Airflow scheduler log groomer sidecar. 684 | enabled: true 685 | # Command to use when running the Airflow scheduler log groomer sidecar (templated). 686 | command: ~ 687 | # Args to use when running the Airflow scheduler log groomer sidecar (templated). 688 | args: ["bash", "/clean-logs"] 689 | # Number of days to retain logs 690 | retentionDays: 15 691 | resources: {} 692 | # limits: 693 | # cpu: 100m 694 | # memory: 128Mi 695 | # requests: 696 | # cpu: 100m 697 | # memory: 128Mi 698 | 699 | # Airflow create user job settings 700 | createUserJob: 701 | # Command to use when running the create user job (templated). 702 | command: ~ 703 | # Args to use when running the create user job (templated). 704 | args: 705 | - "bash" 706 | - "-c" 707 | # The format below is necessary to get `helm lint` happy 708 | - |- 709 | exec \ 710 | airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "users create" "create_user" }} "$@" 711 | - -- 712 | - "-r" 713 | - "{{ .Values.webserver.defaultUser.role }}" 714 | - "-u" 715 | - "{{ .Values.webserver.defaultUser.username }}" 716 | - "-e" 717 | - "{{ .Values.webserver.defaultUser.email }}" 718 | - "-f" 719 | - "{{ .Values.webserver.defaultUser.firstName }}" 720 | - "-l" 721 | - "{{ .Values.webserver.defaultUser.lastName }}" 722 | - "-p" 723 | - "{{ .Values.webserver.defaultUser.password }}" 724 | 725 | # Annotations on the create user job pod 726 | annotations: {} 727 | # jobAnnotations are annotations on the create user job 728 | jobAnnotations: {} 729 | 730 | # When not set, the values defined in the global securityContext will be used 731 | securityContext: {} 732 | # runAsUser: 50000 733 | # fsGroup: 0 734 | # runAsGroup: 0 735 | 736 | # Create ServiceAccount 737 | serviceAccount: 738 | # Specifies whether a ServiceAccount should be created 739 | create: true 740 | # The name of the ServiceAccount to use. 741 | # If not set and create is true, a name is generated using the release name 742 | name: ~ 743 | 744 | # Annotations to add to create user kubernetes service account. 
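# For reference (illustrative): with an Airflow 2.x image, the templated args of this job
# above expand — using the `webserver.defaultUser` values further below — to roughly:
#   airflow users create -r Admin -u admin -e admin@example.com -f admin -l user -p admin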
745 | annotations: {} 746 | 747 | # Launch additional containers into user creation job 748 | extraContainers: [] 749 | 750 | # Mount additional volumes into user creation job 751 | extraVolumes: [] 752 | extraVolumeMounts: [] 753 | 754 | nodeSelector: {} 755 | affinity: {} 756 | tolerations: [] 757 | # In case you need to disable the helm hooks that create the jobs after install. 758 | # Disable this if you are using ArgoCD for example 759 | useHelmHooks: true 760 | 761 | resources: {} 762 | # limits: 763 | # cpu: 100m 764 | # memory: 128Mi 765 | # requests: 766 | # cpu: 100m 767 | # memory: 128Mi 768 | 769 | # Airflow database migration job settings 770 | migrateDatabaseJob: 771 | # Command to use when running the migrate database job (templated). 772 | command: ~ 773 | # Args to use when running the migrate database job (templated). 774 | args: 775 | - "bash" 776 | - "-c" 777 | # The format below is necessary to get `helm lint` happy 778 | - |- 779 | exec \ 780 | airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "db upgrade" "upgradedb" }} 781 | 782 | # Annotations on the database migration pod 783 | annotations: {} 784 | # jobAnnotations are annotations on the database migration job 785 | jobAnnotations: {} 786 | 787 | # When not set, the values defined in the global securityContext will be used 788 | securityContext: {} 789 | # runAsUser: 50000 790 | # fsGroup: 0 791 | # runAsGroup: 0 792 | 793 | # Create ServiceAccount 794 | serviceAccount: 795 | # Specifies whether a ServiceAccount should be created 796 | create: true 797 | # The name of the ServiceAccount to use. 798 | # If not set and create is true, a name is generated using the release name 799 | name: ~ 800 | 801 | # Annotations to add to migrate database job kubernetes service account. 802 | annotations: {} 803 | 804 | resources: {} 805 | # limits: 806 | # cpu: 100m 807 | # memory: 128Mi 808 | # requests: 809 | # cpu: 100m 810 | # memory: 128Mi 811 | 812 | # Launch additional containers into database migration job 813 | extraContainers: [] 814 | 815 | # Mount additional volumes into database migration job 816 | extraVolumes: [] 817 | extraVolumeMounts: [] 818 | 819 | nodeSelector: {} 820 | affinity: {} 821 | tolerations: [] 822 | # In case you need to disable the helm hooks that create the jobs after install. 823 | # Disable this if you are using ArgoCD for example 824 | useHelmHooks: true 825 | 826 | # Airflow webserver settings 827 | webserver: 828 | allowPodLogReading: true 829 | livenessProbe: 830 | initialDelaySeconds: 15 831 | timeoutSeconds: 30 832 | failureThreshold: 20 833 | periodSeconds: 5 834 | 835 | readinessProbe: 836 | initialDelaySeconds: 15 837 | timeoutSeconds: 30 838 | failureThreshold: 20 839 | periodSeconds: 5 840 | 841 | # Number of webservers 842 | replicas: 1 843 | 844 | # Command to use when running the Airflow webserver (templated). 845 | command: ~ 846 | # Args to use when running the Airflow webserver (templated). 847 | args: ["bash", "-c", "exec airflow webserver"] 848 | 849 | # Create ServiceAccount 850 | serviceAccount: 851 | # Specifies whether a ServiceAccount should be created 852 | create: true 853 | # The name of the ServiceAccount to use. 854 | # If not set and create is true, a name is generated using the release name 855 | name: ~ 856 | 857 | # Annotations to add to webserver kubernetes service account. 
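# eg (illustrative, for the webserver `annotations` below; the IAM role ARN is a placeholder
# for clusters that map service accounts to cloud identities, e.g. EKS IRSA):
# annotations:
#   eks.amazonaws.com/role-arn: "arn:aws:iam::123456789012:role/airflow-webserver"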
858 | annotations: {} 859 | 860 | # Allow overriding Update Strategy for Webserver 861 | strategy: ~ 862 | 863 | # When not set, the values defined in the global securityContext will be used 864 | securityContext: {} 865 | # runAsUser: 50000 866 | # fsGroup: 0 867 | # runAsGroup: 0 868 | 869 | # Additional network policies as needed (Deprecated - renamed to `webserver.networkPolicy.ingress.from`) 870 | extraNetworkPolicies: [] 871 | networkPolicy: 872 | ingress: 873 | # Peers for webserver NetworkPolicy ingress 874 | from: [] 875 | # Ports for webserver NetworkPolicy ingress (if `from` is set) 876 | ports: 877 | - port: "{{ .Values.ports.airflowUI }}" 878 | 879 | resources: {} 880 | # limits: 881 | # cpu: 100m 882 | # memory: 128Mi 883 | # requests: 884 | # cpu: 100m 885 | # memory: 128Mi 886 | 887 | # Create initial user. 888 | defaultUser: 889 | enabled: true 890 | role: Admin 891 | username: admin 892 | email: admin@example.com 893 | firstName: admin 894 | lastName: user 895 | password: admin 896 | 897 | # Launch additional containers into webserver. 898 | extraContainers: [] 899 | # Add additional init containers into webserver. 900 | extraInitContainers: [] 901 | 902 | # Mount additional volumes into webserver. 903 | extraVolumes: [] 904 | extraVolumeMounts: [] 905 | 906 | # This string (can be templated) will be mounted into the Airflow Webserver as a custom 907 | # webserver_config.py. You can bake a webserver_config.py in to your image instead. 908 | webserverConfig: ~ 909 | # webserverConfig: | 910 | # from airflow import configuration as conf 911 | 912 | # # The SQLAlchemy connection string. 913 | # SQLALCHEMY_DATABASE_URI = conf.get('core', 'SQL_ALCHEMY_CONN') 914 | 915 | # # Flask-WTF flag for CSRF 916 | # CSRF_ENABLED = True 917 | 918 | service: 919 | type: ClusterIP 920 | ## service annotations 921 | annotations: {} 922 | ports: 923 | - name: airflow-ui 924 | port: "{{ .Values.ports.airflowUI }}" 925 | # To change the port used to access the webserver: 926 | # ports: 927 | # - name: airflow-ui 928 | # port: 80 929 | # targetPort: airflow-ui 930 | # To only expose a sidecar, not the webserver directly: 931 | # ports: 932 | # - name: only_sidecar 933 | # port: 80 934 | # targetPort: 8888 935 | loadBalancerIP: ~ 936 | ## Limit load balancer source ips to list of CIDRs 937 | # loadBalancerSourceRanges: 938 | # - "10.123.0.0/16" 939 | loadBalancerSourceRanges: [] 940 | 941 | # Select certain nodes for airflow webserver pods. 942 | nodeSelector: {} 943 | priorityClassName: ~ 944 | affinity: {} 945 | # default webserver affinity is: 946 | # podAntiAffinity: 947 | # preferredDuringSchedulingIgnoredDuringExecution: 948 | # - podAffinityTerm: 949 | # labelSelector: 950 | # matchLabels: 951 | # component: webserver 952 | # topologyKey: kubernetes.io/hostname 953 | # weight: 100 954 | tolerations: [] 955 | 956 | podAnnotations: {} 957 | 958 | # Airflow Triggerer Config 959 | triggerer: 960 | enabled: true 961 | # Number of airflow triggerers in the deployment 962 | replicas: 1 963 | 964 | # Command to use when running Airflow triggerers (templated). 965 | command: ~ 966 | # Args to use when running Airflow triggerer (templated). 
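# For reference (illustrative): with the ClusterIP webserver `service` defined above, the UI
# can be reached locally with a port-forward; "<release>-webserver" follows this chart's
# service naming convention:
#   kubectl port-forward svc/<release>-webserver 8080:8080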
967 | args: ["bash", "-c", "exec airflow triggerer"] 968 | 969 | # Update Strategy for triggerers 970 | strategy: 971 | rollingUpdate: 972 | maxSurge: "100%" 973 | maxUnavailable: "50%" 974 | 975 | # If the triggerer stops heartbeating for 5 minutes (5*60s) kill the 976 | # triggerer and let Kubernetes restart it 977 | livenessProbe: 978 | initialDelaySeconds: 10 979 | timeoutSeconds: 20 980 | failureThreshold: 5 981 | periodSeconds: 60 982 | command: 983 | - sh 984 | - -c 985 | - | 986 | CONNECTION_CHECK_MAX_COUNT=0 exec /entrypoint python -Wignore -c " 987 | import os 988 | os.environ['AIRFLOW__CORE__LOGGING_LEVEL'] = 'ERROR' 989 | os.environ['AIRFLOW__LOGGING__LOGGING_LEVEL'] = 'ERROR' 990 | 991 | from airflow.jobs.triggerer_job import TriggererJob 992 | from airflow.utils.db import create_session 993 | from airflow.utils.net import get_hostname 994 | import sys 995 | 996 | with create_session() as session: 997 | job = session.query(TriggererJob).filter_by(hostname=get_hostname()).order_by( 998 | TriggererJob.latest_heartbeat.desc()).limit(1).first() 999 | 1000 | sys.exit(0 if job.is_alive() else 1) 1001 | " 1002 | 1003 | # Create ServiceAccount 1004 | serviceAccount: 1005 | # Specifies whether a ServiceAccount should be created 1006 | create: true 1007 | # The name of the ServiceAccount to use. 1008 | # If not set and create is true, a name is generated using the release name 1009 | name: ~ 1010 | 1011 | # Annotations to add to triggerer kubernetes service account. 1012 | annotations: {} 1013 | 1014 | # When not set, the values defined in the global securityContext will be used 1015 | securityContext: {} 1016 | # runAsUser: 50000 1017 | # fsGroup: 0 1018 | # runAsGroup: 0 1019 | 1020 | resources: {} 1021 | # limits: 1022 | # cpu: 100m 1023 | # memory: 128Mi 1024 | # requests: 1025 | # cpu: 100m 1026 | # memory: 128Mi 1027 | 1028 | # Grace period for triggerer to finish after SIGTERM is sent from kubernetes 1029 | terminationGracePeriodSeconds: 60 1030 | 1031 | # This setting tells kubernetes that its ok to evict 1032 | # when it wants to scale a node down. 1033 | safeToEvict: true 1034 | 1035 | # Launch additional containers into triggerer. 1036 | extraContainers: [] 1037 | # Add additional init containers into triggerers. 1038 | extraInitContainers: [] 1039 | 1040 | # Mount additional volumes into triggerer. 1041 | extraVolumes: [] 1042 | extraVolumeMounts: [] 1043 | 1044 | # Select certain nodes for airflow triggerer pods. 1045 | nodeSelector: {} 1046 | affinity: {} 1047 | # default triggerer affinity is: 1048 | # podAntiAffinity: 1049 | # preferredDuringSchedulingIgnoredDuringExecution: 1050 | # - podAffinityTerm: 1051 | # labelSelector: 1052 | # matchLabels: 1053 | # component: triggerer 1054 | # topologyKey: kubernetes.io/hostname 1055 | # weight: 100 1056 | tolerations: [] 1057 | 1058 | priorityClassName: ~ 1059 | 1060 | podAnnotations: {} 1061 | 1062 | # Flower settings 1063 | flower: 1064 | # Enable flower. 1065 | # If True, and using CeleryExecutor/CeleryKubernetesExecutor, will deploy flower app. 1066 | enabled: false 1067 | 1068 | # Command to use when running flower (templated). 1069 | command: ~ 1070 | # Args to use when running flower (templated). 
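# eg (illustrative; flower only deploys alongside a Celery-based executor, and the
# `username`/`password` values further below add basic auth — the password is a placeholder):
# flower:
#   enabled: true
#   username: flower
#   password: "change-me"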
1071 | args: 1072 | - "bash" 1073 | - "-c" 1074 | # The format below is necessary to get `helm lint` happy 1075 | - |- 1076 | exec \ 1077 | airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "celery flower" "flower" }} 1078 | 1079 | # Additional network policies as needed (Deprecated - renamed to `flower.networkPolicy.ingress.from`) 1080 | extraNetworkPolicies: [] 1081 | networkPolicy: 1082 | ingress: 1083 | # Peers for flower NetworkPolicy ingress 1084 | from: [] 1085 | # Ports for flower NetworkPolicy ingress (if ingressPeers is set) 1086 | ports: 1087 | - port: "{{ .Values.ports.flowerUI }}" 1088 | 1089 | resources: {} 1090 | # limits: 1091 | # cpu: 100m 1092 | # memory: 128Mi 1093 | # requests: 1094 | # cpu: 100m 1095 | # memory: 128Mi 1096 | 1097 | # When not set, the values defined in the global securityContext will be used 1098 | securityContext: {} 1099 | # runAsUser: 50000 1100 | # fsGroup: 0 1101 | # runAsGroup: 0 1102 | 1103 | # Create ServiceAccount 1104 | serviceAccount: 1105 | # Specifies whether a ServiceAccount should be created 1106 | create: true 1107 | # The name of the ServiceAccount to use. 1108 | # If not set and create is true, a name is generated using the release name 1109 | name: ~ 1110 | 1111 | # Annotations to add to worker kubernetes service account. 1112 | annotations: {} 1113 | 1114 | # A secret containing the connection 1115 | secretName: ~ 1116 | 1117 | # Else, if username and password are set, create secret from username and password 1118 | username: ~ 1119 | password: ~ 1120 | 1121 | service: 1122 | type: ClusterIP 1123 | ## service annotations 1124 | annotations: {} 1125 | ports: 1126 | - name: flower-ui 1127 | port: "{{ .Values.ports.flowerUI }}" 1128 | # To change the port used to access flower: 1129 | # ports: 1130 | # - name: flower-ui 1131 | # port: 8080 1132 | # targetPort: flower-ui 1133 | loadBalancerIP: ~ 1134 | ## Limit load balancer source ips to list of CIDRs 1135 | # loadBalancerSourceRanges: 1136 | # - "10.123.0.0/16" 1137 | loadBalancerSourceRanges: [] 1138 | 1139 | # Launch additional containers into the flower pods. 1140 | extraContainers: [] 1141 | # Mount additional volumes into the flower pods. 1142 | extraVolumes: [] 1143 | 1144 | # Select certain nodes for airflow flower pods. 1145 | nodeSelector: {} 1146 | affinity: {} 1147 | tolerations: [] 1148 | 1149 | priorityClassName: ~ 1150 | 1151 | podAnnotations: {} 1152 | 1153 | # Statsd settings 1154 | statsd: 1155 | enabled: false 1156 | 1157 | # Create ServiceAccount 1158 | serviceAccount: 1159 | # Specifies whether a ServiceAccount should be created 1160 | create: true 1161 | # The name of the ServiceAccount to use. 1162 | # If not set and create is true, a name is generated using the release name 1163 | name: ~ 1164 | 1165 | # Annotations to add to worker kubernetes service account. 1166 | annotations: {} 1167 | 1168 | # When not set, the values defined in the global securityContext will be used 1169 | securityContext: {} 1170 | # runAsUser: 65534 1171 | # fsGroup: 0 1172 | # runAsGroup: 0 1173 | 1174 | # Additional network policies as needed 1175 | extraNetworkPolicies: [] 1176 | resources: {} 1177 | # limits: 1178 | # cpu: 100m 1179 | # memory: 128Mi 1180 | # requests: 1181 | # cpu: 100m 1182 | # memory: 128Mi 1183 | 1184 | service: 1185 | extraAnnotations: {} 1186 | 1187 | # Select certain nodes for statsd pods. 
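# eg (illustrative; the minimal values needed to turn on the bundled statsd exporter —
# the [metrics] settings in the `config` section near the end of this file already point
# Airflow at the "<release>-statsd" service on port 9125):
# statsd:
#   enabled: true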
1188 | nodeSelector: {} 1189 | affinity: {} 1190 | tolerations: [] 1191 | 1192 | priorityClassName: ~ 1193 | 1194 | # Additional mappings for statsd exporter. 1195 | extraMappings: [] 1196 | 1197 | uid: 65534 1198 | 1199 | # PgBouncer settings 1200 | pgbouncer: 1201 | # Enable PgBouncer 1202 | enabled: false 1203 | # Command to use for PgBouncer(templated). 1204 | command: ["pgbouncer", "-u", "nobody", "/etc/pgbouncer/pgbouncer.ini"] 1205 | # Args to use for PgBouncer(templated). 1206 | args: ~ 1207 | 1208 | # Create ServiceAccount 1209 | serviceAccount: 1210 | # Specifies whether a ServiceAccount should be created 1211 | create: true 1212 | # The name of the ServiceAccount to use. 1213 | # If not set and create is true, a name is generated using the release name 1214 | name: ~ 1215 | 1216 | # Annotations to add to worker kubernetes service account. 1217 | annotations: {} 1218 | 1219 | # Additional network policies as needed 1220 | extraNetworkPolicies: [] 1221 | 1222 | # Pool sizes 1223 | metadataPoolSize: 10 1224 | resultBackendPoolSize: 5 1225 | 1226 | # Maximum clients that can connect to PgBouncer (higher = more file descriptors) 1227 | maxClientConn: 100 1228 | 1229 | # supply the name of existing secret with pgbouncer.ini and users.txt defined 1230 | # you can load them to a k8s secret like the one below 1231 | # apiVersion: v1 1232 | # kind: Secret 1233 | # metadata: 1234 | # name: pgbouncer-config-secret 1235 | # data: 1236 | # pgbouncer.ini: 1237 | # users.txt: 1238 | # type: Opaque 1239 | # 1240 | # configSecretName: pgbouncer-config-secret 1241 | # 1242 | configSecretName: ~ 1243 | 1244 | # PgBouncer pod disruption budget 1245 | podDisruptionBudget: 1246 | enabled: false 1247 | 1248 | # PDB configuration 1249 | config: 1250 | maxUnavailable: 1 1251 | 1252 | # Limit the resources to PgBouncer. 1253 | # When you specify the resource request the k8s scheduler uses this information to decide which node to 1254 | # place the Pod on. When you specify a resource limit for a Container, the kubelet enforces those limits so 1255 | # that the running container is not allowed to use more of that resource than the limit you set. 1256 | # See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ 1257 | # Example: 1258 | # 1259 | # resource: 1260 | # limits: 1261 | # cpu: 100m 1262 | # memory: 128Mi 1263 | # requests: 1264 | # cpu: 100m 1265 | # memory: 128Mi 1266 | resources: {} 1267 | 1268 | service: 1269 | extraAnnotations: {} 1270 | 1271 | # https://www.pgbouncer.org/config.html 1272 | verbose: 0 1273 | logDisconnections: 0 1274 | logConnections: 0 1275 | 1276 | sslmode: "prefer" 1277 | ciphers: "normal" 1278 | 1279 | ssl: 1280 | ca: ~ 1281 | cert: ~ 1282 | key: ~ 1283 | 1284 | # Add extra PgBouncer ini configuration in the databases section: 1285 | # https://www.pgbouncer.org/config.html#section-databases 1286 | extraIniMetadata: ~ 1287 | extraIniResultBackend: ~ 1288 | # Add extra general PgBouncer ini configuration: https://www.pgbouncer.org/config.html 1289 | extraIni: ~ 1290 | 1291 | # Mount additional volumes into pgbouncer. 1292 | extraVolumes: [] 1293 | extraVolumeMounts: [] 1294 | 1295 | # Select certain nodes for PgBouncer pods. 
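# eg (illustrative; enabling PgBouncer in front of the metadata database and tuning the
# pool sizes defined above — the numbers are placeholders, not recommendations):
# pgbouncer:
#   enabled: true
#   metadataPoolSize: 10
#   resultBackendPoolSize: 5
#   maxClientConn: 100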
1296 | nodeSelector: {} 1297 | affinity: {} 1298 | tolerations: [] 1299 | 1300 | priorityClassName: ~ 1301 | 1302 | uid: 65534 1303 | 1304 | metricsExporterSidecar: 1305 | resources: {} 1306 | # limits: 1307 | # cpu: 100m 1308 | # memory: 128Mi 1309 | # requests: 1310 | # cpu: 100m 1311 | # memory: 128Mi 1312 | sslmode: "disable" 1313 | 1314 | # Configuration for the redis provisioned by the chart 1315 | redis: 1316 | enabled: false 1317 | terminationGracePeriodSeconds: 600 1318 | 1319 | # Create ServiceAccount 1320 | serviceAccount: 1321 | # Specifies whether a ServiceAccount should be created 1322 | create: true 1323 | # The name of the ServiceAccount to use. 1324 | # If not set and create is true, a name is generated using the release name 1325 | name: ~ 1326 | 1327 | # Annotations to add to worker kubernetes service account. 1328 | annotations: {} 1329 | 1330 | persistence: 1331 | # Enable persistent volumes 1332 | enabled: true 1333 | # Volume size for worker StatefulSet 1334 | size: 1Gi 1335 | # If using a custom storageClass, pass name ref to all statefulSets here 1336 | storageClassName: 1337 | 1338 | resources: {} 1339 | # limits: 1340 | # cpu: 100m 1341 | # memory: 128Mi 1342 | # requests: 1343 | # cpu: 100m 1344 | # memory: 128Mi 1345 | 1346 | # If set use as redis secret. Make sure to also set data.brokerUrlSecretName value. 1347 | passwordSecretName: ~ 1348 | 1349 | # Else, if password is set, create secret with it, 1350 | # Otherwise a new password will be generated on install 1351 | # Note: password can only be set during install, not upgrade. 1352 | password: ~ 1353 | 1354 | # This setting tells kubernetes that its ok to evict 1355 | # when it wants to scale a node down. 1356 | safeToEvict: true 1357 | 1358 | # Select certain nodes for redis pods. 1359 | nodeSelector: {} 1360 | affinity: {} 1361 | tolerations: [] 1362 | 1363 | # Auth secret for a private registry 1364 | # This is used if pulling airflow images from a private registry 1365 | registry: 1366 | secretName: ~ 1367 | 1368 | # Example: 1369 | # connection: 1370 | # user: ~ 1371 | # pass: ~ 1372 | # host: ~ 1373 | # email: ~ 1374 | connection: {} 1375 | 1376 | # Elasticsearch logging configuration 1377 | elasticsearch: 1378 | # Enable elasticsearch task logging 1379 | enabled: false 1380 | # A secret containing the connection 1381 | secretName: ~ 1382 | # Or an object representing the connection 1383 | # Example: 1384 | # connection: 1385 | # user: ~ 1386 | # pass: ~ 1387 | # host: ~ 1388 | # port: ~ 1389 | connection: {} 1390 | 1391 | # All ports used by chart 1392 | ports: 1393 | flowerUI: 5555 1394 | airflowUI: 8080 1395 | workerLogs: 8793 1396 | redisDB: 6379 1397 | statsdIngest: 9125 1398 | statsdScrape: 9102 1399 | pgbouncer: 6543 1400 | pgbouncerScrape: 9127 1401 | 1402 | # Define any ResourceQuotas for namespace 1403 | quotas: {} 1404 | 1405 | # Define default/max/min values for pods and containers in namespace 1406 | limits: [] 1407 | 1408 | # This runs as a CronJob to cleanup old pods. 1409 | cleanup: 1410 | enabled: false 1411 | # Run every 15 minutes 1412 | schedule: "*/15 * * * *" 1413 | # Command to use when running the cleanup cronjob (templated). 1414 | command: ~ 1415 | # Args to use when running the cleanup cronjob (templated). 1416 | args: ["bash", "-c", "exec airflow kubernetes cleanup-pods --namespace={{ .Release.Namespace }}"] 1417 | 1418 | 1419 | # Select certain nodes for airflow cleanup pods. 
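# For reference (illustrative): one way to create the private-registry pull secret referenced
# by `registry.secretName` above — the secret name, server and credentials are placeholders:
#   kubectl create secret docker-registry airflow-registry \
#     --docker-server=registry.example.com \
#     --docker-username=<user> --docker-password=<password>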
1420 | nodeSelector: {} 1421 | affinity: {} 1422 | tolerations: [] 1423 | 1424 | podAnnotations: {} 1425 | 1426 | resources: {} 1427 | # limits: 1428 | # cpu: 100m 1429 | # memory: 128Mi 1430 | # requests: 1431 | # cpu: 100m 1432 | # memory: 128Mi 1433 | 1434 | # Create ServiceAccount 1435 | serviceAccount: 1436 | # Specifies whether a ServiceAccount should be created 1437 | create: true 1438 | # The name of the ServiceAccount to use. 1439 | # If not set and create is true, a name is generated using the release name 1440 | name: ~ 1441 | 1442 | # Annotations to add to cleanup cronjob kubernetes service account. 1443 | annotations: {} 1444 | 1445 | # When not set, the values defined in the global securityContext will be used 1446 | securityContext: {} 1447 | # runAsUser: 50000 1448 | # runAsGroup: 0 1449 | 1450 | # Configuration for postgresql subchart 1451 | # Not recommended for production 1452 | postgresql: 1453 | enabled: true 1454 | postgresqlPassword: postgres 1455 | postgresqlUsername: postgres 1456 | 1457 | # Config settings to go into the mounted airflow.cfg 1458 | # 1459 | # Please note that these values are passed through the `tpl` function, so are 1460 | # all subject to being rendered as go templates. If you need to include a 1461 | # literal `{{` in a value, it must be expressed like this: 1462 | # 1463 | # a: '{{ "{{ not a template }}" }}' 1464 | # 1465 | # Do not set config containing secrets via plain text values, use Env Var or k8s secret object 1466 | # yamllint disable rule:line-length 1467 | config: 1468 | core: 1469 | dags_folder: '{{ include "airflow_dags" . }}' 1470 | # This is ignored when used with the official Docker image 1471 | load_examples: 'False' 1472 | executor: '{{ .Values.executor }}' 1473 | # For Airflow 1.10, backward compatibility; moved to [logging] in 2.0 1474 | colored_console_log: 'False' 1475 | remote_logging: '{{- ternary "True" "False" .Values.elasticsearch.enabled }}' 1476 | logging: 1477 | remote_logging: '{{- ternary "True" "False" .Values.elasticsearch.enabled }}' 1478 | colored_console_log: 'False' 1479 | metrics: 1480 | statsd_on: '{{ ternary "True" "False" .Values.statsd.enabled }}' 1481 | statsd_port: 9125 1482 | statsd_prefix: airflow 1483 | statsd_host: '{{ printf "%s-statsd" .Release.Name }}' 1484 | webserver: 1485 | enable_proxy_fix: 'True' 1486 | # For Airflow 1.10 1487 | rbac: 'True' 1488 | celery: 1489 | worker_concurrency: 16 1490 | scheduler: 1491 | # statsd params included for Airflow 1.10 backward compatibility; moved to [metrics] in 2.0 1492 | statsd_on: '{{ ternary "True" "False" .Values.statsd.enabled }}' 1493 | statsd_port: 9125 1494 | statsd_prefix: airflow 1495 | statsd_host: '{{ printf "%s-statsd" .Release.Name }}' 1496 | # `run_duration` included for Airflow 1.10 backward compatibility; removed in 2.0. 1497 | run_duration: 41460 1498 | elasticsearch: 1499 | json_format: 'True' 1500 | log_id_template: "{dag_id}_{task_id}_{execution_date}_{try_number}" 1501 | elasticsearch_configs: 1502 | max_retries: 3 1503 | timeout: 30 1504 | retry_timeout: 'True' 1505 | kerberos: 1506 | keytab: '{{ .Values.kerberos.keytabPath }}' 1507 | reinit_frequency: '{{ .Values.kerberos.reinitFrequency }}' 1508 | principal: '{{ .Values.kerberos.principal }}' 1509 | ccache: '{{ .Values.kerberos.ccacheMountPath }}/{{ .Values.kerberos.ccacheFileName }}' 1510 | celery_kubernetes_executor: 1511 | kubernetes_queue: 'kubernetes' 1512 | kubernetes: 1513 | namespace: '{{ .Release.Namespace }}' 1514 | airflow_configmap: '{{ include "airflow_config" . 
}}' 1515 | airflow_local_settings_configmap: '{{ include "airflow_config" . }}' 1516 | pod_template_file: '{{ include "airflow_pod_template_file" . }}/pod_template_file.yaml' 1517 | worker_container_repository: '{{ .Values.images.airflow.repository | default .Values.defaultAirflowRepository }}' 1518 | worker_container_tag: '{{ .Values.images.airflow.tag | default .Values.defaultAirflowTag }}' 1519 | multi_namespace_mode: '{{ if .Values.multiNamespaceMode }}True{{ else }}False{{ end }}' 1520 | # yamllint enable rule:line-length 1521 | 1522 | # Whether Airflow can launch workers and/or pods in multiple namespaces 1523 | # If true, it creates ClusterRole/ClusterRolebinding (with access to entire cluster) 1524 | multiNamespaceMode: true 1525 | 1526 | # `podTemplate` is a templated string containing the contents of `pod_template_file.yaml` used for 1527 | # KubernetesExecutor workers. The default `podTemplate` will use normal `workers` configuration parameters 1528 | # (e.g. `workers.resources`). As such, you normally won't need to override this directly, however, 1529 | # you can still provide a completely custom `pod_template_file.yaml` if desired. 1530 | # If not set, a default one is created using `files/pod-template-file.kubernetes-helm-yaml`. 1531 | podTemplate: ~ 1532 | # The following example is NOT functional, but meant to be illustrative of how you can provide a custom 1533 | # `pod_template_file`. You're better off starting with the default in 1534 | # `files/pod-template-file.kubernetes-helm-yaml` and modifying from there. 1535 | # We will set `priorityClassName` in this example: 1536 | # podTemplate: | 1537 | # apiVersion: v1 1538 | # kind: Pod 1539 | # metadata: 1540 | # name: dummy-name 1541 | # labels: 1542 | # tier: airflow 1543 | # component: worker 1544 | # release: {{ .Release.Name }} 1545 | # spec: 1546 | # priorityClassName: high-priority 1547 | # containers: 1548 | # - name: base 1549 | # ... 
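# For reference (illustrative): every section/key in the `config` block above is rendered into
# the airflow.cfg ConfigMap mounted into the pods, so e.g. `core.load_examples: 'False'`
# ends up as:
#   [core]
#   load_examples = False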
1550 | 1551 | # Git sync 1552 | dags: 1553 | persistence: 1554 | # Enable persistent volume for storing dags 1555 | enabled: false 1556 | # Volume size for dags 1557 | size: 1Gi 1558 | # If using a custom storageClass, pass name here 1559 | storageClassName: 1560 | # access mode of the persistent volume 1561 | accessMode: ReadWriteOnce 1562 | ## the name of an existing PVC to use 1563 | existingClaim: 1564 | gitSync: 1565 | enabled: true 1566 | # git repo clone url 1567 | # ssh examples ssh://git@github.com/apache/airflow.git 1568 | # git@github.com:apache/airflow.git 1569 | # https example: https://github.com/apache/airflow.git 1570 | # repo: https://airflow2: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@github.com/abhishek-ch/data-machinelearning-the-boring-way.git 1571 | repo: ssh://git@github.com/abhishek-ch/data-machinelearning-the-boring-way.git 1572 | branch: main 1573 | rev: HEAD 1574 | depth: 1 1575 | # the number of consecutive failures allowed before aborting 1576 | maxFailures: 1 1577 | # subpath within the repo where dags are located 1578 | # should be "" if dags are at repo root 1579 | subPath: "dags" 1580 | # if your repo needs a user name password 1581 | # you can load them to a k8s secret like the one below 1582 | # --- 1583 | # apiVersion: v1 1584 | # kind: Secret 1585 | # metadata: 1586 | # name: git-credentials 1587 | # data: 1588 | # GIT_SYNC_USERNAME: 1589 | # GIT_SYNC_PASSWORD: 1590 | # and specify the name of the secret below 1591 | # 1592 | # credentialsSecret: git-credentials 1593 | # 1594 | # 1595 | # If you are using an ssh clone url, you can load 1596 | # the ssh private key to a k8s secret like the one below 1597 | # --- 1598 | # apiVersion: v1 1599 | # kind: Secret 1600 | # metadata: 1601 | # name: airflow-ssh-secret 1602 | # data: 1603 | # # key needs to be gitSshKey 1604 | # gitSshKey: 1605 | # and specify the name of the secret below 1606 | sshKeySecret: airflow-git-ssh-secret 1607 | # 1608 | # If you are using an ssh private key, you can additionally 1609 | # specify the content of your known_hosts file, example: 1610 | # 1611 | knownHosts: | 1612 | github.com ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ== 1613 | # interval between git sync attempts in seconds 1614 | wait: 60 1615 | containerName: git-sync 1616 | uid: 65533 1617 | 1618 | # When not set, the values defined in the global securityContext will be used 1619 | securityContext: {} 1620 | # runAsUser: 65533 1621 | # runAsGroup: 0 1622 | 1623 | extraVolumeMounts: [] 1624 | env: [] 1625 | resources: {} 1626 | # limits: 1627 | # cpu: 100m 1628 | # memory: 128Mi 1629 | # requests: 1630 | # cpu: 100m 1631 | # memory: 128Mi 1632 | 1633 | logs: 1634 | persistence: 1635 | # Enable persistent volume for storing logs 1636 | enabled: false 1637 | # Volume size for logs 1638 | size: 100Gi 1639 | # If using a custom storageClass, pass name here 1640 | storageClassName: 1641 | ## the name of an existing PVC to use 1642 | existingClaim: 1643 | 1644 | --------------------------------------------------------------------------------
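For reference (illustrative): the `airflow-git-ssh-secret` used by `dags.gitSync.sshKeySecret` above is already created by this values file through `extraSecrets`; if you prefer not to embed the key there, an equivalent secret can be created manually along these lines (namespace and key path are placeholders):

    kubectl create secret generic airflow-git-ssh-secret \
      --from-file=gitSshKey=./id_rsa --namespace airflow

The values file itself is then typically applied with something like:

    helm upgrade --install airflow apache-airflow/airflow -n airflow -f helm_values/airflow_values.yaml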