├── .gitignore ├── dags ├── invoice │ ├── __init__.py │ ├── lib │ │ ├── __init__.py │ │ ├── embed.py │ │ ├── ocr.py │ │ └── vss.py │ └── invoice_dag.py └── .airflowignore ├── logs └── .gitignore ├── stop.sh ├── kind ├── destroy.sh ├── metallb.yaml ├── build.sh └── config.yaml ├── images ├── graph.png ├── inbox.png ├── Docai_Arch.jpg ├── Docai_Flow.jpg ├── triggerer.png ├── variables.png └── disposition.png ├── LICENSE ├── samples ├── Adatum-1.pdf ├── Adatum-2.pdf ├── Adatum-3.pdf ├── Contoso-3.pdf ├── Contoso-4.pdf ├── Contoso-5.pdf ├── Adatum-2-rotated.png ├── Adatum-1-converted.png ├── Contoso-3-reduced.png └── Contoso-4-blurred.jpg ├── airflow ├── Dockerfile ├── build.sh ├── airflow_volumes.yaml └── values.yaml ├── redis ├── rec.yaml └── redb.yaml ├── azure ├── destroy.sh └── build.sh ├── README.md ├── requirements.txt └── start.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | misc/ -------------------------------------------------------------------------------- /dags/invoice/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dags/invoice/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dags/.airflowignore: -------------------------------------------------------------------------------- 1 | invoice/lib/.* -------------------------------------------------------------------------------- /logs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /stop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./kind/destroy.sh 3 | ./azure/destroy.sh -------------------------------------------------------------------------------- /kind/destroy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kind delete cluster --name $USER-kind-cluster -------------------------------------------------------------------------------- /images/graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/images/graph.png -------------------------------------------------------------------------------- /images/inbox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/images/inbox.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Redis, Inc. 
proprietary, subject to the Redis Enterprise Software and/or Cloud Services license -------------------------------------------------------------------------------- /images/Docai_Arch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/images/Docai_Arch.jpg -------------------------------------------------------------------------------- /images/Docai_Flow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/images/Docai_Flow.jpg -------------------------------------------------------------------------------- /images/triggerer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/images/triggerer.png -------------------------------------------------------------------------------- /images/variables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/images/variables.png -------------------------------------------------------------------------------- /samples/Adatum-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/samples/Adatum-1.pdf -------------------------------------------------------------------------------- /samples/Adatum-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/samples/Adatum-2.pdf -------------------------------------------------------------------------------- /samples/Adatum-3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/samples/Adatum-3.pdf -------------------------------------------------------------------------------- /samples/Contoso-3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/samples/Contoso-3.pdf -------------------------------------------------------------------------------- /samples/Contoso-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/samples/Contoso-4.pdf -------------------------------------------------------------------------------- /samples/Contoso-5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/samples/Contoso-5.pdf -------------------------------------------------------------------------------- /images/disposition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/images/disposition.png -------------------------------------------------------------------------------- /samples/Adatum-2-rotated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/samples/Adatum-2-rotated.png -------------------------------------------------------------------------------- /samples/Adatum-1-converted.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/samples/Adatum-1-converted.png -------------------------------------------------------------------------------- /samples/Contoso-3-reduced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/samples/Contoso-3-reduced.png -------------------------------------------------------------------------------- /samples/Contoso-4-blurred.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redis-developer/docai_pipeline/main/samples/Contoso-4-blurred.jpg -------------------------------------------------------------------------------- /airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.7.1-python3.10 2 | COPY requirements.txt / 3 | USER airflow 4 | RUN pip install --no-cache-dir "apache-airflow==${AIRFLOW_VERSION}" -r /requirements.txt -------------------------------------------------------------------------------- /kind/metallb.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: metallb.io/v1beta1 2 | kind: IPAddressPool 3 | metadata: 4 | name: example 5 | namespace: metallb-system 6 | spec: 7 | addresses: 8 | - 172.21.0.10-172.21.0.100 9 | --- 10 | apiVersion: metallb.io/v1beta1 11 | kind: L2Advertisement 12 | metadata: 13 | name: empty 14 | namespace: metallb-system 15 | -------------------------------------------------------------------------------- /redis/rec.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | metadata: 3 | name: ${REC_NAME} 4 | data: 5 | password: ${REC_PWD_B64} 6 | username: ${REC_USER_B64} 7 | kind: Secret 8 | type: Opaque 9 | --- 10 | apiVersion: "app.redislabs.com/v1" 11 | kind: "RedisEnterpriseCluster" 12 | metadata: 13 | name: ${REC_NAME} 14 | labels: 15 | app: rec 16 | spec: 17 | nodes: 3 18 | servicesRiggerSpec: 19 | databaseServiceType: load_balancer -------------------------------------------------------------------------------- /azure/destroy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # delete resource group 4 | az group delete \ 5 | --name $USER-ai-services-resource-group \ 6 | --yes 7 | 8 | # delete openai resource 9 | az cognitiveservices account purge \ 10 | --name openai-resource \ 11 | --resource-group $USER-ai-services-resource-group \ 12 | --location eastus 13 | 14 | # delete ocr resource 15 | az cognitiveservices account purge \ 16 | --name formrecognizer-resource \ 17 | --resource-group $USER-ai-services-resource-group \ 18 | --location eastus -------------------------------------------------------------------------------- /redis/redb.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | metadata: 3 | name: redis-enterprise-mydb 4 | data: 5 | password: ${REDB_PWD_B64} 6 | username: ${REDB_USER_B64} 7 | kind: Secret 8 | type: Opaque 9 | --- 10 | apiVersion: app.redislabs.com/v1alpha1 11 | kind: RedisEnterpriseDatabase 12 | metadata: 13 | name: ${REDB_NAME} 14 | labels: 15 | app: redisdb 16 | spec: 17 | memorySize: 100MB 18 | shardCount: 1 19 | databasePort: ${REDB_PORT} 20 | databaseSecretName: redis-enterprise-mydb 21 | modulesList: 22 | - 
name: search 23 | version: ${SEARCH_VERSION} 24 | - name: ReJSON 25 | version: ${JSON_VERSION} -------------------------------------------------------------------------------- /airflow/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create a custom Airflow docker image that includes the Python modules used by the DAGs 4 | if [ -z "$(docker image ls -q myairflow:2.7.1)" ] 5 | then 6 | docker build -t myairflow:2.7.1 -f $PWD/airflow/Dockerfile . 7 | fi 8 | 9 | # Load that image on the Kind cluster 10 | kind --name $USER-kind-cluster load docker-image myairflow:2.7.1 11 | 12 | # Pull the helm chart for Airflow 13 | if [ -z "$(helm repo list | grep apache-airflow)" ] 14 | then 15 | helm repo add apache-airflow https://airflow.apache.org 16 | fi 17 | helm repo update apache-airflow 18 | 19 | # Create PVs + PVCs for the local host storage locations - dags, logs, invoices 20 | kubectl apply -f $PWD/airflow/airflow_volumes.yaml 21 | 22 | # Deploy Airflow 23 | helm install airflow apache-airflow/airflow -f $PWD/airflow/values.yaml --timeout 15m -------------------------------------------------------------------------------- /dags/invoice/lib/embed.py: -------------------------------------------------------------------------------- 1 | import openai 2 | from airflow.models import Variable 3 | import json 4 | from tenacity import ( retry, stop_after_attempt, wait_random_exponential ) 5 | 6 | openai_var = Variable.get("openai", deserialize_json=True, default_var=None) 7 | if not isinstance(openai_var, dict): # hack for an apparent bug in airflow 8 | openai_var = json.loads(openai_var) 9 | 10 | openai.api_type = openai_var["type"] 11 | openai.api_key = openai_var["key"] 12 | openai.api_base = openai_var["endpoint"] 13 | openai.api_version = openai_var["version"] 14 | 15 | @retry(wait=wait_random_exponential(min=3, max=100), stop=stop_after_attempt(10)) 16 | def get_embedding(text: str) -> list[float]: 17 | response = openai.Embedding.create( 18 | input=text, 19 | engine="EmbeddingModel" 20 | ) 21 | return response['data'][0]['embedding'] -------------------------------------------------------------------------------- /kind/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kind create cluster --config=$PWD/kind/config.yaml --name $USER-kind-cluster 4 | 5 | echo -e "\n*** Deploy Loadbalancer ***" 6 | kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/v0.13.7/config/manifests/metallb-native.yaml; sleep 5 7 | kubectl wait --namespace metallb-system --for=condition=ready pod --selector=app=metallb --timeout=-1s 8 | SUBNET=`docker network inspect kind | jq '.[].IPAM.Config[0].Subnet' | cut -d . 
-f 1,2,3 | sed -e 's/^"//'` 9 | ADDRESSES=${SUBNET}.10-${SUBNET}.100 10 | cat > $PWD/kind/metallb.yaml <<EOF 11 | apiVersion: metallb.io/v1beta1 12 | kind: IPAddressPool 13 | metadata: 14 | name: example 15 | namespace: metallb-system 16 | spec: 17 | addresses: 18 | - ${ADDRESSES} 19 | --- 20 | apiVersion: metallb.io/v1beta1 21 | kind: L2Advertisement 22 | metadata: 23 | name: empty 24 | namespace: metallb-system 25 | EOF 26 | kubectl apply -f $PWD/kind/metallb.yaml -------------------------------------------------------------------------------- /dags/invoice/lib/ocr.py: -------------------------------------------------------------------------------- 1 | from azure.core.credentials import AzureKeyCredential 2 | from azure.ai.formrecognizer import DocumentAnalysisClient 3 | from airflow.models import Variable 4 | import json 5 | from tenacity import ( retry, stop_after_attempt, wait_random_exponential ) 6 | 7 | def stringify(invoice, vector_fields) -> dict: 8 | """ Accepts an Azure Form Recognizer document as input and builds a space-separated 9 | string of selected fields (configurable via Airflow Variable); returns a dict with the customer name and that string 10 | """ 11 | invoice_string = '' 12 | customer_name = '' 13 | for field_name in sorted(vector_fields): 14 | field = invoice.fields.get(field_name) 15 | value = '' 16 | if field: 17 | if field_name == 'Items': 18 | for idx, item in enumerate(field.value): 19 | description = item.value.get('Description') 20 | if description: 21 | value += f'Description {description.value} ' 22 | 23 | quantity = item.value.get('Quantity') 24 | if quantity: 25 | value += f'Quantity {quantity.value} ' 26 | 27 | amount = item.value.get('Amount') 28 | if amount: 29 | value += f'Amount {amount.value} ' 30 | else: 31 | value = str(field.value) 32 | value = value.replace('\n', ' ') 33 | value = value.replace('  ', ' ') 34 | invoice_string += f'{field_name} {value} ' 35 | if field_name == 'CustomerName': 36 | if value: 37 | customer_name = value 38 | else: 39 | customer_name = 'unknown' 40 | return {'customer_name': customer_name, 'ocr': invoice_string} 41 | 42 | @retry(wait=wait_random_exponential(min=10, max=60), stop=stop_after_attempt(3)) 43 | def ocr(filepath: str) -> dict: 44 | """ Executes Azure Form Recognizer OCR and returns a Python dict that includes a text string 45 | of space-separated values from the input invoice. 46 | """ 47 | formrec_var = Variable.get("formrec", deserialize_json=True, default_var=None) 48 | if not isinstance(formrec_var, dict): # hack for an apparent bug in airflow 49 | formrec_var = json.loads(formrec_var) 50 | 51 | key = formrec_var["key"] 52 | endpoint = formrec_var["endpoint"] 53 | vector_fields = formrec_var["fields"] 54 | client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key)) 55 | with open(filepath, "rb") as f: 56 | poller = client.begin_analyze_document("prebuilt-invoice", document=f, locale="en-US") 57 | 58 | invoice = (poller.result()).documents[0] 59 | return stringify(invoice, vector_fields) -------------------------------------------------------------------------------- /dags/invoice/lib/vss.py: -------------------------------------------------------------------------------- 1 | import redis 2 | from redis.commands.search.field import VectorField, TextField 3 | from redis.commands.search.indexDefinition import IndexDefinition, IndexType 4 | from redis.commands.search.query import Query 5 | import shutil 6 | import json 7 | from airflow.models import Variable 8 | import numpy as np 9 | import uuid 10 | import logging 11 | import os 12 | 13 | def dedup(invoice: dict) -> str: 14 | """ Accepts a Python dict that includes a vector of a given invoice file. That vector is then sent into 15 | Redis VSS to determine disposition. If there's another invoice in Redis within a given vector distance of the input invoice, 16 | this invoice is disposed of as a duplicate and moved to the 'dups' directory. Otherwise, it is treated as a net-new invoice 17 | and moved to the 'processed' directory. 
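Worked example (illustrative numbers, not from a real run): with vector_similarity_bound = 0.97, a nearest
neighbor at cosine distance 0.02 scores similarity 1 - 0.02 = 0.98 > 0.97, so the file is routed to 'dups';
a neighbor at distance 0.10 scores 0.90 <= 0.97, so the file is processed as new.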
18 | """ 19 | re_var = Variable.get("re", deserialize_json=True, default_var=None) 20 | if (type(re_var) != 'dict'): # hack for an apparent bug in airflow 21 | re_var = json.loads(re_var) 22 | 23 | storage_var = Variable.get("storage", deserialize_json=True, default_var=None) 24 | if (type(storage_var) != 'dict'): # hack for an apparent bug in airflow 25 | storage_var = json.loads(storage_var) 26 | 27 | creds = redis.UsernamePasswordCredentialProvider(re_var['user'], re_var['pwd']) 28 | client = redis.Redis(host=re_var['host'], port=re_var['port'], credential_provider=creds) 29 | 30 | try: 31 | client.ft(re_var['vector_index']).info() 32 | except: 33 | idx_def = IndexDefinition(index_type=IndexType.HASH, prefix=[re_var['vector_prefix']]) 34 | schema = [ 35 | TextField('customer_name'), 36 | VectorField('vector', 37 | 'HNSW', 38 | { 'TYPE': re_var['vector_type'], 'DIM': re_var['vector_dim'], 'DISTANCE_METRIC': re_var['vector_metric'] } 39 | ) 40 | ] 41 | client.ft(re_var['vector_index']).create_index(schema, definition=idx_def) 42 | 43 | vec = np.array(invoice['vector'], dtype=np.float32).tobytes() 44 | q = Query(f'@customer_name:({invoice["customer_name"]}) => [KNN 1 @vector $query_vec AS score]')\ 45 | .return_fields('score')\ 46 | .dialect(2) 47 | 48 | results = client.ft(re_var['vector_index']).search(q, query_params={'query_vec': vec}) 49 | docs = results.docs 50 | if len(docs) > 0 and 1 - float(docs[0].score) > re_var['vector_similarity_bound']: 51 | print(f'score:{float(docs[0].score)}') 52 | shutil.move(invoice['file'], storage_var['dups']) 53 | logging.info(f'Duplicate invoice:{os.path.basename(invoice["file"])}, Similarity:{round(1 - float(docs[0].score), 2)}') 54 | return 'duplicate' 55 | else: 56 | if len(docs) > 0: 57 | similarity = round(1 - float(docs[0].score), 2) 58 | else: 59 | similarity = 'N/A' 60 | 61 | client.hset(f'invoice:{uuid.uuid4()}', 62 | mapping={'customer_name': invoice['customer_name'], 'file': os.path.basename(invoice['file']),'vector': vec}) 63 | shutil.move(invoice['file'], storage_var['processed']) 64 | logging.info(f'Processed invoice:{os.path.basename(invoice["file"])}, Similarity:{similarity}') 65 | return 'processed' -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Invoice De-duplication Demo 2 | 3 | ## Contents 4 | 1. [Summary](#summary) 5 | 2. [Features](#features) 6 | 3. [Prerequisites](#prerequisites) 7 | 4. [Installation](#installation) 8 | 5. [Usage](#usage) 9 | 6. [Architecture](#architecture) 10 | 7. [Task Flow](#flow) 11 | 8. [Results](#results) 12 | 13 | 14 | ## Summary 15 | This is a demonstration of duplication detection of invoice documents. 
This leverages Apache Airflow to create a task flow that performs the following: 16 | - Triggering of the workflow on a local file deposit 17 | - OCR of a given invoice file via Azure Form Recognizer 18 | - Embedding of the OCR output via Azure OpenAI 19 | - De-duplication via Redis Vector Similarity Search (VSS) 20 | 21 | ## Features 22 | - Kubernetes architecture (local - Kind) 23 | - Redis Enterprise: 3 node cluster 24 | - Apache Airflow-managed workflow (DAG) 25 | - Azure Document Intelligence form parsing (Form Recognizer) 26 | - Azure OpenAI embedding 27 | - Redis vector/metadata storage + search (VSS) 28 | 29 | ## Prerequisites 30 | - kind 31 | - kubectl 32 | - docker, helm, jq, envsubst, apg 33 | - azure cli 34 | - azure account 35 | 36 | ## Installation 37 | ```bash 38 | git clone https://github.com/Redislabs-Solution-Architects/docai_pipeline.git && cd docai_pipeline 39 | ``` 40 | - Note 1: This is scripted to be fully automatic; however, the first usage of Azure's Document Intelligence API(s) requires a manual step of building a resource/deployment and then accepting their AI-usage terms. 41 | - Note 2: Apache Airflow will be writing to the local 'invoices' directory. Airflow operates with a uid of 50000 and a gid of 0 (root). You will need to change the group of the dags, invoices, and logs directories so that Airflow has access to them. 42 | ```bash 43 | sudo chgrp -R root dags 44 | sudo chgrp -R root invoices 45 | sudo chgrp -R root logs 46 | ``` 47 | 48 | ## Usage 49 | ### Start 50 | #### Kubernetes Environment Build Out 51 | ```bash 52 | ./start.sh 53 | ``` 54 | #### DAG Trigger 55 | The Invoice DAG is currently set with no schedule. This is for demo purposes; in a normal setting, this DAG would be scheduled for hourly or daily execution. Use the Admin UI to manually start the DAG via the 'Trigger DAG' button. Username: admin Password: admin 56 | 57 | ![triggerer](./images/triggerer.png) 58 | 59 | ### Stop 60 | ```bash 61 | ./stop.sh 62 | ``` 63 | 64 | ## Architecture 65 | ![architecture](./images/Docai_Arch.jpg) 66 | 67 | ## Task Flow 68 | ![flow](./images/Docai_Flow.jpg) 69 | 70 | ## Results 71 | ### Input Dataset 72 | There are a total of 10 sample invoices; four are duplicates (a spot-check query for the stored results follows the list). 73 | - Adatum-1-converted.png: File format conversion of Adatum-1 (PDF to PNG). 74 | - Adatum-2-rotated.png: Format conversion and 90 degree rotation of Adatum-2. 75 | - Contoso-3-reduced.png: Format conversion and size reduction of Contoso-3. 76 | - Contoso-4-blurred.jpg: Format conversion and blurring of Contoso-4.
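After a run, a query along the following lines can spot-check what landed in Redis (a hedged sketch: the index name `invoice_idx` and the stored `customer_name`/`file` fields come from the `re` Variable and the `HSET` call in this repo; substitute the host and generated `default`-user password that `start.sh` prints):

```bash
# List the stored invoices with their customer name and source file name
redis-cli -h <REDB_HOST> -p 12000 --user default -a <REDB_PWD> \
  FT.SEARCH invoice_idx '*' RETURN 2 customer_name file
```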
77 | 78 | ![inbox](./images/inbox.png) 79 | 80 | ### Airflow Variables 81 | ![variables](./images/variables.png) 82 | 83 | ### Airflow Status 84 | ![graph](./images/graph.png) 85 | 86 | ### Invoice File Dispositions 87 | ![disposition](./images/disposition.png) 88 | 89 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.5 2 | aiosignal==1.3.1 3 | alembic==1.12.0 4 | annotated-types==0.5.0 5 | anyio==4.0.0 6 | apache-airflow==2.7.1 7 | apache-airflow-providers-cncf-kubernetes==7.5.0 8 | apache-airflow-providers-common-sql==1.7.1 9 | apache-airflow-providers-ftp==3.5.1 10 | apache-airflow-providers-http==4.5.1 11 | apache-airflow-providers-imap==3.3.1 12 | apache-airflow-providers-sqlite==3.4.3 13 | apispec==6.3.0 14 | argcomplete==3.1.1 15 | asgiref==3.7.2 16 | async-timeout==4.0.3 17 | attrs==23.1.0 18 | azure-ai-formrecognizer==3.3.0 19 | azure-common==1.1.28 20 | azure-core==1.29.4 21 | Babel==2.12.1 22 | backoff==2.2.1 23 | blinker==1.6.2 24 | cachelib==0.9.0 25 | cachetools==5.3.1 26 | cattrs==23.1.2 27 | certifi==2023.7.22 28 | cffi==1.15.1 29 | charset-normalizer==3.2.0 30 | click==8.1.7 31 | clickclick==20.10.2 32 | colorama==0.4.6 33 | colorlog==4.8.0 34 | ConfigUpdater==3.1.1 35 | connexion==2.14.2 36 | cron-descriptor==1.4.0 37 | croniter==1.4.1 38 | cryptography==41.0.3 39 | Deprecated==1.2.14 40 | dill==0.3.7 41 | dnspython==2.4.2 42 | docutils==0.20.1 43 | email-validator==1.3.1 44 | et-xmlfile==1.1.0 45 | exceptiongroup==1.1.3 46 | Flask==2.2.5 47 | Flask-AppBuilder==4.3.6 48 | Flask-Babel==2.0.0 49 | Flask-Caching==2.0.2 50 | Flask-JWT-Extended==4.5.2 51 | Flask-Limiter==3.5.0 52 | Flask-Login==0.6.2 53 | Flask-Session==0.5.0 54 | Flask-SQLAlchemy==2.5.1 55 | Flask-WTF==1.1.1 56 | frozenlist==1.4.0 57 | google-auth==2.22.0 58 | google-re2==1.1 59 | googleapis-common-protos==1.60.0 60 | graphviz==0.20.1 61 | greenlet==2.0.2 62 | grpcio==1.58.0 63 | gunicorn==21.2.0 64 | h11==0.14.0 65 | httpcore==0.17.3 66 | httpx==0.24.1 67 | idna==3.4 68 | importlib-metadata==6.8.0 69 | importlib-resources==6.0.1 70 | inflection==0.5.1 71 | isodate==0.6.1 72 | itsdangerous==2.1.2 73 | Jinja2==3.1.2 74 | jsonschema==4.19.0 75 | jsonschema-specifications==2023.7.1 76 | kubernetes==23.6.0 77 | kubernetes-asyncio==24.2.3 78 | lazy-object-proxy==1.9.0 79 | limits==3.6.0 80 | linkify-it-py==2.0.2 81 | lockfile==0.12.2 82 | Mako==1.2.4 83 | Markdown==3.4.4 84 | markdown-it-py==3.0.0 85 | MarkupSafe==2.1.3 86 | marshmallow==3.20.1 87 | marshmallow-oneofschema==3.0.1 88 | marshmallow-sqlalchemy==0.26.1 89 | mdit-py-plugins==0.4.0 90 | mdurl==0.1.2 91 | msrest==0.7.1 92 | multidict==6.0.4 93 | numpy==1.25.2 94 | oauthlib==3.2.2 95 | openai==0.28.0 96 | openpyxl==3.1.2 97 | opentelemetry-api==1.20.0 98 | opentelemetry-exporter-otlp==1.20.0 99 | opentelemetry-exporter-otlp-proto-common==1.20.0 100 | opentelemetry-exporter-otlp-proto-grpc==1.20.0 101 | opentelemetry-exporter-otlp-proto-http==1.20.0 102 | opentelemetry-proto==1.20.0 103 | opentelemetry-sdk==1.20.0 104 | opentelemetry-semantic-conventions==0.41b0 105 | ordered-set==4.1.0 106 | packaging==23.1 107 | pandas==2.1.0 108 | pandas-stubs==2.0.3.230814 109 | pathspec==0.11.2 110 | pendulum==2.1.2 111 | pluggy==1.3.0 112 | prison==0.2.1 113 | protobuf==4.24.3 114 | psutil==5.9.5 115 | pyasn1==0.5.0 116 | pyasn1-modules==0.3.0 117 | pycparser==2.21 118 | pydantic==2.3.0 119 | pydantic_core==2.6.3 120 | 
Pygments==2.16.1 121 | PyJWT==2.8.0 122 | python-daemon==3.0.1 123 | python-dateutil==2.8.2 124 | python-nvd3==0.15.0 125 | python-slugify==8.0.1 126 | pytz==2023.3.post1 127 | pytzdata==2020.1 128 | PyYAML==6.0.1 129 | redis==5.0.0 130 | referencing==0.30.2 131 | requests==2.31.0 132 | requests-oauthlib==1.3.1 133 | requests-toolbelt==1.0.0 134 | rfc3339-validator==0.1.4 135 | rich==13.5.2 136 | rich-argparse==1.3.0 137 | rpds-py==0.10.2 138 | rsa==4.9 139 | setproctitle==1.3.2 140 | six==1.16.0 141 | sniffio==1.3.0 142 | SQLAlchemy==1.4.49 143 | SQLAlchemy-JSONField==1.0.1.post0 144 | SQLAlchemy-Utils==0.41.1 145 | sqlparse==0.4.4 146 | tabulate==0.9.0 147 | tenacity==8.2.3 148 | termcolor==2.3.0 149 | text-unidecode==1.3 150 | tqdm==4.66.1 151 | types-pytz==2023.3.0.1 152 | typing_extensions==4.7.1 153 | tzdata==2023.3 154 | uc-micro-py==1.0.2 155 | unicodecsv==0.14.1 156 | urllib3==1.26.16 157 | websocket-client==1.6.2 158 | Werkzeug==2.2.3 159 | wrapt==1.15.0 160 | WTForms==3.0.1 161 | yarl==1.9.2 162 | zipp==3.16.2 163 | -------------------------------------------------------------------------------- /dags/invoice/invoice_dag.py: -------------------------------------------------------------------------------- 1 | # File overview: Main DAG/workflow. Performs the following: 2 | # - triggered on file deposit to the invoices/inbox dir 3 | # - Generates a worker pod per file for Azure OCR (Form Recognizer) 4 | # - Generates a worker pod per OCR output for OpenAI embedding 5 | # - Generates a worker pod per embedding for Redis VSS disposition (process invoice or categorize as a duplicate) 6 | 7 | from datetime import datetime 8 | from airflow.decorators import dag, task 9 | from kubernetes.client import models as k8s 10 | from airflow.sensors.base import PokeReturnValue 11 | from airflow.models import Variable 12 | import os 13 | import json 14 | import logging 15 | import pprint 16 | 17 | # local host volume mount for the Invoices directory 18 | executor_config_volume_mount = { 19 | "pod_override": k8s.V1Pod( 20 | spec=k8s.V1PodSpec( 21 | containers=[ 22 | k8s.V1Container( 23 | name="base", 24 | volume_mounts=[ 25 | k8s.V1VolumeMount(mount_path="/opt/airflow/invoices", name="inv-vol") 26 | ], 27 | ) 28 | ], 29 | volumes=[ 30 | k8s.V1Volume( 31 | name="inv-vol", 32 | persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(claim_name='invoice-claim') 33 | ) 34 | ], 35 | ) 36 | ), 37 | } 38 | 39 | # DAG decorator to establish the workflow. Non-scheduled dag/trigger to start. 40 | @dag(dag_id="invoice-processor", schedule=None, start_date=datetime(2023,1,1), catchup=False) 41 | 42 | 43 | def invoice_flow(): 44 | 45 | @task.sensor(task_id="check_inbox", mode="reschedule", timeout=10, executor_config=executor_config_volume_mount) 46 | def check_inbox() -> PokeReturnValue: 47 | """ File sensor for the invoices inbox. If files are detected in the inbox, a cascade of processing tasks is triggered: 48 | OCR, Embed, Dedup. 
49 | """ 50 | storage_var = Variable.get("storage", deserialize_json=True, default_var=None) 51 | if (type(storage_var) != 'dict'): # hack for an apparent bug in airflow 52 | storage_var = json.loads(storage_var) 53 | inbox_path = storage_var['inbox'] 54 | 55 | inbox_files = list(map(lambda file: os.path.join(inbox_path, file), os.listdir(inbox_path))) 56 | logging.info(f'Number of files to be processed: {len(inbox_files)}') 57 | if len(inbox_files) > 0: 58 | return PokeReturnValue(is_done=True, xcom_value=inbox_files) 59 | else: 60 | return PokeReturnValue(is_done=False) 61 | 62 | @task(task_id='parse_invoice', executor_config=executor_config_volume_mount) 63 | def parse_invoice(inbox_file: str) -> dict: 64 | """ OCR is performed on each of invoices in the inbox. The result of OCR is space delimited string of a 65 | configurable number of invoice fields. 66 | """ 67 | from invoice.lib.ocr import ocr 68 | invoice = ocr(inbox_file) 69 | invoice['file'] = inbox_file 70 | logging.info(f'Invoice: {pprint.pformat(invoice)}') 71 | return invoice 72 | 73 | @task(task_id='embed_invoice') 74 | def embed_invoice(invoice: dict) -> dict: 75 | """ Accepts a invoice dict that includes a text field of the OCR output 76 | and adds an OpenAI embedding (array of floats) to that dict 77 | 78 | """ 79 | from invoice.lib.embed import get_embedding 80 | vector = get_embedding(invoice['ocr']) 81 | invoice['vector'] = vector 82 | logging.info(f'Invoice: {invoice["file"]}, Vector len: {invoice["vector"]}') 83 | return invoice 84 | 85 | @task(task_id='dedup_invoice', executor_config=executor_config_volume_mount) 86 | def dedup_invoice(invoice: dict) -> None: 87 | """ Sends the invoice dict into a Redis VSS lookup to determine disposition - process or call it a duplicate 88 | """ 89 | from invoice.lib.vss import dedup 90 | result = dedup(invoice) 91 | logging.info(f'Invoice: {invoice["file"]}, Result: {result}') 92 | 93 | dedup_invoice.expand( 94 | invoice=embed_invoice.expand( 95 | invoice=parse_invoice.expand( 96 | inbox_file=check_inbox() 97 | ) 98 | ) 99 | ) 100 | 101 | invoice_flow() -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: start.sh 3 | # Description: Builds a 3-worker K8S cluster. Starts a 3-node Redis Enterpise cluster + Redis target DB, 4 | # builds an Airflow pipeline of Azure OCR, OpenAI embedding and Redis VSS. 
5 | 6 | echo -e "\n*** Deploy Kind Cluster ***" 7 | ./kind/build.sh 8 | 9 | echo -e "\n*** Deploy Redis Operator ***" 10 | kubectl create namespace re 11 | kubectl config set-context --current --namespace=re 12 | RE_LATEST=`curl --silent https://api.github.com/repos/RedisLabs/redis-enterprise-k8s-docs/releases/latest | grep tag_name | awk -F'"' '{print $4}'` 13 | kubectl apply -f https://raw.githubusercontent.com/RedisLabs/redis-enterprise-k8s-docs/$RE_LATEST/bundle.yaml; sleep 1 14 | kubectl rollout status deployment redis-enterprise-operator 15 | 16 | echo -e "\n*** Deploy Redis Cluster ***" 17 | REC_USER="demo@redis.com" 18 | REC_PWD=$(apg -a 1 -m 20 -n 1 -M NCL) 19 | echo "REC Username: $REC_USER" 20 | echo "REC Password: $REC_PWD" 21 | export REC_USER_B64=$(echo -n $REC_USER | base64) 22 | export REC_PWD_B64=$(echo -n $REC_PWD | base64) 23 | export REC_NAME=mycluster 24 | envsubst < ./redis/rec.yaml | kubectl apply -f -; sleep 1 25 | kubectl rollout status sts/$REC_NAME 26 | 27 | echo -e "\n*** Deploy Redis Database ***" 28 | export JSON_VERSION=`kubectl exec -it $REC_NAME-0 -c redis-enterprise-node -- \ 29 | curl -k -u "$REC_USER:$REC_PWD" https://localhost:9443/v1/modules | jq '.[] | select(.display_name=="RedisJSON").semantic_version' | tr -d '"'` 30 | 31 | export SEARCH_VERSION=`kubectl exec -it $REC_NAME-0 -c redis-enterprise-node -- \ 32 | curl -k -u "$REC_USER:$REC_PWD" https://localhost:9443/v1/modules | jq '.[] | select(.display_name=="RediSearch 2").semantic_version' | tr -d '"'` 33 | 34 | export REDB_USER="default" 35 | export REDB_PWD=$(apg -a 1 -m 20 -n 1 -M NCL) 36 | echo "REDB Username: $REDB_USER" 37 | echo "REDB Password: $REDB_PWD" 38 | export REDB_USER_B64=$(echo -n $REDB_USER | base64) 39 | export REDB_PWD_B64=$(echo -n $REDB_PWD | base64) 40 | export REDB_NAME="mydb" 41 | export REDB_PORT=12000 42 | envsubst < ./redis/redb.yaml | kubectl apply -f - 43 | REDB_HOST="" 44 | while [ -z $REDB_HOST ] 45 | do 46 | sleep 3 47 | REDB_HOST=$(kubectl get service $REDB_NAME-load-balancer -o jsonpath='{.status.loadBalancer.ingress[0].*}' 2>/dev/null) 48 | done 49 | echo "REDB Host and Port: $REDB_HOST $REDB_PORT" 50 | 51 | echo -e "\n*** Deploy Azure Cognitive Services ***" 52 | ./azure/build.sh 53 | 54 | # JSON strings that will be saved to Airflow as Variables 55 | re_json=$(echo \ 56 | '{ 57 | "host":"'"$REDB_HOST"'", 58 | "port":"'"$REDB_PORT"'", 59 | "user":"'"$REDB_USER"'", 60 | "pwd":"'"$REDB_PWD"'", 61 | "vector_index":"invoice_idx", 62 | "vector_prefix":"invoice:", 63 | "vector_dim":1536, 64 | "vector_type":"FLOAT32", 65 | "vector_metric":"COSINE", 66 | "vector_similarity_bound":0.97 67 | }' \ 68 | | tr -d '[:space:]') 69 | 70 | storage_json=$(echo \ 71 | '{ 72 | "dups":"/opt/airflow/invoices/dups", 73 | "inbox":"/opt/airflow/invoices/inbox", 74 | "processed":"/opt/airflow/invoices/processed" 75 | }' \ 76 | | tr -d '[:space:]') 77 | 78 | openai_json=$(echo \ 79 | '{ 80 | "key":"'"$(az cognitiveservices account keys list \ 81 | --name openai-resource \ 82 | --resource-group $USER-ai-services-resource-group \ 83 | | jq -r .key1)"'", 84 | "endpoint":"'"$(az cognitiveservices account show \ 85 | --name openai-resource \ 86 | --resource-group $USER-ai-services-resource-group \ 87 | | jq -r .properties.endpoint)"'", 88 | "version":"2023-05-15", 89 | "type":"azure" 90 | }' \ 91 | | tr -d '[:space:]') 92 | 93 | formrec_json=$(echo \ 94 | '{ 95 | "key":"'"$(az cognitiveservices account keys list \ 96 | --name formrecognizer-resource \ 97 | --resource-group 
$USER-ai-services-resource-group \ 98 | | jq -r .key1)"'", 99 | "endpoint": "'"$(az cognitiveservices account show \ 100 | --name formrecognizer-resource \ 101 | --resource-group $USER-ai-services-resource-group \ 102 | | jq -r .properties.endpoint)"'", 103 | "fields": ["InvoiceId","CustomerName","CustomerId","Items","InvoiceTotal","VendorName","PurchaseOrder"] 104 | }' \ 105 | | tr -d '[:space:]') 106 | 107 | echo -e "\n*** Deploy Airflow ***" 108 | kubectl create namespace airflow 109 | kubectl config set-context --current --namespace=airflow 110 | ./airflow/build.sh 111 | kubectl -n airflow rollout status sts/airflow-triggerer 112 | # set airflow variables with the JSON objects above 113 | kubectl -n airflow -c triggerer exec -it airflow-triggerer-0 -- \ 114 | airflow variables set -j storage "$storage_json" 115 | kubectl -n airflow -c triggerer exec -it airflow-triggerer-0 -- \ 116 | airflow variables set -j re "$re_json" 117 | kubectl -n airflow -c triggerer exec -it airflow-triggerer-0 -- \ 118 | airflow variables set -j openai "$openai_json" 119 | kubectl -n airflow -c triggerer exec -it airflow-triggerer-0 -- \ 120 | airflow variables set -j formrec "$formrec_json" 121 | 122 | cp ./samples/* ./invoices/inbox 123 | AIRFLOW_HOST=$(kubectl get service airflow-webserver -o jsonpath='{.status.loadBalancer.ingress[0].*}') 124 | 125 | echo -e "\n*** Build Complete ***" 126 | echo "K8s Cluster env: kubectl get nodes" 127 | echo "Redis K8s env: kubectl -n re get all" 128 | echo "Airflow K8s env: kubectl -n airflow get all" 129 | echo "Airflow webserver: http://$AIRFLOW_HOST:8080" -------------------------------------------------------------------------------- /airflow/values.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | --- 18 | # Default values for airflow. 19 | # This is a YAML-formatted file. 20 | # Declare variables to be passed into your templates. 21 | 22 | # Provide a name to substitute for the full names of resources 23 | fullnameOverride: "" 24 | 25 | # Provide a name to substitute for the name of the chart 26 | nameOverride: "" 27 | 28 | # Max number of old replicasets to retain. 
Can be overridden by each deployment's revisionHistoryLimit 29 | revisionHistoryLimit: ~ 30 | 31 | # User and group of airflow user 32 | uid: 50000 33 | gid: 0 34 | 35 | # Default security context for airflow (deprecated, use `securityContexts` instead) 36 | securityContext: {} 37 | # runAsUser: 50000 38 | # fsGroup: 0 39 | # runAsGroup: 0 40 | 41 | # Detailed default security context for airflow deployments 42 | securityContexts: 43 | pod: {} 44 | containers: {} 45 | 46 | # Airflow home directory 47 | # Used for mount paths 48 | airflowHome: /opt/airflow 49 | 50 | # Default airflow repository -- overridden by all the specific images below 51 | defaultAirflowRepository: apache/airflow 52 | 53 | # Default airflow tag to deploy 54 | defaultAirflowTag: "2.7.1" 55 | 56 | # Default airflow digest. If specified, it takes precedence over tag 57 | defaultAirflowDigest: ~ 58 | 59 | # Airflow version (Used to make some decisions based on Airflow Version being deployed) 60 | airflowVersion: "2.7.1" 61 | 62 | # Images 63 | images: 64 | airflow: 65 | repository: myairflow 66 | tag: 2.7.1 67 | #repository: ~ 68 | #tag: ~ 69 | # Specifying digest takes precedence over tag. 70 | #digest: ~ 71 | pullPolicy: IfNotPresent 72 | # To avoid images with user code, you can turn this to 'true' and 73 | # all the 'run-airflow-migrations' and 'wait-for-airflow-migrations' containers/jobs 74 | # will use the images from 'defaultAirflowRepository:defaultAirflowTag' values 75 | # to run and wait for DB migrations . 76 | useDefaultImageForMigration: false 77 | # timeout (in seconds) for airflow-migrations to complete 78 | migrationsWaitTimeout: 60 79 | pod_template: 80 | # Note that `images.pod_template.repository` and `images.pod_template.tag` parameters 81 | # can be overridden in `config.kubernetes` section. So for these parameters to have effect 82 | # `config.kubernetes.worker_container_repository` and `config.kubernetes.worker_container_tag` 83 | # must be not set . 84 | repository: ~ 85 | tag: ~ 86 | pullPolicy: IfNotPresent 87 | flower: 88 | repository: ~ 89 | tag: ~ 90 | pullPolicy: IfNotPresent 91 | statsd: 92 | repository: quay.io/prometheus/statsd-exporter 93 | tag: v0.22.8 94 | pullPolicy: IfNotPresent 95 | redis: 96 | repository: redis 97 | tag: 7-bullseye 98 | pullPolicy: IfNotPresent 99 | pgbouncer: 100 | repository: apache/airflow 101 | tag: airflow-pgbouncer-2023.02.24-1.16.1 102 | pullPolicy: IfNotPresent 103 | pgbouncerExporter: 104 | repository: apache/airflow 105 | tag: airflow-pgbouncer-exporter-2023.02.21-0.14.0 106 | pullPolicy: IfNotPresent 107 | gitSync: 108 | repository: registry.k8s.io/git-sync/git-sync 109 | tag: v3.6.3 110 | pullPolicy: IfNotPresent 111 | 112 | # Select certain nodes for airflow pods. 113 | nodeSelector: {} 114 | affinity: {} 115 | tolerations: [] 116 | topologySpreadConstraints: [] 117 | 118 | # Add common labels to all objects and pods defined in this chart. 
119 | labels: {} 120 | 121 | # Ingress configuration 122 | ingress: 123 | # Enable all ingress resources (deprecated - use ingress.web.enabled and ingress.flower.enabled) 124 | enabled: ~ 125 | 126 | # Configs for the Ingress of the web Service 127 | web: 128 | # Enable web ingress resource 129 | enabled: false 130 | 131 | # Annotations for the web Ingress 132 | annotations: {} 133 | 134 | # The path for the web Ingress 135 | path: "/" 136 | 137 | # The pathType for the above path (used only with Kubernetes v1.19 and above) 138 | pathType: "ImplementationSpecific" 139 | 140 | # The hostname for the web Ingress (Deprecated - renamed to `ingress.web.hosts`) 141 | host: "" 142 | 143 | # The hostnames or hosts configuration for the web Ingress 144 | hosts: [] 145 | # - name: "" 146 | # # configs for web Ingress TLS 147 | # tls: 148 | # # Enable TLS termination for the web Ingress 149 | # enabled: false 150 | # # the name of a pre-created Secret containing a TLS private key and certificate 151 | # secretName: "" 152 | 153 | # The Ingress Class for the web Ingress (used only with Kubernetes v1.19 and above) 154 | ingressClassName: "" 155 | 156 | # configs for web Ingress TLS (Deprecated - renamed to `ingress.web.hosts[*].tls`) 157 | tls: 158 | # Enable TLS termination for the web Ingress 159 | enabled: false 160 | # the name of a pre-created Secret containing a TLS private key and certificate 161 | secretName: "" 162 | 163 | # HTTP paths to add to the web Ingress before the default path 164 | precedingPaths: [] 165 | 166 | # Http paths to add to the web Ingress after the default path 167 | succeedingPaths: [] 168 | 169 | # Configs for the Ingress of the flower Service 170 | flower: 171 | # Enable web ingress resource 172 | enabled: false 173 | 174 | # Annotations for the flower Ingress 175 | annotations: {} 176 | 177 | # The path for the flower Ingress 178 | path: "/" 179 | 180 | # The pathType for the above path (used only with Kubernetes v1.19 and above) 181 | pathType: "ImplementationSpecific" 182 | 183 | # The hostname for the flower Ingress (Deprecated - renamed to `ingress.flower.hosts`) 184 | host: "" 185 | 186 | # The hostnames or hosts configuration for the flower Ingress 187 | hosts: [] 188 | # - name: "" 189 | # tls: 190 | # # Enable TLS termination for the flower Ingress 191 | # enabled: false 192 | # # the name of a pre-created Secret containing a TLS private key and certificate 193 | # secretName: "" 194 | 195 | # The Ingress Class for the flower Ingress (used only with Kubernetes v1.19 and above) 196 | ingressClassName: "" 197 | 198 | # configs for flower Ingress TLS (Deprecated - renamed to `ingress.flower.hosts[*].tls`) 199 | tls: 200 | # Enable TLS termination for the flower Ingress 201 | enabled: false 202 | # the name of a pre-created Secret containing a TLS private key and certificate 203 | secretName: "" 204 | 205 | # Network policy configuration 206 | networkPolicies: 207 | # Enabled network policies 208 | enabled: false 209 | 210 | # Extra annotations to apply to all 211 | # Airflow pods 212 | airflowPodAnnotations: {} 213 | 214 | # Extra annotations to apply to 215 | # main Airflow configmap 216 | airflowConfigAnnotations: {} 217 | 218 | # `airflow_local_settings` file as a string (can be templated). 
219 | airflowLocalSettings: |- 220 | {{- if semverCompare ">=2.2.0" .Values.airflowVersion }} 221 | {{- if not (or .Values.webserverSecretKey .Values.webserverSecretKeySecretName) }} 222 | from airflow.www.utils import UIAlert 223 | 224 | DASHBOARD_UIALERTS = [ 225 | UIAlert( 226 | 'Usage of a dynamic webserver secret key detected. We recommend a static webserver secret key instead.' 227 | ' See the <a href=' 228 | '"https://airflow.apache.org/docs/helm-chart/stable/production-guide.html#webserver-secret-key">' 229 | 'Helm Chart Production Guide</a> for more details.', 230 | category="warning", 231 | roles=["Admin"], 232 | html=True, 233 | ) 234 | ] 235 | {{- end }} 236 | {{- end }} 237 | 238 | # Enable RBAC (default on most clusters these days) 239 | rbac: 240 | # Specifies whether RBAC resources should be created 241 | create: true 242 | createSCCRoleBinding: false 243 | 244 | # Airflow executor 245 | # One of: LocalExecutor, LocalKubernetesExecutor, CeleryExecutor, KubernetesExecutor, CeleryKubernetesExecutor 246 | executor: "KubernetesExecutor" 247 | 248 | # If this is true and using LocalExecutor/KubernetesExecutor/CeleryKubernetesExecutor, the scheduler's 249 | # service account will have access to communicate with the api-server and launch pods. 250 | # If this is true and using CeleryExecutor/KubernetesExecutor/CeleryKubernetesExecutor, the workers 251 | # will be able to launch pods. 252 | allowPodLaunching: true 253 | 254 | # Environment variables for all airflow containers 255 | env: [] 256 | # - name: "" 257 | # value: "" 258 | 259 | # Volumes for all airflow containers 260 | volumes: [] 261 | 262 | 263 | # VolumeMounts for all airflow containers 264 | volumeMounts: [] 265 | 266 | # Secrets for all airflow containers 267 | secret: [] 268 | # - envName: "" 269 | # secretName: "" 270 | # secretKey: "" 271 | 272 | # Enables selected built-in secrets that are set via environment variables by default. 273 | # Those secrets are provided by the Helm Chart secrets by default but in some cases you 274 | # might want to provide some of those variables with _CMD or _SECRET variable, and you should 275 | # in this case disable setting of those variables by setting the relevant configuration to false. 276 | enableBuiltInSecretEnvVars: 277 | AIRFLOW__CORE__FERNET_KEY: true 278 | # For Airflow <2.3, backward compatibility; moved to [database] in 2.3 279 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: true 280 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: true 281 | AIRFLOW_CONN_AIRFLOW_DB: true 282 | AIRFLOW__WEBSERVER__SECRET_KEY: true 283 | AIRFLOW__CELERY__CELERY_RESULT_BACKEND: true 284 | AIRFLOW__CELERY__RESULT_BACKEND: true 285 | AIRFLOW__CELERY__BROKER_URL: true 286 | AIRFLOW__ELASTICSEARCH__HOST: true 287 | AIRFLOW__ELASTICSEARCH__ELASTICSEARCH_HOST: true 288 | 289 | # Extra secrets that will be managed by the chart 290 | # (You can use them with extraEnv or extraEnvFrom or some of the extraVolumes values). 291 | # The format for secret data is "key/value" where 292 | # * key (can be templated) is the name of the secret that will be created 293 | # * value: an object with the standard 'data' or 'stringData' key (or both). 
294 | # The value associated with those keys must be a string (can be templated) 295 | extraSecrets: {} 296 | # eg: 297 | # extraSecrets: 298 | # '{{ .Release.Name }}-airflow-connections': 299 | # type: 'Opaque' 300 | # labels: 301 | # my.custom.label/v1: my_custom_label_value_1 302 | # data: | 303 | # AIRFLOW_CONN_GCP: 'base64_encoded_gcp_conn_string' 304 | # AIRFLOW_CONN_AWS: 'base64_encoded_aws_conn_string' 305 | # stringData: | 306 | # AIRFLOW_CONN_OTHER: 'other_conn' 307 | # '{{ .Release.Name }}-other-secret-name-suffix': 308 | # data: | 309 | # ... 310 | 311 | # Extra ConfigMaps that will be managed by the chart 312 | # (You can use them with extraEnv or extraEnvFrom or some of the extraVolumes values). 313 | # The format for configmap data is "key/value" where 314 | # * key (can be templated) is the name of the configmap that will be created 315 | # * value: an object with the standard 'data' key. 316 | # The value associated with this keys must be a string (can be templated) 317 | extraConfigMaps: {} 318 | # eg: 319 | # extraConfigMaps: 320 | # '{{ .Release.Name }}-airflow-variables': 321 | # labels: 322 | # my.custom.label/v2: my_custom_label_value_2 323 | # data: | 324 | # AIRFLOW_VAR_HELLO_MESSAGE: "Hi!" 325 | # AIRFLOW_VAR_KUBERNETES_NAMESPACE: "{{ .Release.Namespace }}" 326 | 327 | # Extra env 'items' that will be added to the definition of airflow containers 328 | # a string is expected (can be templated). 329 | # TODO: difference from `env`? This is a templated string. Probably should template `env` and remove this. 330 | extraEnv: ~ 331 | # eg: 332 | # extraEnv: | 333 | # - name: AIRFLOW__CORE__LOAD_EXAMPLES 334 | # value: 'True' 335 | 336 | # Extra envFrom 'items' that will be added to the definition of airflow containers 337 | # A string is expected (can be templated). 338 | extraEnvFrom: ~ 339 | # eg: 340 | # extraEnvFrom: | 341 | # - secretRef: 342 | # name: '{{ .Release.Name }}-airflow-connections' 343 | # - configMapRef: 344 | # name: '{{ .Release.Name }}-airflow-variables' 345 | 346 | # Airflow database & redis config 347 | data: 348 | # If secret names are provided, use those secrets 349 | # These secrets must be created manually, eg: 350 | # 351 | # kind: Secret 352 | # apiVersion: v1 353 | # metadata: 354 | # name: custom-airflow-metadata-secret 355 | # type: Opaque 356 | # data: 357 | # connection: base64_encoded_connection_string 358 | 359 | metadataSecretName: ~ 360 | # When providing secret names and using the same database for metadata and 361 | # result backend, for Airflow < 2.4.0 it is necessary to create a separate 362 | # secret for result backend but with a db+ scheme prefix. 363 | # For Airflow >= 2.4.0 it is possible to not specify the secret again, 364 | # as Airflow will use sql_alchemy_conn with a db+ scheme prefix by default. 
365 | resultBackendSecretName: ~ 366 | brokerUrlSecretName: ~ 367 | 368 | # Otherwise pass connection values in 369 | metadataConnection: 370 | user: postgres 371 | pass: postgres 372 | protocol: postgresql 373 | host: ~ 374 | port: 5432 375 | db: postgres 376 | sslmode: disable 377 | # resultBackendConnection defaults to the same database as metadataConnection 378 | resultBackendConnection: ~ 379 | # or, you can use a different database 380 | # resultBackendConnection: 381 | # user: postgres 382 | # pass: postgres 383 | # protocol: postgresql 384 | # host: ~ 385 | # port: 5432 386 | # db: postgres 387 | # sslmode: disable 388 | # Note: brokerUrl can only be set during install, not upgrade 389 | brokerUrl: ~ 390 | 391 | # Fernet key settings 392 | # Note: fernetKey can only be set during install, not upgrade 393 | fernetKey: ~ 394 | fernetKeySecretName: ~ 395 | 396 | # Flask secret key for Airflow Webserver: `[webserver] secret_key` in airflow.cfg 397 | webserverSecretKey: 8efd81cb8c810f0224a6f2bff7026224 398 | webserverSecretKeySecretName: ~ 399 | 400 | # In order to use kerberos you need to create secret containing the keytab file 401 | # The secret name should follow naming convention of the application where resources are 402 | # name {{ .Release-name }}-. In case of the keytab file, the postfix is "kerberos-keytab" 403 | # So if your release is named "my-release" the name of the secret should be "my-release-kerberos-keytab" 404 | # 405 | # The Keytab content should be available in the "kerberos.keytab" key of the secret. 406 | # 407 | # apiVersion: v1 408 | # kind: Secret 409 | # data: 410 | # kerberos.keytab: 411 | # type: Opaque 412 | # 413 | # 414 | # If you have such keytab file you can do it with similar 415 | # 416 | # kubectl create secret generic {{ .Release.name }}-kerberos-keytab --from-file=kerberos.keytab 417 | # 418 | # 419 | # Alternatively, instead of manually creating the secret, it is possible to specify 420 | # kerberos.keytabBase64Content parameter. This parameter should contain base64 encoded keytab. 421 | # 422 | 423 | kerberos: 424 | enabled: false 425 | ccacheMountPath: /var/kerberos-ccache 426 | ccacheFileName: cache 427 | configPath: /etc/krb5.conf 428 | keytabBase64Content: ~ 429 | keytabPath: /etc/airflow.keytab 430 | principal: airflow@FOO.COM 431 | reinitFrequency: 3600 432 | config: | 433 | # This is an example config showing how you can use templating and how "example" config 434 | # might look like. It works with the test kerberos server that we are using during integration 435 | # testing at Apache Airflow (see `scripts/ci/docker-compose/integration-kerberos.yml` but in 436 | # order to make it production-ready you must replace it with your own configuration that 437 | # Matches your kerberos deployment. Administrators of your Kerberos instance should 438 | # provide the right configuration. 439 | 440 | [logging] 441 | default = "FILE:{{ template "airflow_logs_no_quote" . }}/kerberos_libs.log" 442 | kdc = "FILE:{{ template "airflow_logs_no_quote" . }}/kerberos_kdc.log" 443 | admin_server = "FILE:{{ template "airflow_logs_no_quote" . 
}}/kadmind.log" 444 | 445 | [libdefaults] 446 | default_realm = FOO.COM 447 | ticket_lifetime = 10h 448 | renew_lifetime = 7d 449 | forwardable = true 450 | 451 | [realms] 452 | FOO.COM = { 453 | kdc = kdc-server.foo.com 454 | admin_server = admin_server.foo.com 455 | } 456 | 457 | # Airflow Worker Config 458 | workers: 459 | # Number of airflow celery workers in StatefulSet 460 | replicas: 1 461 | # Max number of old replicasets to retain 462 | revisionHistoryLimit: ~ 463 | 464 | # Command to use when running Airflow workers (templated). 465 | command: ~ 466 | # Args to use when running Airflow workers (templated). 467 | args: 468 | - "bash" 469 | - "-c" 470 | # The format below is necessary to get `helm lint` happy 471 | - |- 472 | exec \ 473 | airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "celery worker" "worker" }} 474 | 475 | # If the worker stops responding for 5 minutes (5*60s) kill the 476 | # worker and let Kubernetes restart it 477 | livenessProbe: 478 | enabled: true 479 | initialDelaySeconds: 10 480 | timeoutSeconds: 20 481 | failureThreshold: 5 482 | periodSeconds: 60 483 | command: ~ 484 | 485 | # Update Strategy when worker is deployed as a StatefulSet 486 | updateStrategy: ~ 487 | # Update Strategy when worker is deployed as a Deployment 488 | strategy: 489 | rollingUpdate: 490 | maxSurge: "100%" 491 | maxUnavailable: "50%" 492 | 493 | # When not set, the values defined in the global securityContext will be used 494 | securityContext: {} 495 | # runAsUser: 50000 496 | # fsGroup: 0 497 | # runAsGroup: 0 498 | 499 | # Detailed default security context for worker deployments for container and pod level 500 | securityContexts: 501 | pod: {} 502 | container: {} 503 | 504 | # Create ServiceAccount 505 | serviceAccount: 506 | # Specifies whether a ServiceAccount should be created 507 | create: true 508 | # The name of the ServiceAccount to use. 509 | # If not set and create is true, a name is generated using the release name 510 | name: ~ 511 | 512 | # Annotations to add to worker kubernetes service account. 513 | annotations: {} 514 | 515 | # Allow KEDA autoscaling. 516 | # Persistence.enabled must be set to false to use KEDA. 517 | keda: 518 | enabled: false 519 | namespaceLabels: {} 520 | 521 | # How often KEDA polls the airflow DB to report new scale requests to the HPA 522 | pollingInterval: 5 523 | 524 | # How many seconds KEDA will wait before scaling to zero. 525 | # Note that HPA has a separate cooldown period for scale-downs 526 | cooldownPeriod: 30 527 | 528 | # Minimum number of workers created by keda 529 | minReplicaCount: 0 530 | 531 | # Maximum number of workers created by keda 532 | maxReplicaCount: 10 533 | 534 | # Specify HPA related options 535 | advanced: {} 536 | # horizontalPodAutoscalerConfig: 537 | # behavior: 538 | # scaleDown: 539 | # stabilizationWindowSeconds: 300 540 | # policies: 541 | # - type: Percent 542 | # value: 100 543 | # periodSeconds: 15 544 | 545 | persistence: 546 | # Enable persistent volumes 547 | enabled: true 548 | # Volume size for worker StatefulSet 549 | size: 100Gi 550 | # If using a custom storageClass, pass name ref to all statefulSets here 551 | storageClassName: 552 | # Execute init container to chown log directory. 553 | # This is currently only needed in kind, due to usage 554 | # of local-path provisioner. 
555 | fixPermissions: false 556 | # Annotations to add to worker volumes 557 | annotations: {} 558 | # Detailed default security context for persistence for container level 559 | securityContexts: 560 | container: {} 561 | 562 | kerberosSidecar: 563 | # Enable kerberos sidecar 564 | enabled: false 565 | resources: {} 566 | # limits: 567 | # cpu: 100m 568 | # memory: 128Mi 569 | # requests: 570 | # cpu: 100m 571 | # memory: 128Mi 572 | # Detailed default security context for kerberosSidecar for container level 573 | securityContexts: 574 | container: {} 575 | 576 | resources: {} 577 | # limits: 578 | # cpu: 100m 579 | # memory: 128Mi 580 | # requests: 581 | # cpu: 100m 582 | # memory: 128Mi 583 | 584 | # Grace period for tasks to finish after SIGTERM is sent from kubernetes 585 | terminationGracePeriodSeconds: 600 586 | 587 | # This setting tells kubernetes that its ok to evict 588 | # when it wants to scale a node down. 589 | safeToEvict: true 590 | 591 | # Launch additional containers into worker. 592 | # Note: If used with KubernetesExecutor, you are responsible for signaling sidecars to exit when the main 593 | # container finishes so Airflow can continue the worker shutdown process! 594 | extraContainers: [] 595 | # Add additional init containers into workers. 596 | extraInitContainers: [] 597 | 598 | # Mount additional volumes into worker. It can be templated like in the following example: 599 | # extraVolumes: 600 | # - name: my-templated-extra-volume 601 | # secret: 602 | # secretName: '{{ include "my_secret_template" . }}' 603 | # defaultMode: 0640 604 | # optional: true 605 | # 606 | # extraVolumeMounts: 607 | # - name: my-templated-extra-volume 608 | # mountPath: "{{ .Values.my_custom_path }}" 609 | # readOnly: true 610 | extraVolumes: [] 611 | extraVolumeMounts: [] 612 | 613 | # Select certain nodes for airflow worker pods. 614 | nodeSelector: {} 615 | priorityClassName: ~ 616 | affinity: {} 617 | # default worker affinity is: 618 | # podAntiAffinity: 619 | # preferredDuringSchedulingIgnoredDuringExecution: 620 | # - podAffinityTerm: 621 | # labelSelector: 622 | # matchLabels: 623 | # component: worker 624 | # topologyKey: kubernetes.io/hostname 625 | # weight: 100 626 | tolerations: [] 627 | topologySpreadConstraints: [] 628 | # hostAliases to use in worker pods. 629 | # See: 630 | # https://kubernetes.io/docs/concepts/services-networking/add-entries-to-pod-etc-hosts-with-host-aliases/ 631 | hostAliases: [] 632 | # - ip: "127.0.0.2" 633 | # hostnames: 634 | # - "test.hostname.one" 635 | # - ip: "127.0.0.3" 636 | # hostnames: 637 | # - "test.hostname.two" 638 | 639 | # annotations for the worker resource 640 | annotations: {} 641 | 642 | podAnnotations: {} 643 | 644 | # Labels specific to workers objects and pods 645 | labels: {} 646 | 647 | logGroomerSidecar: 648 | # Whether to deploy the Airflow worker log groomer sidecar. 649 | enabled: true 650 | # Command to use when running the Airflow worker log groomer sidecar (templated). 651 | command: ~ 652 | # Args to use when running the Airflow worker log groomer sidecar (templated). 
653 | args: ["bash", "/clean-logs"] 654 | # Number of days to retain logs 655 | retentionDays: 15 656 | resources: {} 657 | # limits: 658 | # cpu: 100m 659 | # memory: 128Mi 660 | # requests: 661 | # cpu: 100m 662 | # memory: 128Mi 663 | # Detailed default security context for logGroomerSidecar for container level 664 | securityContexts: 665 | container: {} 666 | 667 | waitForMigrations: 668 | # Whether to create init container to wait for db migrations 669 | enabled: true 670 | env: [] 671 | # Detailed default security context for waitForMigrations for container level 672 | securityContexts: 673 | container: {} 674 | 675 | env: [] 676 | 677 | # Airflow scheduler settings 678 | scheduler: 679 | # hostAliases for the scheduler pod 680 | hostAliases: [] 681 | # - ip: "127.0.0.1" 682 | # hostnames: 683 | # - "foo.local" 684 | # - ip: "10.1.2.3" 685 | # hostnames: 686 | # - "foo.remote" 687 | 688 | # If the scheduler stops heartbeating for 5 minutes (5*60s), kill the 689 | # scheduler and let Kubernetes restart it 690 | livenessProbe: 691 | initialDelaySeconds: 10 692 | timeoutSeconds: 20 693 | failureThreshold: 5 694 | periodSeconds: 60 695 | command: ~ 696 | # Airflow 2.0 allows users to run multiple schedulers; 697 | # however, this feature is only recommended for MySQL 8+ and Postgres 698 | replicas: 1 699 | # Max number of old replicasets to retain 700 | revisionHistoryLimit: ~ 701 | 702 | # Command to use when running the Airflow scheduler (templated). 703 | command: ~ 704 | # Args to use when running the Airflow scheduler (templated). 705 | args: ["bash", "-c", "exec airflow scheduler"] 706 | 707 | # Update Strategy when scheduler is deployed as a StatefulSet 708 | # (when using LocalExecutor and workers.persistence) 709 | updateStrategy: ~ 710 | # Update Strategy when scheduler is deployed as a Deployment 711 | # (when not using LocalExecutor and workers.persistence) 712 | strategy: ~ 713 | 714 | # When not set, the values defined in the global securityContext will be used 715 | # (deprecated, use `securityContexts` instead) 716 | securityContext: {} 717 | # runAsUser: 50000 718 | # fsGroup: 0 719 | # runAsGroup: 0 720 | 721 | # Detailed default security context for scheduler deployments for container and pod level 722 | securityContexts: 723 | pod: {} 724 | container: {} 725 | 726 | # Create ServiceAccount 727 | serviceAccount: 728 | # Specifies whether a ServiceAccount should be created 729 | create: true 730 | # The name of the ServiceAccount to use. 731 | # If not set and create is true, a name is generated using the release name 732 | name: ~ 733 | 734 | # Annotations to add to scheduler kubernetes service account. 735 | annotations: {} 736 | 737 | # Scheduler pod disruption budget 738 | podDisruptionBudget: 739 | enabled: false 740 | 741 | # PDB configuration 742 | config: 743 | # minAvailable and maxUnavailable are mutually exclusive 744 | maxUnavailable: 1 745 | # minAvailable: 1 746 | 747 | resources: {} 748 | # limits: 749 | # cpu: 100m 750 | # memory: 128Mi 751 | # requests: 752 | # cpu: 100m 753 | # memory: 128Mi 754 | 755 | # This setting tells kubernetes that it's ok to evict 756 | # when it wants to scale a node down. 757 | safeToEvict: true 758 | 759 | # Launch additional containers into scheduler. 760 | extraContainers: [] 761 | # Add additional init containers into scheduler. 762 | extraInitContainers: [] 763 | 764 | # Mount additional volumes into scheduler.
It can be templated like in the following example: 765 | # extraVolumes: 766 | # - name: my-templated-extra-volume 767 | # secret: 768 | # secretName: '{{ include "my_secret_template" . }}' 769 | # defaultMode: 0640 770 | # optional: true 771 | # 772 | # extraVolumeMounts: 773 | # - name: my-templated-extra-volume 774 | # mountPath: "{{ .Values.my_custom_path }}" 775 | # readOnly: true 776 | extraVolumes: [] 777 | extraVolumeMounts: [] 778 | 779 | # Select certain nodes for airflow scheduler pods. 780 | nodeSelector: {} 781 | affinity: {} 782 | # default scheduler affinity is: 783 | # podAntiAffinity: 784 | # preferredDuringSchedulingIgnoredDuringExecution: 785 | # - podAffinityTerm: 786 | # labelSelector: 787 | # matchLabels: 788 | # component: scheduler 789 | # topologyKey: kubernetes.io/hostname 790 | # weight: 100 791 | tolerations: [] 792 | topologySpreadConstraints: [] 793 | 794 | priorityClassName: ~ 795 | 796 | # annotations for scheduler deployment 797 | annotations: {} 798 | 799 | podAnnotations: {} 800 | 801 | # Labels specific to scheduler objects and pods 802 | labels: {} 803 | 804 | logGroomerSidecar: 805 | # Whether to deploy the Airflow scheduler log groomer sidecar. 806 | enabled: true 807 | # Command to use when running the Airflow scheduler log groomer sidecar (templated). 808 | command: ~ 809 | # Args to use when running the Airflow scheduler log groomer sidecar (templated). 810 | args: ["bash", "/clean-logs"] 811 | # Number of days to retain logs 812 | retentionDays: 15 813 | resources: {} 814 | # limits: 815 | # cpu: 100m 816 | # memory: 128Mi 817 | # requests: 818 | # cpu: 100m 819 | # memory: 128Mi 820 | # Detailed default security context for logGroomerSidecar for container level 821 | securityContexts: 822 | container: {} 823 | 824 | waitForMigrations: 825 | # Whether to create init container to wait for db migrations 826 | enabled: true 827 | env: [] 828 | # Detailed default security context for waitForMigrations for container level 829 | securityContexts: 830 | container: {} 831 | 832 | env: [] 833 | 834 | # Airflow create user job settings 835 | createUserJob: 836 | # Limit the lifetime of the job object after it finishes execution. 837 | ttlSecondsAfterFinished: 300 838 | # Command to use when running the create user job (templated). 839 | command: ~ 840 | # Args to use when running the create user job (templated).
841 | args: 842 | - "bash" 843 | - "-c" 844 | # The format below is necessary to keep `helm lint` happy 845 | - |- 846 | exec \ 847 | airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "users create" "create_user" }} "$@" 848 | - -- 849 | - "-r" 850 | - "{{ .Values.webserver.defaultUser.role }}" 851 | - "-u" 852 | - "{{ .Values.webserver.defaultUser.username }}" 853 | - "-e" 854 | - "{{ .Values.webserver.defaultUser.email }}" 855 | - "-f" 856 | - "{{ .Values.webserver.defaultUser.firstName }}" 857 | - "-l" 858 | - "{{ .Values.webserver.defaultUser.lastName }}" 859 | - "-p" 860 | - "{{ .Values.webserver.defaultUser.password }}" 861 | 862 | # Annotations on the create user job pod 863 | annotations: {} 864 | # jobAnnotations are annotations on the create user job 865 | jobAnnotations: {} 866 | 867 | # Labels specific to createUserJob objects and pods 868 | labels: {} 869 | 870 | # When not set, the values defined in the global securityContext will be used 871 | securityContext: {} 872 | # runAsUser: 50000 873 | # fsGroup: 0 874 | # runAsGroup: 0 875 | 876 | # Detailed default security context for createUserJob for container and pod level 877 | securityContexts: 878 | pod: {} 879 | container: {} 880 | 881 | # Create ServiceAccount 882 | serviceAccount: 883 | # Specifies whether a ServiceAccount should be created 884 | create: true 885 | # The name of the ServiceAccount to use. 886 | # If not set and create is true, a name is generated using the release name 887 | name: ~ 888 | 889 | # Annotations to add to create user kubernetes service account. 890 | annotations: {} 891 | 892 | # Launch additional containers into user creation job 893 | extraContainers: [] 894 | 895 | # Mount additional volumes into user creation job. It can be templated like in the following example: 896 | # extraVolumes: 897 | # - name: my-templated-extra-volume 898 | # secret: 899 | # secretName: '{{ include "my_secret_template" . }}' 900 | # defaultMode: 0640 901 | # optional: true 902 | # 903 | # extraVolumeMounts: 904 | # - name: my-templated-extra-volume 905 | # mountPath: "{{ .Values.my_custom_path }}" 906 | # readOnly: true 907 | extraVolumes: [] 908 | extraVolumeMounts: [] 909 | 910 | nodeSelector: {} 911 | affinity: {} 912 | tolerations: [] 913 | topologySpreadConstraints: [] 914 | # Set this to false to disable the helm hooks that create the jobs after install, 915 | # e.g. if you are using ArgoCD 916 | useHelmHooks: true 917 | applyCustomEnv: true 918 | 919 | env: [] 920 | 921 | resources: {} 922 | # limits: 923 | # cpu: 100m 924 | # memory: 128Mi 925 | # requests: 926 | # cpu: 100m 927 | # memory: 128Mi 928 | 929 | # Airflow database migration job settings 930 | migrateDatabaseJob: 931 | enabled: true 932 | # Limit the lifetime of the job object after it finishes execution. 933 | ttlSecondsAfterFinished: 300 934 | # Command to use when running the migrate database job (templated). 935 | command: ~ 936 | # Args to use when running the migrate database job (templated).
937 | args: 938 | - "bash" 939 | - "-c" 940 | # The format below is necessary to keep `helm lint` happy 941 | - |- 942 | exec \ 943 | airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "db upgrade" "upgradedb" }} 944 | 945 | # Annotations on the database migration pod 946 | annotations: {} 947 | # jobAnnotations are annotations on the database migration job 948 | jobAnnotations: {} 949 | 950 | # When not set, the values defined in the global securityContext will be used 951 | securityContext: {} 952 | # runAsUser: 50000 953 | # fsGroup: 0 954 | # runAsGroup: 0 955 | 956 | # Detailed default security context for migrateDatabaseJob for container and pod level 957 | securityContexts: 958 | pod: {} 959 | container: {} 960 | 961 | # Create ServiceAccount 962 | serviceAccount: 963 | # Specifies whether a ServiceAccount should be created 964 | create: true 965 | # The name of the ServiceAccount to use. 966 | # If not set and create is true, a name is generated using the release name 967 | name: ~ 968 | 969 | # Annotations to add to migrate database job kubernetes service account. 970 | annotations: {} 971 | 972 | resources: {} 973 | # limits: 974 | # cpu: 100m 975 | # memory: 128Mi 976 | # requests: 977 | # cpu: 100m 978 | # memory: 128Mi 979 | 980 | # Launch additional containers into database migration job 981 | extraContainers: [] 982 | 983 | # Mount additional volumes into database migration job. It can be templated like in the following example: 984 | # extraVolumes: 985 | # - name: my-templated-extra-volume 986 | # secret: 987 | # secretName: '{{ include "my_secret_template" . }}' 988 | # defaultMode: 0640 989 | # optional: true 990 | # 991 | # extraVolumeMounts: 992 | # - name: my-templated-extra-volume 993 | # mountPath: "{{ .Values.my_custom_path }}" 994 | # readOnly: true 995 | extraVolumes: [] 996 | extraVolumeMounts: [] 997 | 998 | nodeSelector: {} 999 | affinity: {} 1000 | tolerations: [] 1001 | topologySpreadConstraints: [] 1002 | # Set this to false to disable the helm hooks that create the jobs after install, 1003 | # e.g. if you are using ArgoCD 1004 | useHelmHooks: true 1005 | applyCustomEnv: true 1006 | 1007 | # Airflow webserver settings 1008 | webserver: 1009 | # hostAliases for the webserver pod 1010 | hostAliases: [] 1011 | # - ip: "127.0.0.1" 1012 | # hostnames: 1013 | # - "foo.local" 1014 | # - ip: "10.1.2.3" 1015 | # hostnames: 1016 | # - "foo.remote" 1017 | allowPodLogReading: true 1018 | livenessProbe: 1019 | initialDelaySeconds: 15 1020 | timeoutSeconds: 5 1021 | failureThreshold: 5 1022 | periodSeconds: 10 1023 | scheme: HTTP 1024 | 1025 | readinessProbe: 1026 | initialDelaySeconds: 15 1027 | timeoutSeconds: 5 1028 | failureThreshold: 5 1029 | periodSeconds: 10 1030 | scheme: HTTP 1031 | 1032 | # Number of webservers 1033 | replicas: 1 1034 | # Max number of old replicasets to retain 1035 | revisionHistoryLimit: ~ 1036 | 1037 | # Command to use when running the Airflow webserver (templated). 1038 | command: ~ 1039 | # Args to use when running the Airflow webserver (templated). 1040 | args: ["bash", "-c", "exec airflow webserver"] 1041 | 1042 | # Create ServiceAccount 1043 | serviceAccount: 1044 | # Specifies whether a ServiceAccount should be created 1045 | create: true 1046 | # The name of the ServiceAccount to use. 1047 | # If not set and create is true, a name is generated using the release name 1048 | name: ~ 1049 | 1050 | # Annotations to add to webserver kubernetes service account.
1051 | annotations: {} 1052 | 1053 | # Webserver pod disruption budget 1054 | podDisruptionBudget: 1055 | enabled: false 1056 | 1057 | # PDB configuration 1058 | config: 1059 | # minAvailable and maxUnavailable are mutually exclusive 1060 | maxUnavailable: 1 1061 | # minAvailable: 1 1062 | 1063 | # Allow overriding Update Strategy for Webserver 1064 | strategy: ~ 1065 | 1066 | # When not set, the values defined in the global securityContext will be used 1067 | # (deprecated, use `securityContexts` instead) 1068 | securityContext: {} 1069 | # runAsUser: 50000 1070 | # fsGroup: 0 1071 | # runAsGroup: 0 1072 | 1073 | # Detailed default security contexts for webserver deployments for container and pod level 1074 | securityContexts: 1075 | pod: {} 1076 | container: {} 1077 | 1078 | # Additional network policies as needed (Deprecated - renamed to `webserver.networkPolicy.ingress.from`) 1079 | extraNetworkPolicies: [] 1080 | networkPolicy: 1081 | ingress: 1082 | # Peers for webserver NetworkPolicy ingress 1083 | from: [] 1084 | # Ports for webserver NetworkPolicy ingress (if `from` is set) 1085 | ports: 1086 | - port: "{{ .Values.ports.airflowUI }}" 1087 | 1088 | resources: {} 1089 | # limits: 1090 | # cpu: 100m 1091 | # memory: 128Mi 1092 | # requests: 1093 | # cpu: 100m 1094 | # memory: 128Mi 1095 | 1096 | # Create initial user. 1097 | defaultUser: 1098 | enabled: true 1099 | role: Admin 1100 | username: admin 1101 | email: admin@example.com 1102 | firstName: admin 1103 | lastName: user 1104 | password: admin 1105 | 1106 | # Launch additional containers into webserver. 1107 | extraContainers: [] 1108 | # Add additional init containers into webserver. 1109 | extraInitContainers: [] 1110 | 1111 | # Mount additional volumes into webserver. It can be templated like in the following example: 1112 | # extraVolumes: 1113 | # - name: my-templated-extra-volume 1114 | # secret: 1115 | # secretName: '{{ include "my_secret_template" . }}' 1116 | # defaultMode: 0640 1117 | # optional: true 1118 | # 1119 | # extraVolumeMounts: 1120 | # - name: my-templated-extra-volume 1121 | # mountPath: "{{ .Values.my_custom_path }}" 1122 | # readOnly: true 1123 | extraVolumes: [] 1124 | extraVolumeMounts: [] 1125 | 1126 | # This string (can be templated) will be mounted into the Airflow Webserver 1127 | # as a custom webserver_config.py. You can bake a webserver_config.py into 1128 | # your image instead or specify a configmap containing the 1129 | # webserver_config.py. 1130 | webserverConfig: ~ 1131 | # webserverConfig: | 1132 | # from airflow import configuration as conf 1133 | 1134 | # # The SQLAlchemy connection string. 1135 | # SQLALCHEMY_DATABASE_URI = conf.get('database', 'SQL_ALCHEMY_CONN') 1136 | 1137 | # # Flask-WTF flag for CSRF 1138 | # CSRF_ENABLED = True 1139 | webserverConfigConfigMapName: ~ 1140 | 1141 | service: 1142 | type: LoadBalancer 1143 | ## service annotations 1144 | annotations: {} 1145 | ports: 1146 | - name: airflow-ui 1147 | port: "{{ .Values.ports.airflowUI }}" 1148 | # To change the port used to access the webserver: 1149 | # ports: 1150 | # - name: airflow-ui 1151 | # port: 80 1152 | # targetPort: airflow-ui 1153 | # To only expose a sidecar, not the webserver directly: 1154 | # ports: 1155 | # - name: only_sidecar 1156 | # port: 80 1157 | # targetPort: 8888 1158 | # If you have a public IP, set NodePort to set an external port.
1159 | # Service type must be 'NodePort': 1160 | # ports: 1161 | # - name: airflow-ui 1162 | # port: 8080 1163 | # targetPort: 8080 1164 | # nodePort: 31151 1165 | loadBalancerIP: ~ 1166 | ## Limit load balancer source IPs to list of CIDRs 1167 | # loadBalancerSourceRanges: 1168 | # - "10.123.0.0/16" 1169 | loadBalancerSourceRanges: [] 1170 | 1171 | # Select certain nodes for airflow webserver pods. 1172 | nodeSelector: {} 1173 | priorityClassName: ~ 1174 | affinity: {} 1175 | # default webserver affinity is: 1176 | # podAntiAffinity: 1177 | # preferredDuringSchedulingIgnoredDuringExecution: 1178 | # - podAffinityTerm: 1179 | # labelSelector: 1180 | # matchLabels: 1181 | # component: webserver 1182 | # topologyKey: kubernetes.io/hostname 1183 | # weight: 100 1184 | tolerations: [] 1185 | topologySpreadConstraints: [] 1186 | 1187 | # annotations for webserver deployment 1188 | annotations: {} 1189 | 1190 | podAnnotations: {} 1191 | 1192 | # Labels specific to the webserver app 1193 | labels: {} 1194 | 1195 | waitForMigrations: 1196 | # Whether to create init container to wait for db migrations 1197 | enabled: true 1198 | env: [] 1199 | # Detailed default security context for waitForMigrations for container level 1200 | securityContexts: 1201 | container: {} 1202 | 1203 | env: [] 1204 | 1205 | # Airflow Triggerer Config 1206 | triggerer: 1207 | enabled: true 1208 | # Number of airflow triggerers in the deployment 1209 | replicas: 1 1210 | # Max number of old replicasets to retain 1211 | revisionHistoryLimit: ~ 1212 | 1213 | # Command to use when running Airflow triggerers (templated). 1214 | command: ~ 1215 | # Args to use when running Airflow triggerer (templated). 1216 | args: ["bash", "-c", "exec airflow triggerer"] 1217 | 1218 | # Update Strategy when triggerer is deployed as a StatefulSet 1219 | updateStrategy: ~ 1220 | # Update Strategy when triggerer is deployed as a Deployment 1221 | strategy: 1222 | rollingUpdate: 1223 | maxSurge: "100%" 1224 | maxUnavailable: "50%" 1225 | 1226 | # If the triggerer stops heartbeating for 5 minutes (5*60s), kill the 1227 | # triggerer and let Kubernetes restart it 1228 | livenessProbe: 1229 | initialDelaySeconds: 10 1230 | timeoutSeconds: 20 1231 | failureThreshold: 5 1232 | periodSeconds: 60 1233 | command: ~ 1234 | 1235 | # Create ServiceAccount 1236 | serviceAccount: 1237 | # Specifies whether a ServiceAccount should be created 1238 | create: true 1239 | # The name of the ServiceAccount to use. 1240 | # If not set and create is true, a name is generated using the release name 1241 | name: ~ 1242 | 1243 | # Annotations to add to triggerer kubernetes service account. 1244 | annotations: {} 1245 | 1246 | # When not set, the values defined in the global securityContext will be used 1247 | securityContext: {} 1248 | # runAsUser: 50000 1249 | # fsGroup: 0 1250 | # runAsGroup: 0 1251 | 1252 | # Detailed default security context for triggerer for container and pod level 1253 | securityContexts: 1254 | pod: {} 1255 | container: {} 1256 | persistence: 1257 | # Enable persistent volumes 1258 | enabled: true 1259 | # Volume size for triggerer StatefulSet 1260 | size: 100Gi 1261 | # If using a custom storageClass, pass name ref to all statefulSets here 1262 | storageClassName: 1263 | # Execute init container to chown log directory. 1264 | # This is currently only needed in kind, due to usage 1265 | # of the local-path provisioner.
1266 | fixPermissions: false 1267 | # Annotations to add to triggerer volumes 1268 | annotations: {} 1269 | 1270 | resources: {} 1271 | # limits: 1272 | # cpu: 100m 1273 | # memory: 128Mi 1274 | # requests: 1275 | # cpu: 100m 1276 | # memory: 128Mi 1277 | 1278 | # Grace period for triggerer to finish after SIGTERM is sent from kubernetes 1279 | terminationGracePeriodSeconds: 60 1280 | 1281 | # This setting tells kubernetes that it's ok to evict 1282 | # when it wants to scale a node down. 1283 | safeToEvict: true 1284 | 1285 | # Launch additional containers into triggerer. 1286 | extraContainers: [] 1287 | # Add additional init containers into triggerers. 1288 | extraInitContainers: [] 1289 | 1290 | # Mount additional volumes into triggerer. It can be templated like in the following example: 1291 | # extraVolumes: 1292 | # - name: my-templated-extra-volume 1293 | # secret: 1294 | # secretName: '{{ include "my_secret_template" . }}' 1295 | # defaultMode: 0640 1296 | # optional: true 1297 | # 1298 | # extraVolumeMounts: 1299 | # - name: my-templated-extra-volume 1300 | # mountPath: "{{ .Values.my_custom_path }}" 1301 | # readOnly: true 1302 | extraVolumes: [] 1303 | extraVolumeMounts: [] 1304 | 1305 | # Select certain nodes for airflow triggerer pods. 1306 | nodeSelector: {} 1307 | affinity: {} 1308 | # default triggerer affinity is: 1309 | # podAntiAffinity: 1310 | # preferredDuringSchedulingIgnoredDuringExecution: 1311 | # - podAffinityTerm: 1312 | # labelSelector: 1313 | # matchLabels: 1314 | # component: triggerer 1315 | # topologyKey: kubernetes.io/hostname 1316 | # weight: 100 1317 | tolerations: [] 1318 | topologySpreadConstraints: [] 1319 | 1320 | priorityClassName: ~ 1321 | 1322 | # annotations for the triggerer deployment 1323 | annotations: {} 1324 | 1325 | podAnnotations: {} 1326 | 1327 | # Labels specific to triggerer objects and pods 1328 | labels: {} 1329 | 1330 | logGroomerSidecar: 1331 | # Whether to deploy the Airflow triggerer log groomer sidecar. 1332 | enabled: true 1333 | # Command to use when running the Airflow triggerer log groomer sidecar (templated). 1334 | command: ~ 1335 | # Args to use when running the Airflow triggerer log groomer sidecar (templated). 1336 | args: ["bash", "/clean-logs"] 1337 | # Number of days to retain logs 1338 | retentionDays: 15 1339 | resources: {} 1340 | # limits: 1341 | # cpu: 100m 1342 | # memory: 128Mi 1343 | # requests: 1344 | # cpu: 100m 1345 | # memory: 128Mi 1346 | # Detailed default security context for logGroomerSidecar for container level 1347 | securityContexts: 1348 | container: {} 1349 | 1350 | waitForMigrations: 1351 | # Whether to create init container to wait for db migrations 1352 | enabled: true 1353 | env: [] 1354 | # Detailed default security context for waitForMigrations for container level 1355 | securityContexts: 1356 | container: {} 1357 | 1358 | env: [] 1359 | 1360 | # Airflow Dag Processor Config 1361 | dagProcessor: 1362 | enabled: false 1363 | # Number of airflow dag processors in the deployment 1364 | replicas: 1 1365 | # Max number of old replicasets to retain 1366 | revisionHistoryLimit: ~ 1367 | 1368 | # Command to use when running Airflow dag processors (templated). 1369 | command: ~ 1370 | # Args to use when running Airflow dag processor (templated).
1371 | args: ["bash", "-c", "exec airflow dag-processor"] 1372 | 1373 | # Update Strategy for dag processors 1374 | strategy: 1375 | rollingUpdate: 1376 | maxSurge: "100%" 1377 | maxUnavailable: "50%" 1378 | 1379 | # If the dag processor stops heartbeating for 5 minutes (5*60s), kill the 1380 | # dag processor and let Kubernetes restart it 1381 | livenessProbe: 1382 | initialDelaySeconds: 10 1383 | timeoutSeconds: 20 1384 | failureThreshold: 5 1385 | periodSeconds: 60 1386 | command: ~ 1387 | 1388 | # Create ServiceAccount 1389 | serviceAccount: 1390 | # Specifies whether a ServiceAccount should be created 1391 | create: true 1392 | # The name of the ServiceAccount to use. 1393 | # If not set and create is true, a name is generated using the release name 1394 | name: ~ 1395 | 1396 | # Annotations to add to dag processor kubernetes service account. 1397 | annotations: {} 1398 | 1399 | # When not set, the values defined in the global securityContext will be used 1400 | securityContext: {} 1401 | # runAsUser: 50000 1402 | # fsGroup: 0 1403 | # runAsGroup: 0 1404 | 1405 | # Detailed default security context for dagProcessor for container and pod level 1406 | securityContexts: 1407 | pod: {} 1408 | container: {} 1409 | 1410 | resources: {} 1411 | # limits: 1412 | # cpu: 100m 1413 | # memory: 128Mi 1414 | # requests: 1415 | # cpu: 100m 1416 | # memory: 128Mi 1417 | 1418 | # Grace period for dag processor to finish after SIGTERM is sent from kubernetes 1419 | terminationGracePeriodSeconds: 60 1420 | 1421 | # This setting tells kubernetes that it's ok to evict 1422 | # when it wants to scale a node down. 1423 | safeToEvict: true 1424 | 1425 | # Launch additional containers into dag processor. 1426 | extraContainers: [] 1427 | # Add additional init containers into dag processors. 1428 | extraInitContainers: [] 1429 | 1430 | # Mount additional volumes into dag processor. It can be templated like in the following example: 1431 | # extraVolumes: 1432 | # - name: my-templated-extra-volume 1433 | # secret: 1434 | # secretName: '{{ include "my_secret_template" . }}' 1435 | # defaultMode: 0640 1436 | # optional: true 1437 | # 1438 | # extraVolumeMounts: 1439 | # - name: my-templated-extra-volume 1440 | # mountPath: "{{ .Values.my_custom_path }}" 1441 | # readOnly: true 1442 | extraVolumes: [] 1443 | extraVolumeMounts: [] 1444 | 1445 | # Select certain nodes for airflow dag processor pods. 1446 | nodeSelector: {} 1447 | affinity: {} 1448 | # default dag processor affinity is: 1449 | # podAntiAffinity: 1450 | # preferredDuringSchedulingIgnoredDuringExecution: 1451 | # - podAffinityTerm: 1452 | # labelSelector: 1453 | # matchLabels: 1454 | # component: dag-processor 1455 | # topologyKey: kubernetes.io/hostname 1456 | # weight: 100 1457 | tolerations: [] 1458 | topologySpreadConstraints: [] 1459 | 1460 | priorityClassName: ~ 1461 | 1462 | # annotations for the dag processor deployment 1463 | annotations: {} 1464 | 1465 | podAnnotations: {} 1466 | 1467 | logGroomerSidecar: 1468 | # Whether to deploy the Airflow dag processor log groomer sidecar. 1469 | enabled: true 1470 | # Command to use when running the Airflow dag processor log groomer sidecar (templated). 1471 | command: ~ 1472 | # Args to use when running the Airflow dag processor log groomer sidecar (templated).
1473 | args: ["bash", "/clean-logs"] 1474 | # Number of days to retain logs 1475 | retentionDays: 15 1476 | resources: {} 1477 | # limits: 1478 | # cpu: 100m 1479 | # memory: 128Mi 1480 | # requests: 1481 | # cpu: 100m 1482 | # memory: 128Mi 1483 | 1484 | waitForMigrations: 1485 | # Whether to create init container to wait for db migrations 1486 | enabled: true 1487 | env: [] 1488 | 1489 | env: [] 1490 | 1491 | # Flower settings 1492 | flower: 1493 | # Enable flower. 1494 | # If true and using CeleryExecutor/CeleryKubernetesExecutor, the flower app will be deployed. 1495 | enabled: false 1496 | # Max number of old replicasets to retain 1497 | revisionHistoryLimit: ~ 1498 | 1499 | # Command to use when running flower (templated). 1500 | command: ~ 1501 | # Args to use when running flower (templated). 1502 | args: 1503 | - "bash" 1504 | - "-c" 1505 | # The format below is necessary to keep `helm lint` happy 1506 | - |- 1507 | exec \ 1508 | airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "celery flower" "flower" }} 1509 | 1510 | # Additional network policies as needed (Deprecated - renamed to `flower.networkPolicy.ingress.from`) 1511 | extraNetworkPolicies: [] 1512 | networkPolicy: 1513 | ingress: 1514 | # Peers for flower NetworkPolicy ingress 1515 | from: [] 1516 | # Ports for flower NetworkPolicy ingress (if `from` is set) 1517 | ports: 1518 | - port: "{{ .Values.ports.flowerUI }}" 1519 | 1520 | resources: {} 1521 | # limits: 1522 | # cpu: 100m 1523 | # memory: 128Mi 1524 | # requests: 1525 | # cpu: 100m 1526 | # memory: 128Mi 1527 | 1528 | # When not set, the values defined in the global securityContext will be used 1529 | securityContext: {} 1530 | # runAsUser: 50000 1531 | # fsGroup: 0 1532 | # runAsGroup: 0 1533 | 1534 | # Detailed default security context for flower for container and pod level 1535 | securityContexts: 1536 | pod: {} 1537 | container: {} 1538 | 1539 | # Create ServiceAccount 1540 | serviceAccount: 1541 | # Specifies whether a ServiceAccount should be created 1542 | create: true 1543 | # The name of the ServiceAccount to use. 1544 | # If not set and create is true, a name is generated using the release name 1545 | name: ~ 1546 | 1547 | # Annotations to add to flower kubernetes service account. 1548 | annotations: {} 1549 | 1550 | # A secret containing the connection 1551 | secretName: ~ 1552 | 1553 | # Else, if username and password are set, create secret from username and password 1554 | username: ~ 1555 | password: ~ 1556 | 1557 | service: 1558 | type: ClusterIP 1559 | ## service annotations 1560 | annotations: {} 1561 | ports: 1562 | - name: flower-ui 1563 | port: "{{ .Values.ports.flowerUI }}" 1564 | # To change the port used to access flower: 1565 | # ports: 1566 | # - name: flower-ui 1567 | # port: 8080 1568 | # targetPort: flower-ui 1569 | loadBalancerIP: ~ 1570 | ## Limit load balancer source IPs to list of CIDRs 1571 | # loadBalancerSourceRanges: 1572 | # - "10.123.0.0/16" 1573 | loadBalancerSourceRanges: [] 1574 | 1575 | # Launch additional containers into the flower pods. 1576 | extraContainers: [] 1577 | # Mount additional volumes into the flower pods. It can be templated like in the following example: 1578 | # extraVolumes: 1579 | # - name: my-templated-extra-volume 1580 | # secret: 1581 | # secretName: '{{ include "my_secret_template" . 
}}' 1582 | # defaultMode: 0640 1583 | # optional: true 1584 | # 1585 | # extraVolumeMounts: 1586 | # - name: my-templated-extra-volume 1587 | # mountPath: "{{ .Values.my_custom_path }}" 1588 | # readOnly: true 1589 | extraVolumes: [] 1590 | extraVolumeMounts: [] 1591 | 1592 | # Select certain nodes for airflow flower pods. 1593 | nodeSelector: {} 1594 | affinity: {} 1595 | tolerations: [] 1596 | topologySpreadConstraints: [] 1597 | 1598 | priorityClassName: ~ 1599 | 1600 | # annotations for the flower deployment 1601 | annotations: {} 1602 | 1603 | podAnnotations: {} 1604 | 1605 | # Labels specific to flower objects and pods 1606 | labels: {} 1607 | env: [] 1608 | 1609 | # StatsD settings 1610 | statsd: 1611 | enabled: true 1612 | # Max number of old replicasets to retain 1613 | revisionHistoryLimit: ~ 1614 | 1615 | # Arguments for StatsD exporter command. 1616 | args: ["--statsd.mapping-config=/etc/statsd-exporter/mappings.yml"] 1617 | 1618 | # Annotations to add to the StatsD Deployment. 1619 | annotations: {} 1620 | 1621 | # Create ServiceAccount 1622 | serviceAccount: 1623 | # Specifies whether a ServiceAccount should be created 1624 | create: true 1625 | # The name of the ServiceAccount to use. 1626 | # If not set and create is true, a name is generated using the release name 1627 | name: ~ 1628 | 1629 | # Annotations to add to statsd kubernetes service account. 1630 | annotations: {} 1631 | 1632 | uid: 65534 1633 | # When not set, `statsd.uid` will be used 1634 | 1635 | # (deprecated, use `securityContexts` instead) 1636 | securityContext: {} 1637 | # runAsUser: 65534 1638 | # fsGroup: 0 1639 | # runAsGroup: 0 1640 | 1641 | # Detailed default security context for statsd deployments for container and pod level 1642 | securityContexts: 1643 | pod: {} 1644 | container: {} 1645 | 1646 | # Additional network policies as needed 1647 | extraNetworkPolicies: [] 1648 | resources: {} 1649 | # limits: 1650 | # cpu: 100m 1651 | # memory: 128Mi 1652 | # requests: 1653 | # cpu: 100m 1654 | # memory: 128Mi 1655 | 1656 | service: 1657 | extraAnnotations: {} 1658 | 1659 | # Select certain nodes for StatsD pods. 1660 | nodeSelector: {} 1661 | affinity: {} 1662 | tolerations: [] 1663 | topologySpreadConstraints: [] 1664 | 1665 | priorityClassName: ~ 1666 | 1667 | # Additional mappings for StatsD exporter. 1668 | # If set, these are merged with the default mappings, with the default mappings taking priority. 1669 | # To change a default mapping, use `overrideMappings` instead. 1670 | extraMappings: [] 1671 | 1672 | # Override mappings for StatsD exporter. 1673 | # If set, the default mappings and `extraMappings` are ignored entirely, 1674 | # so make sure it contains every mapping you need. 1675 | overrideMappings: [] 1676 | 1677 | podAnnotations: {} 1678 | 1679 | # PgBouncer settings 1680 | pgbouncer: 1681 | # Enable PgBouncer 1682 | enabled: false 1683 | # Number of PgBouncer replicas to run in Deployment 1684 | replicas: 1 1685 | # Max number of old replicasets to retain 1686 | revisionHistoryLimit: ~ 1687 | # Command to use for PgBouncer (templated). 1688 | command: ["pgbouncer", "-u", "nobody", "/etc/pgbouncer/pgbouncer.ini"] 1689 | # Args to use for PgBouncer (templated).
1690 | args: ~ 1691 | auth_type: md5 1692 | auth_file: /etc/pgbouncer/users.txt 1693 | 1694 | # annotations to be added to the PgBouncer deployment 1695 | annotations: {} 1696 | 1697 | podAnnotations: {} 1698 | 1699 | # Create ServiceAccount 1700 | serviceAccount: 1701 | # Specifies whether a ServiceAccount should be created 1702 | create: true 1703 | # The name of the ServiceAccount to use. 1704 | # If not set and create is true, a name is generated using the release name 1705 | name: ~ 1706 | 1707 | # Annotations to add to pgbouncer kubernetes service account. 1708 | annotations: {} 1709 | 1710 | # Additional network policies as needed 1711 | extraNetworkPolicies: [] 1712 | 1713 | # Pool sizes 1714 | metadataPoolSize: 10 1715 | resultBackendPoolSize: 5 1716 | 1717 | # Maximum clients that can connect to PgBouncer (higher = more file descriptors) 1718 | maxClientConn: 100 1719 | 1720 | # Supply the name of an existing secret with pgbouncer.ini and users.txt defined; 1721 | # you can load them into a k8s secret like the one below 1722 | # apiVersion: v1 1723 | # kind: Secret 1724 | # metadata: 1725 | # name: pgbouncer-config-secret 1726 | # data: 1727 | # pgbouncer.ini: <base64-encoded pgbouncer.ini content> 1728 | # users.txt: <base64-encoded users.txt content> 1729 | # type: Opaque 1730 | # 1731 | # configSecretName: pgbouncer-config-secret 1732 | # 1733 | configSecretName: ~ 1734 | 1735 | # PgBouncer pod disruption budget 1736 | podDisruptionBudget: 1737 | enabled: false 1738 | 1739 | # PDB configuration 1740 | config: 1741 | # minAvailable and maxUnavailable are mutually exclusive 1742 | maxUnavailable: 1 1743 | # minAvailable: 1 1744 | 1745 | # Limit the resources to PgBouncer. 1746 | # When you specify the resource request, the k8s scheduler uses this information to decide which node to 1747 | # place the Pod on. When you specify a resource limit for a Container, the kubelet enforces those limits so 1748 | # that the running container is not allowed to use more of that resource than the limit you set. 1749 | # See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ 1750 | # Example: 1751 | # 1752 | # resources: 1753 | # limits: 1754 | # cpu: 100m 1755 | # memory: 128Mi 1756 | # requests: 1757 | # cpu: 100m 1758 | # memory: 128Mi 1759 | resources: {} 1760 | 1761 | service: 1762 | extraAnnotations: {} 1763 | 1764 | # https://www.pgbouncer.org/config.html 1765 | verbose: 0 1766 | logDisconnections: 0 1767 | logConnections: 0 1768 | 1769 | sslmode: "prefer" 1770 | ciphers: "normal" 1771 | 1772 | ssl: 1773 | ca: ~ 1774 | cert: ~ 1775 | key: ~ 1776 | 1777 | # Add extra PgBouncer ini configuration in the databases section: 1778 | # https://www.pgbouncer.org/config.html#section-databases 1779 | extraIniMetadata: ~ 1780 | extraIniResultBackend: ~ 1781 | # Add extra general PgBouncer ini configuration: https://www.pgbouncer.org/config.html 1782 | extraIni: ~ 1783 | 1784 | # Mount additional volumes into pgbouncer. It can be templated like in the following example: 1785 | # extraVolumes: 1786 | # - name: my-templated-extra-volume 1787 | # secret: 1788 | # secretName: '{{ include "my_secret_template" . }}' 1789 | # defaultMode: 0640 1790 | # optional: true 1791 | # 1792 | # extraVolumeMounts: 1793 | # - name: my-templated-extra-volume 1794 | # mountPath: "{{ .Values.my_custom_path }}" 1795 | # readOnly: true 1796 | extraVolumes: [] 1797 | extraVolumeMounts: [] 1798 |
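# For the `extraIni*` settings above, a hedged example of passing one extra
# general pgbouncer.ini option (server_idle_timeout is a standard PgBouncer
# parameter; the value chosen here is an assumption, not a repo default):
# extraIni: |
#   server_idle_timeout = 60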
1799 | # Select certain nodes for PgBouncer pods. 1800 | nodeSelector: {} 1801 | affinity: {} 1802 | tolerations: [] 1803 | topologySpreadConstraints: [] 1804 | 1805 | priorityClassName: ~ 1806 | 1807 | uid: 65534 1808 | 1809 | # Detailed default security context for pgbouncer for container level 1810 | securityContexts: 1811 | container: {} 1812 | 1813 | metricsExporterSidecar: 1814 | resources: {} 1815 | # limits: 1816 | # cpu: 100m 1817 | # memory: 128Mi 1818 | # requests: 1819 | # cpu: 100m 1820 | # memory: 128Mi 1821 | sslmode: "disable" 1822 | 1823 | # Detailed default security context for metricsExporterSidecar for container level 1824 | securityContexts: 1825 | container: {} 1826 | 1827 | livenessProbe: 1828 | initialDelaySeconds: 10 1829 | periodSeconds: 10 1830 | timeoutSeconds: 1 1831 | 1832 | readinessProbe: 1833 | initialDelaySeconds: 10 1834 | periodSeconds: 10 1835 | timeoutSeconds: 1 1836 | 1837 | # Configuration for the redis provisioned by the chart 1838 | redis: 1839 | enabled: true 1840 | terminationGracePeriodSeconds: 600 1841 | 1842 | # Create ServiceAccount 1843 | serviceAccount: 1844 | # Specifies whether a ServiceAccount should be created 1845 | create: true 1846 | # The name of the ServiceAccount to use. 1847 | # If not set and create is true, a name is generated using the release name 1848 | name: ~ 1849 | 1850 | # Annotations to add to redis kubernetes service account. 1851 | annotations: {} 1852 | 1853 | persistence: 1854 | # Enable persistent volumes 1855 | enabled: true 1856 | # Volume size for redis StatefulSet 1857 | size: 1Gi 1858 | # If using a custom storageClass, pass name ref to all statefulSets here 1859 | storageClassName: 1860 | # Annotations to add to redis volumes 1861 | annotations: {} 1862 | 1863 | resources: {} 1864 | # limits: 1865 | # cpu: 100m 1866 | # memory: 128Mi 1867 | # requests: 1868 | # cpu: 100m 1869 | # memory: 128Mi 1870 | 1871 | # If set, used as the redis secret. Make sure to also set the data.brokerUrlSecretName value. 1872 | passwordSecretName: ~ 1873 | 1874 | # Else, if password is set, create a secret with it; 1875 | # otherwise a new password will be generated on install 1876 | # Note: password can only be set during install, not upgrade. 1877 | password: ~ 1878 | 1879 | # This setting tells kubernetes that it's ok to evict 1880 | # when it wants to scale a node down. 1881 | safeToEvict: true 1882 |
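# A hedged illustration of `passwordSecretName` above: point the chart at a
# pre-created secret instead of letting it generate a password. The secret
# name and key layout shown are assumptions, and per the comment above the
# data.brokerUrlSecretName value must then be set as well:
# apiVersion: v1
# kind: Secret
# metadata:
#   name: redis-password-secret
# type: Opaque
# stringData:
#   password: change-me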
1883 | # Select certain nodes for redis pods. 1884 | nodeSelector: {} 1885 | affinity: {} 1886 | tolerations: [] 1887 | topologySpreadConstraints: [] 1888 | 1889 | # Set to 0 for backwards-compatibility 1890 | uid: 0 1891 | # If not set, `redis.uid` will be used 1892 | securityContext: {} 1893 | # runAsUser: 999 1894 | # runAsGroup: 0 1895 | 1896 | # Detailed default security context for redis for container and pod level 1897 | securityContexts: 1898 | pod: {} 1899 | container: {} 1900 | 1901 | podAnnotations: {} 1902 | # Auth secret for a private registry 1903 | # This is used if pulling airflow images from a private registry 1904 | registry: 1905 | secretName: ~ 1906 | 1907 | # Example: 1908 | # connection: 1909 | # user: ~ 1910 | # pass: ~ 1911 | # host: ~ 1912 | # email: ~ 1913 | connection: {} 1914 | 1915 | # Elasticsearch logging configuration 1916 | elasticsearch: 1917 | # Enable elasticsearch task logging 1918 | enabled: false 1919 | # A secret containing the connection 1920 | secretName: ~ 1921 | # Or an object representing the connection 1922 | # Example: 1923 | # connection: 1924 | # user: ~ 1925 | # pass: ~ 1926 | # host: ~ 1927 | # port: ~ 1928 | connection: {} 1929 | 1930 | # All ports used by chart 1931 | ports: 1932 | flowerUI: 5555 1933 | airflowUI: 8080 1934 | workerLogs: 8793 1935 | triggererLogs: 8794 1936 | redisDB: 6379 1937 | statsdIngest: 9125 1938 | statsdScrape: 9102 1939 | pgbouncer: 6543 1940 | pgbouncerScrape: 9127 1941 | 1942 | # Define any ResourceQuotas for namespace 1943 | quotas: {} 1944 | 1945 | # Define default/max/min values for pods and containers in namespace 1946 | limits: [] 1947 | 1948 | # This runs as a CronJob to clean up old pods. 1949 | cleanup: 1950 | enabled: false 1951 | # Run every 15 minutes 1952 | schedule: "*/15 * * * *" 1953 | # Command to use when running the cleanup cronjob (templated). 1954 | command: ~ 1955 | # Args to use when running the cleanup cronjob (templated). 1956 | args: ["bash", "-c", "exec airflow kubernetes cleanup-pods --namespace={{ .Release.Namespace }}"] 1957 | 1958 | # jobAnnotations are annotations on the cleanup CronJob 1959 | jobAnnotations: {} 1960 | 1961 | # Select certain nodes for airflow cleanup pods. 1962 | nodeSelector: {} 1963 | affinity: {} 1964 | tolerations: [] 1965 | topologySpreadConstraints: [] 1966 | 1967 | podAnnotations: {} 1968 | 1969 | # Labels specific to cleanup objects and pods 1970 | labels: {} 1971 | 1972 | resources: {} 1973 | # limits: 1974 | # cpu: 100m 1975 | # memory: 128Mi 1976 | # requests: 1977 | # cpu: 100m 1978 | # memory: 128Mi 1979 | 1980 | # Create ServiceAccount 1981 | serviceAccount: 1982 | # Specifies whether a ServiceAccount should be created 1983 | create: true 1984 | # The name of the ServiceAccount to use. 1985 | # If not set and create is true, a name is generated using the release name 1986 | name: ~ 1987 | 1988 | # Annotations to add to cleanup cronjob kubernetes service account. 1989 | annotations: {} 1990 | 1991 | # When not set, the values defined in the global securityContext will be used 1992 | securityContext: {} 1993 | # runAsUser: 50000 1994 | # runAsGroup: 0 1995 | env: [] 1996 | 1997 | # Detailed default security context for cleanup for container level 1998 | securityContexts: 1999 | container: {} 2000 | 2001 | # Specify history limit 2002 | # When set, overrides the default number of successful and failed CronJob executions that k8s saves.
2003 | failedJobsHistoryLimit: ~ 2004 | successfulJobsHistoryLimit: ~ 2005 | 2006 | # Configuration for postgresql subchart 2007 | # Not recommended for production 2008 | postgresql: 2009 | enabled: true 2010 | image: 2011 | tag: "11" 2012 | auth: 2013 | enablePostgresUser: true 2014 | postgresPassword: postgres 2015 | username: "" 2016 | password: "" 2017 | 2018 | # Config settings to go into the mounted airflow.cfg 2019 | # 2020 | # Please note that these values are passed through the `tpl` function, so are 2021 | # all subject to being rendered as go templates. If you need to include a 2022 | # literal `{{` in a value, it must be expressed like this: 2023 | # 2024 | # a: '{{ "{{ not a template }}" }}' 2025 | # 2026 | # Do not set config containing secrets via plain text values, use Env Var or k8s secret object 2027 | # yamllint disable rule:line-length 2028 | config: 2029 | core: 2030 | dags_folder: '{{ include "airflow_dags" . }}' 2031 | # This is ignored when used with the official Docker image 2032 | load_examples: 'False' 2033 | executor: '{{ .Values.executor }}' 2034 | # For Airflow 1.10, backward compatibility; moved to [logging] in 2.0 2035 | colored_console_log: 'False' 2036 | remote_logging: '{{- ternary "True" "False" .Values.elasticsearch.enabled }}' 2037 | logging: 2038 | remote_logging: '{{- ternary "True" "False" .Values.elasticsearch.enabled }}' 2039 | colored_console_log: 'False' 2040 | metrics: 2041 | statsd_on: '{{ ternary "True" "False" .Values.statsd.enabled }}' 2042 | statsd_port: 9125 2043 | statsd_prefix: airflow 2044 | statsd_host: '{{ printf "%s-statsd" .Release.Name }}' 2045 | webserver: 2046 | enable_proxy_fix: 'True' 2047 | # For Airflow 1.10 2048 | rbac: 'True' 2049 | celery: 2050 | flower_url_prefix: '{{ .Values.ingress.flower.path }}' 2051 | worker_concurrency: 16 2052 | scheduler: 2053 | standalone_dag_processor: '{{ ternary "True" "False" .Values.dagProcessor.enabled }}' 2054 | # statsd params included for Airflow 1.10 backward compatibility; moved to [metrics] in 2.0 2055 | statsd_on: '{{ ternary "True" "False" .Values.statsd.enabled }}' 2056 | statsd_port: 9125 2057 | statsd_prefix: airflow 2058 | statsd_host: '{{ printf "%s-statsd" .Release.Name }}' 2059 | # `run_duration` included for Airflow 1.10 backward compatibility; removed in 2.0. 2060 | run_duration: 41460 2061 | elasticsearch: 2062 | json_format: 'True' 2063 | log_id_template: "{dag_id}_{task_id}_{execution_date}_{try_number}" 2064 | elasticsearch_configs: 2065 | max_retries: 3 2066 | timeout: 30 2067 | retry_timeout: 'True' 2068 | kerberos: 2069 | keytab: '{{ .Values.kerberos.keytabPath }}' 2070 | reinit_frequency: '{{ .Values.kerberos.reinitFrequency }}' 2071 | principal: '{{ .Values.kerberos.principal }}' 2072 | ccache: '{{ .Values.kerberos.ccacheMountPath }}/{{ .Values.kerberos.ccacheFileName }}' 2073 | celery_kubernetes_executor: 2074 | kubernetes_queue: 'kubernetes' 2075 | # The `kubernetes` section is deprecated in Airflow >= 2.5.0 due to an airflow.cfg schema change. 2076 | # The `kubernetes` section can be removed once the helm chart no longer supports Airflow < 2.5.0. 2077 | kubernetes: 2078 | namespace: '{{ .Release.Namespace }}' 2079 | # The following `airflow_` entries are for Airflow 1, and can be removed when it is no longer supported. 2080 | airflow_configmap: '{{ include "airflow_config" . }}' 2081 | airflow_local_settings_configmap: '{{ include "airflow_config" . }}' 2082 | pod_template_file: '{{ include "airflow_pod_template_file" . 
}}/pod_template_file.yaml' 2083 | worker_container_repository: '{{ .Values.images.airflow.repository | default .Values.defaultAirflowRepository }}' 2084 | worker_container_tag: '{{ .Values.images.airflow.tag | default .Values.defaultAirflowTag }}' 2085 | multi_namespace_mode: '{{ ternary "True" "False" .Values.multiNamespaceMode }}' 2086 | # The `kubernetes_executor` section duplicates the `kubernetes` section in Airflow >= 2.5.0 due to an airflow.cfg schema change. 2087 | kubernetes_executor: 2088 | namespace: '{{ .Release.Namespace }}' 2089 | pod_template_file: '{{ include "airflow_pod_template_file" . }}/pod_template_file.yaml' 2090 | #pod_template_file: /home/joeywhelan/Dev/docai-pipeline/airflow/pod.yaml 2091 | worker_container_repository: '{{ .Values.images.airflow.repository | default .Values.defaultAirflowRepository }}' 2092 | worker_container_tag: '{{ .Values.images.airflow.tag | default .Values.defaultAirflowTag }}' 2093 | multi_namespace_mode: '{{ ternary "True" "False" .Values.multiNamespaceMode }}' 2094 | # yamllint enable rule:line-length 2095 | 2096 | # Whether Airflow can launch workers and/or pods in multiple namespaces 2097 | # If true, it creates ClusterRole/ClusterRolebinding (with access to entire cluster) 2098 | multiNamespaceMode: false 2099 | 2100 | # `podTemplate` is a templated string containing the contents of `pod_template_file.yaml` used for 2101 | # KubernetesExecutor workers. The default `podTemplate` will use normal `workers` configuration parameters 2102 | # (e.g. `workers.resources`). As such, you normally won't need to override this directly; however, 2103 | # you can still provide a completely custom `pod_template_file.yaml` if desired. 2104 | # If not set, a default one is created using `files/pod-template-file.kubernetes-helm-yaml`. 2105 | #kubernetesPodTemplate: 2106 | # extraVolumeMounts: 2107 | # - name: worker-invoices 2108 | # mountPath: /opt/airflow/invoices 2109 | # readOnly: false 2110 | 2111 | # extraVolumes: 2112 | # - name: worker-invoices 2113 | # persistentVolumeClaim: 2114 | # claimName: invoice-claim 2115 | 2116 | 2117 | 2118 | #podTemplate: /home/joeywhelan/Dev/docai-pipeline/airflow/pod_template.yaml 2119 | podTemplate: ~ 2120 | # The following example is NOT functional, but meant to be illustrative of how you can provide a custom 2121 | # `pod_template_file`. You're better off starting with the default in 2122 | # `files/pod-template-file.kubernetes-helm-yaml` and modifying from there. 2123 | # We will set `priorityClassName` in this example: 2124 | # podTemplate: | 2125 | # apiVersion: v1 2126 | # kind: Pod 2127 | # metadata: 2128 | # name: placeholder-name 2129 | # labels: 2130 | # tier: airflow 2131 | # component: worker 2132 | # release: {{ .Release.Name }} 2133 | # spec: 2134 | # priorityClassName: high-priority 2135 | # containers: 2136 | # - name: base 2137 | # ...
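# Tying the commented-out snippets above together, a hedged sketch of a
# complete custom pod template for this repo's use case (mounting the invoice
# PVC into KubernetesExecutor workers). The claim name and mount path come
# from the commented examples above; everything else is illustrative:
# podTemplate: |
#   apiVersion: v1
#   kind: Pod
#   metadata:
#     name: placeholder-name
#     labels:
#       tier: airflow
#       component: worker
#       release: {{ .Release.Name }}
#   spec:
#     containers:
#       - name: base
#         volumeMounts:
#           - name: worker-invoices
#             mountPath: /opt/airflow/invoices
#     volumes:
#       - name: worker-invoices
#         persistentVolumeClaim:
#           claimName: invoice-claim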
2138 | 2139 | # Git sync 2140 | dags: 2141 | persistence: 2142 | # Annotations for dags PVC 2143 | annotations: {} 2144 | # Enable persistent volume for storing dags 2145 | enabled: true 2146 | # Volume size for dags 2147 | size: 1Gi 2148 | # If using a custom storageClass, pass name here 2149 | storageClassName: 2150 | # access mode of the persistent volume 2151 | accessMode: ReadWriteOnce 2152 | ## the name of an existing PVC to use 2153 | existingClaim: dag-claim 2154 | ## optional subpath for dag volume mount 2155 | subPath: ~ 2156 | gitSync: 2157 | enabled: false 2158 | 2159 | # git repo clone url 2160 | # ssh example: git@github.com:apache/airflow.git 2161 | # https example: https://github.com/apache/airflow.git 2162 | repo: https://github.com/apache/airflow.git 2163 | branch: v2-2-stable 2164 | rev: HEAD 2165 | depth: 1 2166 | # the number of consecutive failures allowed before aborting 2167 | maxFailures: 0 2168 | # subpath within the repo where dags are located 2169 | # should be "" if dags are at repo root 2170 | subPath: "tests/dags" 2171 | # if your repo needs a username and password 2172 | # you can load them to a k8s secret like the one below 2173 | # --- 2174 | # apiVersion: v1 2175 | # kind: Secret 2176 | # metadata: 2177 | # name: git-credentials 2178 | # data: 2179 | # GIT_SYNC_USERNAME: <base64-encoded git username> 2180 | # GIT_SYNC_PASSWORD: <base64-encoded git password> 2181 | # and specify the name of the secret below 2182 | # 2183 | # credentialsSecret: git-credentials 2184 | # 2185 | # 2186 | # If you are using an ssh clone url, you can load 2187 | # the ssh private key to a k8s secret like the one below 2188 | # --- 2189 | # apiVersion: v1 2190 | # kind: Secret 2191 | # metadata: 2192 | # name: airflow-ssh-secret 2193 | # data: 2194 | # # key needs to be gitSshKey 2195 | # gitSshKey: <base64-encoded ssh private key> 2196 | # and specify the name of the secret below 2197 | # sshKeySecret: airflow-ssh-secret 2198 | # 2199 | # If you are using an ssh private key, you can additionally 2200 | # specify the content of your known_hosts file, example: 2201 | # 2202 | # knownHosts: | 2203 | # <host1>,<host2> <key1> 2204 | # <host1>,<host2> <key2> 2205 | 2206 | # interval between git sync attempts in seconds 2207 | # high values are more likely to cause DAGs to become out of sync between different components 2208 | # low values cause more traffic to the remote git repository 2209 | wait: 5 2210 | containerName: git-sync 2211 | uid: 65533 2212 | 2213 | # When not set, the values defined in the global securityContext will be used 2214 | securityContext: {} 2215 | # runAsUser: 65533 2216 | # runAsGroup: 0 2217 | 2218 | securityContexts: 2219 | container: {} 2220 |
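# A hedged sketch of switching DAG delivery from the `dag-claim` PVC above to
# git-sync (the repo URL, branch, and subPath are assumptions about this
# project, not values it ships; dags.persistence.enabled would typically be
# set to false in that case):
# gitSync:
#   enabled: true
#   repo: https://github.com/redis-developer/docai_pipeline.git
#   branch: main
#   subPath: "dags"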
2221 | # Mount additional volumes into git-sync. It can be templated like in the following example: 2222 | # extraVolumeMounts: 2223 | # - name: my-templated-extra-volume 2224 | # mountPath: "{{ .Values.my_custom_path }}" 2225 | # readOnly: true 2226 | extraVolumeMounts: [] 2227 | env: [] 2228 | # Supported env vars for gitsync can be found at https://github.com/kubernetes/git-sync 2229 | # - name: "" 2230 | # value: "" 2231 | 2232 | resources: {} 2233 | # limits: 2234 | # cpu: 100m 2235 | # memory: 128Mi 2236 | # requests: 2237 | # cpu: 100m 2238 | # memory: 128Mi 2239 | 2240 | logs: 2241 | persistence: 2242 | # Enable persistent volume for storing logs 2243 | enabled: true 2244 | # Volume size for logs 2245 | size: 4Gi 2246 | # Annotations for the logs PVC 2247 | annotations: {} 2248 | # If using a custom storageClass, pass name here 2249 | storageClassName: 2250 | ## the name of an existing PVC to use 2251 | existingClaim: log-claim
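# For reference, the `existingClaim` values above (dag-claim, log-claim)
# expect PVCs that exist before the chart is installed (in this repo they are
# presumably declared in airflow/airflow_volumes.yaml). A hedged sketch of
# what such a claim looks like (size and storage class are assumptions):
# apiVersion: v1
# kind: PersistentVolumeClaim
# metadata:
#   name: log-claim
# spec:
#   accessModes:
#     - ReadWriteOnce
#   resources:
#     requests:
#       storage: 4Gi
--------------------------------------------------------------------------------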