├── README.md
├── airflow-k8sexecutor
│   ├── Dockerfile
│   ├── Makefile
│   ├── dags
│   │   ├── example
│   │   │   └── example.py
│   │   └── generated
│   │       └── example-generated.py
│   └── k8s
│       ├── airflow-db-svc.yaml
│       ├── airflow-db.yaml
│       ├── airflow-svc-external.yaml
│       ├── airflow-svc-internal.yaml
│       ├── airflow.yaml
│       ├── namespace.yaml
│       └── serviceaccount.yaml
├── airflow-k8spodoperator
│   ├── Dockerfile
│   ├── Makefile
│   ├── dags
│   │   ├── complete.py
│   │   ├── example
│   │   │   └── example.py
│   │   └── generated
│   │       └── generated.py
│   └── k8s
│       ├── airflow-db-svc.yaml
│       ├── airflow-db.yaml
│       ├── airflow-svc-external.yaml
│       ├── airflow-svc-internal.yaml
│       ├── airflow.yaml
│       ├── namespace.yaml
│       └── serviceaccount.yaml
└── airflow-keda
    ├── Makefile
    ├── dags-docker-image
    │   ├── .astro
    │   │   └── config.yaml
    │   ├── .dockerignore
    │   ├── .gitignore
    │   ├── Dockerfile
    │   ├── Makefile
    │   ├── dags
    │   │   ├── example-dag.py
    │   │   └── example-generated.py
    │   ├── packages.txt
    │   ├── plugins
    │   │   └── example-plugin.py
    │   └── requirements.txt
    ├── k8s
    │   └── namespace.yaml
    └── keda
        └── scaledobject.yaml

/README.md:
--------------------------------------------------------------------------------
1 | # Code examples for "Three ways to run Airflow on Kubernetes"
2 | 
3 | * airflow-k8spodoperator
4 | * airflow-k8sexecutor
5 | * airflow-keda
6 | 
--------------------------------------------------------------------------------
/airflow-k8sexecutor/Dockerfile:
--------------------------------------------------------------------------------
1 | # take Airflow base image
2 | FROM apache/airflow:1.10.11
3 | 
4 | # add dependencies for http basic auth
5 | RUN pip install --user --upgrade apache-airflow[password]==1.10.11 apache-airflow[gcp]
6 | 
7 | # add dags
8 | ADD dags /opt/airflow/dags
9 | 
--------------------------------------------------------------------------------
/airflow-k8sexecutor/Makefile:
--------------------------------------------------------------------------------
1 | # This makefile can be used to deploy infrastructure to a GKE cluster
2 | 
3 | REGISTRY=
4 | GCP_PROJECT=
5 | IMAGE=
6 | TAG=
7 | PROJECT=
8 | 
9 | build:
10 | 	docker build -t ${IMAGE}:${TAG} .
11 | docker tag ${IMAGE}:${TAG} ${REGISTRY}/${GCP_PROJECT}/${IMAGE}:${TAG} 12 | docker push ${REGISTRY}/${GCP_PROJECT}/${IMAGE}:${TAG} 13 | 14 | login: | configure_cluster 15 | 16 | set_project: 17 | gcloud config set project ${PROJECT} 18 | 19 | NAMESPACE=airflow-k8sexecutor 20 | configure_cluster: authenticate 21 | kubectl config set-context --current --namespace=${NAMESPACE} 22 | 23 | CLUSTER_NAME= 24 | authenticate: 25 | gcloud container clusters get-credentials ${CLUSTER_NAME} 26 | 27 | deploy: 28 | kubectl apply -f k8s/namespace.yaml 29 | kubectl apply -f k8s/serviceaccount.yaml 30 | kubectl apply -f k8s/role.yaml 31 | kubectl apply -f k8s/rolebinding.yaml 32 | kubectl apply -f k8s/airflow-db.yaml 33 | kubectl apply -f k8s/airflow-db-svc.yaml 34 | kubectl apply -f k8s/airflow.yaml 35 | kubectl apply -f k8s/airflow-svc-external.yaml 36 | 37 | redeploy: 38 | kubectl rollout restart deployment/airflow 39 | kubectl rollout status --timeout=3m deployment/airflow 40 | 41 | IP:=$(shell kubectl get svc airflow-svc -o=jsonpath='{.status.loadBalancer.ingress[0].ip}') 42 | get_url: 43 | echo http://${IP}/admin 44 | 45 | 46 | -------------------------------------------------------------------------------- /airflow-k8sexecutor/dags/example/example.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime, timedelta 4 | from pathlib import Path 5 | 6 | from airflow import DAG 7 | from airflow.operators.python_operator import PythonOperator 8 | 9 | log = logging.getLogger(__name__) 10 | 11 | dag = DAG( 12 | "example_using_k8s_executor", 13 | schedule_interval="0 1 * * *", 14 | catchup=False, 15 | default_args={ 16 | "owner": "airflow", 17 | "depends_on_past": False, 18 | "start_date": datetime(2020, 8, 7), 19 | "email_on_failure": False, 20 | "email_on_retry": False, 21 | "retries": 2, 22 | "retry_delay": timedelta(seconds=30), 23 | "sla": timedelta(hours=23), 24 | }, 25 | ) 26 | 27 | def use_airflow_binary(): 28 | rc = os.system("airflow -h") 29 | assert rc == 0 30 | 31 | with dag: 32 | task_1 = PythonOperator( 33 | task_id="task-1", 34 | python_callable=use_airflow_binary, 35 | ) 36 | task_2 = PythonOperator( 37 | task_id="task-2", 38 | python_callable=use_airflow_binary, 39 | ) 40 | 41 | task_1 >> task_2 42 | -------------------------------------------------------------------------------- /airflow-k8sexecutor/dags/generated/example-generated.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.dummy_operator import DummyOperator 3 | from airflow.operators.bash_operator import BashOperator 4 | from airflow.operators.python_operator import PythonOperator 5 | from datetime import datetime, timedelta 6 | 7 | 8 | def my_custom_function(ts,**kwargs): 9 | """ 10 | This can be any python code you want and is called from the python operator. The code is not executed until 11 | the task is run by the airflow scheduler. 12 | """ 13 | print(f"I am task number {kwargs['task_number']}. This DAG Run execution date is {ts} and the current time is {datetime.now()}") 14 | print('Here is the full DAG Run context. 
It is available because provide_context=True') 15 | print(kwargs) 16 | 17 | 18 | # Default settings applied to all tasks 19 | default_args = { 20 | 'owner': 'airflow', 21 | 'depends_on_past': False, 22 | 'email_on_failure': False, 23 | 'email_on_retry': False, 24 | 'retries': 1, 25 | 'retry_delay': timedelta(minutes=5) 26 | } 27 | 28 | # Using a DAG context manager, you don't have to specify the dag property of each task 29 | with DAG('example_dag_generated', 30 | start_date=datetime(2019, 1, 1), 31 | max_active_runs=3, 32 | schedule_interval=timedelta(minutes=30), # https://airflow.apache.org/docs/stable/scheduler.html#dag-runs 33 | default_args=default_args, 34 | # catchup=False # enable if you don't want historical dag runs to run 35 | ) as dag: 36 | 37 | t0 = DummyOperator( 38 | task_id='start' 39 | ) 40 | 41 | t1 = DummyOperator( 42 | task_id='group_bash_tasks' 43 | ) 44 | t2 = BashOperator( 45 | task_id='bash_print_date1', 46 | bash_command='sleep $[ ( $RANDOM % 30 ) + 1 ]s && date') 47 | t3 = BashOperator( 48 | task_id='bash_print_date2', 49 | bash_command='sleep $[ ( $RANDOM % 30 ) + 1 ]s && date') 50 | 51 | # generate tasks with a loop. task_id must be unique 52 | for task in range(0, 5): 53 | tn = PythonOperator( 54 | task_id=f'python_print_date_{task}', 55 | python_callable=my_custom_function, # make sure you don't include the () of the function 56 | op_kwargs={'task_number': task}, 57 | provide_context=True 58 | ) 59 | 60 | 61 | t0 >> tn # indented inside for loop so each task is added downstream of t0 62 | 63 | t0 >> t1 64 | t1 >> [t2, t3] # lists can be used to specify mutliple tasks 65 | -------------------------------------------------------------------------------- /airflow-k8sexecutor/k8s/airflow-db-svc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: airflow-db 5 | namespace: airflow-k8sexecutor 6 | spec: 7 | clusterIP: None 8 | ports: 9 | - port: 5432 10 | protocol: TCP 11 | targetPort: 5432 12 | selector: 13 | name: airflow-db 14 | sessionAffinity: None 15 | type: ClusterIP 16 | status: 17 | loadBalancer: {} 18 | -------------------------------------------------------------------------------- /airflow-k8sexecutor/k8s/airflow-db.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: airflow-db 5 | namespace: airflow-k8sexecutor 6 | spec: 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | name: airflow-db 11 | template: 12 | metadata: 13 | labels: 14 | name: airflow-db 15 | spec: 16 | containers: 17 | - env: 18 | - name: POSTGRES_PASSWORD 19 | value: password 20 | image: postgres:9.6 21 | imagePullPolicy: IfNotPresent 22 | resources: 23 | limits: 24 | memory: "1Gi" 25 | livenessProbe: 26 | exec: 27 | command: 28 | - psql 29 | - -w 30 | - -U 31 | - postgres 32 | - -d 33 | - postgres 34 | - -c 35 | - SELECT 1 36 | failureThreshold: 3 37 | initialDelaySeconds: 45 38 | periodSeconds: 2 39 | successThreshold: 1 40 | timeoutSeconds: 1 41 | name: airflow-db 42 | readinessProbe: 43 | exec: 44 | command: 45 | - psql 46 | - -w 47 | - -U 48 | - postgres 49 | - -d 50 | - postgres 51 | - -c 52 | - SELECT 1 53 | failureThreshold: 3 54 | initialDelaySeconds: 15 55 | periodSeconds: 3 56 | successThreshold: 1 57 | timeoutSeconds: 1 58 | terminationMessagePath: /dev/termination-log 59 | terminationMessagePolicy: File 60 | volumeMounts: 61 | - mountPath: /var/lib/postgresql/data 62 | 
mountPropagation: None
63 |           name: postgresql-data
64 |       dnsPolicy: ClusterFirst
65 |       restartPolicy: Always
66 |       schedulerName: default-scheduler
67 |       terminationGracePeriodSeconds: 30
68 |       volumes:
69 |       - emptyDir: {}
70 |         name: postgresql-data
71 | 
--------------------------------------------------------------------------------
/airflow-k8sexecutor/k8s/airflow-svc-external.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 |   name: airflow-svc
5 |   namespace: airflow-k8sexecutor
6 | spec:
7 |   externalTrafficPolicy: Cluster
8 |   ports:
9 |   - nodePort: 31725
10 |     port: 80
11 |     protocol: TCP
12 |     targetPort: 8000
13 |   selector:
14 |     name: airflow
15 |   sessionAffinity: None
16 |   type: LoadBalancer
17 | 
--------------------------------------------------------------------------------
/airflow-k8sexecutor/k8s/airflow-svc-internal.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 |   name: airflow
5 |   namespace: airflow-k8sexecutor
6 | spec:
7 |   ports:
8 |   - port: 80
9 |     protocol: TCP
10 |     targetPort: 8000
11 |   selector:
12 |     name: airflow
13 |   sessionAffinity: None
14 |   type: ClusterIP
15 | status:
16 |   loadBalancer: {}
17 | 
--------------------------------------------------------------------------------
/airflow-k8sexecutor/k8s/airflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 |   name: airflow
5 |   namespace: airflow-k8sexecutor
6 | spec:
7 |   replicas: 1
8 |   selector:
9 |     matchLabels:
10 |       name: airflow
11 |   template:
12 |     metadata:
13 |       labels:
14 |         name: airflow
15 |     spec:
16 |       automountServiceAccountToken: true
17 |       containers:
18 |       - args:
19 |         - webserver
20 |         - -p
21 |         - "8000"
22 |         env:
23 |         - name: AIRFLOW__CORE__SQL_ALCHEMY_CONN
24 |           value: postgresql://postgres:password@airflow-db:5432/postgres
25 |         - name: AIRFLOW__CORE__EXECUTOR
26 |           value: KubernetesExecutor
27 |         - name: AIRFLOW__WEBSERVER__AUTHENTICATE
28 |           value: "True"
29 |         - name: AIRFLOW__WEBSERVER__LOG_FETCH_TIMEOUT_SEC
30 |           value: "15"
31 |         - name: AIRFLOW__WEBSERVER__AUTH_BACKEND
32 |           value: airflow.contrib.auth.backends.password_auth
33 |         - name: AIRFLOW__API__AUTH_BACKEND
34 |           value: airflow.contrib.auth.backends.password_auth
35 |         - name: AIRFLOW__KUBERNETES__NAMESPACE
36 |           value: airflow-k8sexecutor
37 |         - name: AIRFLOW__KUBERNETES__WORKER_SERVICE_ACCOUNT_NAME
38 |           value: default
39 |         - name: AIRFLOW__KUBERNETES__IN_CLUSTER
40 |           value: 'true'
41 |         - name: AIRFLOW__KUBERNETES__DAGS_IN_IMAGE
42 |           value: 'true'
43 |         image: # Use your airflow docker image here, either self-built with dags or apache/airflow:1.10.11
44 |         imagePullPolicy: Always
45 |         livenessProbe:
46 |           failureThreshold: 30
47 |           httpGet:
48 |             path: /admin/
49 |             port: 8000
50 |             scheme: HTTP
51 |           periodSeconds: 10
52 |           successThreshold: 1
53 |           timeoutSeconds: 3
54 |         name: airflow
55 |         readinessProbe:
56 |           failureThreshold: 3
57 |           httpGet:
58 |             path: /admin/
59 |             port: 8000
60 |             scheme: HTTP
61 |           periodSeconds: 10
62 |           successThreshold: 1
63 |           timeoutSeconds: 1
64 |         resources: {}
65 |         volumeMounts:
66 |         - mountPath: /opt/airflow/logs/
67 |           mountPropagation: None
68 |           name: airflow-logs
69 |       - args:
70 |         - scheduler
71 |         env:
72 |         - name: AIRFLOW__KUBERNETES_ENVIRONMENT_VARIABLES__AIRFLOW__WEBSERVER__LOG_FETCH_TIMEOUT_SEC
73 |           value: "15"
74 |         - name: AIRFLOW__CORE__SQL_ALCHEMY_CONN
75 |           value: postgresql://postgres:password@airflow-db:5432/postgres
76 |         - name: AIRFLOW__CORE__EXECUTOR
77 |           value: KubernetesExecutor
78 |         - name: AIRFLOW__KUBERNETES__NAMESPACE
79 |           value: airflow-k8sexecutor
80 |         - name: AIRFLOW__KUBERNETES__WORKER_SERVICE_ACCOUNT_NAME
81 |           value: default
82 |         - name: AIRFLOW__KUBERNETES__IN_CLUSTER
83 |           value: 'true'
84 |         - name: AIRFLOW__KUBERNETES__WORKER_CONTAINER_REPOSITORY
85 |           value: # Supply your container worker image here, either self-built with dags or apache/airflow
86 |         - name: AIRFLOW__KUBERNETES__WORKER_CONTAINER_TAG
87 |           value: # Your container worker image tag or 1.10.11 if using apache/airflow
88 |         - name: AIRFLOW__KUBERNETES__DAGS_IN_IMAGE
89 |           value: 'true'
90 |         - name: AIRFLOW__KUBERNETES__RUN_AS_USER
91 |           value: '50000'
92 |         image: # Use your airflow docker image here, either self-built with dags or apache/airflow:1.10.11
93 |         imagePullPolicy: Always
94 |         name: airflow-scheduler
95 |         terminationMessagePath: /dev/termination-log
96 |         terminationMessagePolicy: File
97 |         volumeMounts:
98 |         - mountPath: /opt/airflow/logs/
99 |           mountPropagation: None
100 |           name: airflow-logs
101 |       dnsPolicy: ClusterFirst
102 |       initContainers:
103 |       - args:
104 |         - initdb
105 |         env:
106 |         - name: AIRFLOW__CORE__REMOTE_LOGGING
107 |           value: 'true'
108 |         - name: AIRFLOW__CORE__REMOTE_BASE_LOG_FOLDER
109 |           value: 'gs://airflow-k8sexecutor-logs'
110 |         - name: AIRFLOW__CORE__REMOTE_LOG_CONN_ID
111 |           value: 'gcs-logs'
112 |         - name: AIRFLOW__CORE__SQL_ALCHEMY_CONN
113 |           value: postgresql://postgres:password@airflow-db:5432/postgres
114 |         - name: AIRFLOW__CORE__EXECUTOR
115 |           value: KubernetesExecutor
116 |         - name: AIRFLOW__KUBERNETES__NAMESPACE
117 |           value: airflow-k8sexecutor
118 |         - name: AIRFLOW__KUBERNETES__WORKER_SERVICE_ACCOUNT_NAME
119 |           value: default
120 |         - name: AIRFLOW__KUBERNETES__IN_CLUSTER
121 |           value: 'true'
122 |         - name: AIRFLOW__KUBERNETES__WORKER_CONTAINER_REPOSITORY
123 |           value: # Supply your container worker image here, either self-built with dags or apache/airflow
124 |         - name: AIRFLOW__KUBERNETES__WORKER_CONTAINER_TAG
125 |           value: # Your container worker image tag or 1.10.11 if using apache/airflow
126 |         - name: AIRFLOW__KUBERNETES__DAGS_IN_IMAGE
127 |           value: 'true'
128 |         image: # Use your airflow docker image here, either self-built with dags or apache/airflow:1.10.11
129 |         imagePullPolicy: Always
130 |       restartPolicy: Always
131 |       schedulerName: default-scheduler
132 |       securityContext: {}
133 |       shareProcessNamespace: false
134 |       terminationGracePeriodSeconds: 600
135 |       volumes:
136 |       - emptyDir: {}
137 |         name: airflow-data
138 |       - emptyDir: {}
139 |         name: airflow-logs
140 | 
--------------------------------------------------------------------------------
/airflow-k8sexecutor/k8s/namespace.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace
3 | metadata:
4 |   name: airflow-k8sexecutor
5 | 
--------------------------------------------------------------------------------
/airflow-k8sexecutor/k8s/serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 |   name: airflow-k8sexecutor
5 |   namespace: airflow-k8sexecutor
6 | 
--------------------------------------------------------------------------------
/airflow-k8spodoperator/Dockerfile:
--------------------------------------------------------------------------------
1 | # take Airflow base image
2 | FROM apache/airflow:1.10.12
3 | 
4 | # add dependencies for http basic auth
5 | RUN pip install --user --upgrade apache-airflow[password]==1.10.12
6 | 
7 | # add dags
8 | ADD dags /opt/airflow/dags
9 | 
--------------------------------------------------------------------------------
/airflow-k8spodoperator/Makefile:
--------------------------------------------------------------------------------
1 | # This makefile can be used to deploy infrastructure to a GKE cluster
2 | 
3 | REGISTRY=
4 | GCP_PROJECT=
5 | IMAGE=
6 | TAG=
7 | PROJECT=
8 | 
9 | build:
10 | 	docker build -t ${IMAGE}:${TAG} .
11 | 	docker tag ${IMAGE}:${TAG} ${REGISTRY}/${GCP_PROJECT}/${IMAGE}:${TAG}
12 | 	docker push ${REGISTRY}/${GCP_PROJECT}/${IMAGE}:${TAG}
13 | 
14 | login: | configure_cluster
15 | 
16 | set_project:
17 | 	gcloud config set project ${PROJECT}
18 | 
19 | NAMESPACE=airflow-k8spodoperator
20 | configure_cluster: authenticate
21 | 	kubectl config set-context --current --namespace=${NAMESPACE}
22 | 
23 | CLUSTER_NAME=
24 | authenticate:
25 | 	gcloud container clusters get-credentials ${CLUSTER_NAME}
26 | 
27 | deploy:
28 | 	kubectl apply -f k8s/namespace.yaml
29 | 	kubectl apply -f k8s/serviceaccount.yaml
30 | 	kubectl apply -f k8s/role.yaml
31 | 	kubectl apply -f k8s/rolebinding.yaml
32 | 	kubectl apply -f k8s/airflow-db.yaml
33 | 	kubectl apply -f k8s/airflow-db-svc.yaml
34 | 	kubectl apply -f k8s/airflow.yaml
35 | 	kubectl apply -f k8s/airflow-svc-external.yaml
36 | 
37 | redeploy:
38 | 	kubectl rollout restart deployment/airflow
39 | 	kubectl rollout status --timeout=3m deployment/airflow
40 | 
41 | IP:=$(shell kubectl get svc airflow-svc -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
42 | get_url:
43 | 	@echo http://${IP}/admin
44 | 
--------------------------------------------------------------------------------
/airflow-k8spodoperator/dags/complete.py:
--------------------------------------------------------------------------------
1 | from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator
2 | from airflow.kubernetes.secret import Secret
3 | from airflow.kubernetes.volume import Volume
4 | from airflow.kubernetes.volume_mount import VolumeMount
5 | from airflow.kubernetes.pod import Port
6 | 
7 | 
8 | secret_file = Secret('volume', '/etc/sql_conn', 'airflow-secrets', 'sql_alchemy_conn')
9 | secret_env = Secret('env', 'SQL_CONN', 'airflow-secrets', 'sql_alchemy_conn')
10 | secret_all_keys = Secret('env', None, 'airflow-secrets-2')
11 | volume_mount = VolumeMount('test-volume',
12 |                            mount_path='/root/mount_file',
13 |                            sub_path=None,
14 |                            read_only=True)
15 | port = Port('http', 80)
16 | configmaps = ['test-configmap-1', 'test-configmap-2']
17 | 
18 | volume_config= {
19 |     'persistentVolumeClaim':
20 |     {
21 |         'claimName': 'test-volume'
22 |     }
23 | }
24 | volume = Volume(name='test-volume', configs=volume_config)
25 | 
26 | affinity = {
27 |     'nodeAffinity': {
28 |         'preferredDuringSchedulingIgnoredDuringExecution': [
29 |             {
30 |                 "weight": 1,
31 |                 "preference": {
32 |                     "matchExpressions": [{
33 |                         "key": "disktype",
34 |                         "operator": "In",
35 |                         "values": ["ssd"]
36 |                     }]
37 |                 }
38 |             }
39 |         ]
40 |     },
41 |     "podAffinity": {
42 |         "requiredDuringSchedulingIgnoredDuringExecution": [
43 |             {
44 |                 "labelSelector": {
45 |                     "matchExpressions": [
46 |                         {
47 |                             "key": "security",
48 |                             "operator": "In",
49 |                             "values": ["S1"]
50 |                         }
51 |                     ]
52 |                 },
53 |                 "topologyKey": "failure-domain.beta.kubernetes.io/zone"
54 |             }
55 |         ]
56 |     },
57 |     "podAntiAffinity": {
58 |         "requiredDuringSchedulingIgnoredDuringExecution": [
59 |             {
60 |                 "labelSelector": {
61 |                     "matchExpressions": [
62 |                         {
63 |                             "key": "security",
64 |                             "operator": "In",
65 |                             "values": ["S2"]
66 |                         }
67 |                     ]
68 |                 },
69 |                 "topologyKey": "kubernetes.io/hostname"
70 |             }
71 |         ]
72 |     }
73 | }
74 | 
75 | tolerations = [
76 |     {
77 |         'key': "key",
78 |         'operator': 'Equal',
79 |         'value': 'value'
80 |     }
81 | ]
82 | 
83 | k = KubernetesPodOperator(namespace='default',
84 |                           image="ubuntu:16.04",
85 |                           cmds=["bash", "-cx"],
86 |                           arguments=["echo", "10"],
87 |                           labels={"foo": "bar"},
88 |                           secrets=[secret_file, secret_env, secret_all_keys],
89 |                           ports=[port],
90 |                           volumes=[volume],
91 |                           volume_mounts=[volume_mount],
92 |                           name="test",
93 |                           task_id="task",
94 |                           affinity=affinity,
95 |                           is_delete_operator_pod=True,
96 |                           hostnetwork=False,
97 |                           tolerations=tolerations,
98 |                           configmaps=configmaps
99 |                           )
100 | 
--------------------------------------------------------------------------------
/airflow-k8spodoperator/dags/example/example.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from datetime import datetime, timedelta
3 | from pathlib import Path
4 | 
5 | from airflow import DAG
6 | from airflow.operators.dummy_operator import DummyOperator
7 | from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator
8 | 
9 | log = logging.getLogger(__name__)
10 | 
11 | dag = DAG(
12 |     "example_using_k8s_pod_operator",
13 |     schedule_interval="0 1 * * *",
14 |     catchup=False,
15 |     default_args={
16 |         "owner": "admin",
17 |         "depends_on_past": False,
18 |         "start_date": datetime(2020, 8, 7),
19 |         "email_on_failure": False,
20 |         "email_on_retry": False,
21 |         "retries": 2,
22 |         "retry_delay": timedelta(seconds=30),
23 |         "sla": timedelta(hours=23),
24 |     },
25 | )
26 | 
27 | with dag:
28 |     task_1 = KubernetesPodOperator(
29 |         image="ubuntu:16.04",
30 |         namespace="airflow-k8spodoperator",
31 |         cmds=["bash", "-cx"],
32 |         arguments=["echo", "10"],
33 |         labels={"foo": "bar"},
34 |         name="test-using-k8spodoperator-task-1",
35 |         task_id="task-1-echo",
36 |         is_delete_operator_pod=False,
37 |         in_cluster=True,
38 |     )
39 |     task_2 = KubernetesPodOperator(
40 |         image="ubuntu:16.04",
41 |         namespace="airflow-k8spodoperator",
42 |         cmds=["sleep"],
43 |         arguments=["300"],
44 |         labels={"foo": "bar"},
45 |         name="test-using-k8spodoperator-task-2",
46 |         task_id="task-2-sleep",
47 |         is_delete_operator_pod=False,
48 |         in_cluster=True,
49 |     )
50 | 
51 |     task_1 >> task_2
52 | 
--------------------------------------------------------------------------------
/airflow-k8spodoperator/dags/generated/generated.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from datetime import datetime, timedelta
3 | from pathlib import Path
4 | 
5 | from airflow import DAG
6 | from airflow.operators.dummy_operator import DummyOperator
7 | from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator
8 | 
9 | log = logging.getLogger(__name__)
10 | 
11 | dag = DAG(
12 |     "generated_dag",
13 |     schedule_interval="0 1 * * *",
14 |     catchup=False,
15 |     default_args={
16 |         "owner": "admin",
17 |         "depends_on_past": False,
18 |         "start_date": datetime(2020, 8, 7),
19 |         "email_on_failure": False,
20 |         "email_on_retry": False,
21 |         "retries": 2,
22 |         "retry_delay": timedelta(seconds=30),
23 |         "sla": timedelta(hours=23),
24 |     },
25 | )
26 | 
27 | with dag:
28 | 
29 |     task = KubernetesPodOperator(
30 |         image="ubuntu:16.04",
31 |         namespace="airflow-k8spodoperator",
32 |         cmds=["sleep"],
33 |         arguments=["300"],
34 |         labels={"foo": "bar"},
35 | 
name="downstream-task", 36 | task_id="task-2-sleep", 37 | is_delete_operator_pod=False, 38 | in_cluster=True, 39 | ) 40 | 41 | for i in range(0, 10): 42 | task_name = f"generated-task-{i}" 43 | labels = {"task_id": i} 44 | task_id = f"task-{i}-echo" 45 | generated_task = KubernetesPodOperator( 46 | image="ubuntu:16.04", 47 | namespace="airflow-k8spodoperator", 48 | cmds=["bash", "-cx"], 49 | arguments=["echo", "I am generated"], 50 | labels={"foo": "bar"}, 51 | name=task_name, 52 | task_id=task_id, 53 | is_delete_operator_pod=False, 54 | in_cluster=True, 55 | ) 56 | 57 | generated_task >> task 58 | -------------------------------------------------------------------------------- /airflow-k8spodoperator/k8s/airflow-db-svc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: airflow-db 5 | namespace: airflow-k8spodoperator 6 | spec: 7 | clusterIP: None 8 | ports: 9 | - port: 5432 10 | protocol: TCP 11 | targetPort: 5432 12 | selector: 13 | name: airflow-db 14 | sessionAffinity: None 15 | type: ClusterIP 16 | status: 17 | loadBalancer: {} 18 | -------------------------------------------------------------------------------- /airflow-k8spodoperator/k8s/airflow-db.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: airflow-db 5 | namespace: airflow-k8spodoperator 6 | spec: 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | name: airflow-db 11 | template: 12 | metadata: 13 | labels: 14 | name: airflow-db 15 | spec: 16 | containers: 17 | - env: 18 | - name: POSTGRES_PASSWORD 19 | value: password 20 | image: postgres:9.6 21 | imagePullPolicy: IfNotPresent 22 | livenessProbe: 23 | exec: 24 | command: 25 | - psql 26 | - -w 27 | - -U 28 | - postgres 29 | - -d 30 | - postgres 31 | - -c 32 | - SELECT 1 33 | failureThreshold: 3 34 | initialDelaySeconds: 45 35 | periodSeconds: 2 36 | successThreshold: 1 37 | timeoutSeconds: 1 38 | name: airflow-db 39 | readinessProbe: 40 | exec: 41 | command: 42 | - psql 43 | - -w 44 | - -U 45 | - postgres 46 | - -d 47 | - postgres 48 | - -c 49 | - SELECT 1 50 | failureThreshold: 3 51 | initialDelaySeconds: 15 52 | periodSeconds: 3 53 | successThreshold: 1 54 | timeoutSeconds: 1 55 | terminationMessagePath: /dev/termination-log 56 | terminationMessagePolicy: File 57 | volumeMounts: 58 | - mountPath: /var/lib/postgresql/data 59 | mountPropagation: None 60 | name: postgresql-data 61 | dnsPolicy: ClusterFirst 62 | restartPolicy: Always 63 | schedulerName: default-scheduler 64 | terminationGracePeriodSeconds: 30 65 | volumes: 66 | - emptyDir: {} 67 | name: postgresql-data 68 | -------------------------------------------------------------------------------- /airflow-k8spodoperator/k8s/airflow-svc-external.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: airflow-svc 5 | namespace: airflow-k8spodoperator 6 | spec: 7 | externalTrafficPolicy: Cluster 8 | ports: 9 | - nodePort: 31724 10 | port: 80 11 | protocol: TCP 12 | targetPort: 8000 13 | selector: 14 | name: airflow 15 | sessionAffinity: None 16 | type: LoadBalancer 17 | -------------------------------------------------------------------------------- /airflow-k8spodoperator/k8s/airflow-svc-internal.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 
4 |   name: airflow
5 |   namespace: airflow-k8spodoperator
6 | spec:
7 |   ports:
8 |   - port: 80
9 |     protocol: TCP
10 |     targetPort: 8000
11 |   selector:
12 |     name: airflow
13 |   sessionAffinity: None
14 |   type: ClusterIP
15 | status:
16 |   loadBalancer: {}
17 | 
--------------------------------------------------------------------------------
/airflow-k8spodoperator/k8s/airflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 |   name: airflow
5 |   namespace: airflow-k8spodoperator
6 | spec:
7 |   replicas: 1
8 |   selector:
9 |     matchLabels:
10 |       name: airflow
11 |   template:
12 |     metadata:
13 |       labels:
14 |         name: airflow
15 |     spec:
16 |       automountServiceAccountToken: true
17 |       containers:
18 |       - args:
19 |         - webserver
20 |         - -p
21 |         - "8000"
22 |         env:
23 |         - name: AIRFLOW__CORE__SQL_ALCHEMY_CONN
24 |           value: postgresql://postgres:password@airflow-db:5432/postgres
25 |         - name: AIRFLOW__CORE__EXECUTOR
26 |           value: LocalExecutor
27 |         - name: AIRFLOW__WEBSERVER__AUTHENTICATE
28 |           value: "True"
29 |         - name: AIRFLOW__WEBSERVER__AUTH_BACKEND
30 |           value: airflow.contrib.auth.backends.password_auth
31 |         - name: AIRFLOW__API__AUTH_BACKEND
32 |           value: airflow.contrib.auth.backends.password_auth
33 |         image: # Use your airflow docker image here, either self-built with dags or apache/airflow:1.10.12
34 |         imagePullPolicy: Always
35 |         livenessProbe:
36 |           failureThreshold: 20
37 |           httpGet:
38 |             path: /admin/
39 |             port: 8000
40 |             scheme: HTTP
41 |           periodSeconds: 10
42 |           successThreshold: 1
43 |           timeoutSeconds: 3
44 |         name: airflow
45 |         readinessProbe:
46 |           failureThreshold: 3
47 |           httpGet:
48 |             path: /admin/
49 |             port: 8000
50 |             scheme: HTTP
51 |           periodSeconds: 10
52 |           successThreshold: 1
53 |           timeoutSeconds: 1
54 |         resources: {}
55 |         volumeMounts:
56 |         - mountPath: /opt/airflow/logs/
57 |           mountPropagation: None
58 |           name: airflow-logs
59 |       - args:
60 |         - scheduler
61 |         env:
62 |         - name: AIRFLOW__CORE__SQL_ALCHEMY_CONN
63 |           value: postgresql://postgres:password@airflow-db:5432/postgres
64 |         - name: AIRFLOW__CORE__EXECUTOR
65 |           value: LocalExecutor
66 |         image: # Use your airflow docker image here, either self-built with dags or apache/airflow:1.10.12
67 |         imagePullPolicy: Always
68 |         name: airflow-scheduler
69 |         terminationMessagePath: /dev/termination-log
70 |         terminationMessagePolicy: File
71 |         volumeMounts:
72 |         - mountPath: /opt/airflow/logs/
73 |           mountPropagation: None
74 |           name: airflow-logs
75 |       dnsPolicy: ClusterFirst
76 |       initContainers:
77 |       - args:
78 |         - initdb
79 |         env:
80 |         - name: AIRFLOW__CORE__SQL_ALCHEMY_CONN
81 |           value: postgresql://postgres:password@airflow-db:5432/postgres
82 |         - name: AIRFLOW__CORE__EXECUTOR
83 |           value: LocalExecutor
84 |         image: # Use your airflow docker image here, either self-built with dags or apache/airflow:1.10.12
85 |         imagePullPolicy: Always
86 |         name: airflow-initdb
87 |       restartPolicy: Always
88 |       schedulerName: default-scheduler
89 |       securityContext: {}
90 |       shareProcessNamespace: false
91 |       terminationGracePeriodSeconds: 600
92 |       volumes:
93 |       - emptyDir: {}
94 |         name: airflow-data
95 |       - emptyDir: {}
96 |         name: airflow-logs
97 | 
--------------------------------------------------------------------------------
/airflow-k8spodoperator/k8s/namespace.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace
3 | metadata:
4 |   name: airflow-k8spodoperator
5 | 
-------------------------------------------------------------------------------- /airflow-k8spodoperator/k8s/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: airflow-k8spodoperator 5 | namespace: airflow-k8spodoperator 6 | -------------------------------------------------------------------------------- /airflow-keda/Makefile: -------------------------------------------------------------------------------- 1 | # This makefile can be used to deploy infrastructure to a GKE cluster 2 | 3 | REGISTRY= 4 | GCP_PROJECT= 5 | IMAGE= 6 | TAG= 7 | PROJECT= 8 | 9 | build: 10 | docker build -t ${IMAGE}:${TAG} . 11 | docker tag ${IMAGE}:${TAG} ${REGISTRY}/${GCP_PROJECT}/${IMAGE}:${TAG} 12 | docker push ${REGISTRY}/${GCP_PROJECT}/${IMAGE}:${TAG} 13 | 14 | login: | configure_cluster 15 | 16 | set_project: 17 | gcloud config set project ${PROJECT} 18 | 19 | NAMESPACE=airflow 20 | configure_cluster: authenticate 21 | kubectl config set-context --current --namespace=${NAMESPACE} 22 | 23 | CLUSTER_NAME= 24 | authenticate: 25 | gcloud container clusters get-credentials ${CLUSTER_NAME} --region=europe-west1 26 | 27 | deploy: 28 | kubectl apply -f k8s/namespace.yaml 29 | 30 | helm_update: 31 | helm repo add kedacore https://kedacore.github.io/charts 32 | helm repo add astronomer https://helm.astronomer.io 33 | helm repo update 34 | 35 | helm_keda: deploy 36 | helm install keda \ 37 | --namespace keda kedacore/keda 38 | 39 | helm_airflow: 40 | helm install airflow \ 41 | --set executor=CeleryExecutor \ 42 | --set workers.keda.enabled=true \ 43 | --set workers.persistence.enabled=false \ 44 | --namespace airflow \ 45 | astronomer/airflow 46 | 47 | airflow_service: 48 | kubectl port-forward svc/airflow-webserver 8080:8080 49 | 50 | flower_service: 51 | kubectl port-forward svc/airflow-flower 5555:5555 52 | 53 | IP=$(shell kubectl get svc airflow-svc -o=jsonpath='{.status.loadBalancer.ingress[0].ip}') 54 | get_url: 55 | echo http://${IP}/admin 56 | 57 | 58 | -------------------------------------------------------------------------------- /airflow-keda/dags-docker-image/.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: project 3 | -------------------------------------------------------------------------------- /airflow-keda/dags-docker-image/.dockerignore: -------------------------------------------------------------------------------- 1 | .astro 2 | .git 3 | .env 4 | airflow_settings.yaml 5 | logs/ -------------------------------------------------------------------------------- /airflow-keda/dags-docker-image/.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | airflow_settings.yaml -------------------------------------------------------------------------------- /airflow-keda/dags-docker-image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM astronomerinc/ap-airflow:latest-onbuild 2 | -------------------------------------------------------------------------------- /airflow-keda/dags-docker-image/Makefile: -------------------------------------------------------------------------------- 1 | REGISTRY= 2 | GCP_PROJECT= 3 | IMAGE= 4 | TAG= 5 | PROJECT= 6 | 7 | build: 8 | docker build -t ${IMAGE}:${TAG} . 
9 | docker tag ${IMAGE}:${TAG} ${REGISTRY}/${GCP_PROJECT}/${IMAGE}:${TAG} 10 | docker push ${REGISTRY}/${GCP_PROJECT}/${IMAGE}:${TAG} 11 | 12 | update_dags: 13 | helm upgrade airflow -n airflow \ 14 | --reuse-values \ 15 | --set images.airflow.repository=${REGISTRY}/${GCP_PROJECT}/${IMAGE} \ 16 | --set images.airflow.tag=${TAG} \ 17 | astronomer/airflow 18 | -------------------------------------------------------------------------------- /airflow-keda/dags-docker-image/dags/example-dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.dummy_operator import DummyOperator 3 | from airflow.operators.bash_operator import BashOperator 4 | from airflow.operators.python_operator import PythonOperator 5 | from datetime import datetime, timedelta 6 | 7 | 8 | def my_custom_function(ts,**kwargs): 9 | """ 10 | This can be any python code you want and is called from the python operator. The code is not executed until 11 | the task is run by the airflow scheduler. 12 | """ 13 | print(f"I am task number {kwargs['task_number']}. This DAG Run execution date is {ts} and the current time is {datetime.now()}") 14 | print('Here is the full DAG Run context. It is available because provide_context=True') 15 | print(kwargs) 16 | 17 | 18 | # Default settings applied to all tasks 19 | default_args = { 20 | 'owner': 'airflow', 21 | 'depends_on_past': False, 22 | 'email_on_failure': False, 23 | 'email_on_retry': False, 24 | 'retries': 1, 25 | 'retry_delay': timedelta(minutes=5) 26 | } 27 | 28 | # Using a DAG context manager, you don't have to specify the dag property of each task 29 | with DAG('example_dag', 30 | start_date=datetime(2019, 1, 1), 31 | max_active_runs=3, 32 | schedule_interval=timedelta(minutes=30), # https://airflow.apache.org/docs/stable/scheduler.html#dag-runs 33 | default_args=default_args, 34 | # catchup=False # enable if you don't want historical dag runs to run 35 | ) as dag: 36 | 37 | t0 = DummyOperator( 38 | task_id='start' 39 | ) 40 | 41 | t1 = DummyOperator( 42 | task_id='group_bash_tasks' 43 | ) 44 | t2 = BashOperator( 45 | task_id='bash_print_date1', 46 | bash_command='sleep $[ ( $RANDOM % 30 ) + 1 ]s && date') 47 | t3 = BashOperator( 48 | task_id='bash_print_date2', 49 | bash_command='sleep $[ ( $RANDOM % 30 ) + 1 ]s && date') 50 | 51 | # generate tasks with a loop. task_id must be unique 52 | for task in range(5): 53 | tn = PythonOperator( 54 | task_id=f'python_print_date_{task}', 55 | python_callable=my_custom_function, # make sure you don't include the () of the function 56 | op_kwargs={'task_number': task}, 57 | provide_context=True 58 | ) 59 | 60 | 61 | t0 >> tn # indented inside for loop so each task is added downstream of t0 62 | 63 | t0 >> t1 64 | t1 >> [t2, t3] # lists can be used to specify mutliple tasks 65 | -------------------------------------------------------------------------------- /airflow-keda/dags-docker-image/dags/example-generated.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.dummy_operator import DummyOperator 3 | from airflow.operators.python_operator import PythonOperator 4 | from datetime import datetime, timedelta 5 | 6 | 7 | def my_custom_function(ts,**kwargs): 8 | """ 9 | This can be any python code you want and is called from the python operator. The code is not executed until 10 | the task is run by the airflow scheduler. 
11 | """ 12 | print(f"I am task number {kwargs['task_number']}. This DAG Run execution date is {ts} and the current time is {datetime.now()}") 13 | print('Here is the full DAG Run context. It is available because provide_context=True') 14 | print(kwargs) 15 | 16 | 17 | default_args = { 18 | 'owner': 'airflow', 19 | 'depends_on_past': False, 20 | 'email_on_failure': False, 21 | 'email_on_retry': False, 22 | 'retries': 1, 23 | 'retry_delay': timedelta(minutes=5) 24 | } 25 | 26 | with DAG('example_dag_generated', 27 | start_date=datetime(2019, 1, 1), 28 | max_active_runs=3, 29 | schedule_interval=timedelta(minutes=30), 30 | default_args=default_args, 31 | ) as dag: 32 | 33 | t0 = DummyOperator( 34 | task_id='start' 35 | ) 36 | 37 | # generate tasks with a loop. task_id must be unique 38 | for task in range(20): 39 | tn = PythonOperator( 40 | task_id=f'python_print_date_{task}', 41 | python_callable=my_custom_function, 42 | op_kwargs={'task_number': task}, 43 | provide_context=True 44 | ) 45 | 46 | 47 | t0 >> tn 48 | 49 | -------------------------------------------------------------------------------- /airflow-keda/dags-docker-image/packages.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patchandpray/three-ways-to-run-airflow-on-kubernetes-code-examples/cda4b0075a916a288e57575837716927cc9f353b/airflow-keda/dags-docker-image/packages.txt -------------------------------------------------------------------------------- /airflow-keda/dags-docker-image/plugins/example-plugin.py: -------------------------------------------------------------------------------- 1 | from airflow.plugins_manager import AirflowPlugin 2 | from flask_admin.base import MenuLink 3 | 4 | """ 5 | Look for the Astronomer tab in the UI. 
6 | """ 7 | airflow_plugins_ml = MenuLink( 8 | category='Astronomer', 9 | name='Airflow-Plugins', 10 | url='https://github.com/airflow-plugins/') 11 | 12 | astro_docs_ml = MenuLink( 13 | category='Astronomer', 14 | name='Astronomer Docs', 15 | url='https://www.astronomer.io/docs/') 16 | 17 | astro_guides_ml = MenuLink( 18 | category='Astronomer', 19 | name='Airflow Guides', 20 | url='https://www.astronomer.io/guides/') 21 | 22 | 23 | class AstroLinksPlugin(AirflowPlugin): 24 | name = 'astronomer_menu_links' 25 | operators = [] 26 | flask_blueprints = [] 27 | hooks = [] 28 | executors = [] 29 | macros = [] 30 | admin_views = [] 31 | menu_links = [airflow_plugins_ml, astro_docs_ml, astro_guides_ml] 32 | appbuilder_views = [] 33 | appbuilder_menu_items = [ 34 | { 35 | "name": ml.name, 36 | "category": ml.category, 37 | "category_icon": "fa-rocket", 38 | "href": ml.url, 39 | } for ml in menu_links 40 | ] -------------------------------------------------------------------------------- /airflow-keda/dags-docker-image/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patchandpray/three-ways-to-run-airflow-on-kubernetes-code-examples/cda4b0075a916a288e57575837716927cc9f353b/airflow-keda/dags-docker-image/requirements.txt -------------------------------------------------------------------------------- /airflow-keda/k8s/namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: airflow 5 | --- 6 | apiVersion: v1 7 | kind: Namespace 8 | metadata: 9 | name: keda 10 | -------------------------------------------------------------------------------- /airflow-keda/keda/scaledobject.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: keda.k8s.io/v1alpha1 2 | kind: ScaledObject 3 | metadata: 4 | name: airflow-worker 5 | spec: 6 | scaleTargetRef: 7 | deploymentName: airflow-worker 8 | pollingInterval: 10 # Optional. Default: 30 seconds 9 | cooldownPeriod: 30 # Optional. Default: 300 seconds 10 | maxReplicaCount: 10 # Optional. Default: 100 11 | triggers: 12 | - type: postgresql 13 | metadata: 14 | connection: AIRFLOW_CONN_AIRFLOW_DB 15 | query: "SELECT ceil(COUNT(*)::decimal / 4) FROM task_instance WHERE state='running' OR state='queued'" 16 | targetQueryValue: "1" 17 | --------------------------------------------------------------------------------
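
The deploy target in the airflow-k8sexecutor and airflow-k8spodoperator Makefiles applies k8s/role.yaml and k8s/rolebinding.yaml, but those two manifests are not part of this listing. Below is a minimal sketch of what they might contain, assuming the scheduler and worker pods only need to create, watch and delete pods and read pod logs in their own namespace (which is what the KubernetesExecutor and the KubernetesPodOperator rely on). The role name is illustrative, and the namespace and "default" service account follow the airflow-k8sexecutor example; the airflow-k8spodoperator project would use its own namespace in the same way.

apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: airflow-pod-manager        # illustrative name, not taken from the repo
  namespace: airflow-k8sexecutor
rules:
- apiGroups: [""]
  resources: ["pods"]
  verbs: ["create", "get", "list", "watch", "delete"]
- apiGroups: [""]
  resources: ["pods/log"]
  verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: airflow-pod-manager        # illustrative name
  namespace: airflow-k8sexecutor
subjects:
- kind: ServiceAccount
  name: default                    # matches AIRFLOW__KUBERNETES__WORKER_SERVICE_ACCOUNT_NAME in airflow.yaml
  namespace: airflow-k8sexecutor
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: airflow-pod-manager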