├── setup-pentaho ├── kettle-properties │ ├── dev-kettle.properties │ ├── qa-kettle.properties │ ├── prod-kettle.properties │ └── localhost-kettle.properties ├── carte-master-config.xml ├── repositories.xml ├── carte-slave-config.xml ├── simple-jndi │ └── jdbc.properties ├── docker-entrypoint.sh └── Dockerfile ├── .gitignore ├── setup-airflow ├── Dockerfile ├── execute-carte.sh └── airflow.cfg ├── source-code ├── dags │ ├── async-trigger.py │ ├── hello-world.py │ ├── sync-trigger.py │ ├── load-testing.py │ └── utils │ │ └── execute_pdi.py └── ktrs │ ├── helloworld │ ├── helloworld-job.kjb │ └── helloworld-trans.ktr │ └── process1 │ ├── task2.ktr │ └── task1.ktr ├── docker-compose.yaml └── README.md /setup-pentaho/kettle-properties/dev-kettle.properties: -------------------------------------------------------------------------------- 1 | HOST_ENV=dev 2 | PDI_TEST_MESSAGE="Hi from Dev PDI!!!" 3 | -------------------------------------------------------------------------------- /setup-pentaho/kettle-properties/qa-kettle.properties: -------------------------------------------------------------------------------- 1 | HOST_ENV=qa 2 | PDI_TEST_MESSAGE="Hi from QA PDI!!!" 3 | -------------------------------------------------------------------------------- /setup-pentaho/kettle-properties/prod-kettle.properties: -------------------------------------------------------------------------------- 1 | HOST_ENV=prod 2 | PDI_TEST_MESSAGE="Hi from Prod PDI!!!" 3 | -------------------------------------------------------------------------------- /setup-pentaho/kettle-properties/localhost-kettle.properties: -------------------------------------------------------------------------------- 1 | HOST_ENV=localhost 2 | PDI_TEST_MESSAGE="Hi from Localhost PDI!!!" 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | setup-airflow/logs 3 | setup-airflow/plugins 4 | setup-pentaho/logs 5 | __pycache__ 6 | .meta 7 | .env 8 | # jdbc.properties -------------------------------------------------------------------------------- /setup-pentaho/carte-master-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | CARTE_NAME 5 | CARTE_NETWORK_INTERFACE 6 | CARTE_PORT 7 | CARTE_USER 8 | CARTE_PASSWORD 9 | CARTE_IS_MASTER 10 | 11 | 12 | -------------------------------------------------------------------------------- /setup-pentaho/repositories.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | KettleFileRepository 5 | test-repo 6 | .kjb or .ktr files 7 | /home/pentaho/repositories 8 | N 9 | N 10 | 11 | -------------------------------------------------------------------------------- /setup-airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.0.1 2 | 3 | USER root 4 | 5 | # Install environment dependencies 6 | RUN apt-get update \ 7 | # xmlstarlet package is required by Airflow to read XML log generated by Carte server running in separate container 8 | && apt-get install xmlstarlet -y \ 9 | # Upgrade PIP 10 | && pip install --upgrade pip \ 11 | # Install project specific packages 12 | && pip install 'apache-airflow[postgres]' 13 | 14 | USER airflow -------------------------------------------------------------------------------- /setup-pentaho/carte-slave-config.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | CARTE_MASTER_NAME 7 | CARTE_MASTER_HOSTNAME 8 | CARTE_MASTER_PORT 9 | CARTE_MASTER_USER 10 | CARTE_MASTER_PASSWORD 11 | CARTE_MASTER_IS_MASTER 12 | 13 | 14 | 15 | 16 | CARTE_REPORT_TO_MASTERS 17 | 18 | 19 | CARTE_NAME 20 | CARTE_NETWORK_INTERFACE 21 | CARTE_PORT 22 | CARTE_USER 23 | CARTE_PASSWORD 24 | CARTE_IS_MASTER 25 | 26 | 27 | -------------------------------------------------------------------------------- /setup-pentaho/simple-jndi/jdbc.properties: -------------------------------------------------------------------------------- 1 | # Reference: https://help.pentaho.com/Documentation/9.1/Setup/JDBC_drivers_reference 2 | # Caution: this file stores DB credentials, hence it should be added to .gitignore. Instead create this file in the server and mount it to PDI container using docker compose. 3 | 4 | 5 | # Localhost PostGres DB connection string 6 | db-localhost/type=javax.sql.DataSource 7 | db-localhost/driver=org.postgresql.Driver 8 | db-localhost/url=jdbc:postgresql://host.docker.internal/database 9 | db-localhost/user=postgres 10 | db-localhost/password=postgres 11 | 12 | # Dev PostGres DB connection string 13 | db-dev/type=javax.sql.DataSource 14 | db-dev/driver=org.postgresql.Driver 15 | db-dev/url=jdbc:postgresql://[:]/ 16 | db-dev/user=postgres 17 | db-dev/password=postgres 18 | 19 | # QA PostGres DB connection string 20 | db-qa/type=javax.sql.DataSource 21 | db-qa/driver=org.postgresql.Driver 22 | db-qa/url=jdbc:postgresql://[:]/ 23 | db-qa/user=postgres 24 | db-qa/password=postgres 25 | 26 | # PROD PostGres DB connection string 27 | db-prod/type=javax.sql.DataSource 28 | db-prod/driver=org.postgresql.Driver 29 | db-prod/url=jdbc:postgresql://[:]/ 30 | db-prod/user=postgres 31 | db-prod/password=postgres -------------------------------------------------------------------------------- /source-code/dags/async-trigger.py: -------------------------------------------------------------------------------- 1 | # To illustrate asynchronous data processing triggers from Airflow container to PDI container 2 | 3 | from airflow import DAG 4 | from airflow.utils.dates import days_ago 5 | from airflow.operators.bash_operator import BashOperator 6 | from airflow.operators.dummy import DummyOperator 7 | 8 | args = { 9 | "owner": "airflow", 10 | "start_date": days_ago(1), 11 | "depends_on_past": False, 12 | "wait_for_downstream": False, 13 | "catchup": False, 14 | } 15 | 16 | 17 | with DAG( 18 | dag_id="async-trigger", 19 | default_args=args, 20 | schedule_interval=None, 21 | catchup=False, 22 | description=f"To illustrate asynchronous task triggers from Airflow container to PDI container", 23 | ) as dag: 24 | 25 | t1 = DummyOperator( 26 | task_id='Start', 27 | ) 28 | 29 | t2 = BashOperator( 30 | task_id='Task_1', 31 | bash_command='curl "${PDI_CONN_STR}/kettle/executeTrans/?rep=test-repo&d&trans=/process1/task1"' 32 | ) 33 | 34 | t3 = BashOperator( 35 | task_id='Task_2', 36 | bash_command='curl "${PDI_CONN_STR}/kettle/executeTrans/?rep=test-repo&trans=/process1/task2"' 37 | ) 38 | 39 | t4 = DummyOperator( 40 | task_id='Stop', 41 | ) 42 | 43 | t1 >> t2 >> t3 >> t4 -------------------------------------------------------------------------------- /source-code/dags/hello-world.py: -------------------------------------------------------------------------------- 1 | # To illustrate how we can trigger a job/transformation in the PDI container via Carte APIs 2 | # Reference: 
https://help.pentaho.com/Documentation/9.1/Developer_center/REST_API_Reference/Carte 3 | 4 | from airflow import DAG 5 | from airflow.utils.dates import days_ago 6 | from airflow.operators.bash_operator import BashOperator 7 | from airflow.operators.dummy import DummyOperator 8 | 9 | args = { 10 | "owner": "airflow", 11 | "start_date": days_ago(1), 12 | "depends_on_past": False, 13 | "wait_for_downstream": False, 14 | "catchup": False, 15 | } 16 | 17 | 18 | with DAG( 19 | dag_id="hello-world", 20 | default_args=args, 21 | schedule_interval=None, 22 | catchup=False, 23 | description=f"Hello world!!!", 24 | ) as dag: 25 | 26 | start = DummyOperator( 27 | task_id='Start', 28 | ) 29 | 30 | t1 = BashOperator( 31 | task_id='Trigger_Job', 32 | bash_command='curl "${PDI_CONN_STR}/kettle/executeJob/?rep=test-repo&job=/helloworld/helloworld-job"' 33 | ) 34 | 35 | t2 = BashOperator( 36 | task_id='Trigger_Transformation', 37 | bash_command='curl "${PDI_CONN_STR}/kettle/executeTrans/?rep=test-repo&trans=/helloworld/helloworld-trans"' 38 | ) 39 | 40 | stop = DummyOperator( 41 | task_id='Stop', 42 | ) 43 | 44 | start >> [t1, t2] >> stop -------------------------------------------------------------------------------- /source-code/dags/sync-trigger.py: -------------------------------------------------------------------------------- 1 | # To illustrate synchronous task triggers from Airflow container to PDI container 2 | 3 | from airflow import DAG 4 | from airflow.utils.dates import days_ago 5 | from airflow.operators.bash_operator import BashOperator 6 | from airflow.operators.dummy import DummyOperator 7 | from utils.execute_pdi import execute_trans 8 | 9 | args = { 10 | "owner": "airflow", 11 | "start_date": days_ago(1), 12 | "depends_on_past": False, 13 | "wait_for_downstream": False, 14 | "catchup": False, 15 | } 16 | 17 | 18 | with DAG( 19 | dag_id="sync-trigger", 20 | default_args=args, 21 | schedule_interval=None, 22 | catchup=False, 23 | description=f"To illustrate synchronous task triggers from Airflow container to PDI container", 24 | ) as dag: 25 | 26 | t1 = DummyOperator( 27 | task_id='Start', 28 | ) 29 | 30 | t2 = BashOperator( 31 | task_id='Task_1', 32 | bash_command=execute_trans( 33 | rep="test-repo", 34 | task="task1", 35 | dir="/process1/", 36 | param="" 37 | ) 38 | ) 39 | 40 | t3 = BashOperator( 41 | task_id='Task_2', 42 | bash_command=execute_trans( 43 | rep="test-repo", 44 | task="task2", 45 | dir="/process1/", 46 | param="" 47 | ) 48 | ) 49 | 50 | t4 = DummyOperator( 51 | task_id='Stop', 52 | ) 53 | 54 | t1 >> t2 >> t3 >> t4 -------------------------------------------------------------------------------- /source-code/dags/load-testing.py: -------------------------------------------------------------------------------- 1 | # To illustrate how we can trigger a job/transformation in the PDI container via Carte APIs 2 | # Reference: https://help.pentaho.com/Documentation/9.1/Developer_center/REST_API_Reference/Carte 3 | 4 | from airflow import DAG 5 | from airflow.utils.dates import days_ago 6 | from airflow.operators.bash_operator import BashOperator 7 | from airflow.operators.dummy import DummyOperator 8 | 9 | args = { 10 | "owner": "airflow", 11 | "start_date": days_ago(1), 12 | "depends_on_past": False, 13 | "wait_for_downstream": False, 14 | "catchup": False, 15 | } 16 | 17 | 18 | with DAG( 19 | dag_id="load-testing", 20 | default_args=args, 21 | schedule_interval=None, 22 | catchup=False, 23 | description=f"Run multiple tasks in parallel for load testing", 24 | ) as dag: 25 | 26 | 
start = DummyOperator( 27 | task_id='Start', 28 | ) 29 | 30 | t1 = BashOperator( 31 | task_id='Trigger_Job1', 32 | bash_command='curl "${PDI_CONN_STR}/kettle/executeJob/?rep=test-repo&job=/helloworld/helloworld-job"' 33 | ) 34 | 35 | t2 = BashOperator( 36 | task_id='Trigger_Job2', 37 | bash_command='curl "${PDI_CONN_STR}/kettle/executeJob/?rep=test-repo&job=/helloworld/helloworld-job"' 38 | ) 39 | 40 | t3 = BashOperator( 41 | task_id='Trigger_Job3', 42 | bash_command='curl "${PDI_CONN_STR}/kettle/executeJob/?rep=test-repo&job=/helloworld/helloworld-job"' 43 | ) 44 | 45 | t4 = BashOperator( 46 | task_id='Trigger_Transformation', 47 | bash_command='curl "${PDI_CONN_STR}/kettle/executeTrans/?rep=test-repo&trans=/helloworld/helloworld-trans"' 48 | ) 49 | 50 | stop = DummyOperator( 51 | task_id='Stop', 52 | ) 53 | 54 | start >> [t1, t2, t3, t4] >> stop -------------------------------------------------------------------------------- /setup-pentaho/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # based on https://github.com/aloysius-lim/docker-pentaho-di/blob/master/docker/Dockerfile 3 | set -e 4 | 5 | if [ "$1" = 'carte.sh' ]; then 6 | if [ ! -f "$KETTLE_HOME/carte.config.xml" ]; then 7 | # Set variables to default if not explicitly provided 8 | : ${CARTE_NAME:=carte-server} 9 | : ${CARTE_NETWORK_INTERFACE:=eth0} 10 | : ${CARTE_PORT:=8181} 11 | : ${CARTE_USER:=cluster} 12 | : ${CARTE_PASSWORD:=cluster} 13 | : ${CARTE_IS_MASTER:=Y} 14 | 15 | : ${CARTE_INCLUDE_MASTERS:=N} 16 | 17 | : ${CARTE_REPORT_TO_MASTERS:=Y} 18 | : ${CARTE_MASTER_NAME:=carte-master} 19 | : ${CARTE_MASTER_HOSTNAME:=localhost} 20 | : ${CARTE_MASTER_PORT:=8181} 21 | : ${CARTE_MASTER_USER:=cluster} 22 | : ${CARTE_MASTER_PASSWORD:=cluster} 23 | : ${CARTE_MASTER_IS_MASTER:=Y} 24 | 25 | # Copy master or slave config file based on the CARTE_INCLUDE_MASTERS flag 26 | if [ "$CARTE_INCLUDE_MASTERS" = "Y" ]; then 27 | cp $PENTAHO_HOME/templates/carte-slave-config.xml "$KETTLE_HOME/carte.config.xml" 28 | sed -i "s/CARTE_REPORT_TO_MASTERS/$CARTE_REPORT_TO_MASTERS/" "$KETTLE_HOME/carte.config.xml" 29 | sed -i "s/CARTE_MASTER_NAME/$CARTE_MASTER_NAME/" "$KETTLE_HOME/carte.config.xml" 30 | sed -i "s/CARTE_MASTER_HOSTNAME/$CARTE_MASTER_HOSTNAME/" "$KETTLE_HOME/carte.config.xml" 31 | sed -i "s/CARTE_MASTER_PORT/$CARTE_MASTER_PORT/" "$KETTLE_HOME/carte.config.xml" 32 | sed -i "s/CARTE_MASTER_USER/$CARTE_MASTER_USER/" "$KETTLE_HOME/carte.config.xml" 33 | sed -i "s/CARTE_MASTER_PASSWORD/$CARTE_MASTER_PASSWORD/" "$KETTLE_HOME/carte.config.xml" 34 | sed -i "s/CARTE_MASTER_IS_MASTER/$CARTE_MASTER_IS_MASTER/" "$KETTLE_HOME/carte.config.xml" 35 | else 36 | cp $PENTAHO_HOME/templates/carte-master-config.xml "$KETTLE_HOME/carte.config.xml" 37 | fi 38 | sed -i "s/CARTE_NAME/$CARTE_NAME/" "$KETTLE_HOME/carte.config.xml" 39 | sed -i "s/CARTE_NETWORK_INTERFACE/$CARTE_NETWORK_INTERFACE/" "$KETTLE_HOME/carte.config.xml" 40 | sed -i "s/CARTE_PORT/$CARTE_PORT/" "$KETTLE_HOME/carte.config.xml" 41 | sed -i "s/CARTE_USER/$CARTE_USER/" "$KETTLE_HOME/carte.config.xml" 42 | sed -i "s/CARTE_PASSWORD/$CARTE_PASSWORD/" "$KETTLE_HOME/carte.config.xml" 43 | sed -i "s/CARTE_IS_MASTER/$CARTE_IS_MASTER/" "$KETTLE_HOME/carte.config.xml" 44 | fi 45 | fi 46 | 47 | exec "$@" -------------------------------------------------------------------------------- /source-code/dags/utils/execute_pdi.py: -------------------------------------------------------------------------------- 1 | """ 2 | Summary: This module 
contains helper functions required to build 3 | the Carte API URL trigger for PDI Jobs/Transformations. 4 | Params: 5 | - rep: PDI repository containing the job/trans; value must exist in the tag of repositories.xml 6 | - task: job/trans file name present in the folder; do not include the .ktr/.kjb extension 7 | - dir: '/' if job/trans file is in root directory inside . Else '/subfolder1/subfolder2/.../'. Do not include the job/trans file name 8 | - param: pass parameters for the job/trans. Add '&' if multiple, ex: param1=value&param2=value 9 | """ 10 | 11 | 12 | def execute_command(executionType, task_type, rep, task, dir, param): 13 | 14 | command = "bash /opt/airflow/execute-carte.sh " 15 | command += f'''{task} "{executionType}/?rep={rep}&{task_type}={dir}{task}&{param}"''' 16 | 17 | return command 18 | 19 | def execute_trans(rep, task, dir, param=''): 20 | """Summary: Build executeTrans Carte API URL 21 | 22 | Args: 23 | rep (string): [PDI repository containing the transformation .ktr file. Value must exist in the tag of repositories.xml] 24 | task ([string]): [transformation file name present in the folder. Do not include the .ktr extension] 25 | dir ([string]): ['/' if file is in root directory inside . Else '/subfolder1/subfolder2/.../'. Do not include the file name] 26 | param ([string]): [If required to pass parameters to the transformation. Add '&' if multiple, ex: param1=value&param2=value] 27 | 28 | Returns: 29 | [string]: [Carte executeTrans API URL for the transformation] 30 | """ 31 | 32 | command = execute_command( 33 | executionType="executeTrans", task_type="trans", 34 | rep=rep, task=task, dir=dir, param=param 35 | ) 36 | 37 | return command 38 | 39 | def execute_job(rep, task, dir, param=''): 40 | """Summary: Build executeJob Carte API URL 41 | 42 | Args: 43 | rep (string): [PDI repository containing the job .kjb file. Value must exist in the tag of repositories.xml] 44 | task ([string]): [job file name present in the folder. Do not include the .kjb extension] 45 | dir ([string]): ['/' if file is in root directory inside . Else '/subfolder1/subfolder2/.../'. Do not include the file name] 46 | param ([string]): [If required to pass parameters to the job. Add '&' if multiple, ex: param1=value&param2=value] 47 | 48 | Returns: 49 | [string]: [Carte executeJob API URL for the job] 50 | """ 51 | command = execute_command( 52 | executionType="executeJob", task_type="job", 53 | rep=rep, task=task, dir=dir, param=param 54 | ) 55 | 56 | return command -------------------------------------------------------------------------------- /setup-airflow/execute-carte.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script enables the Airflow container to stream the log generated by the Carte server. 3 | # It helps in tracking the job/transformation status and triggering/skipping downstream processes accordingly.
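# Expected arguments (as assembled by source-code/dags/utils/execute_pdi.py):
#   $1 - PDI task name, e.g. task1
#   $2 - Carte command fragment, e.g. executeTrans/?rep=test-repo&trans=/process1/task1&param1=value1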
4 | # Based on and modified from the log tracking mechanism discussed in : 5 | # https://diethardsteiner.github.io/pdi/2020/04/01/Scheduling-a-PDI-Job-on-Apache-Airflow.html 6 | 7 | 8 | CARTE_SERVER_URL=$PDI_CONN_STR 9 | PDI_LOG_LEVEL=Basic 10 | SLEEP_INTERVAL_SECONDS=5 11 | PDI_TASK=$1 12 | PDI_TASK_CMD=$2 13 | 14 | 15 | if [[ $PDI_TASK_CMD == *"rep="* ]] && [[ $PDI_TASK_CMD == *"job="* || $PDI_TASK_CMD == *"trans="* ]]; then 16 | 17 | set PDI_TASK_ID 18 | set PDI_TASK_STATUS 19 | 20 | 21 | # Execute task and get its Task ID 22 | if [[ $PDI_TASK_CMD == *"executeJob"* ]]; then 23 | PDI_TASK_ID=$(curl -s "${CARTE_SERVER_URL}/kettle/${PDI_TASK_CMD}&level=${PDI_LOG_LEVEL}" | xmlstarlet sel -t -m '/webresult/id' -v . -n) 24 | echo "The PDI Task ID is: " ${PDI_TASK_ID} 25 | else 26 | PDI_TASK_ID=$(curl -s "${CARTE_SERVER_URL}/kettle/${PDI_TASK_CMD}&level=${PDI_LOG_LEVEL}") 27 | fi 28 | 29 | getPDITaskStatus() { 30 | if [[ $PDI_TASK_CMD == *"executeTrans"* ]]; then 31 | curl -s "${CARTE_SERVER_URL}/kettle/transStatus/?name=${PDI_TASK}&id=${PDI_TASK_ID}&xml=Y" | xmlstarlet sel -t -m '/transstatus/status_desc' -v . -n 32 | else 33 | curl -s "${CARTE_SERVER_URL}/kettle/jobStatus/?name=${PDI_TASK}&id=${PDI_TASK_ID}&xml=Y" | xmlstarlet sel -t -m '/jobstatus/status_desc' -v . -n 34 | fi 35 | } 36 | 37 | getPDITaskFullLog() { 38 | if [[ $PDI_TASK_CMD == *"executeTrans"* ]]; then 39 | echo "Check carte server for transformation log!!!" 40 | else 41 | curl -s "${CARTE_SERVER_URL}/kettle/jobStatus/?name=${PDI_TASK}&id=${PDI_TASK_ID}&xml=Y" | xmlstarlet sel -t -m 'jobstatus/result/log_text' -v . -n 42 | fi 43 | } 44 | 45 | PDI_TASK_STATUS=$(getPDITaskStatus) 46 | 47 | # loop as long as the job is running 48 | while [ ${PDI_TASK_STATUS} == "Running" ] 49 | do 50 | PDI_TASK_STATUS=$(getPDITaskStatus) 51 | echo "The PDI task status is: " ${PDI_TASK_STATUS} 52 | echo "I'll check in ${SLEEP_INTERVAL_SECONDS} seconds again" 53 | # check every x seconds 54 | sleep ${SLEEP_INTERVAL_SECONDS} 55 | done 56 | 57 | # get and print full pdi task log 58 | echo "The PDI task status is: " ${PDI_TASK_STATUS} 59 | echo "Printing full log ..." 60 | echo $(getPDITaskFullLog) 61 | 62 | # Check if any error. Send exit 1 if so. 63 | if [[ ${PDI_TASK_STATUS} == "Finished" ]]; then 64 | exit 0 65 | else 66 | exit 1 67 | fi 68 | 69 | else 70 | echo "Error executing: ${PDI_TASK_CMD}\n File or directory not found." 71 | exit 1 72 | fi -------------------------------------------------------------------------------- /setup-pentaho/Dockerfile: -------------------------------------------------------------------------------- 1 | # Get Base image 2 | FROM openjdk:8-jre 3 | 4 | LABEL maintainer="saritkumarsi@gmail.com" \ 5 | version="1.1" \ 6 | description="Docker file builds container with Pentaho Data Integration & Carte Server" 7 | 8 | # Set PDI user with permissions same as the Host machine. 
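# (PENTAHO_UID/PENTAHO_GID are supplied as build args by docker-compose; the README's .env example
#  sets PENTAHO_UID=$(id -u) and PENTAHO_GID=0 so the container user gets the same access privileges as the host user.)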
9 | ARG PENTAHO_GID 10 | ARG PENTAHO_UID 11 | 12 | # Set required environment vars 13 | ENV PDI_RELEASE=9.1 \ 14 | PDI_VERSION=9.1.0.0-324 \ 15 | PENTAHO_JAVA_HOME=/usr/local/openjdk-8 \ 16 | PENTAHO_HOME=/home/pentaho \ 17 | PENTAHO_UID=${PENTAHO_UID} \ 18 | PENTAHO_GID=${PENTAHO_GID} \ 19 | KETTLE_HOME=/opt/data-integration \ 20 | PATH=${KETTLE_HOME}:${PATH} 21 | 22 | # Create Pentaho user home directory and required sub-folders 23 | RUN mkdir -p ${PENTAHO_HOME}/templates ${PENTAHO_HOME}/scripts \ 24 | # Create Pentaho group passed as PENTAHO_GID arg 25 | && groupadd -r ${PENTAHO_GID} \ 26 | # Create Pentaho user with PENTAHO_UID same as Host UID, and assign to new group 27 | && useradd -s /bin/bash -d ${PENTAHO_HOME} -r -g ${PENTAHO_GID} -u ${PENTAHO_UID} pentaho \ 28 | # Download PDI and save in PENTAHO_HOME 29 | && /usr/bin/wget --progress=dot:giga \ 30 | https://sourceforge.net/projects/pentaho/files/Pentaho%20${PDI_RELEASE}/client-tools/pdi-ce-${PDI_VERSION}.zip \ 31 | -P ${PENTAHO_HOME} \ 32 | # Unzip PDI to /opt/ directory 33 | && /usr/bin/unzip -q ${PENTAHO_HOME}/pdi-ce-${PDI_VERSION}.zip -d /opt/ \ 34 | # Clean up downloaded files 35 | && rm -R ${PENTAHO_HOME}/pdi-ce-${PDI_VERSION}.zip \ 36 | # Create directory for the kettle.properties file 37 | && mkdir ${KETTLE_HOME}/.kettle \ 38 | # Make pentaho user owner of both PENTAHO_HOME and KETTLE_HOME directories 39 | && chown -R pentaho ${PENTAHO_HOME} ${KETTLE_HOME} 40 | 41 | # Copy carte configs and docker entrypoint file 42 | COPY carte-*-config.xml ${PENTAHO_HOME}/templates/ 43 | COPY docker-entrypoint.sh ${PENTAHO_HOME}/scripts/ 44 | 45 | # Make entrypoint as executable 46 | RUN chmod +x ${PENTAHO_HOME}/scripts/docker-entrypoint.sh 47 | 48 | # # Install App dependent packages, if any 49 | # # Download MySQL JDBC Connector to PENTAHO_HOME 50 | # RUN /usr/bin/wget --progress=dot:giga \ 51 | # https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-5.1.49.zip -P ${PENTAHO_HOME} \ 52 | # # Unzip MySQL jar file 53 | # && /usr/bin/unzip -q ${PENTAHO_HOME}/mysql-connector-java-5.1.49.zip -d ${PENTAHO_HOME} \ 54 | # # Copy unzipped jar file to /opt/data-integration 55 | # && cp ${PENTAHO_HOME}/mysql-connector-java-5.1.49/mysql-connector-java-5.1.49-bin.jar /opt/data-integration/lib \ 56 | # # Clean up downloaded files 57 | # && rm -R ${PENTAHO_HOME}/mysql-connector* 58 | 59 | USER pentaho 60 | 61 | # Expose Carte Server 62 | EXPOSE ${CARTE_PORT} 63 | 64 | # Set working directory 65 | WORKDIR ${KETTLE_HOME} 66 | 67 | # Set container entrypoint. Sets all required configs for carte server. 
68 | ENTRYPOINT ["/home/pentaho/scripts/docker-entrypoint.sh"] 69 | 70 | # Start Carte Server - the entry point sets configs in carte.config.xml which is passed to carte.sh 71 | CMD ["carte.sh", "carte.config.xml"] -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | x-pdi-common: 3 | &pdi-common 4 | build: 5 | context: ./setup-pentaho 6 | dockerfile: Dockerfile 7 | args: 8 | PENTAHO_UID: ${PENTAHO_UID} 9 | PENTAHO_GID: ${PENTAHO_GID} 10 | image: pdi 11 | environment: 12 | &pdi-common-env 13 | HOST_ENV: ${HOST_ENV:-localhost} 14 | PENTAHO_DI_JAVA_OPTIONS: ${PENTAHO_DI_JAVA_OPTIONS} 15 | CARTE_USER: ${CARTE_USER} 16 | CARTE_PASSWORD: ${CARTE_PASSWORD} 17 | volumes: 18 | # - /var/run/docker.sock:/var/run/docker.sock 19 | - ./source-code/ktrs:/home/pentaho/repositories 20 | - ./setup-pentaho/logs:/opt/data-integration/logs 21 | - ./setup-pentaho/repositories.xml:/opt/data-integration/.kettle/repositories.xml 22 | - ./setup-pentaho/kettle-properties/${HOST_ENV:-localhost}-kettle.properties:/opt/data-integration/.kettle/kettle.properties 23 | - ./setup-pentaho/simple-jndi:/opt/data-integration/simple-jndi 24 | deploy: 25 | restart_policy: 26 | condition: on-failure 27 | max_attempts: 3 28 | 29 | x-airflow-common: 30 | &airflow-common 31 | build: ./setup-airflow 32 | image: airflow 33 | environment: 34 | &airflow-common-env 35 | HOST_ENV: ${HOST_ENV} 36 | AIRFLOW_UID: ${AIRFLOW_UID} 37 | AIRFLOW_GID: ${AIRFLOW_GID} 38 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 39 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@airflow-database/airflow 40 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@airflow-database/airflow 41 | AIRFLOW__CELERY__BROKER_URL: redis://:@airflow-broker:6379/0 42 | AIRFLOW__CORE__FERNET_KEY: '' 43 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 44 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 45 | PDI_CONN_STR: http://${CARTE_USER:-cluster}:${CARTE_PASSWORD:-cluster}@pdi-master:${CARTE_HOST_PORT:-8181} 46 | volumes: 47 | - ./source-code/dags:/opt/airflow/dags 48 | - ./setup-airflow/plugins:/opt/airflow/plugins 49 | - ./setup-airflow/logs:/opt/airflow/logs 50 | - ./setup-airflow/execute-carte.sh:/opt/airflow/execute-carte.sh 51 | - ./setup-airflow/airflow.cfg:/opt/airflow/airflow.cfg 52 | depends_on: 53 | airflow-broker: 54 | condition: service_healthy 55 | airflow-database: 56 | condition: service_healthy 57 | 58 | 59 | services: 60 | # Airflow-DB 61 | airflow-database: 62 | image: postgres:13 63 | container_name: airflow-database 64 | environment: 65 | POSTGRES_USER: airflow 66 | POSTGRES_PASSWORD: airflow 67 | POSTGRES_DB: airflow 68 | volumes: 69 | - postgres-db-volume:/var/lib/postgresql/data 70 | healthcheck: 71 | test: ["CMD", "pg_isready", "-U", "airflow"] 72 | interval: 5s 73 | retries: 5 74 | restart: always 75 | 76 | # Airflow-messenger 77 | airflow-broker: 78 | image: redis:latest 79 | container_name: airflow-broker 80 | ports: 81 | - 6379:6379 82 | healthcheck: 83 | test: ["CMD", "redis-cli", "ping"] 84 | interval: 5s 85 | timeout: 30s 86 | retries: 50 87 | restart: always 88 | 89 | # Airflow-webserver 90 | airflow-webserver: 91 | <<: *airflow-common 92 | container_name: airflow-webserver 93 | command: webserver 94 | ports: 95 | - ${AIRFLOW_HOST_PORT:-8080}:8080 96 | healthcheck: 97 | test: ["CMD", "curl", "--fail", "http://localhost:${AIRFLOW_HOST_PORT:-8080}/health"] 98 | 
interval: 10s 99 | timeout: 10s 100 | retries: 5 101 | restart: always 102 | 103 | # Airflow-scheduler 104 | airflow-scheduler: 105 | <<: *airflow-common 106 | container_name: airflow-scheduler 107 | command: scheduler 108 | restart: always 109 | 110 | # Airflow-worker 111 | airflow-worker: 112 | <<: *airflow-common 113 | command: celery worker 114 | restart: always 115 | 116 | # Airflow-DB-initialize 117 | airflow-init: 118 | <<: *airflow-common 119 | container_name: airflow-init 120 | command: version 121 | environment: 122 | <<: *airflow-common-env 123 | _AIRFLOW_DB_UPGRADE: 'true' 124 | _AIRFLOW_WWW_USER_CREATE: 'true' 125 | _AIRFLOW_WWW_USER_USERNAME: ${AIRFLOW_ADMIN_USER:-airflow} 126 | _AIRFLOW_WWW_USER_PASSWORD: ${AIRFLOW_ADMIN_PASSWORD:-airflow} 127 | _AIRFLOW_WWW_USER_EMAIL: ${AIRFLOW_ADMIN_EMAIL:-admin@admin.com} 128 | 129 | # Pentaho 130 | pdi-master: 131 | << : *pdi-common 132 | container_name: pdi-master 133 | environment: 134 | <<: *pdi-common-env 135 | ports: 136 | - ${CARTE_HOST_PORT:-8181}:8181 137 | 138 | # pdi-child: 139 | # << : *pdi-common 140 | # container_name: pdi-child 141 | # ports: 142 | # - 8182 143 | # depends_on: 144 | # - pdi-master 145 | # environment: 146 | # <<: *pdi-common-env 147 | # CARTE_PORT: 8182 148 | # CARTE_IS_MASTER: 'N' 149 | # CARTE_INCLUDE_MASTERS: 'Y' 150 | # CARTE_MASTER_HOSTNAME: 'pdi-master' 151 | # CARTE_MASTER_PORT: ${CARTE_HOST_PORT:-8181} 152 | 153 | volumes: 154 | postgres-db-volume: -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Step by step approach to easily dockerize Airflow and Pentaho Data Integration **IN SEPARATE CONTAINERS**. 4 | Below is the high level architecture of the setup: 5 | - Airflow: 6 | - Orchestrator container 7 | - Sends transformation/job metadata as task to Pentaho container 8 | 9 | - Pentaho: 10 | - Container receives transformation/job details as task to be done 11 | - Performs (runs) the actual task (transformation/job) 12 | 13 | 14 | # Pre-requisites 15 | - [Docker Engine](https://docs.docker.com/engine/install/) 16 | - [Docker Compose](https://docs.docker.com/compose/install/) 17 | 18 | # Versions 19 | - Airflow 2.0 20 | - PDI 9.1 21 | 22 | # Setup 23 | Change directory to the project folder before performing below steps. 24 | 25 | ### Environment variables, files & folders for containers 26 | - Create a .env file and add the user and group Ids for the respective containers. 27 | This is required for the containers to have same access privileges as that of the host user during docker compose. 28 | 29 | echo -e "PENTAHO_UID=$(id -u)\nPENTAHO_GID=0\nAIRFLOW_UID=$(id -u)\nAIRFLOW_GID=0" > .env 30 | 31 | - If needed, append the below optional variables to the above .env file. 32 | 33 | echo -e "=" >> .env 34 | - HOST_ENV --> run containers as localhost/dev/qa/prod. This will copy corresponding kettle.properties into the PDI container. Also enables PDI transformations to pick environment specific DB JNDI connections during execution. Can be used by Airflow to connect to corresponding resources. 35 | - CARTE_USER --> Default: cluster 36 | - CARTE_PASSWORD --> Default: cluster 37 | - AIRFLOW_ADMIN_USER --> Create Web UI user. Default: airflow 38 | - AIRFLOW_ADMIN_PASSWORD --> Default: airflow 39 | - AIRFLOW_ADMIN_EMAIL --> Required if new user to be created 40 | - PENTAHO_DI_JAVA_OPTIONS --> Allocate JVM memory to PDI container, based on host machine RAM. 
Increase if the container crashes due to GC Out of Memory errors. Ex: for min. 1G and max. 4G, set this to "-Xms1g -Xmx4g" 41 | - CARTE_HOST_PORT --> Default: 8181 42 | - AIRFLOW_HOST_PORT --> Default: 8080 43 | 44 | - Create the below folders for the container volumes to bind 45 | 46 | mkdir ./setup-airflow/logs ./setup-airflow/plugins ./setup-pentaho/logs 47 | 48 | 49 | - Source Code 50 | Since the DAG/PDI source code files might undergo frequent updates, they are not copied into the container during image build, but mounted via docker compose instead. Any update to these source code files on the host automatically becomes visible inside the container. 51 | 52 | - Airflow: 53 | - Default folder for DAGs on the host is ./source-code/dags 54 | - Replace the above default folder in the docker compose file with the desired folder location on the host. 55 | - Place all the DAG files in the above host dags folder. 56 | 57 | - Pentaho: 58 | - Default folder for ktr/kjb files on the host is ./source-code/ktrs 59 | - Replace the above default folder in the docker compose file with the desired folder location on the host. 60 | - Place all the PDI files in the above host ktrs folder. 61 | - Update the repositories.xml file accordingly to make them visible to Carte. 62 | 63 | ### Build & Deploy 64 | The below command builds the images (on first run) and starts all the services. 65 | 66 | docker-compose up 67 | To run as a daemon, add the -d option. 68 | 69 | # Web UI 70 | - If not localhost, replace with the server endpoint URL. 71 | - If not the below default ports, replace with the ones used during the CARTE_HOST_PORT & AIRFLOW_HOST_PORT setup. 72 | 73 | Airflow Webserver 74 | 75 | localhost:8080/home 76 | 77 | Carte Webserver 78 | 79 | localhost:8181/kettle/status 80 | 81 | # How to trigger tasks from a DAG 82 | 83 | As per the [Carte REST API documentation](https://help.pentaho.com/Documentation/9.1/Developer_center/REST_API_Reference/Carte), the executeJob and executeTrans APIs can be used to trigger tasks remotely. 84 | 85 | ## Method 1: 86 | Job trigger: 87 | 88 | job = BashOperator( 89 | task_id='Trigger_Job', 90 | bash_command='curl "${PDI_CONN_STR}/kettle/executeJob/?rep=test-repo&job=/helloworld/helloworld-job"' 91 | ) 92 | 93 | Transformation trigger: 94 | 95 | trans = BashOperator( 96 | task_id='Trigger_Transformation', 97 | bash_command='curl "${PDI_CONN_STR}/kettle/executeTrans/?rep=test-repo&trans=/helloworld/helloworld-trans"' 98 | ) 99 | 100 | - Parameters can be added to the curl command by appending &, ex: &param1=value1&param2=value2 (a fuller sketch is given after the Best practices section). 101 | 102 | - PDI_CONN_STR: this is an environment variable in the compose file, set to the PDI docker container URL. It is used by the Airflow DAGs to send tasks to Carte. The below URL has ```pdi-master``` as the name of the container (used in the compose file) running Carte. 103 | 104 | http://${CARTE_USER:-cluster}:${CARTE_PASSWORD:-cluster}@pdi-master:${CARTE_HOST_PORT:-8181} 105 | 106 | 107 | ## Method 2 108 | In the DAG file, import the user-defined helper functions from ```utils/execute_pdi.py```. 109 | Unlike Method 1, this makes use of the ```execute-carte.sh``` script, which not only keeps checking the Carte task status but also pulls the actual task log (for jobs only) generated by Carte into the Airflow log.
110 | 111 | Job trigger: 112 | 113 | job = BashOperator( 114 | task_id='Trigger_Job', 115 | bash_command=execute_job( 116 | rep="test-repo", 117 | task="helloworld-job", 118 | dir="/helloworld/", 119 | param="" 120 | ) 121 | ) 122 | 123 | Transformation trigger: 124 | 125 | trans = BashOperator( 126 | task_id='Trigger_Transformation', 127 | bash_command=execute_trans( 128 | rep="test-repo", 129 | task="helloworld-trans", 130 | dir="/helloworld/", 131 | param="" 132 | ) 133 | ) 134 | # Best practices 135 | - The ```jdbc.properties``` file, which contains database access credentials, has been included in this repo for reference purposes only. In actual development this should be avoided; add the file to .gitignore instead. After the first code pull to a server, update it with all JNDI details before running docker compose. 136 | 137 | - The ```.env``` file may also contain sensitive information, like environment-dependent access keys. This should also be added to the .gitignore file. Instead, create this file with the necessary parameters during image build. 138 | 139 | - ```HOST_ENV``` Setting this parameter gives us the flexibility to choose the appropriate ```kettle.properties``` file. For example, QA and PROD mailing server SMTP details may differ; these can be kept in separate kettle properties files, to be selected dynamically based on the host environment. Moreover, if one uses the ```jdbc.properties``` file, the PDI container can dynamically select the correct JNDI from that file. For example, to test a transformation in the QA environment using a Postgres JNDI connection encoded as ```db-${HOST_ENV}```, running the PDI service with ```HOST_ENV=qa``` will resolve to the ```db-qa``` database JNDI, thus using QA data for testing. 140 | 141 | - ```PENTAHO_DI_JAVA_OPTIONS``` This option lets the user tweak the amount of memory PDI gets inside the container to run a task. Depending on the host machine memory and average task complexity, it can be modified to avoid PDI container crashes due to "GC Out of Memory" errors. If the host machine has ample RAM and the PDI container is crashing due to the default memory limits, we can increase them by setting ```PENTAHO_DI_JAVA_OPTIONS=-Xms2g -Xmx4g```, 2 GB and 4 GB being the lower and upper limits respectively.
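# Passing parameters to PDI tasks (sketch)

Both helper functions accept a ```param``` argument that is appended to the Carte URL, exactly like adding ```&param1=value1&param2=value2``` to the raw curl call in Method 1. The snippet below is a minimal, hypothetical sketch (the DAG id and parameter names are illustrative and not part of this repo) of how runtime parameters could be forwarded to ```/process1/task1``` through ```execute_trans```:

    from airflow import DAG
    from airflow.utils.dates import days_ago
    from airflow.operators.bash_operator import BashOperator
    from utils.execute_pdi import execute_trans

    with DAG(
        dag_id="param-demo",            # illustrative DAG id
        start_date=days_ago(1),
        schedule_interval=None,
        catchup=False,
    ) as dag:

        t1 = BashOperator(
            task_id="Task_With_Params",
            # key=value pairs are chained with '&' and passed straight through to Carte
            bash_command=execute_trans(
                rep="test-repo",
                task="task1",
                dir="/process1/",
                param="param1=value1&param2=value2"   # hypothetical parameter names
            )
        )

The transformation can then read ```param1```/```param2``` as named parameters, the same way it would when triggered directly via the executeTrans URL.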
142 | 143 | # References & Credits 144 | - [What is Carte Server ?](https://wiki.pentaho.com/display/EAI/Carte+User+Documentation) 145 | 146 | - [Configure Carte Server](https://help.pentaho.com/Documentation/8.0/Products/Data_Integration/Carte_Clusters/060) 147 | 148 | - [Set Repository on the Carte Server](https://help.pentaho.com/Documentation/9.1/Products/Use_Carte_Clusters) 149 | 150 | - [Carte APIs to trigger kettle transformation/jobs](https://help.pentaho.com/Documentation/9.1/Developer_center/REST_API_Reference/Carte) 151 | 152 | - [Monitoring Carte logs from Airlfow container](https://diethardsteiner.github.io/pdi/2020/04/01/Scheduling-a-PDI-Job-on-Apache-Airflow.html) 153 | 154 | - [Docker entrypoint logic](https://github.com/aloysius-lim/docker-pentaho-di/blob/master/docker/Dockerfile) -------------------------------------------------------------------------------- /source-code/ktrs/helloworld/helloworld-job.kjb: -------------------------------------------------------------------------------- 1 | 2 | 3 | helloworld-job 4 | 5 | 6 | 7 | 0 8 | / 9 | - 10 | 2015/06/01 15:33:25.423 11 | - 12 | 2015/06/01 15:34:07.453 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | ID_JOB 26 | Y 27 | ID_JOB 28 | 29 | 30 | CHANNEL_ID 31 | Y 32 | CHANNEL_ID 33 | 34 | 35 | JOBNAME 36 | Y 37 | JOBNAME 38 | 39 | 40 | STATUS 41 | Y 42 | STATUS 43 | 44 | 45 | LINES_READ 46 | Y 47 | LINES_READ 48 | 49 | 50 | LINES_WRITTEN 51 | Y 52 | LINES_WRITTEN 53 | 54 | 55 | LINES_UPDATED 56 | Y 57 | LINES_UPDATED 58 | 59 | 60 | LINES_INPUT 61 | Y 62 | LINES_INPUT 63 | 64 | 65 | LINES_OUTPUT 66 | Y 67 | LINES_OUTPUT 68 | 69 | 70 | LINES_REJECTED 71 | Y 72 | LINES_REJECTED 73 | 74 | 75 | ERRORS 76 | Y 77 | ERRORS 78 | 79 | 80 | STARTDATE 81 | Y 82 | STARTDATE 83 | 84 | 85 | ENDDATE 86 | Y 87 | ENDDATE 88 | 89 | 90 | LOGDATE 91 | Y 92 | LOGDATE 93 | 94 | 95 | DEPDATE 96 | Y 97 | DEPDATE 98 | 99 | 100 | REPLAYDATE 101 | Y 102 | REPLAYDATE 103 | 104 | 105 | LOG_FIELD 106 | Y 107 | LOG_FIELD 108 | 109 | 110 | EXECUTING_SERVER 111 | N 112 | EXECUTING_SERVER 113 | 114 | 115 | EXECUTING_USER 116 | N 117 | EXECUTING_USER 118 | 119 | 120 | START_JOB_ENTRY 121 | N 122 | START_JOB_ENTRY 123 | 124 | 125 | CLIENT 126 | N 127 | CLIENT 128 | 129 | 130 | 131 | 132 | 133 |
134 | 135 | 136 | ID_BATCH 137 | Y 138 | ID_BATCH 139 | 140 | 141 | CHANNEL_ID 142 | Y 143 | CHANNEL_ID 144 | 145 | 146 | LOG_DATE 147 | Y 148 | LOG_DATE 149 | 150 | 151 | JOBNAME 152 | Y 153 | TRANSNAME 154 | 155 | 156 | JOBENTRYNAME 157 | Y 158 | STEPNAME 159 | 160 | 161 | LINES_READ 162 | Y 163 | LINES_READ 164 | 165 | 166 | LINES_WRITTEN 167 | Y 168 | LINES_WRITTEN 169 | 170 | 171 | LINES_UPDATED 172 | Y 173 | LINES_UPDATED 174 | 175 | 176 | LINES_INPUT 177 | Y 178 | LINES_INPUT 179 | 180 | 181 | LINES_OUTPUT 182 | Y 183 | LINES_OUTPUT 184 | 185 | 186 | LINES_REJECTED 187 | Y 188 | LINES_REJECTED 189 | 190 | 191 | ERRORS 192 | Y 193 | ERRORS 194 | 195 | 196 | RESULT 197 | Y 198 | RESULT 199 | 200 | 201 | NR_RESULT_ROWS 202 | Y 203 | NR_RESULT_ROWS 204 | 205 | 206 | NR_RESULT_FILES 207 | Y 208 | NR_RESULT_FILES 209 | 210 | 211 | LOG_FIELD 212 | N 213 | LOG_FIELD 214 | 215 | 216 | COPY_NR 217 | N 218 | COPY_NR 219 | 220 | 221 | 222 | 223 | 224 |
225 | 226 | 227 | ID_BATCH 228 | Y 229 | ID_BATCH 230 | 231 | 232 | CHANNEL_ID 233 | Y 234 | CHANNEL_ID 235 | 236 | 237 | LOG_DATE 238 | Y 239 | LOG_DATE 240 | 241 | 242 | LOGGING_OBJECT_TYPE 243 | Y 244 | LOGGING_OBJECT_TYPE 245 | 246 | 247 | OBJECT_NAME 248 | Y 249 | OBJECT_NAME 250 | 251 | 252 | OBJECT_COPY 253 | Y 254 | OBJECT_COPY 255 | 256 | 257 | REPOSITORY_DIRECTORY 258 | Y 259 | REPOSITORY_DIRECTORY 260 | 261 | 262 | FILENAME 263 | Y 264 | FILENAME 265 | 266 | 267 | OBJECT_ID 268 | Y 269 | OBJECT_ID 270 | 271 | 272 | OBJECT_REVISION 273 | Y 274 | OBJECT_REVISION 275 | 276 | 277 | PARENT_CHANNEL_ID 278 | Y 279 | PARENT_CHANNEL_ID 280 | 281 | 282 | ROOT_CHANNEL_ID 283 | Y 284 | ROOT_CHANNEL_ID 285 | 286 | 287 | N 288 | 289 | 290 | 291 | START 292 | 293 | SPECIAL 294 | 295 | Y 296 | N 297 | N 298 | 0 299 | 0 300 | 60 301 | 12 302 | 0 303 | 1 304 | 1 305 | N 306 | Y 307 | 0 308 | 128 309 | 64 310 | 311 | 312 | 313 | trans 314 | 315 | TRANS 316 | 317 | filename 318 | 319 | ${Internal.Entry.Current.Directory}/helloworld/helloworld-trans 320 | 321 | N 322 | N 323 | N 324 | N 325 | N 326 | N 327 | 328 | 329 | N 330 | N 331 | Basic 332 | N 333 | 334 | N 335 | Y 336 | N 337 | N 338 | N 339 | Pentaho local 340 | 341 | Y 342 | 343 | MY_MESSAGE 344 | 345 | ${PDI_TEST_WELCOME_MESSAGE} 346 | 347 | 348 | N 349 | Y 350 | 0 351 | 288 352 | 64 353 | 354 | 355 | 356 | 357 | 358 | START 359 | trans 360 | 0 361 | 0 362 | Y 363 | Y 364 | Y 365 | 366 | 367 | 368 | 369 | 370 | 371 | METASTORE.pentaho 372 | 373 | Default Run Configuration 374 | {"namespace":"pentaho","id":"Default Run Configuration","name":"Default Run Configuration","description":"Defines a default run configuration","metaStoreName":null} 375 | 376 | 377 | 378 | {"_":"Embedded MetaStore Elements","namespace":"pentaho","type":"Default Run Configuration"} 379 | 380 | Pentaho local 381 | {"children":[{"children":[],"id":"server","value":null},{"children":[],"id":"clustered","value":"N"},{"children":[],"id":"name","value":"Pentaho local"},{"children":[],"id":"description","value":null},{"children":[],"id":"pentaho","value":"N"},{"children":[],"id":"readOnly","value":"Y"},{"children":[],"id":"sendResources","value":"N"},{"children":[],"id":"logRemoteExecutionLocally","value":"N"},{"children":[],"id":"remote","value":"N"},{"children":[],"id":"local","value":"Y"},{"children":[],"id":"showTransformations","value":"N"}],"id":"Pentaho local","value":null,"name":"Pentaho local","owner":null,"ownerPermissionsList":[]} 382 | 383 | 384 | 385 | 386 | -------------------------------------------------------------------------------- /source-code/ktrs/process1/task2.ktr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | task2 5 | 6 | 7 | 8 | Normal 9 | 0 10 | / 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 | 21 | 22 | ID_BATCH 23 | Y 24 | ID_BATCH 25 | 26 | 27 | CHANNEL_ID 28 | Y 29 | CHANNEL_ID 30 | 31 | 32 | TRANSNAME 33 | Y 34 | TRANSNAME 35 | 36 | 37 | STATUS 38 | Y 39 | STATUS 40 | 41 | 42 | LINES_READ 43 | Y 44 | LINES_READ 45 | 46 | 47 | 48 | LINES_WRITTEN 49 | Y 50 | LINES_WRITTEN 51 | 52 | 53 | 54 | LINES_UPDATED 55 | Y 56 | LINES_UPDATED 57 | 58 | 59 | 60 | LINES_INPUT 61 | Y 62 | LINES_INPUT 63 | 64 | 65 | 66 | LINES_OUTPUT 67 | Y 68 | LINES_OUTPUT 69 | 70 | 71 | 72 | LINES_REJECTED 73 | Y 74 | LINES_REJECTED 75 | 76 | 77 | 78 | ERRORS 79 | Y 80 | ERRORS 81 | 82 | 83 | STARTDATE 84 | Y 85 | STARTDATE 86 | 87 | 88 | ENDDATE 89 | Y 90 | ENDDATE 91 | 92 | 93 | LOGDATE 94 | Y 95 | LOGDATE 96 | 97 | 98 | DEPDATE 99 | Y 100 | DEPDATE 101 | 102 | 103 | REPLAYDATE 104 | Y 105 | REPLAYDATE 106 | 107 | 108 | LOG_FIELD 109 | Y 110 | LOG_FIELD 111 | 112 | 113 | EXECUTING_SERVER 114 | N 115 | EXECUTING_SERVER 116 | 117 | 118 | EXECUTING_USER 119 | N 120 | EXECUTING_USER 121 | 122 | 123 | CLIENT 124 | N 125 | CLIENT 126 | 127 | 128 | 129 | 130 | 131 |
132 | 133 | 134 | 135 | ID_BATCH 136 | Y 137 | ID_BATCH 138 | 139 | 140 | SEQ_NR 141 | Y 142 | SEQ_NR 143 | 144 | 145 | LOGDATE 146 | Y 147 | LOGDATE 148 | 149 | 150 | TRANSNAME 151 | Y 152 | TRANSNAME 153 | 154 | 155 | STEPNAME 156 | Y 157 | STEPNAME 158 | 159 | 160 | STEP_COPY 161 | Y 162 | STEP_COPY 163 | 164 | 165 | LINES_READ 166 | Y 167 | LINES_READ 168 | 169 | 170 | LINES_WRITTEN 171 | Y 172 | LINES_WRITTEN 173 | 174 | 175 | LINES_UPDATED 176 | Y 177 | LINES_UPDATED 178 | 179 | 180 | LINES_INPUT 181 | Y 182 | LINES_INPUT 183 | 184 | 185 | LINES_OUTPUT 186 | Y 187 | LINES_OUTPUT 188 | 189 | 190 | LINES_REJECTED 191 | Y 192 | LINES_REJECTED 193 | 194 | 195 | ERRORS 196 | Y 197 | ERRORS 198 | 199 | 200 | INPUT_BUFFER_ROWS 201 | Y 202 | INPUT_BUFFER_ROWS 203 | 204 | 205 | OUTPUT_BUFFER_ROWS 206 | Y 207 | OUTPUT_BUFFER_ROWS 208 | 209 | 210 | 211 | 212 | 213 |
214 | 215 | 216 | ID_BATCH 217 | Y 218 | ID_BATCH 219 | 220 | 221 | CHANNEL_ID 222 | Y 223 | CHANNEL_ID 224 | 225 | 226 | LOG_DATE 227 | Y 228 | LOG_DATE 229 | 230 | 231 | LOGGING_OBJECT_TYPE 232 | Y 233 | LOGGING_OBJECT_TYPE 234 | 235 | 236 | OBJECT_NAME 237 | Y 238 | OBJECT_NAME 239 | 240 | 241 | OBJECT_COPY 242 | Y 243 | OBJECT_COPY 244 | 245 | 246 | REPOSITORY_DIRECTORY 247 | Y 248 | REPOSITORY_DIRECTORY 249 | 250 | 251 | FILENAME 252 | Y 253 | FILENAME 254 | 255 | 256 | OBJECT_ID 257 | Y 258 | OBJECT_ID 259 | 260 | 261 | OBJECT_REVISION 262 | Y 263 | OBJECT_REVISION 264 | 265 | 266 | PARENT_CHANNEL_ID 267 | Y 268 | PARENT_CHANNEL_ID 269 | 270 | 271 | ROOT_CHANNEL_ID 272 | Y 273 | ROOT_CHANNEL_ID 274 | 275 | 276 | 277 | 278 | 279 |
280 | 281 | 282 | ID_BATCH 283 | Y 284 | ID_BATCH 285 | 286 | 287 | CHANNEL_ID 288 | Y 289 | CHANNEL_ID 290 | 291 | 292 | LOG_DATE 293 | Y 294 | LOG_DATE 295 | 296 | 297 | TRANSNAME 298 | Y 299 | TRANSNAME 300 | 301 | 302 | STEPNAME 303 | Y 304 | STEPNAME 305 | 306 | 307 | STEP_COPY 308 | Y 309 | STEP_COPY 310 | 311 | 312 | LINES_READ 313 | Y 314 | LINES_READ 315 | 316 | 317 | LINES_WRITTEN 318 | Y 319 | LINES_WRITTEN 320 | 321 | 322 | LINES_UPDATED 323 | Y 324 | LINES_UPDATED 325 | 326 | 327 | LINES_INPUT 328 | Y 329 | LINES_INPUT 330 | 331 | 332 | LINES_OUTPUT 333 | Y 334 | LINES_OUTPUT 335 | 336 | 337 | LINES_REJECTED 338 | Y 339 | LINES_REJECTED 340 | 341 | 342 | ERRORS 343 | Y 344 | ERRORS 345 | 346 | 347 | LOG_FIELD 348 | N 349 | LOG_FIELD 350 | 351 | 352 | 353 | 354 | 355 |
356 | 357 | 358 | ID_BATCH 359 | Y 360 | ID_BATCH 361 | 362 | 363 | CHANNEL_ID 364 | Y 365 | CHANNEL_ID 366 | 367 | 368 | LOG_DATE 369 | Y 370 | LOG_DATE 371 | 372 | 373 | METRICS_DATE 374 | Y 375 | METRICS_DATE 376 | 377 | 378 | METRICS_CODE 379 | Y 380 | METRICS_CODE 381 | 382 | 383 | METRICS_DESCRIPTION 384 | Y 385 | METRICS_DESCRIPTION 386 | 387 | 388 | METRICS_SUBJECT 389 | Y 390 | METRICS_SUBJECT 391 | 392 | 393 | METRICS_TYPE 394 | Y 395 | METRICS_TYPE 396 | 397 | 398 | METRICS_VALUE 399 | Y 400 | METRICS_VALUE 401 | 402 | 403 | 404 | 405 | 406 |
407 | 408 | 0.0 409 | 0.0 410 | 411 | 10000 412 | 50 413 | 50 414 | N 415 | Y 416 | 50000 417 | Y 418 | 419 | N 420 | 1000 421 | 100 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | - 431 | 2021/04/19 10:30:54.841 432 | - 433 | 2021/04/19 10:30:54.841 434 | H4sIAAAAAAAAAAMAAAAAAAAAAAA= 435 | N 436 | 437 | 438 | 439 | 440 | 441 | Get Variable 442 | Dummy (do nothing) 443 | Y 444 | 445 | 446 | 447 | Dummy (do nothing) 448 | Dummy 449 | 450 | Y 451 | 452 | 1 453 | 454 | none 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 208 467 | 128 468 | Y 469 | 470 | 471 | 472 | Get Variable 473 | GetVariable 474 | 475 | Y 476 | 477 | 1 478 | 479 | none 480 | 481 | 482 | 483 | 484 | PDI_TEST_WELCOME_MESSAGE 485 | ${PDI_TEST_WELCOME_MESSAGE} 486 | String 487 | 488 | 489 | 490 | 491 | -1 492 | -1 493 | none 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 80 506 | 128 507 | Y 508 | 509 | 510 | 511 | 512 | 513 | 514 | N 515 | 516 | 517 | -------------------------------------------------------------------------------- /source-code/ktrs/helloworld/helloworld-trans.ktr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | helloworld-trans 5 | 6 | 7 | 8 | Normal 9 | 0 10 | / 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 | 21 | 22 | ID_BATCH 23 | Y 24 | ID_BATCH 25 | 26 | 27 | CHANNEL_ID 28 | Y 29 | CHANNEL_ID 30 | 31 | 32 | TRANSNAME 33 | Y 34 | TRANSNAME 35 | 36 | 37 | STATUS 38 | Y 39 | STATUS 40 | 41 | 42 | LINES_READ 43 | Y 44 | LINES_READ 45 | 46 | 47 | 48 | LINES_WRITTEN 49 | Y 50 | LINES_WRITTEN 51 | 52 | 53 | 54 | LINES_UPDATED 55 | Y 56 | LINES_UPDATED 57 | 58 | 59 | 60 | LINES_INPUT 61 | Y 62 | LINES_INPUT 63 | 64 | 65 | 66 | LINES_OUTPUT 67 | Y 68 | LINES_OUTPUT 69 | 70 | 71 | 72 | LINES_REJECTED 73 | Y 74 | LINES_REJECTED 75 | 76 | 77 | 78 | ERRORS 79 | Y 80 | ERRORS 81 | 82 | 83 | STARTDATE 84 | Y 85 | STARTDATE 86 | 87 | 88 | ENDDATE 89 | Y 90 | ENDDATE 91 | 92 | 93 | LOGDATE 94 | Y 95 | LOGDATE 96 | 97 | 98 | DEPDATE 99 | Y 100 | DEPDATE 101 | 102 | 103 | REPLAYDATE 104 | Y 105 | REPLAYDATE 106 | 107 | 108 | LOG_FIELD 109 | Y 110 | LOG_FIELD 111 | 112 | 113 | EXECUTING_SERVER 114 | N 115 | EXECUTING_SERVER 116 | 117 | 118 | EXECUTING_USER 119 | N 120 | EXECUTING_USER 121 | 122 | 123 | CLIENT 124 | N 125 | CLIENT 126 | 127 | 128 | 129 | 130 | 131 |
132 | 133 | 134 | 135 | ID_BATCH 136 | Y 137 | ID_BATCH 138 | 139 | 140 | SEQ_NR 141 | Y 142 | SEQ_NR 143 | 144 | 145 | LOGDATE 146 | Y 147 | LOGDATE 148 | 149 | 150 | TRANSNAME 151 | Y 152 | TRANSNAME 153 | 154 | 155 | STEPNAME 156 | Y 157 | STEPNAME 158 | 159 | 160 | STEP_COPY 161 | Y 162 | STEP_COPY 163 | 164 | 165 | LINES_READ 166 | Y 167 | LINES_READ 168 | 169 | 170 | LINES_WRITTEN 171 | Y 172 | LINES_WRITTEN 173 | 174 | 175 | LINES_UPDATED 176 | Y 177 | LINES_UPDATED 178 | 179 | 180 | LINES_INPUT 181 | Y 182 | LINES_INPUT 183 | 184 | 185 | LINES_OUTPUT 186 | Y 187 | LINES_OUTPUT 188 | 189 | 190 | LINES_REJECTED 191 | Y 192 | LINES_REJECTED 193 | 194 | 195 | ERRORS 196 | Y 197 | ERRORS 198 | 199 | 200 | INPUT_BUFFER_ROWS 201 | Y 202 | INPUT_BUFFER_ROWS 203 | 204 | 205 | OUTPUT_BUFFER_ROWS 206 | Y 207 | OUTPUT_BUFFER_ROWS 208 | 209 | 210 | 211 | 212 | 213 |
214 | 215 | 216 | ID_BATCH 217 | Y 218 | ID_BATCH 219 | 220 | 221 | CHANNEL_ID 222 | Y 223 | CHANNEL_ID 224 | 225 | 226 | LOG_DATE 227 | Y 228 | LOG_DATE 229 | 230 | 231 | LOGGING_OBJECT_TYPE 232 | Y 233 | LOGGING_OBJECT_TYPE 234 | 235 | 236 | OBJECT_NAME 237 | Y 238 | OBJECT_NAME 239 | 240 | 241 | OBJECT_COPY 242 | Y 243 | OBJECT_COPY 244 | 245 | 246 | REPOSITORY_DIRECTORY 247 | Y 248 | REPOSITORY_DIRECTORY 249 | 250 | 251 | FILENAME 252 | Y 253 | FILENAME 254 | 255 | 256 | OBJECT_ID 257 | Y 258 | OBJECT_ID 259 | 260 | 261 | OBJECT_REVISION 262 | Y 263 | OBJECT_REVISION 264 | 265 | 266 | PARENT_CHANNEL_ID 267 | Y 268 | PARENT_CHANNEL_ID 269 | 270 | 271 | ROOT_CHANNEL_ID 272 | Y 273 | ROOT_CHANNEL_ID 274 | 275 | 276 | 277 | 278 | 279 |
280 | 281 | 282 | ID_BATCH 283 | Y 284 | ID_BATCH 285 | 286 | 287 | CHANNEL_ID 288 | Y 289 | CHANNEL_ID 290 | 291 | 292 | LOG_DATE 293 | Y 294 | LOG_DATE 295 | 296 | 297 | TRANSNAME 298 | Y 299 | TRANSNAME 300 | 301 | 302 | STEPNAME 303 | Y 304 | STEPNAME 305 | 306 | 307 | STEP_COPY 308 | Y 309 | STEP_COPY 310 | 311 | 312 | LINES_READ 313 | Y 314 | LINES_READ 315 | 316 | 317 | LINES_WRITTEN 318 | Y 319 | LINES_WRITTEN 320 | 321 | 322 | LINES_UPDATED 323 | Y 324 | LINES_UPDATED 325 | 326 | 327 | LINES_INPUT 328 | Y 329 | LINES_INPUT 330 | 331 | 332 | LINES_OUTPUT 333 | Y 334 | LINES_OUTPUT 335 | 336 | 337 | LINES_REJECTED 338 | Y 339 | LINES_REJECTED 340 | 341 | 342 | ERRORS 343 | Y 344 | ERRORS 345 | 346 | 347 | LOG_FIELD 348 | N 349 | LOG_FIELD 350 | 351 | 352 | 353 | 354 | 355 |
356 | 357 | 358 | ID_BATCH 359 | Y 360 | ID_BATCH 361 | 362 | 363 | CHANNEL_ID 364 | Y 365 | CHANNEL_ID 366 | 367 | 368 | LOG_DATE 369 | Y 370 | LOG_DATE 371 | 372 | 373 | METRICS_DATE 374 | Y 375 | METRICS_DATE 376 | 377 | 378 | METRICS_CODE 379 | Y 380 | METRICS_CODE 381 | 382 | 383 | METRICS_DESCRIPTION 384 | Y 385 | METRICS_DESCRIPTION 386 | 387 | 388 | METRICS_SUBJECT 389 | Y 390 | METRICS_SUBJECT 391 | 392 | 393 | METRICS_TYPE 394 | Y 395 | METRICS_TYPE 396 | 397 | 398 | METRICS_VALUE 399 | Y 400 | METRICS_VALUE 401 | 402 | 403 | 404 | 405 | 406 |
407 | 408 | 0.0 409 | 0.0 410 | 411 | 10000 412 | 50 413 | 50 414 | N 415 | Y 416 | 50000 417 | Y 418 | 419 | N 420 | 1000 421 | 100 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | - 431 | 2015/06/01 15:33:04.607 432 | - 433 | 2015/06/01 15:50:19.484 434 | H4sIAAAAAAAAAAMAAAAAAAAAAAA= 435 | N 436 | 437 | 438 | 439 | 440 | 441 | Get Variables 442 | Write to log 443 | Y 444 | 445 | 446 | 447 | Get Variables 448 | GetVariable 449 | 450 | Y 451 | 452 | 1 453 | 454 | none 455 | 456 | 457 | 458 | 459 | PDI_TEST_MESSAGE 460 | ${PDI_TEST_MESSAGE} 461 | String 462 | 463 | 464 | 465 | 466 | -1 467 | -1 468 | none 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 128 481 | 64 482 | Y 483 | 484 | 485 | 486 | Write to log 487 | WriteToLog 488 | 489 | Y 490 | 491 | 1 492 | 493 | none 494 | 495 | 496 | log_level_basic 497 | Y 498 | N 499 | 0 500 | 501 | 502 | 503 | PDI_TEST_MESSAGE 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 288 516 | 64 517 | Y 518 | 519 | 520 | 521 | 522 | 523 | 524 | N 525 | 526 | 527 | -------------------------------------------------------------------------------- /source-code/ktrs/process1/task1.ktr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | task1 5 | 6 | 7 | 8 | Normal 9 | 0 10 | / 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 | 21 | 22 | ID_BATCH 23 | Y 24 | ID_BATCH 25 | 26 | 27 | CHANNEL_ID 28 | Y 29 | CHANNEL_ID 30 | 31 | 32 | TRANSNAME 33 | Y 34 | TRANSNAME 35 | 36 | 37 | STATUS 38 | Y 39 | STATUS 40 | 41 | 42 | LINES_READ 43 | Y 44 | LINES_READ 45 | 46 | 47 | 48 | LINES_WRITTEN 49 | Y 50 | LINES_WRITTEN 51 | 52 | 53 | 54 | LINES_UPDATED 55 | Y 56 | LINES_UPDATED 57 | 58 | 59 | 60 | LINES_INPUT 61 | Y 62 | LINES_INPUT 63 | 64 | 65 | 66 | LINES_OUTPUT 67 | Y 68 | LINES_OUTPUT 69 | 70 | 71 | 72 | LINES_REJECTED 73 | Y 74 | LINES_REJECTED 75 | 76 | 77 | 78 | ERRORS 79 | Y 80 | ERRORS 81 | 82 | 83 | STARTDATE 84 | Y 85 | STARTDATE 86 | 87 | 88 | ENDDATE 89 | Y 90 | ENDDATE 91 | 92 | 93 | LOGDATE 94 | Y 95 | LOGDATE 96 | 97 | 98 | DEPDATE 99 | Y 100 | DEPDATE 101 | 102 | 103 | REPLAYDATE 104 | Y 105 | REPLAYDATE 106 | 107 | 108 | LOG_FIELD 109 | Y 110 | LOG_FIELD 111 | 112 | 113 | EXECUTING_SERVER 114 | N 115 | EXECUTING_SERVER 116 | 117 | 118 | EXECUTING_USER 119 | N 120 | EXECUTING_USER 121 | 122 | 123 | CLIENT 124 | N 125 | CLIENT 126 | 127 | 128 | 129 | 130 | 131 |
132 | 133 | 134 | 135 | ID_BATCH 136 | Y 137 | ID_BATCH 138 | 139 | 140 | SEQ_NR 141 | Y 142 | SEQ_NR 143 | 144 | 145 | LOGDATE 146 | Y 147 | LOGDATE 148 | 149 | 150 | TRANSNAME 151 | Y 152 | TRANSNAME 153 | 154 | 155 | STEPNAME 156 | Y 157 | STEPNAME 158 | 159 | 160 | STEP_COPY 161 | Y 162 | STEP_COPY 163 | 164 | 165 | LINES_READ 166 | Y 167 | LINES_READ 168 | 169 | 170 | LINES_WRITTEN 171 | Y 172 | LINES_WRITTEN 173 | 174 | 175 | LINES_UPDATED 176 | Y 177 | LINES_UPDATED 178 | 179 | 180 | LINES_INPUT 181 | Y 182 | LINES_INPUT 183 | 184 | 185 | LINES_OUTPUT 186 | Y 187 | LINES_OUTPUT 188 | 189 | 190 | LINES_REJECTED 191 | Y 192 | LINES_REJECTED 193 | 194 | 195 | ERRORS 196 | Y 197 | ERRORS 198 | 199 | 200 | INPUT_BUFFER_ROWS 201 | Y 202 | INPUT_BUFFER_ROWS 203 | 204 | 205 | OUTPUT_BUFFER_ROWS 206 | Y 207 | OUTPUT_BUFFER_ROWS 208 | 209 | 210 | 211 | 212 | 213 |
214 | 215 | 216 | ID_BATCH 217 | Y 218 | ID_BATCH 219 | 220 | 221 | CHANNEL_ID 222 | Y 223 | CHANNEL_ID 224 | 225 | 226 | LOG_DATE 227 | Y 228 | LOG_DATE 229 | 230 | 231 | LOGGING_OBJECT_TYPE 232 | Y 233 | LOGGING_OBJECT_TYPE 234 | 235 | 236 | OBJECT_NAME 237 | Y 238 | OBJECT_NAME 239 | 240 | 241 | OBJECT_COPY 242 | Y 243 | OBJECT_COPY 244 | 245 | 246 | REPOSITORY_DIRECTORY 247 | Y 248 | REPOSITORY_DIRECTORY 249 | 250 | 251 | FILENAME 252 | Y 253 | FILENAME 254 | 255 | 256 | OBJECT_ID 257 | Y 258 | OBJECT_ID 259 | 260 | 261 | OBJECT_REVISION 262 | Y 263 | OBJECT_REVISION 264 | 265 | 266 | PARENT_CHANNEL_ID 267 | Y 268 | PARENT_CHANNEL_ID 269 | 270 | 271 | ROOT_CHANNEL_ID 272 | Y 273 | ROOT_CHANNEL_ID 274 | 275 | 276 | 277 | 278 | 279 |
280 | 281 | 282 | ID_BATCH 283 | Y 284 | ID_BATCH 285 | 286 | 287 | CHANNEL_ID 288 | Y 289 | CHANNEL_ID 290 | 291 | 292 | LOG_DATE 293 | Y 294 | LOG_DATE 295 | 296 | 297 | TRANSNAME 298 | Y 299 | TRANSNAME 300 | 301 | 302 | STEPNAME 303 | Y 304 | STEPNAME 305 | 306 | 307 | STEP_COPY 308 | Y 309 | STEP_COPY 310 | 311 | 312 | LINES_READ 313 | Y 314 | LINES_READ 315 | 316 | 317 | LINES_WRITTEN 318 | Y 319 | LINES_WRITTEN 320 | 321 | 322 | LINES_UPDATED 323 | Y 324 | LINES_UPDATED 325 | 326 | 327 | LINES_INPUT 328 | Y 329 | LINES_INPUT 330 | 331 | 332 | LINES_OUTPUT 333 | Y 334 | LINES_OUTPUT 335 | 336 | 337 | LINES_REJECTED 338 | Y 339 | LINES_REJECTED 340 | 341 | 342 | ERRORS 343 | Y 344 | ERRORS 345 | 346 | 347 | LOG_FIELD 348 | N 349 | LOG_FIELD 350 | 351 | 352 | 353 | 354 | 355 |
356 | 357 | 358 | ID_BATCH 359 | Y 360 | ID_BATCH 361 | 362 | 363 | CHANNEL_ID 364 | Y 365 | CHANNEL_ID 366 | 367 | 368 | LOG_DATE 369 | Y 370 | LOG_DATE 371 | 372 | 373 | METRICS_DATE 374 | Y 375 | METRICS_DATE 376 | 377 | 378 | METRICS_CODE 379 | Y 380 | METRICS_CODE 381 | 382 | 383 | METRICS_DESCRIPTION 384 | Y 385 | METRICS_DESCRIPTION 386 | 387 | 388 | METRICS_SUBJECT 389 | Y 390 | METRICS_SUBJECT 391 | 392 | 393 | METRICS_TYPE 394 | Y 395 | METRICS_TYPE 396 | 397 | 398 | METRICS_VALUE 399 | Y 400 | METRICS_VALUE 401 | 402 | 403 | 404 | 405 | 406 |
407 | 408 | 0.0 409 | 0.0 410 | 411 | 10000 412 | 50 413 | 50 414 | N 415 | Y 416 | 50000 417 | Y 418 | 419 | N 420 | 1000 421 | 100 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | - 431 | 2021/04/19 10:30:54.841 432 | - 433 | 2021/04/19 10:30:54.841 434 | H4sIAAAAAAAAAAMAAAAAAAAAAAA= 435 | N 436 | 437 | 438 | 439 | 440 | 441 | Get Variable 442 | Delay row 443 | Y 444 | 445 | 446 | Delay row 447 | Abort 448 | Y 449 | 450 | 451 | 452 | Abort 453 | Abort 454 | 455 | Y 456 | 457 | 1 458 | 459 | none 460 | 461 | 462 | 0 463 | 464 | Y 465 | ABORT_WITH_ERROR 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 320 476 | 128 477 | Y 478 | 479 | 480 | 481 | Delay row 482 | Delay 483 | 484 | Y 485 | 486 | 1 487 | 488 | none 489 | 490 | 491 | 5 492 | seconds 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 208 503 | 128 504 | Y 505 | 506 | 507 | 508 | Get Variable 509 | GetVariable 510 | 511 | Y 512 | 513 | 1 514 | 515 | none 516 | 517 | 518 | 519 | 520 | PDI_TEST_WELCOME_MESSAGE 521 | ${PDI_TEST_WELCOME_MESSAGE} 522 | String 523 | 524 | 525 | 526 | 527 | -1 528 | -1 529 | none 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 80 542 | 128 543 | Y 544 | 545 | 546 | 547 | 548 | 549 | 550 | N 551 | 552 | 553 | -------------------------------------------------------------------------------- /setup-airflow/airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The folder where your airflow pipelines live, most likely a 3 | # subfolder in a code repository. This path must be absolute. 4 | dags_folder = /opt/airflow/dags 5 | 6 | # Hostname by providing a path to a callable, which will resolve the hostname. 7 | # The format is "package.function". 8 | # 9 | # For example, default value "socket.getfqdn" means that result from getfqdn() of "socket" 10 | # package will be used as hostname. 11 | # 12 | # No argument should be required in the function specified. 13 | # If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address`` 14 | hostname_callable = socket.getfqdn 15 | 16 | # Default timezone in case supplied date times are naive 17 | # can be utc (default), system, or any IANA timezone string (e.g. Europe/Amsterdam) 18 | default_timezone = utc 19 | 20 | # The executor class that airflow should use. Choices include 21 | # ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``, ``DaskExecutor``, 22 | # ``KubernetesExecutor``, ``CeleryKubernetesExecutor`` or the 23 | # full import path to the class when using a custom executor. 24 | executor = SequentialExecutor 25 | 26 | # The SqlAlchemy connection string to the metadata database. 27 | # SqlAlchemy supports many different database engine, more information 28 | # their website 29 | sql_alchemy_conn = sqlite:////opt/airflow/airflow.db 30 | 31 | # The encoding for the databases 32 | sql_engine_encoding = utf-8 33 | 34 | # Collation for ``dag_id``, ``task_id``, ``key`` columns in case they have different encoding. 35 | # This is particularly useful in case of mysql with utf8mb4 encoding because 36 | # primary keys for XCom table has too big size and ``sql_engine_collation_for_ids`` should 37 | # be set to ``utf8mb3_general_ci``. 38 | # sql_engine_collation_for_ids = 39 | 40 | # If SqlAlchemy should pool database connections. 41 | sql_alchemy_pool_enabled = True 42 | 43 | # The SqlAlchemy pool size is the maximum number of database connections 44 | # in the pool. 0 indicates no limit. 
45 | sql_alchemy_pool_size = 5 46 | 47 | # The maximum overflow size of the pool. 48 | # When the number of checked-out connections reaches the size set in pool_size, 49 | # additional connections will be returned up to this limit. 50 | # When those additional connections are returned to the pool, they are disconnected and discarded. 51 | # It follows then that the total number of simultaneous connections the pool will allow 52 | # is pool_size + max_overflow, 53 | # and the total number of "sleeping" connections the pool will allow is pool_size. 54 | # max_overflow can be set to ``-1`` to indicate no overflow limit; 55 | # no limit will be placed on the total number of concurrent connections. Defaults to ``10``. 56 | sql_alchemy_max_overflow = 10 57 | 58 | # The SqlAlchemy pool recycle is the number of seconds a connection 59 | # can be idle in the pool before it is invalidated. This config does 60 | # not apply to sqlite. If the number of DB connections is ever exceeded, 61 | # a lower config value will allow the system to recover faster. 62 | sql_alchemy_pool_recycle = 1800 63 | 64 | # Check connection at the start of each connection pool checkout. 65 | # Typically, this is a simple statement like "SELECT 1". 66 | # More information here: 67 | # https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic 68 | sql_alchemy_pool_pre_ping = True 69 | 70 | # The schema to use for the metadata database. 71 | # SqlAlchemy supports databases with the concept of multiple schemas. 72 | sql_alchemy_schema = 73 | 74 | # Import path for connect args in SqlAlchemy. Defaults to an empty dict. 75 | # This is useful when you want to configure db engine args that SqlAlchemy won't parse 76 | # in connection string. 77 | # See https://docs.sqlalchemy.org/en/13/core/engines.html#sqlalchemy.create_engine.params.connect_args 78 | # sql_alchemy_connect_args = 79 | 80 | # The amount of parallelism as a setting to the executor. This defines 81 | # the max number of task instances that should run simultaneously 82 | # on this airflow installation 83 | parallelism = 32 84 | 85 | # The number of task instances allowed to run concurrently by the scheduler 86 | # in one DAG. Can be overridden by ``concurrency`` on DAG level. 87 | dag_concurrency = 16 88 | 89 | # Are DAGs paused by default at creation 90 | dags_are_paused_at_creation = True 91 | 92 | # The maximum number of active DAG runs per DAG 93 | max_active_runs_per_dag = 16 94 | 95 | # Whether to load the DAG examples that ship with Airflow. It's good to 96 | # get started, but you probably want to set this to ``False`` in a production 97 | # environment 98 | load_examples = False 99 | 100 | # Whether to load the default connections that ship with Airflow. 
It's good to 101 | # get started, but you probably want to set this to ``False`` in a production 102 | # environment 103 | load_default_connections = True 104 | 105 | # Path to the folder containing Airflow plugins 106 | plugins_folder = /opt/airflow/plugins 107 | 108 | # Should tasks be executed via forking of the parent process ("False", 109 | # the speedier option) or by spawning a new python process ("True" slow, 110 | # but means plugin changes picked up by tasks straight away) 111 | execute_tasks_new_python_interpreter = False 112 | 113 | # Secret key to save connection passwords in the db 114 | fernet_key = hjIFXCPQL6ZZx-dN7Kpr5yULTMFmLK-skgH9KdKeA1I= 115 | 116 | # Whether to disable pickling dags 117 | donot_pickle = True 118 | 119 | # How long before timing out a python file import 120 | dagbag_import_timeout = 30.0 121 | 122 | # Should a traceback be shown in the UI for dagbag import errors, 123 | # instead of just the exception message 124 | dagbag_import_error_tracebacks = True 125 | 126 | # If tracebacks are shown, how many entries from the traceback should be shown 127 | dagbag_import_error_traceback_depth = 2 128 | 129 | # How long before timing out a DagFileProcessor, which processes a dag file 130 | dag_file_processor_timeout = 50 131 | 132 | # The class to use for running task instances in a subprocess. 133 | # Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class 134 | # when using a custom task runner. 135 | task_runner = StandardTaskRunner 136 | 137 | # If set, tasks without a ``run_as_user`` argument will be run with this user 138 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 139 | default_impersonation = 140 | 141 | # What security module to use (for example kerberos) 142 | security = 143 | 144 | # Turn unit test mode on (overwrites many configuration options with test 145 | # values at runtime) 146 | unit_test_mode = False 147 | 148 | # Whether to enable pickling for xcom (note that this is insecure and allows for 149 | # RCE exploits). 150 | enable_xcom_pickling = False 151 | 152 | # When a task is killed forcefully, this is the amount of time in seconds that 153 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED 154 | killed_task_cleanup_time = 60 155 | 156 | # Whether to override params with dag_run.conf. If you pass some key-value pairs 157 | # through ``airflow dags backfill -c`` or 158 | # ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. 159 | dag_run_conf_overrides_params = True 160 | 161 | # When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``. 162 | dag_discovery_safe_mode = True 163 | 164 | # The number of retries each task is going to have by default. Can be overridden at dag or task level. 165 | default_task_retries = 0 166 | 167 | # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. 168 | min_serialized_dag_update_interval = 30 169 | 170 | # Fetching serialized DAG can not be faster than a minimum interval to reduce database 171 | # read rate. This config controls when your DAGs are updated in the Webserver 172 | min_serialized_dag_fetch_interval = 10 173 | 174 | # Whether to persist DAG files code in DB. 175 | # If set to True, Webserver reads file contents from DB instead of 176 | # trying to access files in a DAG folder. 
177 | # Example: store_dag_code = False 178 | # store_dag_code = 179 | 180 | # Maximum number of Rendered Task Instance Fields (Template Fields) per task to store 181 | # in the Database. 182 | # All the template_fields for each of Task Instance are stored in the Database. 183 | # Keeping this number small may cause an error when you try to view ``Rendered`` tab in 184 | # TaskInstance view for older tasks. 185 | max_num_rendered_ti_fields_per_task = 30 186 | 187 | # On each dagrun check against defined SLAs 188 | check_slas = True 189 | 190 | # Path to custom XCom class that will be used to store and resolve operators results 191 | # Example: xcom_backend = path.to.CustomXCom 192 | xcom_backend = airflow.models.xcom.BaseXCom 193 | 194 | # By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``, 195 | # if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module. 196 | lazy_load_plugins = True 197 | 198 | # By default Airflow providers are lazily-discovered (discovery and imports happen only when required). 199 | # Set it to False, if you want to discover providers whenever 'airflow' is invoked via cli or 200 | # loaded from module. 201 | lazy_discover_providers = True 202 | 203 | # Number of times the code should be retried in case of DB Operational Errors. 204 | # Not all transactions will be retried as it can cause undesired state. 205 | # Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``. 206 | max_db_retries = 3 207 | 208 | [logging] 209 | # The folder where airflow should store its log files 210 | # This path must be absolute 211 | base_log_folder = /opt/airflow/logs 212 | 213 | # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. 214 | # Set this to True if you want to enable remote logging. 215 | remote_logging = False 216 | 217 | # Users must supply an Airflow connection id that provides access to the storage 218 | # location. 219 | remote_log_conn_id = 220 | 221 | # Path to Google Credential JSON file. If omitted, authorization based on `the Application Default 222 | # Credentials 223 | # `__ will 224 | # be used. 225 | google_key_path = 226 | 227 | # Storage bucket URL for remote logging 228 | # S3 buckets should start with "s3://" 229 | # Cloudwatch log groups should start with "cloudwatch://" 230 | # GCS buckets should start with "gs://" 231 | # WASB buckets should start with "wasb" just to help Airflow select correct handler 232 | # Stackdriver logs should start with "stackdriver://" 233 | remote_base_log_folder = 234 | 235 | # Use server-side encryption for logs stored in S3 236 | encrypt_s3_logs = False 237 | 238 | # Logging level 239 | logging_level = INFO 240 | 241 | # Logging level for Flask-appbuilder UI 242 | fab_logging_level = WARN 243 | 244 | # Logging class 245 | # Specify the class that will specify the logging configuration 246 | # This class has to be on the python classpath 247 | # Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG 248 | logging_config_class = 249 | 250 | # Flag to enable/disable Colored logs in Console 251 | # Colour the logs when the controlling terminal is a TTY. 
252 | colored_console_log = True 253 | 254 | # Log format for when Colored logs is enabled 255 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s 256 | colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter 257 | 258 | # Format of Log line 259 | log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s 260 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s 261 | 262 | # Specify prefix pattern like mentioned below with stream handler TaskHandlerWithCustomFormatter 263 | # Example: task_log_prefix_template = {ti.dag_id}-{ti.task_id}-{execution_date}-{try_number} 264 | task_log_prefix_template = 265 | 266 | # Formatting for how airflow generates file names/paths for each task run. 267 | log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log 268 | 269 | # Formatting for how airflow generates file names for log 270 | log_processor_filename_template = {{ filename }}.log 271 | 272 | # full path of dag_processor_manager logfile 273 | dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log 274 | 275 | # Name of handler to read task instance logs. 276 | # Defaults to use ``task`` handler. 277 | task_log_reader = task 278 | 279 | # A comma\-separated list of third-party logger names that will be configured to print messages to 280 | # consoles\. 281 | # Example: extra_loggers = connexion,sqlalchemy 282 | extra_loggers = 283 | 284 | [metrics] 285 | 286 | # StatsD (https://github.com/etsy/statsd) integration settings. 287 | # Enables sending metrics to StatsD. 288 | statsd_on = False 289 | statsd_host = localhost 290 | statsd_port = 8125 291 | statsd_prefix = airflow 292 | 293 | # If you want to avoid sending all the available metrics to StatsD, 294 | # you can configure an allow list of prefixes (comma separated) to send only the metrics that 295 | # start with the elements of the list (e.g: "scheduler,executor,dagrun") 296 | statsd_allow_list = 297 | 298 | # A function that validate the statsd stat name, apply changes to the stat name if necessary and return 299 | # the transformed stat name. 300 | # 301 | # The function should have the following signature: 302 | # def func_name(stat_name: str) -> str: 303 | stat_name_handler = 304 | 305 | # To enable datadog integration to send airflow metrics. 306 | statsd_datadog_enabled = False 307 | 308 | # List of datadog tags attached to all metrics(e.g: key1:value1,key2:value2) 309 | statsd_datadog_tags = 310 | 311 | # If you want to utilise your own custom Statsd client set the relevant 312 | # module path below. 313 | # Note: The module path must exist on your PYTHONPATH for Airflow to pick it up 314 | # statsd_custom_client_path = 315 | 316 | [secrets] 317 | # Full class name of secrets backend to enable (will precede env vars and metastore in search path) 318 | # Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend 319 | backend = 320 | 321 | # The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. 322 | # See documentation for the secrets backend you are using. JSON is expected. 
323 | # Example for AWS Systems Manager ParameterStore: 324 | # ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}`` 325 | backend_kwargs = 326 | 327 | [cli] 328 | # In what way should the cli access the API. The LocalClient will use the 329 | # database directly, while the json_client will use the api running on the 330 | # webserver 331 | api_client = airflow.api.client.local_client 332 | 333 | # If you set web_server_url_prefix, do NOT forget to append it here, ex: 334 | # ``endpoint_url = http://localhost:8080/myroot`` 335 | # So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` 336 | endpoint_url = http://localhost:8080 337 | 338 | [debug] 339 | # Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first 340 | # failed task. Helpful for debugging purposes. 341 | fail_fast = False 342 | 343 | [api] 344 | # Enables the deprecated experimental API. Please note that these APIs do not have access control. 345 | # The authenticated user has full access. 346 | # 347 | # .. warning:: 348 | # 349 | # This `Experimental REST API `__ is 350 | # deprecated since version 2.0. Please consider using 351 | # `the Stable REST API `__. 352 | # For more information on migration, see 353 | # `UPDATING.md `_ 354 | enable_experimental_api = False 355 | 356 | # How to authenticate users of the API. See 357 | # https://airflow.apache.org/docs/stable/security.html for possible values. 358 | # ("airflow.api.auth.backend.default" allows all requests for historic reasons) 359 | auth_backend = airflow.api.auth.backend.deny_all 360 | 361 | # Used to set the maximum page limit for API requests 362 | maximum_page_limit = 100 363 | 364 | # Used to set the default page limit when limit is zero. A default limit 365 | # of 100 is set on OpenApi spec. However, this particular default limit 366 | # only work when limit is set equal to zero(0) from API requests. 367 | # If no limit is supplied, the OpenApi spec default is used. 368 | fallback_page_limit = 100 369 | 370 | # The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested. 371 | # Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com 372 | google_oauth2_audience = 373 | 374 | # Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on 375 | # `the Application Default Credentials 376 | # `__ will 377 | # be used. 378 | # Example: google_key_path = /files/service-account-json 379 | google_key_path = 380 | 381 | [lineage] 382 | # what lineage backend to use 383 | backend = 384 | 385 | [atlas] 386 | sasl_enabled = False 387 | host = 388 | port = 21000 389 | username = 390 | password = 391 | 392 | [operators] 393 | # The default owner assigned to each new operator, unless 394 | # provided explicitly or passed via ``default_args`` 395 | default_owner = airflow 396 | default_cpus = 1 397 | default_ram = 512 398 | default_disk = 512 399 | default_gpus = 0 400 | 401 | # Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator. 402 | # If set to False, an exception will be thrown, otherwise only the console message will be displayed. 
403 | allow_illegal_arguments = False 404 | 405 | [hive] 406 | # Default mapreduce queue for HiveOperator tasks 407 | default_hive_mapred_queue = 408 | 409 | # Template for mapred_job_name in HiveOperator, supports the following named parameters 410 | # hostname, dag_id, task_id, execution_date 411 | # mapred_job_name_template = 412 | 413 | [webserver] 414 | # The base url of your website as airflow cannot guess what domain or 415 | # cname you are using. This is used in automated emails that 416 | # airflow sends to point links to the right web server 417 | base_url = http://localhost:8080 418 | 419 | # Default timezone to display all dates in the UI, can be UTC, system, or 420 | # any IANA timezone string (e.g. Europe/Amsterdam). If left empty the 421 | # default value of core/default_timezone will be used 422 | # Example: default_ui_timezone = America/New_York 423 | default_ui_timezone = UTC 424 | 425 | # The ip specified when starting the web server 426 | web_server_host = 0.0.0.0 427 | 428 | # The port on which to run the web server 429 | web_server_port = 8080 430 | 431 | # Paths to the SSL certificate and key for the web server. When both are 432 | # provided SSL will be enabled. This does not change the web server port. 433 | web_server_ssl_cert = 434 | 435 | # Paths to the SSL certificate and key for the web server. When both are 436 | # provided SSL will be enabled. This does not change the web server port. 437 | web_server_ssl_key = 438 | 439 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 440 | web_server_master_timeout = 120 441 | 442 | # Number of seconds the gunicorn webserver waits before timing out on a worker 443 | web_server_worker_timeout = 120 444 | 445 | # Number of workers to refresh at a time. When set to 0, worker refresh is 446 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 447 | # bringing up new ones and killing old ones. 448 | worker_refresh_batch_size = 1 449 | 450 | # Number of seconds to wait before refreshing a batch of workers. 451 | worker_refresh_interval = 30 452 | 453 | # If set to True, Airflow will track files in plugins_folder directory. When it detects changes, 454 | # then reload the gunicorn. 455 | reload_on_plugin_change = False 456 | 457 | # Secret key used to run your flask app 458 | # It should be as random as possible 459 | secret_key = JK3PU6syfBItlK8mgHrYnA== 460 | 461 | # Number of workers to run the Gunicorn web server 462 | workers = 4 463 | 464 | # The worker class gunicorn should use. Choices include 465 | # sync (default), eventlet, gevent 466 | worker_class = sync 467 | 468 | # Log files for the gunicorn webserver. '-' means log to stderr. 469 | access_logfile = - 470 | 471 | # Log files for the gunicorn webserver. '-' means log to stderr. 472 | error_logfile = - 473 | 474 | # Access log format for gunicorn webserver. 475 | # default format is %%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s" 476 | # documentation - https://docs.gunicorn.org/en/stable/settings.html#access-log-format 477 | access_logformat = 478 | 479 | # Expose the configuration file in the web server 480 | expose_config = False 481 | 482 | # Expose hostname in the web server 483 | expose_hostname = True 484 | 485 | # Expose stacktrace in the web server 486 | expose_stacktrace = True 487 | 488 | # Default DAG view. Valid values are: ``tree``, ``graph``, ``duration``, ``gantt``, ``landing_times`` 489 | dag_default_view = graph 490 | 491 | # Default DAG orientation. 
Valid values are: 492 | # ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top) 493 | dag_orientation = LR 494 | 495 | # Puts the webserver in demonstration mode; blurs the names of Operators for 496 | # privacy. 497 | demo_mode = False 498 | 499 | # The amount of time (in secs) webserver will wait for initial handshake 500 | # while fetching logs from other worker machine 501 | log_fetch_timeout_sec = 5 502 | 503 | # Time interval (in secs) to wait before next log fetching. 504 | log_fetch_delay_sec = 2 505 | 506 | # Distance away from page bottom to enable auto tailing. 507 | log_auto_tailing_offset = 30 508 | 509 | # Animation speed for auto tailing log display. 510 | log_animation_speed = 1000 511 | 512 | # By default, the webserver shows paused DAGs. Flip this to hide paused 513 | # DAGs by default 514 | hide_paused_dags_by_default = False 515 | 516 | # Consistent page size across all listing views in the UI 517 | page_size = 100 518 | 519 | # Define the color of navigation bar 520 | navbar_color = #fff 521 | 522 | # Default dagrun to show in UI 523 | default_dag_run_display_number = 25 524 | 525 | # Enable werkzeug ``ProxyFix`` middleware for reverse proxy 526 | enable_proxy_fix = False 527 | 528 | # Number of values to trust for ``X-Forwarded-For``. 529 | # More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/ 530 | proxy_fix_x_for = 1 531 | 532 | # Number of values to trust for ``X-Forwarded-Proto`` 533 | proxy_fix_x_proto = 1 534 | 535 | # Number of values to trust for ``X-Forwarded-Host`` 536 | proxy_fix_x_host = 1 537 | 538 | # Number of values to trust for ``X-Forwarded-Port`` 539 | proxy_fix_x_port = 1 540 | 541 | # Number of values to trust for ``X-Forwarded-Prefix`` 542 | proxy_fix_x_prefix = 1 543 | 544 | # Set secure flag on session cookie 545 | cookie_secure = False 546 | 547 | # Set samesite policy on session cookie 548 | cookie_samesite = Lax 549 | 550 | # Default setting for wrap toggle on DAG code and TI log views. 551 | default_wrap = False 552 | 553 | # Allow the UI to be rendered in a frame 554 | x_frame_enabled = True 555 | 556 | # Send anonymous user activity to your analytics tool 557 | # choose from google_analytics, segment, or metarouter 558 | # analytics_tool = 559 | 560 | # Unique ID of your account in the analytics tool 561 | # analytics_id = 562 | 563 | # 'Recent Tasks' stats will show for old DagRuns if set 564 | show_recent_stats_for_completed_runs = True 565 | 566 | # Update FAB permissions and sync security manager roles 567 | # on webserver startup 568 | update_fab_perms = True 569 | 570 | # The UI cookie lifetime in minutes. 
User will be logged out from UI after 571 | # ``session_lifetime_minutes`` of non-activity 572 | session_lifetime_minutes = 43200 573 | 574 | [email] 575 | 576 | # Configuration email backend and whether to 577 | # send email alerts on retry or failure 578 | # Email backend to use 579 | email_backend = airflow.utils.email.send_email_smtp 580 | 581 | # Whether email alerts should be sent when a task is retried 582 | default_email_on_retry = True 583 | 584 | # Whether email alerts should be sent when a task failed 585 | default_email_on_failure = True 586 | 587 | [smtp] 588 | 589 | # If you want airflow to send emails on retries, failure, and you want to use 590 | # the airflow.utils.email.send_email_smtp function, you have to configure an 591 | # smtp server here 592 | smtp_host = localhost 593 | smtp_starttls = True 594 | smtp_ssl = False 595 | # Example: smtp_user = airflow 596 | # smtp_user = 597 | # Example: smtp_password = airflow 598 | # smtp_password = 599 | smtp_port = 25 600 | smtp_mail_from = airflow@example.com 601 | smtp_timeout = 30 602 | smtp_retry_limit = 5 603 | 604 | [sentry] 605 | 606 | # Sentry (https://docs.sentry.io) integration. Here you can supply 607 | # additional configuration options based on the Python platform. See: 608 | # https://docs.sentry.io/error-reporting/configuration/?platform=python. 609 | # Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``, 610 | # ``ignore_errors``, ``before_breadcrumb``, ``before_send``, ``transport``. 611 | # Enable error reporting to Sentry 612 | sentry_on = false 613 | sentry_dsn = 614 | 615 | [celery_kubernetes_executor] 616 | 617 | # This section only applies if you are using the ``CeleryKubernetesExecutor`` in 618 | # ``[core]`` section above 619 | # Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``. 620 | # When the queue of a task is ``kubernetes_queue``, the task is executed via ``KubernetesExecutor``, 621 | # otherwise via ``CeleryExecutor`` 622 | kubernetes_queue = kubernetes 623 | 624 | [celery] 625 | 626 | # This section only applies if you are using the CeleryExecutor in 627 | # ``[core]`` section above 628 | # The app name that will be used by celery 629 | celery_app_name = airflow.executors.celery_executor 630 | 631 | # The concurrency that will be used when starting workers with the 632 | # ``airflow celery worker`` command. This defines the number of task instances that 633 | # a worker will take, so size up your workers based on the resources on 634 | # your worker box and the nature of your tasks 635 | worker_concurrency = 8 636 | 637 | # The maximum and minimum concurrency that will be used when starting workers with the 638 | # ``airflow celery worker`` command (always keep minimum processes, but grow 639 | # to maximum if necessary). Note the value should be max_concurrency,min_concurrency 640 | # Pick these numbers based on resources on worker box and the nature of the task. 641 | # If autoscale option is available, worker_concurrency will be ignored. 642 | # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale 643 | # Example: worker_autoscale = 16,12 644 | # worker_autoscale = 645 | 646 | # Used to increase the number of tasks that a worker prefetches which can improve performance. 647 | # The number of processes multiplied by worker_prefetch_multiplier is the number of tasks 648 | # that are prefetched by a worker. 
A value greater than 1 can result in tasks being unnecessarily 649 | # blocked if there are multiple workers and one worker prefetches tasks that sit behind long 650 | # running tasks while another worker has unutilized processes that are unable to process the already 651 | # claimed blocked tasks. 652 | # https://docs.celeryproject.org/en/stable/userguide/optimizing.html#prefetch-limits 653 | # Example: worker_prefetch_multiplier = 1 654 | # worker_prefetch_multiplier = 655 | 656 | # When you start an airflow worker, airflow starts a tiny web server 657 | # subprocess to serve the workers local log files to the airflow main 658 | # web server, who then builds pages and sends them to users. This defines 659 | # the port on which the logs are served. It needs to be unused, and open 660 | # visible from the main web server to connect into the workers. 661 | worker_log_server_port = 8793 662 | 663 | # Umask that will be used when starting workers with the ``airflow celery worker`` 664 | # in daemon mode. This control the file-creation mode mask which determines the initial 665 | # value of file permission bits for newly created files. 666 | worker_umask = 0o077 667 | 668 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally 669 | # a sqlalchemy database. Refer to the Celery documentation for more information. 670 | broker_url = redis://redis:6379/0 671 | 672 | # The Celery result_backend. When a job finishes, it needs to update the 673 | # metadata of the job. Therefore it will post a message on a message bus, 674 | # or insert it into a database (depending of the backend) 675 | # This status is used by the scheduler to update the state of the task 676 | # The use of a database is highly recommended 677 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings 678 | result_backend = db+postgresql://postgres:airflow@postgres/airflow 679 | 680 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start 681 | # it ``airflow celery flower``. This defines the IP that Celery Flower runs on 682 | flower_host = 0.0.0.0 683 | 684 | # The root URL for Flower 685 | # Example: flower_url_prefix = /flower 686 | flower_url_prefix = 687 | 688 | # This defines the port that Celery Flower runs on 689 | flower_port = 5555 690 | 691 | # Securing Flower with Basic Authentication 692 | # Accepts user:password pairs separated by a comma 693 | # Example: flower_basic_auth = user1:password1,user2:password2 694 | flower_basic_auth = 695 | 696 | # Default queue that tasks get assigned to and that worker listen on. 697 | default_queue = default 698 | 699 | # How many processes CeleryExecutor uses to sync task state. 700 | # 0 means to use max(1, number of cores - 1) processes. 701 | sync_parallelism = 0 702 | 703 | # Import path for celery configuration options 704 | celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG 705 | ssl_active = False 706 | ssl_key = 707 | ssl_cert = 708 | ssl_cacert = 709 | 710 | # Celery Pool implementation. 711 | # Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``. 712 | # See: 713 | # https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency 714 | # https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html 715 | pool = prefork 716 | 717 | # The number of seconds to wait before timing out ``send_task_to_executor`` or 718 | # ``fetch_celery_task_state`` operations. 
719 | operation_timeout = 1.0 720 | 721 | # Celery task will report its status as 'started' when the task is executed by a worker. 722 | # This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted 723 | # or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob. 724 | task_track_started = True 725 | 726 | # Time in seconds after which Adopted tasks are cleared by CeleryExecutor. This is helpful to clear 727 | # stalled tasks. 728 | task_adoption_timeout = 600 729 | 730 | # The Maximum number of retries for publishing task messages to the broker when failing 731 | # due to ``AirflowTaskTimeout`` error before giving up and marking Task as failed. 732 | task_publish_max_retries = 3 733 | 734 | # Worker initialisation check to validate Metadata Database connection 735 | worker_precheck = False 736 | 737 | [celery_broker_transport_options] 738 | 739 | # This section is for specifying options which can be passed to the 740 | # underlying celery broker transport. See: 741 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options 742 | # The visibility timeout defines the number of seconds to wait for the worker 743 | # to acknowledge the task before the message is redelivered to another worker. 744 | # Make sure to increase the visibility timeout to match the time of the longest 745 | # ETA you're planning to use. 746 | # visibility_timeout is only supported for Redis and SQS celery brokers. 747 | # See: 748 | # http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options 749 | # Example: visibility_timeout = 21600 750 | # visibility_timeout = 751 | 752 | [dask] 753 | 754 | # This section only applies if you are using the DaskExecutor in 755 | # [core] section above 756 | # The IP address and port of the Dask cluster's scheduler. 757 | cluster_address = 127.0.0.1:8786 758 | 759 | # TLS/ SSL settings to access a secured Dask scheduler. 760 | tls_ca = 761 | tls_cert = 762 | tls_key = 763 | 764 | [scheduler] 765 | # Task instances listen for external kill signal (when you clear tasks 766 | # from the CLI or the UI), this defines the frequency at which they should 767 | # listen (in seconds). 768 | job_heartbeat_sec = 5 769 | 770 | # How often (in seconds) to check and tidy up 'running' TaskInstancess 771 | # that no longer have a matching DagRun 772 | clean_tis_without_dagrun_interval = 15.0 773 | 774 | # The scheduler constantly tries to trigger new tasks (look at the 775 | # scheduler section in the docs for more information). This defines 776 | # how often the scheduler should run (in seconds). 777 | scheduler_heartbeat_sec = 5 778 | 779 | # The number of times to try to schedule each DAG file 780 | # -1 indicates unlimited number 781 | num_runs = -1 782 | 783 | # The number of seconds to wait between consecutive DAG file processing 784 | processor_poll_interval = 1 785 | 786 | # after how much time (seconds) a new DAGs should be picked up from the filesystem 787 | min_file_process_interval = 30 788 | 789 | # How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. 790 | dag_dir_list_interval = 300 791 | 792 | # How often should stats be printed to the logs. 
Setting to 0 will disable printing stats 793 | print_stats_interval = 30 794 | 795 | # How often (in seconds) should pool usage stats be sent to statsd (if statsd_on is enabled) 796 | pool_metrics_interval = 5.0 797 | 798 | # If the last scheduler heartbeat happened more than scheduler_health_check_threshold 799 | # ago (in seconds), scheduler is considered unhealthy. 800 | # This is used by the health check in the "/health" endpoint 801 | scheduler_health_check_threshold = 30 802 | 803 | # How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs 804 | orphaned_tasks_check_interval = 300.0 805 | child_process_log_directory = /opt/airflow/logs/scheduler 806 | 807 | # Local task jobs periodically heartbeat to the DB. If the job has 808 | # not heartbeat in this many seconds, the scheduler will mark the 809 | # associated task instance as failed and will re-schedule the task. 810 | scheduler_zombie_task_threshold = 300 811 | 812 | # Turn off scheduler catchup by setting this to ``False``. 813 | # Default behavior is unchanged and 814 | # Command Line Backfills still work, but the scheduler 815 | # will not do scheduler catchup if this is ``False``, 816 | # however it can be set on a per DAG basis in the 817 | # DAG definition (catchup) 818 | catchup_by_default = True 819 | 820 | # This changes the batch size of queries in the scheduling main loop. 821 | # If this is too high, SQL query performance may be impacted by one 822 | # or more of the following: 823 | # - reversion to full table scan 824 | # - complexity of query predicate 825 | # - excessive locking 826 | # Additionally, you may hit the maximum allowable query length for your db. 827 | # Set this to 0 for no limit (not advised) 828 | max_tis_per_query = 512 829 | 830 | # Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries. 831 | # If this is set to False then you should not run more than a single 832 | # scheduler at once 833 | use_row_level_locking = True 834 | 835 | # Max number of DAGs to create DagRuns for per scheduler loop 836 | # 837 | # Default: 10 838 | # max_dagruns_to_create_per_loop = 839 | 840 | # How many DagRuns should a scheduler examine (and lock) when scheduling 841 | # and queuing tasks. 842 | # 843 | # Default: 20 844 | # max_dagruns_per_loop_to_schedule = 845 | 846 | # Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the 847 | # same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other 848 | # dags in some circumstances 849 | # 850 | # Default: True 851 | # schedule_after_task_execution = 852 | 853 | # The scheduler can run multiple processes in parallel to parse dags. 854 | # This defines how many processes will run. 855 | parsing_processes = 2 856 | 857 | # Turn off scheduler use of cron intervals by setting this to False. 858 | # DAGs submitted manually in the web UI or with trigger_dag will still run. 
859 | use_job_schedule = True 860 | 861 | # Allow externally triggered DagRuns for Execution Dates in the future 862 | # Only has effect if schedule_interval is set to None in DAG 863 | allow_trigger_in_future = False 864 | 865 | [kerberos] 866 | ccache = /tmp/airflow_krb5_ccache 867 | 868 | # gets augmented with fqdn 869 | principal = airflow 870 | reinit_frequency = 3600 871 | kinit_path = kinit 872 | keytab = airflow.keytab 873 | 874 | [github_enterprise] 875 | api_rev = v3 876 | 877 | [admin] 878 | # UI to hide sensitive variable fields when set to True 879 | hide_sensitive_variable_fields = True 880 | 881 | # A comma-separated list of sensitive keywords to look for in variables names. 882 | sensitive_variable_fields = 883 | 884 | [elasticsearch] 885 | # Elasticsearch host 886 | host = 887 | 888 | # Format of the log_id, which is used to query for a given tasks logs 889 | log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number} 890 | 891 | # Used to mark the end of a log stream for a task 892 | end_of_log_mark = end_of_log 893 | 894 | # Qualified URL for an elasticsearch frontend (like Kibana) with a template argument for log_id 895 | # Code will construct log_id using the log_id template from the argument above. 896 | # NOTE: The code will prefix the https:// automatically, don't include that here. 897 | frontend = 898 | 899 | # Write the task logs to the stdout of the worker, rather than the default files 900 | write_stdout = False 901 | 902 | # Instead of the default log formatter, write the log lines as JSON 903 | json_format = False 904 | 905 | # Log fields to also attach to the json output, if enabled 906 | json_fields = asctime, filename, lineno, levelname, message 907 | 908 | [elasticsearch_configs] 909 | use_ssl = False 910 | verify_certs = True 911 | 912 | [kubernetes] 913 | # Path to the YAML pod file. If set, all other kubernetes-related fields are ignored. 914 | pod_template_file = 915 | 916 | # The repository of the Kubernetes Image for the Worker to Run 917 | worker_container_repository = 918 | 919 | # The tag of the Kubernetes Image for the Worker to Run 920 | worker_container_tag = 921 | 922 | # The Kubernetes namespace where airflow workers should be created. Defaults to ``default`` 923 | namespace = default 924 | 925 | # If True, all worker pods will be deleted upon termination 926 | delete_worker_pods = True 927 | 928 | # If False (and delete_worker_pods is True), 929 | # failed worker pods will not be deleted so users can investigate them. 930 | delete_worker_pods_on_failure = False 931 | 932 | # Number of Kubernetes Worker Pod creation calls per scheduler loop. 933 | # Note that the current default of "1" will only launch a single pod 934 | # per-heartbeat. It is HIGHLY recommended that users increase this 935 | # number to match the tolerance of their kubernetes cluster for 936 | # better performance. 937 | worker_pods_creation_batch_size = 1 938 | 939 | # Allows users to launch pods in multiple namespaces. 940 | # Will require creating a cluster-role for the scheduler 941 | multi_namespace_mode = False 942 | 943 | # Use the service account kubernetes gives to pods to connect to kubernetes cluster. 944 | # It's intended for clients that expect to be running inside a pod running on kubernetes. 945 | # It will raise an exception if called from a process not running in a kubernetes environment. 946 | in_cluster = True 947 | 948 | # When running with in_cluster=False change the default cluster_context or config_file 949 | # options to Kubernetes client. 
Leave blank these to use default behaviour like ``kubectl`` has. 950 | # cluster_context = 951 | 952 | # Path to the kubernetes configfile to be used when ``in_cluster`` is set to False 953 | # config_file = 954 | 955 | # Keyword parameters to pass while calling a kubernetes client core_v1_api methods 956 | # from Kubernetes Executor provided as a single line formatted JSON dictionary string. 957 | # List of supported params are similar for all core_v1_apis, hence a single config 958 | # variable for all apis. See: 959 | # https://raw.githubusercontent.com/kubernetes-client/python/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/api/core_v1_api.py 960 | kube_client_request_args = 961 | 962 | # Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client 963 | # ``core_v1_api`` method when using the Kubernetes Executor. 964 | # This should be an object and can contain any of the options listed in the ``v1DeleteOptions`` 965 | # class defined here: 966 | # https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19 967 | # Example: delete_option_kwargs = {"grace_period_seconds": 10} 968 | delete_option_kwargs = 969 | 970 | # Enables TCP keepalive mechanism. This prevents Kubernetes API requests to hang indefinitely 971 | # when idle connection is time-outed on services like cloud load balancers or firewalls. 972 | enable_tcp_keepalive = False 973 | 974 | # When the `enable_tcp_keepalive` option is enabled, TCP probes a connection that has 975 | # been idle for `tcp_keep_idle` seconds. 976 | tcp_keep_idle = 120 977 | 978 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond 979 | # to a keepalive probe, TCP retransmits the probe after `tcp_keep_intvl` seconds. 980 | tcp_keep_intvl = 30 981 | 982 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond 983 | # to a keepalive probe, TCP retransmits the probe `tcp_keep_cnt number` of times before 984 | # a connection is considered to be broken. 985 | tcp_keep_cnt = 6 986 | 987 | [smart_sensor] 988 | # When `use_smart_sensor` is True, Airflow redirects multiple qualified sensor tasks to 989 | # smart sensor task. 990 | use_smart_sensor = False 991 | 992 | # `shard_code_upper_limit` is the upper limit of `shard_code` value. The `shard_code` is generated 993 | # by `hashcode % shard_code_upper_limit`. 994 | shard_code_upper_limit = 10000 995 | 996 | # The number of running smart sensor processes for each service. 997 | shards = 5 998 | 999 | # comma separated sensor classes support in smart_sensor. 1000 | sensors_enabled = NamedHivePartitionSensor --------------------------------------------------------------------------------
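Note on the airflow.cfg above: its [core] section ships with executor = SequentialExecutor and a SQLite metadata database (sql_alchemy_conn = sqlite:////opt/airflow/airflow.db), which is enough for local experiments but runs only one task at a time. Any key in this file can be overridden at container start-up with an environment variable named AIRFLOW__{SECTION}__{KEY}, so settings can be changed without rebuilding the image or editing the mounted file. The sketch below is illustrative only; the executor choice, the "postgres" host name, and the credentials are assumptions for the example and are not taken from this repository's docker-compose.yaml.

# Illustrative shell sketch -- e.g. exported from docker-entrypoint.sh or set under
# "environment:" in docker-compose.yaml; none of these values come from this repo.
export AIRFLOW__CORE__EXECUTOR=LocalExecutor
export AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow
export AIRFLOW__CORE__LOAD_EXAMPLES=False
# The fernet_key and webserver secret_key committed in airflow.cfg can be rotated the same way:
export AIRFLOW__CORE__FERNET_KEY="<your-generated-fernet-key>"
export AIRFLOW__WEBSERVER__SECRET_KEY="<your-random-secret>"

The executor and connection-string overrides belong together: Airflow supports SQLite only with SequentialExecutor, so moving to LocalExecutor (or to CeleryExecutor, whose [celery] defaults are already present above) requires pointing sql_alchemy_conn at a real database first.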