├── ETL-Data-pipelines
│   ├── airflow-data
│   │   ├── creds
│   │   │   └── s3
│   │   └── airflow.cfg
│   ├── Dockerfile
│   ├── sparkFiles
│   │   └── sparkProcess.py
│   ├── docker-compose.yml
│   └── dags
│       └── dagRun.py
├── archiveweb.conf
├── archiveweb.psgi
├── LICENSE
├── stk.patch
└── README.md
--------------------------------------------------------------------------------
/ETL-Data-pipelines/airflow-data/creds/s3:
--------------------------------------------------------------------------------
1 | [airflow-spark1]
2 | aws_access_key_id =
3 | aws_secret_access_key =
4 |
--------------------------------------------------------------------------------
/archiveweb.conf:
--------------------------------------------------------------------------------
1 | # rename this file to archiveweb.yml and put a ':' after 'name' if
2 | # you want to use YAML like in old versions of Catalyst
3 | name ArchiveWeb
--------------------------------------------------------------------------------
/archiveweb.psgi:
--------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 |
4 | use ArchiveWeb;
5 |
6 | my $app = ArchiveWeb->apply_default_middlewares(ArchiveWeb->psgi_app);
7 | $app;
8 |
9 |
--------------------------------------------------------------------------------
/ETL-Data-pipelines/Dockerfile:
--------------------------------------------------------------------------------
1 | # declare the build arg so the image in FROM can be overridden with --build-arg AIRFLOW_IMAGE_NAME=...
2 | ARG AIRFLOW_IMAGE_NAME
3 | FROM ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.0.1}
4 |
5 | USER root
6 | # Install a headless OpenJDK 11 JRE (needed by Spark/PySpark)
7 | RUN apt-get update && \
8 |     apt-get install -y openjdk-11-jre-headless && \
9 |     apt-get clean
10 |
11 | USER airflow
12 | RUN pip install --upgrade pip
13 |
14 | COPY requirements.txt /opt/airflow
15 | WORKDIR /opt/airflow
16 | RUN pip install -r requirements.txt
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Ramesh chinnaraj
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /stk.patch: -------------------------------------------------------------------------------- 1 | diff --git a/bin/file-source-loader b/bin/file-source-loader 2 | index c576631..17abcee 100755 3 | --- a/bin/file-source-loader 4 | +++ b/bin/file-source-loader 5 | @@ -345,6 +345,54 @@ sub process_file_sinks { 6 | } 7 | } 8 | 9 | +sub process_file_transforms { 10 | + my($file_source, $file) = @_; 11 | + 12 | + # get the details of file transforms that are yet to be processed for this file 13 | + my $file_transforms = get_pending_file_transforms($file_source->{id}, $file->{id}); 14 | + 15 | +#FIXME 16 | + for my $file_transform_id (sort keys %$file_sinks) { 17 | + my $file_sink = $file_sinks->{$file_sink_id}; 18 | + my $src_path = get_archived_file_path($file_source, $file); 19 | +# my $dst_path = transform_filename($file_source, $file, 20 | +# $file_sink->{filename_transform}); 21 | +# 22 | +# # Remove the password component from the URL for display 23 | +# my $url = URI->new($file_sink->{url}); 24 | +# if ($url->scheme eq "ftp") { 25 | +# $url->userinfo($url->user); 26 | +# } 27 | +# msg_file_source( 28 | +# $file_source, 29 | +# "uploading file '$src_path' to '$url/$dst_path'", 30 | +# LOG_INFO 31 | +# ); 32 | +# 33 | +# # connect to FTP server 34 | +# my $ftp = connect_ftp($file_source, $file_sink->{url}); 35 | +# 36 | +# if (!$options{test}) { 37 | +# 38 | +# # upload file 39 | +# $ftp->put($src_path, $dst_path) or 40 | +# msg_file_source( 41 | +# $file_source, 42 | +# "FTP put of '$src_path' failed: " . $ftp->message, 43 | +# LOG_CRIT 44 | +# ); 45 | +# 46 | +# # mark this file_sink as done for this file 47 | +# $dbh->do(' 48 | +# INSERT INTO file_sink_file_processed 49 | +# (file_sink_id, file_id) 50 | +# VALUES (?, ?); 51 | +# ', {}, $file_sink_id, $file->{id}); 52 | +# } 53 | +# 54 | + } 55 | +} 56 | + 57 | sub mark_file_as_processed { 58 | my($file_id) = @_; 59 | 60 | @@ -401,6 +449,7 @@ sub process_file { 61 | 62 | process_file_sinks($file_source, $file); 63 | process_virtual_ftp_file_sinks($file_source, $file); 64 | + process_file_transforms($file_source, $file); 65 | 66 | # mark this file as processed 67 | # if process_file_sinks() and process_virtual_ftp_file_sinks() return, all 68 | -------------------------------------------------------------------------------- /ETL-Data-pipelines/sparkFiles/sparkProcess.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | import pyspark.sql.functions as F 3 | from pyspark.sql.window import Window 4 | from pyspark.sql.functions import lead, lag 5 | import os 6 | 7 | # the parsed data csv file 8 | parsedData = '/opt/airflow/sparkFiles/parsedData.csv' 9 | 10 | # start a spark session and set up its configuration 11 | spark = SparkSession \ 12 | .builder \ 13 | .appName("Pysparkexample") \ 14 | .config("spark.some.config.option", "some-value") \ 15 | .getOrCreate() 16 | 17 | # create a spark dataframe using the data in the csv 18 | df = spark.read.csv(parsedData, 19 | header='true', 20 | inferSchema='true', 21 | ignoreLeadingWhiteSpace=True, 22 | ignoreTrailingWhiteSpace=True) 23 | 24 | # list for columns subtractions 25 | colDiffs = [] 26 | # get only the county columns from the df columns list 27 | countyCols = df.columns[1:] 28 | # change the schema/type of the dateFor column from string to date 29 | df = df.withColumn("dateFor", F.to_date("dateFor", "yyyy-MM-dd")) 30 | # Window 
function spec to partition the df and sort it by Dates descending 31 | # The entire dataset is partitioned (no argument passed to partitionBy) as there are no dates that show multiple times. 32 | windowSpec = Window.partitionBy().orderBy(F.col('dateFor').desc()) 33 | # for each county column in the columns list 34 | for county in countyCols: 35 | # add a new column, countynameDiff, to the df containing the same numbers but shifted up by one using "lead" 36 | # E.g.: if a column X contains the numbers [1, 2, 3], applying the "lead" window function, with 1 as argument, will 37 | # shift everything up by 1 and the new XDiff column will contain [2, 3, none] 38 | df = df.withColumn(f'{county}Diff', lead(county, 1).over(windowSpec)) 39 | # add the subtraction to the list with the condition that if the calculated value is lower than 0, then save 0 40 | # this saves the subtraction formula in the list, not the result of the subtraction. 41 | # the header of the subtraction result column will be the same as the "county" by applying "alias" 42 | colDiffs.append(F.when((df[county] - df[f'{county}Diff']) < 0, 0) 43 | .otherwise(df[county] - df[f'{county}Diff']).alias(county)) 44 | # select the dateFor column and calculate the subtractions in the df, returning a new dataframe with the results 45 | result = df.select('dateFor', *colDiffs).fillna(0) 46 | # convert the result to a pandas dataframe and save it as a csv 47 | # warning: the conversion is executed in memory. Other methods might be better suited for large datasets 48 | result.toPandas().to_csv('/opt/airflow/sparkFiles/results.csv', 49 | sep=',', 50 | header=True, 51 | index=False) 52 | 53 | # delete the parsed data csv from the working directory 54 | os.remove(parsedData) 55 | -------------------------------------------------------------------------------- /ETL-Data-pipelines/docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:master-python3.8 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_GID - Group ID in Airflow containers 31 | # Default: 50000 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. 
33 | # Default: airflow
34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account.
35 | # Default: airflow
36 | #
37 | # Feel free to modify this file to suit your needs.
38 | #---
39 |
40 | version: '3'
41 | x-airflow-common:
42 |   &airflow-common
43 |   image: apache/airflow:2.0.1
44 |   environment:
45 |     &airflow-common-env
46 |     AIRFLOW__CORE__EXECUTOR: LocalExecutor
47 |     AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
48 |     AIRFLOW__CORE__FERNET_KEY: ''
49 |     AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
50 |     AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
51 |     AIRFLOW_CONN_AWS_DEFAULT: aws://?profile=airflow-spark1&s3_config_file=/opt/airflow/creds/s3&s3_config_format=aws
52 |   volumes:
53 |     - ./airflow-data/creds:/opt/airflow/creds
54 |     - ./dags:/opt/airflow/dags
55 |     - ./sparkFiles:/opt/airflow/sparkFiles
56 |     - ./airflow-data/logs:/opt/airflow/logs
57 |     - ./airflow-data/plugins:/opt/airflow/plugins
58 |     - ./airflow-data/airflow.cfg:/opt/airflow/airflow.cfg
59 |   user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}"
60 |   depends_on:
61 |     postgres:
62 |       condition: service_healthy
63 |
64 | services:
65 |   postgres:
66 |     image: postgres:13
67 |     environment:
68 |       POSTGRES_USER: airflow
69 |       POSTGRES_PASSWORD: airflow
70 |       POSTGRES_DB: airflow
71 |     volumes:
72 |       - postgres-db-volume:/var/lib/postgresql/data
73 |     healthcheck:
74 |       test: ["CMD", "pg_isready", "-U", "airflow"]
75 |       interval: 5s
76 |       retries: 5
77 |     restart: always
78 |
79 |   airflow-webserver:
80 |     build:
81 |       context: .
82 |       dockerfile: Dockerfile
83 |     <<: *airflow-common
84 |     command: webserver
85 |     ports:
86 |       - 8080:8080
87 |     healthcheck:
88 |       test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
89 |       interval: 10s
90 |       timeout: 10s
91 |       retries: 5
92 |     restart: always
93 |
94 |   airflow-scheduler:
95 |     <<: *airflow-common
96 |     command: scheduler
97 |     restart: always
98 |
99 |   airflow-init:
100 |     <<: *airflow-common
101 |     command: version
102 |     environment:
103 |       <<: *airflow-common-env
104 |       _AIRFLOW_DB_UPGRADE: 'true'
105 |       _AIRFLOW_WWW_USER_CREATE: 'true'
106 |       _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
107 |       _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
108 |
109 | volumes:
110 |   postgres-db-volume:
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ETL-for-Educational-Institutions-
2 |
3 | ## About
4 |
5 | An educational project on how to build an ETL (Extract, Transform, Load) data pipeline, orchestrated with Airflow.
6 |
7 | An AWS s3 bucket is used as a Data Lake in which json files are stored. The data is extracted from the json and parsed (cleaned). It is then transformed/processed with Spark (PySpark) and loaded/stored in either a MongoDB database or an Amazon Redshift Data Warehouse.
8 |
9 | The pipeline architecture - author's interpretation:
10 |
11 | ![image](https://user-images.githubusercontent.com/110036451/184507455-2ffd0d6f-3a9c-44fd-965b-05b14579cc1f.png)
12 |
13 |
14 | Note: Since this project was built for learning purposes and as an example, it functions only for a single scenario and data schema.
15 | The project is built in Python and has two main parts:
16 |
17 | - The Airflow DAG file, dags/dagRun.py, which orchestrates the data pipeline tasks.
18 | - The PySpark data transformation/processing script, located in sparkFiles/sparkProcess.py.
19 | Note: The code, and especially the comments, in the Python files dags/dagRun.py and sparkFiles/sparkProcess.py are intentionally verbose for a better understanding of the functionality.
20 | ## Scenario
21 | The Romanian COVID-19 data, provided by https://datelazi.ro/, contains COVID-19 data for each county, reported as cumulative totals from one day to the next. It does not contain the day-to-day difference in numbers (i.e. for county X there were 7 cases on day 1 and 37 cases on day 2). This data is loaded as a json file in the s3 bucket.
22 |
23 | The goal is to find the differences between days for all counties (i.e. for county X there were 30 more cases on day 2 than on day 1). If the difference is smaller than 0 (e.g. because of a data recording error), then the difference for that day should be 0.
24 |
25 | ## Base concepts
26 | - Data Engineering
27 | - ETL (Extract, Transform, Load)
28 | - Pipeline
29 | - Data Lake
30 | - Data Warehouse
31 | - Data Schema
32 | - Apache Airflow (wikipedia page)
33 | - Airflow DAG
34 | - Airflow XCom
35 | - Apache Spark, specifically the PySpark API (wikipedia page)
36 | - Amazon Web Services (AWS) (wikipedia page)
37 | - s3 (wikipedia page)
38 | - Redshift (Wikipedia page)
39 | - mongoDB (wikipedia page)
40 | ## Prerequisites
41 | - Docker
42 | - Docker Compose
43 | - AWS s3 bucket
44 | - mongoDB database
45 | - Amazon Redshift database
46 | ## Set-up
47 | Download / pull the repo to your desired location.
48 |
49 | You will have to create an AWS s3 user specifically for Airflow to interact with the s3 bucket. The credentials for that user will have to be saved in the s3 file found in the directory /airflow-data/creds:
50 |
51 |     [airflow-spark1]
52 |     aws_access_key_id =
53 |     aws_secret_access_key =
54 | On rows 16 and 17 in dags/dagRun.py you have the option to choose which database system to use, mongoDB (noSQL) or Amazon Redshift (RDBMS), just by commenting/uncommenting one or the other:
55 |
56 |     # database = 'mongoDB'
57 |     database = 'Redshift'
58 | If you want to use mongoDB, you will have to enter the mongoDB connection string (or an environment variable or file with the string) in the dags/dagRun.py file, line 22:
59 |
60 |     client = pymongo.MongoClient('mongoDB_connection_string')
61 | If you want to use a Redshift cluster, you will have to provide your Amazon Redshift database name, host and the rest of the credentials from row 29 to 34 in dags/dagRun.py:
62 |
63 |     dbname = 'testairflow'
64 |     host = '*******************************.eu-central-1.redshift.amazonaws.com'
65 |     port = '****'
66 |     user = '*********'
67 |     password = '********************'
68 |     awsIAMrole = 'arn:aws:iam::************:role/*******'
69 | You will have to change the s3 bucket name and file key (the name of the file saved in the s3 bucket) located at lines 148 and 150 in dags/dagRun.py:
70 |
71 |     # name of the file in the AWS s3 bucket
72 |     key = 'countyData.json'
73 |     # name of the AWS s3 bucket
74 |     bucket = 'renato-airflow-raw'
75 | In the repo directory, execute the following command, which will create the .env file containing the Airflow UID and GID needed by docker-compose (a sample of the resulting file is shown just after the Installation step below):
76 |
77 |     echo -e "AIRFLOW_UID=$(id -u)\nAIRFLOW_GID=0" > .env
78 |
79 |
80 | https://user-images.githubusercontent.com/19210522/114414670-b43ab980-9bb7-11eb-8ea8-061385b14980.gif
81 |
82 | ## Installation
83 | Start the installation with:
84 |
85 |     docker-compose up -d
86 | This command will pull and create the Docker images and containers for Airflow, according to the instructions in the docker-compose.yml file.
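As referenced in the Set-up section, the echo command generates a minimal .env file next to docker-compose.yml. A sketch of what it should contain (the UID value 1000 is only an example; yours is whatever `id -u` prints):

```
AIRFLOW_UID=1000
AIRFLOW_GID=0
```

If the file is missing, the containers simply fall back to the defaults declared in docker-compose.yml via `${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}`.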
87 |
88 |
89 |
90 | After everything has been installed, you can check the status of your containers (whether they are healthy) with:
91 |
92 |     docker ps
93 |
94 | ![image](https://user-images.githubusercontent.com/110036451/184507487-d0fa1f2e-5914-492a-88d7-3e1a260d7026.png)
95 |
96 |
97 | Note: it might take up to 30 seconds for the containers to get the healthy flag after starting.
98 |
99 |
100 |
101 | ## Airflow Interface
102 | You can now access the Airflow web interface by going to http://localhost:8080/. If you have not changed them in the docker-compose.yml file, the default user is airflow and the password is airflow:
103 |
104 |
105 | ![image](https://user-images.githubusercontent.com/110036451/184507507-ea8e1d34-b56b-4b19-b168-6d138546e02c.png)
106 |
107 |
108 | After signing in, the Airflow home page is the DAGs list page. Here you will see all your DAGs and the Airflow example DAGs, sorted alphabetically.
109 |
110 | Any DAG Python script saved in the directory dags/ will show up on the DAGs page (e.g. the first DAG, analyze_json_data, is the one built for this project).
111 |
112 | Note: If you update the code in the Python DAG script, the Airflow DAGs page has to be refreshed.
113 |
114 | Note: If you do not want to see any Airflow example DAGs, set the AIRFLOW__CORE__LOAD_EXAMPLES flag to 'false' in the docker-compose.yml file before starting the installation.
115 |
116 |
117 | ![image](https://user-images.githubusercontent.com/110036451/184507520-9f51ed71-65ee-4417-83a4-3e02659880fb.png)
118 |
119 |
120 | Click on the name of the DAG to open the DAG details page:
121 |
122 |
123 | ![image](https://user-images.githubusercontent.com/110036451/184507526-8719bd8b-9057-40f9-b808-44bdc3009855.png)
124 |
125 |
126 | On the Graph View page you can see the DAG running through each task (getLastProcessedDate, getDate, etc.) after it has been unpaused and triggered:
127 |
128 |
129 | https://user-images.githubusercontent.com/19210522/114459521-50c97f80-9be9-11eb-907a-3627a21d52dc.gif
130 |
131 | ## Pipeline Task by Task
132 |
133 | Task getLastProcessedDate
134 |
135 | Finds the last processed date in the database (mongoDB or Redshift, depending on the configuration) and pushes it to an Airflow XCom.
136 |
137 | Task getDate
138 |
139 | Grabs the date saved in the XCom and, depending on the value pulled, returns the task id parseJsonFile or the task id endRun.
140 |
141 | Task parseJsonFile
142 |
143 | The json contains unnecessary data for this case, so it needs to be parsed to extract only the daily total numbers for each county.
144 |
145 | If there is any new data to be processed (the date extracted in the task getLastProcessedDate is older than the dates in the data), it is saved in a temp file in the directory sparkFiles:
146 |
147 |
148 | ![image](https://user-images.githubusercontent.com/110036451/184507547-54a07928-f9ae-4880-88fe-d8939a359331.png)
149 |
150 |
151 | e.g.: for county AB, on the 7th of April there were 19046 COVID cases and on the 8th of April there were 19150 cases
152 |
153 | It returns the task id processParsedData if there is new data to process, or the task id endRun if there is not.
154 |
155 | Task processParsedData
156 | Executes the PySpark script sparkFiles/sparkProcess.py.
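To make the transformation concrete before reading the full script, here is a minimal, self-contained sketch of the same lead-window approach used in sparkFiles/sparkProcess.py. The single county column AB and the sample values are made up for illustration; the real script reads parsedData.csv and loops over every county column:

```python
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("diffSketch").getOrCreate()

# cumulative totals for one hypothetical county column "AB"
df = spark.createDataFrame(
    [("2021-04-07", 19046), ("2021-04-08", 19150), ("2021-04-09", 19140)],
    ["dateFor", "AB"],
)

# sort the dates descending and look one row ahead (i.e. at the previous day) with lead()
windowSpec = Window.partitionBy().orderBy(F.col("dateFor").desc())
df = df.withColumn("ABDiff", F.lead("AB", 1).over(windowSpec))

# day-over-day difference, clamped to 0 when the total decreases (data recording error)
result = df.select(
    "dateFor",
    F.when(F.col("AB") - F.col("ABDiff") < 0, 0)
     .otherwise(F.col("AB") - F.col("ABDiff"))
     .alias("AB"),
).fillna(0)

result.show()  # 2021-04-09 -> 0, 2021-04-08 -> 104, 2021-04-07 -> 0
```

The earliest date has no previous day to subtract, which is why the resulting null is filled with 0 before the script writes results.csv.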
157 |
158 | The parsed data is processed and the result is saved in another temporary file in the sparkFiles directory:
159 |
160 |
161 | ![image](https://user-images.githubusercontent.com/110036451/184507558-b33a3043-efa2-4cf0-a052-5bf219dd3004.png)
162 |
163 |
164 |
165 | e.g.: for county AB, on the 8th of April there were 104 more cases than on the 7th of April
166 |
167 | Task saveToDB
168 |
169 | Saves the processed data either in the mongoDB database:
170 |
171 |
172 | ![image](https://user-images.githubusercontent.com/110036451/184507569-004a67c6-c747-4bbf-80c2-9e2c467e0331.png)
173 |
174 |
175 |
176 | Or in Redshift:
177 |
178 |
179 | ![image](https://user-images.githubusercontent.com/110036451/184507596-f96cee55-4fbd-4812-bca6-c9e4381c4582.png)
180 |
181 |
182 | Note: The Redshift column names are the full names of the counties, as the short versions of some of them conflict with SQL reserved words.
183 |
184 | Task endRun
185 |
186 | A dummy task used as the end of the pipeline.
187 |
188 | ## Shut Down and Restart Airflow
189 | If you want to make changes to any of the configuration files (docker-compose.yml, Dockerfile, requirements.txt), you will have to shut down the Airflow instance with:
190 |
191 |     docker-compose down
192 |
193 | This command will shut down and delete any containers created/used by Airflow.
194 |
195 | For any changes made in the configuration files to be applied, you will have to rebuild the Airflow images with the command:
196 |
197 |     docker-compose build
198 |
199 | Recreate all the containers with:
200 |
201 |     docker-compose up -d
202 |
--------------------------------------------------------------------------------
/ETL-Data-pipelines/dags/dagRun.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 | import os
4 | # import boto3
5 | import datetime
6 | import pymongo
7 | from airflow.models import DAG
8 | from airflow.operators.bash_operator import BashOperator
9 | from airflow.operators.python_operator import PythonOperator
10 | from airflow.operators.python_operator import BranchPythonOperator
11 | from airflow.operators.dummy_operator import DummyOperator
12 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook
13 | import psycopg2
14 |
15 | # select the database system to be used: mongoDB (noSQL) or Amazon Redshift (RDBMS)
16 | # database = 'mongoDB'
17 | database = 'Redshift'
18 |
19 | if database == 'mongoDB':
20 |     # connect to the MONGO database
21 |     # the mongo DB connection string
22 |     client = pymongo.MongoClient('mongoDB_connection_string')
23 |     # the database to be used
24 |     db = client.testairflow
25 |
26 | else:
27 |     # Amazon Redshift database connection details
28 |     # the details below can also be saved in environment variables
29 |     dbname = 'testairflow'
30 |     host = '*******************************.eu-central-1.redshift.amazonaws.com'
31 |     port = '****'
32 |     user = '*********'
33 |     password = '********************'
34 |     awsIAMrole = 'arn:aws:iam::************:role/*******'
35 |
36 |
37 | def getDBdate(ti):
38 |     """
39 |     Args:
40 |         ti: task instance argument used by Airflow to push and pull xcom data
41 |
42 |     Pushes an xcom to the Airflow database with the date of the most recent db entry
43 |     (An xcom is a tool used to share small amounts of data between Airflow dag tasks. Similar to an Airflow variable, but
44 |     unlike Airflow variables, which are global and can be shared between multiple dags, an xcom shares data between the tasks
45 |     of a dag.)
46 | """ 47 | 48 | # try-except error handler: if the db aggregation fails, return None to not process the same data multiple times 49 | try: 50 | if database == 'mongoDB': 51 | # >>> use the MONGO database 52 | # Find the the latest database document 53 | # filter for the Mongo db aggregation: the key 'dateFor' has to be exist in the collection 54 | aggFilter = {'dateFor': {'$exists': True}} 55 | # in the collection/table (db.countyDiff), apply the filter to the aggregation, convert the dates 56 | # from str to datetime, sort descending and return one (the first one) 57 | dateDoc = list(db.countyDiff.aggregate([{'$match': aggFilter}, 58 | {'$project': { 59 | 'date': { 60 | '$dateFromString': { 61 | 'dateString': '$dateFor', 62 | 'format': '%Y-%m-%d'} 63 | } 64 | }}, 65 | {'$sort': {'date': -1}}, 66 | {'$limit': 1} 67 | ])) 68 | 69 | # try-except error handler to set base parsing start date if the aggregation above returns and empty 70 | # list (i.e.: no data found) 71 | try: 72 | fetchedDate = dateDoc[0]['date'].strftime('%Y-%m-%d') 73 | except: 74 | fetchedDate = '2020-01-01' 75 | 76 | else: 77 | # >>> use the AMAZON REDSHIFT database 78 | # set up the connection to the Redshift database 79 | conn = psycopg2.connect(f'dbname={dbname} host={host} port={port} user={user} password={password}') 80 | # start the database cursor 81 | cursor = conn.cursor() 82 | # grab the latest date from the counties collection 83 | sql = """SELECT dateFor FROM counties ORDER BY dateFor DESC LIMIT 1;""" 84 | # try-except error handler to return a parsing start date in case the cursor execute fails 85 | try: 86 | cursor.execute(sql) 87 | # convert the date from datetime to string 88 | fetchedDate = cursor.fetchall()[0][0].strftime('%Y-%m-%d') 89 | # close the connection and cursor 90 | cursor.close() 91 | conn.close() 92 | except: 93 | fetchedDate = '2020-01-01' 94 | cursor.close() 95 | conn.close() 96 | 97 | except: 98 | fetchedDate = None 99 | 100 | # push the task instance (key, value format) to an xcom 101 | ti.xcom_push(key='fetchedDate', value=fetchedDate) 102 | 103 | 104 | def getLastDate(ti): 105 | """ 106 | Pull the date from xcom and return one task id, or multiple task IDs if they are inside a list, to be executed 107 | Args: 108 | ti: task instance argument used by Airflow to push and pull xcom data 109 | 110 | Returns: the airflow task ID to be executed based on the xcom data 111 | """ 112 | 113 | # pull the xcom data (in this case a list containing only one element: the date string or None) 114 | fetchedDate = ti.xcom_pull(key='fetchedDate', task_ids=['getLastProcessedDate']) 115 | # if the date is None then execute the 'parseJsonFile' task, else execute the 'endRun' task 116 | if fetchedDate[0] is not None: 117 | return 'parseJsonFile' 118 | return 'endRun' 119 | 120 | 121 | def readJsonData(ti): 122 | """ 123 | Read and parse a Json, save the parsed data to a CSV 124 | Args: 125 | ti: task instance argument used by Airflow to push and pull xcom data 126 | 127 | Returns: a task id to be executed 128 | """ 129 | 130 | # get the date data from the xcom 131 | fetchedDate = ti.xcom_pull(key='fetchedDate', task_ids=['getLastProcessedDate']) 132 | # grab the first element (the date string) from the list pulled from xcom and convert it from string to datetime 133 | lastDBDate = datetime.datetime.strptime(fetchedDate[0], '%Y-%m-%d') 134 | 135 | # connect to s3 using the boto3 AWS python module (credentials are saved in an env variable) 136 | # set up the connection to the Amazon cloud bucket 137 | # s3 
= boto3.client('s3') 138 | # get the file data from the S3 bucket 139 | # obj = s3.get_object(Bucket='renato-airflow-raw', Key='mar.json') 140 | # open the file in memory 141 | # filename = obj['Body'].read().decode('utf-8', errors='ignore') 142 | 143 | # connect to s3 using the Airflow hook system. Credentials are saved in env variables when the container is 144 | # built, using the AIRFLOW_CONN_AWS_DEFAULT environment flag in the docker-compose.yml file. 145 | # s3 hook object 146 | hook = S3Hook() 147 | # name of the file in the AWS s3 bucket 148 | key = 'countyData.json' 149 | # name of the AWS s3 bucket 150 | bucket = 'renato-airflow-raw' 151 | # directory in which the file will be saved 152 | path = '/opt/airflow/sparkFiles' 153 | # download the file 154 | filename = hook.download_file( 155 | key=key, 156 | bucket_name=bucket, 157 | local_path=path 158 | ) 159 | 160 | # open the json data with the correct encoding. 'latin-1' encoding was used as the json contains some chars that 161 | # are not encoded in utf-8 (which is usually used) 162 | with open(filename, encoding='latin-1') as data: 163 | # load the data as a dictionary (key, value pairs) 164 | jsonData = json.load(data) 165 | # make sure that historicalData is in the dictionary keys 166 | if 'historicalData' in jsonData.keys(): 167 | # empty list for saving the parsed data 168 | dfData = [] 169 | # for each date (key) in the 'historicalData' dictionary 170 | for key in jsonData['historicalData'].keys(): 171 | # if the last db date is smaller/earlier or equal with the date key 172 | if lastDBDate <= datetime.datetime.strptime(key, '%Y-%m-%d'): 173 | # check if the value containing teh data is a dictionary. This way None values are skipped. 174 | if type(jsonData["historicalData"][key]['countyInfectionsNumbers']) == dict: 175 | # create a new empty dict for each date 176 | parsedLine = {} 177 | # save the date in the dict with the key 'dateFor' 178 | parsedLine['dateFor'] = key 179 | # update the new dict (add/append) with the required json data for each date 180 | parsedLine.update(jsonData["historicalData"][key]['countyInfectionsNumbers']) 181 | # save the new dict to the list created above 182 | dfData.append(parsedLine) 183 | 184 | # check if the data list is not empty (has a length of 0) 185 | if len(dfData) > 0: 186 | # convert the list ot a Pandas dataframe 187 | df = pd.DataFrame(dfData) 188 | # replace NAN (None) values with 0 189 | df = df.fillna(0) 190 | # save the df to a csv with the headers added and with utf8 encoding 191 | df.to_csv('/opt/airflow/sparkFiles/parsedData.csv', 192 | encoding='utf8', 193 | index=False, 194 | header=True) 195 | 196 | # delete the file downloaded from s3 197 | os.remove(filename) 198 | 199 | return 'processParsedData' 200 | return 'endRun' 201 | return 'endRun' 202 | 203 | 204 | def uploadToDB(ti): 205 | """ 206 | Upload the results data to the database 207 | """ 208 | results = '/opt/airflow/sparkFiles/results.csv' 209 | 210 | # get the date from the xcom 211 | fetchedDate = ti.xcom_pull(key='fetchedDate', task_ids=['getLastProcessedDate']) 212 | lastDBDate = fetchedDate[0] 213 | 214 | # read the results CSV to a Pandas dataframe 215 | pandasDf = pd.read_csv(results) 216 | # remove the row that has the same dateFor as the previously last processed date to avoid any data errors 217 | newDf = pd.concat([pandasDf.loc[pandasDf.dateFor != lastDBDate]]) 218 | 219 | if database == 'mongoDB': 220 | # >>> using the MONGO database 221 | 222 | # convert to a list of dictionary objects 223 | 
resultsList = newDf.to_dict(orient='records') 224 | # save the data to the database as bulk 225 | # insertToDB = db.countyDiff.insert_many(resultsList) 226 | 227 | # save data by replacing documents already found in the collection if the dateFor fields match, if not insert 228 | # new document by setting the upsert flag to True 229 | for item in resultsList: 230 | insertToDB = db.countyDiff.replace_one({'dateFor': item['dateFor']}, 231 | item, 232 | upsert=True) 233 | 234 | else: 235 | # >>> using the AMAZON REDSHIFT database 236 | 237 | # overwrite the CSV with the new data 238 | newDf.to_csv(results, 239 | sep=',', 240 | header=True, 241 | index=False) 242 | #upload results csv to S3 243 | # s3 hook object 244 | hook = S3Hook() 245 | # name of the file in the AWS s3 bucket 246 | key = 'results.csv' 247 | # name of the AWS s3 bucket 248 | bucket = 'renato-airflow-raw' 249 | # load/upload the results file to the s3 bucket 250 | loadToS3 = hook.load_file( 251 | filename=results, 252 | key=key, 253 | bucket_name=bucket, 254 | replace=True 255 | ) 256 | 257 | # set up the connection to the Redshift database 258 | conn = psycopg2.connect(f'dbname={dbname} host={host} port={port} user={user} password={password}') 259 | # start the database cursor 260 | cursor = conn.cursor() 261 | # COPY the data from the s3 loaded file into the Redshift counties collection. 262 | # the COPY command only appends the CSV data to the table. It does not replace 263 | sql = f"""COPY counties FROM 's3://renato-airflow-raw/results.csv' 264 | iam_role '{awsIAMrole}' 265 | DELIMITER AS ',' 266 | DATEFORMAT 'YYYY-MM-DD' 267 | IGNOREHEADER 1 ;""" 268 | 269 | cursor.execute(sql) 270 | conn.commit() 271 | cursor.close() 272 | conn.close() 273 | 274 | # delete the parsed data csv from the working directory 275 | os.remove(results) 276 | 277 | 278 | # set up DAG arguments 279 | defaultArgs = { 280 | 'owner': 'Renato_Otescu', 281 | 'start_date': datetime.datetime(2021, 1, 1), 282 | 'retries': 3, 283 | 'retry_delay': datetime.timedelta(seconds=30) 284 | } 285 | 286 | # plan DAG run/pipeline (schedule_interval='0 8 * * *' for each day at 8AM or schedule_interval='@daily' for 0AM) 287 | # the first argument is the name of the DAG: 'analyze_json_data' 288 | with DAG('analyze_json_data', 289 | schedule_interval='@daily', 290 | default_args=defaultArgs, 291 | catchup=False) as dag: 292 | 293 | # task that calls the function getDBdate by using a PythonOperator 294 | getLastProcessedDate = PythonOperator( 295 | task_id='getLastProcessedDate', 296 | python_callable=getDBdate 297 | ) 298 | 299 | # Call the function getLastDate. A BranchPythonOperator is used here as the getLastDate function returns either 300 | # the 'parseJsonFile' task id or the 'endRun' task id. 301 | # 2 branches are created: one for the task 'parseJsonFile' and the other one for the task id 'endRun'. 
302 | # If multiple tasks need to be executed at the same time, the return of the function has to be a list containing 303 | # all the tasks ids that need to be executed at the same time (i.e.: if ['task_id_1', 'task_id_2', etc]) 304 | # The flag do_xcom_push is set to False because each xcom also creates a separate data point in the Airflow DB 305 | # which in this case is useless 306 | getDate = BranchPythonOperator( 307 | task_id='getDate', 308 | python_callable=getLastDate, 309 | do_xcom_push=False 310 | ) 311 | 312 | # if the BranchPythonOperator above returns the task id 'parseJsonFile', then the readJsonData function is called 313 | # note that the 'parseJsonFile' task is also a BranchPythonOperator because the function it calls, readJsonData, 314 | # also returns 2 task ids: 'processParsedData' and 'endRun' 315 | parseJsonFile = BranchPythonOperator( 316 | task_id='parseJsonFile', 317 | python_callable=readJsonData, 318 | do_xcom_push=False 319 | ) 320 | 321 | # as the Spark (PySpark) script is located in a different directory, it is executed using a BashOperator 322 | processParsedData = BashOperator( 323 | task_id='processParsedData', 324 | bash_command='python /opt/airflow/sparkFiles/sparkProcess.py' 325 | ) 326 | 327 | # execute the 'uploadToDB' function using a PythonOperator 328 | saveToDB = PythonOperator( 329 | task_id='saveToDB', 330 | python_callable=uploadToDB 331 | ) 332 | 333 | # the last task is a DummyOperator used only to point the branch operators above 334 | # The trigger rule 'none_failed_or_skipped' ensures that the dummy task is executed if at least one parent succeeds 335 | endRun = DummyOperator( 336 | task_id='endRun', 337 | trigger_rule='none_failed_or_skipped' 338 | ) 339 | 340 | # set tasks relations (the order the tasks are executed) 341 | getLastProcessedDate >> getDate 342 | # the task 'getDate' is a BranchPythonOperator so after it 2 other tasks can be executed parseJsonFile and endRun 343 | getDate >> [parseJsonFile, endRun] 344 | # same as above for the branch parseJsonFile 345 | parseJsonFile >> [processParsedData, endRun] 346 | processParsedData >> saveToDB >> endRun 347 | -------------------------------------------------------------------------------- /ETL-Data-pipelines/airflow-data/airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The folder where your airflow pipelines live, most likely a 3 | # subfolder in a code repository. This path must be absolute. 4 | dags_folder = /opt/airflow/dags 5 | 6 | # Hostname by providing a path to a callable, which will resolve the hostname. 7 | # The format is "package.function". 8 | # 9 | # For example, default value "socket.getfqdn" means that result from getfqdn() of "socket" 10 | # package will be used as hostname. 11 | # 12 | # No argument should be required in the function specified. 13 | # If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address`` 14 | hostname_callable = socket.getfqdn 15 | 16 | # Default timezone in case supplied date times are naive 17 | # can be utc (default), system, or any IANA timezone string (e.g. Europe/Amsterdam) 18 | default_timezone = utc 19 | 20 | # The executor class that airflow should use. Choices include 21 | # ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``, ``DaskExecutor``, 22 | # ``KubernetesExecutor``, ``CeleryKubernetesExecutor`` or the 23 | # full import path to the class when using a custom executor. 
24 | executor = SequentialExecutor 25 | 26 | # The SqlAlchemy connection string to the metadata database. 27 | # SqlAlchemy supports many different database engine, more information 28 | # their website 29 | sql_alchemy_conn = sqlite:////opt/airflow/airflow.db 30 | 31 | # The encoding for the databases 32 | sql_engine_encoding = utf-8 33 | 34 | # Collation for ``dag_id``, ``task_id``, ``key`` columns in case they have different encoding. 35 | # This is particularly useful in case of mysql with utf8mb4 encoding because 36 | # primary keys for XCom table has too big size and ``sql_engine_collation_for_ids`` should 37 | # be set to ``utf8mb3_general_ci``. 38 | # sql_engine_collation_for_ids = 39 | 40 | # If SqlAlchemy should pool database connections. 41 | sql_alchemy_pool_enabled = True 42 | 43 | # The SqlAlchemy pool size is the maximum number of database connections 44 | # in the pool. 0 indicates no limit. 45 | sql_alchemy_pool_size = 5 46 | 47 | # The maximum overflow size of the pool. 48 | # When the number of checked-out connections reaches the size set in pool_size, 49 | # additional connections will be returned up to this limit. 50 | # When those additional connections are returned to the pool, they are disconnected and discarded. 51 | # It follows then that the total number of simultaneous connections the pool will allow 52 | # is pool_size + max_overflow, 53 | # and the total number of "sleeping" connections the pool will allow is pool_size. 54 | # max_overflow can be set to ``-1`` to indicate no overflow limit; 55 | # no limit will be placed on the total number of concurrent connections. Defaults to ``10``. 56 | sql_alchemy_max_overflow = 10 57 | 58 | # The SqlAlchemy pool recycle is the number of seconds a connection 59 | # can be idle in the pool before it is invalidated. This config does 60 | # not apply to sqlite. If the number of DB connections is ever exceeded, 61 | # a lower config value will allow the system to recover faster. 62 | sql_alchemy_pool_recycle = 1800 63 | 64 | # Check connection at the start of each connection pool checkout. 65 | # Typically, this is a simple statement like "SELECT 1". 66 | # More information here: 67 | # https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic 68 | sql_alchemy_pool_pre_ping = True 69 | 70 | # The schema to use for the metadata database. 71 | # SqlAlchemy supports databases with the concept of multiple schemas. 72 | sql_alchemy_schema = 73 | 74 | # Import path for connect args in SqlAlchemy. Defaults to an empty dict. 75 | # This is useful when you want to configure db engine args that SqlAlchemy won't parse 76 | # in connection string. 77 | # See https://docs.sqlalchemy.org/en/13/core/engines.html#sqlalchemy.create_engine.params.connect_args 78 | # sql_alchemy_connect_args = 79 | 80 | # The amount of parallelism as a setting to the executor. This defines 81 | # the max number of task instances that should run simultaneously 82 | # on this airflow installation 83 | parallelism = 32 84 | 85 | # The number of task instances allowed to run concurrently by the scheduler 86 | # in one DAG. Can be overridden by ``concurrency`` on DAG level. 87 | dag_concurrency = 16 88 | 89 | # Are DAGs paused by default at creation 90 | dags_are_paused_at_creation = True 91 | 92 | # The maximum number of active DAG runs per DAG 93 | max_active_runs_per_dag = 16 94 | 95 | # Whether to load the DAG examples that ship with Airflow. 
It's good to 96 | # get started, but you probably want to set this to ``False`` in a production 97 | # environment 98 | load_examples = True 99 | 100 | # Whether to load the default connections that ship with Airflow. It's good to 101 | # get started, but you probably want to set this to ``False`` in a production 102 | # environment 103 | load_default_connections = True 104 | 105 | # Path to the folder containing Airflow plugins 106 | plugins_folder = /opt/airflow/plugins 107 | 108 | # Should tasks be executed via forking of the parent process ("False", 109 | # the speedier option) or by spawning a new python process ("True" slow, 110 | # but means plugin changes picked up by tasks straight away) 111 | execute_tasks_new_python_interpreter = False 112 | 113 | # Secret key to save connection passwords in the db 114 | fernet_key = hjIFXCPQL6ZZx-dN7Kpr5yULTMFmLK-skgH9KdKeA1I= 115 | 116 | # Whether to disable pickling dags 117 | donot_pickle = True 118 | 119 | # How long before timing out a python file import 120 | dagbag_import_timeout = 30.0 121 | 122 | # Should a traceback be shown in the UI for dagbag import errors, 123 | # instead of just the exception message 124 | dagbag_import_error_tracebacks = True 125 | 126 | # If tracebacks are shown, how many entries from the traceback should be shown 127 | dagbag_import_error_traceback_depth = 2 128 | 129 | # How long before timing out a DagFileProcessor, which processes a dag file 130 | dag_file_processor_timeout = 50 131 | 132 | # The class to use for running task instances in a subprocess. 133 | # Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class 134 | # when using a custom task runner. 135 | task_runner = StandardTaskRunner 136 | 137 | # If set, tasks without a ``run_as_user`` argument will be run with this user 138 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 139 | default_impersonation = 140 | 141 | # What security module to use (for example kerberos) 142 | security = 143 | 144 | # Turn unit test mode on (overwrites many configuration options with test 145 | # values at runtime) 146 | unit_test_mode = False 147 | 148 | # Whether to enable pickling for xcom (note that this is insecure and allows for 149 | # RCE exploits). 150 | enable_xcom_pickling = False 151 | 152 | # When a task is killed forcefully, this is the amount of time in seconds that 153 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED 154 | killed_task_cleanup_time = 60 155 | 156 | # Whether to override params with dag_run.conf. If you pass some key-value pairs 157 | # through ``airflow dags backfill -c`` or 158 | # ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. 159 | dag_run_conf_overrides_params = True 160 | 161 | # When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``. 162 | dag_discovery_safe_mode = True 163 | 164 | # The number of retries each task is going to have by default. Can be overridden at dag or task level. 165 | default_task_retries = 0 166 | 167 | # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. 168 | min_serialized_dag_update_interval = 30 169 | 170 | # Fetching serialized DAG can not be faster than a minimum interval to reduce database 171 | # read rate. This config controls when your DAGs are updated in the Webserver 172 | min_serialized_dag_fetch_interval = 10 173 | 174 | # Whether to persist DAG files code in DB. 
175 | # If set to True, Webserver reads file contents from DB instead of 176 | # trying to access files in a DAG folder. 177 | # Example: store_dag_code = False 178 | # store_dag_code = 179 | 180 | # Maximum number of Rendered Task Instance Fields (Template Fields) per task to store 181 | # in the Database. 182 | # All the template_fields for each of Task Instance are stored in the Database. 183 | # Keeping this number small may cause an error when you try to view ``Rendered`` tab in 184 | # TaskInstance view for older tasks. 185 | max_num_rendered_ti_fields_per_task = 30 186 | 187 | # On each dagrun check against defined SLAs 188 | check_slas = True 189 | 190 | # Path to custom XCom class that will be used to store and resolve operators results 191 | # Example: xcom_backend = path.to.CustomXCom 192 | xcom_backend = airflow.models.xcom.BaseXCom 193 | 194 | # By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``, 195 | # if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module. 196 | lazy_load_plugins = True 197 | 198 | # By default Airflow providers are lazily-discovered (discovery and imports happen only when required). 199 | # Set it to False, if you want to discover providers whenever 'airflow' is invoked via cli or 200 | # loaded from module. 201 | lazy_discover_providers = True 202 | 203 | # Number of times the code should be retried in case of DB Operational Errors. 204 | # Not all transactions will be retried as it can cause undesired state. 205 | # Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``. 206 | max_db_retries = 3 207 | 208 | [logging] 209 | # The folder where airflow should store its log files 210 | # This path must be absolute 211 | base_log_folder = /opt/airflow/logs 212 | 213 | # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. 214 | # Set this to True if you want to enable remote logging. 215 | remote_logging = False 216 | 217 | # Users must supply an Airflow connection id that provides access to the storage 218 | # location. 219 | remote_log_conn_id = 220 | 221 | # Path to Google Credential JSON file. If omitted, authorization based on `the Application Default 222 | # Credentials 223 | # `__ will 224 | # be used. 225 | google_key_path = 226 | 227 | # Storage bucket URL for remote logging 228 | # S3 buckets should start with "s3://" 229 | # Cloudwatch log groups should start with "cloudwatch://" 230 | # GCS buckets should start with "gs://" 231 | # WASB buckets should start with "wasb" just to help Airflow select correct handler 232 | # Stackdriver logs should start with "stackdriver://" 233 | remote_base_log_folder = 234 | 235 | # Use server-side encryption for logs stored in S3 236 | encrypt_s3_logs = False 237 | 238 | # Logging level 239 | logging_level = INFO 240 | 241 | # Logging level for Flask-appbuilder UI 242 | fab_logging_level = WARN 243 | 244 | # Logging class 245 | # Specify the class that will specify the logging configuration 246 | # This class has to be on the python classpath 247 | # Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG 248 | logging_config_class = 249 | 250 | # Flag to enable/disable Colored logs in Console 251 | # Colour the logs when the controlling terminal is a TTY. 
252 | colored_console_log = True 253 | 254 | # Log format for when Colored logs is enabled 255 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s 256 | colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter 257 | 258 | # Format of Log line 259 | log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s 260 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s 261 | 262 | # Specify prefix pattern like mentioned below with stream handler TaskHandlerWithCustomFormatter 263 | # Example: task_log_prefix_template = {ti.dag_id}-{ti.task_id}-{execution_date}-{try_number} 264 | task_log_prefix_template = 265 | 266 | # Formatting for how airflow generates file names/paths for each task run. 267 | log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log 268 | 269 | # Formatting for how airflow generates file names for log 270 | log_processor_filename_template = {{ filename }}.log 271 | 272 | # full path of dag_processor_manager logfile 273 | dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log 274 | 275 | # Name of handler to read task instance logs. 276 | # Defaults to use ``task`` handler. 277 | task_log_reader = task 278 | 279 | # A comma\-separated list of third-party logger names that will be configured to print messages to 280 | # consoles\. 281 | # Example: extra_loggers = connexion,sqlalchemy 282 | extra_loggers = 283 | 284 | [metrics] 285 | 286 | # StatsD (https://github.com/etsy/statsd) integration settings. 287 | # Enables sending metrics to StatsD. 288 | statsd_on = False 289 | statsd_host = localhost 290 | statsd_port = 8125 291 | statsd_prefix = airflow 292 | 293 | # If you want to avoid sending all the available metrics to StatsD, 294 | # you can configure an allow list of prefixes (comma separated) to send only the metrics that 295 | # start with the elements of the list (e.g: "scheduler,executor,dagrun") 296 | statsd_allow_list = 297 | 298 | # A function that validate the statsd stat name, apply changes to the stat name if necessary and return 299 | # the transformed stat name. 300 | # 301 | # The function should have the following signature: 302 | # def func_name(stat_name: str) -> str: 303 | stat_name_handler = 304 | 305 | # To enable datadog integration to send airflow metrics. 306 | statsd_datadog_enabled = False 307 | 308 | # List of datadog tags attached to all metrics(e.g: key1:value1,key2:value2) 309 | statsd_datadog_tags = 310 | 311 | # If you want to utilise your own custom Statsd client set the relevant 312 | # module path below. 313 | # Note: The module path must exist on your PYTHONPATH for Airflow to pick it up 314 | # statsd_custom_client_path = 315 | 316 | [secrets] 317 | # Full class name of secrets backend to enable (will precede env vars and metastore in search path) 318 | # Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend 319 | backend = 320 | 321 | # The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. 322 | # See documentation for the secrets backend you are using. JSON is expected. 
323 | # Example for AWS Systems Manager ParameterStore: 324 | # ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}`` 325 | backend_kwargs = 326 | 327 | [cli] 328 | # In what way should the cli access the API. The LocalClient will use the 329 | # database directly, while the json_client will use the api running on the 330 | # webserver 331 | api_client = airflow.api.client.local_client 332 | 333 | # If you set web_server_url_prefix, do NOT forget to append it here, ex: 334 | # ``endpoint_url = http://localhost:8080/myroot`` 335 | # So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` 336 | endpoint_url = http://localhost:8080 337 | 338 | [debug] 339 | # Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first 340 | # failed task. Helpful for debugging purposes. 341 | fail_fast = False 342 | 343 | [api] 344 | # Enables the deprecated experimental API. Please note that these APIs do not have access control. 345 | # The authenticated user has full access. 346 | # 347 | # .. warning:: 348 | # 349 | # This `Experimental REST API `__ is 350 | # deprecated since version 2.0. Please consider using 351 | # `the Stable REST API `__. 352 | # For more information on migration, see 353 | # `UPDATING.md `_ 354 | enable_experimental_api = False 355 | 356 | # How to authenticate users of the API. See 357 | # https://airflow.apache.org/docs/stable/security.html for possible values. 358 | # ("airflow.api.auth.backend.default" allows all requests for historic reasons) 359 | auth_backend = airflow.api.auth.backend.deny_all 360 | 361 | # Used to set the maximum page limit for API requests 362 | maximum_page_limit = 100 363 | 364 | # Used to set the default page limit when limit is zero. A default limit 365 | # of 100 is set on OpenApi spec. However, this particular default limit 366 | # only work when limit is set equal to zero(0) from API requests. 367 | # If no limit is supplied, the OpenApi spec default is used. 368 | fallback_page_limit = 100 369 | 370 | # The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested. 371 | # Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com 372 | google_oauth2_audience = 373 | 374 | # Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on 375 | # `the Application Default Credentials 376 | # `__ will 377 | # be used. 378 | # Example: google_key_path = /files/service-account-json 379 | google_key_path = 380 | 381 | [lineage] 382 | # what lineage backend to use 383 | backend = 384 | 385 | [atlas] 386 | sasl_enabled = False 387 | host = 388 | port = 21000 389 | username = 390 | password = 391 | 392 | [operators] 393 | # The default owner assigned to each new operator, unless 394 | # provided explicitly or passed via ``default_args`` 395 | default_owner = airflow 396 | default_cpus = 1 397 | default_ram = 512 398 | default_disk = 512 399 | default_gpus = 0 400 | 401 | # Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator. 402 | # If set to False, an exception will be thrown, otherwise only the console message will be displayed. 
403 | allow_illegal_arguments = False 404 | 405 | [hive] 406 | # Default mapreduce queue for HiveOperator tasks 407 | default_hive_mapred_queue = 408 | 409 | # Template for mapred_job_name in HiveOperator, supports the following named parameters 410 | # hostname, dag_id, task_id, execution_date 411 | # mapred_job_name_template = 412 | 413 | [webserver] 414 | # The base url of your website as airflow cannot guess what domain or 415 | # cname you are using. This is used in automated emails that 416 | # airflow sends to point links to the right web server 417 | base_url = http://localhost:8080 418 | 419 | # Default timezone to display all dates in the UI, can be UTC, system, or 420 | # any IANA timezone string (e.g. Europe/Amsterdam). If left empty the 421 | # default value of core/default_timezone will be used 422 | # Example: default_ui_timezone = America/New_York 423 | default_ui_timezone = UTC 424 | 425 | # The ip specified when starting the web server 426 | web_server_host = 0.0.0.0 427 | 428 | # The port on which to run the web server 429 | web_server_port = 8080 430 | 431 | # Paths to the SSL certificate and key for the web server. When both are 432 | # provided SSL will be enabled. This does not change the web server port. 433 | web_server_ssl_cert = 434 | 435 | # Paths to the SSL certificate and key for the web server. When both are 436 | # provided SSL will be enabled. This does not change the web server port. 437 | web_server_ssl_key = 438 | 439 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 440 | web_server_master_timeout = 120 441 | 442 | # Number of seconds the gunicorn webserver waits before timing out on a worker 443 | web_server_worker_timeout = 120 444 | 445 | # Number of workers to refresh at a time. When set to 0, worker refresh is 446 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 447 | # bringing up new ones and killing old ones. 448 | worker_refresh_batch_size = 1 449 | 450 | # Number of seconds to wait before refreshing a batch of workers. 451 | worker_refresh_interval = 30 452 | 453 | # If set to True, Airflow will track files in plugins_folder directory. When it detects changes, 454 | # then reload the gunicorn. 455 | reload_on_plugin_change = False 456 | 457 | # Secret key used to run your flask app 458 | # It should be as random as possible 459 | secret_key = JK3PU6syfBItlK8mgHrYnA== 460 | 461 | # Number of workers to run the Gunicorn web server 462 | workers = 4 463 | 464 | # The worker class gunicorn should use. Choices include 465 | # sync (default), eventlet, gevent 466 | worker_class = sync 467 | 468 | # Log files for the gunicorn webserver. '-' means log to stderr. 469 | access_logfile = - 470 | 471 | # Log files for the gunicorn webserver. '-' means log to stderr. 472 | error_logfile = - 473 | 474 | # Access log format for gunicorn webserver. 475 | # default format is %%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s" 476 | # documentation - https://docs.gunicorn.org/en/stable/settings.html#access-log-format 477 | access_logformat = 478 | 479 | # Expose the configuration file in the web server 480 | expose_config = False 481 | 482 | # Expose hostname in the web server 483 | expose_hostname = True 484 | 485 | # Expose stacktrace in the web server 486 | expose_stacktrace = True 487 | 488 | # Default DAG view. Valid values are: ``tree``, ``graph``, ``duration``, ``gantt``, ``landing_times`` 489 | dag_default_view = tree 490 | 491 | # Default DAG orientation. 
491 | # Default DAG orientation. Valid values are:
492 | # ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top)
493 | dag_orientation = LR
494 |
495 | # Puts the webserver in demonstration mode; blurs the names of Operators for
496 | # privacy.
497 | demo_mode = False
498 |
499 | # The amount of time (in secs) webserver will wait for initial handshake
500 | # while fetching logs from other worker machine
501 | log_fetch_timeout_sec = 5
502 |
503 | # Time interval (in secs) to wait before next log fetching.
504 | log_fetch_delay_sec = 2
505 |
506 | # Distance away from page bottom to enable auto tailing.
507 | log_auto_tailing_offset = 30
508 |
509 | # Animation speed for auto tailing log display.
510 | log_animation_speed = 1000
511 |
512 | # By default, the webserver shows paused DAGs. Flip this to hide paused
513 | # DAGs by default
514 | hide_paused_dags_by_default = False
515 |
516 | # Consistent page size across all listing views in the UI
517 | page_size = 100
518 |
519 | # Define the color of navigation bar
520 | navbar_color = #fff
521 |
522 | # Default dagrun to show in UI
523 | default_dag_run_display_number = 25
524 |
525 | # Enable werkzeug ``ProxyFix`` middleware for reverse proxy
526 | enable_proxy_fix = False
527 |
528 | # Number of values to trust for ``X-Forwarded-For``.
529 | # More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/
530 | proxy_fix_x_for = 1
531 |
532 | # Number of values to trust for ``X-Forwarded-Proto``
533 | proxy_fix_x_proto = 1
534 |
535 | # Number of values to trust for ``X-Forwarded-Host``
536 | proxy_fix_x_host = 1
537 |
538 | # Number of values to trust for ``X-Forwarded-Port``
539 | proxy_fix_x_port = 1
540 |
541 | # Number of values to trust for ``X-Forwarded-Prefix``
542 | proxy_fix_x_prefix = 1
543 |
544 | # Set secure flag on session cookie
545 | cookie_secure = False
546 |
547 | # Set samesite policy on session cookie
548 | cookie_samesite = Lax
549 |
550 | # Default setting for wrap toggle on DAG code and TI log views.
551 | default_wrap = False
552 |
553 | # Allow the UI to be rendered in a frame
554 | x_frame_enabled = True
555 |
556 | # Send anonymous user activity to your analytics tool
557 | # choose from google_analytics, segment, or metarouter
558 | # analytics_tool =
559 |
560 | # Unique ID of your account in the analytics tool
561 | # analytics_id =
562 |
563 | # 'Recent Tasks' stats will show for old DagRuns if set
564 | show_recent_stats_for_completed_runs = True
565 |
566 | # Update FAB permissions and sync security manager roles
567 | # on webserver startup
568 | update_fab_perms = True
569 |
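# Illustrative example, not part of the stock Airflow config: when this webserver
# sits behind a reverse proxy (e.g. nginx or traefik) that sets the X-Forwarded-*
# headers, a typical combination of the settings above would look roughly like
# (hypothetical hostname):
#   base_url = https://airflow.example.com
#   enable_proxy_fix = True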
570 | # The UI cookie lifetime in minutes. User will be logged out from UI after
571 | # ``session_lifetime_minutes`` of non-activity
572 | session_lifetime_minutes = 43200
573 |
574 | [email]
575 |
576 | # Configuration of the email backend and whether to
577 | # send email alerts on retry or failure
578 | # Email backend to use
579 | email_backend = airflow.utils.email.send_email_smtp
580 |
581 | # Whether email alerts should be sent when a task is retried
582 | default_email_on_retry = True
583 |
584 | # Whether email alerts should be sent when a task fails
585 | default_email_on_failure = True
586 |
587 | [smtp]
588 |
589 | # If you want airflow to send emails on retries or failures, and you want to use
590 | # the airflow.utils.email.send_email_smtp function, you have to configure an
591 | # smtp server here
592 | smtp_host = localhost
593 | smtp_starttls = True
594 | smtp_ssl = False
595 | # Example: smtp_user = airflow
596 | # smtp_user =
597 | # Example: smtp_password = airflow
598 | # smtp_password =
599 | smtp_port = 25
600 | smtp_mail_from = airflow@example.com
601 | smtp_timeout = 30
602 | smtp_retry_limit = 5
603 |
604 | [sentry]
605 |
606 | # Sentry (https://docs.sentry.io) integration. Here you can supply
607 | # additional configuration options based on the Python platform. See:
608 | # https://docs.sentry.io/error-reporting/configuration/?platform=python.
609 | # Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``,
610 | # ``ignore_errors``, ``before_breadcrumb``, ``before_send``, ``transport``.
611 | # Enable error reporting to Sentry
612 | sentry_on = false
613 | sentry_dsn =
614 |
615 | [celery_kubernetes_executor]
616 |
617 | # This section only applies if you are using the ``CeleryKubernetesExecutor`` in
618 | # ``[core]`` section above
619 | # Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``.
620 | # When the queue of a task is ``kubernetes_queue``, the task is executed via ``KubernetesExecutor``,
621 | # otherwise via ``CeleryExecutor``
622 | kubernetes_queue = kubernetes
623 |
624 | [celery]
625 |
626 | # This section only applies if you are using the CeleryExecutor in
627 | # ``[core]`` section above
628 | # The app name that will be used by celery
629 | celery_app_name = airflow.executors.celery_executor
630 |
631 | # The concurrency that will be used when starting workers with the
632 | # ``airflow celery worker`` command. This defines the number of task instances that
633 | # a worker will take, so size up your workers based on the resources on
634 | # your worker box and the nature of your tasks
635 | worker_concurrency = 8
636 |
637 | # The maximum and minimum concurrency that will be used when starting workers with the
638 | # ``airflow celery worker`` command (always keep minimum processes, but grow
639 | # to maximum if necessary). Note the value should be max_concurrency,min_concurrency
640 | # Pick these numbers based on resources on worker box and the nature of the task.
641 | # If autoscale option is available, worker_concurrency will be ignored.
642 | # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale
643 | # Example: worker_autoscale = 16,12
644 | # worker_autoscale =
645 |
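# Illustrative note, not part of the stock Airflow config: the concurrency above
# can also be supplied per worker at start-up; assuming the standard
# ``airflow celery worker`` CLI flags, something like:
#   airflow celery worker --concurrency 8 --queues default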
646 | # Used to increase the number of tasks that a worker prefetches which can improve performance.
647 | # The number of processes multiplied by worker_prefetch_multiplier is the number of tasks
648 | # that are prefetched by a worker. A value greater than 1 can result in tasks being unnecessarily
649 | # blocked if there are multiple workers and one worker prefetches tasks that sit behind long
650 | # running tasks while another worker has unutilized processes that are unable to process the already
651 | # claimed blocked tasks.
652 | # https://docs.celeryproject.org/en/stable/userguide/optimizing.html#prefetch-limits
653 | # Example: worker_prefetch_multiplier = 1
654 | # worker_prefetch_multiplier =
655 |
656 | # When you start an airflow worker, airflow starts a tiny web server
657 | # subprocess to serve the workers' local log files to the airflow main
658 | # web server, which then builds pages and sends them to users. This defines
659 | # the port on which the logs are served. It needs to be unused, and
660 | # visible from the main web server, which connects to the workers on it.
661 | worker_log_server_port = 8793
662 |
663 | # Umask that will be used when starting workers with the ``airflow celery worker``
664 | # in daemon mode. This controls the file-creation mode mask which determines the initial
665 | # value of file permission bits for newly created files.
666 | worker_umask = 0o077
667 |
668 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally
669 | # a sqlalchemy database. Refer to the Celery documentation for more information.
670 | broker_url = redis://redis:6379/0
671 |
672 | # The Celery result_backend. When a job finishes, it needs to update the
673 | # metadata of the job. Therefore it will post a message on a message bus,
674 | # or insert it into a database (depending on the backend)
675 | # This status is used by the scheduler to update the state of the task
676 | # The use of a database is highly recommended
677 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings
678 | result_backend = db+postgresql://postgres:airflow@postgres/airflow
679 |
680 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start
681 | # it ``airflow celery flower``. This defines the IP that Celery Flower runs on
682 | flower_host = 0.0.0.0
683 |
684 | # The root URL for Flower
685 | # Example: flower_url_prefix = /flower
686 | flower_url_prefix =
687 |
688 | # This defines the port that Celery Flower runs on
689 | flower_port = 5555
690 |
691 | # Securing Flower with Basic Authentication
692 | # Accepts user:password pairs separated by a comma
693 | # Example: flower_basic_auth = user1:password1,user2:password2
694 | flower_basic_auth =
695 |
696 | # Default queue that tasks get assigned to and that workers listen on.
697 | default_queue = default
698 |
699 | # How many processes CeleryExecutor uses to sync task state.
700 | # 0 means to use max(1, number of cores - 1) processes.
701 | sync_parallelism = 0
702 |
703 | # Import path for celery configuration options
704 | celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG
705 | ssl_active = False
706 | ssl_key =
707 | ssl_cert =
708 | ssl_cacert =
709 |
710 | # Celery Pool implementation.
711 | # Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``.
712 | # See:
713 | # https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency
714 | # https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html
715 | pool = prefork
716 |
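# Illustrative note, not part of the stock Airflow config: the ``redis`` and
# ``postgres`` hostnames in the broker and result backend URLs above are
# presumably the service names from this project's docker-compose setup. If a
# RabbitMQ broker were used instead, the URL would look roughly like
# (hypothetical credentials/host):
#   broker_url = amqp://user:password@rabbitmq:5672//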
717 | # The number of seconds to wait before timing out ``send_task_to_executor`` or
718 | # ``fetch_celery_task_state`` operations.
719 | operation_timeout = 1.0
720 |
721 | # A Celery task will report its status as 'started' when the task is executed by a worker.
722 | # This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted
723 | # or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob.
724 | task_track_started = True
725 |
726 | # Time in seconds after which adopted tasks are cleared by CeleryExecutor. This is helpful to clear
727 | # stalled tasks.
728 | task_adoption_timeout = 600
729 |
730 | # The maximum number of retries for publishing task messages to the broker when failing
731 | # due to an ``AirflowTaskTimeout`` error, before giving up and marking the task as failed.
732 | task_publish_max_retries = 3
733 |
734 | # Worker initialisation check to validate Metadata Database connection
735 | worker_precheck = False
736 |
737 | [celery_broker_transport_options]
738 |
739 | # This section is for specifying options which can be passed to the
740 | # underlying celery broker transport. See:
741 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options
742 | # The visibility timeout defines the number of seconds to wait for the worker
743 | # to acknowledge the task before the message is redelivered to another worker.
744 | # Make sure to increase the visibility timeout to match the time of the longest
745 | # ETA you're planning to use.
746 | # visibility_timeout is only supported for Redis and SQS celery brokers.
747 | # See:
748 | # http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options
749 | # Example: visibility_timeout = 21600
750 | # visibility_timeout =
751 |
752 | [dask]
753 |
754 | # This section only applies if you are using the DaskExecutor in
755 | # [core] section above
756 | # The IP address and port of the Dask cluster's scheduler.
757 | cluster_address = 127.0.0.1:8786
758 |
759 | # TLS/SSL settings to access a secured Dask scheduler.
760 | tls_ca =
761 | tls_cert =
762 | tls_key =
763 |
764 | [scheduler]
765 | # Task instances listen for external kill signal (when you clear tasks
766 | # from the CLI or the UI), this defines the frequency at which they should
767 | # listen (in seconds).
768 | job_heartbeat_sec = 5
769 |
770 | # How often (in seconds) to check and tidy up 'running' TaskInstances
771 | # that no longer have a matching DagRun
772 | clean_tis_without_dagrun_interval = 15.0
773 |
774 | # The scheduler constantly tries to trigger new tasks (look at the
775 | # scheduler section in the docs for more information). This defines
776 | # how often the scheduler should run (in seconds).
777 | scheduler_heartbeat_sec = 5
778 |
779 | # The number of times to try to schedule each DAG file
780 | # -1 indicates unlimited number
781 | num_runs = -1
782 |
783 | # The number of seconds to wait between consecutive DAG file processing
784 | processor_poll_interval = 1
785 |
786 | # After how much time (in seconds) new DAGs should be picked up from the filesystem
787 | min_file_process_interval = 0
788 |
789 | # How often (in seconds) to scan the DAGs directory for new files. Defaults to 5 minutes.
790 | dag_dir_list_interval = 300
791 |
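# Illustrative note, not part of the stock Airflow config: the scheduler knobs
# above can also be set per environment via AIRFLOW__SCHEDULER__* variables, e.g.
# to re-parse DAG files less aggressively than the 0-second default above
# (hypothetical value):
#   AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=30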
792 | # How often should stats be printed to the logs. Setting to 0 will disable printing stats
793 | print_stats_interval = 30
794 |
795 | # How often (in seconds) should pool usage stats be sent to statsd (if statsd_on is enabled)
796 | pool_metrics_interval = 5.0
797 |
798 | # If the last scheduler heartbeat happened more than scheduler_health_check_threshold
799 | # ago (in seconds), the scheduler is considered unhealthy.
800 | # This is used by the health check in the "/health" endpoint
801 | scheduler_health_check_threshold = 30
802 |
803 | # How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs
804 | orphaned_tasks_check_interval = 300.0
805 | child_process_log_directory = /opt/airflow/logs/scheduler
806 |
807 | # Local task jobs periodically heartbeat to the DB. If the job has
808 | # not sent a heartbeat in this many seconds, the scheduler will mark the
809 | # associated task instance as failed and will re-schedule the task.
810 | scheduler_zombie_task_threshold = 300
811 |
812 | # Turn off scheduler catchup by setting this to ``False``.
813 | # Default behavior is unchanged and
814 | # Command Line Backfills still work, but the scheduler
815 | # will not do scheduler catchup if this is ``False``.
816 | # However, it can be set on a per-DAG basis in the
817 | # DAG definition (catchup)
818 | catchup_by_default = True
819 |
820 | # This changes the batch size of queries in the scheduling main loop.
821 | # If this is too high, SQL query performance may be impacted by one
822 | # or more of the following:
823 | # - reversion to full table scan
824 | # - complexity of query predicate
825 | # - excessive locking
826 | # Additionally, you may hit the maximum allowable query length for your db.
827 | # Set this to 0 for no limit (not advised)
828 | max_tis_per_query = 512
829 |
830 | # Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries.
831 | # If this is set to False then you should not run more than a single
832 | # scheduler at once
833 | use_row_level_locking = True
834 |
835 | # Max number of DAGs to create DagRuns for per scheduler loop
836 | #
837 | # Default: 10
838 | # max_dagruns_to_create_per_loop =
839 |
840 | # How many DagRuns should a scheduler examine (and lock) when scheduling
841 | # and queuing tasks.
842 | #
843 | # Default: 20
844 | # max_dagruns_per_loop_to_schedule =
845 |
846 | # Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the
847 | # same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other
848 | # dags in some circumstances
849 | #
850 | # Default: True
851 | # schedule_after_task_execution =
852 |
853 | # The scheduler can run multiple processes in parallel to parse dags.
854 | # This defines how many processes will run.
855 | parsing_processes = 2
856 |
857 | # Turn off scheduler use of cron intervals by setting this to False.
858 | # DAGs submitted manually in the web UI or with trigger_dag will still run.
859 | use_job_schedule = True
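# Illustrative note, not part of the stock Airflow config: ``catchup_by_default``
# above can be overridden per DAG at definition time, e.g. (hypothetical DAG):
#   DAG("my_dag", schedule_interval="@daily", catchup=False, ...)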
860 |
861 | # Allow externally triggered DagRuns for Execution Dates in the future
862 | # Only has effect if schedule_interval is set to None in DAG
863 | allow_trigger_in_future = False
864 |
865 | [kerberos]
866 | ccache = /tmp/airflow_krb5_ccache
867 |
868 | # gets augmented with fqdn
869 | principal = airflow
870 | reinit_frequency = 3600
871 | kinit_path = kinit
872 | keytab = airflow.keytab
873 |
874 | [github_enterprise]
875 | api_rev = v3
876 |
877 | [admin]
878 | # UI to hide sensitive variable fields when set to True
879 | hide_sensitive_variable_fields = True
880 |
881 | # A comma-separated list of sensitive keywords to look for in variable names.
882 | sensitive_variable_fields =
883 |
884 | [elasticsearch]
885 | # Elasticsearch host
886 | host =
887 |
888 | # Format of the log_id, which is used to query for a given task's logs
889 | log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number}
890 |
891 | # Used to mark the end of a log stream for a task
892 | end_of_log_mark = end_of_log
893 |
894 | # Qualified URL for an elasticsearch frontend (like Kibana) with a template argument for log_id
895 | # Code will construct log_id using the log_id template from the argument above.
896 | # NOTE: The code will prefix the https:// automatically, don't include that here.
897 | frontend =
898 |
899 | # Write the task logs to the stdout of the worker, rather than the default files
900 | write_stdout = False
901 |
902 | # Instead of the default log formatter, write the log lines as JSON
903 | json_format = False
904 |
905 | # Log fields to also attach to the json output, if enabled
906 | json_fields = asctime, filename, lineno, levelname, message
907 |
908 | [elasticsearch_configs]
909 | use_ssl = False
910 | verify_certs = True
911 |
912 | [kubernetes]
913 | # Path to the YAML pod file. If set, all other kubernetes-related fields are ignored.
914 | pod_template_file =
915 |
916 | # The repository of the Kubernetes Image for the Worker to Run
917 | worker_container_repository =
918 |
919 | # The tag of the Kubernetes Image for the Worker to Run
920 | worker_container_tag =
921 |
922 | # The Kubernetes namespace where airflow workers should be created. Defaults to ``default``
923 | namespace = default
924 |
925 | # If True, all worker pods will be deleted upon termination
926 | delete_worker_pods = True
927 |
928 | # If False (and delete_worker_pods is True),
929 | # failed worker pods will not be deleted so users can investigate them.
930 | delete_worker_pods_on_failure = False
931 |
932 | # Number of Kubernetes Worker Pod creation calls per scheduler loop.
933 | # Note that the current default of "1" will only launch a single pod
934 | # per-heartbeat. It is HIGHLY recommended that users increase this
935 | # number to match the tolerance of their kubernetes cluster for
936 | # better performance.
937 | worker_pods_creation_batch_size = 1
938 |
939 | # Allows users to launch pods in multiple namespaces.
940 | # Will require creating a cluster-role for the scheduler
941 | multi_namespace_mode = False
942 |
943 | # Use the service account kubernetes gives to pods to connect to the kubernetes cluster.
944 | # It's intended for clients that expect to be running inside a pod running on kubernetes.
945 | # It will raise an exception if called from a process not running in a kubernetes environment.
946 | in_cluster = True
947 |
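# Illustrative example, not part of the stock Airflow config: if the
# KubernetesExecutor were enabled, the worker image settings above would
# typically point at an official Airflow image matching the installed version,
# e.g. (assumed values):
#   worker_container_repository = apache/airflow
#   worker_container_tag = 2.0.1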
948 | # When running with in_cluster=False change the default cluster_context or config_file
949 | # options passed to the Kubernetes client. Leave these blank to use the default behaviour, like ``kubectl`` does.
950 | # cluster_context =
951 |
952 | # Path to the kubernetes configfile to be used when ``in_cluster`` is set to False
953 | # config_file =
954 |
955 | # Keyword parameters to pass when calling kubernetes client core_v1_api methods
956 | # from the Kubernetes Executor, provided as a single-line JSON dictionary string.
957 | # The list of supported params is similar for all core_v1_apis, hence a single config
958 | # variable for all apis. See:
959 | # https://raw.githubusercontent.com/kubernetes-client/python/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/api/core_v1_api.py
960 | kube_client_request_args =
961 |
962 | # Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client
963 | # ``core_v1_api`` method when using the Kubernetes Executor.
964 | # This should be an object and can contain any of the options listed in the ``v1DeleteOptions``
965 | # class defined here:
966 | # https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19
967 | # Example: delete_option_kwargs = {"grace_period_seconds": 10}
968 | delete_option_kwargs =
969 |
970 | # Enables TCP keepalive mechanism. This prevents Kubernetes API requests from hanging indefinitely
971 | # when an idle connection is timed out by services like cloud load balancers or firewalls.
972 | enable_tcp_keepalive = False
973 |
974 | # When the `enable_tcp_keepalive` option is enabled, TCP probes a connection that has
975 | # been idle for `tcp_keep_idle` seconds.
976 | tcp_keep_idle = 120
977 |
978 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond
979 | # to a keepalive probe, TCP retransmits the probe after `tcp_keep_intvl` seconds.
980 | tcp_keep_intvl = 30
981 |
982 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond
983 | # to a keepalive probe, TCP retransmits the probe `tcp_keep_cnt` number of times before
984 | # a connection is considered to be broken.
985 | tcp_keep_cnt = 6
986 |
987 | [smart_sensor]
988 | # When `use_smart_sensor` is True, Airflow redirects multiple qualified sensor tasks to
989 | # a smart sensor task.
990 | use_smart_sensor = False
991 |
992 | # `shard_code_upper_limit` is the upper limit of `shard_code` value. The `shard_code` is generated
993 | # by `hashcode % shard_code_upper_limit`.
994 | shard_code_upper_limit = 10000
995 |
996 | # The number of running smart sensor processes for each service.
997 | shards = 5
998 |
999 | # Comma-separated list of sensor classes supported by smart_sensor.
1000 | sensors_enabled = NamedHivePartitionSensor
--------------------------------------------------------------------------------