├── ETL-Data-pipelines
│   ├── airflow-data
│   │   ├── creds
│   │   │   └── s3
│   │   └── airflow.cfg
│   ├── Dockerfile
│   ├── sparkFiles
│   │   └── sparkProcess.py
│   ├── docker-compose.yml
│   └── dags
│       └── dagRun.py
├── archiveweb.conf
├── archiveweb.psgi
├── LICENSE
├── stk.patch
└── README.md
--------------------------------------------------------------------------------
/ETL-Data-pipelines/airflow-data/creds/s3:
--------------------------------------------------------------------------------
1 | [airflow-spark1]
2 | aws_access_key_id =
3 | aws_secret_access_key =
4 |
--------------------------------------------------------------------------------
/archiveweb.conf:
--------------------------------------------------------------------------------
1 | # rename this file to archiveweb.yml and put a ':' after 'name' if
2 | # you want to use YAML like in old versions of Catalyst
3 | name ArchiveWeb
--------------------------------------------------------------------------------
/archiveweb.psgi:
--------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 |
4 | use ArchiveWeb;
5 |
6 | my $app = ArchiveWeb->apply_default_middlewares(ArchiveWeb->psgi_app);
7 | $app;
8 |
9 |
--------------------------------------------------------------------------------
/ETL-Data-pipelines/Dockerfile:
--------------------------------------------------------------------------------
1 | # declare the build arg so the image in FROM can be overridden with --build-arg AIRFLOW_IMAGE_NAME=...
2 | ARG AIRFLOW_IMAGE_NAME
3 | FROM ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.0.1}
4 |
5 | USER root
6 | # Install a headless OpenJDK 11 JRE (needed by Spark/PySpark)
7 | RUN apt-get update && \
8 |     apt-get install -y openjdk-11-jre-headless && \
9 |     apt-get clean
10 |
11 | USER airflow
12 | RUN pip install --upgrade pip
13 |
14 | COPY requirements.txt /opt/airflow
15 | WORKDIR /opt/airflow
16 | RUN pip install -r requirements.txt
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Ramesh chinnaraj
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /stk.patch: -------------------------------------------------------------------------------- 1 | diff --git a/bin/file-source-loader b/bin/file-source-loader 2 | index c576631..17abcee 100755 3 | --- a/bin/file-source-loader 4 | +++ b/bin/file-source-loader 5 | @@ -345,6 +345,54 @@ sub process_file_sinks { 6 | } 7 | } 8 | 9 | +sub process_file_transforms { 10 | + my($file_source, $file) = @_; 11 | + 12 | + # get the details of file transforms that are yet to be processed for this file 13 | + my $file_transforms = get_pending_file_transforms($file_source->{id}, $file->{id}); 14 | + 15 | +#FIXME 16 | + for my $file_transform_id (sort keys %$file_sinks) { 17 | + my $file_sink = $file_sinks->{$file_sink_id}; 18 | + my $src_path = get_archived_file_path($file_source, $file); 19 | +# my $dst_path = transform_filename($file_source, $file, 20 | +# $file_sink->{filename_transform}); 21 | +# 22 | +# # Remove the password component from the URL for display 23 | +# my $url = URI->new($file_sink->{url}); 24 | +# if ($url->scheme eq "ftp") { 25 | +# $url->userinfo($url->user); 26 | +# } 27 | +# msg_file_source( 28 | +# $file_source, 29 | +# "uploading file '$src_path' to '$url/$dst_path'", 30 | +# LOG_INFO 31 | +# ); 32 | +# 33 | +# # connect to FTP server 34 | +# my $ftp = connect_ftp($file_source, $file_sink->{url}); 35 | +# 36 | +# if (!$options{test}) { 37 | +# 38 | +# # upload file 39 | +# $ftp->put($src_path, $dst_path) or 40 | +# msg_file_source( 41 | +# $file_source, 42 | +# "FTP put of '$src_path' failed: " . $ftp->message, 43 | +# LOG_CRIT 44 | +# ); 45 | +# 46 | +# # mark this file_sink as done for this file 47 | +# $dbh->do(' 48 | +# INSERT INTO file_sink_file_processed 49 | +# (file_sink_id, file_id) 50 | +# VALUES (?, ?); 51 | +# ', {}, $file_sink_id, $file->{id}); 52 | +# } 53 | +# 54 | + } 55 | +} 56 | + 57 | sub mark_file_as_processed { 58 | my($file_id) = @_; 59 | 60 | @@ -401,6 +449,7 @@ sub process_file { 61 | 62 | process_file_sinks($file_source, $file); 63 | process_virtual_ftp_file_sinks($file_source, $file); 64 | + process_file_transforms($file_source, $file); 65 | 66 | # mark this file as processed 67 | # if process_file_sinks() and process_virtual_ftp_file_sinks() return, all 68 | -------------------------------------------------------------------------------- /ETL-Data-pipelines/sparkFiles/sparkProcess.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | import pyspark.sql.functions as F 3 | from pyspark.sql.window import Window 4 | from pyspark.sql.functions import lead, lag 5 | import os 6 | 7 | # the parsed data csv file 8 | parsedData = '/opt/airflow/sparkFiles/parsedData.csv' 9 | 10 | # start a spark session and set up its configuration 11 | spark = SparkSession \ 12 | .builder \ 13 | .appName("Pysparkexample") \ 14 | .config("spark.some.config.option", "some-value") \ 15 | .getOrCreate() 16 | 17 | # create a spark dataframe using the data in the csv 18 | df = spark.read.csv(parsedData, 19 | header='true', 20 | inferSchema='true', 21 | ignoreLeadingWhiteSpace=True, 22 | ignoreTrailingWhiteSpace=True) 23 | 24 | # list for columns subtractions 25 | colDiffs = [] 26 | # get only the county columns from the df columns list 27 | countyCols = df.columns[1:] 28 | # change the schema/type of the dateFor column from string to date 29 | df = df.withColumn("dateFor", F.to_date("dateFor", "yyyy-MM-dd")) 30 | # Window 
function spec to partition the df and sort it by Dates descending 31 | # The entire dataset is partitioned (no argument passed to partitionBy) as there are no dates that show multiple times. 32 | windowSpec = Window.partitionBy().orderBy(F.col('dateFor').desc()) 33 | # for each county column in the columns list 34 | for county in countyCols: 35 | # add a new column, countynameDiff, to the df containing the same numbers but shifted up by one using "lead" 36 | # E.g.: if a column X contains the numbers [1, 2, 3], applying the "lead" window function, with 1 as argument, will 37 | # shift everything up by 1 and the new XDiff column will contain [2, 3, none] 38 | df = df.withColumn(f'{county}Diff', lead(county, 1).over(windowSpec)) 39 | # add the subtraction to the list with the condition that if the calculated value is lower than 0, then save 0 40 | # this saves the subtraction formula in the list, not the result of the subtraction. 41 | # the header of the subtraction result column will be the same as the "county" by applying "alias" 42 | colDiffs.append(F.when((df[county] - df[f'{county}Diff']) < 0, 0) 43 | .otherwise(df[county] - df[f'{county}Diff']).alias(county)) 44 | # select the dateFor column and calculate the subtractions in the df, returning a new dataframe with the results 45 | result = df.select('dateFor', *colDiffs).fillna(0) 46 | # convert the result to a pandas dataframe and save it as a csv 47 | # warning: the conversion is executed in memory. Other methods might be better suited for large datasets 48 | result.toPandas().to_csv('/opt/airflow/sparkFiles/results.csv', 49 | sep=',', 50 | header=True, 51 | index=False) 52 | 53 | # delete the parsed data csv from the working directory 54 | os.remove(parsedData) 55 | -------------------------------------------------------------------------------- /ETL-Data-pipelines/docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:master-python3.8 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_GID - Group ID in Airflow containers 31 | # Default: 50000 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. 
33 | # Default: airflow
34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account.
35 | # Default: airflow
36 | #
37 | # Feel free to modify this file to suit your needs.
38 | #---
39 |
40 | version: '3'
41 | x-airflow-common:
42 |   &airflow-common
43 |   image: apache/airflow:2.0.1
44 |   environment:
45 |     &airflow-common-env
46 |     AIRFLOW__CORE__EXECUTOR: LocalExecutor
47 |     AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
48 |     AIRFLOW__CORE__FERNET_KEY: ''
49 |     AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
50 |     AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
51 |     AIRFLOW_CONN_AWS_DEFAULT: aws://?profile=airflow-spark1&s3_config_file=/opt/airflow/creds/s3&s3_config_format=aws
52 |   volumes:
53 |     - ./airflow-data/creds:/opt/airflow/creds
54 |     - ./dags:/opt/airflow/dags
55 |     - ./sparkFiles:/opt/airflow/sparkFiles
56 |     - ./airflow-data/logs:/opt/airflow/logs
57 |     - ./airflow-data/plugins:/opt/airflow/plugins
58 |     - ./airflow-data/airflow.cfg:/opt/airflow/airflow.cfg
59 |   user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}"
60 |   depends_on:
61 |     postgres:
62 |       condition: service_healthy
63 |
64 | services:
65 |   postgres:
66 |     image: postgres:13
67 |     environment:
68 |       POSTGRES_USER: airflow
69 |       POSTGRES_PASSWORD: airflow
70 |       POSTGRES_DB: airflow
71 |     volumes:
72 |       - postgres-db-volume:/var/lib/postgresql/data
73 |     healthcheck:
74 |       test: ["CMD", "pg_isready", "-U", "airflow"]
75 |       interval: 5s
76 |       retries: 5
77 |     restart: always
78 |
79 |   airflow-webserver:
80 |     build:
81 |       context: .
82 |       dockerfile: Dockerfile
83 |     <<: *airflow-common
84 |     command: webserver
85 |     ports:
86 |       - 8080:8080
87 |     healthcheck:
88 |       test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
89 |       interval: 10s
90 |       timeout: 10s
91 |       retries: 5
92 |     restart: always
93 |
94 |   airflow-scheduler:
95 |     <<: *airflow-common
96 |     command: scheduler
97 |     restart: always
98 |
99 |   airflow-init:
100 |     <<: *airflow-common
101 |     command: version
102 |     environment:
103 |       <<: *airflow-common-env
104 |       _AIRFLOW_DB_UPGRADE: 'true'
105 |       _AIRFLOW_WWW_USER_CREATE: 'true'
106 |       _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
107 |       _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
108 |
109 | volumes:
110 |   postgres-db-volume:
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ETL-for-Educational-Institutions-
2 |
3 | ## About
4 |
5 | An educational project on how to build an ETL (Extract, Transform, Load) data pipeline, orchestrated with Airflow.
6 |
7 | An AWS s3 bucket is used as a Data Lake in which json files are stored. The data is extracted from the json and parsed (cleaned). It is then transformed/processed with Spark (PySpark) and loaded/stored in either a MongoDB database or an Amazon Redshift Data Warehouse.
8 |
9 | The pipeline architecture - author's interpretation:
10 |
11 | ![image](https://user-images.githubusercontent.com/110036451/184507455-2ffd0d6f-3a9c-44fd-965b-05b14579cc1f.png)
12 |
13 |
14 | Note: Since this project was built for learning purposes and as an example, it functions only for a single scenario and data schema.
15 | The project is built in Python and has two main parts:
16 |
17 | - The Airflow DAG file, dags/dagRun.py, which orchestrates the data pipeline tasks.
18 | - The PySpark data transformation/processing script, located in sparkFiles/sparkProcess.py.
19 | Note: The code, and especially the comments, in the Python files dags/dagRun.py and sparkFiles/sparkProcess.py are intentionally verbose for a better understanding of the functionality.
20 | ## Scenario
21 | The Romanian COVID-19 data, provided by https://datelazi.ro/, contains COVID-19 data for each county, reported as cumulative totals from one day to the next. It does not contain the day-to-day difference in numbers (i.e. for county X there were 7 cases on day 1 and 37 cases on day 2). This data is loaded as a json file in the s3 bucket.
22 |
23 | The goal is to find the differences between days for all counties (i.e. for county X there were 30 more cases on day 2 than on day 1). If the difference is smaller than 0 (e.g. because of a data recording error), then the difference for that day should be 0.
24 |
25 | ## Base concepts
26 | - Data Engineering
27 | - ETL (Extract, Transform, Load)
28 | - Pipeline
29 | - Data Lake
30 | - Data Warehouse
31 | - Data Schema
32 | - Apache Airflow (wikipedia page)
33 | - Airflow DAG
34 | - Airflow XCom
35 | - Apache Spark, specifically the PySpark API (wikipedia page)
36 | - Amazon Web Services (AWS) (wikipedia page)
37 | - s3 (wikipedia page)
38 | - Redshift (Wikipedia page)
39 | - mongoDB (wikipedia page)
40 | ## Prerequisites
41 | - Docker
42 | - Docker Compose
43 | - AWS s3 bucket
44 | - mongoDB database
45 | - Amazon Redshift database
46 | ## Set-up
47 | Download / pull the repo to your desired location.
48 |
49 | You will have to create an AWS s3 user specifically for Airflow to interact with the s3 bucket. The credentials for that user will have to be saved in the s3 file found in the directory /airflow-data/creds:
50 |
51 |     [airflow-spark1]
52 |     aws_access_key_id =
53 |     aws_secret_access_key =
54 | On rows 16 and 17 in dags/dagRun.py you have the option to choose which database system to use, mongoDB (noSQL) or Amazon Redshift (RDBMS), just by commenting/uncommenting one or the other:
55 |
56 |     # database = 'mongoDB'
57 |     database = 'Redshift'
58 | If you want to use mongoDB, you will have to enter the mongoDB connection string (or an environment variable or file with the string) in the dags/dagRun.py file, line 22:
59 |
60 |     client = pymongo.MongoClient('mongoDB_connection_string')
61 | If you want to use a Redshift cluster, you will have to provide your Amazon Redshift database name, host and the rest of the credentials from row 29 to 34 in dags/dagRun.py:
62 |
63 |     dbname = 'testairflow'
64 |     host = '*******************************.eu-central-1.redshift.amazonaws.com'
65 |     port = '****'
66 |     user = '*********'
67 |     password = '********************'
68 |     awsIAMrole = 'arn:aws:iam::************:role/*******'
69 | You will have to change the s3 bucket name and file key (the name of the file saved in the s3 bucket) located at lines 148 and 150 in dags/dagRun.py:
70 |
71 |     # name of the file in the AWS s3 bucket
72 |     key = 'countyData.json'
73 |     # name of the AWS s3 bucket
74 |     bucket = 'renato-airflow-raw'
75 | In the repo directory, execute the following command, which will create the .env file containing the Airflow UID and GID needed by docker-compose (a sample of the resulting file is shown just after the Installation step below):
76 |
77 |     echo -e "AIRFLOW_UID=$(id -u)\nAIRFLOW_GID=0" > .env
78 |
79 |
80 | https://user-images.githubusercontent.com/19210522/114414670-b43ab980-9bb7-11eb-8ea8-061385b14980.gif
81 |
82 | ## Installation
83 | Start the installation with:
84 |
85 |     docker-compose up -d
86 | This command will pull and create the Docker images and containers for Airflow, according to the instructions in the docker-compose.yml file.
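As referenced in the Set-up section, the echo command generates a minimal .env file next to docker-compose.yml. A sketch of what it should contain (the UID value 1000 is only an example; yours is whatever `id -u` prints):

```
AIRFLOW_UID=1000
AIRFLOW_GID=0
```

If the file is missing, the containers simply fall back to the defaults declared in docker-compose.yml via `${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}`.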
87 |
88 |
89 |
90 | After everything has been installed, you can check the status of your containers (whether they are healthy) with:
91 |
92 |     docker ps
93 |
94 | ![image](https://user-images.githubusercontent.com/110036451/184507487-d0fa1f2e-5914-492a-88d7-3e1a260d7026.png)
95 |
96 |
97 | Note: it might take up to 30 seconds for the containers to get the healthy flag after starting.
98 |
99 |
100 |
101 | ## Airflow Interface
102 | You can now access the Airflow web interface by going to http://localhost:8080/. If you have not changed them in the docker-compose.yml file, the default user is airflow and the password is airflow:
103 |
104 |
105 | ![image](https://user-images.githubusercontent.com/110036451/184507507-ea8e1d34-b56b-4b19-b168-6d138546e02c.png)
106 |
107 |
108 | After signing in, the Airflow home page is the DAGs list page. Here you will see all your DAGs and the Airflow example DAGs, sorted alphabetically.
109 |
110 | Any DAG Python script saved in the directory dags/ will show up on the DAGs page (e.g. the first DAG, analyze_json_data, is the one built for this project).
111 |
112 | Note: If you update the code in the Python DAG script, the Airflow DAGs page has to be refreshed.
113 |
114 | Note: If you do not want to see any Airflow example DAGs, set the AIRFLOW__CORE__LOAD_EXAMPLES flag to 'false' in the docker-compose.yml file before starting the installation.
115 |
116 |
117 | ![image](https://user-images.githubusercontent.com/110036451/184507520-9f51ed71-65ee-4417-83a4-3e02659880fb.png)
118 |
119 |
120 | Click on the name of the DAG to open the DAG details page:
121 |
122 |
123 | ![image](https://user-images.githubusercontent.com/110036451/184507526-8719bd8b-9057-40f9-b808-44bdc3009855.png)
124 |
125 |
126 | On the Graph View page you can see the DAG running through each task (getLastProcessedDate, getDate, etc.) after it has been unpaused and triggered:
127 |
128 |
129 | https://user-images.githubusercontent.com/19210522/114459521-50c97f80-9be9-11eb-907a-3627a21d52dc.gif
130 |
131 | ## Pipeline Task by Task
132 |
133 | Task getLastProcessedDate
134 |
135 | Finds the last processed date in the database (mongoDB or Redshift, depending on the configuration) and pushes it to an Airflow XCom.
136 |
137 | Task getDate
138 |
139 | Grabs the date saved in the XCom and, depending on the value pulled, returns the task id parseJsonFile or the task id endRun.
140 |
141 | Task parseJsonFile
142 |
143 | The json contains unnecessary data for this case, so it needs to be parsed to extract only the daily total numbers for each county.
144 |
145 | If there is any new data to be processed (the date extracted in the task getLastProcessedDate is older than the dates in the data), it is saved in a temp file in the directory sparkFiles:
146 |
147 |
148 | ![image](https://user-images.githubusercontent.com/110036451/184507547-54a07928-f9ae-4880-88fe-d8939a359331.png)
149 |
150 |
151 | e.g.: for county AB, on the 7th of April there were 19046 COVID cases and on the 8th of April there were 19150 cases
152 |
153 | It returns the task id processParsedData if there is new data to process, or the task id endRun if there is not.
154 |
155 | Task processParsedData
156 | Executes the PySpark script sparkFiles/sparkProcess.py.
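To make the transformation concrete before reading the full script, here is a minimal, self-contained sketch of the same lead-window approach used in sparkFiles/sparkProcess.py. The single county column AB and the sample values are made up for illustration; the real script reads parsedData.csv and loops over every county column:

```python
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("diffSketch").getOrCreate()

# cumulative totals for one hypothetical county column "AB"
df = spark.createDataFrame(
    [("2021-04-07", 19046), ("2021-04-08", 19150), ("2021-04-09", 19140)],
    ["dateFor", "AB"],
)

# sort the dates descending and look one row ahead (i.e. at the previous day) with lead()
windowSpec = Window.partitionBy().orderBy(F.col("dateFor").desc())
df = df.withColumn("ABDiff", F.lead("AB", 1).over(windowSpec))

# day-over-day difference, clamped to 0 when the total decreases (data recording error)
result = df.select(
    "dateFor",
    F.when(F.col("AB") - F.col("ABDiff") < 0, 0)
     .otherwise(F.col("AB") - F.col("ABDiff"))
     .alias("AB"),
).fillna(0)

result.show()  # 2021-04-09 -> 0, 2021-04-08 -> 104, 2021-04-07 -> 0
```

The earliest date has no previous day to subtract, which is why the resulting null is filled with 0 before the script writes results.csv.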
157 |
158 | The parsed data is processed and the result is saved in another temporary file in the sparkFiles directory:
159 |
160 |
161 | ![image](https://user-images.githubusercontent.com/110036451/184507558-b33a3043-efa2-4cf0-a052-5bf219dd3004.png)
162 |
163 |
164 |
165 | e.g.: for county AB, on the 8th of April there were 104 more cases than on the 7th of April
166 |
167 | Task saveToDB
168 |
169 | Saves the processed data either in the mongoDB database:
170 |
171 |
172 | ![image](https://user-images.githubusercontent.com/110036451/184507569-004a67c6-c747-4bbf-80c2-9e2c467e0331.png)
173 |
174 |
175 |
176 | Or in Redshift:
177 |
178 |
179 | ![image](https://user-images.githubusercontent.com/110036451/184507596-f96cee55-4fbd-4812-bca6-c9e4381c4582.png)
180 |
181 |
182 | Note: The Redshift column names are the full names of the counties, as the short versions of some of them conflict with SQL reserved words.
183 |
184 | Task endRun
185 |
186 | A dummy task used as the end of the pipeline.
187 |
188 | ## Shut Down and Restart Airflow
189 | If you want to make changes to any of the configuration files (docker-compose.yml, Dockerfile, requirements.txt), you will have to shut down the Airflow instance with:
190 |
191 |     docker-compose down
192 |
193 | This command will shut down and delete any containers created/used by Airflow.
194 |
195 | For any changes made in the configuration files to be applied, you will have to rebuild the Airflow images with the command:
196 |
197 |     docker-compose build
198 |
199 | Recreate all the containers with:
200 |
201 |     docker-compose up -d
202 |
--------------------------------------------------------------------------------
/ETL-Data-pipelines/dags/dagRun.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 | import os
4 | # import boto3
5 | import datetime
6 | import pymongo
7 | from airflow.models import DAG
8 | from airflow.operators.bash_operator import BashOperator
9 | from airflow.operators.python_operator import PythonOperator
10 | from airflow.operators.python_operator import BranchPythonOperator
11 | from airflow.operators.dummy_operator import DummyOperator
12 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook
13 | import psycopg2
14 |
15 | # select the database system to be used: mongoDB (noSQL) or Amazon Redshift (RDBMS)
16 | # database = 'mongoDB'
17 | database = 'Redshift'
18 |
19 | if database == 'mongoDB':
20 |     # connect to the MONGO database
21 |     # the mongo DB connection string
22 |     client = pymongo.MongoClient('mongoDB_connection_string')
23 |     # the database to be used
24 |     db = client.testairflow
25 |
26 | else:
27 |     # Amazon Redshift database connection details
28 |     # the details below can also be saved in environment variables
29 |     dbname = 'testairflow'
30 |     host = '*******************************.eu-central-1.redshift.amazonaws.com'
31 |     port = '****'
32 |     user = '*********'
33 |     password = '********************'
34 |     awsIAMrole = 'arn:aws:iam::************:role/*******'
35 |
36 |
37 | def getDBdate(ti):
38 |     """
39 |     Args:
40 |         ti: task instance argument used by Airflow to push and pull xcom data
41 |
42 |     Pushes an xcom to the Airflow database with the date of the most recent db entry
43 |     (An xcom is a tool used to share small amounts of data between Airflow dag tasks. Similar to an Airflow variable, but
44 |     unlike Airflow variables, which are global and can be shared between multiple dags, an xcom shares data between the tasks
45 |     of a dag.)
46 | """ 47 | 48 | # try-except error handler: if the db aggregation fails, return None to not process the same data multiple times 49 | try: 50 | if database == 'mongoDB': 51 | # >>> use the MONGO database 52 | # Find the the latest database document 53 | # filter for the Mongo db aggregation: the key 'dateFor' has to be exist in the collection 54 | aggFilter = {'dateFor': {'$exists': True}} 55 | # in the collection/table (db.countyDiff), apply the filter to the aggregation, convert the dates 56 | # from str to datetime, sort descending and return one (the first one) 57 | dateDoc = list(db.countyDiff.aggregate([{'$match': aggFilter}, 58 | {'$project': { 59 | 'date': { 60 | '$dateFromString': { 61 | 'dateString': '$dateFor', 62 | 'format': '%Y-%m-%d'} 63 | } 64 | }}, 65 | {'$sort': {'date': -1}}, 66 | {'$limit': 1} 67 | ])) 68 | 69 | # try-except error handler to set base parsing start date if the aggregation above returns and empty 70 | # list (i.e.: no data found) 71 | try: 72 | fetchedDate = dateDoc[0]['date'].strftime('%Y-%m-%d') 73 | except: 74 | fetchedDate = '2020-01-01' 75 | 76 | else: 77 | # >>> use the AMAZON REDSHIFT database 78 | # set up the connection to the Redshift database 79 | conn = psycopg2.connect(f'dbname={dbname} host={host} port={port} user={user} password={password}') 80 | # start the database cursor 81 | cursor = conn.cursor() 82 | # grab the latest date from the counties collection 83 | sql = """SELECT dateFor FROM counties ORDER BY dateFor DESC LIMIT 1;""" 84 | # try-except error handler to return a parsing start date in case the cursor execute fails 85 | try: 86 | cursor.execute(sql) 87 | # convert the date from datetime to string 88 | fetchedDate = cursor.fetchall()[0][0].strftime('%Y-%m-%d') 89 | # close the connection and cursor 90 | cursor.close() 91 | conn.close() 92 | except: 93 | fetchedDate = '2020-01-01' 94 | cursor.close() 95 | conn.close() 96 | 97 | except: 98 | fetchedDate = None 99 | 100 | # push the task instance (key, value format) to an xcom 101 | ti.xcom_push(key='fetchedDate', value=fetchedDate) 102 | 103 | 104 | def getLastDate(ti): 105 | """ 106 | Pull the date from xcom and return one task id, or multiple task IDs if they are inside a list, to be executed 107 | Args: 108 | ti: task instance argument used by Airflow to push and pull xcom data 109 | 110 | Returns: the airflow task ID to be executed based on the xcom data 111 | """ 112 | 113 | # pull the xcom data (in this case a list containing only one element: the date string or None) 114 | fetchedDate = ti.xcom_pull(key='fetchedDate', task_ids=['getLastProcessedDate']) 115 | # if the date is None then execute the 'parseJsonFile' task, else execute the 'endRun' task 116 | if fetchedDate[0] is not None: 117 | return 'parseJsonFile' 118 | return 'endRun' 119 | 120 | 121 | def readJsonData(ti): 122 | """ 123 | Read and parse a Json, save the parsed data to a CSV 124 | Args: 125 | ti: task instance argument used by Airflow to push and pull xcom data 126 | 127 | Returns: a task id to be executed 128 | """ 129 | 130 | # get the date data from the xcom 131 | fetchedDate = ti.xcom_pull(key='fetchedDate', task_ids=['getLastProcessedDate']) 132 | # grab the first element (the date string) from the list pulled from xcom and convert it from string to datetime 133 | lastDBDate = datetime.datetime.strptime(fetchedDate[0], '%Y-%m-%d') 134 | 135 | # connect to s3 using the boto3 AWS python module (credentials are saved in an env variable) 136 | # set up the connection to the Amazon cloud bucket 137 | # s3 
= boto3.client('s3') 138 | # get the file data from the S3 bucket 139 | # obj = s3.get_object(Bucket='renato-airflow-raw', Key='mar.json') 140 | # open the file in memory 141 | # filename = obj['Body'].read().decode('utf-8', errors='ignore') 142 | 143 | # connect to s3 using the Airflow hook system. Credentials are saved in env variables when the container is 144 | # built, using the AIRFLOW_CONN_AWS_DEFAULT environment flag in the docker-compose.yml file. 145 | # s3 hook object 146 | hook = S3Hook() 147 | # name of the file in the AWS s3 bucket 148 | key = 'countyData.json' 149 | # name of the AWS s3 bucket 150 | bucket = 'renato-airflow-raw' 151 | # directory in which the file will be saved 152 | path = '/opt/airflow/sparkFiles' 153 | # download the file 154 | filename = hook.download_file( 155 | key=key, 156 | bucket_name=bucket, 157 | local_path=path 158 | ) 159 | 160 | # open the json data with the correct encoding. 'latin-1' encoding was used as the json contains some chars that 161 | # are not encoded in utf-8 (which is usually used) 162 | with open(filename, encoding='latin-1') as data: 163 | # load the data as a dictionary (key, value pairs) 164 | jsonData = json.load(data) 165 | # make sure that historicalData is in the dictionary keys 166 | if 'historicalData' in jsonData.keys(): 167 | # empty list for saving the parsed data 168 | dfData = [] 169 | # for each date (key) in the 'historicalData' dictionary 170 | for key in jsonData['historicalData'].keys(): 171 | # if the last db date is smaller/earlier or equal with the date key 172 | if lastDBDate <= datetime.datetime.strptime(key, '%Y-%m-%d'): 173 | # check if the value containing teh data is a dictionary. This way None values are skipped. 174 | if type(jsonData["historicalData"][key]['countyInfectionsNumbers']) == dict: 175 | # create a new empty dict for each date 176 | parsedLine = {} 177 | # save the date in the dict with the key 'dateFor' 178 | parsedLine['dateFor'] = key 179 | # update the new dict (add/append) with the required json data for each date 180 | parsedLine.update(jsonData["historicalData"][key]['countyInfectionsNumbers']) 181 | # save the new dict to the list created above 182 | dfData.append(parsedLine) 183 | 184 | # check if the data list is not empty (has a length of 0) 185 | if len(dfData) > 0: 186 | # convert the list ot a Pandas dataframe 187 | df = pd.DataFrame(dfData) 188 | # replace NAN (None) values with 0 189 | df = df.fillna(0) 190 | # save the df to a csv with the headers added and with utf8 encoding 191 | df.to_csv('/opt/airflow/sparkFiles/parsedData.csv', 192 | encoding='utf8', 193 | index=False, 194 | header=True) 195 | 196 | # delete the file downloaded from s3 197 | os.remove(filename) 198 | 199 | return 'processParsedData' 200 | return 'endRun' 201 | return 'endRun' 202 | 203 | 204 | def uploadToDB(ti): 205 | """ 206 | Upload the results data to the database 207 | """ 208 | results = '/opt/airflow/sparkFiles/results.csv' 209 | 210 | # get the date from the xcom 211 | fetchedDate = ti.xcom_pull(key='fetchedDate', task_ids=['getLastProcessedDate']) 212 | lastDBDate = fetchedDate[0] 213 | 214 | # read the results CSV to a Pandas dataframe 215 | pandasDf = pd.read_csv(results) 216 | # remove the row that has the same dateFor as the previously last processed date to avoid any data errors 217 | newDf = pd.concat([pandasDf.loc[pandasDf.dateFor != lastDBDate]]) 218 | 219 | if database == 'mongoDB': 220 | # >>> using the MONGO database 221 | 222 | # convert to a list of dictionary objects 223 | 
resultsList = newDf.to_dict(orient='records') 224 | # save the data to the database as bulk 225 | # insertToDB = db.countyDiff.insert_many(resultsList) 226 | 227 | # save data by replacing documents already found in the collection if the dateFor fields match, if not insert 228 | # new document by setting the upsert flag to True 229 | for item in resultsList: 230 | insertToDB = db.countyDiff.replace_one({'dateFor': item['dateFor']}, 231 | item, 232 | upsert=True) 233 | 234 | else: 235 | # >>> using the AMAZON REDSHIFT database 236 | 237 | # overwrite the CSV with the new data 238 | newDf.to_csv(results, 239 | sep=',', 240 | header=True, 241 | index=False) 242 | #upload results csv to S3 243 | # s3 hook object 244 | hook = S3Hook() 245 | # name of the file in the AWS s3 bucket 246 | key = 'results.csv' 247 | # name of the AWS s3 bucket 248 | bucket = 'renato-airflow-raw' 249 | # load/upload the results file to the s3 bucket 250 | loadToS3 = hook.load_file( 251 | filename=results, 252 | key=key, 253 | bucket_name=bucket, 254 | replace=True 255 | ) 256 | 257 | # set up the connection to the Redshift database 258 | conn = psycopg2.connect(f'dbname={dbname} host={host} port={port} user={user} password={password}') 259 | # start the database cursor 260 | cursor = conn.cursor() 261 | # COPY the data from the s3 loaded file into the Redshift counties collection. 262 | # the COPY command only appends the CSV data to the table. It does not replace 263 | sql = f"""COPY counties FROM 's3://renato-airflow-raw/results.csv' 264 | iam_role '{awsIAMrole}' 265 | DELIMITER AS ',' 266 | DATEFORMAT 'YYYY-MM-DD' 267 | IGNOREHEADER 1 ;""" 268 | 269 | cursor.execute(sql) 270 | conn.commit() 271 | cursor.close() 272 | conn.close() 273 | 274 | # delete the parsed data csv from the working directory 275 | os.remove(results) 276 | 277 | 278 | # set up DAG arguments 279 | defaultArgs = { 280 | 'owner': 'Renato_Otescu', 281 | 'start_date': datetime.datetime(2021, 1, 1), 282 | 'retries': 3, 283 | 'retry_delay': datetime.timedelta(seconds=30) 284 | } 285 | 286 | # plan DAG run/pipeline (schedule_interval='0 8 * * *' for each day at 8AM or schedule_interval='@daily' for 0AM) 287 | # the first argument is the name of the DAG: 'analyze_json_data' 288 | with DAG('analyze_json_data', 289 | schedule_interval='@daily', 290 | default_args=defaultArgs, 291 | catchup=False) as dag: 292 | 293 | # task that calls the function getDBdate by using a PythonOperator 294 | getLastProcessedDate = PythonOperator( 295 | task_id='getLastProcessedDate', 296 | python_callable=getDBdate 297 | ) 298 | 299 | # Call the function getLastDate. A BranchPythonOperator is used here as the getLastDate function returns either 300 | # the 'parseJsonFile' task id or the 'endRun' task id. 301 | # 2 branches are created: one for the task 'parseJsonFile' and the other one for the task id 'endRun'. 
302 | # If multiple tasks need to be executed at the same time, the return of the function has to be a list containing 303 | # all the tasks ids that need to be executed at the same time (i.e.: if ['task_id_1', 'task_id_2', etc]) 304 | # The flag do_xcom_push is set to False because each xcom also creates a separate data point in the Airflow DB 305 | # which in this case is useless 306 | getDate = BranchPythonOperator( 307 | task_id='getDate', 308 | python_callable=getLastDate, 309 | do_xcom_push=False 310 | ) 311 | 312 | # if the BranchPythonOperator above returns the task id 'parseJsonFile', then the readJsonData function is called 313 | # note that the 'parseJsonFile' task is also a BranchPythonOperator because the function it calls, readJsonData, 314 | # also returns 2 task ids: 'processParsedData' and 'endRun' 315 | parseJsonFile = BranchPythonOperator( 316 | task_id='parseJsonFile', 317 | python_callable=readJsonData, 318 | do_xcom_push=False 319 | ) 320 | 321 | # as the Spark (PySpark) script is located in a different directory, it is executed using a BashOperator 322 | processParsedData = BashOperator( 323 | task_id='processParsedData', 324 | bash_command='python /opt/airflow/sparkFiles/sparkProcess.py' 325 | ) 326 | 327 | # execute the 'uploadToDB' function using a PythonOperator 328 | saveToDB = PythonOperator( 329 | task_id='saveToDB', 330 | python_callable=uploadToDB 331 | ) 332 | 333 | # the last task is a DummyOperator used only to point the branch operators above 334 | # The trigger rule 'none_failed_or_skipped' ensures that the dummy task is executed if at least one parent succeeds 335 | endRun = DummyOperator( 336 | task_id='endRun', 337 | trigger_rule='none_failed_or_skipped' 338 | ) 339 | 340 | # set tasks relations (the order the tasks are executed) 341 | getLastProcessedDate >> getDate 342 | # the task 'getDate' is a BranchPythonOperator so after it 2 other tasks can be executed parseJsonFile and endRun 343 | getDate >> [parseJsonFile, endRun] 344 | # same as above for the branch parseJsonFile 345 | parseJsonFile >> [processParsedData, endRun] 346 | processParsedData >> saveToDB >> endRun 347 | -------------------------------------------------------------------------------- /ETL-Data-pipelines/airflow-data/airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The folder where your airflow pipelines live, most likely a 3 | # subfolder in a code repository. This path must be absolute. 4 | dags_folder = /opt/airflow/dags 5 | 6 | # Hostname by providing a path to a callable, which will resolve the hostname. 7 | # The format is "package.function". 8 | # 9 | # For example, default value "socket.getfqdn" means that result from getfqdn() of "socket" 10 | # package will be used as hostname. 11 | # 12 | # No argument should be required in the function specified. 13 | # If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address`` 14 | hostname_callable = socket.getfqdn 15 | 16 | # Default timezone in case supplied date times are naive 17 | # can be utc (default), system, or any IANA timezone string (e.g. Europe/Amsterdam) 18 | default_timezone = utc 19 | 20 | # The executor class that airflow should use. Choices include 21 | # ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``, ``DaskExecutor``, 22 | # ``KubernetesExecutor``, ``CeleryKubernetesExecutor`` or the 23 | # full import path to the class when using a custom executor. 
24 | executor = SequentialExecutor 25 | 26 | # The SqlAlchemy connection string to the metadata database. 27 | # SqlAlchemy supports many different database engine, more information 28 | # their website 29 | sql_alchemy_conn = sqlite:////opt/airflow/airflow.db 30 | 31 | # The encoding for the databases 32 | sql_engine_encoding = utf-8 33 | 34 | # Collation for ``dag_id``, ``task_id``, ``key`` columns in case they have different encoding. 35 | # This is particularly useful in case of mysql with utf8mb4 encoding because 36 | # primary keys for XCom table has too big size and ``sql_engine_collation_for_ids`` should 37 | # be set to ``utf8mb3_general_ci``. 38 | # sql_engine_collation_for_ids = 39 | 40 | # If SqlAlchemy should pool database connections. 41 | sql_alchemy_pool_enabled = True 42 | 43 | # The SqlAlchemy pool size is the maximum number of database connections 44 | # in the pool. 0 indicates no limit. 45 | sql_alchemy_pool_size = 5 46 | 47 | # The maximum overflow size of the pool. 48 | # When the number of checked-out connections reaches the size set in pool_size, 49 | # additional connections will be returned up to this limit. 50 | # When those additional connections are returned to the pool, they are disconnected and discarded. 51 | # It follows then that the total number of simultaneous connections the pool will allow 52 | # is pool_size + max_overflow, 53 | # and the total number of "sleeping" connections the pool will allow is pool_size. 54 | # max_overflow can be set to ``-1`` to indicate no overflow limit; 55 | # no limit will be placed on the total number of concurrent connections. Defaults to ``10``. 56 | sql_alchemy_max_overflow = 10 57 | 58 | # The SqlAlchemy pool recycle is the number of seconds a connection 59 | # can be idle in the pool before it is invalidated. This config does 60 | # not apply to sqlite. If the number of DB connections is ever exceeded, 61 | # a lower config value will allow the system to recover faster. 62 | sql_alchemy_pool_recycle = 1800 63 | 64 | # Check connection at the start of each connection pool checkout. 65 | # Typically, this is a simple statement like "SELECT 1". 66 | # More information here: 67 | # https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic 68 | sql_alchemy_pool_pre_ping = True 69 | 70 | # The schema to use for the metadata database. 71 | # SqlAlchemy supports databases with the concept of multiple schemas. 72 | sql_alchemy_schema = 73 | 74 | # Import path for connect args in SqlAlchemy. Defaults to an empty dict. 75 | # This is useful when you want to configure db engine args that SqlAlchemy won't parse 76 | # in connection string. 77 | # See https://docs.sqlalchemy.org/en/13/core/engines.html#sqlalchemy.create_engine.params.connect_args 78 | # sql_alchemy_connect_args = 79 | 80 | # The amount of parallelism as a setting to the executor. This defines 81 | # the max number of task instances that should run simultaneously 82 | # on this airflow installation 83 | parallelism = 32 84 | 85 | # The number of task instances allowed to run concurrently by the scheduler 86 | # in one DAG. Can be overridden by ``concurrency`` on DAG level. 87 | dag_concurrency = 16 88 | 89 | # Are DAGs paused by default at creation 90 | dags_are_paused_at_creation = True 91 | 92 | # The maximum number of active DAG runs per DAG 93 | max_active_runs_per_dag = 16 94 | 95 | # Whether to load the DAG examples that ship with Airflow. 
It's good to 96 | # get started, but you probably want to set this to ``False`` in a production 97 | # environment 98 | load_examples = True 99 | 100 | # Whether to load the default connections that ship with Airflow. It's good to 101 | # get started, but you probably want to set this to ``False`` in a production 102 | # environment 103 | load_default_connections = True 104 | 105 | # Path to the folder containing Airflow plugins 106 | plugins_folder = /opt/airflow/plugins 107 | 108 | # Should tasks be executed via forking of the parent process ("False", 109 | # the speedier option) or by spawning a new python process ("True" slow, 110 | # but means plugin changes picked up by tasks straight away) 111 | execute_tasks_new_python_interpreter = False 112 | 113 | # Secret key to save connection passwords in the db 114 | fernet_key = hjIFXCPQL6ZZx-dN7Kpr5yULTMFmLK-skgH9KdKeA1I= 115 | 116 | # Whether to disable pickling dags 117 | donot_pickle = True 118 | 119 | # How long before timing out a python file import 120 | dagbag_import_timeout = 30.0 121 | 122 | # Should a traceback be shown in the UI for dagbag import errors, 123 | # instead of just the exception message 124 | dagbag_import_error_tracebacks = True 125 | 126 | # If tracebacks are shown, how many entries from the traceback should be shown 127 | dagbag_import_error_traceback_depth = 2 128 | 129 | # How long before timing out a DagFileProcessor, which processes a dag file 130 | dag_file_processor_timeout = 50 131 | 132 | # The class to use for running task instances in a subprocess. 133 | # Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class 134 | # when using a custom task runner. 135 | task_runner = StandardTaskRunner 136 | 137 | # If set, tasks without a ``run_as_user`` argument will be run with this user 138 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 139 | default_impersonation = 140 | 141 | # What security module to use (for example kerberos) 142 | security = 143 | 144 | # Turn unit test mode on (overwrites many configuration options with test 145 | # values at runtime) 146 | unit_test_mode = False 147 | 148 | # Whether to enable pickling for xcom (note that this is insecure and allows for 149 | # RCE exploits). 150 | enable_xcom_pickling = False 151 | 152 | # When a task is killed forcefully, this is the amount of time in seconds that 153 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED 154 | killed_task_cleanup_time = 60 155 | 156 | # Whether to override params with dag_run.conf. If you pass some key-value pairs 157 | # through ``airflow dags backfill -c`` or 158 | # ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. 159 | dag_run_conf_overrides_params = True 160 | 161 | # When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``. 162 | dag_discovery_safe_mode = True 163 | 164 | # The number of retries each task is going to have by default. Can be overridden at dag or task level. 165 | default_task_retries = 0 166 | 167 | # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. 168 | min_serialized_dag_update_interval = 30 169 | 170 | # Fetching serialized DAG can not be faster than a minimum interval to reduce database 171 | # read rate. This config controls when your DAGs are updated in the Webserver 172 | min_serialized_dag_fetch_interval = 10 173 | 174 | # Whether to persist DAG files code in DB. 
175 | # If set to True, Webserver reads file contents from DB instead of 176 | # trying to access files in a DAG folder. 177 | # Example: store_dag_code = False 178 | # store_dag_code = 179 | 180 | # Maximum number of Rendered Task Instance Fields (Template Fields) per task to store 181 | # in the Database. 182 | # All the template_fields for each of Task Instance are stored in the Database. 183 | # Keeping this number small may cause an error when you try to view ``Rendered`` tab in 184 | # TaskInstance view for older tasks. 185 | max_num_rendered_ti_fields_per_task = 30 186 | 187 | # On each dagrun check against defined SLAs 188 | check_slas = True 189 | 190 | # Path to custom XCom class that will be used to store and resolve operators results 191 | # Example: xcom_backend = path.to.CustomXCom 192 | xcom_backend = airflow.models.xcom.BaseXCom 193 | 194 | # By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``, 195 | # if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module. 196 | lazy_load_plugins = True 197 | 198 | # By default Airflow providers are lazily-discovered (discovery and imports happen only when required). 199 | # Set it to False, if you want to discover providers whenever 'airflow' is invoked via cli or 200 | # loaded from module. 201 | lazy_discover_providers = True 202 | 203 | # Number of times the code should be retried in case of DB Operational Errors. 204 | # Not all transactions will be retried as it can cause undesired state. 205 | # Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``. 206 | max_db_retries = 3 207 | 208 | [logging] 209 | # The folder where airflow should store its log files 210 | # This path must be absolute 211 | base_log_folder = /opt/airflow/logs 212 | 213 | # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. 214 | # Set this to True if you want to enable remote logging. 215 | remote_logging = False 216 | 217 | # Users must supply an Airflow connection id that provides access to the storage 218 | # location. 219 | remote_log_conn_id = 220 | 221 | # Path to Google Credential JSON file. If omitted, authorization based on `the Application Default 222 | # Credentials 223 | # `__ will 224 | # be used. 225 | google_key_path = 226 | 227 | # Storage bucket URL for remote logging 228 | # S3 buckets should start with "s3://" 229 | # Cloudwatch log groups should start with "cloudwatch://" 230 | # GCS buckets should start with "gs://" 231 | # WASB buckets should start with "wasb" just to help Airflow select correct handler 232 | # Stackdriver logs should start with "stackdriver://" 233 | remote_base_log_folder = 234 | 235 | # Use server-side encryption for logs stored in S3 236 | encrypt_s3_logs = False 237 | 238 | # Logging level 239 | logging_level = INFO 240 | 241 | # Logging level for Flask-appbuilder UI 242 | fab_logging_level = WARN 243 | 244 | # Logging class 245 | # Specify the class that will specify the logging configuration 246 | # This class has to be on the python classpath 247 | # Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG 248 | logging_config_class = 249 | 250 | # Flag to enable/disable Colored logs in Console 251 | # Colour the logs when the controlling terminal is a TTY. 
252 | colored_console_log = True 253 | 254 | # Log format for when Colored logs is enabled 255 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s 256 | colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter 257 | 258 | # Format of Log line 259 | log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s 260 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s 261 | 262 | # Specify prefix pattern like mentioned below with stream handler TaskHandlerWithCustomFormatter 263 | # Example: task_log_prefix_template = {ti.dag_id}-{ti.task_id}-{execution_date}-{try_number} 264 | task_log_prefix_template = 265 | 266 | # Formatting for how airflow generates file names/paths for each task run. 267 | log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log 268 | 269 | # Formatting for how airflow generates file names for log 270 | log_processor_filename_template = {{ filename }}.log 271 | 272 | # full path of dag_processor_manager logfile 273 | dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log 274 | 275 | # Name of handler to read task instance logs. 276 | # Defaults to use ``task`` handler. 277 | task_log_reader = task 278 | 279 | # A comma\-separated list of third-party logger names that will be configured to print messages to 280 | # consoles\. 281 | # Example: extra_loggers = connexion,sqlalchemy 282 | extra_loggers = 283 | 284 | [metrics] 285 | 286 | # StatsD (https://github.com/etsy/statsd) integration settings. 287 | # Enables sending metrics to StatsD. 288 | statsd_on = False 289 | statsd_host = localhost 290 | statsd_port = 8125 291 | statsd_prefix = airflow 292 | 293 | # If you want to avoid sending all the available metrics to StatsD, 294 | # you can configure an allow list of prefixes (comma separated) to send only the metrics that 295 | # start with the elements of the list (e.g: "scheduler,executor,dagrun") 296 | statsd_allow_list = 297 | 298 | # A function that validate the statsd stat name, apply changes to the stat name if necessary and return 299 | # the transformed stat name. 300 | # 301 | # The function should have the following signature: 302 | # def func_name(stat_name: str) -> str: 303 | stat_name_handler = 304 | 305 | # To enable datadog integration to send airflow metrics. 306 | statsd_datadog_enabled = False 307 | 308 | # List of datadog tags attached to all metrics(e.g: key1:value1,key2:value2) 309 | statsd_datadog_tags = 310 | 311 | # If you want to utilise your own custom Statsd client set the relevant 312 | # module path below. 313 | # Note: The module path must exist on your PYTHONPATH for Airflow to pick it up 314 | # statsd_custom_client_path = 315 | 316 | [secrets] 317 | # Full class name of secrets backend to enable (will precede env vars and metastore in search path) 318 | # Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend 319 | backend = 320 | 321 | # The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. 322 | # See documentation for the secrets backend you are using. JSON is expected. 
323 | # Example for AWS Systems Manager ParameterStore: 324 | # ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}`` 325 | backend_kwargs = 326 | 327 | [cli] 328 | # In what way should the cli access the API. The LocalClient will use the 329 | # database directly, while the json_client will use the api running on the 330 | # webserver 331 | api_client = airflow.api.client.local_client 332 | 333 | # If you set web_server_url_prefix, do NOT forget to append it here, ex: 334 | # ``endpoint_url = http://localhost:8080/myroot`` 335 | # So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` 336 | endpoint_url = http://localhost:8080 337 | 338 | [debug] 339 | # Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first 340 | # failed task. Helpful for debugging purposes. 341 | fail_fast = False 342 | 343 | [api] 344 | # Enables the deprecated experimental API. Please note that these APIs do not have access control. 345 | # The authenticated user has full access. 346 | # 347 | # .. warning:: 348 | # 349 | # This `Experimental REST API `__ is 350 | # deprecated since version 2.0. Please consider using 351 | # `the Stable REST API `__. 352 | # For more information on migration, see 353 | # `UPDATING.md `_ 354 | enable_experimental_api = False 355 | 356 | # How to authenticate users of the API. See 357 | # https://airflow.apache.org/docs/stable/security.html for possible values. 358 | # ("airflow.api.auth.backend.default" allows all requests for historic reasons) 359 | auth_backend = airflow.api.auth.backend.deny_all 360 | 361 | # Used to set the maximum page limit for API requests 362 | maximum_page_limit = 100 363 | 364 | # Used to set the default page limit when limit is zero. A default limit 365 | # of 100 is set on OpenApi spec. However, this particular default limit 366 | # only work when limit is set equal to zero(0) from API requests. 367 | # If no limit is supplied, the OpenApi spec default is used. 368 | fallback_page_limit = 100 369 | 370 | # The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested. 371 | # Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com 372 | google_oauth2_audience = 373 | 374 | # Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on 375 | # `the Application Default Credentials 376 | # `__ will 377 | # be used. 378 | # Example: google_key_path = /files/service-account-json 379 | google_key_path = 380 | 381 | [lineage] 382 | # what lineage backend to use 383 | backend = 384 | 385 | [atlas] 386 | sasl_enabled = False 387 | host = 388 | port = 21000 389 | username = 390 | password = 391 | 392 | [operators] 393 | # The default owner assigned to each new operator, unless 394 | # provided explicitly or passed via ``default_args`` 395 | default_owner = airflow 396 | default_cpus = 1 397 | default_ram = 512 398 | default_disk = 512 399 | default_gpus = 0 400 | 401 | # Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator. 402 | # If set to False, an exception will be thrown, otherwise only the console message will be displayed. 
403 | allow_illegal_arguments = False 404 | 405 | [hive] 406 | # Default mapreduce queue for HiveOperator tasks 407 | default_hive_mapred_queue = 408 | 409 | # Template for mapred_job_name in HiveOperator, supports the following named parameters 410 | # hostname, dag_id, task_id, execution_date 411 | # mapred_job_name_template = 412 | 413 | [webserver] 414 | # The base url of your website as airflow cannot guess what domain or 415 | # cname you are using. This is used in automated emails that 416 | # airflow sends to point links to the right web server 417 | base_url = http://localhost:8080 418 | 419 | # Default timezone to display all dates in the UI, can be UTC, system, or 420 | # any IANA timezone string (e.g. Europe/Amsterdam). If left empty the 421 | # default value of core/default_timezone will be used 422 | # Example: default_ui_timezone = America/New_York 423 | default_ui_timezone = UTC 424 | 425 | # The ip specified when starting the web server 426 | web_server_host = 0.0.0.0 427 | 428 | # The port on which to run the web server 429 | web_server_port = 8080 430 | 431 | # Paths to the SSL certificate and key for the web server. When both are 432 | # provided SSL will be enabled. This does not change the web server port. 433 | web_server_ssl_cert = 434 | 435 | # Paths to the SSL certificate and key for the web server. When both are 436 | # provided SSL will be enabled. This does not change the web server port. 437 | web_server_ssl_key = 438 | 439 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 440 | web_server_master_timeout = 120 441 | 442 | # Number of seconds the gunicorn webserver waits before timing out on a worker 443 | web_server_worker_timeout = 120 444 | 445 | # Number of workers to refresh at a time. When set to 0, worker refresh is 446 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 447 | # bringing up new ones and killing old ones. 448 | worker_refresh_batch_size = 1 449 | 450 | # Number of seconds to wait before refreshing a batch of workers. 451 | worker_refresh_interval = 30 452 | 453 | # If set to True, Airflow will track files in plugins_folder directory. When it detects changes, 454 | # then reload the gunicorn. 455 | reload_on_plugin_change = False 456 | 457 | # Secret key used to run your flask app 458 | # It should be as random as possible 459 | secret_key = JK3PU6syfBItlK8mgHrYnA== 460 | 461 | # Number of workers to run the Gunicorn web server 462 | workers = 4 463 | 464 | # The worker class gunicorn should use. Choices include 465 | # sync (default), eventlet, gevent 466 | worker_class = sync 467 | 468 | # Log files for the gunicorn webserver. '-' means log to stderr. 469 | access_logfile = - 470 | 471 | # Log files for the gunicorn webserver. '-' means log to stderr. 472 | error_logfile = - 473 | 474 | # Access log format for gunicorn webserver. 475 | # default format is %%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s" 476 | # documentation - https://docs.gunicorn.org/en/stable/settings.html#access-log-format 477 | access_logformat = 478 | 479 | # Expose the configuration file in the web server 480 | expose_config = False 481 | 482 | # Expose hostname in the web server 483 | expose_hostname = True 484 | 485 | # Expose stacktrace in the web server 486 | expose_stacktrace = True 487 | 488 | # Default DAG view. Valid values are: ``tree``, ``graph``, ``duration``, ``gantt``, ``landing_times`` 489 | dag_default_view = tree 490 | 491 | # Default DAG orientation. 
491 | # Default DAG orientation. Valid values are:
492 | # ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top)
493 | dag_orientation = LR
494 |
495 | # Puts the webserver in demonstration mode; blurs the names of Operators for
496 | # privacy.
497 | demo_mode = False
498 |
499 | # The amount of time (in secs) webserver will wait for initial handshake
500 | # while fetching logs from other worker machine
501 | log_fetch_timeout_sec = 5
502 |
503 | # Time interval (in secs) to wait before next log fetching.
504 | log_fetch_delay_sec = 2
505 |
506 | # Distance away from page bottom to enable auto tailing.
507 | log_auto_tailing_offset = 30
508 |
509 | # Animation speed for auto tailing log display.
510 | log_animation_speed = 1000
511 |
512 | # By default, the webserver shows paused DAGs. Flip this to hide paused
513 | # DAGs by default
514 | hide_paused_dags_by_default = False
515 |
516 | # Consistent page size across all listing views in the UI
517 | page_size = 100
518 |
519 | # Define the color of navigation bar
520 | navbar_color = #fff
521 |
522 | # Default dagrun to show in UI
523 | default_dag_run_display_number = 25
524 |
525 | # Enable werkzeug ``ProxyFix`` middleware for reverse proxy
526 | enable_proxy_fix = False
527 |
528 | # Number of values to trust for ``X-Forwarded-For``.
529 | # More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/
530 | proxy_fix_x_for = 1
531 |
532 | # Number of values to trust for ``X-Forwarded-Proto``
533 | proxy_fix_x_proto = 1
534 |
535 | # Number of values to trust for ``X-Forwarded-Host``
536 | proxy_fix_x_host = 1
537 |
538 | # Number of values to trust for ``X-Forwarded-Port``
539 | proxy_fix_x_port = 1
540 |
541 | # Number of values to trust for ``X-Forwarded-Prefix``
542 | proxy_fix_x_prefix = 1
543 |
544 | # Set secure flag on session cookie
545 | cookie_secure = False
546 |
547 | # Set samesite policy on session cookie
548 | cookie_samesite = Lax
549 |
550 | # Default setting for wrap toggle on DAG code and TI log views.
551 | default_wrap = False
552 |
553 | # Allow the UI to be rendered in a frame
554 | x_frame_enabled = True
555 |
556 | # Send anonymous user activity to your analytics tool
557 | # choose from google_analytics, segment, or metarouter
558 | # analytics_tool =
559 |
560 | # Unique ID of your account in the analytics tool
561 | # analytics_id =
562 |
563 | # 'Recent Tasks' stats will show for old DagRuns if set
564 | show_recent_stats_for_completed_runs = True
565 |
566 | # Update FAB permissions and sync security manager roles
567 | # on webserver startup
568 | update_fab_perms = True
569 |
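# Illustrative example, not part of the stock Airflow config: when this webserver
# sits behind a reverse proxy (e.g. nginx or traefik) that sets the X-Forwarded-*
# headers, a typical combination of the settings above would look roughly like
# (hypothetical hostname):
#   base_url = https://airflow.example.com
#   enable_proxy_fix = True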
570 | # The UI cookie lifetime in minutes. User will be logged out from UI after
571 | # ``session_lifetime_minutes`` of non-activity
572 | session_lifetime_minutes = 43200
573 |
574 | [email]
575 |
576 | # Configuration of the email backend and whether to
577 | # send email alerts on retry or failure
578 | # Email backend to use
579 | email_backend = airflow.utils.email.send_email_smtp
580 |
581 | # Whether email alerts should be sent when a task is retried
582 | default_email_on_retry = True
583 |
584 | # Whether email alerts should be sent when a task fails
585 | default_email_on_failure = True
586 |
587 | [smtp]
588 |
589 | # If you want airflow to send emails on retries or failures, and you want to use
590 | # the airflow.utils.email.send_email_smtp function, you have to configure an
591 | # smtp server here
592 | smtp_host = localhost
593 | smtp_starttls = True
594 | smtp_ssl = False
595 | # Example: smtp_user = airflow
596 | # smtp_user =
597 | # Example: smtp_password = airflow
598 | # smtp_password =
599 | smtp_port = 25
600 | smtp_mail_from = airflow@example.com
601 | smtp_timeout = 30
602 | smtp_retry_limit = 5
603 |
604 | [sentry]
605 |
606 | # Sentry (https://docs.sentry.io) integration. Here you can supply
607 | # additional configuration options based on the Python platform. See:
608 | # https://docs.sentry.io/error-reporting/configuration/?platform=python.
609 | # Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``,
610 | # ``ignore_errors``, ``before_breadcrumb``, ``before_send``, ``transport``.
611 | # Enable error reporting to Sentry
612 | sentry_on = false
613 | sentry_dsn =
614 |
615 | [celery_kubernetes_executor]
616 |
617 | # This section only applies if you are using the ``CeleryKubernetesExecutor`` in
618 | # ``[core]`` section above
619 | # Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``.
620 | # When the queue of a task is ``kubernetes_queue``, the task is executed via ``KubernetesExecutor``,
621 | # otherwise via ``CeleryExecutor``
622 | kubernetes_queue = kubernetes
623 |
624 | [celery]
625 |
626 | # This section only applies if you are using the CeleryExecutor in
627 | # ``[core]`` section above
628 | # The app name that will be used by celery
629 | celery_app_name = airflow.executors.celery_executor
630 |
631 | # The concurrency that will be used when starting workers with the
632 | # ``airflow celery worker`` command. This defines the number of task instances that
633 | # a worker will take, so size up your workers based on the resources on
634 | # your worker box and the nature of your tasks
635 | worker_concurrency = 8
636 |
637 | # The maximum and minimum concurrency that will be used when starting workers with the
638 | # ``airflow celery worker`` command (always keep minimum processes, but grow
639 | # to maximum if necessary). Note the value should be max_concurrency,min_concurrency
640 | # Pick these numbers based on resources on worker box and the nature of the task.
641 | # If autoscale option is available, worker_concurrency will be ignored.
642 | # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale
643 | # Example: worker_autoscale = 16,12
644 | # worker_autoscale =
645 |
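# Illustrative note, not part of the stock Airflow config: the concurrency above
# can also be supplied per worker at start-up; assuming the standard
# ``airflow celery worker`` CLI flags, something like:
#   airflow celery worker --concurrency 8 --queues default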
646 | # Used to increase the number of tasks that a worker prefetches which can improve performance.
647 | # The number of processes multiplied by worker_prefetch_multiplier is the number of tasks
648 | # that are prefetched by a worker. A value greater than 1 can result in tasks being unnecessarily
649 | # blocked if there are multiple workers and one worker prefetches tasks that sit behind long
650 | # running tasks while another worker has unutilized processes that are unable to process the already
651 | # claimed blocked tasks.
652 | # https://docs.celeryproject.org/en/stable/userguide/optimizing.html#prefetch-limits
653 | # Example: worker_prefetch_multiplier = 1
654 | # worker_prefetch_multiplier =
655 |
656 | # When you start an airflow worker, airflow starts a tiny web server
657 | # subprocess to serve the workers' local log files to the airflow main
658 | # web server, which then builds pages and sends them to users. This defines
659 | # the port on which the logs are served. It needs to be unused, and
660 | # visible from the main web server, which connects to the workers on it.
661 | worker_log_server_port = 8793
662 |
663 | # Umask that will be used when starting workers with the ``airflow celery worker``
664 | # in daemon mode. This controls the file-creation mode mask which determines the initial
665 | # value of file permission bits for newly created files.
666 | worker_umask = 0o077
667 |
668 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally
669 | # a sqlalchemy database. Refer to the Celery documentation for more information.
670 | broker_url = redis://redis:6379/0
671 |
672 | # The Celery result_backend. When a job finishes, it needs to update the
673 | # metadata of the job. Therefore it will post a message on a message bus,
674 | # or insert it into a database (depending on the backend)
675 | # This status is used by the scheduler to update the state of the task
676 | # The use of a database is highly recommended
677 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings
678 | result_backend = db+postgresql://postgres:airflow@postgres/airflow
679 |
680 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start
681 | # it ``airflow celery flower``. This defines the IP that Celery Flower runs on
682 | flower_host = 0.0.0.0
683 |
684 | # The root URL for Flower
685 | # Example: flower_url_prefix = /flower
686 | flower_url_prefix =
687 |
688 | # This defines the port that Celery Flower runs on
689 | flower_port = 5555
690 |
691 | # Securing Flower with Basic Authentication
692 | # Accepts user:password pairs separated by a comma
693 | # Example: flower_basic_auth = user1:password1,user2:password2
694 | flower_basic_auth =
695 |
696 | # Default queue that tasks get assigned to and that workers listen on.
697 | default_queue = default
698 |
699 | # How many processes CeleryExecutor uses to sync task state.
700 | # 0 means to use max(1, number of cores - 1) processes.
701 | sync_parallelism = 0
702 |
703 | # Import path for celery configuration options
704 | celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG
705 | ssl_active = False
706 | ssl_key =
707 | ssl_cert =
708 | ssl_cacert =
709 |
710 | # Celery Pool implementation.
711 | # Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``.
712 | # See:
713 | # https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency
714 | # https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html
715 | pool = prefork
716 |
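# Illustrative note, not part of the stock Airflow config: the ``redis`` and
# ``postgres`` hostnames in the broker and result backend URLs above are
# presumably the service names from this project's docker-compose setup. If a
# RabbitMQ broker were used instead, the URL would look roughly like
# (hypothetical credentials/host):
#   broker_url = amqp://user:password@rabbitmq:5672//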
717 | # The number of seconds to wait before timing out ``send_task_to_executor`` or
718 | # ``fetch_celery_task_state`` operations.
719 | operation_timeout = 1.0
720 |
721 | # A Celery task will report its status as 'started' when the task is executed by a worker.
722 | # This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted
723 | # or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob.
724 | task_track_started = True
725 |
726 | # Time in seconds after which adopted tasks are cleared by CeleryExecutor. This is helpful to clear
727 | # stalled tasks.
728 | task_adoption_timeout = 600
729 |
730 | # The maximum number of retries for publishing task messages to the broker when failing
731 | # due to an ``AirflowTaskTimeout`` error, before giving up and marking the task as failed.
732 | task_publish_max_retries = 3
733 |
734 | # Worker initialisation check to validate Metadata Database connection
735 | worker_precheck = False
736 |
737 | [celery_broker_transport_options]
738 |
739 | # This section is for specifying options which can be passed to the
740 | # underlying celery broker transport. See:
741 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options
742 | # The visibility timeout defines the number of seconds to wait for the worker
743 | # to acknowledge the task before the message is redelivered to another worker.
744 | # Make sure to increase the visibility timeout to match the time of the longest
745 | # ETA you're planning to use.
746 | # visibility_timeout is only supported for Redis and SQS celery brokers.
747 | # See:
748 | # http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options
749 | # Example: visibility_timeout = 21600
750 | # visibility_timeout =
751 |
752 | [dask]
753 |
754 | # This section only applies if you are using the DaskExecutor in
755 | # [core] section above
756 | # The IP address and port of the Dask cluster's scheduler.
757 | cluster_address = 127.0.0.1:8786
758 |
759 | # TLS/SSL settings to access a secured Dask scheduler.
760 | tls_ca =
761 | tls_cert =
762 | tls_key =
763 |
764 | [scheduler]
765 | # Task instances listen for external kill signal (when you clear tasks
766 | # from the CLI or the UI), this defines the frequency at which they should
767 | # listen (in seconds).
768 | job_heartbeat_sec = 5
769 |
770 | # How often (in seconds) to check and tidy up 'running' TaskInstances
771 | # that no longer have a matching DagRun
772 | clean_tis_without_dagrun_interval = 15.0
773 |
774 | # The scheduler constantly tries to trigger new tasks (look at the
775 | # scheduler section in the docs for more information). This defines
776 | # how often the scheduler should run (in seconds).
777 | scheduler_heartbeat_sec = 5
778 |
779 | # The number of times to try to schedule each DAG file
780 | # -1 indicates unlimited number
781 | num_runs = -1
782 |
783 | # The number of seconds to wait between consecutive DAG file processing
784 | processor_poll_interval = 1
785 |
786 | # After how much time (in seconds) new DAGs should be picked up from the filesystem
787 | min_file_process_interval = 0
788 |
789 | # How often (in seconds) to scan the DAGs directory for new files. Defaults to 5 minutes.
790 | dag_dir_list_interval = 300
791 |
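# Illustrative note, not part of the stock Airflow config: the scheduler knobs
# above can also be set per environment via AIRFLOW__SCHEDULER__* variables, e.g.
# to re-parse DAG files less aggressively than the 0-second default above
# (hypothetical value):
#   AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=30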
792 | # How often should stats be printed to the logs. Setting to 0 will disable printing stats
793 | print_stats_interval = 30
794 |
795 | # How often (in seconds) should pool usage stats be sent to statsd (if statsd_on is enabled)
796 | pool_metrics_interval = 5.0
797 |
798 | # If the last scheduler heartbeat happened more than scheduler_health_check_threshold
799 | # ago (in seconds), the scheduler is considered unhealthy.
800 | # This is used by the health check in the "/health" endpoint
801 | scheduler_health_check_threshold = 30
802 |
803 | # How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs
804 | orphaned_tasks_check_interval = 300.0
805 | child_process_log_directory = /opt/airflow/logs/scheduler
806 |
807 | # Local task jobs periodically heartbeat to the DB. If the job has
808 | # not sent a heartbeat in this many seconds, the scheduler will mark the
809 | # associated task instance as failed and will re-schedule the task.
810 | scheduler_zombie_task_threshold = 300
811 |
812 | # Turn off scheduler catchup by setting this to ``False``.
813 | # Default behavior is unchanged and
814 | # Command Line Backfills still work, but the scheduler
815 | # will not do scheduler catchup if this is ``False``.
816 | # However, it can be set on a per-DAG basis in the
817 | # DAG definition (catchup)
818 | catchup_by_default = True
819 |
820 | # This changes the batch size of queries in the scheduling main loop.
821 | # If this is too high, SQL query performance may be impacted by one
822 | # or more of the following:
823 | # - reversion to full table scan
824 | # - complexity of query predicate
825 | # - excessive locking
826 | # Additionally, you may hit the maximum allowable query length for your db.
827 | # Set this to 0 for no limit (not advised)
828 | max_tis_per_query = 512
829 |
830 | # Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries.
831 | # If this is set to False then you should not run more than a single
832 | # scheduler at once
833 | use_row_level_locking = True
834 |
835 | # Max number of DAGs to create DagRuns for per scheduler loop
836 | #
837 | # Default: 10
838 | # max_dagruns_to_create_per_loop =
839 |
840 | # How many DagRuns should a scheduler examine (and lock) when scheduling
841 | # and queuing tasks.
842 | #
843 | # Default: 20
844 | # max_dagruns_per_loop_to_schedule =
845 |
846 | # Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the
847 | # same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other
848 | # dags in some circumstances
849 | #
850 | # Default: True
851 | # schedule_after_task_execution =
852 |
853 | # The scheduler can run multiple processes in parallel to parse dags.
854 | # This defines how many processes will run.
855 | parsing_processes = 2
856 |
857 | # Turn off scheduler use of cron intervals by setting this to False.
858 | # DAGs submitted manually in the web UI or with trigger_dag will still run.
859 | use_job_schedule = True
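# Illustrative note, not part of the stock Airflow config: ``catchup_by_default``
# above can be overridden per DAG at definition time, e.g. (hypothetical DAG):
#   DAG("my_dag", schedule_interval="@daily", catchup=False, ...)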
860 |
861 | # Allow externally triggered DagRuns for Execution Dates in the future
862 | # Only has effect if schedule_interval is set to None in DAG
863 | allow_trigger_in_future = False
864 |
865 | [kerberos]
866 | ccache = /tmp/airflow_krb5_ccache
867 |
868 | # gets augmented with fqdn
869 | principal = airflow
870 | reinit_frequency = 3600
871 | kinit_path = kinit
872 | keytab = airflow.keytab
873 |
874 | [github_enterprise]
875 | api_rev = v3
876 |
877 | [admin]
878 | # UI to hide sensitive variable fields when set to True
879 | hide_sensitive_variable_fields = True
880 |
881 | # A comma-separated list of sensitive keywords to look for in variable names.
882 | sensitive_variable_fields =
883 |
884 | [elasticsearch]
885 | # Elasticsearch host
886 | host =
887 |
888 | # Format of the log_id, which is used to query for a given task's logs
889 | log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number}
890 |
891 | # Used to mark the end of a log stream for a task
892 | end_of_log_mark = end_of_log
893 |
894 | # Qualified URL for an elasticsearch frontend (like Kibana) with a template argument for log_id
895 | # Code will construct log_id using the log_id template from the argument above.
896 | # NOTE: The code will prefix the https:// automatically, don't include that here.
897 | frontend =
898 |
899 | # Write the task logs to the stdout of the worker, rather than the default files
900 | write_stdout = False
901 |
902 | # Instead of the default log formatter, write the log lines as JSON
903 | json_format = False
904 |
905 | # Log fields to also attach to the json output, if enabled
906 | json_fields = asctime, filename, lineno, levelname, message
907 |
908 | [elasticsearch_configs]
909 | use_ssl = False
910 | verify_certs = True
911 |
912 | [kubernetes]
913 | # Path to the YAML pod file. If set, all other kubernetes-related fields are ignored.
914 | pod_template_file =
915 |
916 | # The repository of the Kubernetes Image for the Worker to Run
917 | worker_container_repository =
918 |
919 | # The tag of the Kubernetes Image for the Worker to Run
920 | worker_container_tag =
921 |
922 | # The Kubernetes namespace where airflow workers should be created. Defaults to ``default``
923 | namespace = default
924 |
925 | # If True, all worker pods will be deleted upon termination
926 | delete_worker_pods = True
927 |
928 | # If False (and delete_worker_pods is True),
929 | # failed worker pods will not be deleted so users can investigate them.
930 | delete_worker_pods_on_failure = False
931 |
932 | # Number of Kubernetes Worker Pod creation calls per scheduler loop.
933 | # Note that the current default of "1" will only launch a single pod
934 | # per-heartbeat. It is HIGHLY recommended that users increase this
935 | # number to match the tolerance of their kubernetes cluster for
936 | # better performance.
937 | worker_pods_creation_batch_size = 1
938 |
939 | # Allows users to launch pods in multiple namespaces.
940 | # Will require creating a cluster-role for the scheduler
941 | multi_namespace_mode = False
942 |
943 | # Use the service account kubernetes gives to pods to connect to the kubernetes cluster.
944 | # It's intended for clients that expect to be running inside a pod running on kubernetes.
945 | # It will raise an exception if called from a process not running in a kubernetes environment.
946 | in_cluster = True
947 |
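# Illustrative example, not part of the stock Airflow config: if the
# KubernetesExecutor were enabled, the worker image settings above would
# typically point at an official Airflow image matching the installed version,
# e.g. (assumed values):
#   worker_container_repository = apache/airflow
#   worker_container_tag = 2.0.1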
948 | # When running with in_cluster=False change the default cluster_context or config_file
949 | # options passed to the Kubernetes client. Leave these blank to use the default behaviour, like ``kubectl`` does.
950 | # cluster_context =
951 |
952 | # Path to the kubernetes configfile to be used when ``in_cluster`` is set to False
953 | # config_file =
954 |
955 | # Keyword parameters to pass when calling kubernetes client core_v1_api methods
956 | # from the Kubernetes Executor, provided as a single-line JSON dictionary string.
957 | # The list of supported params is similar for all core_v1_apis, hence a single config
958 | # variable for all apis. See:
959 | # https://raw.githubusercontent.com/kubernetes-client/python/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/api/core_v1_api.py
960 | kube_client_request_args =
961 |
962 | # Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client
963 | # ``core_v1_api`` method when using the Kubernetes Executor.
964 | # This should be an object and can contain any of the options listed in the ``v1DeleteOptions``
965 | # class defined here:
966 | # https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19
967 | # Example: delete_option_kwargs = {"grace_period_seconds": 10}
968 | delete_option_kwargs =
969 |
970 | # Enables TCP keepalive mechanism. This prevents Kubernetes API requests from hanging indefinitely
971 | # when an idle connection is timed out by services like cloud load balancers or firewalls.
972 | enable_tcp_keepalive = False
973 |
974 | # When the `enable_tcp_keepalive` option is enabled, TCP probes a connection that has
975 | # been idle for `tcp_keep_idle` seconds.
976 | tcp_keep_idle = 120
977 |
978 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond
979 | # to a keepalive probe, TCP retransmits the probe after `tcp_keep_intvl` seconds.
980 | tcp_keep_intvl = 30
981 |
982 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond
983 | # to a keepalive probe, TCP retransmits the probe `tcp_keep_cnt` number of times before
984 | # a connection is considered to be broken.
985 | tcp_keep_cnt = 6
986 |
987 | [smart_sensor]
988 | # When `use_smart_sensor` is True, Airflow redirects multiple qualified sensor tasks to
989 | # a smart sensor task.
990 | use_smart_sensor = False
991 |
992 | # `shard_code_upper_limit` is the upper limit of `shard_code` value. The `shard_code` is generated
993 | # by `hashcode % shard_code_upper_limit`.
994 | shard_code_upper_limit = 10000
995 |
996 | # The number of running smart sensor processes for each service.
997 | shards = 5
998 |
999 | # Comma-separated list of sensor classes supported by smart_sensor.
1000 | sensors_enabled = NamedHivePartitionSensor
--------------------------------------------------------------------------------