├── dags ├── __init__.py ├── good_dags │ ├── __init__.py │ ├── sample │ │ ├── __init__.py │ │ ├── lib │ │ │ └── __init__.py │ │ └── sample_dag.py │ ├── stock_retrieve │ │ ├── __init__.py │ │ ├── lib │ │ │ ├── __init__.py │ │ │ ├── epoch_manipulate.py │ │ │ └── stock_history.py │ │ └── stock_retrieve_dag.py │ └── dag_factory.py └── bad_dags │ ├── dag1.py │ ├── dag3.py │ └── dag2.py ├── grafana ├── .gitignore └── volume │ ├── dashboards │ ├── .gitignore │ └── airflow_metrics.json │ ├── datasources │ └── .gitignore │ └── provisioning │ ├── .gitignore │ ├── notifiers │ └── .gitignore │ ├── dashboards │ ├── .gitignore │ └── default.yaml │ └── datasources │ ├── .gitignore │ └── default.yaml ├── requirements.txt ├── prometheus ├── volume │ └── .gitignore └── prometheus.yml ├── documentation ├── grafana_dashboard.png └── airflow_docker_metrics_diagram.png ├── .gitignore ├── docker-compose.yml ├── README.md ├── LICENSE └── airflow.cfg /dags/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grafana/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dags/good_dags/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dags/good_dags/sample/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dags/good_dags/sample/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grafana/volume/dashboards/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grafana/volume/datasources/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grafana/volume/provisioning/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dags/good_dags/stock_retrieve/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dags/good_dags/stock_retrieve/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grafana/volume/provisioning/notifiers/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /prometheus/volume/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore 2 | /* 
-------------------------------------------------------------------------------- /grafana/volume/provisioning/dashboards/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /grafana/volume/provisioning/datasources/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /documentation/grafana_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sarahkb125/airflow-docker-metrics/HEAD/documentation/grafana_dashboard.png -------------------------------------------------------------------------------- /documentation/airflow_docker_metrics_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sarahkb125/airflow-docker-metrics/HEAD/documentation/airflow_docker_metrics_diagram.png -------------------------------------------------------------------------------- /grafana/volume/provisioning/datasources/default.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | datasources: 3 | - name: Prometheus 4 | type: prometheus 5 | access: proxy 6 | url: http://prometheus:9090 7 | -------------------------------------------------------------------------------- /grafana/volume/provisioning/dashboards/default.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | providers: 3 | - name: dashboards 4 | folder: General 5 | type: file 6 | editable: true 7 | updateIntervalSeconds: 10 8 | allowUiUpdates: true 9 | options: 10 | path: /grafana/dashboards 11 | foldersFromFilesStructure: true 12 | -------------------------------------------------------------------------------- /dags/good_dags/sample/sample_dag.py: -------------------------------------------------------------------------------- 1 | from good_dags.dag_factory import add_bash_task, create_dag 2 | 3 | schedule = "0 6 * * *" 4 | dag = create_dag(name="sample", schedule=schedule) 5 | bash_1 = add_bash_task(name="print_date", command="date", dag=dag) 6 | bash_2 = add_bash_task(name="sleep", command="sleep 1", dag=dag) 7 | 8 | bash_1 >> bash_2 9 | -------------------------------------------------------------------------------- /prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 30s 3 | evaluation_interval: 30s 4 | scrape_timeout: 10s 5 | external_labels: 6 | monitor: 'codelab-monitor' 7 | 8 | scrape_configs: 9 | - job_name: 'prometheus' 10 | static_configs: 11 | - targets: ['airflow-prometheus:9090'] 12 | 13 | - job_name: 'statsd-exporter' 14 | static_configs: 15 | - targets: ['airflow-statsd-exporter:9102'] 16 | 17 | tls_config: 18 | insecure_skip_verify: true 19 | -------------------------------------------------------------------------------- /dags/good_dags/stock_retrieve/stock_retrieve_dag.py: -------------------------------------------------------------------------------- 1 | from good_dags.dag_factory import add_bash_task, add_python_task, create_dag 2 | from good_dags.stock_retrieve.lib.stock_history import StockHistory 3 | 4 | schedule = "0 6 * * *" 5 | dag = create_dag(name="stock_retrieve", schedule=schedule) 6 | bash_1 = add_bash_task(name="print_date", 
command="date", dag=dag) 7 | bash_2 = add_bash_task(name="sleep", command="sleep 1", dag=dag) 8 | python_3 = add_python_task( 9 | name="stock_retrieve_task", 10 | function=StockHistory().stock_retrieve, 11 | kwargs={"stock": "GOOGL"}, 12 | dag=dag 13 | ) 14 | 15 | bash_1 >> bash_2 >> python_3 16 | 17 | 18 | -------------------------------------------------------------------------------- /dags/good_dags/stock_retrieve/lib/epoch_manipulate.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, time, timedelta 2 | 3 | 4 | # NOTE: This is over-abstracted on purpose to showcase with the /lib/ folder is for. 5 | class EpochManipulate(object): 6 | def __init__(self): 7 | pass 8 | 9 | def _get_current_epoch(self): 10 | current = datetime.combine(datetime.today(), time.min) 11 | return int(current.timestamp()) 12 | 13 | def _get_previous_epoch_days(self, days): 14 | current_datetime = datetime.combine(datetime.today(), time.min) 15 | prev_datetime = current_datetime - timedelta(days=days) 16 | return int(prev_datetime.timestamp()) 17 | -------------------------------------------------------------------------------- /dags/bad_dags/dag1.py: -------------------------------------------------------------------------------- 1 | # import io 2 | # import time 3 | # from datetime import datetime, timedelta 4 | # from time import sleep 5 | 6 | # import pandas as pd 7 | # import requests 8 | # from airflow import DAG 9 | # from airflow.operators.bash_operator import BashOperator 10 | 11 | # # Default parameters 12 | # default_args = { 13 | # "owner": "airflow", 14 | # "depends_on_past": False, 15 | # "start_date": datetime(2020, 9, 1), 16 | # "retries": 1, 17 | # "retry_delay": timedelta(minutes=2), 18 | # } 19 | 20 | 21 | # # At 6am every day 22 | # schedule = "0 6 * * *" 23 | 24 | 25 | # # Declare dag 26 | # dag = DAG("sample", default_args=default_args, schedule_interval=schedule) 27 | 28 | 29 | # # Instantiate sample bash operator 30 | # t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag) 31 | # t2 = BashOperator(task_id="sleep", bash_command="sleep 1", dag=dag) 32 | 33 | 34 | # # Set dependencies 35 | # t1 >> t2 36 | -------------------------------------------------------------------------------- /dags/bad_dags/dag3.py: -------------------------------------------------------------------------------- 1 | # import io 2 | # import time 3 | # from datetime import datetime, timedelta 4 | # from time import sleep 5 | 6 | # import pandas as pd 7 | # import requests 8 | # from airflow import DAG 9 | # from airflow.operators.bash_operator import BashOperator 10 | 11 | # # Default parameters 12 | # default_args = { 13 | # "owner": "airflow", 14 | # "depends_on_past": False, 15 | # "start_date": datetime(2020, 9, 1), 16 | # "retries": 1, 17 | # "retry_delay": timedelta(minutes=2), 18 | # } 19 | 20 | 21 | # # At 6am every day 22 | # schedule = "0 6 * * *" 23 | 24 | 25 | # # Declare dag 26 | # dag = DAG("sample1", default_args=default_args, schedule_interval=schedule) 27 | 28 | 29 | # # Instantiate sample bash operator 30 | # t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag) 31 | # t2 = BashOperator(task_id="sleep", bash_command="sleep 1", dag=dag) 32 | 33 | 34 | # # Set dependencies 35 | # t1 >> t2 36 | -------------------------------------------------------------------------------- /dags/good_dags/dag_factory.py: -------------------------------------------------------------------------------- 1 | from datetime 
import datetime, timedelta 2 | 3 | from airflow.models import DAG 4 | from airflow.operators.bash_operator import BashOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | DEFAULT_TRIGGER_RULE = "none_failed" 8 | DEFAULT_RETRIES = 2 9 | 10 | 11 | def create_dag(name, schedule, args=None): 12 | default_args = { 13 | "owner": "airflow", 14 | "catchup": False, 15 | "depends_on_past": False, 16 | "start_date": datetime(year=2021, month=7, day=1), 17 | "concurrency": 1, 18 | "retries": 1, 19 | "retry_delay": timedelta(minutes=2), 20 | "max_active_runs": 1, 21 | } 22 | args = args if args else default_args 23 | 24 | return DAG(dag_id=name, default_args=args, schedule_interval=schedule) 25 | 26 | 27 | def add_python_task( 28 | dag, name, function, kwargs=None, 29 | trigger_rule=DEFAULT_TRIGGER_RULE, retries=DEFAULT_RETRIES): 30 | 31 | return PythonOperator( 32 | task_id=name, 33 | python_callable=function, 34 | op_kwargs=kwargs, 35 | trigger_rule=trigger_rule, 36 | retries=retries, 37 | dag=dag 38 | ) 39 | 40 | 41 | def add_bash_task( 42 | dag, name, command, 43 | trigger_rule=DEFAULT_TRIGGER_RULE, retries=DEFAULT_RETRIES): 44 | 45 | return BashOperator( 46 | task_id=name, 47 | bash_command=command, 48 | trigger_rule=trigger_rule, 49 | retries=retries, 50 | dag=dag 51 | ) 52 | 53 | 54 | -------------------------------------------------------------------------------- /dags/good_dags/stock_retrieve/lib/stock_history.py: -------------------------------------------------------------------------------- 1 | import io 2 | from time import sleep 3 | 4 | import pandas as pd 5 | import requests 6 | from good_dags.stock_retrieve.lib.epoch_manipulate import EpochManipulate 7 | 8 | 9 | class StockHistory(object): 10 | def __init__(self): 11 | self.epoch = EpochManipulate() 12 | 13 | def stock_retrieve( 14 | self, stock, period1=None, period2=None, interval="1d", events="history", **kwargs 15 | ): 16 | period1 = self.epoch._get_previous_epoch_days(7) if not period1 else period1 17 | period2 = self.epoch._get_current_epoch() if not period2 else period2 18 | 19 | print( 20 | f"[DataRetriever] Getting data for: stock {stock}, {period1} to {period2}, interval: {interval}, events: {events}..." 21 | ) 22 | 23 | stock = stock.strip() 24 | url = f"https://query1.finance.yahoo.com/v7/finance/download/{stock}?period1={period1}&period2={period2}&interval={interval}&events={events}" 25 | response = requests.get(url) 26 | 27 | if not response.ok: 28 | # Retry one time due to page not found 29 | print("[DataRetriever] Failure occurred, waiting 10 seconds then retrying once...") 30 | sleep(10) 31 | 32 | response = requests.get(url) 33 | if not response.ok: 34 | raise Exception(f"[DataRetriever] Yahoo request error.
Response: {response.text}") 35 | 36 | data = response.content.decode("utf8") 37 | df_history = pd.read_csv(io.StringIO(data)) 38 | print(df_history) 39 | 40 | 41 | if __name__ == '__main__': 42 | StockHistory().stock_retrieve('GOOGL') 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 91 | #Pipfile.lock 92 | 93 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 94 | __pypackages__/ 95 | 96 | # Celery stuff and Airflow 97 | celerybeat-schedule 98 | celerybeat.pid 99 | airflow-data/ 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | .vscode/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /dags/bad_dags/dag2.py: -------------------------------------------------------------------------------- 1 | # import io 2 | # from datetime import datetime, time, timedelta 3 | # from time import sleep 4 | 5 | # import pandas as pd 6 | # import requests 7 | # from airflow import DAG 8 | # from airflow.operators.bash_operator import BashOperator 9 | # from airflow.operators.python_operator import PythonOperator 10 | 11 | 12 | # def _get_current_epoch(): 13 | # current = datetime.combine(datetime.today(), time.min) 14 | # return int(current.timestamp()) 15 | 16 | 17 | # def _get_previous_epoch_days(days): 18 | # current_datetime = datetime.combine(datetime.today(), time.min) 19 | # prev_datetime = current_datetime - timedelta(days=days) 20 | # return int(prev_datetime.timestamp()) 21 | 22 | 23 | # def stock_retrieve(stock, period1=None, period2=None, interval="1d", events="history", **kwargs): 24 | # period1 = _get_previous_epoch_days(7) if not period1 else period1 25 | # period2 = _get_current_epoch() if not period2 else period2 26 | 27 | # print( 28 | # f"[DataRetriever] Getting data for: stock {stock}, {period1} to {period2}, interval: {interval}, events: {events}..." 29 | # ) 30 | 31 | # stock = stock.strip() 32 | # url = f"https://query1.finance.yahoo.com/v7/finance/download/{stock}?period1={period1}&period2={period2}&interval={interval}&events={events}" 33 | # response = requests.get(url) 34 | 35 | # if not response.ok: 36 | # # Retry one time due to page not found 37 | # print("[DataRetriever] Failure occurred, waiting 10 seconds then retrying once...") 38 | # sleep(10) 39 | 40 | # response = requests.get(url) 41 | # if not response.ok: 42 | # raise Exception(f"[DataRetriever] Yahoo request error. 
Response: {response.text}") 43 | 44 | # data = response.content.decode("utf8") 45 | # df_history = pd.read_csv(io.StringIO(data)) 46 | # print(df_history) 47 | 48 | 49 | # # Default parameters 50 | # default_args = { 51 | # "owner": "airflow", 52 | # "depends_on_past": False, 53 | # "start_date": datetime(2021, 7, 5), 54 | # "retries": 1, 55 | # "retry_delay": timedelta(minutes=2), 56 | # } 57 | 58 | 59 | # # Declare dag, tasks and dependencies 60 | # schedule = "0 6 * * *" 61 | # dag = DAG("stock_retrieve", default_args=default_args, schedule_interval=schedule) 62 | # t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag) 63 | # t2 = BashOperator(task_id="sleep", bash_command="sleep 1", dag=dag) 64 | # t3 = PythonOperator( 65 | # task_id="stock_retrieve", python_callable=stock_retrieve, op_kwargs={"stock": "GOOGL"}, dag=dag 66 | # ) 67 | # t1 >> t2 >> t3 68 | -------------------------------------------------------------------------------- /grafana/volume/dashboards/airflow_metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 1, 19 | "links": [], 20 | "panels": [ 21 | { 22 | "aliasColors": {}, 23 | "bars": false, 24 | "dashLength": 10, 25 | "dashes": false, 26 | "datasource": "Prometheus", 27 | "fieldConfig": { 28 | "defaults": { 29 | "custom": {} 30 | }, 31 | "overrides": [] 32 | }, 33 | "fill": 1, 34 | "fillGradient": 0, 35 | "gridPos": { 36 | "h": 9, 37 | "w": 12, 38 | "x": 0, 39 | "y": 0 40 | }, 41 | "hiddenSeries": false, 42 | "id": 2, 43 | "legend": { 44 | "avg": false, 45 | "current": false, 46 | "max": false, 47 | "min": false, 48 | "show": true, 49 | "total": false, 50 | "values": false 51 | }, 52 | "lines": true, 53 | "linewidth": 1, 54 | "nullPointMode": "null", 55 | "percentage": false, 56 | "pluginVersion": "7.1.5", 57 | "pointradius": 2, 58 | "points": false, 59 | "renderer": "flot", 60 | "seriesOverrides": [], 61 | "spaceLength": 10, 62 | "stack": false, 63 | "steppedLine": false, 64 | "targets": [ 65 | { 66 | "expr": "airflow_executor_queued_tasks", 67 | "interval": "", 68 | "legendFormat": "Airflow Queued Tasks", 69 | "refId": "A" 70 | }, 71 | { 72 | "expr": "airflow_executor_running_tasks", 73 | "interval": "", 74 | "legendFormat": "Airflow Running Tasks", 75 | "refId": "B" 76 | } 77 | ], 78 | "thresholds": [], 79 | "timeFrom": null, 80 | "timeRegions": [], 81 | "timeShift": null, 82 | "title": "Airflow Running and Queued Tasks", 83 | "tooltip": { 84 | "shared": true, 85 | "sort": 0, 86 | "value_type": "individual" 87 | }, 88 | "type": "graph", 89 | "xaxis": { 90 | "buckets": null, 91 | "mode": "time", 92 | "name": null, 93 | "show": true, 94 | "values": [] 95 | }, 96 | "yaxes": [ 97 | { 98 | "format": "short", 99 | "label": null, 100 | "logBase": 1, 101 | "max": null, 102 | "min": null, 103 | "show": true 104 | }, 105 | { 106 | "format": "short", 107 | "label": null, 108 | "logBase": 1, 109 | "max": null, 110 | "min": null, 111 | "show": true 112 | } 113 | ], 114 | "yaxis": { 115 | "align": false, 116 | "alignLevel": null 117 | } 118 | } 119 | ], 120 | "schemaVersion": 26, 121 | "style": "dark", 122 | "tags": [], 123 | "templating": { 124 | "list": [] 125 | }, 126 | "time": { 127 
| "from": "now-6h", 128 | "to": "now" 129 | }, 130 | "timepicker": {}, 131 | "timezone": "", 132 | "title": "Airflow Metrics", 133 | "uid": "Ii6amwOGk", 134 | "version": 1 135 | } -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | x-airflow-common: 3 | &airflow-common 4 | image: apache/airflow:2.1.0 5 | environment: 6 | - LOAD_EX=n 7 | - POSTGRES_USER=airflow 8 | - POSTGRES_PASSWORD=airflow 9 | - POSTGRES_DB=airflow 10 | - REDIS_HOST=redis 11 | - REDIS_PASSWORD=redis 12 | - AIRFLOW__SCHEDULER__STATSD_ON=True 13 | - AIRFLOW__SCHEDULER__STATSD_HOST=statsd-exporter 14 | - AIRFLOW__SCHEDULER__STATSD_PORT=8125 15 | - AIRFLOW__SCHEDULER__STATSD_PREFIX=airflow 16 | - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql://airflow:airflow@postgres/airflow 17 | - AIRFLOW__CELERY__BROKER_URL=redis://:redis@redis:6379/1 18 | - AIRFLOW__CELERY__RESULT_BACKEND=db+postgresql://airflow:airflow@postgres/airflow 19 | - AIRFLOW__CORE__FERNET_KEY=pMrhjIcqUNHMYRk_ZOBmMptWR6o1DahCXCKn5lEMpzM= 20 | - AIRFLOW__CORE__EXECUTOR=CeleryExecutor 21 | - AIRFLOW_HOME=/opt/airflow 22 | - AIRFLOW_CONFIG=/opt/airflow/airflow.cfg 23 | - AIRFLOW__CORE__LOAD_EXAMPLES=False 24 | - AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False 25 | - AIRFLOW__WEBSERVER__WORKERS=2 26 | - AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL=1800 27 | volumes: 28 | - ./dags:/opt/airflow/dags/ 29 | - ./airflow.cfg:/opt/airflow/airflow.cfg 30 | - ./airflow-data/logs:/opt/airflow/logs 31 | - ./airflow-data/plugins:/opt/airflow/plugins 32 | 33 | services: 34 | airflow-init: 35 | << : *airflow-common 36 | container_name: airflow-init 37 | entrypoint: /bin/bash 38 | command: 39 | - -c 40 | - airflow users list || ( airflow db init && 41 | airflow users create 42 | --role Admin 43 | --username admin 44 | --password password 45 | --email airflow@airflow.com 46 | --firstname first 47 | --lastname last ) 48 | restart: on-failure 49 | 50 | redis: 51 | container_name: airflow-redis 52 | image: 'redis:5.0.5' 53 | depends_on: 54 | - statsd-exporter 55 | command: redis-server --requirepass redis 56 | ports: 57 | - 6380:6379 58 | 59 | postgres: 60 | container_name: airflow-postgres 61 | image: postgres:9.6 62 | depends_on: 63 | - statsd-exporter 64 | environment: 65 | - POSTGRES_USER=airflow 66 | - POSTGRES_PASSWORD=airflow 67 | - POSTGRES_DB=airflow 68 | 69 | webserver: 70 | << : *airflow-common 71 | container_name: airflow-webserver 72 | restart: always 73 | depends_on: 74 | - postgres 75 | - redis 76 | - statsd-exporter 77 | - airflow-init 78 | ports: 79 | - "8080:8080" 80 | command: airflow webserver 81 | healthcheck: 82 | test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-webserver.pid ]"] 83 | interval: 30s 84 | timeout: 30s 85 | retries: 3 86 | 87 | flower: 88 | << : *airflow-common 89 | container_name: airflow-flower 90 | restart: always 91 | depends_on: 92 | - redis 93 | - statsd-exporter 94 | - airflow-init 95 | ports: 96 | - "5555:5555" 97 | command: airflow celery flower 98 | 99 | scheduler: 100 | << : *airflow-common 101 | container_name: airflow-scheduler 102 | restart: always 103 | depends_on: 104 | - postgres 105 | - webserver 106 | - airflow-init 107 | command: airflow scheduler 108 | 109 | worker: 110 | << : *airflow-common 111 | container_name: airflow-worker 112 | restart: always 113 | sysctls: 114 | - net.ipv4.tcp_keepalive_time=200 115 | - net.ipv4.tcp_keepalive_intvl=200 116 | - net.ipv4.tcp_keepalive_probes=5 117 
| depends_on: 118 | - redis 119 | - scheduler 120 | - airflow-init 121 | ports: 122 | - "8793" 123 | command: airflow celery worker 124 | 125 | statsd-exporter: 126 | image: prom/statsd-exporter 127 | container_name: airflow-statsd-exporter 128 | command: "--statsd.listen-udp=:8125 --web.listen-address=:9102" 129 | ports: 130 | - 9123:9102 131 | - 8125:8125/udp 132 | 133 | prometheus: 134 | image: prom/prometheus 135 | container_name: airflow-prometheus 136 | user: "0" 137 | ports: 138 | - 9090:9090 139 | volumes: 140 | - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml 141 | - ./prometheus/volume:/prometheus 142 | 143 | grafana: 144 | image: grafana/grafana:7.1.5 145 | container_name: airflow-grafana 146 | environment: 147 | GF_SECURITY_ADMIN_USER: admin 148 | GF_SECURITY_ADMIN_PASSWORD: password 149 | GF_PATHS_PROVISIONING: /grafana/provisioning 150 | ports: 151 | - 3000:3000 152 | volumes: 153 | - ./grafana/volume/data:/grafana 154 | - ./grafana/volume/datasources:/grafana/datasources 155 | - ./grafana/volume/dashboards:/grafana/dashboards 156 | - ./grafana/volume/provisioning:/grafana/provisioning 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | This repository is one deployment of Airflow, a data orchestration tool, from the official `apache/airflow` image in Docker, with reporting dashboards set up in Grafana. To make Airflow metrics usable in Grafana, the following tools are also used: StatsD, StatsD-Exporter, and Prometheus. The setup is described below. The associated blog post is [here](https://towardsdatascience.com/airflow-in-docker-metrics-reporting-83ad017a24eb). 4 | 5 | Additionally, this repository showcases an example of a scalable DAG folder and code architecture, discussed in my Airflow Summit 2021 talk, `Writing Dry Code in Airflow`. 6 | 7 | Now, you may ask: what's the motivation for monitoring Airflow with external resources instead of using the UI? I answer a question with a question: how would you be notified if the scheduler queue filled up and no tasks were executing, or the scheduler went down? Without this architecture, notifications of that kind would not be intuitive to set up. 8 | 9 | # Architecture: Monitoring 10 | 11 | All of the services are run as separate containers, with Airflow itself using multiple containers (described below). 12 | 13 | ![Architecture](./documentation/airflow_docker_metrics_diagram.png) 14 | 15 | The resources are located at the following URLs: 16 | - Airflow Webserver UI: http://localhost:8080 17 | - StatsD Metrics list: http://localhost:9123/metrics 18 | - Prometheus: http://localhost:9090 19 | - Grafana: 20 | - URL: http://localhost:3000 21 | - Login: username = admin, password = password 22 | - Dashboards: http://localhost:3000/dashboards 23 | - Datasources: http://localhost:3000/datasources 24 | 25 | ## Airflow: 26 | 27 | ### Official Image 28 | 29 | The Airflow containers are built using the official `apache/airflow` Docker image, version 2.1.0 (as pinned in `docker-compose.yml`). This image contains significant updates compared to the `puckel/docker-airflow` image. This repository can be compared to the `puckel/docker-airflow` repository to see the changes needed to migrate to the official image, but the main differences are also outlined below. 30 | 31 | ### Containers 32 | 33 | There are several Airflow-related containers, including Redis, Postgres, the Webserver, and the Scheduler.
On the webserver and scheduler, a sample DAG is loaded from the `dags/` folder. The current deployment uses the CeleryExecutor; the metrics have not been tested with the KubernetesExecutor. 34 | 35 | ### StatsD 36 | 37 | Airflow emits metrics in the StatsD format automatically if certain environment variables (starting with `AIRFLOW__SCHEDULER__STATSD_`) are set. More information on StatsD can be found [here](https://github.com/statsd/statsd). 38 | 39 | ## StatsD-Exporter 40 | 41 | The StatsD-Exporter container converts Airflow's metrics from StatsD format to Prometheus format, which serves as the datasource for Grafana. More information on StatsD-Exporter can be found [here](https://github.com/prometheus/statsd_exporter). 42 | 43 | ## Prometheus 44 | 45 | Prometheus is a service commonly used for time-series data reporting. It is particularly convenient when using Grafana as a reporting UI, since Prometheus is a supported datasource. More information on Prometheus can be found [here](https://prometheus.io/). 46 | 47 | ## Grafana 48 | 49 | ### UI 50 | 51 | Grafana is a reporting UI layer that is often used to connect to non-relational datasources. In this exercise, Grafana uses Prometheus as the datasource for building dashboards. 52 | 53 | ### Provisioning 54 | 55 | The current deployment leverages provisioning, which uses code to define datasources, dashboards, and notifiers in Grafana upon startup (more information [here](https://grafana.com/docs/grafana/latest/administration/provisioning/)). The Prometheus datasource is already provisioned, as well as an `Airflow Metrics` dashboard tracking the number of running and queued tasks over time. The datasources, dashboards, and `prometheus.yml` configuration file are mounted as volumes on their respective containers. 56 | 57 | # Running the Containers 58 | 59 | ## Requirements: 60 | - Docker 61 | - docker-compose 62 | - Python3 63 | - Git 64 | 65 | ## Steps 66 | 67 | The following steps are to be run in a terminal: 68 | 69 | - Clone the repository: `git clone https://github.com/sarahmk125/airflow-docker-metrics.git` 70 | - Navigate to the cloned folder: `cd airflow-docker-metrics` 71 | - Start up the containers: `docker-compose -f docker-compose.yml up -d`. (They can be stopped by replacing `up -d` with `stop` in the same command, or removed entirely with `down`.) 72 | - Note: a generic admin user is created when bringing up the containers. The username is `admin` and the password is `password`. 73 | 74 | ## The Result 75 | 76 | An `Airflow Metrics` dashboard has been provisioned in Grafana in this repository: 77 | 78 | ![Grafana dashboard](./documentation/grafana_dashboard.png) 79 | 80 | Many more useful StatsD metrics are made available, such as DAG and task duration. These can all be leveraged to create more dashboards. 81 | 82 | ## Deployment 83 | 84 | The repository has been run locally, but it can be deployed to any instance on GCP or AWS. SSHing onto the instance allows the user to pull the repository, install the requirements, and start the Docker containers. 85 | 86 | GCP Cloud Composer is a hosted Airflow deployment; however, the configuration is not exposed. Capturing StatsD metrics may not be straightforward with that approach. This repository is a guide for a self-managed Airflow deployment; other hosted options for Airflow (such as Astronomer) exist.
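
## Spot-Checking the Metrics

Before building more dashboards, it can help to confirm that metrics are actually flowing from Airflow through StatsD-Exporter into Prometheus. The snippet below is a minimal, illustrative sketch (not part of this repository): it queries the Prometheus HTTP API for the two series plotted in the provisioned dashboard, assuming the containers above are running and Prometheus is reachable at http://localhost:9090. The `latest_value` helper is a made-up name for this example; it only relies on the `requests` library, which the DAG code here already uses.

```python
# Illustrative sketch: read the executor metrics used by the provisioned
# Grafana dashboard straight from Prometheus' instant-query API.
# Assumes the docker-compose stack is up and port 9090 is mapped as above.
import requests

PROMETHEUS_QUERY_URL = "http://localhost:9090/api/v1/query"


def latest_value(metric_name):
    # An instant query returns the most recent sample for each matching series.
    response = requests.get(PROMETHEUS_QUERY_URL, params={"query": metric_name}, timeout=10)
    response.raise_for_status()
    result = response.json()["data"]["result"]
    # An empty result usually means the metric has not been emitted yet,
    # e.g. no DAG has run since the containers started.
    return float(result[0]["value"][1]) if result else None


if __name__ == "__main__":
    for metric in ("airflow_executor_queued_tasks", "airflow_executor_running_tasks"):
        print(f"{metric}: {latest_value(metric)}")
```

If both series come back empty, the raw exporter output at http://localhost:9123/metrics (listed above) is the quickest way to tell whether the gap is between Airflow and StatsD-Exporter or between StatsD-Exporter and Prometheus.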
87 | 88 | # Technical Notes 89 | 90 | ## Transitioning to the Official Airflow Image 91 | 92 | The primary steps taken to transition from the puckel/docker-airflow to the apache/airflow image are: 93 | - The `airflow initdb` to initialize Airflow's backend database is run as a command when bringing up the webserver container, declared in the `docker-compose.yml` file. 94 | - If using the CeleryExecutor, variables needed should be defined as ENV variables in the `docker-compose.yml` file. 95 | 96 | ## Future Improvements 97 | 98 | Ways to improve the current architecture include: 99 | - Not running a `sleep` command in the scheduler to wait for the `db init` command in the `webserver` to complete. 100 | - Bugfixes: sometimes DAGs don't update in the UI, and don't show any errors. Cause is unknown. 101 | 102 | Have suggestions? I love talking about data stacks. Shoot me a message [here](https://www.linkedin.com/in/sarah-krasnik/). 103 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2020 Sarah Krasnik 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /airflow.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | 20 | # This is the template for Airflow's default configuration. When Airflow is 21 | # imported, it looks for a configuration file at $AIRFLOW_HOME/airflow.cfg. If 22 | # it doesn't exist, Airflow uses this template to generate it by replacing 23 | # variables in curly braces with their global values from configuration.py. 24 | 25 | # Users should not modify this file; they should customize the generated 26 | # airflow.cfg instead. 27 | 28 | 29 | # ----------------------- TEMPLATE BEGINS HERE ----------------------- 30 | 31 | # REFERENCE: https://github.com/marclamberti/docker-airflow/blob/main/airflow-data/airflow.cfg 32 | 33 | [core] 34 | # The folder where your airflow pipelines live, most likely a 35 | # subfolder in a code repository. This path must be absolute. 36 | dags_folder = /opt/airflow/dags 37 | 38 | # Hostname by providing a path to a callable, which will resolve the hostname. 39 | # The format is "package.function". 40 | # 41 | # For example, default value "socket.getfqdn" means that result from getfqdn() of "socket" 42 | # package will be used as hostname. 43 | # 44 | # No argument should be required in the function specified. 45 | # If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address`` 46 | hostname_callable = socket.getfqdn 47 | 48 | # Default timezone in case supplied date times are naive 49 | # can be utc (default), system, or any IANA timezone string (e.g. Europe/Amsterdam) 50 | default_timezone = utc 51 | 52 | # The executor class that airflow should use. Choices include 53 | # ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``, ``DaskExecutor``, 54 | # ``KubernetesExecutor``, ``CeleryKubernetesExecutor`` or the 55 | # full import path to the class when using a custom executor. 56 | executor = CeleryExecutor 57 | 58 | # The SqlAlchemy connection string to the metadata database. 59 | # SqlAlchemy supports many different database engine, more information 60 | # their website 61 | # sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres:5432/airflow 62 | 63 | # The encoding for the databases 64 | sql_engine_encoding = utf-8 65 | 66 | # Collation for ``dag_id``, ``task_id``, ``key`` columns in case they have different encoding. 67 | # This is particularly useful in case of mysql with utf8mb4 encoding because 68 | # primary keys for XCom table has too big size and ``sql_engine_collation_for_ids`` should 69 | # be set to ``utf8mb3_general_ci``. 70 | # sql_engine_collation_for_ids = 71 | 72 | # If SqlAlchemy should pool database connections. 73 | sql_alchemy_pool_enabled = True 74 | 75 | # The SqlAlchemy pool size is the maximum number of database connections 76 | # in the pool. 0 indicates no limit. 77 | sql_alchemy_pool_size = 5 78 | 79 | # The maximum overflow size of the pool. 80 | # When the number of checked-out connections reaches the size set in pool_size, 81 | # additional connections will be returned up to this limit. 82 | # When those additional connections are returned to the pool, they are disconnected and discarded. 
83 | # It follows then that the total number of simultaneous connections the pool will allow 84 | # is pool_size + max_overflow, 85 | # and the total number of "sleeping" connections the pool will allow is pool_size. 86 | # max_overflow can be set to ``-1`` to indicate no overflow limit; 87 | # no limit will be placed on the total number of concurrent connections. Defaults to ``10``. 88 | sql_alchemy_max_overflow = 10 89 | 90 | # The SqlAlchemy pool recycle is the number of seconds a connection 91 | # can be idle in the pool before it is invalidated. This config does 92 | # not apply to sqlite. If the number of DB connections is ever exceeded, 93 | # a lower config value will allow the system to recover faster. 94 | sql_alchemy_pool_recycle = 1800 95 | 96 | # Check connection at the start of each connection pool checkout. 97 | # Typically, this is a simple statement like "SELECT 1". 98 | # More information here: 99 | # https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic 100 | sql_alchemy_pool_pre_ping = True 101 | 102 | # The schema to use for the metadata database. 103 | # SqlAlchemy supports databases with the concept of multiple schemas. 104 | sql_alchemy_schema = 105 | 106 | # Import path for connect args in SqlAlchemy. Defaults to an empty dict. 107 | # This is useful when you want to configure db engine args that SqlAlchemy won't parse 108 | # in connection string. 109 | # See https://docs.sqlalchemy.org/en/13/core/engines.html#sqlalchemy.create_engine.params.connect_args 110 | # sql_alchemy_connect_args = 111 | 112 | # The amount of parallelism as a setting to the executor. This defines 113 | # the max number of task instances that should run simultaneously 114 | # on this airflow installation 115 | parallelism = 32 116 | 117 | # The number of task instances allowed to run concurrently by the scheduler 118 | # in one DAG. Can be overridden by ``concurrency`` on DAG level. 119 | dag_concurrency = 16 120 | 121 | # Are DAGs paused by default at creation 122 | dags_are_paused_at_creation = True 123 | 124 | # The maximum number of active DAG runs per DAG 125 | max_active_runs_per_dag = 16 126 | 127 | # Whether to load the DAG examples that ship with Airflow. It's good to 128 | # get started, but you probably want to set this to ``False`` in a production 129 | # environment 130 | load_examples = True 131 | 132 | # Whether to load the default connections that ship with Airflow. 
It's good to 133 | # get started, but you probably want to set this to ``False`` in a production 134 | # environment 135 | load_default_connections = True 136 | 137 | # Path to the folder containing Airflow plugins 138 | plugins_folder = /opt/airflow/plugins 139 | 140 | # Should tasks be executed via forking of the parent process ("False", 141 | # the speedier option) or by spawning a new python process ("True" slow, 142 | # but means plugin changes picked up by tasks straight away) 143 | execute_tasks_new_python_interpreter = False 144 | 145 | # Secret key to save connection passwords in the db 146 | fernet_key = $FERNET_KEY 147 | 148 | # Whether to disable pickling dags 149 | donot_pickle = True 150 | 151 | # How long before timing out a python file import 152 | dagbag_import_timeout = 30.0 153 | 154 | # Should a traceback be shown in the UI for dagbag import errors, 155 | # instead of just the exception message 156 | dagbag_import_error_tracebacks = True 157 | 158 | # If tracebacks are shown, how many entries from the traceback should be shown 159 | dagbag_import_error_traceback_depth = 2 160 | 161 | # How long before timing out a DagFileProcessor, which processes a dag file 162 | dag_file_processor_timeout = 50 163 | 164 | # The class to use for running task instances in a subprocess. 165 | # Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class 166 | # when using a custom task runner. 167 | task_runner = StandardTaskRunner 168 | 169 | # If set, tasks without a ``run_as_user`` argument will be run with this user 170 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 171 | default_impersonation = 172 | 173 | # What security module to use (for example kerberos) 174 | security = 175 | 176 | # Turn unit test mode on (overwrites many configuration options with test 177 | # values at runtime) 178 | unit_test_mode = False 179 | 180 | # Whether to enable pickling for xcom (note that this is insecure and allows for 181 | # RCE exploits). 182 | enable_xcom_pickling = False 183 | 184 | # When a task is killed forcefully, this is the amount of time in seconds that 185 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED 186 | killed_task_cleanup_time = 60 187 | 188 | # Whether to override params with dag_run.conf. If you pass some key-value pairs 189 | # through ``airflow dags backfill -c`` or 190 | # ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. 191 | dag_run_conf_overrides_params = True 192 | 193 | # When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``. 194 | dag_discovery_safe_mode = True 195 | 196 | # The number of retries each task is going to have by default. Can be overridden at dag or task level. 197 | default_task_retries = 0 198 | 199 | # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. 200 | min_serialized_dag_update_interval = 30 201 | 202 | # Fetching serialized DAG can not be faster than a minimum interval to reduce database 203 | # read rate. This config controls when your DAGs are updated in the Webserver 204 | min_serialized_dag_fetch_interval = 10 205 | 206 | # Whether to persist DAG files code in DB. 207 | # If set to True, Webserver reads file contents from DB instead of 208 | # trying to access files in a DAG folder. 
209 | # Example: store_dag_code = False 210 | # store_dag_code = 211 | 212 | # Maximum number of Rendered Task Instance Fields (Template Fields) per task to store 213 | # in the Database. 214 | # All the template_fields for each of Task Instance are stored in the Database. 215 | # Keeping this number small may cause an error when you try to view ``Rendered`` tab in 216 | # TaskInstance view for older tasks. 217 | max_num_rendered_ti_fields_per_task = 30 218 | 219 | # On each dagrun check against defined SLAs 220 | check_slas = True 221 | 222 | # Path to custom XCom class that will be used to store and resolve operators results 223 | # Example: xcom_backend = path.to.CustomXCom 224 | xcom_backend = airflow.models.xcom.BaseXCom 225 | 226 | # By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``, 227 | # if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module. 228 | lazy_load_plugins = True 229 | 230 | # By default Airflow providers are lazily-discovered (discovery and imports happen only when required). 231 | # Set it to False, if you want to discover providers whenever 'airflow' is invoked via cli or 232 | # loaded from module. 233 | lazy_discover_providers = True 234 | 235 | # Number of times the code should be retried in case of DB Operational Errors. 236 | # Not all transactions will be retried as it can cause undesired state. 237 | # Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``. 238 | max_db_retries = 3 239 | 240 | [logging] 241 | # The folder where airflow should store its log files 242 | # This path must be absolute 243 | base_log_folder = /opt/airflow/logs 244 | 245 | # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. 246 | # Set this to True if you want to enable remote logging. 247 | remote_logging = False 248 | 249 | # Users must supply an Airflow connection id that provides access to the storage 250 | # location. 251 | remote_log_conn_id = 252 | 253 | # Path to Google Credential JSON file. If omitted, authorization based on `the Application Default 254 | # Credentials 255 | # `__ will 256 | # be used. 257 | google_key_path = 258 | 259 | # Storage bucket URL for remote logging 260 | # S3 buckets should start with "s3://" 261 | # Cloudwatch log groups should start with "cloudwatch://" 262 | # GCS buckets should start with "gs://" 263 | # WASB buckets should start with "wasb" just to help Airflow select correct handler 264 | # Stackdriver logs should start with "stackdriver://" 265 | remote_base_log_folder = 266 | 267 | # Use server-side encryption for logs stored in S3 268 | encrypt_s3_logs = False 269 | 270 | # Logging level 271 | logging_level = INFO 272 | 273 | # Logging level for Flask-appbuilder UI 274 | fab_logging_level = WARN 275 | 276 | # Logging class 277 | # Specify the class that will specify the logging configuration 278 | # This class has to be on the python classpath 279 | # Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG 280 | logging_config_class = 281 | 282 | # Flag to enable/disable Colored logs in Console 283 | # Colour the logs when the controlling terminal is a TTY. 
284 | colored_console_log = True 285 | 286 | # Log format for when Colored logs is enabled 287 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s 288 | colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter 289 | 290 | # Format of Log line 291 | log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s 292 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s 293 | 294 | # Specify prefix pattern like mentioned below with stream handler TaskHandlerWithCustomFormatter 295 | # Example: task_log_prefix_template = {{ti.dag_id}}-{{ti.task_id}}-{{execution_date}}-{{try_number}} 296 | task_log_prefix_template = 297 | 298 | # Formatting for how airflow generates file names/paths for each task run. 299 | log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log 300 | 301 | # Formatting for how airflow generates file names for log 302 | log_processor_filename_template = {{ filename }}.log 303 | 304 | # full path of dag_processor_manager logfile 305 | dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log 306 | 307 | # Name of handler to read task instance logs. 308 | # Defaults to use ``task`` handler. 309 | task_log_reader = task 310 | 311 | # A comma\-separated list of third-party logger names that will be configured to print messages to 312 | # consoles\. 313 | # Example: extra_loggers = connexion,sqlalchemy 314 | extra_loggers = 315 | 316 | [metrics] 317 | 318 | # StatsD (https://github.com/etsy/statsd) integration settings. 319 | # Enables sending metrics to StatsD. 320 | statsd_on = True 321 | statsd_host = localhost 322 | statsd_port = 8125 323 | statsd_prefix = airflow 324 | 325 | # If you want to avoid sending all the available metrics to StatsD, 326 | # you can configure an allow list of prefixes (comma separated) to send only the metrics that 327 | # start with the elements of the list (e.g: "scheduler,executor,dagrun") 328 | statsd_allow_list = 329 | 330 | # A function that validate the statsd stat name, apply changes to the stat name if necessary and return 331 | # the transformed stat name. 332 | # 333 | # The function should have the following signature: 334 | # def func_name(stat_name: str) -> str: 335 | stat_name_handler = 336 | 337 | # To enable datadog integration to send airflow metrics. 338 | statsd_datadog_enabled = False 339 | 340 | # List of datadog tags attached to all metrics(e.g: key1:value1,key2:value2) 341 | statsd_datadog_tags = 342 | 343 | # If you want to utilise your own custom Statsd client set the relevant 344 | # module path below. 345 | # Note: The module path must exist on your PYTHONPATH for Airflow to pick it up 346 | # statsd_custom_client_path = 347 | 348 | [secrets] 349 | # Full class name of secrets backend to enable (will precede env vars and metastore in search path) 350 | # Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend 351 | backend = 352 | 353 | # The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. 354 | # See documentation for the secrets backend you are using. JSON is expected. 
355 | # Example for AWS Systems Manager ParameterStore: 356 | # ``{{"connections_prefix": "/airflow/connections", "profile_name": "default"}}`` 357 | backend_kwargs = 358 | 359 | [cli] 360 | # In what way should the cli access the API. The LocalClient will use the 361 | # database directly, while the json_client will use the api running on the 362 | # webserver 363 | api_client = airflow.api.client.local_client 364 | 365 | # If you set web_server_url_prefix, do NOT forget to append it here, ex: 366 | # ``endpoint_url = http://localhost:8080/myroot`` 367 | # So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` 368 | endpoint_url = http://localhost:8080 369 | 370 | [debug] 371 | # Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first 372 | # failed task. Helpful for debugging purposes. 373 | fail_fast = False 374 | 375 | [api] 376 | # Enables the deprecated experimental API. Please note that these APIs do not have access control. 377 | # The authenticated user has full access. 378 | # 379 | # .. warning:: 380 | # 381 | # This Experimental REST API is 382 | # deprecated since version 2.0. Please consider using 383 | # the Stable REST API. 384 | # For more information on migration, see 385 | # UPDATING.md 386 | enable_experimental_api = False 387 | 388 | # How to authenticate users of the API. See 389 | # https://airflow.apache.org/docs/stable/security.html for possible values. 390 | # ("airflow.api.auth.backend.default" allows all requests for historic reasons) 391 | auth_backend = airflow.api.auth.backend.deny_all 392 | 393 | # Used to set the maximum page limit for API requests 394 | maximum_page_limit = 100 395 | 396 | # Used to set the default page limit when limit is zero. A default limit 397 | # of 100 is set on OpenApi spec. However, this particular default limit 398 | # only works when the limit is set to zero (0) in API requests. 399 | # If no limit is supplied, the OpenApi spec default is used. 400 | fallback_page_limit = 100 401 | 402 | # The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested. 403 | # Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com 404 | google_oauth2_audience = 405 | 406 | # Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on 407 | # the Application Default Credentials 408 | # will 409 | # be used. 410 | # Example: google_key_path = /files/service-account-json 411 | google_key_path = 412 | 413 | [lineage] 414 | # What lineage backend to use 415 | backend = 416 | 417 | [atlas] 418 | sasl_enabled = False 419 | host = 420 | port = 21000 421 | username = 422 | password = 423 | 424 | [operators] 425 | # The default owner assigned to each new operator, unless 426 | # provided explicitly or passed via ``default_args`` 427 | default_owner = airflow 428 | default_cpus = 1 429 | default_ram = 512 430 | default_disk = 512 431 | default_gpus = 0 432 | 433 | # Whether to allow passing additional/unused arguments (args, kwargs) to the BaseOperator. 434 | # If set to False, an exception will be thrown; otherwise only a console message will be displayed.
435 | allow_illegal_arguments = False 436 | 437 | [hive] 438 | # Default mapreduce queue for HiveOperator tasks 439 | default_hive_mapred_queue = 440 | 441 | # Template for mapred_job_name in HiveOperator, supports the following named parameters 442 | # hostname, dag_id, task_id, execution_date 443 | # mapred_job_name_template = 444 | 445 | [webserver] 446 | rbac = True 447 | 448 | # The base url of your website as airflow cannot guess what domain or 449 | # cname you are using. This is used in automated emails that 450 | # airflow sends to point links to the right web server 451 | base_url = http://localhost:8080 452 | 453 | # Default timezone to display all dates in the UI, can be UTC, system, or 454 | # any IANA timezone string (e.g. Europe/Amsterdam). If left empty the 455 | # default value of core/default_timezone will be used 456 | # Example: default_ui_timezone = America/New_York 457 | default_ui_timezone = UTC 458 | 459 | # The ip specified when starting the web server 460 | web_server_host = 0.0.0.0 461 | 462 | # The port on which to run the web server 463 | web_server_port = 8080 464 | 465 | # Paths to the SSL certificate and key for the web server. When both are 466 | # provided SSL will be enabled. This does not change the web server port. 467 | web_server_ssl_cert = 468 | 469 | # Paths to the SSL certificate and key for the web server. When both are 470 | # provided SSL will be enabled. This does not change the web server port. 471 | web_server_ssl_key = 472 | 473 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 474 | web_server_master_timeout = 120 475 | 476 | # Number of seconds the gunicorn webserver waits before timing out on a worker 477 | web_server_worker_timeout = 120 478 | 479 | # Number of workers to refresh at a time. When set to 0, worker refresh is 480 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 481 | # bringing up new ones and killing old ones. 482 | worker_refresh_batch_size = 1 483 | 484 | # Number of seconds to wait before refreshing a batch of workers. 485 | worker_refresh_interval = 30 486 | 487 | # If set to True, Airflow will track files in plugins_folder directory. When it detects changes, 488 | # then reload the gunicorn. 489 | reload_on_plugin_change = False 490 | 491 | # Secret key used to run your flask app 492 | # It should be as random as possible 493 | secret_key = asdfalkubnasdf 494 | 495 | # Number of workers to run the Gunicorn web server 496 | workers = 4 497 | 498 | # The worker class gunicorn should use. Choices include 499 | # sync (default), eventlet, gevent 500 | worker_class = sync 501 | 502 | # Log files for the gunicorn webserver. '-' means log to stderr. 503 | access_logfile = - 504 | 505 | # Log files for the gunicorn webserver. '-' means log to stderr. 506 | error_logfile = - 507 | 508 | # Access log format for gunicorn webserver. 509 | # default format is %%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s" 510 | # documentation - https://docs.gunicorn.org/en/stable/settings.html#access-log-format 511 | access_logformat = 512 | 513 | # Expose the configuration file in the web server 514 | expose_config = False 515 | 516 | # Expose hostname in the web server 517 | expose_hostname = True 518 | 519 | # Expose stacktrace in the web server 520 | expose_stacktrace = True 521 | 522 | # Default DAG view. 
Valid values are: ``tree``, ``graph``, ``duration``, ``gantt``, ``landing_times`` 523 | dag_default_view = tree 524 | 525 | # Default DAG orientation. Valid values are: 526 | # ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top) 527 | dag_orientation = LR 528 | 529 | # Puts the webserver in demonstration mode; blurs the names of Operators for 530 | # privacy. 531 | demo_mode = False 532 | 533 | # The amount of time (in secs) webserver will wait for initial handshake 534 | # while fetching logs from other worker machine 535 | log_fetch_timeout_sec = 5 536 | 537 | # Time interval (in secs) to wait before next log fetching. 538 | log_fetch_delay_sec = 2 539 | 540 | # Distance away from page bottom to enable auto tailing. 541 | log_auto_tailing_offset = 30 542 | 543 | # Animation speed for auto tailing log display. 544 | log_animation_speed = 1000 545 | 546 | # By default, the webserver shows paused DAGs. Flip this to hide paused 547 | # DAGs by default 548 | hide_paused_dags_by_default = False 549 | 550 | # Consistent page size across all listing views in the UI 551 | page_size = 100 552 | 553 | # Define the color of navigation bar 554 | navbar_color = #fff 555 | 556 | # Default dagrun to show in UI 557 | default_dag_run_display_number = 25 558 | 559 | # Enable werkzeug ``ProxyFix`` middleware for reverse proxy 560 | enable_proxy_fix = False 561 | 562 | # Number of values to trust for ``X-Forwarded-For``. 563 | # More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/ 564 | proxy_fix_x_for = 1 565 | 566 | # Number of values to trust for ``X-Forwarded-Proto`` 567 | proxy_fix_x_proto = 1 568 | 569 | # Number of values to trust for ``X-Forwarded-Host`` 570 | proxy_fix_x_host = 1 571 | 572 | # Number of values to trust for ``X-Forwarded-Port`` 573 | proxy_fix_x_port = 1 574 | 575 | # Number of values to trust for ``X-Forwarded-Prefix`` 576 | proxy_fix_x_prefix = 1 577 | 578 | # Set secure flag on session cookie 579 | cookie_secure = False 580 | 581 | # Set samesite policy on session cookie 582 | cookie_samesite = Lax 583 | 584 | # Default setting for wrap toggle on DAG code and TI log views. 585 | default_wrap = False 586 | 587 | # Allow the UI to be rendered in a frame 588 | x_frame_enabled = True 589 | 590 | # Send anonymous user activity to your analytics tool 591 | # choose from google_analytics, segment, or metarouter 592 | # analytics_tool = 593 | 594 | # Unique ID of your account in the analytics tool 595 | # analytics_id = 596 | 597 | # 'Recent Tasks' stats will show for old DagRuns if set 598 | show_recent_stats_for_completed_runs = True 599 | 600 | # Update FAB permissions and sync security manager roles 601 | # on webserver startup 602 | update_fab_perms = True 603 | 604 | # The UI cookie lifetime in minutes. 
User will be logged out from UI after 605 | # ``session_lifetime_minutes`` of non-activity 606 | session_lifetime_minutes = 43200 607 | 608 | [email] 609 | 610 | # Configuration email backend and whether to 611 | # send email alerts on retry or failure 612 | # Email backend to use 613 | email_backend = airflow.utils.email.send_email_smtp 614 | 615 | # Whether email alerts should be sent when a task is retried 616 | default_email_on_retry = True 617 | 618 | # Whether email alerts should be sent when a task failed 619 | default_email_on_failure = True 620 | 621 | [smtp] 622 | 623 | # If you want airflow to send emails on retries, failure, and you want to use 624 | # the airflow.utils.email.send_email_smtp function, you have to configure an 625 | # smtp server here 626 | smtp_host = localhost 627 | smtp_starttls = True 628 | smtp_ssl = False 629 | # Example: smtp_user = airflow 630 | # smtp_user = 631 | # Example: smtp_password = airflow 632 | # smtp_password = 633 | smtp_port = 25 634 | smtp_mail_from = airflow@airflow.com 635 | smtp_timeout = 30 636 | smtp_retry_limit = 5 637 | 638 | [sentry] 639 | 640 | # Sentry (https://docs.sentry.io) integration. Here you can supply 641 | # additional configuration options based on the Python platform. See: 642 | # https://docs.sentry.io/error-reporting/configuration/?platform=python. 643 | # Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``, 644 | # ``ignore_errors``, ``before_breadcrumb``, ``before_send``, ``transport``. 645 | # Enable error reporting to Sentry 646 | sentry_on = false 647 | sentry_dsn = 648 | 649 | [celery_kubernetes_executor] 650 | 651 | # This section only applies if you are using the ``CeleryKubernetesExecutor`` in 652 | # ``[core]`` section above 653 | # Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``. 654 | # When the queue of a task is the value of ``kubernetes_queue`` (default ``kubernetes``), 655 | # the task is executed via ``KubernetesExecutor``, 656 | # otherwise via ``CeleryExecutor`` 657 | kubernetes_queue = kubernetes 658 | 659 | [celery] 660 | 661 | # This section only applies if you are using the CeleryExecutor in 662 | # ``[core]`` section above 663 | # The app name that will be used by celery 664 | celery_app_name = airflow.executors.celery_executor 665 | 666 | # The concurrency that will be used when starting workers with the 667 | # ``airflow celery worker`` command. This defines the number of task instances that 668 | # a worker will take, so size up your workers based on the resources on 669 | # your worker box and the nature of your tasks 670 | worker_concurrency = 16 671 | 672 | # The maximum and minimum concurrency that will be used when starting workers with the 673 | # ``airflow celery worker`` command (always keep minimum processes, but grow 674 | # to maximum if necessary). Note the value should be max_concurrency,min_concurrency 675 | # Pick these numbers based on resources on worker box and the nature of the task. 676 | # If autoscale option is available, worker_concurrency will be ignored. 677 | # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale 678 | # Example: worker_autoscale = 16,12 679 | # worker_autoscale = 680 | 681 | # Used to increase the number of tasks that a worker prefetches which can improve performance. 682 | # The number of processes multiplied by worker_prefetch_multiplier is the number of tasks 683 | # that are prefetched by a worker. 
A value greater than 1 can result in tasks being unnecessarily 684 | # blocked if there are multiple workers and one worker prefetches tasks that sit behind long 685 | # running tasks while another worker has unutilized processes that are unable to process the already 686 | # claimed blocked tasks. 687 | # https://docs.celeryproject.org/en/stable/userguide/optimizing.html#prefetch-limits 688 | # Example: worker_prefetch_multiplier = 1 689 | # worker_prefetch_multiplier = 690 | 691 | # When you start an airflow worker, airflow starts a tiny web server 692 | # subprocess to serve the workers local log files to the airflow main 693 | # web server, which then builds pages and sends them to users. This defines 694 | # the port on which the logs are served. It needs to be unused, open, 695 | # and visible from the main web server so it can connect to the workers. 696 | worker_log_server_port = 8793 697 | 698 | # Umask that will be used when starting workers with the ``airflow celery worker`` 699 | # in daemon mode. This controls the file-creation mode mask which determines the initial 700 | # value of file permission bits for newly created files. 701 | worker_umask = 0o077 702 | 703 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally 704 | # a sqlalchemy database. Refer to the Celery documentation for more information. 705 | broker_url = redis://redis:6379/1 706 | 707 | # The Celery result_backend. When a job finishes, it needs to update the 708 | # metadata of the job. Therefore it will post a message on a message bus, 709 | # or insert it into a database (depending on the backend) 710 | # This status is used by the scheduler to update the state of the task 711 | # The use of a database is highly recommended 712 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings 713 | # result_backend = db+postgresql://airflow:IL234H53gnta1bOAr@postgres/airflow 714 | 715 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start 716 | # it ``airflow celery flower``. This defines the IP that Celery Flower runs on 717 | flower_host = 0.0.0.0 718 | 719 | # The root URL for Flower 720 | # Example: flower_url_prefix = /flower 721 | flower_url_prefix = 722 | 723 | # This defines the port that Celery Flower runs on 724 | flower_port = 5555 725 | 726 | # Securing Flower with Basic Authentication 727 | # Accepts user:password pairs separated by a comma 728 | # Example: flower_basic_auth = user1:password1,user2:password2 729 | flower_basic_auth = 730 | 731 | # Default queue that tasks get assigned to and that workers listen on. 732 | default_queue = default 733 | 734 | # How many processes CeleryExecutor uses to sync task state. 735 | # 0 means to use max(1, number of cores - 1) processes. 736 | sync_parallelism = 0 737 | 738 | # Import path for celery configuration options 739 | celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG 740 | ssl_active = False 741 | ssl_key = 742 | ssl_cert = 743 | ssl_cacert = 744 | 745 | # Celery Pool implementation. 746 | # Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``. 747 | # See: 748 | # https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency 749 | # https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html 750 | pool = prefork 751 | 752 | # The number of seconds to wait before timing out ``send_task_to_executor`` or 753 | # ``fetch_celery_task_state`` operations.
754 | operation_timeout = 1.0 755 | 756 | # Celery task will report its status as 'started' when the task is executed by a worker. 757 | # This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted 758 | # or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob. 759 | task_track_started = True 760 | 761 | # Time in seconds after which Adopted tasks are cleared by CeleryExecutor. This is helpful to clear 762 | # stalled tasks. 763 | task_adoption_timeout = 600 764 | 765 | # The maximum number of retries for publishing task messages to the broker when failing 766 | # due to ``AirflowTaskTimeout`` error before giving up and marking the task as failed. 767 | task_publish_max_retries = 3 768 | 769 | # Worker initialisation check to validate Metadata Database connection 770 | worker_precheck = False 771 | 772 | [celery_broker_transport_options] 773 | 774 | # This section is for specifying options which can be passed to the 775 | # underlying celery broker transport. See: 776 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options 777 | # The visibility timeout defines the number of seconds to wait for the worker 778 | # to acknowledge the task before the message is redelivered to another worker. 779 | # Make sure to increase the visibility timeout to match the time of the longest 780 | # ETA you're planning to use. 781 | # visibility_timeout is only supported for Redis and SQS celery brokers. 782 | # See: 783 | # http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options 784 | # Example: visibility_timeout = 21600 785 | # visibility_timeout = 786 | 787 | [dask] 788 | 789 | # This section only applies if you are using the DaskExecutor in 790 | # [core] section above 791 | # The IP address and port of the Dask cluster's scheduler. 792 | cluster_address = 127.0.0.1:8786 793 | 794 | # TLS/SSL settings to access a secured Dask scheduler. 795 | tls_ca = 796 | tls_cert = 797 | tls_key = 798 | 799 | [scheduler] 800 | # Task instances listen for external kill signal (when you clear tasks 801 | # from the CLI or the UI), this defines the frequency at which they should 802 | # listen (in seconds). 803 | job_heartbeat_sec = 5 804 | 805 | # How often (in seconds) to check and tidy up 'running' TaskInstances 806 | # that no longer have a matching DagRun 807 | clean_tis_without_dagrun_interval = 15.0 808 | 809 | # The scheduler constantly tries to trigger new tasks (look at the 810 | # scheduler section in the docs for more information). This defines 811 | # how often the scheduler should run (in seconds). 812 | scheduler_heartbeat_sec = 5 813 | 814 | # The number of times to try to schedule each DAG file 815 | # -1 indicates unlimited number 816 | num_runs = -1 817 | 818 | # The number of seconds to wait between consecutive DAG file processing 819 | processor_poll_interval = 1 820 | 821 | # After how much time (seconds) new DAGs should be picked up from the filesystem 822 | min_file_process_interval = 0 823 | 824 | # How often (in seconds) to scan the DAGs directory for new files. Defaults to 5 minutes. 825 | dag_dir_list_interval = 300 826 | 827 | # How often should stats be printed to the logs.
Setting to 0 will disable printing stats 828 | print_stats_interval = 30 829 | 830 | # How often (in seconds) should pool usage stats be sent to statsd (if statsd_on is enabled) 831 | pool_metrics_interval = 5.0 832 | 833 | # If the last scheduler heartbeat happened more than scheduler_health_check_threshold 834 | # ago (in seconds), scheduler is considered unhealthy. 835 | # This is used by the health check in the "/health" endpoint 836 | scheduler_health_check_threshold = 30 837 | 838 | # How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs 839 | orphaned_tasks_check_interval = 300.0 840 | child_process_log_directory = /opt/airflow/logs/scheduler 841 | 842 | # Local task jobs periodically heartbeat to the DB. If the job has 843 | # not heartbeat in this many seconds, the scheduler will mark the 844 | # associated task instance as failed and will re-schedule the task. 845 | scheduler_zombie_task_threshold = 300 846 | 847 | # Turn off scheduler catchup by setting this to ``False``. 848 | # Default behavior is unchanged and 849 | # Command Line Backfills still work, but the scheduler 850 | # will not do scheduler catchup if this is ``False``, 851 | # however it can be set on a per DAG basis in the 852 | # DAG definition (catchup) 853 | catchup_by_default = False 854 | 855 | # This changes the batch size of queries in the scheduling main loop. 856 | # If this is too high, SQL query performance may be impacted by one 857 | # or more of the following: 858 | # - reversion to full table scan 859 | # - complexity of query predicate 860 | # - excessive locking 861 | # Additionally, you may hit the maximum allowable query length for your db. 862 | # Set this to 0 for no limit (not advised) 863 | max_tis_per_query = 512 864 | 865 | # Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries. 866 | # If this is set to False then you should not run more than a single 867 | # scheduler at once 868 | use_row_level_locking = True 869 | 870 | # Max number of DAGs to create DagRuns for per scheduler loop 871 | # 872 | # Default: 10 873 | # max_dagruns_to_create_per_loop = 874 | 875 | # How many DagRuns should a scheduler examine (and lock) when scheduling 876 | # and queuing tasks. 877 | # 878 | # Default: 20 879 | # max_dagruns_per_loop_to_schedule = 880 | 881 | # Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the 882 | # same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other 883 | # dags in some circumstances 884 | # 885 | # Default: True 886 | # schedule_after_task_execution = 887 | 888 | # The scheduler can run multiple processes in parallel to parse dags. 889 | # This defines how many processes will run. 890 | parsing_processes = 2 891 | 892 | # Turn off scheduler use of cron intervals by setting this to False. 893 | # DAGs submitted manually in the web UI or with trigger_dag will still run. 
894 | use_job_schedule = True 895 | 896 | # Allow externally triggered DagRuns for Execution Dates in the future 897 | # Only has effect if schedule_interval is set to None in DAG 898 | allow_trigger_in_future = False 899 | 900 | [kerberos] 901 | ccache = /tmp/airflow_krb5_ccache 902 | 903 | # gets augmented with fqdn 904 | principal = airflow 905 | reinit_frequency = 3600 906 | kinit_path = kinit 907 | keytab = airflow.keytab 908 | 909 | [github_enterprise] 910 | api_rev = v3 911 | 912 | [admin] 913 | # UI to hide sensitive variable fields when set to True 914 | hide_sensitive_variable_fields = True 915 | 916 | # A comma-separated list of sensitive keywords to look for in variables names. 917 | sensitive_variable_fields = 918 | 919 | [elasticsearch] 920 | # Elasticsearch host 921 | host = 922 | 923 | # Format of the log_id, which is used to query for a given tasks logs 924 | log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number} 925 | 926 | # Used to mark the end of a log stream for a task 927 | end_of_log_mark = end_of_log 928 | 929 | # Qualified URL for an elasticsearch frontend (like Kibana) with a template argument for log_id 930 | # Code will construct log_id using the log_id template from the argument above. 931 | # NOTE: The code will prefix the https:// automatically, don't include that here. 932 | frontend = 933 | 934 | # Write the task logs to the stdout of the worker, rather than the default files 935 | write_stdout = False 936 | 937 | # Instead of the default log formatter, write the log lines as JSON 938 | json_format = False 939 | 940 | # Log fields to also attach to the json output, if enabled 941 | json_fields = asctime, filename, lineno, levelname, message 942 | 943 | [elasticsearch_configs] 944 | use_ssl = False 945 | verify_certs = True 946 | 947 | [kubernetes] 948 | # Path to the YAML pod file. If set, all other kubernetes-related fields are ignored. 949 | pod_template_file = 950 | 951 | # The repository of the Kubernetes Image for the Worker to Run 952 | worker_container_repository = 953 | 954 | # The tag of the Kubernetes Image for the Worker to Run 955 | worker_container_tag = 956 | 957 | # The Kubernetes namespace where airflow workers should be created. Defaults to ``default`` 958 | namespace = default 959 | 960 | # If True, all worker pods will be deleted upon termination 961 | delete_worker_pods = True 962 | 963 | # If False (and delete_worker_pods is True), 964 | # failed worker pods will not be deleted so users can investigate them. 965 | delete_worker_pods_on_failure = False 966 | 967 | # Number of Kubernetes Worker Pod creation calls per scheduler loop. 968 | # Note that the current default of "1" will only launch a single pod 969 | # per-heartbeat. It is HIGHLY recommended that users increase this 970 | # number to match the tolerance of their kubernetes cluster for 971 | # better performance. 972 | worker_pods_creation_batch_size = 1 973 | 974 | # Allows users to launch pods in multiple namespaces. 975 | # Will require creating a cluster-role for the scheduler 976 | multi_namespace_mode = False 977 | 978 | # Use the service account kubernetes gives to pods to connect to kubernetes cluster. 979 | # It's intended for clients that expect to be running inside a pod running on kubernetes. 980 | # It will raise an exception if called from a process not running in a kubernetes environment. 981 | in_cluster = True 982 | 983 | # When running with in_cluster=False change the default cluster_context or config_file 984 | # options to Kubernetes client. 
Leave blank these to use default behaviour like ``kubectl`` has. 985 | # cluster_context = 986 | 987 | # Path to the kubernetes configfile to be used when ``in_cluster`` is set to False 988 | # config_file = 989 | 990 | # Keyword parameters to pass while calling a kubernetes client core_v1_api methods 991 | # from Kubernetes Executor provided as a single line formatted JSON dictionary string. 992 | # List of supported params are similar for all core_v1_apis, hence a single config 993 | # variable for all apis. See: 994 | # https://raw.githubusercontent.com/kubernetes-client/python/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/api/core_v1_api.py 995 | kube_client_request_args = 996 | 997 | # Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client 998 | # ``core_v1_api`` method when using the Kubernetes Executor. 999 | # This should be an object and can contain any of the options listed in the ``v1DeleteOptions`` 1000 | # class defined here: 1001 | # https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19 1002 | # Example: delete_option_kwargs = {{"grace_period_seconds": 10}} 1003 | delete_option_kwargs = 1004 | 1005 | # Enables TCP keepalive mechanism. This prevents Kubernetes API requests to hang indefinitely 1006 | # when idle connection is time-outed on services like cloud load balancers or firewalls. 1007 | enable_tcp_keepalive = False 1008 | 1009 | # When the `enable_tcp_keepalive` option is enabled, TCP probes a connection that has 1010 | # been idle for `tcp_keep_idle` seconds. 1011 | tcp_keep_idle = 120 1012 | 1013 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond 1014 | # to a keepalive probe, TCP retransmits the probe after `tcp_keep_intvl` seconds. 1015 | tcp_keep_intvl = 30 1016 | 1017 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond 1018 | # to a keepalive probe, TCP retransmits the probe `tcp_keep_cnt number` of times before 1019 | # a connection is considered to be broken. 1020 | tcp_keep_cnt = 6 1021 | 1022 | [smart_sensor] 1023 | # When `use_smart_sensor` is True, Airflow redirects multiple qualified sensor tasks to 1024 | # smart sensor task. 1025 | use_smart_sensor = False 1026 | 1027 | # `shard_code_upper_limit` is the upper limit of `shard_code` value. The `shard_code` is generated 1028 | # by `hashcode % shard_code_upper_limit`. 1029 | shard_code_upper_limit = 10000 1030 | 1031 | # The number of running smart sensor processes for each service. 1032 | shards = 5 1033 | 1034 | # comma separated sensor classes support in smart_sensor. 1035 | sensors_enabled = NamedHivePartitionSensor 1036 | --------------------------------------------------------------------------------
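The ``fernet_key = $FERNET_KEY`` line in the [core] section above expects the key to be supplied from the environment before the containers start. A minimal sketch for generating such a key, assuming the ``cryptography`` package that ships with Airflow is available; the script name is illustrative and not part of this repository:

# generate_fernet_key.py -- illustrative helper, not part of this repository
from cryptography.fernet import Fernet

if __name__ == "__main__":
    # Fernet.generate_key() returns a URL-safe, base64-encoded 32-byte key.
    print(Fernet.generate_key().decode())

The printed value can then be exported (for example ``export FERNET_KEY=<generated value>``) so the ``$FERNET_KEY`` placeholder in airflow.cfg resolves when Airflow starts.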
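The [metrics] section notes that ``stat_name_handler`` must point at a function with the signature ``def func_name(stat_name: str) -> str:``. A minimal sketch of such a handler is below; the module and function names are hypothetical, and Airflow only requires that the dotted path (e.g. ``stat_name_handler = metrics_utils.normalize_stat_name``) be importable:

# metrics_utils.py -- hypothetical module; referenced as
#   stat_name_handler = metrics_utils.normalize_stat_name
import re


def normalize_stat_name(stat_name: str) -> str:
    """Lowercase the stat name and replace characters some StatsD backends reject."""
    return re.sub(r"[^a-z0-9_.]", "_", stat_name.lower())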
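With ``statsd_on = True`` in [metrics], Airflow emits metrics over UDP in the StatsD line protocol to ``statsd_host``/``statsd_port``. A small sketch for pushing a hand-crafted test counter, which can help confirm that the statsd exporter and Prometheus scrape are wired up; the default host and port mirror the settings above, and swapping in the exporter's Docker service name is an assumption about your network setup:

# send_test_metric.py -- illustrative check, not part of this repository
import socket


def send_test_metric(host: str = "localhost", port: int = 8125) -> None:
    # "<name>:<value>|c" is the StatsD line protocol for a counter increment.
    payload = b"airflow.editor_test_counter:1|c"
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
        sock.sendto(payload, (host, port))


if __name__ == "__main__":
    send_test_metric()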