├── img
│   ├── pipeline.png
│   ├── treeview.png
│   └── pipelinechart.png
├── dags
│   ├── utils
│   │   ├── ml_pipeline_config.py
│   │   ├── load_data.py
│   │   ├── save_batch_data.py
│   │   ├── track_experiments_info.py
│   │   ├── preprocess_data.py
│   │   ├── files_util.py
│   │   ├── fit_best_model.py
│   │   └── experiment.py
│   ├── sql
│   │   ├── create_experiments.sql
│   │   └── create_batch_data_table.sql
│   └── ml_pipeline.py
├── README.md
└── docker-compose.yaml

/img/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NicoloAlbanese/airflow-ml-pipeline-mvp/HEAD/img/pipeline.png
--------------------------------------------------------------------------------
/img/treeview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NicoloAlbanese/airflow-ml-pipeline-mvp/HEAD/img/treeview.png
--------------------------------------------------------------------------------
/img/pipelinechart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NicoloAlbanese/airflow-ml-pipeline-mvp/HEAD/img/pipelinechart.png
--------------------------------------------------------------------------------
/dags/utils/ml_pipeline_config.py:
--------------------------------------------------------------------------------
params = {
    "db_engine": "postgresql+psycopg2://airflow:airflow@postgres/airflow",
    "db_schema": "public",
    "db_experiments_table": "experiments",
    "db_batch_table": "batch_data",
    "test_split_ratio": 0.3,
    "cv_folds": 3,
    "max_pca_components": 30,
    "logreg_maxiter": 1000
}
--------------------------------------------------------------------------------
/dags/sql/create_experiments.sql:
--------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS experiments (
    experiment_id SERIAL PRIMARY KEY,
    experiment_datetime VARCHAR NOT NULL,
    cv_folds NUMERIC NOT NULL,
    logreg_maxiter NUMERIC NOT NULL,
    max_pca_components NUMERIC NOT NULL,
    best_logreg_c NUMERIC NOT NULL,
    best_pca_components NUMERIC NOT NULL,
    test_set_accuracy NUMERIC NOT NULL
);
--------------------------------------------------------------------------------
/dags/utils/load_data.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from utils.files_util import save_files


def load_data():

    # load the breast cancer dataset and append the target as a 'label' column
    breast_ds = load_breast_cancer()
    labels = np.reshape(breast_ds.target, (-1, 1))
    breast_data = np.concatenate([breast_ds.data, labels], axis=1)
    df = pd.DataFrame(breast_data)
    df.columns = np.append(breast_ds.feature_names, 'label')
    df.name = 'df'
    save_files([df])
--------------------------------------------------------------------------------
/dags/utils/save_batch_data.py:
--------------------------------------------------------------------------------
import pandas as pd
from sqlalchemy import create_engine

from utils.files_util import load_files
import utils.ml_pipeline_config as config

db_engine = config.params["db_engine"]
db_schema = config.params["db_schema"]
table_batch = config.params["db_batch_table"]


def save_batch_data():
    df = load_files(['df'])[0]
    engine = create_engine(db_engine)
    df.to_sql(table_batch, engine, schema=db_schema, if_exists='replace', index=False)
--------------------------------------------------------------------------------
/dags/utils/track_experiments_info.py:
--------------------------------------------------------------------------------
import pandas as pd
from sqlalchemy import create_engine

from utils.files_util import load_files
import utils.ml_pipeline_config as config

db_engine = config.params["db_engine"]
db_schema = config.params["db_schema"]
table_name = config.params["db_experiments_table"]


def track_experiments_info():
    df = load_files(['exp_info'])[0]
    engine = create_engine(db_engine)
    df.to_sql(table_name, engine, schema=db_schema, if_exists='append', index=False)
--------------------------------------------------------------------------------
/dags/utils/preprocess_data.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn import model_selection
from utils.files_util import save_files, load_files

import utils.ml_pipeline_config as config

test_size = config.params["test_split_ratio"]


def preprocess_data():

    df = load_files(['df'])[0]
    x_train, x_test, y_train, y_test = model_selection.train_test_split(df.iloc[:, :-1],
                                                                        df['label'],
                                                                        test_size=test_size)
    x_train.name = 'x_train'
    x_test.name = 'x_test'
    y_train.name = 'y_train'
    y_test.name = 'y_test'
    save_files([x_train, x_test, y_train, y_test])
--------------------------------------------------------------------------------
/dags/utils/files_util.py:
--------------------------------------------------------------------------------
import pandas as pd
import os.path


def save_files(df_list):
    '''
    accepts a list of dataframes as input
    saves each dataframe as csv in the shared data folder (/opt/airflow/data)
    the file name corresponds to the dataframe "name" attribute
    '''
    for df in df_list:
        df.to_csv('/opt/airflow/data/' + df.name + '.csv', sep=',', index=False)


def load_files(names_list):
    '''
    accepts a list of names (str) as input
    loads the csv files with the given names from the shared data folder
    returns a list of the loaded dataframes (missing files are skipped)
    '''
    df_list = []
    for name in names_list:
        if os.path.isfile('/opt/airflow/data/' + name + '.csv'):
            df_list.append(pd.read_csv('/opt/airflow/data/' + name + '.csv'))

    return df_list
--------------------------------------------------------------------------------
/dags/utils/fit_best_model.py:
--------------------------------------------------------------------------------
import pandas as pd
from datetime import datetime
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

from utils.files_util import load_files


def fit_best_model():

    # refit the pipeline on the full dataset with the best hyper-parameters
    df, best_params = load_files(['df', 'exp_info'])
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('pca', PCA(n_components=best_params['best_pca_components'].values[0])),
                     ('log_reg', LogisticRegression(C=best_params['best_logreg_c'].values[0]))
                     ])
    pipe.fit(df.iloc[:, :-1], df['label'])

    # save the best model with a timestamped file name
    now = datetime.now().strftime('%d-%m-%Y_%H:%M:%S')
    filename = 'model_' + now + '.pkl'
    joblib.dump(pipe, '/opt/airflow/models/' + filename, compress=1)
--------------------------------------------------------------------------------
/dags/sql/create_batch_data_table.sql:
--------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS batch_data (
    patient_id SERIAL PRIMARY KEY,
    mean_radius NUMERIC NOT NULL,
    mean_texture NUMERIC NOT NULL,
    mean_perimeter NUMERIC NOT NULL,
    mean_area NUMERIC NOT NULL,
    mean_smoothness NUMERIC NOT NULL,
    mean_compactness NUMERIC NOT NULL,
    mean_concavity NUMERIC NOT NULL,
    mean_concave_points NUMERIC NOT NULL,
    mean_symmetry NUMERIC NOT NULL,
    mean_fractal_dimension NUMERIC NOT NULL,
    radius_error NUMERIC NOT NULL,
    texture_error NUMERIC NOT NULL,
    perimeter_error NUMERIC NOT NULL,
    area_error NUMERIC NOT NULL,
    smoothness_error NUMERIC NOT NULL,
    compactness_error NUMERIC NOT NULL,
    concavity_error NUMERIC NOT NULL,
    concave_points_error NUMERIC NOT NULL,
    symmetry_error NUMERIC NOT NULL,
    fractal_dimension_error NUMERIC NOT NULL,
    worst_radius NUMERIC NOT NULL,
    worst_texture NUMERIC NOT NULL,
    worst_perimeter NUMERIC NOT NULL,
    worst_area NUMERIC NOT NULL,
    worst_smoothness NUMERIC NOT NULL,
    worst_compactness NUMERIC NOT NULL,
    worst_concavity NUMERIC NOT NULL,
    worst_concave_points NUMERIC NOT NULL,
    worst_symmetry NUMERIC NOT NULL,
    worst_fractal_dimension NUMERIC NOT NULL,
    label NUMERIC NOT NULL
);
--------------------------------------------------------------------------------
/dags/ml_pipeline.py:
--------------------------------------------------------------------------------
from airflow.models import DAG

from airflow.operators.python import PythonOperator
from airflow.providers.postgres.operators.postgres import PostgresOperator
from airflow.utils.task_group import TaskGroup

from datetime import datetime

from utils.load_data import load_data
from utils.preprocess_data import preprocess_data
from utils.experiment import experiment
from utils.track_experiments_info import track_experiments_info
from utils.fit_best_model import fit_best_model
from utils.save_batch_data import save_batch_data


default_args = {
    'owner': 'Nicolò C. Albanese',
    'email_on_failure': False,
    'email': ['nicolo_albanese@outlook.it'],
    'start_date': datetime(2021, 12, 1)
}

with DAG(
    "ml_pipeline",
    description='End-to-end ML pipeline example',
    schedule_interval='@daily',
    default_args=default_args,
    catchup=False) as dag:

    # task: 1
    with TaskGroup('creating_storage_structures') as creating_storage_structures:

        # task: 1.1
        creating_experiment_tracking_table = PostgresOperator(
            task_id="creating_experiment_tracking_table",
            postgres_conn_id='postgres_default',
            sql='sql/create_experiments.sql'
        )

        # task: 1.2
        creating_batch_data_table = PostgresOperator(
            task_id="creating_batch_data_table",
            postgres_conn_id='postgres_default',
            sql='sql/create_batch_data_table.sql'
        )

    # task: 2
    fetching_data = PythonOperator(
        task_id='fetching_data',
        python_callable=load_data
    )

    # task: 3
    with TaskGroup('preparing_data') as preparing_data:

        # task: 3.1
        preprocessing = PythonOperator(
            task_id='preprocessing',
            python_callable=preprocess_data
        )

        # task: 3.2
        saving_batch_data = PythonOperator(
            task_id='saving_batch_data',
            python_callable=save_batch_data
        )

    # task: 4
    hyperparam_tuning = PythonOperator(
        task_id='hyperparam_tuning',
        python_callable=experiment
    )

    # task: 5
    with TaskGroup('after_crossvalidation') as after_crossvalidation:

        # task: 5.1
        saving_results = PythonOperator(
            task_id='saving_results',
            python_callable=track_experiments_info
        )

        # task: 5.2
        fitting_best_model = PythonOperator(
            task_id='fitting_best_model',
            python_callable=fit_best_model
        )

    creating_storage_structures >> fetching_data >> preparing_data >> hyperparam_tuning >> after_crossvalidation
--------------------------------------------------------------------------------
/dags/utils/experiment.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from utils.files_util import save_files, load_files
import utils.ml_pipeline_config as config


def experiment():

    x_train, x_test, y_train, y_test = load_files(['x_train', 'x_test', 'y_train', 'y_test'])

    # the maximum number of principal components to investigate cannot be higher
    # than the number of columns in the dataset
    max_pca_components = config.params["max_pca_components"] if config.params["max_pca_components"] <= x_train.shape[1] else x_train.shape[1]
    cv_folds = config.params["cv_folds"]
    logreg_maxiter = config.params["logreg_maxiter"]

    # pipeline definition
    std_scaler = StandardScaler()
    pca = PCA(max_pca_components - 1)
    log_reg = LogisticRegression(max_iter=logreg_maxiter)

    pipe = Pipeline(steps=[('std_scaler', std_scaler),
                           ('pca', pca),
                           ('log_reg', log_reg)])

    # parameters for hyper-parameter tuning
    params = {
        'pca__n_components': list(range(1, max_pca_components)),
        'log_reg__C': np.logspace(0.05, 0.1, 1)
    }

    # cross-validated training through grid search
    grid_search = GridSearchCV(pipe, params, cv=cv_folds)
    grid_search.fit(x_train, y_train)

    # selection of the best parameters
    best_c = round(grid_search.best_params_.get("log_reg__C"), 2)
    best_princ_comp = grid_search.best_params_.get("pca__n_components")

    # performance on the test set
    y_test_predicted = grid_search.best_estimator_.predict(x_test)
    test_set_accuracy = round(accuracy_score(y_test, y_test_predicted), 3)

    # save experiment information for historical persistence
    now = datetime.now().strftime("%d-%m-%Y_%H:%M:%S")

    exp_info = pd.DataFrame([[now,
                              cv_folds,
                              logreg_maxiter,
                              max_pca_components,
                              best_c,
                              best_princ_comp,
                              test_set_accuracy]],
                            columns=['experiment_datetime',
                                     'cv_folds',
                                     'logreg_maxiter',
                                     'max_pca_components',
                                     'best_logreg_c',
                                     'best_pca_components',
                                     'test_set_accuracy'
                                     ])
    exp_info.name = 'exp_info'

    save_files([exp_info])
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# airflow-ml-pipeline-mvp

Machine Learning pipeline MVP on Docker and Apache Airflow

__Author__: Nicolò C. Albanese (nicolo_albanese@outlook.it)

![Pipeline](https://github.com/NicoloAlbanese/airflow-ml-pipeline-mvp/blob/main/img/pipeline.png)

## 1. Prerequisites

- Docker Compose

## 2. Setup

### 2.1 Configuration file

Starting from [Airflow's official Docker Compose yaml file](https://airflow.apache.org/docs/apache-airflow/stable/docker-compose.yaml), the following changes are applied:

1. Set AIRFLOW__CORE__EXECUTOR to LocalExecutor to run the pipeline locally.
2. Remove the definitions of the Redis, Flower and Worker services and their dependencies, as they are not needed for a local execution.
3. Set AIRFLOW__CORE__LOAD_EXAMPLES to false, as we do not want to load the native examples when accessing the web UI.
4. Populate \_PIP_ADDITIONAL_REQUIREMENTS with ${\_PIP_ADDITIONAL_REQUIREMENTS:-scikit-learn}, as we make use of the scikit-learn library for this example.
5. Create two more Docker volumes, respectively:
   5.1 ./data:/opt/airflow/data, in order to store the data.
   5.2 ./models:/opt/airflow/models, in order to store the model objects.

### 2.2 Execution

From the command line:

```
docker-compose -f docker-compose.yaml up -d
```

The Airflow UI is accessible at _localhost:8080_ via web browser.

Saved models can be found either in the _models_ subfolder within the project or by:

```
docker container exec -it airflow-dev_airflow-scheduler_1 bash

cd /opt/airflow/models

ls -l
```

The experiment tracking table can be checked from a Python interpreter inside the scheduler container:

```
docker container exec -it airflow-dev_airflow-scheduler_1 bash

python

import pandas as pd

from sqlalchemy import create_engine

engine = create_engine('postgresql+psycopg2://airflow:airflow@postgres/airflow')

pd.read_sql('SELECT * FROM experiments', engine)
```
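
Similarly, a saved model can be loaded and used to score the persisted batch data. The snippet below is only an illustrative sketch and not part of the pipeline itself: it assumes it is run from a Python interpreter inside the scheduler container, that at least one `model_*.pkl` file has already been written to _/opt/airflow/models_, and that the `batch_data` table has been populated by the `saving_batch_data` task.

```
import glob
import os

import joblib
import pandas as pd
from sqlalchemy import create_engine

# same connection string used by the DAG utilities
engine = create_engine('postgresql+psycopg2://airflow:airflow@postgres/airflow')

# pick the most recently written model file
latest_model_path = max(glob.glob('/opt/airflow/models/model_*.pkl'), key=os.path.getmtime)
pipe = joblib.load(latest_model_path)

# score the stored batch data, dropping the persisted label column
batch_df = pd.read_sql('SELECT * FROM batch_data', engine)
predictions = pipe.predict(batch_df.drop(columns=['label']))
print(predictions[:10])
```

Since `save_batch_data` writes the dataframe to the table as-is, the columns returned by `read_sql` should match the features the model was fitted on.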

## 3. DAG

Graph view:

![Graph](https://github.com/NicoloAlbanese/airflow-ml-pipeline-mvp/blob/main/img/pipelinechart.png)

Tree view:

![Tree](https://github.com/NicoloAlbanese/airflow-ml-pipeline-mvp/blob/main/img/treeview.png)

## 4. Caveats

1. Airflow is an orchestrator. Ideally, it should not perform the tasks itself, but simply wrap them in a logical structure that allows scheduling, monitoring and scaling.
2. We made use of the Local Executor to achieve a working local environment for testing purposes. Nevertheless, in order to enable scaling and pushing tasks to worker nodes, other types of executors should be used instead, such as the Celery Executor or the Kubernetes Executor.
3. We stored data in the PostgreSQL instance natively available as Airflow's metastore. This allowed us to create a working example without adding further services. Nevertheless, separation of duties and life cycle decoupling would require storing the pipeline's data externally to the orchestrator's components.
4. We installed the needed dependencies by leveraging the \_PIP_ADDITIONAL_REQUIREMENTS configuration property. Although convenient for testing purposes, this is not recommended for production systems. [Custom images](https://airflow.apache.org/docs/docker-stack/build.html) should be built instead.
5. In a real-world scenario involving large datasets, Python and Pandas (as well as csv files) would not be the most favourable approach to data manipulation; a distributed engine such as Spark would be preferable.


## 5. References

https://nicolo-albanese.medium.com/end-to-end-machine-learning-pipeline-with-docker-and-apache-airflow-from-scratch-35f6a75f57ad
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
---
version: '3'
x-airflow-common:
  &airflow-common
  # In order to add custom dependencies or upgrade provider packages you can use your extended image.
  # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
  # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
  image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.2.2}
  # build: .
  environment:
    &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: LocalExecutor
    AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
    AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth'
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-scikit-learn}
  volumes:
    - ./dags:/opt/airflow/dags
    - ./logs:/opt/airflow/logs
    - ./plugins:/opt/airflow/plugins
    - ./data:/opt/airflow/data
    - ./models:/opt/airflow/models
  user: "${AIRFLOW_UID:-50000}:0"
  depends_on:
    &airflow-common-depends-on
    postgres:
      condition: service_healthy

services:
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 5s
      retries: 5
    restart: always

  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - 8080:8080
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-init:
    <<: *airflow-common
    entrypoint: /bin/bash
    # yamllint disable rule:line-length
    command:
      - -c
      - |
        function ver() {
          printf "%04d%04d%04d%04d" $${1//./ }
        }
        airflow_version=$$(gosu airflow airflow version)
        airflow_version_comparable=$$(ver $${airflow_version})
        min_airflow_version=2.2.0
        min_airflow_version_comparable=$$(ver $${min_airflow_version})
        if (( airflow_version_comparable < min_airflow_version_comparable )); then
          echo
          echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m"
          echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!"
          echo
          exit 1
        fi
        if [[ -z "${AIRFLOW_UID}" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
          echo "If you are on Linux, you SHOULD follow the instructions below to set "
          echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
          echo "For other operating systems you can get rid of the warning with manually created .env file:"
          echo "  See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user"
          echo
        fi
        one_meg=1048576
        mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
        cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
        disk_available=$$(df / | tail -1 | awk '{print $$4}')
        warning_resources="false"
        if (( mem_available < 4000 )) ; then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
          echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
          echo
          warning_resources="true"
        fi
        if (( cpus_available < 2 )); then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
          echo "At least 2 CPUs recommended. You have $${cpus_available}"
          echo
          warning_resources="true"
        fi
        if (( disk_available < one_meg * 10 )); then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
          echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
          echo
          warning_resources="true"
        fi
        if [[ $${warning_resources} == "true" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
          echo "Please follow the instructions to increase amount of resources available:"
          echo "  https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin"
          echo
        fi
        mkdir -p /sources/logs /sources/dags /sources/plugins
        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
        exec /entrypoint airflow version
    # yamllint enable rule:line-length
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_UPGRADE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
    user: "0:0"
    volumes:
      - .:/sources

  airflow-cli:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
      CONNECTION_CHECK_MAX_COUNT: "0"
    # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
    command:
      - bash
      - -c
      - airflow

volumes:
  postgres-db-volume:
--------------------------------------------------------------------------------