├── img
│   ├── pipeline.png
│   ├── treeview.png
│   └── pipelinechart.png
├── dags
│   ├── utils
│   │   ├── ml_pipeline_config.py
│   │   ├── load_data.py
│   │   ├── save_batch_data.py
│   │   ├── track_experiments_info.py
│   │   ├── preprocess_data.py
│   │   ├── files_util.py
│   │   ├── fit_best_model.py
│   │   └── experiment.py
│   ├── sql
│   │   ├── create_experiments.sql
│   │   └── create_batch_data_table.sql
│   └── ml_pipeline.py
├── README.md
└── docker-compose.yaml

/img/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NicoloAlbanese/airflow-ml-pipeline-mvp/HEAD/img/pipeline.png
--------------------------------------------------------------------------------
/img/treeview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NicoloAlbanese/airflow-ml-pipeline-mvp/HEAD/img/treeview.png
--------------------------------------------------------------------------------
/img/pipelinechart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NicoloAlbanese/airflow-ml-pipeline-mvp/HEAD/img/pipelinechart.png
--------------------------------------------------------------------------------
/dags/utils/ml_pipeline_config.py:
--------------------------------------------------------------------------------
params = {
    "db_engine": "postgresql+psycopg2://airflow:airflow@postgres/airflow",
    "db_schema": "public",
    "db_experiments_table": "experiments",
    "db_batch_table": "batch_data",
    "test_split_ratio": 0.3,
    "cv_folds": 3,
    "max_pca_components": 30,
    "logreg_maxiter": 1000
}
--------------------------------------------------------------------------------
/dags/sql/create_experiments.sql:
--------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS experiments (
    experiment_id SERIAL PRIMARY KEY,
    experiment_datetime VARCHAR NOT NULL,
    cv_folds NUMERIC NOT NULL,
    logreg_maxiter NUMERIC NOT NULL,
    max_pca_components NUMERIC NOT NULL,
    best_logreg_c NUMERIC NOT NULL,
    best_pca_components NUMERIC NOT NULL,
    test_set_accuracy NUMERIC NOT NULL
);
--------------------------------------------------------------------------------
/dags/utils/load_data.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from utils.files_util import save_files


def load_data():

    # load the breast cancer dataset and append the target as a 'label' column
    breast_ds = load_breast_cancer()
    labels = np.reshape(breast_ds.target, (-1, 1))
    breast_data = np.concatenate([breast_ds.data, labels], axis=1)
    df = pd.DataFrame(breast_data)
    df.columns = np.append(breast_ds.feature_names, 'label')
    df.name = 'df'
    save_files([df])
--------------------------------------------------------------------------------
/dags/utils/save_batch_data.py:
--------------------------------------------------------------------------------
import pandas as pd
from sqlalchemy import create_engine

from utils.files_util import load_files
import utils.ml_pipeline_config as config

db_engine = config.params["db_engine"]
db_schema = config.params["db_schema"]
table_batch = config.params["db_batch_table"]


def save_batch_data():
    df = load_files(['df'])[0]
    engine = create_engine(db_engine)
    df.to_sql(table_batch, engine, schema=db_schema, if_exists='replace', index=False)
--------------------------------------------------------------------------------
/dags/utils/track_experiments_info.py:
--------------------------------------------------------------------------------
import pandas as pd
from sqlalchemy import create_engine

from utils.files_util import load_files
import utils.ml_pipeline_config as config

db_engine = config.params["db_engine"]
db_schema = config.params["db_schema"]
table_name = config.params["db_experiments_table"]


def track_experiments_info():
    df = load_files(['exp_info'])[0]
    engine = create_engine(db_engine)
    df.to_sql(table_name, engine, schema=db_schema, if_exists='append', index=False)
--------------------------------------------------------------------------------
/dags/utils/preprocess_data.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn import model_selection
from utils.files_util import save_files, load_files

import utils.ml_pipeline_config as config

test_size = config.params["test_split_ratio"]


def preprocess_data():

    df = load_files(['df'])[0]
    x_train, x_test, y_train, y_test = model_selection.train_test_split(df.iloc[:, :-1],
                                                                        df['label'],
                                                                        test_size=test_size)
    x_train.name = 'x_train'
    x_test.name = 'x_test'
    y_train.name = 'y_train'
    y_test.name = 'y_test'
    save_files([x_train, x_test, y_train, y_test])
--------------------------------------------------------------------------------
/dags/utils/files_util.py:
--------------------------------------------------------------------------------
import pandas as pd
import os.path


def save_files(df_list):
    '''
    accepts a list of dataframes as input
    saves each dataframe as csv in the shared data folder (/opt/airflow/data)
    the file name corresponds to the dataframe "name" attribute
    '''
    for df in df_list:
        df.to_csv('/opt/airflow/data/' + df.name + '.csv', sep=',', index=False)


def load_files(names_list):
    '''
    accepts a list of names (str) as input
    loads the csv files with the given names from the shared data folder
    returns a list of the loaded dataframes (missing files are skipped)
    '''
    df_list = []
    for name in names_list:
        if os.path.isfile('/opt/airflow/data/' + name + '.csv'):
            df_list.append(pd.read_csv('/opt/airflow/data/' + name + '.csv'))

    return df_list
--------------------------------------------------------------------------------
/dags/utils/fit_best_model.py:
--------------------------------------------------------------------------------
import pandas as pd
from datetime import datetime
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

from utils.files_util import load_files


def fit_best_model():

    # refit the pipeline on the full dataset with the best hyper-parameters
    df, best_params = load_files(['df', 'exp_info'])
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('pca', PCA(n_components=best_params['best_pca_components'].values[0])),
                     ('log_reg', LogisticRegression(C=best_params['best_logreg_c'].values[0]))
                     ])
    pipe.fit(df.iloc[:, :-1], df['label'])

    # save the best model with a timestamped file name
    now = datetime.now().strftime('%d-%m-%Y_%H:%M:%S')
    filename = 'model_' + now + '.pkl'
    joblib.dump(pipe, '/opt/airflow/models/' + filename, compress=1)
--------------------------------------------------------------------------------
/dags/sql/create_batch_data_table.sql:
--------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS batch_data (
    patient_id SERIAL PRIMARY KEY,
    mean_radius NUMERIC NOT NULL,
    mean_texture NUMERIC NOT NULL,
    mean_perimeter NUMERIC NOT NULL,
    mean_area NUMERIC NOT NULL,
    mean_smoothness NUMERIC NOT NULL,
    mean_compactness NUMERIC NOT NULL,
    mean_concavity NUMERIC NOT NULL,
    mean_concave_points NUMERIC NOT NULL,
    mean_symmetry NUMERIC NOT NULL,
    mean_fractal_dimension NUMERIC NOT NULL,
    radius_error NUMERIC NOT NULL,
    texture_error NUMERIC NOT NULL,
    perimeter_error NUMERIC NOT NULL,
    area_error NUMERIC NOT NULL,
    smoothness_error NUMERIC NOT NULL,
    compactness_error NUMERIC NOT NULL,
    concavity_error NUMERIC NOT NULL,
    concave_points_error NUMERIC NOT NULL,
    symmetry_error NUMERIC NOT NULL,
    fractal_dimension_error NUMERIC NOT NULL,
    worst_radius NUMERIC NOT NULL,
    worst_texture NUMERIC NOT NULL,
    worst_perimeter NUMERIC NOT NULL,
    worst_area NUMERIC NOT NULL,
    worst_smoothness NUMERIC NOT NULL,
    worst_compactness NUMERIC NOT NULL,
    worst_concavity NUMERIC NOT NULL,
    worst_concave_points NUMERIC NOT NULL,
    worst_symmetry NUMERIC NOT NULL,
    worst_fractal_dimension NUMERIC NOT NULL,
    label NUMERIC NOT NULL
);
--------------------------------------------------------------------------------
/dags/ml_pipeline.py:
--------------------------------------------------------------------------------
from airflow.models import DAG

from airflow.operators.python import PythonOperator
from airflow.providers.postgres.operators.postgres import PostgresOperator
from airflow.utils.task_group import TaskGroup

from datetime import datetime

from utils.load_data import load_data
from utils.preprocess_data import preprocess_data
from utils.experiment import experiment
from utils.track_experiments_info import track_experiments_info
from utils.fit_best_model import fit_best_model
from utils.save_batch_data import save_batch_data


default_args = {
    'owner': 'Nicolò C. Albanese',
    'email_on_failure': False,
    'email': ['nicolo_albanese@outlook.it'],
    'start_date': datetime(2021, 12, 1)
}

with DAG(
    "ml_pipeline",
    description='End-to-end ML pipeline example',
    schedule_interval='@daily',
    default_args=default_args,
    catchup=False) as dag:

    # task: 1
    with TaskGroup('creating_storage_structures') as creating_storage_structures:

        # task: 1.1
        creating_experiment_tracking_table = PostgresOperator(
            task_id="creating_experiment_tracking_table",
            postgres_conn_id='postgres_default',
            sql='sql/create_experiments.sql'
        )

        # task: 1.2
        creating_batch_data_table = PostgresOperator(
            task_id="creating_batch_data_table",
            postgres_conn_id='postgres_default',
            sql='sql/create_batch_data_table.sql'
        )

    # task: 2
    fetching_data = PythonOperator(
        task_id='fetching_data',
        python_callable=load_data
    )

    # task: 3
    with TaskGroup('preparing_data') as preparing_data:

        # task: 3.1
        preprocessing = PythonOperator(
            task_id='preprocessing',
            python_callable=preprocess_data
        )

        # task: 3.2
        saving_batch_data = PythonOperator(
            task_id='saving_batch_data',
            python_callable=save_batch_data
        )

    # task: 4
    hyperparam_tuning = PythonOperator(
        task_id='hyperparam_tuning',
        python_callable=experiment
    )

    # task: 5
    with TaskGroup('after_crossvalidation') as after_crossvalidation:

        # task: 5.1
        saving_results = PythonOperator(
            task_id='saving_results',
            python_callable=track_experiments_info
        )

        # task: 5.2
        fitting_best_model = PythonOperator(
            task_id='fitting_best_model',
            python_callable=fit_best_model
        )

    creating_storage_structures >> fetching_data >> preparing_data >> hyperparam_tuning >> after_crossvalidation
--------------------------------------------------------------------------------
/dags/utils/experiment.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from utils.files_util import save_files, load_files
import utils.ml_pipeline_config as config


def experiment():

    x_train, x_test, y_train, y_test = load_files(['x_train', 'x_test', 'y_train', 'y_test'])

    # the maximum number of principal components to investigate cannot be higher
    # than the number of columns in the dataset
    max_pca_components = config.params["max_pca_components"] if config.params["max_pca_components"] <= x_train.shape[1] else x_train.shape[1]
    cv_folds = config.params["cv_folds"]
    logreg_maxiter = config.params["logreg_maxiter"]

    # pipeline definition
    std_scaler = StandardScaler()
    pca = PCA(max_pca_components - 1)
    log_reg = LogisticRegression(max_iter=logreg_maxiter)

    pipe = Pipeline(steps=[('std_scaler', std_scaler),
                           ('pca', pca),
                           ('log_reg', log_reg)])

    # parameters for hyper-parameter tuning
    params = {
        'pca__n_components': list(range(1, max_pca_components)),
        'log_reg__C': np.logspace(0.05, 0.1, 1)
    }

    # cross-validated training through grid search
    grid_search = GridSearchCV(pipe, params, cv=cv_folds)
    grid_search.fit(x_train, y_train)

    # selection of the best parameters
    best_c = round(grid_search.best_params_.get("log_reg__C"), 2)
    best_princ_comp = grid_search.best_params_.get("pca__n_components")

    # performance on the test set
    y_test_predicted = grid_search.best_estimator_.predict(x_test)
    test_set_accuracy = round(accuracy_score(y_test, y_test_predicted), 3)

    # save experiment information for historical persistence
    now = datetime.now().strftime("%d-%m-%Y_%H:%M:%S")

    exp_info = pd.DataFrame([[now,
                              cv_folds,
                              logreg_maxiter,
                              max_pca_components,
                              best_c,
                              best_princ_comp,
                              test_set_accuracy]],
                            columns=['experiment_datetime',
                                     'cv_folds',
                                     'logreg_maxiter',
                                     'max_pca_components',
                                     'best_logreg_c',
                                     'best_pca_components',
                                     'test_set_accuracy'
                                     ])
    exp_info.name = 'exp_info'

    save_files([exp_info])
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# airflow-ml-pipeline-mvp

Machine Learning pipeline MVP on Docker and Apache Airflow

__Author__: Nicolò C. Albanese (nicolo_albanese@outlook.it)

![Pipeline](https://github.com/NicoloAlbanese/airflow-ml-pipeline-mvp/blob/main/img/pipeline.png)

## 1. Prerequisites

- Docker Compose

## 2. Setup

### 2.1 Configuration file

Starting from [Airflow's official Docker Compose yaml file](https://airflow.apache.org/docs/apache-airflow/stable/docker-compose.yaml), the following changes are applied:

1. Set AIRFLOW__CORE__EXECUTOR to LocalExecutor to run the pipeline locally.
2. Remove the definitions of the Redis, Flower and Worker services and their dependencies, as they are not needed for a local execution.
3. Set AIRFLOW__CORE__LOAD_EXAMPLES to false, as we do not want to load the native examples when accessing the web UI.
4. Populate \_PIP_ADDITIONAL_REQUIREMENTS with ${\_PIP_ADDITIONAL_REQUIREMENTS:-scikit-learn}, as we make use of the scikit-learn library for this example.
5. Create two more Docker volumes, respectively:
   5.1 ./data:/opt/airflow/data, in order to store the data.
   5.2 ./models:/opt/airflow/models, in order to store the model objects.

### 2.2 Execution

From the command line:

```
docker-compose -f docker-compose.yaml up -d
```

The Airflow UI is accessible at _localhost:8080_ via web browser.

Saved models can be found either in the _models_ subfolder within the project or by:

```
docker container exec -it airflow-dev_airflow-scheduler_1 bash

cd /opt/airflow/models

ls -l
```

The experiment tracking table can be checked from a Python interpreter inside the scheduler container:

```
docker container exec -it airflow-dev_airflow-scheduler_1 bash

python

import pandas as pd

from sqlalchemy import create_engine

engine = create_engine('postgresql+psycopg2://airflow:airflow@postgres/airflow')

pd.read_sql('SELECT * FROM experiments', engine)
```
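
Similarly, a saved model can be loaded and used to score the persisted batch data. The snippet below is only an illustrative sketch and not part of the pipeline itself: it assumes it is run from a Python interpreter inside the scheduler container, that at least one `model_*.pkl` file has already been written to _/opt/airflow/models_, and that the `batch_data` table has been populated by the `saving_batch_data` task.

```
import glob
import os

import joblib
import pandas as pd
from sqlalchemy import create_engine

# same connection string used by the DAG utilities
engine = create_engine('postgresql+psycopg2://airflow:airflow@postgres/airflow')

# pick the most recently written model file
latest_model_path = max(glob.glob('/opt/airflow/models/model_*.pkl'), key=os.path.getmtime)
pipe = joblib.load(latest_model_path)

# score the stored batch data, dropping the persisted label column
batch_df = pd.read_sql('SELECT * FROM batch_data', engine)
predictions = pipe.predict(batch_df.drop(columns=['label']))
print(predictions[:10])
```

Since `save_batch_data` writes the dataframe to the table as-is, the columns returned by `read_sql` should match the features the model was fitted on.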

## 3. DAG

Graph view:

![Graph](https://github.com/NicoloAlbanese/airflow-ml-pipeline-mvp/blob/main/img/pipelinechart.png)

Tree view:

![Tree](https://github.com/NicoloAlbanese/airflow-ml-pipeline-mvp/blob/main/img/treeview.png)

## 4. Caveats

1. Airflow is an orchestrator. Ideally, it should not perform the tasks itself, but simply wrap them in a logical structure that allows scheduling, monitoring and scaling.
2. We made use of the Local Executor to achieve a working local environment for testing purposes. Nevertheless, in order to enable scaling and pushing tasks to worker nodes, other types of executors should be used instead, such as the Celery Executor or the Kubernetes Executor.
3. We stored data in the PostgreSQL instance natively available as Airflow's metastore. This allowed us to create a working example without adding further services. Nevertheless, separation of duties and life cycle decoupling would require storing the pipeline's data externally to the orchestrator's components.
4. We installed the needed dependencies by leveraging the \_PIP_ADDITIONAL_REQUIREMENTS configuration property. Although convenient for testing purposes, this is not recommended for production systems. [Custom images](https://airflow.apache.org/docs/docker-stack/build.html) should be built instead.
5. In a real-world scenario involving large datasets, Python and Pandas (as well as csv files) would not be the most favourable approach to data manipulation; a distributed engine such as Spark would be preferable.


## 5. References

https://nicolo-albanese.medium.com/end-to-end-machine-learning-pipeline-with-docker-and-apache-airflow-from-scratch-35f6a75f57ad
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
---
version: '3'
x-airflow-common:
  &airflow-common
  # In order to add custom dependencies or upgrade provider packages you can use your extended image.
  # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
  # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
  image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.2.2}
  # build: .
  environment:
    &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: LocalExecutor
    AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
    AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth'
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-scikit-learn}
  volumes:
    - ./dags:/opt/airflow/dags
    - ./logs:/opt/airflow/logs
    - ./plugins:/opt/airflow/plugins
    - ./data:/opt/airflow/data
    - ./models:/opt/airflow/models
  user: "${AIRFLOW_UID:-50000}:0"
  depends_on:
    &airflow-common-depends-on
    postgres:
      condition: service_healthy

services:
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 5s
      retries: 5
    restart: always

  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - 8080:8080
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-init:
    <<: *airflow-common
    entrypoint: /bin/bash
    # yamllint disable rule:line-length
    command:
      - -c
      - |
        function ver() {
          printf "%04d%04d%04d%04d" $${1//./ }
        }
        airflow_version=$$(gosu airflow airflow version)
        airflow_version_comparable=$$(ver $${airflow_version})
        min_airflow_version=2.2.0
        min_airflow_version_comparable=$$(ver $${min_airflow_version})
        if (( airflow_version_comparable < min_airflow_version_comparable )); then
          echo
          echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m"
          echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!"
          echo
          exit 1
        fi
        if [[ -z "${AIRFLOW_UID}" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
          echo "If you are on Linux, you SHOULD follow the instructions below to set "
          echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
          echo "For other operating systems you can get rid of the warning with manually created .env file:"
          echo "  See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user"
          echo
        fi
        one_meg=1048576
        mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
        cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
        disk_available=$$(df / | tail -1 | awk '{print $$4}')
        warning_resources="false"
        if (( mem_available < 4000 )) ; then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
          echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
          echo
          warning_resources="true"
        fi
        if (( cpus_available < 2 )); then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
          echo "At least 2 CPUs recommended. You have $${cpus_available}"
          echo
          warning_resources="true"
        fi
        if (( disk_available < one_meg * 10 )); then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
          echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
          echo
          warning_resources="true"
        fi
        if [[ $${warning_resources} == "true" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
          echo "Please follow the instructions to increase amount of resources available:"
          echo "  https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin"
          echo
        fi
        mkdir -p /sources/logs /sources/dags /sources/plugins
        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
        exec /entrypoint airflow version
    # yamllint enable rule:line-length
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_UPGRADE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
    user: "0:0"
    volumes:
      - .:/sources

  airflow-cli:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
      CONNECTION_CHECK_MAX_COUNT: "0"
    # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
    command:
      - bash
      - -c
      - airflow

volumes:
  postgres-db-volume:
--------------------------------------------------------------------------------