├── images
│   └── pipeline_design.png
├── requirements.txt
├── README.md
├── .gitignore
├── dags
│   └── app.py
└── docker-compose.yaml
/images/pipeline_design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunjana2199/amazon_books_data_pipeline/HEAD/images/pipeline_design.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | appnope==0.1.4
2 | asttokens==2.4.1
3 | beautifulsoup4==4.12.3
4 | bs4==0.0.2
5 | certifi==2024.6.2
6 | charset-normalizer==3.3.2
7 | comm==0.2.2
8 | debugpy==1.8.1
9 | decorator==5.1.1
10 | executing==2.0.1
11 | idna==3.7
12 | ipykernel==6.29.4
13 | ipython==8.25.0
14 | jedi==0.19.1
15 | jupyter_client==8.6.2
16 | jupyter_core==5.7.2
17 | matplotlib-inline==0.1.7
18 | nest-asyncio==1.6.0
19 | numpy==2.0.0
20 | packaging==24.1
21 | pandas==2.2.2
22 | parso==0.8.4
23 | pexpect==4.9.0
24 | platformdirs==4.2.2
25 | prompt_toolkit==3.0.47
26 | psutil==6.0.0
27 | ptyprocess==0.7.0
28 | pure-eval==0.2.2
29 | Pygments==2.18.0
30 | python-dateutil==2.9.0.post0
31 | pytz==2024.1
32 | pyzmq==26.0.3
33 | requests==2.32.3
34 | six==1.16.0
35 | soupsieve==2.5
36 | stack-data==0.6.3
37 | tornado==6.4.1
38 | traitlets==5.14.3
39 | tzdata==2024.1
40 | urllib3==2.2.2
41 | wcwidth==0.2.13
42 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Amazon Books Data Pipeline
3 | -----------
4 | An Apache Airflow ETL pipeline that scrapes book listings from Amazon with BeautifulSoup, cleans them with pandas, and loads them into a Postgres table.
5 |
6 | ## Create a virtual environment and activate it (optional)
7 | """
8 |
9 | python -m venv venv
10 | source venv/bin/activate
11 |
12 | """
13 |
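With the virtual environment active, install the Python dependencies pinned in `requirements.txt` (a minimal sketch; adjust to your setup):

```bash
pip install -r requirements.txt
```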
14 |
15 | # 🔗 Important Links and Code
16 | -----
17 |
18 | ## Install Airflow
19 | -----
20 |
21 | Follow the steps in the official guide (summarized below): https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html
22 |
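In short, the Docker Compose quick start (as of Airflow 2.9.x, the image pinned in this repo's `docker-compose.yaml`) comes down to the commands below; treat this as a sketch and defer to the linked guide if it differs:

```bash
# Fetch the official compose file (this repository already ships a copy)
curl -LfO 'https://airflow.apache.org/docs/apache-airflow/2.9.2/docker-compose.yaml'

# Create the mounted folders and set the host user id (Linux)
mkdir -p ./dags ./logs ./plugins ./config
echo -e "AIRFLOW_UID=$(id -u)" > .env

# Initialise the metadata database and create the default airflow/airflow user
docker compose up airflow-init

# Start all services; the webserver listens on http://localhost:8080
docker compose up -d
```
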
23 | ## Install pgAdmin
24 | -----
25 | Add the following `postgres` and `pgadmin` service definitions to `docker-compose.yaml`:
26 |
27 |
28 | """
29 |
30 | postgres:
31 | image: postgres:13
32 | environment:
33 | POSTGRES_USER: airflow
34 | POSTGRES_PASSWORD: airflow
35 | POSTGRES_DB: airflow
36 | volumes:
37 | - postgres-db-volume:/var/lib/postgresql/data
38 | healthcheck:
39 | test: ["CMD", "pg_isready", "-U", "airflow"]
40 | interval: 10s
41 | retries: 5
42 | start_period: 5s
43 | restart: always
44 | ports:
45 | - "5432:5432"
46 |
47 | pgadmin:
48 |
49 | container_name: pgadmin4_container2
50 |
51 | image: dpage/pgadmin4
52 |
53 | restart: always
54 |
55 | environment:
56 |
57 | PGADMIN_DEFAULT_EMAIL: admin@admin.com
58 | PGADMIN_DEFAULT_PASSWORD: root
59 |
60 | ports:
61 | - "5050:80"
62 | """
63 |
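The DAG in `dags/app.py` also expects an Airflow connection named `books_connection` pointing at this Postgres instance. It can be created in the Airflow UI (Admin → Connections) or, roughly, from the CLI inside the webserver container (a sketch; the credentials match the compose file above):

```bash
docker compose exec airflow-webserver airflow connections add 'books_connection' \
    --conn-type 'postgres' \
    --conn-host 'postgres' \
    --conn-schema 'airflow' \
    --conn-login 'airflow' \
    --conn-password 'airflow' \
    --conn-port 5432
```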
64 | -----
65 |
66 | # Pipeline Design
67 |
68 | 
69 |
70 | -----
71 | -----
72 |
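# Run the Pipeline
-----

Once the containers are up and the `books_connection` connection exists, trigger the DAG and inspect the results roughly like this (a sketch; the DAG id and the `books` table come from `dags/app.py`):

```bash
# Unpause and trigger the DAG (or do both from the Airflow UI at http://localhost:8080)
docker compose exec airflow-webserver airflow dags unpause fetch_and_store_amazon_books
docker compose exec airflow-webserver airflow dags trigger fetch_and_store_amazon_books

# Inspect the rows loaded into Postgres
docker compose exec postgres psql -U airflow -d airflow \
    -c 'SELECT title, price, rating FROM books LIMIT 5;'
```

-----
-----
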
73 | # ⬇️ Watch This Video to Implement This Project from Scratch
74 |
75 |
80 |
81 | Link to video: https://www.youtube.com/watch?v=3xyoM28B40Y
82 |
83 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | logs/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 |
--------------------------------------------------------------------------------
/dags/app.py:
--------------------------------------------------------------------------------
1 |
2 | #dag - directed acyclic graph
3 |
4 | #tasks : 1) fetch amazon data (extract) 2) clean data (transform) 3) create and store data in table on postgres (load)
5 | #operators : Python Operator and PostgresOperator
6 | #hooks - allows connection to postgres
7 | #dependencies
8 |
9 | from datetime import datetime, timedelta
10 | from airflow import DAG
11 | import requests
12 | import pandas as pd
13 | from bs4 import BeautifulSoup
14 | from airflow.operators.python import PythonOperator
15 | from airflow.providers.postgres.operators.postgres import PostgresOperator
16 | from airflow.providers.postgres.hooks.postgres import PostgresHook
17 |
18 | #1) fetch amazon data (extract) 2) clean data (transform)
19 |
20 | headers = {
21 | "Referer": 'https://www.amazon.com/',
22 | "Sec-Ch-Ua": "Not_A Brand",
23 | "Sec-Ch-Ua-Mobile": "?0",
24 | "Sec-Ch-Ua-Platform": "macOS",
25 | 'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
26 | }
27 |
28 |
29 | def get_amazon_data_books(num_books, ti):
30 |     # Base URL of the Amazon search results for data engineering books
31 |     base_url = "https://www.amazon.com/s?k=data+engineering+books"
32 |
33 | books = []
34 | seen_titles = set() # To keep track of seen titles
35 |
36 | page = 1
37 |
38 | while len(books) < num_books:
39 | url = f"{base_url}&page={page}"
40 |
41 | # Send a request to the URL
42 | response = requests.get(url, headers=headers)
43 |
44 | # Check if the request was successful
45 | if response.status_code == 200:
46 | # Parse the content of the request with BeautifulSoup
47 | soup = BeautifulSoup(response.content, "html.parser")
48 |
49 | # Find book containers (you may need to adjust the class names based on the actual HTML structure)
50 |                 book_containers = soup.find_all("div", {"class": "s-result-item"})
51 |                 if not book_containers: break  # stop paging if the page has no result containers, to avoid looping forever
52 | # Loop through the book containers and extract data
53 | for book in book_containers:
54 | title = book.find("span", {"class": "a-text-normal"})
55 | author = book.find("a", {"class": "a-size-base"})
56 | price = book.find("span", {"class": "a-price-whole"})
57 | rating = book.find("span", {"class": "a-icon-alt"})
58 |
59 | if title and author and price and rating:
60 | book_title = title.text.strip()
61 |
62 | # Check if title has been seen before
63 | if book_title not in seen_titles:
64 | seen_titles.add(book_title)
65 | books.append({
66 | "Title": book_title,
67 | "Author": author.text.strip(),
68 | "Price": price.text.strip(),
69 | "Rating": rating.text.strip(),
70 | })
71 |
72 | # Increment the page number for the next iteration
73 | page += 1
74 | else:
75 | print("Failed to retrieve the page")
76 | break
77 |
78 | # Limit to the requested number of books
79 | books = books[:num_books]
80 |
81 | # Convert the list of dictionaries into a DataFrame
82 | df = pd.DataFrame(books)
83 |
84 | # Remove duplicates based on 'Title' column
85 | df.drop_duplicates(subset="Title", inplace=True)
86 |
87 | # Push the DataFrame to XCom
88 | ti.xcom_push(key='book_data', value=df.to_dict('records'))
89 |
90 | #3) create and store data in table on postgres (load)
91 |
92 | def insert_book_data_into_postgres(ti):
93 | book_data = ti.xcom_pull(key='book_data', task_ids='fetch_book_data')
94 | if not book_data:
95 | raise ValueError("No book data found")
96 |
97 | postgres_hook = PostgresHook(postgres_conn_id='books_connection')
98 | insert_query = """
99 | INSERT INTO books (title, authors, price, rating)
100 | VALUES (%s, %s, %s, %s)
101 | """
102 | for book in book_data:
103 | postgres_hook.run(insert_query, parameters=(book['Title'], book['Author'], book['Price'], book['Rating']))
104 |
105 |
106 | default_args = {
107 | 'owner': 'airflow',
108 | 'depends_on_past': False,
109 | 'start_date': datetime(2024, 6, 20),
110 | 'retries': 1,
111 | 'retry_delay': timedelta(minutes=5),
112 | }
113 |
114 | dag = DAG(
115 | 'fetch_and_store_amazon_books',
116 | default_args=default_args,
117 | description='A simple DAG to fetch book data from Amazon and store it in Postgres',
118 | schedule_interval=timedelta(days=1),
119 | )
120 |
121 | #operators : Python Operator and PostgresOperator
122 | #hooks - allows connection to postgres
123 |
124 |
125 | fetch_book_data_task = PythonOperator(
126 | task_id='fetch_book_data',
127 | python_callable=get_amazon_data_books,
128 | op_args=[50], # Number of books to fetch
129 | dag=dag,
130 | )
131 |
132 | create_table_task = PostgresOperator(
133 | task_id='create_table',
134 | postgres_conn_id='books_connection',
135 | sql="""
136 | CREATE TABLE IF NOT EXISTS books (
137 | id SERIAL PRIMARY KEY,
138 | title TEXT NOT NULL,
139 | authors TEXT,
140 | price TEXT,
141 | rating TEXT
142 | );
143 | """,
144 | dag=dag,
145 | )
146 |
147 | insert_book_data_task = PythonOperator(
148 | task_id='insert_book_data',
149 | python_callable=insert_book_data_into_postgres,
150 | dag=dag,
151 | )
152 |
153 | #dependencies
154 |
155 | fetch_book_data_task >> create_table_task >> insert_book_data_task
156 |
157 |
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | #
18 |
19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
20 | #
21 | # WARNING: This configuration is for local development. Do not use it in a production deployment.
22 | #
23 | # This configuration supports basic configuration using environment variables or an .env file
24 | # The following variables are supported:
25 | #
26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
27 | # Default: apache/airflow:2.9.2
28 | # AIRFLOW_UID - User ID in Airflow containers
29 | # Default: 50000
30 | # AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed.
31 | # Default: .
32 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
33 | #
34 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
35 | # Default: airflow
36 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
37 | # Default: airflow
38 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
39 | # Use this option ONLY for quick checks. Installing requirements at container
40 | # startup is done EVERY TIME the service is started.
41 | # A better way is to build a custom image or extend the official image
42 | # as described in https://airflow.apache.org/docs/docker-stack/build.html.
43 | # Default: ''
44 | #
45 | # Feel free to modify this file to suit your needs.
46 | ---
47 | x-airflow-common:
48 | &airflow-common
49 | # In order to add custom dependencies or upgrade provider packages you can use your extended image.
50 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
51 |   # and uncomment the "build" line below, then run `docker-compose build` to build the images.
52 | image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.9.2}
53 | # build: .
54 | environment:
55 | &airflow-common-env
56 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor
57 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
58 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
59 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
60 | AIRFLOW__CORE__FERNET_KEY: ''
61 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
62 | AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
63 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
64 | # yamllint disable rule:line-length
65 | # Use simple http server on scheduler for health checks
66 | # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
67 | # yamllint enable rule:line-length
68 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
69 |     # WARNING: Use the _PIP_ADDITIONAL_REQUIREMENTS option ONLY for quick checks
70 |     # for other purposes (development, test and especially production usage) build/extend the Airflow image.
71 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
72 | # The following line can be used to set a custom config file, stored in the local config folder
73 |     # If you want to use it, uncomment it and replace airflow.cfg with the name of your config file
74 | # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
75 | volumes:
76 | - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
77 | - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
78 | - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
79 | - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
80 | user: "${AIRFLOW_UID:-50000}:0"
81 | depends_on:
82 | &airflow-common-depends-on
83 | redis:
84 | condition: service_healthy
85 | postgres:
86 | condition: service_healthy
87 |
88 | services:
89 | postgres:
90 | image: postgres:13
91 | environment:
92 | POSTGRES_USER: airflow
93 | POSTGRES_PASSWORD: airflow
94 | POSTGRES_DB: airflow
95 | volumes:
96 | - postgres-db-volume:/var/lib/postgresql/data
97 | healthcheck:
98 | test: ["CMD", "pg_isready", "-U", "airflow"]
99 | interval: 10s
100 | retries: 5
101 | start_period: 5s
102 | restart: always
103 | ports:
104 | - "5432:5432"
105 |
106 | pgadmin:
107 | container_name: pgadmin4_container2
108 | image: dpage/pgadmin4
109 | restart: always
110 | environment:
111 | PGADMIN_DEFAULT_EMAIL: admin@admin.com
112 | PGADMIN_DEFAULT_PASSWORD: root
113 | ports:
114 | - "5050:80"
115 |
116 | redis:
117 |     # Redis is limited to 7.2-bookworm due to a licensing change
118 | # https://redis.io/blog/redis-adopts-dual-source-available-licensing/
119 | image: redis:7.2-bookworm
120 | expose:
121 | - 6379
122 | healthcheck:
123 | test: ["CMD", "redis-cli", "ping"]
124 | interval: 10s
125 | timeout: 30s
126 | retries: 50
127 | start_period: 30s
128 | restart: always
129 |
130 | airflow-webserver:
131 | <<: *airflow-common
132 | command: webserver
133 | ports:
134 | - "8080:8080"
135 | healthcheck:
136 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
137 | interval: 30s
138 | timeout: 10s
139 | retries: 5
140 | start_period: 30s
141 | restart: always
142 | depends_on:
143 | <<: *airflow-common-depends-on
144 | airflow-init:
145 | condition: service_completed_successfully
146 |
147 | airflow-scheduler:
148 | <<: *airflow-common
149 | command: scheduler
150 | healthcheck:
151 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
152 | interval: 30s
153 | timeout: 10s
154 | retries: 5
155 | start_period: 30s
156 | restart: always
157 | depends_on:
158 | <<: *airflow-common-depends-on
159 | airflow-init:
160 | condition: service_completed_successfully
161 |
162 | airflow-worker:
163 | <<: *airflow-common
164 | command: celery worker
165 | healthcheck:
166 | # yamllint disable rule:line-length
167 | test:
168 | - "CMD-SHELL"
169 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
170 | interval: 30s
171 | timeout: 10s
172 | retries: 5
173 | start_period: 30s
174 | environment:
175 | <<: *airflow-common-env
176 | # Required to handle warm shutdown of the celery workers properly
177 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
178 | DUMB_INIT_SETSID: "0"
179 | restart: always
180 | depends_on:
181 | <<: *airflow-common-depends-on
182 | airflow-init:
183 | condition: service_completed_successfully
184 |
185 | airflow-triggerer:
186 | <<: *airflow-common
187 | command: triggerer
188 | healthcheck:
189 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
190 | interval: 30s
191 | timeout: 10s
192 | retries: 5
193 | start_period: 30s
194 | restart: always
195 | depends_on:
196 | <<: *airflow-common-depends-on
197 | airflow-init:
198 | condition: service_completed_successfully
199 |
200 | airflow-init:
201 | <<: *airflow-common
202 | entrypoint: /bin/bash
203 | # yamllint disable rule:line-length
204 | command:
205 | - -c
206 | - |
207 | if [[ -z "${AIRFLOW_UID}" ]]; then
208 | echo
209 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
210 | echo "If you are on Linux, you SHOULD follow the instructions below to set "
211 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
212 | echo "For other operating systems you can get rid of the warning with manually created .env file:"
213 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
214 | echo
215 | fi
216 | one_meg=1048576
217 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
218 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
219 | disk_available=$$(df / | tail -1 | awk '{print $$4}')
220 | warning_resources="false"
221 | if (( mem_available < 4000 )) ; then
222 | echo
223 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
224 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
225 | echo
226 | warning_resources="true"
227 | fi
228 | if (( cpus_available < 2 )); then
229 | echo
230 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
231 | echo "At least 2 CPUs recommended. You have $${cpus_available}"
232 | echo
233 | warning_resources="true"
234 | fi
235 | if (( disk_available < one_meg * 10 )); then
236 | echo
237 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
238 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
239 | echo
240 | warning_resources="true"
241 | fi
242 | if [[ $${warning_resources} == "true" ]]; then
243 | echo
244 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
245 | echo "Please follow the instructions to increase amount of resources available:"
246 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
247 | echo
248 | fi
249 | mkdir -p /sources/logs /sources/dags /sources/plugins
250 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
251 | exec /entrypoint airflow version
252 | # yamllint enable rule:line-length
253 | environment:
254 | <<: *airflow-common-env
255 | _AIRFLOW_DB_MIGRATE: 'true'
256 | _AIRFLOW_WWW_USER_CREATE: 'true'
257 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
258 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
259 | _PIP_ADDITIONAL_REQUIREMENTS: ''
260 | user: "0:0"
261 | volumes:
262 | - ${AIRFLOW_PROJ_DIR:-.}:/sources
263 |
264 | airflow-cli:
265 | <<: *airflow-common
266 | profiles:
267 | - debug
268 | environment:
269 | <<: *airflow-common-env
270 | CONNECTION_CHECK_MAX_COUNT: "0"
271 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
272 | command:
273 | - bash
274 | - -c
275 | - airflow
276 |
277 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up
278 | # or by explicitly targeted on the command line e.g. docker-compose up flower.
279 | # See: https://docs.docker.com/compose/profiles/
280 | flower:
281 | <<: *airflow-common
282 | command: celery flower
283 | profiles:
284 | - flower
285 | ports:
286 | - "5555:5555"
287 | healthcheck:
288 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
289 | interval: 30s
290 | timeout: 10s
291 | retries: 5
292 | start_period: 30s
293 | restart: always
294 | depends_on:
295 | <<: *airflow-common-depends-on
296 | airflow-init:
297 | condition: service_completed_successfully
298 |
299 | volumes:
300 | postgres-db-volume:
301 |
--------------------------------------------------------------------------------