├── .gitignore ├── LICENSE ├── README.md ├── examples ├── 1-pip │ └── constraints.sh ├── 2-compose │ └── docker-compose.yaml ├── 3-simple │ └── pandas_example.py └── 4-applied │ ├── clean_dataset.py │ └── get_wine_dataset.py └── lab.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Alfredo Deza 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning Apache Airflow 2 | 3 | Apache Airflow is a platform to define data pipelines, monitor execution and handle workflow orchestration. If you are familiar with schedulers, consumers, and queues, Airflow is a great tool to explore. 4 | 5 | Airflow solves several problems like managing scheduled jobs and easily handling dependencies between tasks. It also provides a great UI to monitor and manage the workflows. 6 | 7 | This repository is part of a course on applied Apache Airflow. It is meant to be used as a reference for the course and not as a standalone guide. 8 | 9 | ## Lesson 1: Installation 10 | 11 | There are many different ways you can install and use Airflow. From building the project from source to using a hosted (ready-to-use) service. In this course we will explore installing from the Python Package Index (PyPI) as well as using Docker Compose. 
12 | 13 | * [Installing from PyPI](./examples/1-pip/) 14 | * [Installing with Docker Compose](./examples/2-compose/) 15 | 16 | ### PyPI 17 | 18 | Always refer to the [official installation guide](https://airflow.apache.org/docs/apache-airflow/stable/start.html). You'll need Python 3 installed. Only use `pip` to install Airflow; the many other installation tools the Python community has produced, including Poetry and pip-tools, are not supported by Airflow's constraint mechanism and can cause issues. 19 | 20 | Create a temporary helper script called `constraints.sh` that sets the Airflow version and constraint URL: 21 | 22 | ```bash 23 | AIRFLOW_VERSION=2.7.1 24 | 25 | # Extract the major.minor version of the Python interpreter you have installed (e.g. 3.8). Set PYTHON_VERSION manually if the detected version does not have a published constraints file. 26 | PYTHON_VERSION="$(python --version | cut -d " " -f 2 | cut -d "." -f 1-2)" 27 | 28 | CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" 29 | # For example, this would install 2.7.1 with Python 3.8: https://raw.githubusercontent.com/apache/airflow/constraints-2.7.1/constraints-3.8.txt 30 | ``` 31 | 32 | Then source it with `source constraints.sh` and install Airflow with `pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}"`. 33 | 34 | Once the installation completes, run the `standalone` sub-command to populate the database and start all components: 35 | 36 | ``` 37 | airflow standalone 38 | ``` 39 | 40 | Open [localhost:8080](http://localhost:8080) in a browser and you should see the Airflow UI. 41 | 42 | ### Docker Compose 43 | 44 | For Apache Airflow 2.7.1 you can fetch a pre-made `docker-compose.yaml` file from the documentation: 45 | 46 | ```bash 47 | curl -LfO 'https://airflow.apache.org/docs/apache-airflow/2.7.1/docker-compose.yaml' 48 | ``` 49 | 50 | Change the version in the URL if you want to use something different. The Docker Compose method provides an all-in-one setup for development and testing; it isn't recommended for production environments. 51 | 52 | Initialize the database before starting the rest of the containers. This step is required: it sets up the environment correctly, including populating the database with its initial data: 53 | 54 | ```bash 55 | docker compose up airflow-init 56 | ``` 57 | 58 | Then start the rest of the containers with: 59 | 60 | ``` 61 | docker compose up 62 | ``` 63 | 64 | Access the environment at [localhost:8080](http://localhost:8080). Use the default credentials `airflow` / `airflow` to log in. 65 | 66 | ## Lesson 2: Apache Airflow Fundamentals 67 | 68 | Airflow has several components that are useful to understand before diving into the code. Start by exploring the simple example, which adds a Python task to a DAG. Run the task, then explore the logs and the UI. 69 | 70 | * [Simple DAG](./examples/3-simple/) 71 | 72 | ## Lesson 3: Creating and running a Pipeline 73 | 74 | Building a pipeline in Airflow helps you get comfortable with core Data Engineering concepts. In this example we will create a pipeline that downloads a file from the internet, cleans the dataset with Pandas, and then persists specific data to a database. Each of these actions runs as a separate task, as the sketch below shows.
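The applied example splits this work across two DAG files that are connected by an Airflow `Dataset`: the first DAG downloads the raw CSV and declares it as an `outlet`, and the second DAG uses that same Dataset as its `schedule`, so it runs whenever the file is refreshed. The following sketch is a minimal outline of that pattern, not the full example; the DAG ids are illustrative and the task bodies are left empty, while the Dataset URI mirrors the one used in the example files.

```python
from datetime import datetime

from airflow import DAG, Dataset
from airflow.decorators import task

# Same Dataset URI the applied example uses for the raw CSV file
RAW_DATA = Dataset("file://localhost/airflow/datasets/raw_wine_dataset.csv")

# Producer DAG: fetches the file and marks the Dataset as updated
with DAG(dag_id="producer_sketch", schedule=None, start_date=datetime(2023, 1, 1)) as producer:

    @task(outlets=[RAW_DATA])
    def download():
        ...  # download the CSV and write it to the Dataset location

    download()

# Consumer DAG: triggered whenever the Dataset above is updated
with DAG(dag_id="consumer_sketch", schedule=[RAW_DATA], start_date=datetime(2023, 1, 1)) as consumer:

    @task
    def clean():
        ...  # read the raw CSV, clean it with Pandas, write a cleaned copy

    @task
    def persist():
        ...  # load the cleaned data into a database

    clean() >> persist()
```

No time-based schedule is involved: once the producer task finishes and updates the Dataset, Airflow schedules the consumer DAG, and `clean() >> persist()` orders the two tasks inside it.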
75 | 76 | * [Example pipeline](./examples/4-applied/) 77 | 78 | ## Lesson 4: Practice Lab 79 | 80 | Use the [included practice lab](./lab.md) to build a data pipeline using Apache Airflow to extract census data, transform it, and load it into a database based on certain conditions. Follow the steps in the lab to complete the exercise in your own repository. 81 | 82 | -------------------------------------------------------------------------------- /examples/1-pip/constraints.sh: -------------------------------------------------------------------------------- 1 | AIRFLOW_VERSION=2.7.1 2 | 3 | # Extract the major.minor version of the Python interpreter you have installed (e.g. 3.8). Set PYTHON_VERSION manually if the detected version does not have a published constraints file. 4 | PYTHON_VERSION="$(python --version | cut -d " " -f 2 | cut -d "." -f 1-2)" 5 | 6 | CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" 7 | # For example, this would install 2.7.1 with Python 3.8: https://raw.githubusercontent.com/apache/airflow/constraints-2.7.1/constraints-3.8.txt 8 | 9 | echo "Install with the following command:" 10 | echo "pip install \"apache-airflow==${AIRFLOW_VERSION}\" --constraint \"${CONSTRAINT_URL}\"" 11 | -------------------------------------------------------------------------------- /examples/2-compose/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.7.1 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. 31 | # Default: . 32 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 33 | # 34 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 35 | # Default: airflow 36 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 37 | # Default: airflow 38 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 39 | # Use this option ONLY for quick checks.
Installing requirements at container 40 | # startup is done EVERY TIME the service is started. 41 | # A better way is to build a custom image or extend the official image 42 | # as described in https://airflow.apache.org/docs/docker-stack/build.html. 43 | # Default: '' 44 | # 45 | # Feel free to modify this file to suit your needs. 46 | --- 47 | version: '3.8' 48 | x-airflow-common: 49 | &airflow-common 50 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 51 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 52 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 53 | image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.7.1} 54 | # build: . 55 | environment: 56 | &airflow-common-env 57 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 58 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 59 | # For backward compatibility, with Airflow <2.3 60 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 61 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 62 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 63 | AIRFLOW__CORE__FERNET_KEY: '' 64 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 65 | AIRFLOW__CORE__LOAD_EXAMPLES: 'true' 66 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 67 | # yamllint disable rule:line-length 68 | # Use simple http server on scheduler for health checks 69 | # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server 70 | # yamllint enable rule:line-length 71 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' 72 | # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks 73 | # for other purpose (development, test and especially production usage) build/extend Airflow image. 
74 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 75 | volumes: 76 | - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags 77 | - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs 78 | - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config 79 | - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins 80 | user: "${AIRFLOW_UID:-50000}:0" 81 | depends_on: 82 | &airflow-common-depends-on 83 | redis: 84 | condition: service_healthy 85 | postgres: 86 | condition: service_healthy 87 | 88 | services: 89 | postgres: 90 | image: postgres:13 91 | environment: 92 | POSTGRES_USER: airflow 93 | POSTGRES_PASSWORD: airflow 94 | POSTGRES_DB: airflow 95 | volumes: 96 | - postgres-db-volume:/var/lib/postgresql/data 97 | healthcheck: 98 | test: ["CMD", "pg_isready", "-U", "airflow"] 99 | interval: 10s 100 | retries: 5 101 | start_period: 5s 102 | restart: always 103 | 104 | redis: 105 | image: redis:latest 106 | expose: 107 | - 6379 108 | healthcheck: 109 | test: ["CMD", "redis-cli", "ping"] 110 | interval: 10s 111 | timeout: 30s 112 | retries: 50 113 | start_period: 30s 114 | restart: always 115 | 116 | airflow-webserver: 117 | <<: *airflow-common 118 | command: webserver 119 | ports: 120 | - "8080:8080" 121 | healthcheck: 122 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 123 | interval: 30s 124 | timeout: 10s 125 | retries: 5 126 | start_period: 30s 127 | restart: always 128 | depends_on: 129 | <<: *airflow-common-depends-on 130 | airflow-init: 131 | condition: service_completed_successfully 132 | 133 | airflow-scheduler: 134 | <<: *airflow-common 135 | command: scheduler 136 | healthcheck: 137 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] 138 | interval: 30s 139 | timeout: 10s 140 | retries: 5 141 | start_period: 30s 142 | restart: always 143 | depends_on: 144 | <<: *airflow-common-depends-on 145 | airflow-init: 146 | condition: service_completed_successfully 147 | 148 | airflow-worker: 149 | <<: *airflow-common 150 | command: celery worker 151 | healthcheck: 152 | # yamllint disable rule:line-length 153 | test: 154 | - "CMD-SHELL" 155 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 156 | interval: 30s 157 | timeout: 10s 158 | retries: 5 159 | start_period: 30s 160 | environment: 161 | <<: *airflow-common-env 162 | # Required to handle warm shutdown of the celery workers properly 163 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 164 | DUMB_INIT_SETSID: "0" 165 | restart: always 166 | depends_on: 167 | <<: *airflow-common-depends-on 168 | airflow-init: 169 | condition: service_completed_successfully 170 | 171 | airflow-triggerer: 172 | <<: *airflow-common 173 | command: triggerer 174 | healthcheck: 175 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 176 | interval: 30s 177 | timeout: 10s 178 | retries: 5 179 | start_period: 30s 180 | restart: always 181 | depends_on: 182 | <<: *airflow-common-depends-on 183 | airflow-init: 184 | condition: service_completed_successfully 185 | 186 | airflow-init: 187 | <<: *airflow-common 188 | entrypoint: /bin/bash 189 | # yamllint disable rule:line-length 190 | command: 191 | - -c 192 | - | 193 | function ver() { 194 | printf "%04d%04d%04d%04d" $${1//./ } 195 | } 196 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 197 | 
airflow_version_comparable=$$(ver $${airflow_version}) 198 | min_airflow_version=2.2.0 199 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 200 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 201 | echo 202 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 203 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 204 | echo 205 | exit 1 206 | fi 207 | if [[ -z "${AIRFLOW_UID}" ]]; then 208 | echo 209 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 210 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 211 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 212 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 213 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 214 | echo 215 | fi 216 | one_meg=1048576 217 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 218 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 219 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 220 | warning_resources="false" 221 | if (( mem_available < 4000 )) ; then 222 | echo 223 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 224 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 225 | echo 226 | warning_resources="true" 227 | fi 228 | if (( cpus_available < 2 )); then 229 | echo 230 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 231 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 232 | echo 233 | warning_resources="true" 234 | fi 235 | if (( disk_available < one_meg * 10 )); then 236 | echo 237 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 238 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 239 | echo 240 | warning_resources="true" 241 | fi 242 | if [[ $${warning_resources} == "true" ]]; then 243 | echo 244 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 245 | echo "Please follow the instructions to increase amount of resources available:" 246 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 247 | echo 248 | fi 249 | mkdir -p /sources/logs /sources/dags /sources/plugins 250 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 251 | exec /entrypoint airflow version 252 | # yamllint enable rule:line-length 253 | environment: 254 | <<: *airflow-common-env 255 | _AIRFLOW_DB_MIGRATE: 'true' 256 | _AIRFLOW_WWW_USER_CREATE: 'true' 257 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 258 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 259 | _PIP_ADDITIONAL_REQUIREMENTS: '' 260 | user: "0:0" 261 | volumes: 262 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 263 | 264 | airflow-cli: 265 | <<: *airflow-common 266 | profiles: 267 | - debug 268 | environment: 269 | <<: *airflow-common-env 270 | CONNECTION_CHECK_MAX_COUNT: "0" 271 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 272 | command: 273 | - bash 274 | - -c 275 | - airflow 276 | 277 | # You can enable flower by adding "--profile flower" option e.g. 
docker-compose --profile flower up 278 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 279 | # See: https://docs.docker.com/compose/profiles/ 280 | flower: 281 | <<: *airflow-common 282 | command: celery flower 283 | profiles: 284 | - flower 285 | ports: 286 | - "5555:5555" 287 | healthcheck: 288 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 289 | interval: 30s 290 | timeout: 10s 291 | retries: 5 292 | start_period: 30s 293 | restart: always 294 | depends_on: 295 | <<: *airflow-common-depends-on 296 | airflow-init: 297 | condition: service_completed_successfully 298 | 299 | volumes: 300 | postgres-db-volume: 301 | -------------------------------------------------------------------------------- /examples/3-simple/pandas_example.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import tempfile 4 | import time 5 | from pprint import pprint 6 | 7 | import pendulum 8 | 9 | from airflow import DAG 10 | from airflow.decorators import task 11 | from airflow.operators.python import PythonVirtualenvOperator, is_venv_installed 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | PATH_TO_PYTHON_BINARY = sys.executable 16 | 17 | 18 | with DAG( 19 | dag_id="example_python_and_pandas", 20 | schedule=None, 21 | tags=["example"], 22 | ) as dag: 23 | 24 | if not is_venv_installed(): 25 | log.warning("The virtualenv_python example task requires virtualenv; please install it.") 26 | else: 27 | # Run the task below in an isolated virtualenv that installs pandas at runtime 28 | @task.virtualenv( 29 | task_id="virtualenv_python", requirements=["pandas==2.1.1"], system_site_packages=False 30 | ) 31 | def pandas_head(): 32 | import pandas as pd 33 | csv_url = "https://raw.githubusercontent.com/paiml/wine-ratings/main/wine-ratings.csv" 34 | df = pd.read_csv(csv_url, index_col=0) 35 | head = df.head(10) 36 | return head.to_csv() 37 | 38 | pandas_task = pandas_head() 39 | -------------------------------------------------------------------------------- /examples/4-applied/clean_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime 4 | from airflow import DAG, Dataset 5 | from airflow.decorators import task 6 | from airflow.operators.python import is_venv_installed 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | RAW_WINE_DATASET = Dataset("file://localhost/airflow/datasets/raw_wine_dataset.csv") 11 | 12 | with DAG( 13 | dag_id="wine_dataset_consumer", 14 | schedule=[RAW_WINE_DATASET], 15 | start_date=datetime(2023, 1, 1), 16 | tags=["example"], 17 | ) as dag: 18 | 19 | if not is_venv_installed(): 20 | raise RuntimeError("virtualenv is not installed!") 21 | else: 22 | @task.virtualenv( 23 | task_id="virtualenv_python", requirements=["pandas==2.1.1"], 24 | system_site_packages=False 25 | ) 26 | def clean_dataset(): 27 | import pandas as pd 28 | df = pd.read_csv("~/airflow/datasets/raw_wine_dataset.csv", index_col=0) 29 | df = df.replace({"\r": ""}, regex=True) 30 | df = df.replace({"\n": " "}, regex=True) 31 | df.drop(['grape'], axis=1, inplace=True) 32 | df.to_csv("~/airflow/datasets/cleaned_dataset.csv") 33 | 34 | @task.virtualenv( 35 | task_id="sqlite_persist_wine_data", requirements=["pandas==2.1.1", "sqlalchemy==2.0.21"], 36 | system_site_packages=False 37 | ) 38 | def persist_dataset(): 39 | import pandas as pd 40 | from sqlalchemy import create_engine 41 | engine = create_engine('sqlite:////Users/alfredodeza/airflow/tmp/wine_dataset.db', 
echo=True) 42 | df = pd.read_csv("~/airflow/datasets/cleaned_dataset.csv", index_col=0) 43 | df.to_sql('wine_dataset', engine) 44 | df.notes.to_sql("wine_notes", engine) 45 | 46 | clean_dataset() >> persist_dataset() -------------------------------------------------------------------------------- /examples/4-applied/get_wine_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime 4 | from airflow import DAG, Dataset 5 | from airflow.decorators import task 6 | from airflow.operators.python import is_venv_installed 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | RAW_WINE_DATASET = Dataset("file://localhost/airflow/datasets/raw_wine_dataset.csv") 11 | 12 | with DAG( 13 | dag_id="wine_dataset_get", 14 | schedule=None, 15 | start_date=datetime(2023, 1, 1), 16 | tags=["example"], 17 | ) as dag: 18 | 19 | if not is_venv_installed(): 20 | raise RuntimeError("virtualenv is not installed!") 21 | else: 22 | @task.virtualenv( 23 | task_id="virtualenv_python", requirements=["pandas==2.1.1"], 24 | system_site_packages=False, outlets=[RAW_WINE_DATASET] 25 | ) 26 | def retrieve_dataset(): 27 | import pandas as pd 28 | df = pd.read_csv("https://raw.githubusercontent.com/paiml/wine-ratings/main/wine-ratings.csv", index_col=0) 29 | df.to_csv("~/airflow/datasets/raw_wine_dataset.csv") 30 | 31 | retrieve_dataset() 32 | -------------------------------------------------------------------------------- /lab.md: -------------------------------------------------------------------------------- 1 | # Practice Lab: Build a Data Pipeline for Census Data 2 | 3 | In this practice lab, you will build a data pipeline using Apache Airflow to extract census data, transform it, and load it into a database based on certain conditions. 4 | 5 | ## Learning Objectives: 6 | 7 | * Create tasks to extract, transform, and load data 8 | * Handle missing data while transforming 9 | * Load filtered data into a database based on conditions 10 | 11 | ## Steps: 12 | 13 | 1. Create a new DAG called census_data_pipeline 14 | 1. Define a PythonOperator to download the census CSV data from: 15 | ``` 16 | https://raw.githubusercontent.com/practical-bootcamp/week4-assignment1-template/main/city_census.csv 17 | ``` 18 | 1. Read in the CSV data and handle any missing values. 19 | 1. Filter the data and choose a condition to extract specific data. For example: 20 | * Age is greater than 30 21 | * State is 'Iowa' 22 | 1. Parse the data into a clean DataFrame and store it in a database 23 | 1. Set up the task ordering and dependencies correctly 24 | 1. Monitor the pipeline execution in the Airflow UI 25 | 26 | **Bonus challenge:** Perform validation on the number of rows before loading into the database and add some simple statistics about the data as a final step. 27 | 28 | 29 | By completing this lab, you will gain practical hands-on experience building a real-world data pipeline with Airflow. The skills covered, including data ingestion, transformation, orchestration, and database loading, will align directly with real-world data engineering pipelines. This lab will allow you to apply the Airflow concepts from the course to an end-to-end pipeline development scenario. 30 | --------------------------------------------------------------------------------
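As a starting point for the practice lab above, here is one possible skeleton that follows the same TaskFlow style as the course examples. It is a sketch, not the official solution: it assumes pandas and SQLAlchemy are available in your Airflow environment, and the file paths, the `age` column name, the filter condition, and the SQLite location are placeholders you should adapt to the actual census data.

```python
from datetime import datetime

import pandas as pd
from sqlalchemy import create_engine

from airflow import DAG
from airflow.decorators import task

CSV_URL = "https://raw.githubusercontent.com/practical-bootcamp/week4-assignment1-template/main/city_census.csv"

with DAG(
    dag_id="census_data_pipeline",
    schedule=None,
    start_date=datetime(2023, 1, 1),
    tags=["lab"],
) as dag:

    @task
    def extract() -> str:
        # Download the raw census CSV and keep a local copy for the next task
        df = pd.read_csv(CSV_URL)
        path = "/tmp/city_census_raw.csv"  # placeholder location
        df.to_csv(path, index=False)
        return path

    @task
    def transform(path: str) -> str:
        # Handle missing values, then filter on the chosen condition
        df = pd.read_csv(path)
        df = df.dropna()  # or df.fillna(...), depending on the data
        df = df[df["age"] > 30]  # placeholder column/condition from the lab
        out = "/tmp/city_census_filtered.csv"
        df.to_csv(out, index=False)
        return out

    @task
    def load(path: str) -> None:
        # Persist the filtered rows into a local SQLite database
        engine = create_engine("sqlite:////tmp/census.db")  # placeholder path
        pd.read_csv(path).to_sql("census", engine, if_exists="replace", index=False)

    load(transform(extract()))
```

The `@task` decorator wraps Airflow's PythonOperator, so this satisfies the lab's PythonOperator requirement while keeping the task ordering explicit through the `load(transform(extract()))` call chain; for the bonus challenge you could add a final task that counts the loaded rows and logs simple statistics before finishing.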