├── .env
├── requirements.txt
├── .gitignore
├── dags
│   ├── dag_definition
│   │   ├── __pycache__
│   │   │   └── random_users.cpython-312.pyc
│   │   └── random_users.py
│   └── utils
│       ├── aws.py
│       ├── extract_data.py
│       └── transfer_to_s3.py
├── Dockerfile
├── README.md
└── docker-compose.yaml

/.env:
--------------------------------------------------------------------------------
AIRFLOW_UID=50000
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
awswrangler==3.10.0
boto3==1.34.94
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
logs/
my_venv/
plugins/
user_profiles/
__pycache__/
convert_profiles.py
random_user.txt
results.json
--------------------------------------------------------------------------------
/dags/dag_definition/__pycache__/random_users.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/random_user_generator/main/dags/dag_definition/__pycache__/random_users.cpython-312.pyc
--------------------------------------------------------------------------------
/dags/utils/aws.py:
--------------------------------------------------------------------------------
import boto3
from airflow.models import Variable


def session():
    """Create a boto3 session from AWS credentials stored as Airflow Variables."""
    session = boto3.Session(
        aws_access_key_id=Variable.get('access_key'),
        aws_secret_access_key=Variable.get('secret_key'),
        region_name='eu-central-1'
    )
    return session
--------------------------------------------------------------------------------
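The `session()` helper in `dags/utils/aws.py` reads the AWS credentials from Airflow Variables named `access_key` and `secret_key`, so those Variables must exist (via Admin > Variables in the UI, the CLI, or programmatically) before the DAG can reach S3. Below is a minimal sketch of creating them from a Python shell inside a running Airflow container; the values shown are placeholders, not real credentials.

```python
# Sketch: create the Airflow Variables that dags/utils/aws.py expects.
# Run inside a running Airflow container, e.g.:
#   docker compose exec airflow-webserver python
# The values below are placeholders; substitute your own AWS credentials.
from airflow.models import Variable

Variable.set("access_key", "YOUR_AWS_ACCESS_KEY_ID")
Variable.set("secret_key", "YOUR_AWS_SECRET_ACCESS_KEY")
```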
/Dockerfile:
--------------------------------------------------------------------------------
# Set your base image
FROM apache/airflow:2.10.3

# Set your desired working directory
WORKDIR /downloads

# Copy the requirements.txt file from local into the downloads folder in the container
COPY requirements.txt /downloads

# Install the dependencies listed in the requirements file
RUN pip install --no-cache-dir -r /downloads/requirements.txt
--------------------------------------------------------------------------------
/dags/utils/extract_data.py:
--------------------------------------------------------------------------------
# Import packages
import pandas as pd
import requests


def get_api_data():
    """
    Get 1000 random user profiles from the API and
    turn the result into a pandas DataFrame.
    """
    url = 'https://randomuser.me/api/?results=1000'
    response = requests.get(url)
    response = response.json()['results']
    data = pd.DataFrame(response)
    return data


def extract_selected_columns():
    """
    Extract the desired columns from the dataset.
    """
    data = get_api_data()

    # Extract relevant columns
    data['first_name'] = data['name'].apply(lambda x: x['first'])
    data['last_name'] = data['name'].apply(lambda x: x['last'])
    selected_columns = data[['gender', 'first_name', 'last_name']]

    # Print the shape so it shows up in the Airflow task logs
    print(selected_columns.shape)

    return selected_columns
--------------------------------------------------------------------------------
/dags/utils/transfer_to_s3.py:
--------------------------------------------------------------------------------
import awswrangler as wr

from utils.aws import session
from utils.extract_data import extract_selected_columns


def upload_to_s3():
    """
    Extract the selected profile columns and upload them to an S3 bucket
    as a Parquet dataset using AWS Wrangler.
    """

    # Save our extracted data in a variable
    data = extract_selected_columns()

    # Verify that the DataFrame is not empty
    if data.empty:
        print("The DataFrame is empty. No data to upload.")
        return

    bucket_name = "s3://chisomnwa-bucket"
    file_key = "random-profiles.parquet"

    # Upload the DataFrame as a Parquet dataset to S3
    wr.s3.to_parquet(
        df=data,
        path=f"{bucket_name}/{file_key}",
        index=False,
        boto3_session=session(),
        dataset=True,
        mode='overwrite'
    )

    print(f"Data successfully uploaded to {bucket_name}/{file_key}")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Random User Generator

This project is about exercising my skills in data pipeline orchestration with **Airflow**, a skill I learnt during my intensive data engineering Bootcamp training at Core Data Engineers.

## Project Tasks
* Go to https://randomuser.me/api
* Pull 1000 profiles
* Convert the profiles into a pandas DataFrame
* When you go to the Airflow logs, you should see the shape of the DataFrame
* Use AWS Wrangler to transfer the data to S3

## Solution
* Create your project folder.
* Deploy Airflow on Docker Compose by fetching the official docker-compose.yaml and setting it up.
* Create the necessary files, such as the `.env`, `Dockerfile`, and `requirements.txt` files.
* Create the functions that you will use to get the data from the API, extract the necessary columns, and load the data into Amazon S3 (a quick local check of the extraction step is sketched at the end of this README).

## At the end, under the dags folder, you will have:
* **dag_definition folder**, which has:

  * **random_users.py** - the file that contains the DAG for the Airflow orchestration.

* **utils folder**, which has:

  * **aws.py** - the file that creates a boto3 session from the AWS credentials stored as Airflow Variables.

  * **extract_data.py** - the file that contains the functions that get the data from the API and extract the necessary columns from the downloaded data.

  * **transfer_to_s3.py** - the file that contains the function that uploads the data to Amazon S3.
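
## Quick local check (optional)

Before wiring the functions into Airflow, you can sanity-check the extraction step on your machine. The snippet below is a minimal sketch that mirrors the logic in `dags/utils/extract_data.py`; it only assumes `requests` and `pandas` are installed locally.

```python
# Pull 1000 random profiles and confirm the shape of the selected columns.
import pandas as pd
import requests

response = requests.get("https://randomuser.me/api/?results=1000", timeout=30)
response.raise_for_status()
profiles = pd.DataFrame(response.json()["results"])

# Flatten the nested name field and keep only the columns the pipeline uses
profiles["first_name"] = profiles["name"].apply(lambda x: x["first"])
profiles["last_name"] = profiles["name"].apply(lambda x: x["last"])
selected = profiles[["gender", "first_name", "last_name"]]

print(selected.shape)  # expected: (1000, 3)
```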

--------------------------------------------------------------------------------
/dags/dag_definition/random_users.py:
--------------------------------------------------------------------------------
# Import the required modules
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python import PythonOperator

from utils.extract_data import extract_selected_columns
from utils.transfer_to_s3 import upload_to_s3

# Create the default arguments
default_args = {
    'owner': 'chisom',
    'start_date': datetime(2024, 11, 20),
    'retries': 3,
    'retry_delay': timedelta(seconds=3),
    'execution_timeout': timedelta(minutes=10),
}

# Instantiate a DAG
dag = DAG(
    dag_id="random_user_generator",
    default_args=default_args,
    default_view="graph",
    tags=["generate_users"],
    description='returning random users',
    schedule_interval="0 0 * * *",
    catchup=False
)

# Create a PythonOperator that calls the function that extracts the data
# from the API, pulls out the required columns and returns them as a
# pandas DataFrame
convert_profiles = PythonOperator(
    dag=dag,
    task_id='convert_profiles',
    python_callable=extract_selected_columns
)

# Create a PythonOperator that calls the function that loads the data to S3
load_data_to_s3 = PythonOperator(
    dag=dag,
    task_id='load_data',
    python_callable=upload_to_s3
)

# Specify the task dependencies
convert_profiles >> load_data_to_s3
--------------------------------------------------------------------------------
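Once the `random_user_generator` DAG has completed a run, you can confirm that the upload worked by reading the Parquet data back from the bucket. This is a minimal sketch, assuming the same bucket and key used in `dags/utils/transfer_to_s3.py` and that AWS credentials for that bucket are available to boto3 on your machine.

```python
# Read the uploaded dataset back from S3 to verify the load.
import awswrangler as wr
import boto3

# Credentials are resolved from the environment / AWS config files here,
# unlike dags/utils/aws.py, which pulls them from Airflow Variables.
session = boto3.Session(region_name="eu-central-1")

df = wr.s3.read_parquet(
    path="s3://chisomnwa-bucket/random-profiles.parquet/",
    dataset=True,
    boto3_session=session,
)

print(df.shape)   # should report 1000 rows and 3 columns
print(df.head())
```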
/docker-compose.yaml:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic configuration using environment variables or an .env file
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME           - Docker image name used to run Airflow.
#                                Default: apache/airflow:2.10.3
# AIRFLOW_UID                  - User ID in Airflow containers
#                                Default: 50000
# AIRFLOW_PROJ_DIR             - Base path to which all the files will be volumed.
#                                Default: .
# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
#
# _AIRFLOW_WWW_USER_USERNAME   - Username for the administrator account (if requested).
#                                Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD   - Password for the administrator account (if requested).
#                                Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
#                                Use this option ONLY for quick checks. Installing requirements at container
#                                startup is done EVERY TIME the service is started.
#                                A better way is to build a custom image or extend the official image
#                                as described in https://airflow.apache.org/docs/docker-stack/build.html.
#                                Default: ''
#
# Feel free to modify this file to suit your needs.
---
x-airflow-common:
  &airflow-common
  # In order to add custom dependencies or upgrade provider packages you can use your extended image.
  # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
  # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
  image: ${AIRFLOW_IMAGE_NAME:-airflow_awswrangler:v1.0}
  # build: .
  environment:
    &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
    AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
    # yamllint disable rule:line-length
    # Use simple http server on scheduler for health checks
    # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
    # yamllint enable rule:line-length
    AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
    # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks
    # for other purpose (development, test and especially production usage) build/extend Airflow image.
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
    # The following line can be used to set a custom config file, stored in the local config folder
    # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file
    # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
  volumes:
    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
    - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
    - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
  user: "${AIRFLOW_UID:-50000}:0"
  depends_on:
    &airflow-common-depends-on
    redis:
      condition: service_healthy
    postgres:
      condition: service_healthy

services:
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 10s
      retries: 5
      start_period: 5s
    restart: always

  redis:
    # Redis is limited to 7.2-bookworm due to licencing change
    # https://redis.io/blog/redis-adopts-dual-source-available-licensing/
    image: redis:7.2-bookworm
    expose:
      - 6379
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 30s
      retries: 50
      start_period: 30s
    restart: always

  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - "8080:8080"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-worker:
    <<: *airflow-common
    command: celery worker
    healthcheck:
      # yamllint disable rule:line-length
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    environment:
      <<: *airflow-common-env
      # Required to handle warm shutdown of the celery workers properly
      # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
      DUMB_INIT_SETSID: "0"
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-init:
    <<: *airflow-common
    entrypoint: /bin/bash
    # yamllint disable rule:line-length
    command:
      - -c
      - |
        if [[ -z "${AIRFLOW_UID}" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
          echo "If you are on Linux, you SHOULD follow the instructions below to set "
          echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
          echo "For other operating systems you can get rid of the warning with manually created .env file:"
          echo "    See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
          echo
        fi
        one_meg=1048576
        mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
        cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
        disk_available=$$(df / | tail -1 | awk '{print $$4}')
        warning_resources="false"
        if (( mem_available < 4000 )) ; then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
          echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
          echo
          warning_resources="true"
        fi
        if (( cpus_available < 2 )); then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
          echo "At least 2 CPUs recommended. You have $${cpus_available}"
          echo
          warning_resources="true"
        fi
        if (( disk_available < one_meg * 10 )); then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
          echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
          echo
          warning_resources="true"
        fi
        if [[ $${warning_resources} == "true" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
          echo "Please follow the instructions to increase amount of resources available:"
          echo "   https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
          echo
        fi
        mkdir -p /sources/logs /sources/dags /sources/plugins
        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
        exec /entrypoint airflow version
    # yamllint enable rule:line-length
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_MIGRATE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
      _PIP_ADDITIONAL_REQUIREMENTS: ''
    user: "0:0"
    volumes:
      - ${AIRFLOW_PROJ_DIR:-.}:/sources

  airflow-cli:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
      CONNECTION_CHECK_MAX_COUNT: "0"
    # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
    command:
      - bash
      - -c
      - airflow

  # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up
  # or by explicitly targeted on the command line e.g. docker-compose up flower.
  # See: https://docs.docker.com/compose/profiles/
  flower:
    <<: *airflow-common
    command: celery flower
    profiles:
      - flower
    ports:
      - "5555:5555"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

volumes:
  postgres-db-volume:
--------------------------------------------------------------------------------