├── .env
├── requirements.txt
├── .gitignore
├── dags
│   ├── dag_definition
│   │   ├── __pycache__
│   │   │   └── random_users.cpython-312.pyc
│   │   └── random_users.py
│   └── utils
│       ├── aws.py
│       ├── extract_data.py
│       └── transfer_to_s3.py
├── Dockerfile
├── README.md
└── docker-compose.yaml

/.env:
--------------------------------------------------------------------------------
AIRFLOW_UID=50000
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
awswrangler==3.10.0
boto3==1.34.94
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
logs/
my_venv/
plugins/
user_profiles/
__pycache__/
convert_profiles.py
random_user.txt
results.json
--------------------------------------------------------------------------------
/dags/dag_definition/__pycache__/random_users.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/random_user_generator/main/dags/dag_definition/__pycache__/random_users.cpython-312.pyc
--------------------------------------------------------------------------------
/dags/utils/aws.py:
--------------------------------------------------------------------------------
import boto3
from airflow.models import Variable


def session():
    """Create a boto3 session from AWS credentials stored as Airflow Variables."""
    session = boto3.Session(
        aws_access_key_id=Variable.get('access_key'),
        aws_secret_access_key=Variable.get('secret_key'),
        region_name='eu-central-1'
    )
    return session
--------------------------------------------------------------------------------
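The `session()` helper in `dags/utils/aws.py` reads the AWS credentials from Airflow Variables named `access_key` and `secret_key`, so those Variables must exist (via Admin > Variables in the UI, the CLI, or programmatically) before the DAG can reach S3. Below is a minimal sketch of creating them from a Python shell inside a running Airflow container; the values shown are placeholders, not real credentials.

```python
# Sketch: create the Airflow Variables that dags/utils/aws.py expects.
# Run inside a running Airflow container, e.g.:
#   docker compose exec airflow-webserver python
# The values below are placeholders; substitute your own AWS credentials.
from airflow.models import Variable

Variable.set("access_key", "YOUR_AWS_ACCESS_KEY_ID")
Variable.set("secret_key", "YOUR_AWS_SECRET_ACCESS_KEY")
```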
/Dockerfile:
--------------------------------------------------------------------------------
# Set your base image
FROM apache/airflow:2.10.3

# Set your desired working directory
WORKDIR /downloads

# Copy the requirements.txt file from local into the downloads folder in the container
COPY requirements.txt /downloads

# Install the dependencies listed in the requirements file
RUN pip install --no-cache-dir -r /downloads/requirements.txt
--------------------------------------------------------------------------------
/dags/utils/extract_data.py:
--------------------------------------------------------------------------------
# Import packages
import pandas as pd
import requests


def get_api_data():
    """
    Get 1000 random user profiles from the API and
    turn the result into a pandas DataFrame.
    """
    url = 'https://randomuser.me/api/?results=1000'
    response = requests.get(url)
    response = response.json()['results']
    data = pd.DataFrame(response)
    return data


def extract_selected_columns():
    """
    Extract the desired columns from the dataset.
    """
    data = get_api_data()

    # Extract relevant columns
    data['first_name'] = data['name'].apply(lambda x: x['first'])
    data['last_name'] = data['name'].apply(lambda x: x['last'])
    selected_columns = data[['gender', 'first_name', 'last_name']]

    # Print the shape so it shows up in the Airflow task logs
    print(selected_columns.shape)

    return selected_columns
--------------------------------------------------------------------------------
/dags/utils/transfer_to_s3.py:
--------------------------------------------------------------------------------
import awswrangler as wr

from utils.aws import session
from utils.extract_data import extract_selected_columns


def upload_to_s3():
    """
    Extract the selected profile columns and upload them to an S3 bucket
    as a Parquet dataset using AWS Wrangler.
    """

    # Save our extracted data in a variable
    data = extract_selected_columns()

    # Verify that the DataFrame is not empty
    if data.empty:
        print("The DataFrame is empty. No data to upload.")
        return

    bucket_name = "s3://chisomnwa-bucket"
    file_key = "random-profiles.parquet"

    # Upload the DataFrame as a Parquet dataset to S3
    wr.s3.to_parquet(
        df=data,
        path=f"{bucket_name}/{file_key}",
        index=False,
        boto3_session=session(),
        dataset=True,
        mode='overwrite'
    )

    print(f"Data successfully uploaded to {bucket_name}/{file_key}")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Random User Generator

This project is about exercising my skills in data pipeline orchestration with **Airflow**, a skill I learnt during my intensive data engineering Bootcamp training at Core Data Engineers.

## Project Tasks
* Go to https://randomuser.me/api
* Pull 1000 profiles
* Convert the profiles into a pandas DataFrame
* When you go to the Airflow logs, you should see the shape of the DataFrame
* Use AWS Wrangler to transfer the data to S3

## Solution
* Create your project folder.
* Deploy Airflow on Docker Compose by fetching the official docker-compose.yaml and setting it up.
* Create the necessary files, such as the `.env`, `Dockerfile`, and `requirements.txt` files.
* Create the functions that you will use to get the data from the API, extract the necessary columns, and load the data into Amazon S3 (a quick local check of the extraction step is sketched at the end of this README).

## At the end, under the dags folder, you will have:
* **dag_definition folder**, which has:

  * **random_users.py** - the file that contains the DAG for the Airflow orchestration.

* **utils folder**, which has:

  * **aws.py** - the file that creates a boto3 session from the AWS credentials stored as Airflow Variables.

  * **extract_data.py** - the file that contains the functions that get the data from the API and extract the necessary columns from the downloaded data.

  * **transfer_to_s3.py** - the file that contains the function that uploads the data to Amazon S3.
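
## Quick local check (optional)

Before wiring the functions into Airflow, you can sanity-check the extraction step on your machine. The snippet below is a minimal sketch that mirrors the logic in `dags/utils/extract_data.py`; it only assumes `requests` and `pandas` are installed locally.

```python
# Pull 1000 random profiles and confirm the shape of the selected columns.
import pandas as pd
import requests

response = requests.get("https://randomuser.me/api/?results=1000", timeout=30)
response.raise_for_status()
profiles = pd.DataFrame(response.json()["results"])

# Flatten the nested name field and keep only the columns the pipeline uses
profiles["first_name"] = profiles["name"].apply(lambda x: x["first"])
profiles["last_name"] = profiles["name"].apply(lambda x: x["last"])
selected = profiles[["gender", "first_name", "last_name"]]

print(selected.shape)  # expected: (1000, 3)
```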

--------------------------------------------------------------------------------
/dags/dag_definition/random_users.py:
--------------------------------------------------------------------------------
# Import the required modules
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python import PythonOperator

from utils.extract_data import extract_selected_columns
from utils.transfer_to_s3 import upload_to_s3

# Create the default arguments
default_args = {
    'owner': 'chisom',
    'start_date': datetime(2024, 11, 20),
    'retries': 3,
    'retry_delay': timedelta(seconds=3),
    'execution_timeout': timedelta(minutes=10),
}

# Instantiate a DAG
dag = DAG(
    dag_id="random_user_generator",
    default_args=default_args,
    default_view="graph",
    tags=["generate_users"],
    description='returning random users',
    schedule_interval="0 0 * * *",
    catchup=False
)

# Create a PythonOperator that calls the function that extracts the data
# from the API, pulls out the required columns and returns them as a
# pandas DataFrame
convert_profiles = PythonOperator(
    dag=dag,
    task_id='convert_profiles',
    python_callable=extract_selected_columns
)

# Create a PythonOperator that calls the function that loads the data to S3
load_data_to_s3 = PythonOperator(
    dag=dag,
    task_id='load_data',
    python_callable=upload_to_s3
)

# Specify the task dependencies
convert_profiles >> load_data_to_s3
--------------------------------------------------------------------------------
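Once the `random_user_generator` DAG has completed a run, you can confirm that the upload worked by reading the Parquet data back from the bucket. This is a minimal sketch, assuming the same bucket and key used in `dags/utils/transfer_to_s3.py` and that AWS credentials for that bucket are available to boto3 on your machine.

```python
# Read the uploaded dataset back from S3 to verify the load.
import awswrangler as wr
import boto3

# Credentials are resolved from the environment / AWS config files here,
# unlike dags/utils/aws.py, which pulls them from Airflow Variables.
session = boto3.Session(region_name="eu-central-1")

df = wr.s3.read_parquet(
    path="s3://chisomnwa-bucket/random-profiles.parquet/",
    dataset=True,
    boto3_session=session,
)

print(df.shape)   # should report 1000 rows and 3 columns
print(df.head())
```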
/docker-compose.yaml:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic configuration using environment variables or an .env file
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME           - Docker image name used to run Airflow.
#                                Default: apache/airflow:2.10.3
# AIRFLOW_UID                  - User ID in Airflow containers
#                                Default: 50000
# AIRFLOW_PROJ_DIR             - Base path to which all the files will be volumed.
#                                Default: .
# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
#
# _AIRFLOW_WWW_USER_USERNAME   - Username for the administrator account (if requested).
#                                Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD   - Password for the administrator account (if requested).
#                                Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
#                                Use this option ONLY for quick checks. Installing requirements at container
#                                startup is done EVERY TIME the service is started.
#                                A better way is to build a custom image or extend the official image
#                                as described in https://airflow.apache.org/docs/docker-stack/build.html.
#                                Default: ''
#
# Feel free to modify this file to suit your needs.
---
x-airflow-common:
  &airflow-common
  # In order to add custom dependencies or upgrade provider packages you can use your extended image.
  # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
  # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
  image: ${AIRFLOW_IMAGE_NAME:-airflow_awswrangler:v1.0}
  # build: .
  environment:
    &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
    AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
    # yamllint disable rule:line-length
    # Use simple http server on scheduler for health checks
    # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
    # yamllint enable rule:line-length
    AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
    # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks
    # for other purpose (development, test and especially production usage) build/extend Airflow image.
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
    # The following line can be used to set a custom config file, stored in the local config folder
    # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file
    # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
  volumes:
    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
    - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
    - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
  user: "${AIRFLOW_UID:-50000}:0"
  depends_on:
    &airflow-common-depends-on
    redis:
      condition: service_healthy
    postgres:
      condition: service_healthy

services:
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 10s
      retries: 5
      start_period: 5s
    restart: always

  redis:
    # Redis is limited to 7.2-bookworm due to licencing change
    # https://redis.io/blog/redis-adopts-dual-source-available-licensing/
    image: redis:7.2-bookworm
    expose:
      - 6379
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 30s
      retries: 50
      start_period: 30s
    restart: always

  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - "8080:8080"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-worker:
    <<: *airflow-common
    command: celery worker
    healthcheck:
      # yamllint disable rule:line-length
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    environment:
      <<: *airflow-common-env
      # Required to handle warm shutdown of the celery workers properly
      # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
      DUMB_INIT_SETSID: "0"
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-init:
    <<: *airflow-common
    entrypoint: /bin/bash
    # yamllint disable rule:line-length
    command:
      - -c
      - |
        if [[ -z "${AIRFLOW_UID}" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
          echo "If you are on Linux, you SHOULD follow the instructions below to set "
          echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
          echo "For other operating systems you can get rid of the warning with manually created .env file:"
          echo "    See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
          echo
        fi
        one_meg=1048576
        mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
        cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
        disk_available=$$(df / | tail -1 | awk '{print $$4}')
        warning_resources="false"
        if (( mem_available < 4000 )) ; then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
          echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
          echo
          warning_resources="true"
        fi
        if (( cpus_available < 2 )); then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
          echo "At least 2 CPUs recommended. You have $${cpus_available}"
          echo
          warning_resources="true"
        fi
        if (( disk_available < one_meg * 10 )); then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
          echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
          echo
          warning_resources="true"
        fi
        if [[ $${warning_resources} == "true" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
          echo "Please follow the instructions to increase amount of resources available:"
          echo "   https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
          echo
        fi
        mkdir -p /sources/logs /sources/dags /sources/plugins
        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
        exec /entrypoint airflow version
    # yamllint enable rule:line-length
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_MIGRATE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
      _PIP_ADDITIONAL_REQUIREMENTS: ''
    user: "0:0"
    volumes:
      - ${AIRFLOW_PROJ_DIR:-.}:/sources

  airflow-cli:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
      CONNECTION_CHECK_MAX_COUNT: "0"
    # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
    command:
      - bash
      - -c
      - airflow

  # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up
  # or by explicitly targeted on the command line e.g. docker-compose up flower.
  # See: https://docs.docker.com/compose/profiles/
  flower:
    <<: *airflow-common
    command: celery flower
    profiles:
      - flower
    ports:
      - "5555:5555"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

volumes:
  postgres-db-volume:
--------------------------------------------------------------------------------