├── dbt ├── dbt_travel_agency │ ├── seeds │ │ └── .gitkeep │ ├── tests │ │ └── .gitkeep │ ├── analyses │ │ └── .gitkeep │ ├── snapshots │ │ └── .gitkeep │ ├── .gitignore │ ├── models │ │ ├── facts │ │ │ ├── fact_country_population.sql │ │ │ └── source.yml │ │ └── dimensions │ │ │ ├── dim_country.sql │ │ │ └── schema.yml │ ├── README.md │ └── dbt_project.yml └── requirements.txt ├── airflow ├── requirements.txt ├── Dockerfile ├── dags │ ├── includes │ │ ├── SQL │ │ │ └── create_table.sql │ │ ├── extract_data.py │ │ ├── transform_data.py │ │ └── s3_utils.py │ └── dag_definition │ │ └── travel_agency_dag.py └── docker-compose.yaml ├── .DS_Store ├── architecture ├── modules │ ├── ecr │ │ ├── variables.tf │ │ ├── outputs.tf │ │ └── main.tf │ ├── iam_roles │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── main.tf │ ├── vpc │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── main.tf │ ├── s3 │ │ ├── variables.tf │ │ └── main.tf │ ├── ssm │ │ ├── variables.tf │ │ └── main.tf │ └── redshift │ │ ├── main.tf │ │ └── variables.tf ├── backend.tf ├── provider.tf └── main.tf ├── images ├── dbt_init_successful.png ├── dbt_run_successful.png ├── dbt_debug_successful.png ├── travel_agency_dag_run_success.png ├── travel_agency_dag_run_success_2.png ├── Data_Loaded_Successfuly_into_Table.png ├── Travel Agency Architectural Diagram.png ├── fact_and_dimension_tables_in_redshift.png ├── redshift_table_created_successfully.png └── Travel Agency Orchestration Flow Chart.png ├── Travel_Agency_Project_Slides.pdf ├── requirements_ci.txt ├── .gitignore ├── app_requirements.txt ├── .github └── workflow │ ├── ci.yml │ └── cd.yml ├── Dockerfile ├── travel_agency_app.py ├── project_README.md └── README.md /dbt/dbt_travel_agency/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | awswrangler==3.10.0 2 | boto3==1.34.94 -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/.DS_Store -------------------------------------------------------------------------------- /architecture/modules/ecr/variables.tf: -------------------------------------------------------------------------------- 1 | variable "ecr_name" { 2 | default = "travel_agency_ecr" 3 | } 4 | -------------------------------------------------------------------------------- /dbt/requirements.txt: 
--------------------------------------------------------------------------------
1 | dbt-core
2 | dbt-redshift
3 | 
4 | # Redshift host (connection detail for profiles.yml, not a pip-installable package): travel-agency-cluster.cz8jk0qkbxey.af-south-1.redshift.amazonaws.com
--------------------------------------------------------------------------------
/images/dbt_init_successful.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/dbt_init_successful.png
--------------------------------------------------------------------------------
/images/dbt_run_successful.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/dbt_run_successful.png
--------------------------------------------------------------------------------
/Travel_Agency_Project_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/Travel_Agency_Project_Slides.pdf
--------------------------------------------------------------------------------
/images/dbt_debug_successful.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/dbt_debug_successful.png
--------------------------------------------------------------------------------
/airflow/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM apache/airflow:2.10.3
2 | 
3 | COPY requirements.txt /
4 | 
5 | RUN pip install --no-cache-dir -r /requirements.txt
--------------------------------------------------------------------------------
/architecture/modules/ecr/outputs.tf:
--------------------------------------------------------------------------------
1 | output "repository_url" {
2 |   value = aws_ecr_repository.travel_agency_ecr.repository_url
3 | }
4 | 
--------------------------------------------------------------------------------
/architecture/modules/iam_roles/outputs.tf:
--------------------------------------------------------------------------------
1 | output "s3_redshift_role_arn" {
2 |   value = aws_iam_role.redshift_role.arn
3 | }
4 | 
5 | 
--------------------------------------------------------------------------------
/images/travel_agency_dag_run_success.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/travel_agency_dag_run_success.png
--------------------------------------------------------------------------------
/images/travel_agency_dag_run_success_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/travel_agency_dag_run_success_2.png
--------------------------------------------------------------------------------
/requirements_ci.txt:
--------------------------------------------------------------------------------
1 | # These are the packages needed by the .github/workflow/ci.yml workflow (for continuous integration).
2 | 3 | flake8==7.1.2 4 | isort==6.0.0 -------------------------------------------------------------------------------- /images/Data_Loaded_Successfuly_into_Table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/Data_Loaded_Successfuly_into_Table.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | config 2 | logs 3 | plugins 4 | .env 5 | aws.py 6 | **.DS_Store** 7 | **__pycache__** 8 | **__init__.py** 9 | **terraform** 10 | my_venv/ 11 | -------------------------------------------------------------------------------- /images/Travel Agency Architectural Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/Travel Agency Architectural Diagram.png -------------------------------------------------------------------------------- /images/fact_and_dimension_tables_in_redshift.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/fact_and_dimension_tables_in_redshift.png -------------------------------------------------------------------------------- /images/redshift_table_created_successfully.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/redshift_table_created_successfully.png -------------------------------------------------------------------------------- /images/Travel Agency Orchestration Flow Chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/Travel Agency Orchestration Flow Chart.png -------------------------------------------------------------------------------- /architecture/modules/vpc/outputs.tf: -------------------------------------------------------------------------------- 1 | output "vpc_id" { 2 | value = aws_vpc.travel_agency_vpc.id 3 | } 4 | 5 | output "subnet_group_id" { 6 | value = aws_redshift_subnet_group.redshift_subnet_group.id 7 | } 8 | 9 | -------------------------------------------------------------------------------- /architecture/modules/s3/variables.tf: -------------------------------------------------------------------------------- 1 | # variable "bucket_name" { 2 | # default = "cde-project-travel-agency-bucket" 3 | # } 4 | 5 | variable "bucket_name" { 6 | description = "The name of the S3 bucket" 7 | type = string 8 | } 9 | -------------------------------------------------------------------------------- /architecture/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "s3" { 3 | bucket = "travel-agency-backend-bucket" 4 | key = "travel-agency/dev/terraform.tfstate" # You define this path yourself. It's like a folder structure. 
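    # State locking is not configured for this backend; if desired, a DynamoDB lock table could be added.
    # The table name below is illustrative only (no such resource exists in this project):
    # dynamodb_table = "travel-agency-terraform-locks"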
5 | region = "af-south-1" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /architecture/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 5.0" 6 | } 7 | } 8 | } 9 | 10 | # Configure the AWS Provider 11 | provider "aws" { 12 | region = "af-south-1" 13 | } -------------------------------------------------------------------------------- /architecture/modules/ecr/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_ecr_repository" "travel_agency_ecr" { 2 | name = var.ecr_name 3 | image_tag_mutability = "MUTABLE" 4 | force_delete = true 5 | image_scanning_configuration { 6 | scan_on_push = true 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /architecture/modules/iam_roles/variables.tf: -------------------------------------------------------------------------------- 1 | # variable "redshift_role_arn" { 2 | # default = var.redshift_role_arn 3 | # } 4 | 5 | # variable "redshift_role_arn" { 6 | # description = "The ARN of the Redshift role" 7 | # type = string 8 | # } 9 | 10 | -------------------------------------------------------------------------------- /architecture/modules/ssm/variables.tf: -------------------------------------------------------------------------------- 1 | variable "password" { 2 | description = "This is the name of the SSM parameter" 3 | default = "redshift_password" 4 | } 5 | 6 | variable "username" { 7 | description = "This is the value the SSM parameter" 8 | default = "redshift_username" 9 | } 10 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/models/facts/fact_country_population.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | with base as ( 3 | select 4 | country_name, 5 | population, 6 | area 7 | from countries_data 8 | where population is not null 9 | ) 10 | 11 | select * from base -------------------------------------------------------------------------------- /app_requirements.txt: -------------------------------------------------------------------------------- 1 | # These are necessary packages for the travel_agency_appto run 2 | apache-airflow==2.10.5 # For using Airflow’s Variable module to manage configuration values. 3 | boto3==1.36.18 # For interacting with AWS services like S3. 4 | pandas==2.2.3 # For data manipulation and transformation 5 | requests==2.32.3 # For making HTTP requests to APIs. 
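# Note (assumption): pandas.DataFrame.to_parquet in travel_agency_app.py needs a Parquet engine,
# which none of the pins above guarantee on their own. If the container fails on to_parquet, add e.g.:
# pyarrow  # Parquet engine for pandas (version left unpinned here)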
--------------------------------------------------------------------------------
/architecture/modules/vpc/variables.tf:
--------------------------------------------------------------------------------
1 | variable "vpc_name" {
2 |   default = "travel_agency_vpc"
3 | }
4 | 
5 | # variable "resource_prefix" {
6 | #   type = string
7 | # }
8 | 
9 | variable "azs" {
10 |   default = ["af-south-1a", "af-south-1b", "af-south-1c"]
11 | }
12 | 
13 | # variable "create_route" {
14 | #   type    = bool
15 | #   default = true # Set to false in environments where the route already exists
16 | # }
17 | 
--------------------------------------------------------------------------------
/architecture/modules/redshift/main.tf:
--------------------------------------------------------------------------------
1 | resource "aws_redshift_cluster" "travel_agency_cluster" {
2 |   cluster_identifier        = var.cluster_identifier
3 |   database_name             = var.database_name
4 |   master_username           = var.username
5 |   master_password           = var.password
6 |   node_type                 = "dc2.large"
7 |   cluster_type              = "multi-node"
8 |   number_of_nodes           = 2
9 |   cluster_subnet_group_name = var.redshift_subnet_group
10 |   iam_roles                 = [var.redshift_role_arn]
11 |   skip_final_snapshot       = true
12 | }
13 | 
--------------------------------------------------------------------------------
/architecture/modules/s3/main.tf:
--------------------------------------------------------------------------------
1 | # Create an S3 Bucket
2 | resource "aws_s3_bucket" "chisom_travel_agency" {
3 |   bucket = var.bucket_name
4 | 
5 |   tags = {
6 |     Name        = "CDE Capstone Project bucket"
7 |     Environment = "Dev"
8 |     owner       = "Chisom"
9 |     team        = "Core Data Engineers"
10 |     managed_by  = "Team Leaders"
11 |   }
12 | }
13 | 
14 | 
15 | # Enable bucket versioning
16 | resource "aws_s3_bucket_versioning" "chisom_cde_project_versioning" {
17 |   bucket = aws_s3_bucket.chisom_travel_agency.id
18 |   versioning_configuration {
19 |     status = "Enabled"
20 |   }
21 | }
--------------------------------------------------------------------------------
/.github/workflow/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI - Code Quality Checks
2 | 
3 | on: [push, pull_request]
4 | 
5 | jobs:
6 |   lint-test:
7 |     runs-on: ubuntu-latest
8 |     steps:
9 |       - name: Checkout Code
10 |         uses: actions/checkout@v3
11 | 
12 |       - name: Set up Python
13 |         uses: actions/setup-python@v4
14 |         with:
15 |           python-version: "3.11.6"
16 | 
17 |       - name: Install dependencies
18 |         run: pip install -r requirements_ci.txt
19 | 
20 |       - name: Running isort
21 |         run: isort --check-only .
22 | 
23 |       - name: Running flake8
24 |         run: flake8 .
25 | 
26 | 
--------------------------------------------------------------------------------
/architecture/modules/iam_roles/main.tf:
--------------------------------------------------------------------------------
1 | resource "aws_iam_role" "redshift_role" {
2 |   name = "s3_redshift_iam_role"
3 | 
4 |   # Terraform's "jsonencode" function converts a
6 | assume_role_policy = jsonencode({ 7 | Version = "2012-10-17" 8 | Statement = [ 9 | { 10 | Action = "sts:AssumeRole" 11 | Effect = "Allow" 12 | Sid = "" 13 | Principal = { 14 | Service = "redshift.amazonaws.com" 15 | } 16 | }, 17 | ] 18 | }) 19 | 20 | tags = { 21 | name = "travel_agency_role" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /airflow/dags/includes/SQL/create_table.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Got this from the transformed_data_to_redshift script 3 | begin; 4 | CREATE TABLE IF NOT EXISTS countries_data ( 5 | country_name TEXT NOT NULL, 6 | independent BOOLEAN, 7 | unMember BOOLEAN, 8 | startOfWeek VARCHAR(225), 9 | official_country_name TEXT, 10 | common_native_names TEXT, 11 | currency_code VARCHAR(225), 12 | currency_name TEXT, 13 | currency_symbol VARCHAR(225), 14 | country_code VARCHAR(2000) UNIQUE NOT NULL, 15 | capital TEXT, 16 | region TEXT, 17 | subregion TEXT, 18 | languages VARCHAR (1000), 19 | area FLOAT, 20 | population BIGINT, 21 | continents TEXT 22 | ); 23 | end; 24 | -------------------------------------------------------------------------------- /architecture/modules/ssm/main.tf: -------------------------------------------------------------------------------- 1 | resource "random_password" "redshift_pass" { 2 | length = 10 3 | min_lower = 5 4 | min_numeric = 2 5 | min_special = 1 6 | min_upper = 2 7 | } 8 | 9 | resource "random_password" "redshift_username" { 10 | length = 10 11 | min_lower = 6 12 | min_numeric = 2 13 | upper = false 14 | } 15 | 16 | resource "aws_ssm_parameter" "ssm_password" { 17 | name = var.password 18 | type = "String" 19 | value = random_password.redshift_pass.result 20 | } 21 | 22 | resource "aws_ssm_parameter" "ssm_username" { 23 | name = var.username 24 | type = "String" 25 | value = "a${random_password.redshift_username.result}" 26 | } 27 | 28 | # Look for arguments that can change and parameterize those arguments -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/models/facts/source.yml: -------------------------------------------------------------------------------- 1 | ersion: 2 2 | 3 | models: 4 | - name: fact_country_population 5 | description: 6 | 'Fact table storing measurable data related to countries such as 7 | population and area.' 8 | columns: 9 | - name: country_id 10 | description: 'Foreign key linking to the dim_country table.' 11 | tests: 12 | - not_null 13 | - relationships: 14 | to: ref('dim_country') 15 | field: country_id 16 | - name: population 17 | description: 'Population of the country.' 
18 | tests: 19 | - not_null 20 | - name: area 21 | description: 'Total area of the country in square kilometers.' -------------------------------------------------------------------------------- /architecture/modules/redshift/variables.tf: -------------------------------------------------------------------------------- 1 | variable "redshift_subnet_group" { 2 | description = "Subnet group for the Redshift cluster" 3 | type = string 4 | } 5 | 6 | variable "redshift_role_arn" { 7 | description = "IAM role ARN for the Redshift cluster" 8 | type = string 9 | } 10 | 11 | variable "username" { 12 | description = "Master username for the Redshift cluster" 13 | type = string 14 | } 15 | 16 | variable "password" { 17 | description = "Master password for the Redshift cluster" 18 | type = string 19 | } 20 | 21 | variable "database_name" { 22 | description = "Database name for the Redshift cluster" 23 | type = string 24 | } 25 | 26 | variable "cluster_identifier" { 27 | description = "Identifier for the Redshift cluster" 28 | type = string 29 | } 30 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # This is the Dockerfile for the the image build during the CI/CD process 2 | 3 | # Use Python base Image 4 | FROM python:3.9-slim 5 | 6 | # Set working directory in the container 7 | WORKDIR /app 8 | 9 | # Copy the necessary files from tthe current directory into /app in the container 10 | COPY travel_agency_app.py /app 11 | COPY app_requirements.txt /app/ 12 | 13 | # Installl the needed packages specified in the requirements.txt file 14 | RUN pip install --no-cache-dir -r app_requirements.txt 15 | 16 | # Set the environment variable for AWS region (optional but good practice) 17 | ENV AWS_DEFAULT_REGION=af-south-1 18 | 19 | # Make port 80 available to the world outside this container )if nneeded) 20 | EXPOSE 8080 21 | 22 | # Run upload_to_s3.py when the container launches 23 | CMD ["python", "travel_agency_app.py"] -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/models/dimensions/dim_country.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with base as ( 4 | select 5 | country_name, -- Common name of the country 6 | official_country_name, -- Official name of the country 7 | capital, -- Capital city 8 | region, -- Region of the country 9 | subregion, -- Subregion of the country 10 | languages, -- to handle multiple language 11 | currency_code, -- Currency code 12 | currency_name, -- Currency name 13 | currency_symbol, -- Currency symbol 14 | continents -- When the record was last updated 15 | from countries_data -- Refers to your raw country data table 16 | ) 17 | 18 | select * from base -------------------------------------------------------------------------------- /airflow/dags/includes/extract_data.py: -------------------------------------------------------------------------------- 1 | # RETRIEVING THE DATA FROM THE API 2 | # Import packages and libraries 3 | import requests 4 | import pandas as pd 5 | import logging 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s') 9 | 10 | def get_data(): 11 | """ 12 | A function that gets the data from the API 13 | and then turns the extracted data into a pandas dataframe 14 | """ 15 | url = "https://restcountries.com/v3.1/all" 16 | 17 | 
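    # Optional hardening (not part of the original code): a timeout and an HTTP error check, e.g.
    #     response = requests.get(url, timeout=30)
    #     response.raise_for_status()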
response = requests.get(url) 18 | logging.info("Fetching data from the API...") 19 | 20 | if response.status_code == 200: 21 | #Parse JSON response 22 | data = response.json() 23 | 24 | # Convert JSON data to Pandas DataFrame 25 | profiles_data = pd.DataFrame(data) 26 | logging.info(f"Data successsfuly turned into a pandas Dataframe with\ 27 | {profiles_data.shape[0]}records and {profiles_data.shape[1]} columns ") 28 | 29 | return profiles_data 30 | 31 | print(get_data()) 32 | # print(get_data().shape) 33 | # print(get_data().columns) 34 | -------------------------------------------------------------------------------- /architecture/main.tf: -------------------------------------------------------------------------------- 1 | # module "vpc" { 2 | # source = "./modules/vpc" 3 | # } 4 | 5 | # module "ecr" { 6 | # source = "./modules/ecr" 7 | # } 8 | 9 | # module "s3" { 10 | # source = "./modules/s3" 11 | # bucket_name = "travel-agency-bucket" 12 | # } 13 | 14 | # module "roles" { 15 | # source = "./modules/iam_roles" 16 | # } 17 | 18 | # data "aws_ssm_parameter" "password" { 19 | # name = "redshift_password" 20 | # } 21 | 22 | # data "aws_ssm_parameter" "username" { 23 | # name = "redshift_username" 24 | # } 25 | 26 | # module "ssm" { 27 | # source = "./modules/ssm" 28 | # } 29 | 30 | # module "redshift" { 31 | # source = "./modules/redshift" 32 | # redshift_subnet_group = module.vpc.subnet_group_id 33 | # redshift_role_arn = module.roles.s3_redshift_role_arn # Pointing to the output of the iam_roles 34 | # username = data.aws_ssm_parameter.username.value 35 | # password = data.aws_ssm_parameter.password.value 36 | # database_name = "travel_agency" 37 | # cluster_identifier = "travel-agency-cluster" # It must contain only lowercase alphanumeric characters (a-z, 0-9) and hyphens (-). 38 | # } 39 | -------------------------------------------------------------------------------- /.github/workflow/cd.yml: -------------------------------------------------------------------------------- 1 | name: Build and Push Travel Agency Docker Image 2 | 3 | on: 4 | push: 5 | branches: 6 | - main # Tigger on push to the main branch 7 | pull_request: 8 | branches: 9 | - main # Trigger on pull request to the main branch 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | # Checkout code from the repository 17 | - name: Checkout Code 18 | uses: actions/checkout@v2 19 | 20 | # Set up Docker Buildx (for multi-platform builds if needed) 21 | - name: Set up Docker Buildx 22 | uses: docker/setup-buildx-action@v1 23 | 24 | # Log in to Amazon ECR 25 | - name: Log in to AWS ECR 26 | uses: aws-actions/amazon-ecr-login@v1 27 | 28 | # Build and push docker image 29 | - name: Build and Push Docker image to AWS ECR 30 | run: | 31 | docker build -t cde/travel_agency:latest . 32 | docker tag cde/travel_agency:latest ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/cde/travel_agency:latest 33 | docker push ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/cde/travel_agency:latest 34 | 35 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/models/dimensions/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: dim_country 5 | description: 6 | 'Dimension table storing country-related descriptive information.' 7 | columns: 8 | - name: country_id 9 | description: 'Unique identifier for each country.' 
10 | tests: 11 | - not_null 12 | - unique 13 | - name: country_name 14 | description: 'Common name of the country.' 15 | - name: official_name 16 | description: 'Official name of the country.' 17 | - name: capital 18 | description: 'Capital city of the country.' 19 | - name: region 20 | description: 'Geographical region of the country.' 21 | - name: subregion 22 | description: 'Subregion of the country.' 23 | - name: languages 24 | description: 25 | 'Official languages spoken in the country (stored as JSONB).' 26 | - name: currency_code 27 | description: "ISO code for the country's currency." 28 | - name: currency_name 29 | description: "Name of the country's currency." 30 | - name: currency_symbol 31 | description: "Symbol for the country's currency." 32 | - name: continents 33 | description: 'Continent(s) associated with the country.' 34 | 35 | tests: 36 | - unique: 37 | name: 'dim_country_country_id_unique' 38 | description: 'Ensure that the country_id is unique for each country.' -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'dbt_travel_agency' 6 | version: '1.0.0' 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'dbt_travel_agency' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `model-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | model-paths: ["models"] 15 | analysis-paths: ["analyses"] 16 | test-paths: ["tests"] 17 | seed-paths: ["seeds"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | 21 | clean-targets: # directories to be removed by `dbt clean` 22 | - "target" 23 | - "dbt_packages" 24 | 25 | 26 | # Configuring models 27 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 28 | 29 | # In this example config, we tell dbt to build all models in the example/ 30 | # directory as views. These settings can be overridden in the individual model 31 | # files using the `{{ config(...) }}` macro. 
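# Note: the `models:` block further below configures `staging` and `analytics` folders, but the models in
# this project live under models/facts and models/dimensions, so those folder-level materializations never
# match anything; the `{{ config(materialized=...) }}` calls inside the individual model files are what
# actually take effect.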
32 | # models: 33 | # dbt_travel_agency: 34 | # # Config indicated by + and applies to all files under models/example/ 35 | # example: 36 | # +materialized: view 37 | 38 | models: 39 | dbt_travel_agency: 40 | staging: 41 | # +schema: public 42 | +materialized: table 43 | analytics: 44 | # +schema: public 45 | +materialized: table 46 | 47 | -------------------------------------------------------------------------------- /architecture/modules/vpc/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_vpc" "travel_agency_vpc" { 2 | cidr_block = "172.16.0.0/16" 3 | 4 | tags = { 5 | Name = var.vpc_name 6 | } 7 | } 8 | 9 | resource "aws_subnet" "redshift_subnet_a" { 10 | vpc_id = aws_vpc.travel_agency_vpc.id 11 | cidr_block ="172.16.24.0/24" 12 | availability_zone = var.azs[0] 13 | tags = { 14 | Name = "zone_a" 15 | } 16 | } 17 | 18 | resource "aws_subnet" "redshift_subnet_b" { 19 | vpc_id = aws_vpc.travel_agency_vpc.id 20 | cidr_block = "172.16.25.0/24" 21 | availability_zone = var.azs[1] 22 | tags = { 23 | Name = "zone_b" 24 | } 25 | } 26 | 27 | resource "aws_redshift_subnet_group" "redshift_subnet_group" { 28 | name = "foo" 29 | subnet_ids = [aws_subnet.redshift_subnet_a.id, aws_subnet.redshift_subnet_b.id] 30 | 31 | tags = { 32 | environment = "redshift subnet group" 33 | } 34 | } 35 | 36 | resource "aws_internet_gateway" "igw" { 37 | vpc_id = aws_vpc.travel_agency_vpc.id 38 | 39 | tags = { 40 | Name = "chisom_igw" 41 | } 42 | } 43 | 44 | resource "aws_route_table" "default" { 45 | vpc_id = aws_vpc.travel_agency_vpc.id 46 | 47 | route { 48 | cidr_block = "0.0.0.0/0" 49 | gateway_id = aws_internet_gateway.igw.id 50 | } 51 | } 52 | 53 | resource "aws_route_table_association" "a" { 54 | subnet_id = aws_subnet.redshift_subnet_a.id 55 | route_table_id = aws_route_table.default.id 56 | } 57 | resource "aws_route_table_association" "b" { 58 | subnet_id = aws_subnet.redshift_subnet_b.id 59 | route_table_id = aws_route_table.default.id 60 | } 61 | 62 | resource "aws_default_security_group" "default" { 63 | vpc_id = aws_vpc.travel_agency_vpc.id 64 | 65 | ingress { 66 | description = "Allow inbound connections from Redshift" 67 | protocol = "tcp" 68 | from_port = 5439 69 | to_port = 5439 70 | cidr_blocks = ["0.0.0.0/0"] 71 | } 72 | 73 | tags = { 74 | "Name" = "redshift_security_group" 75 | } 76 | } 77 | 78 | -------------------------------------------------------------------------------- /airflow/dags/dag_definition/travel_agency_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from datetime import datetime, timedelta 3 | 4 | # Custom module imports 5 | from includes.s3_utils import save_parquet_to_s3, upload_to_s3 6 | 7 | # Airflow provider imports 8 | from airflow.operators.python import PythonOperator 9 | from airflow.providers.amazon.aws.operators.redshift_data import RedshiftDataOperator 10 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator 11 | 12 | # Define DAG folder path for mounted Docker volume 13 | dag_folder = "/opt/airflow/dags" 14 | sql_file = "create_table.sql" # Use just the filename 15 | 16 | default_args = { 17 | 'owner': "chisom", 18 | 'start_date': datetime(2025, 2, 26), 19 | 'retries': 2, 20 | 'retry_delay': timedelta(seconds=5), 21 | 'execution_timeout': timedelta(minutes=10), 22 | } 23 | 24 | with DAG( 25 | dag_id="travel_agency_dag", 26 | default_args=default_args, 27 | description="A simple DAG to extract data from 
an API, load it to S3, transform it, and load it into Redshift", 28 | default_view="graph", 29 | tags=["travel_agency", "cde"], 30 | schedule_interval="@daily", 31 | catchup=False, 32 | template_searchpath=f"{dag_folder}/includes/sql", 33 | ) as dag: 34 | 35 | # Task 1: Load data to S3 36 | load_data_to_s3 = PythonOperator( 37 | task_id="load_data_to_S3", 38 | python_callable=upload_to_s3 39 | ) 40 | 41 | # Task 2: Transform data 42 | transform_data = PythonOperator( 43 | task_id = "transform_data", 44 | python_callable = save_parquet_to_s3 45 | ) 46 | 47 | # Task 3: Create table in Redshift 48 | create_table = RedshiftDataOperator( 49 | task_id = "create_table", 50 | cluster_identifier="travel-agency-cluster", 51 | database="travel_agency", 52 | sql=sql_file, 53 | aws_conn_id="aws_default", 54 | wait_for_completion=True, 55 | region="af-south-1", 56 | params={ 57 | "schema": "public", 58 | "table": "countries_data", 59 | }, 60 | ) 61 | 62 | # Task 4: Load transformed data into Redshift 63 | load_data_to_redshift = S3ToRedshiftOperator( 64 | task_id='load_data_to_redshift', 65 | schema='public', 66 | table='countries_data', 67 | s3_bucket='travel-agency-bucket', 68 | s3_key='processed_data/processed_data.parquet', 69 | redshift_conn_id='redshift_default', 70 | aws_conn_id='aws_default', 71 | copy_options=[ 72 | "FORMAT AS PARQUET" 73 | ], 74 | method='REPLACE', 75 | ) 76 | 77 | # Define task dependencies 78 | load_data_to_s3 >> transform_data >> create_table >> load_data_to_redshift 79 | -------------------------------------------------------------------------------- /travel_agency_app.py: -------------------------------------------------------------------------------- 1 | # THIS CONTAINS: 2 | 3 | # A: Code that extracts the data ffrom the API 4 | # B: Code that loads the raw data to s3 5 | 6 | # THis is the part of the fulfillment of the C/CD implementation where this app will be built as an image using a dockerfile 7 | 8 | # Retrieving data from the API 9 | # Import libraries and packages 10 | import requests 11 | import pandas as pd 12 | import logging 13 | 14 | # Configure logging 15 | logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(message)') 16 | 17 | def get_data(): 18 | """ 19 | A function that extracts the data from the API 20 | and then turns it into a pandas DataFrame 21 | """ 22 | 23 | url = "https://restcountries.com/v3.1/all" 24 | 25 | response = requests.get(url) 26 | logging.info("Fetching data from the API...") 27 | 28 | if response.status_code == 200: 29 | # Parse JSON response 30 | data = response.json() 31 | 32 | # Convert JSON data to Pandas DataFrame 33 | profiles_data = pd.DataFrame(data) 34 | logging.info(f"Successfuly turned into a Pandas DataFrame\ 35 | {profiles_data.shape[0]} records and {profiles_data.shape[1]} columns") 36 | return profiles_data 37 | 38 | print(get_data()) 39 | 40 | 41 | ############################################################################### 42 | # Creating a connection to connect to AWS 43 | # Import necessary libraries and packages 44 | import boto3 45 | from airflow.models import Variable 46 | 47 | def create_session(): 48 | """Initialize and return a Boto3 session using Airflow variables.""" 49 | aws_access_key_id = Variable.get("aws_access_key_id") 50 | aws_secret_access_key = Variable.get("aws_secret_access_key") 51 | region_name = "af-south-1" 52 | 53 | session = boto3.Session( 54 | aws_access_key_id=aws_access_key_id, 55 | aws_secret_access_key=aws_secret_access_key, 56 | region_name=region_name 57 | ) 58 | 59 | 
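    # Note: reading keys from Airflow Variables means this standalone container also needs a working
    # Airflow installation and metadata connection; plain environment variables or an attached IAM role
    # would be a lighter alternative (suggestion only, not part of the original design).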
return session
60 | 
61 | print(create_session())
62 | 
63 | 
64 | ###############################################################################
65 | # Loading the raw data into an s3 bucket
66 | from io import BytesIO
67 | 
68 | def upload_to_s3():
69 |     """
70 |     Uploads the extracted data to an S3 bucket with a fixed file name.
71 |     """
72 |     data = get_data()
73 | 
74 |     if data.empty:
75 |         print('DataFrame is empty. No data to upload.')
76 |         return
77 | 
78 |     bucket_name = 'travel-agency-bucket'
79 |     file_key = 'raw_data/data.parquet'
80 | 
81 |     # Convert DataFrame to bytes
82 |     buffer = BytesIO()
83 |     data.to_parquet(buffer, index=False)
84 |     buffer.seek(0)
85 | 
86 |     # Upload file to S3 (put_object requires keyword arguments)
87 |     s3_client = create_session().client('s3')
88 |     s3_client.put_object(Bucket=bucket_name, Key=file_key, Body=buffer.getvalue())
89 | 
90 |     print(f"Data successfully uploaded to s3://{bucket_name}/{file_key}")
91 | 
92 | upload_to_s3()
93 | 
94 | 
--------------------------------------------------------------------------------
/project_README.md:
--------------------------------------------------------------------------------
1 | # CDE-CAPSTONE
2 | A travel agency reached out to CDE. Their business model involves recommending tourist locations to their customers based on different data points, and they want one of our graduates to build a Data Platform that will process the data from the Country REST API [HERE](https://restcountries.com/v3.1/all) into their cloud-based Database/Data Warehouse for predictive analytics by their Data Science team.
3 | 
4 | ### SOME CONSIDERATIONS TO BE AWARE OF
5 | - The company needs some specific fields/attributes from the API data to enable downstream usage.
6 | - However, you have to extract the entire raw data from the API into any `Cloud Based Object Storage`, which will serve as the Raw layer.
7 | - The file format used when storing the data in Object Storage must be `Parquet`, since it's better for performance.
8 | - We want to store the entire API data because, if the agency requires more fields/attributes in the future, we don't have to pull data from the API again; we simply pull from the copy in the data lake.
9 | - From the Data Lake, please extract the attributes below that are required for predictive analytics by the Travel Agency and write them to a Cloud Database or Data Warehouse.
10 | 
11 | - Country name
12 | 
13 | - Independence
14 | 
15 | - United Nations membership
16 | 
17 | - startOfWeek
18 | 
19 | - Official country name
20 | 
21 | - Common native name
22 | 
23 | - Currency code, e.g. USD, EUR
24 | 
25 | - Currency name
26 | 
27 | - Currency symbol
28 | 
29 | - Country code (idd), e.g. Germany's country code is +49
30 | 
31 | - You need to concatenate the idd root and idd suffix from the response
32 | 
33 | - Capital
34 | 
35 | - Region
36 | 
37 | - Sub region
38 | 
39 | - Languages
40 | 
41 | - Area
42 | 
43 | - Population
44 | 
45 | - Continents
46 | 
47 | - Apache Airflow `MUST` be used for orchestrating the entire workflow, which includes:
48 |     - Extracting the data from the API
49 |     - Writing the extracted data to the Data Lake
50 |     - Extracting the final required attributes to the Database/Data Warehouse.
51 | 
52 | - CI/CD should be integrated into the GitHub repository:
53 |     - CI that carries out code linting checks to ensure the code written follows best practices.
54 |     - CD that carries out the build and push of the code that extracts and writes to object storage to a cloud-based Container Registry.
55 | - Basically, you need to package the code that does the Extract of the raw data from API and the Code that write the data to Object storage into a Docker image and push the image to a Cloud Based Container Registry. 56 | 57 | - All Cloud Infrastructures like IAM, Object storage, DB/DW resource provisioning has to be Terraformed with the Terraform State File backend managed in the cloud using an Object Storage. 58 | 59 | - Lastly, please you `MUST` leverage DBT to model the data into Fact and Dimension tables. 60 | 61 | ### BONUS (NOT MANDATORY) 62 | Derive any insights from the Data Set. 63 | 64 | 65 | ## SUBMISSION REQUIREMENTS 66 | - A power point or something similar for presentation of the entire project covering the below 67 | - The Data Architecture. 68 | - The choice of tools 🛠️ 69 | - Well documented read me of the project in Github. 70 | - Project submission will close on the 20th November, 2024. 71 | - Github link should be submitted in the link [HERE](https://forms.gle/osnNmo7JyGkQeXnb8) 72 | -------------------------------------------------------------------------------- /airflow/dags/includes/transform_data.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(name)s:%(message)s') 8 | logging.getLogger().setLevel(20) 9 | 10 | 11 | def transform_data(df): 12 | """ 13 | Transforms the given Pandas DataFrame by performing specific data 14 | processing operations. 15 | 16 | Parameters 17 | ---------- 18 | df : pd.DataFrame 19 | A Pandas DataFrame in Parquet format that contains the data to be 20 | transformed. 21 | 22 | Returns 23 | ------- 24 | pd.DataFrame 25 | A new Pandas DataFrame with the transformed data. 
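
    Examples
    --------
    Minimal usage sketch (the input path is illustrative)::

        raw_df = pd.read_parquet("raw_data/data.parquet")
        curated_df = transform_data(raw_df)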
26 | """ 27 | logging.info("Starting data transformation.") 28 | 29 | logging.debug("Extracting currency details.") 30 | currency_details = df['currencies'].apply(extract_currency_details) 31 | df = pd.concat([df, currency_details], axis=1) 32 | 33 | logging.debug("Extracting country names and native names.") 34 | df['country_name'] = df['name'].apply(lambda x: x.get('common')) 35 | df['official_country_name'] = df['name'].apply(lambda x: x.get('official')) 36 | df['common_native_names'] = df['name'].apply(lambda x: 37 | extract_all_common_native_name 38 | (x.get('nativeName'))) 39 | 40 | logging.debug("Extracting languages and country codes.") 41 | df['languages'] = df['languages'].apply(lambda x: extract_languages(x)) 42 | df['country_code'] = df['idd'].apply(lambda x: generate_country_codes(x)) 43 | 44 | logging.debug("Simplifying continent and capital columns.") 45 | df['continents'] = df['continents'].str[0] 46 | df['capital'] = df['capital'].str[0] 47 | 48 | logging.debug("Dropping columns: 'name', 'idd', 'currencies'.") 49 | df = df.drop(columns=['name', 'idd', 'currencies']) 50 | 51 | desired_order = [ 52 | 'country_name', 'independent', 'unMember', 'startOfWeek', 53 | 'official_country_name', 'common_native_names', 54 | 'currency_code', 'currency_name', 'currency_symbol', 55 | 'country_code', 'capital', 'region', 'subregion', 56 | 'languages', 'area', 'population', 'continents' 57 | ] 58 | logging.debug("Reordering columns.") 59 | df = df[desired_order] 60 | 61 | logging.info("Data transformation completed.") 62 | return df 63 | 64 | 65 | def extract_currency_details(row): 66 | if isinstance(row, dict) and len(row) > 0: 67 | valid_entry = {key: value for key, value in row.items() 68 | if value is not None} 69 | if valid_entry: 70 | code = list(valid_entry.keys())[0] 71 | details = valid_entry[code] 72 | logging.debug("Currency details found: code=%s, \ 73 | details=%s", code, details) 74 | return pd.Series({ 75 | 'currency_code': code, 76 | 'currency_name': details.get('name', None), 77 | 'currency_symbol': details.get('symbol', None) 78 | }) 79 | logging.debug("No valid currency details found.") 80 | return pd.Series({'currency_code': None, 'currency_name': None, 81 | 'currency_symbol': None}) 82 | 83 | 84 | def extract_languages(language): 85 | if isinstance(language, dict): 86 | result = ", ".join(str(x) for x in language.values() if x is not None) 87 | logging.debug("Extracted languages: %s", result) 88 | return result 89 | logging.debug("No valid language data found.") 90 | return None 91 | 92 | 93 | def extract_all_common_native_name(native_name): 94 | if isinstance(native_name, dict): 95 | result = ", ".join(entry.get('common', '') for entry in 96 | native_name.values() if isinstance(entry, dict) and 97 | 'common' in entry) 98 | logging.debug("Extracted native names: %s", result) 99 | return result 100 | logging.debug("No valid native names found.") 101 | return None 102 | 103 | 104 | def generate_country_codes(idd): 105 | if isinstance(idd, dict): 106 | root = idd.get('root', '') 107 | suffixes = idd.get('suffixes', []) 108 | if isinstance(suffixes, (list, np.ndarray)): 109 | result = " ".join([root + suffix for suffix in suffixes]) 110 | logging.debug("Generated country codes: %s", result) 111 | return result 112 | logging.debug("No valid country code data found.") 113 | # return None 114 | return "Unknown" 115 | -------------------------------------------------------------------------------- /airflow/dags/includes/s3_utils.py: 
-------------------------------------------------------------------------------- 1 | #################################################################################### 2 | # # Using this version if not working wih airflow 3 | # import boto3 4 | # import os 5 | # from boto3.session import Session # Import Session explicitly 6 | 7 | # from dotenv import load_dotenv 8 | # # Load environment variables from .env file 9 | # load_dotenv() 10 | 11 | # def create_session(): 12 | # session = Session( 13 | # aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), 14 | # aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), 15 | # region_name="af-south-1" 16 | # ) 17 | 18 | # print("Ran successfully") 19 | 20 | # return session 21 | 22 | # print(create_session()) 23 | 24 | 25 | #################################################################################### 26 | # CREATING A SESSION TO CONNECT TO AWS 27 | # Import necessary libraries 28 | import boto3 29 | from airflow.models import Variable 30 | 31 | def create_session(): 32 | """Initialize and return a Boto3 session using Airflow variables.""" 33 | aws_access_key_id = Variable.get("aws_access_key_id") 34 | aws_secret_access_key = Variable.get("aws_secret_access_key") 35 | region_name = "af-south-1" 36 | 37 | session = boto3.Session( 38 | aws_access_key_id=aws_access_key_id, 39 | aws_secret_access_key=aws_secret_access_key, 40 | region_name=region_name 41 | ) 42 | 43 | return session 44 | 45 | # print(create_session()) 46 | 47 | 48 | #################################################################################### 49 | # UPLOADS RAW DATA TO S3 BUCKET 50 | import awswrangler as wr 51 | from io import BytesIO 52 | 53 | from includes.extract_data import get_data 54 | 55 | def upload_to_s3(): 56 | """ 57 | Uploads a pandas DataFrame to an S3 bucket with a fixed file name. 58 | """ 59 | data = get_data() 60 | 61 | if data.empty: 62 | print("The DataFrame is empty. 
No data to upload") 63 | return 64 | 65 | bucket_name = "travel-agency-bucket" 66 | file_key = "raw_data/data.parquet" # Specify the exact file name 67 | 68 | # Convert DataFrame to bytes 69 | buffer = BytesIO() 70 | data.to_parquet(buffer, index=False) 71 | buffer.seek(0) 72 | 73 | # Upload file to S3 74 | s3_client = create_session().client ('s3') 75 | s3_client.put_object(Bucket=bucket_name, Key=file_key, Body=buffer.getvalue()) 76 | 77 | print(f"Data successfully uploaded to s3://{bucket_name}/{file_key}") 78 | 79 | upload_to_s3() 80 | 81 | 82 | ############################################################################### 83 | # RETRIEVS RAW DATA, TRANSFROMS IT AND LOADS IT BACK TO S3 BUCKET 84 | import pandas as pd 85 | from io import BytesIO 86 | import logging 87 | 88 | from includes.transform_data import transform_data 89 | 90 | # Configure logging 91 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s") 92 | 93 | # Define S3 bucket and file key 94 | bucket_name = "travel-agency-bucket" 95 | file_key = "raw_data/data.parquet" 96 | 97 | def retrieve_and_process_data(): 98 | """Retrieve a Parquet file from S3 and process it into a Pandas DataFrame.""" 99 | try: 100 | s3_client = create_session().client("s3") 101 | response = s3_client.get_object(Bucket=bucket_name, Key=file_key) 102 | df = pd.read_parquet(BytesIO(response["Body"].read())) 103 | logging.info(f"Successfully retrieved and read the Parquet file from s3://{bucket_name}/{file_key}") 104 | return df 105 | except Exception as e: 106 | logging.error(f"Error retrieving or processing the Parquet file: {e}") 107 | # raise 108 | return pd.DataFrame() 109 | 110 | def save_parquet_to_s3(): 111 | """Reads raw data from S3, transforms it, and saves the transformed data back to S3.""" 112 | try: 113 | # Retrieve raw data from S3 114 | raw_df = retrieve_and_process_data() 115 | 116 | # Apply transformation using extract_country_info 117 | # transformed_df = raw_df.apply(extract_and_rename_columns, axis=1, result_type="expand") 118 | transformed_df = transform_data(raw_df) 119 | 120 | # Print transformed data to the terminal 121 | # print("\nTransformed Data Preview:") 122 | print(transformed_df.dtypes) 123 | max_length = transformed_df['country_code'].astype(str).apply(len).max() 124 | print(f"Max length in Parquet file: {max_length}") 125 | 126 | # Define S3 target path for transformed data 127 | processed_file_key = "processed_data/processed_data.parquet" 128 | s3_client = create_session().client("s3") 129 | 130 | # Convert DataFrame to bytes 131 | buffer = BytesIO() 132 | transformed_df.to_parquet(buffer, index=False, engine="pyarrow") # Edited here 133 | buffer.seek(0) 134 | 135 | # Upload file to S3 136 | s3_client.put_object(Bucket=bucket_name, Key=processed_file_key, Body=buffer.getvalue()) 137 | 138 | logging.info(f"Processed data successfully saved to s3://{bucket_name}/{processed_file_key}") 139 | 140 | except Exception as e: 141 | logging.error(f"Error saving processed data to S3: {e}") 142 | raise 143 | 144 | # Call the function to save processed data to S3 145 | save_parquet_to_s3() 146 | 147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Travel Agency Data Platform 2 | 3 | ## Introduction 4 | In response to the **CDE cohort-1** capstone project requirement, I created this project which integrates all the key concepts and tools learned during the Bootcamp program. 
5 | 6 | The project demonstrates the development of a robust Data Platform capable of ingesting, transforming, and storing data for predictive analytics. The tools and services utilized in this project includes `Docker`, `Airflow`, `Terraform`, `AWS Services` (like Amazon s3. Redshift, ECR), and `dbt`. 7 | 8 | ## Overview 9 | 10 | The goal of this project is to build a scalable and efficient data pipeline for a travel agency, enabling their Data Science team to analyze curated data for predictive analytics. The pipeline is designed to: 11 | 12 | * Extract raw data from the Country [REST API](https://restcountries.com/v3.1/all). 13 | 14 | * Load the data into an AWS S3 Data Lake in Parquet format. 15 | 16 | * Transform the data into a curated dataset containing specific fields relevant for analysis. 17 | 18 | * Load the transformed data into Amazon Redshift table. 19 | 20 | * Use dbt to model the data into Fact and Dimension tables for efficient querying. 21 | 22 | * incorporate CI/CD pipelines to automate code quality checks, build processes, and deployments, ensuring best practices and streamlined workflows. 23 | 24 | 25 | ## Methodology 26 | 27 | Having carefully assessed the requirements, **Docker** was used to host **Airflow*, which served as the orchestration tool for this project. 28 | The dataset was extracted from the Country REST API and stored in Parquet format in **Amazon S3** to ensure future extensibility. 29 | Relevant columns were then selected from the raw data and loaded into a **Redshift** table, 30 | which functioned as the Data Warehouse. **dbt** was utilized to model the transformed data into `Fact` and `Dimension tables`, enabling efficient querying. 31 | Additionally, **Terraform** was employed as an Infrastructure as Code (IaC) tool to provision all necessary AWS resources. 32 | 33 | ## Project Architecture 34 | ### Overview 35 | The data pipeline is designed to seamlessly integrate data ingestion, transformation, and storage. Below is a high-level description of the architecture: 36 | 37 | * **Data Ingestion**: Data is stored in Amazon S3 bucket. 38 | 39 | * **Infrastructure Setup**: Terraform provisions the AWS VPC, IAM roles, Redshift clusters, and other necessary resources. 40 | 41 | * **ETL Orchestration**: Apache Airflow orchestrates the ETL process, including loading data into Redshift and triggering dbt models. 42 | 43 | * **Data Transformation**: dbt structures raw data into analytics-ready formats. 44 | 45 | * **Output**: Transformed data is stored in Redshift and can be accessed by BI tools or queried directly. 46 | 47 | ## Architectural Diagram 48 | 49 | ![image](https://github.com/Chisomnwa/CDE_Capstone_Project/blob/main/Travel%20Agency%20Platform%20Architecture.png) 50 | 51 | 52 | ## Tools and Services Used and their Functions 53 | 54 | * **Terraform**: To avoid manually creating the resources which was used for this project, Terraform was used for Infrastructure as Code (IaC) to provision and manage cloud resources like AWS S3, Redshift, IAM roles, and VPC. This ensures scalability, consistency, and reproducibility in infrastructure deployment. 55 | 56 | * **Docker**: Docker was used to containerize Airflow by building from an Apache Airflow Image found [here](https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html). A `Dockerfile` and a `requirement.txt` file were used to enhance the base image provided by Airflow to include `awswrangler` package. 
57 | 58 | * **Airflow**: Airflow which is an open-source technology was used as an orchestration tool to automate and manage the ETL pipeline. It had dag dependencies on the worker nodes set up to trigger at a scheduled interval which can be accesses on the web server GUI. 59 | 60 | * **AWS Service**: Because of its accessibility and compatibility with terraform, and ease of use, AWS cloud was chosen as the preferred cloud provider. An IAM with the necessary policies (using the principle of least-privilege) was provisoned and was used by Airflow for accessing the Data Lake (s3 bucket) and Data Warehouse (Redshift), using the access keys and secret access key parameters saved in the SSM parameter store. 61 | 62 | Under AWS Services also, **Amazon ECR** was used to store the Docker image, which contains the codes for extracting the data from the API and uploading the raw data to the s3 bucket Data Lake. 63 | 64 | * **dbt**: which stands for Data Build Tool was utilized for transforming the raw data and modeling it into Fact and Dimension tables in Redshift. It also helps to ensure data quality, modularity, and version control, making it easier to mantain analytics-ready datasets. 65 | 66 | # Setup Instructions 67 | ## Prerequisites: 68 | * AWS Account with sufficient IAM permissions. 69 | 70 | * Installed tools: Terraform, Apache Airflow, dbt, AWS CLI, Python (with virtualenv). 71 | ---- 72 | 73 | 1. Clone the Repository: 74 | 75 | `git clone ` 76 | 77 | `cd ` 78 | 79 | 2. Configure AWS Credentials: 80 | 81 | Ensure your AWS CLI is configured with the required credentials: 82 | 83 | `aws configure` 84 | 85 | 3. Infrastructure Provisioning: 86 | 87 | Navigate to the **Architecture directory** and run the following commands: 88 | 89 | `terraform init` 90 | `terraform fmt` 91 | `terraform validate` 92 | `terraform apply` 93 | 94 | 4. Start Airflow: 95 | 96 | Set up Airflow and deploy the DAGs: 97 | 98 | * cd into the **airflow directory**. 99 | 100 | * Download docker-compose.yaml file using 101 | 102 | `curl -LfO 'https://airflow.apache.org/docs/apache-airflow/2.10.3/docker-compose.yaml'` 103 | 104 | * Create your Dockerfile, .env file, and your requirements.txt file. 105 | 106 | * set up your DAG. 107 | 108 | * In the terminal run `docker-compose up -d` to start up your containers. 109 | 110 | * Go to your localhost:8080 to view and run your dag. 111 | 112 | PS: Place the DAG files in the dags folder and configure Airflow connections for Redshift and S3. 113 | 114 | 5. Run dbt Models: 115 | 116 | Navigate to your dbt project folder and execute: 117 | 118 | `dbt run` 119 | 120 | # Challenges and Solutions 121 | 122 | * **Challenge 1**: Terraform Variable Errors 123 | 124 | **Solution**: Refactored variable definitions and ensured proper passing of values between modules. 125 | 126 | * **Challenge 2**: Airflow Redshift Connection 127 | 128 | **Solution**: Configured Redshift connection settings in Airflow with accurate credentials and endpoint details. 129 | 130 | * **Challenge 3**: dbt Execution Errors 131 | 132 | **Solution**: Debugged errors using dbt logs and ensured proper schema configurations in dbt_project.yml. 133 | 134 | # Future Improvements 135 | **Monitoring**: Implement AWS CloudWatch for pipeline monitoring and alerting. 
136 | 137 | **Scaling**: Extend the pipeline to integrate AWS Glue for more complex ETL processes 138 | 139 | 140 | # Power Point presentation 141 | To have a view of the project's data architecture and understand the **Why** for my choice of tools 🛠️, 142 | you can access the power point slides [here](https://github.com/Chisomnwa/CDE_Capstone_Project/blob/main/Travel_Agency_Project_Slides.pdf). 143 | 144 | # Medium Article 145 | Here's a [medium article](https://medium.com/towards-data-engineering/how-i-built-a-travel-agency-data-platform-27e81a5dd668) that I have written on this project. It gives the full view and the detailed steps on how to complete 146 | this project. You will enjoy it. 147 | 148 | -------------------------------------------------------------------------------- /airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.10.5 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. 31 | # Default: . 32 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 33 | # 34 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 35 | # Default: airflow 36 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 37 | # Default: airflow 38 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 39 | # Use this option ONLY for quick checks. Installing requirements at container 40 | # startup is done EVERY TIME the service is started. 41 | # A better way is to build a custom image or extend the official image 42 | # as described in https://airflow.apache.org/docs/docker-stack/build.html. 43 | # Default: '' 44 | # 45 | # Feel free to modify this file to suit your needs. 46 | --- 47 | x-airflow-common: 48 | &airflow-common 49 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 
50 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 51 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 52 | image: ${AIRFLOW_IMAGE_NAME:-airflow_awswrangler} 53 | # build: . 54 | environment: 55 | &airflow-common-env 56 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 57 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 58 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 59 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 60 | AIRFLOW__CORE__FERNET_KEY: '' 61 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 62 | AIRFLOW__CORE__LOAD_EXAMPLES: 'true' 63 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 64 | # yamllint disable rule:line-length 65 | # Use simple http server on scheduler for health checks 66 | # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server 67 | # yamllint enable rule:line-length 68 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' 69 | # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks 70 | # for other purpose (development, test and especially production usage) build/extend Airflow image. 71 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 72 | # The following line can be used to set a custom config file, stored in the local config folder 73 | # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file 74 | # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg' 75 | volumes: 76 | - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags 77 | - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs 78 | - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config 79 | - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins 80 | user: "${AIRFLOW_UID:-50000}:0" 81 | depends_on: 82 | &airflow-common-depends-on 83 | redis: 84 | condition: service_healthy 85 | postgres: 86 | condition: service_healthy 87 | 88 | services: 89 | postgres: 90 | image: postgres:13 91 | environment: 92 | POSTGRES_USER: airflow 93 | POSTGRES_PASSWORD: airflow 94 | POSTGRES_DB: airflow 95 | volumes: 96 | - postgres-db-volume:/var/lib/postgresql/data 97 | healthcheck: 98 | test: ["CMD", "pg_isready", "-U", "airflow"] 99 | interval: 10s 100 | retries: 5 101 | start_period: 5s 102 | restart: always 103 | 104 | redis: 105 | # Redis is limited to 7.2-bookworm due to licencing change 106 | # https://redis.io/blog/redis-adopts-dual-source-available-licensing/ 107 | image: redis:7.2-bookworm 108 | expose: 109 | - 6379 110 | healthcheck: 111 | test: ["CMD", "redis-cli", "ping"] 112 | interval: 10s 113 | timeout: 30s 114 | retries: 50 115 | start_period: 30s 116 | restart: always 117 | 118 | airflow-webserver: 119 | <<: *airflow-common 120 | command: webserver 121 | ports: 122 | - "8080:8080" 123 | healthcheck: 124 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 125 | interval: 30s 126 | timeout: 10s 127 | retries: 5 128 | start_period: 30s 129 | restart: always 130 | depends_on: 131 | <<: *airflow-common-depends-on 132 | airflow-init: 133 | condition: service_completed_successfully 134 | 135 | airflow-scheduler: 136 | <<: *airflow-common 137 | command: scheduler 138 | healthcheck: 139 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] 140 | interval: 30s 141 | timeout: 10s 142 | retries: 5 143 | 
start_period: 30s 144 | restart: always 145 | depends_on: 146 | <<: *airflow-common-depends-on 147 | airflow-init: 148 | condition: service_completed_successfully 149 | 150 | airflow-worker: 151 | <<: *airflow-common 152 | command: celery worker 153 | healthcheck: 154 | # yamllint disable rule:line-length 155 | test: 156 | - "CMD-SHELL" 157 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 158 | interval: 30s 159 | timeout: 10s 160 | retries: 5 161 | start_period: 30s 162 | environment: 163 | <<: *airflow-common-env 164 | # Required to handle warm shutdown of the celery workers properly 165 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 166 | DUMB_INIT_SETSID: "0" 167 | restart: always 168 | depends_on: 169 | <<: *airflow-common-depends-on 170 | airflow-init: 171 | condition: service_completed_successfully 172 | 173 | airflow-triggerer: 174 | <<: *airflow-common 175 | command: triggerer 176 | healthcheck: 177 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 178 | interval: 30s 179 | timeout: 10s 180 | retries: 5 181 | start_period: 30s 182 | restart: always 183 | depends_on: 184 | <<: *airflow-common-depends-on 185 | airflow-init: 186 | condition: service_completed_successfully 187 | 188 | airflow-init: 189 | <<: *airflow-common 190 | entrypoint: /bin/bash 191 | # yamllint disable rule:line-length 192 | command: 193 | - -c 194 | - | 195 | if [[ -z "${AIRFLOW_UID}" ]]; then 196 | echo 197 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 198 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 199 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 200 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 201 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 202 | echo 203 | fi 204 | one_meg=1048576 205 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 206 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 207 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 208 | warning_resources="false" 209 | if (( mem_available < 4000 )) ; then 210 | echo 211 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 212 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 213 | echo 214 | warning_resources="true" 215 | fi 216 | if (( cpus_available < 2 )); then 217 | echo 218 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 219 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 220 | echo 221 | warning_resources="true" 222 | fi 223 | if (( disk_available < one_meg * 10 )); then 224 | echo 225 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 226 | echo "At least 10 GBs recommended. 
You have $$(numfmt --to iec $$((disk_available * 1024 )))" 227 | echo 228 | warning_resources="true" 229 | fi 230 | if [[ $${warning_resources} == "true" ]]; then 231 | echo 232 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 233 | echo "Please follow the instructions to increase amount of resources available:" 234 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 235 | echo 236 | fi 237 | mkdir -p /sources/logs /sources/dags /sources/plugins 238 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 239 | exec /entrypoint airflow version 240 | # yamllint enable rule:line-length 241 | environment: 242 | <<: *airflow-common-env 243 | _AIRFLOW_DB_MIGRATE: 'true' 244 | _AIRFLOW_WWW_USER_CREATE: 'true' 245 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 246 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 247 | _PIP_ADDITIONAL_REQUIREMENTS: '' 248 | user: "0:0" 249 | volumes: 250 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 251 | 252 | airflow-cli: 253 | <<: *airflow-common 254 | profiles: 255 | - debug 256 | environment: 257 | <<: *airflow-common-env 258 | CONNECTION_CHECK_MAX_COUNT: "0" 259 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 260 | command: 261 | - bash 262 | - -c 263 | - airflow 264 | 265 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up 266 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 267 | # See: https://docs.docker.com/compose/profiles/ 268 | flower: 269 | <<: *airflow-common 270 | command: celery flower 271 | profiles: 272 | - flower 273 | ports: 274 | - "5555:5555" 275 | healthcheck: 276 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 277 | interval: 30s 278 | timeout: 10s 279 | retries: 5 280 | start_period: 30s 281 | restart: always 282 | depends_on: 283 | <<: *airflow-common-depends-on 284 | airflow-init: 285 | condition: service_completed_successfully 286 | 287 | volumes: 288 | postgres-db-volume: 289 | --------------------------------------------------------------------------------