├── dbt ├── dbt_travel_agency │ ├── seeds │ │ └── .gitkeep │ ├── tests │ │ └── .gitkeep │ ├── analyses │ │ └── .gitkeep │ ├── snapshots │ │ └── .gitkeep │ ├── .gitignore │ ├── models │ │ ├── facts │ │ │ ├── fact_country_population.sql │ │ │ └── source.yml │ │ └── dimensions │ │ │ ├── dim_country.sql │ │ │ └── schema.yml │ ├── README.md │ └── dbt_project.yml └── requirements.txt ├── airflow ├── requirements.txt ├── Dockerfile ├── dags │ ├── includes │ │ ├── SQL │ │ │ └── create_table.sql │ │ ├── extract_data.py │ │ ├── transform_data.py │ │ └── s3_utils.py │ └── dag_definition │ │ └── travel_agency_dag.py └── docker-compose.yaml ├── .DS_Store ├── architecture ├── modules │ ├── ecr │ │ ├── variables.tf │ │ ├── outputs.tf │ │ └── main.tf │ ├── iam_roles │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── main.tf │ ├── vpc │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── main.tf │ ├── s3 │ │ ├── variables.tf │ │ └── main.tf │ ├── ssm │ │ ├── variables.tf │ │ └── main.tf │ └── redshift │ │ ├── main.tf │ │ └── variables.tf ├── backend.tf ├── provider.tf └── main.tf ├── images ├── dbt_init_successful.png ├── dbt_run_successful.png ├── dbt_debug_successful.png ├── travel_agency_dag_run_success.png ├── travel_agency_dag_run_success_2.png ├── Data_Loaded_Successfuly_into_Table.png ├── Travel Agency Architectural Diagram.png ├── fact_and_dimension_tables_in_redshift.png ├── redshift_table_created_successfully.png └── Travel Agency Orchestration Flow Chart.png ├── Travel_Agency_Project_Slides.pdf ├── requirements_ci.txt ├── .gitignore ├── app_requirements.txt ├── .github └── workflow │ ├── ci.yml │ └── cd.yml ├── Dockerfile ├── travel_agency_app.py ├── project_README.md └── README.md /dbt/dbt_travel_agency/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | awswrangler==3.10.0 2 | boto3==1.34.94 -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/.DS_Store -------------------------------------------------------------------------------- /architecture/modules/ecr/variables.tf: -------------------------------------------------------------------------------- 1 | variable "ecr_name" { 2 | default = "travel_agency_ecr" 3 | } 4 | -------------------------------------------------------------------------------- /dbt/requirements.txt: 
--------------------------------------------------------------------------------
1 | dbt-core
2 | dbt-redshift
3 | 
4 | # Redshift host (connection detail for profiles.yml, not a pip-installable package): travel-agency-cluster.cz8jk0qkbxey.af-south-1.redshift.amazonaws.com
--------------------------------------------------------------------------------
/images/dbt_init_successful.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/dbt_init_successful.png
--------------------------------------------------------------------------------
/images/dbt_run_successful.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/dbt_run_successful.png
--------------------------------------------------------------------------------
/Travel_Agency_Project_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/Travel_Agency_Project_Slides.pdf
--------------------------------------------------------------------------------
/images/dbt_debug_successful.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/dbt_debug_successful.png
--------------------------------------------------------------------------------
/airflow/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM apache/airflow:2.10.3
2 | 
3 | COPY requirements.txt /
4 | 
5 | RUN pip install --no-cache-dir -r /requirements.txt
--------------------------------------------------------------------------------
/architecture/modules/ecr/outputs.tf:
--------------------------------------------------------------------------------
1 | output "repository_url" {
2 |   value = aws_ecr_repository.travel_agency_ecr.repository_url
3 | }
4 | 
--------------------------------------------------------------------------------
/architecture/modules/iam_roles/outputs.tf:
--------------------------------------------------------------------------------
1 | output "s3_redshift_role_arn" {
2 |   value = aws_iam_role.redshift_role.arn
3 | }
4 | 
5 | 
--------------------------------------------------------------------------------
/images/travel_agency_dag_run_success.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/travel_agency_dag_run_success.png
--------------------------------------------------------------------------------
/images/travel_agency_dag_run_success_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/travel_agency_dag_run_success_2.png
--------------------------------------------------------------------------------
/requirements_ci.txt:
--------------------------------------------------------------------------------
1 | # These are the packages needed by the .github/workflow/ci.yml workflow (for continuous integration).
2 | 3 | flake8==7.1.2 4 | isort==6.0.0 -------------------------------------------------------------------------------- /images/Data_Loaded_Successfuly_into_Table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/Data_Loaded_Successfuly_into_Table.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | config 2 | logs 3 | plugins 4 | .env 5 | aws.py 6 | **.DS_Store** 7 | **__pycache__** 8 | **__init__.py** 9 | **terraform** 10 | my_venv/ 11 | -------------------------------------------------------------------------------- /images/Travel Agency Architectural Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/Travel Agency Architectural Diagram.png -------------------------------------------------------------------------------- /images/fact_and_dimension_tables_in_redshift.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/fact_and_dimension_tables_in_redshift.png -------------------------------------------------------------------------------- /images/redshift_table_created_successfully.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/redshift_table_created_successfully.png -------------------------------------------------------------------------------- /images/Travel Agency Orchestration Flow Chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chisomnwa/Travel_Agency_Project/main/images/Travel Agency Orchestration Flow Chart.png -------------------------------------------------------------------------------- /architecture/modules/vpc/outputs.tf: -------------------------------------------------------------------------------- 1 | output "vpc_id" { 2 | value = aws_vpc.travel_agency_vpc.id 3 | } 4 | 5 | output "subnet_group_id" { 6 | value = aws_redshift_subnet_group.redshift_subnet_group.id 7 | } 8 | 9 | -------------------------------------------------------------------------------- /architecture/modules/s3/variables.tf: -------------------------------------------------------------------------------- 1 | # variable "bucket_name" { 2 | # default = "cde-project-travel-agency-bucket" 3 | # } 4 | 5 | variable "bucket_name" { 6 | description = "The name of the S3 bucket" 7 | type = string 8 | } 9 | -------------------------------------------------------------------------------- /architecture/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "s3" { 3 | bucket = "travel-agency-backend-bucket" 4 | key = "travel-agency/dev/terraform.tfstate" # You define this path yourself. It's like a folder structure. 
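    # State locking is not configured for this backend; if desired, a DynamoDB lock table could be added.
    # The table name below is illustrative only (no such resource exists in this project):
    # dynamodb_table = "travel-agency-terraform-locks"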
5 | region = "af-south-1" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /architecture/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 5.0" 6 | } 7 | } 8 | } 9 | 10 | # Configure the AWS Provider 11 | provider "aws" { 12 | region = "af-south-1" 13 | } -------------------------------------------------------------------------------- /architecture/modules/ecr/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_ecr_repository" "travel_agency_ecr" { 2 | name = var.ecr_name 3 | image_tag_mutability = "MUTABLE" 4 | force_delete = true 5 | image_scanning_configuration { 6 | scan_on_push = true 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /architecture/modules/iam_roles/variables.tf: -------------------------------------------------------------------------------- 1 | # variable "redshift_role_arn" { 2 | # default = var.redshift_role_arn 3 | # } 4 | 5 | # variable "redshift_role_arn" { 6 | # description = "The ARN of the Redshift role" 7 | # type = string 8 | # } 9 | 10 | -------------------------------------------------------------------------------- /architecture/modules/ssm/variables.tf: -------------------------------------------------------------------------------- 1 | variable "password" { 2 | description = "This is the name of the SSM parameter" 3 | default = "redshift_password" 4 | } 5 | 6 | variable "username" { 7 | description = "This is the value the SSM parameter" 8 | default = "redshift_username" 9 | } 10 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/models/facts/fact_country_population.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | with base as ( 3 | select 4 | country_name, 5 | population, 6 | area 7 | from countries_data 8 | where population is not null 9 | ) 10 | 11 | select * from base -------------------------------------------------------------------------------- /app_requirements.txt: -------------------------------------------------------------------------------- 1 | # These are necessary packages for the travel_agency_appto run 2 | apache-airflow==2.10.5 # For using Airflow’s Variable module to manage configuration values. 3 | boto3==1.36.18 # For interacting with AWS services like S3. 4 | pandas==2.2.3 # For data manipulation and transformation 5 | requests==2.32.3 # For making HTTP requests to APIs. 
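# Note (assumption): pandas.DataFrame.to_parquet in travel_agency_app.py needs a Parquet engine,
# which none of the pins above guarantee on their own. If the container fails on to_parquet, add e.g.:
# pyarrow  # Parquet engine for pandas (version left unpinned here)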
--------------------------------------------------------------------------------
/architecture/modules/vpc/variables.tf:
--------------------------------------------------------------------------------
1 | variable "vpc_name" {
2 |   default = "travel_agency_vpc"
3 | }
4 | 
5 | # variable "resource_prefix" {
6 | #   type = string
7 | # }
8 | 
9 | variable "azs" {
10 |   default = ["af-south-1a", "af-south-1b", "af-south-1c"]
11 | }
12 | 
13 | # variable "create_route" {
14 | #   type    = bool
15 | #   default = true # Set to false in environments where the route already exists
16 | # }
17 | 
--------------------------------------------------------------------------------
/architecture/modules/redshift/main.tf:
--------------------------------------------------------------------------------
1 | resource "aws_redshift_cluster" "travel_agency_cluster" {
2 |   cluster_identifier        = var.cluster_identifier
3 |   database_name             = var.database_name
4 |   master_username           = var.username
5 |   master_password           = var.password
6 |   node_type                 = "dc2.large"
7 |   cluster_type              = "multi-node"
8 |   number_of_nodes           = 2
9 |   cluster_subnet_group_name = var.redshift_subnet_group
10 |   iam_roles                 = [var.redshift_role_arn]
11 |   skip_final_snapshot       = true
12 | }
13 | 
--------------------------------------------------------------------------------
/architecture/modules/s3/main.tf:
--------------------------------------------------------------------------------
1 | # Create an S3 Bucket
2 | resource "aws_s3_bucket" "chisom_travel_agency" {
3 |   bucket = var.bucket_name
4 | 
5 |   tags = {
6 |     Name        = "CDE Capstone Project bucket"
7 |     Environment = "Dev"
8 |     owner       = "Chisom"
9 |     team        = "Core Data Engineers"
10 |     managed_by  = "Team Leaders"
11 |   }
12 | }
13 | 
14 | 
15 | # Enable bucket versioning
16 | resource "aws_s3_bucket_versioning" "chisom_cde_project_versioning" {
17 |   bucket = aws_s3_bucket.chisom_travel_agency.id
18 |   versioning_configuration {
19 |     status = "Enabled"
20 |   }
21 | }
--------------------------------------------------------------------------------
/.github/workflow/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI - Code Quality Checks
2 | 
3 | on: [push, pull_request]
4 | 
5 | jobs:
6 |   lint-test:
7 |     runs-on: ubuntu-latest
8 |     steps:
9 |       - name: Checkout Code
10 |         uses: actions/checkout@v3
11 | 
12 |       - name: Set up Python
13 |         uses: actions/setup-python@v4
14 |         with:
15 |           python-version: "3.11.6"
16 | 
17 |       - name: Install dependencies
18 |         run: pip install -r requirements_ci.txt
19 | 
20 |       - name: Running isort
21 |         run: isort --check-only .
22 | 
23 |       - name: Running flake8
24 |         run: flake8 .
25 | 
26 | 
--------------------------------------------------------------------------------
/architecture/modules/iam_roles/main.tf:
--------------------------------------------------------------------------------
1 | resource "aws_iam_role" "redshift_role" {
2 |   name = "s3_redshift_iam_role"
3 | 
4 |   # Terraform's "jsonencode" function converts a
6 | assume_role_policy = jsonencode({ 7 | Version = "2012-10-17" 8 | Statement = [ 9 | { 10 | Action = "sts:AssumeRole" 11 | Effect = "Allow" 12 | Sid = "" 13 | Principal = { 14 | Service = "redshift.amazonaws.com" 15 | } 16 | }, 17 | ] 18 | }) 19 | 20 | tags = { 21 | name = "travel_agency_role" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /airflow/dags/includes/SQL/create_table.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Got this from the transformed_data_to_redshift script 3 | begin; 4 | CREATE TABLE IF NOT EXISTS countries_data ( 5 | country_name TEXT NOT NULL, 6 | independent BOOLEAN, 7 | unMember BOOLEAN, 8 | startOfWeek VARCHAR(225), 9 | official_country_name TEXT, 10 | common_native_names TEXT, 11 | currency_code VARCHAR(225), 12 | currency_name TEXT, 13 | currency_symbol VARCHAR(225), 14 | country_code VARCHAR(2000) UNIQUE NOT NULL, 15 | capital TEXT, 16 | region TEXT, 17 | subregion TEXT, 18 | languages VARCHAR (1000), 19 | area FLOAT, 20 | population BIGINT, 21 | continents TEXT 22 | ); 23 | end; 24 | -------------------------------------------------------------------------------- /architecture/modules/ssm/main.tf: -------------------------------------------------------------------------------- 1 | resource "random_password" "redshift_pass" { 2 | length = 10 3 | min_lower = 5 4 | min_numeric = 2 5 | min_special = 1 6 | min_upper = 2 7 | } 8 | 9 | resource "random_password" "redshift_username" { 10 | length = 10 11 | min_lower = 6 12 | min_numeric = 2 13 | upper = false 14 | } 15 | 16 | resource "aws_ssm_parameter" "ssm_password" { 17 | name = var.password 18 | type = "String" 19 | value = random_password.redshift_pass.result 20 | } 21 | 22 | resource "aws_ssm_parameter" "ssm_username" { 23 | name = var.username 24 | type = "String" 25 | value = "a${random_password.redshift_username.result}" 26 | } 27 | 28 | # Look for arguments that can change and parameterize those arguments -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/models/facts/source.yml: -------------------------------------------------------------------------------- 1 | ersion: 2 2 | 3 | models: 4 | - name: fact_country_population 5 | description: 6 | 'Fact table storing measurable data related to countries such as 7 | population and area.' 8 | columns: 9 | - name: country_id 10 | description: 'Foreign key linking to the dim_country table.' 11 | tests: 12 | - not_null 13 | - relationships: 14 | to: ref('dim_country') 15 | field: country_id 16 | - name: population 17 | description: 'Population of the country.' 
18 | tests: 19 | - not_null 20 | - name: area 21 | description: 'Total area of the country in square kilometers.' -------------------------------------------------------------------------------- /architecture/modules/redshift/variables.tf: -------------------------------------------------------------------------------- 1 | variable "redshift_subnet_group" { 2 | description = "Subnet group for the Redshift cluster" 3 | type = string 4 | } 5 | 6 | variable "redshift_role_arn" { 7 | description = "IAM role ARN for the Redshift cluster" 8 | type = string 9 | } 10 | 11 | variable "username" { 12 | description = "Master username for the Redshift cluster" 13 | type = string 14 | } 15 | 16 | variable "password" { 17 | description = "Master password for the Redshift cluster" 18 | type = string 19 | } 20 | 21 | variable "database_name" { 22 | description = "Database name for the Redshift cluster" 23 | type = string 24 | } 25 | 26 | variable "cluster_identifier" { 27 | description = "Identifier for the Redshift cluster" 28 | type = string 29 | } 30 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # This is the Dockerfile for the the image build during the CI/CD process 2 | 3 | # Use Python base Image 4 | FROM python:3.9-slim 5 | 6 | # Set working directory in the container 7 | WORKDIR /app 8 | 9 | # Copy the necessary files from tthe current directory into /app in the container 10 | COPY travel_agency_app.py /app 11 | COPY app_requirements.txt /app/ 12 | 13 | # Installl the needed packages specified in the requirements.txt file 14 | RUN pip install --no-cache-dir -r app_requirements.txt 15 | 16 | # Set the environment variable for AWS region (optional but good practice) 17 | ENV AWS_DEFAULT_REGION=af-south-1 18 | 19 | # Make port 80 available to the world outside this container )if nneeded) 20 | EXPOSE 8080 21 | 22 | # Run upload_to_s3.py when the container launches 23 | CMD ["python", "travel_agency_app.py"] -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/models/dimensions/dim_country.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with base as ( 4 | select 5 | country_name, -- Common name of the country 6 | official_country_name, -- Official name of the country 7 | capital, -- Capital city 8 | region, -- Region of the country 9 | subregion, -- Subregion of the country 10 | languages, -- to handle multiple language 11 | currency_code, -- Currency code 12 | currency_name, -- Currency name 13 | currency_symbol, -- Currency symbol 14 | continents -- When the record was last updated 15 | from countries_data -- Refers to your raw country data table 16 | ) 17 | 18 | select * from base -------------------------------------------------------------------------------- /airflow/dags/includes/extract_data.py: -------------------------------------------------------------------------------- 1 | # RETRIEVING THE DATA FROM THE API 2 | # Import packages and libraries 3 | import requests 4 | import pandas as pd 5 | import logging 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s') 9 | 10 | def get_data(): 11 | """ 12 | A function that gets the data from the API 13 | and then turns the extracted data into a pandas dataframe 14 | """ 15 | url = "https://restcountries.com/v3.1/all" 16 | 17 | 
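    # Optional hardening (not part of the original code): a timeout and an HTTP error check, e.g.
    #     response = requests.get(url, timeout=30)
    #     response.raise_for_status()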
response = requests.get(url) 18 | logging.info("Fetching data from the API...") 19 | 20 | if response.status_code == 200: 21 | #Parse JSON response 22 | data = response.json() 23 | 24 | # Convert JSON data to Pandas DataFrame 25 | profiles_data = pd.DataFrame(data) 26 | logging.info(f"Data successsfuly turned into a pandas Dataframe with\ 27 | {profiles_data.shape[0]}records and {profiles_data.shape[1]} columns ") 28 | 29 | return profiles_data 30 | 31 | print(get_data()) 32 | # print(get_data().shape) 33 | # print(get_data().columns) 34 | -------------------------------------------------------------------------------- /architecture/main.tf: -------------------------------------------------------------------------------- 1 | # module "vpc" { 2 | # source = "./modules/vpc" 3 | # } 4 | 5 | # module "ecr" { 6 | # source = "./modules/ecr" 7 | # } 8 | 9 | # module "s3" { 10 | # source = "./modules/s3" 11 | # bucket_name = "travel-agency-bucket" 12 | # } 13 | 14 | # module "roles" { 15 | # source = "./modules/iam_roles" 16 | # } 17 | 18 | # data "aws_ssm_parameter" "password" { 19 | # name = "redshift_password" 20 | # } 21 | 22 | # data "aws_ssm_parameter" "username" { 23 | # name = "redshift_username" 24 | # } 25 | 26 | # module "ssm" { 27 | # source = "./modules/ssm" 28 | # } 29 | 30 | # module "redshift" { 31 | # source = "./modules/redshift" 32 | # redshift_subnet_group = module.vpc.subnet_group_id 33 | # redshift_role_arn = module.roles.s3_redshift_role_arn # Pointing to the output of the iam_roles 34 | # username = data.aws_ssm_parameter.username.value 35 | # password = data.aws_ssm_parameter.password.value 36 | # database_name = "travel_agency" 37 | # cluster_identifier = "travel-agency-cluster" # It must contain only lowercase alphanumeric characters (a-z, 0-9) and hyphens (-). 38 | # } 39 | -------------------------------------------------------------------------------- /.github/workflow/cd.yml: -------------------------------------------------------------------------------- 1 | name: Build and Push Travel Agency Docker Image 2 | 3 | on: 4 | push: 5 | branches: 6 | - main # Tigger on push to the main branch 7 | pull_request: 8 | branches: 9 | - main # Trigger on pull request to the main branch 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | # Checkout code from the repository 17 | - name: Checkout Code 18 | uses: actions/checkout@v2 19 | 20 | # Set up Docker Buildx (for multi-platform builds if needed) 21 | - name: Set up Docker Buildx 22 | uses: docker/setup-buildx-action@v1 23 | 24 | # Log in to Amazon ECR 25 | - name: Log in to AWS ECR 26 | uses: aws-actions/amazon-ecr-login@v1 27 | 28 | # Build and push docker image 29 | - name: Build and Push Docker image to AWS ECR 30 | run: | 31 | docker build -t cde/travel_agency:latest . 32 | docker tag cde/travel_agency:latest ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/cde/travel_agency:latest 33 | docker push ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/cde/travel_agency:latest 34 | 35 | -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/models/dimensions/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: dim_country 5 | description: 6 | 'Dimension table storing country-related descriptive information.' 7 | columns: 8 | - name: country_id 9 | description: 'Unique identifier for each country.' 
10 | tests: 11 | - not_null 12 | - unique 13 | - name: country_name 14 | description: 'Common name of the country.' 15 | - name: official_name 16 | description: 'Official name of the country.' 17 | - name: capital 18 | description: 'Capital city of the country.' 19 | - name: region 20 | description: 'Geographical region of the country.' 21 | - name: subregion 22 | description: 'Subregion of the country.' 23 | - name: languages 24 | description: 25 | 'Official languages spoken in the country (stored as JSONB).' 26 | - name: currency_code 27 | description: "ISO code for the country's currency." 28 | - name: currency_name 29 | description: "Name of the country's currency." 30 | - name: currency_symbol 31 | description: "Symbol for the country's currency." 32 | - name: continents 33 | description: 'Continent(s) associated with the country.' 34 | 35 | tests: 36 | - unique: 37 | name: 'dim_country_country_id_unique' 38 | description: 'Ensure that the country_id is unique for each country.' -------------------------------------------------------------------------------- /dbt/dbt_travel_agency/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'dbt_travel_agency' 6 | version: '1.0.0' 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'dbt_travel_agency' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `model-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | model-paths: ["models"] 15 | analysis-paths: ["analyses"] 16 | test-paths: ["tests"] 17 | seed-paths: ["seeds"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | 21 | clean-targets: # directories to be removed by `dbt clean` 22 | - "target" 23 | - "dbt_packages" 24 | 25 | 26 | # Configuring models 27 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 28 | 29 | # In this example config, we tell dbt to build all models in the example/ 30 | # directory as views. These settings can be overridden in the individual model 31 | # files using the `{{ config(...) }}` macro. 
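# Note: the `models:` block further below configures `staging` and `analytics` folders, but the models in
# this project live under models/facts and models/dimensions, so those folder-level materializations never
# match anything; the `{{ config(materialized=...) }}` calls inside the individual model files are what
# actually take effect.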
32 | # models: 33 | # dbt_travel_agency: 34 | # # Config indicated by + and applies to all files under models/example/ 35 | # example: 36 | # +materialized: view 37 | 38 | models: 39 | dbt_travel_agency: 40 | staging: 41 | # +schema: public 42 | +materialized: table 43 | analytics: 44 | # +schema: public 45 | +materialized: table 46 | 47 | -------------------------------------------------------------------------------- /architecture/modules/vpc/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_vpc" "travel_agency_vpc" { 2 | cidr_block = "172.16.0.0/16" 3 | 4 | tags = { 5 | Name = var.vpc_name 6 | } 7 | } 8 | 9 | resource "aws_subnet" "redshift_subnet_a" { 10 | vpc_id = aws_vpc.travel_agency_vpc.id 11 | cidr_block ="172.16.24.0/24" 12 | availability_zone = var.azs[0] 13 | tags = { 14 | Name = "zone_a" 15 | } 16 | } 17 | 18 | resource "aws_subnet" "redshift_subnet_b" { 19 | vpc_id = aws_vpc.travel_agency_vpc.id 20 | cidr_block = "172.16.25.0/24" 21 | availability_zone = var.azs[1] 22 | tags = { 23 | Name = "zone_b" 24 | } 25 | } 26 | 27 | resource "aws_redshift_subnet_group" "redshift_subnet_group" { 28 | name = "foo" 29 | subnet_ids = [aws_subnet.redshift_subnet_a.id, aws_subnet.redshift_subnet_b.id] 30 | 31 | tags = { 32 | environment = "redshift subnet group" 33 | } 34 | } 35 | 36 | resource "aws_internet_gateway" "igw" { 37 | vpc_id = aws_vpc.travel_agency_vpc.id 38 | 39 | tags = { 40 | Name = "chisom_igw" 41 | } 42 | } 43 | 44 | resource "aws_route_table" "default" { 45 | vpc_id = aws_vpc.travel_agency_vpc.id 46 | 47 | route { 48 | cidr_block = "0.0.0.0/0" 49 | gateway_id = aws_internet_gateway.igw.id 50 | } 51 | } 52 | 53 | resource "aws_route_table_association" "a" { 54 | subnet_id = aws_subnet.redshift_subnet_a.id 55 | route_table_id = aws_route_table.default.id 56 | } 57 | resource "aws_route_table_association" "b" { 58 | subnet_id = aws_subnet.redshift_subnet_b.id 59 | route_table_id = aws_route_table.default.id 60 | } 61 | 62 | resource "aws_default_security_group" "default" { 63 | vpc_id = aws_vpc.travel_agency_vpc.id 64 | 65 | ingress { 66 | description = "Allow inbound connections from Redshift" 67 | protocol = "tcp" 68 | from_port = 5439 69 | to_port = 5439 70 | cidr_blocks = ["0.0.0.0/0"] 71 | } 72 | 73 | tags = { 74 | "Name" = "redshift_security_group" 75 | } 76 | } 77 | 78 | -------------------------------------------------------------------------------- /airflow/dags/dag_definition/travel_agency_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from datetime import datetime, timedelta 3 | 4 | # Custom module imports 5 | from includes.s3_utils import save_parquet_to_s3, upload_to_s3 6 | 7 | # Airflow provider imports 8 | from airflow.operators.python import PythonOperator 9 | from airflow.providers.amazon.aws.operators.redshift_data import RedshiftDataOperator 10 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator 11 | 12 | # Define DAG folder path for mounted Docker volume 13 | dag_folder = "/opt/airflow/dags" 14 | sql_file = "create_table.sql" # Use just the filename 15 | 16 | default_args = { 17 | 'owner': "chisom", 18 | 'start_date': datetime(2025, 2, 26), 19 | 'retries': 2, 20 | 'retry_delay': timedelta(seconds=5), 21 | 'execution_timeout': timedelta(minutes=10), 22 | } 23 | 24 | with DAG( 25 | dag_id="travel_agency_dag", 26 | default_args=default_args, 27 | description="A simple DAG to extract data from 
an API, load it to S3, transform it, and load it into Redshift", 28 | default_view="graph", 29 | tags=["travel_agency", "cde"], 30 | schedule_interval="@daily", 31 | catchup=False, 32 | template_searchpath=f"{dag_folder}/includes/sql", 33 | ) as dag: 34 | 35 | # Task 1: Load data to S3 36 | load_data_to_s3 = PythonOperator( 37 | task_id="load_data_to_S3", 38 | python_callable=upload_to_s3 39 | ) 40 | 41 | # Task 2: Transform data 42 | transform_data = PythonOperator( 43 | task_id = "transform_data", 44 | python_callable = save_parquet_to_s3 45 | ) 46 | 47 | # Task 3: Create table in Redshift 48 | create_table = RedshiftDataOperator( 49 | task_id = "create_table", 50 | cluster_identifier="travel-agency-cluster", 51 | database="travel_agency", 52 | sql=sql_file, 53 | aws_conn_id="aws_default", 54 | wait_for_completion=True, 55 | region="af-south-1", 56 | params={ 57 | "schema": "public", 58 | "table": "countries_data", 59 | }, 60 | ) 61 | 62 | # Task 4: Load transformed data into Redshift 63 | load_data_to_redshift = S3ToRedshiftOperator( 64 | task_id='load_data_to_redshift', 65 | schema='public', 66 | table='countries_data', 67 | s3_bucket='travel-agency-bucket', 68 | s3_key='processed_data/processed_data.parquet', 69 | redshift_conn_id='redshift_default', 70 | aws_conn_id='aws_default', 71 | copy_options=[ 72 | "FORMAT AS PARQUET" 73 | ], 74 | method='REPLACE', 75 | ) 76 | 77 | # Define task dependencies 78 | load_data_to_s3 >> transform_data >> create_table >> load_data_to_redshift 79 | -------------------------------------------------------------------------------- /travel_agency_app.py: -------------------------------------------------------------------------------- 1 | # THIS CONTAINS: 2 | 3 | # A: Code that extracts the data ffrom the API 4 | # B: Code that loads the raw data to s3 5 | 6 | # THis is the part of the fulfillment of the C/CD implementation where this app will be built as an image using a dockerfile 7 | 8 | # Retrieving data from the API 9 | # Import libraries and packages 10 | import requests 11 | import pandas as pd 12 | import logging 13 | 14 | # Configure logging 15 | logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(message)') 16 | 17 | def get_data(): 18 | """ 19 | A function that extracts the data from the API 20 | and then turns it into a pandas DataFrame 21 | """ 22 | 23 | url = "https://restcountries.com/v3.1/all" 24 | 25 | response = requests.get(url) 26 | logging.info("Fetching data from the API...") 27 | 28 | if response.status_code == 200: 29 | # Parse JSON response 30 | data = response.json() 31 | 32 | # Convert JSON data to Pandas DataFrame 33 | profiles_data = pd.DataFrame(data) 34 | logging.info(f"Successfuly turned into a Pandas DataFrame\ 35 | {profiles_data.shape[0]} records and {profiles_data.shape[1]} columns") 36 | return profiles_data 37 | 38 | print(get_data()) 39 | 40 | 41 | ############################################################################### 42 | # Creating a connection to connect to AWS 43 | # Import necessary libraries and packages 44 | import boto3 45 | from airflow.models import Variable 46 | 47 | def create_session(): 48 | """Initialize and return a Boto3 session using Airflow variables.""" 49 | aws_access_key_id = Variable.get("aws_access_key_id") 50 | aws_secret_access_key = Variable.get("aws_secret_access_key") 51 | region_name = "af-south-1" 52 | 53 | session = boto3.Session( 54 | aws_access_key_id=aws_access_key_id, 55 | aws_secret_access_key=aws_secret_access_key, 56 | region_name=region_name 57 | ) 58 | 59 | 
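    # Note: reading keys from Airflow Variables means this standalone container also needs a working
    # Airflow installation and metadata connection; plain environment variables or an attached IAM role
    # would be a lighter alternative (suggestion only, not part of the original design).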
return session
60 | 
61 | print(create_session())
62 | 
63 | 
64 | ###############################################################################
65 | # Loading the raw data into an s3 bucket
66 | from io import BytesIO
67 | 
68 | def upload_to_s3():
69 |     """
70 |     Uploads the extracted data to an S3 bucket with a fixed file name.
71 |     """
72 |     data = get_data()
73 | 
74 |     if data.empty:
75 |         print('DataFrame is empty. No data to upload.')
76 |         return
77 | 
78 |     bucket_name = 'travel-agency-bucket'
79 |     file_key = 'raw_data/data.parquet'
80 | 
81 |     # Convert DataFrame to bytes
82 |     buffer = BytesIO()
83 |     data.to_parquet(buffer, index=False)
84 |     buffer.seek(0)
85 | 
86 |     # Upload file to S3 (put_object requires keyword arguments)
87 |     s3_client = create_session().client('s3')
88 |     s3_client.put_object(Bucket=bucket_name, Key=file_key, Body=buffer.getvalue())
89 | 
90 |     print(f"Data successfully uploaded to s3://{bucket_name}/{file_key}")
91 | 
92 | upload_to_s3()
93 | 
94 | 
--------------------------------------------------------------------------------
/project_README.md:
--------------------------------------------------------------------------------
1 | # CDE-CAPSTONE
2 | A travel agency reached out to CDE. Their business model involves recommending tourist locations to their customers based on different data points, and they want one of our graduates to build a Data Platform that will process the data from the Country REST API [HERE](https://restcountries.com/v3.1/all) into their cloud-based Database/Data Warehouse for predictive analytics by their Data Science team.
3 | 
4 | ### SOME CONSIDERATIONS TO BE AWARE OF
5 | - The company needs some specific fields/attributes from the API data to enable downstream usage.
6 | - However, you have to extract the entire raw data from the API into any `Cloud Based Object Storage`, which will serve as the Raw layer.
7 | - The file format used when storing the data in Object Storage must be `Parquet`, since it's better for performance.
8 | - We want to store the entire API data because, if the agency requires more fields/attributes in the future, we don't have to pull data from the API again; we simply pull from the copy in the data lake.
9 | - From the Data Lake, please extract the attributes below that are required for predictive analytics by the Travel Agency and write them to a Cloud Database or Data Warehouse.
10 | 
11 | - Country name
12 | 
13 | - Independence
14 | 
15 | - United Nations membership
16 | 
17 | - startOfWeek
18 | 
19 | - Official country name
20 | 
21 | - Common native name
22 | 
23 | - Currency code, e.g. USD, EUR
24 | 
25 | - Currency name
26 | 
27 | - Currency symbol
28 | 
29 | - Country code (idd), e.g. Germany's country code is +49
30 | 
31 | - You need to concatenate the idd root and idd suffix from the response
32 | 
33 | - Capital
34 | 
35 | - Region
36 | 
37 | - Sub region
38 | 
39 | - Languages
40 | 
41 | - Area
42 | 
43 | - Population
44 | 
45 | - Continents
46 | 
47 | - Apache Airflow `MUST` be used for orchestrating the entire workflow, which includes:
48 |     - Extracting the data from the API
49 |     - Writing the extracted data to the Data Lake
50 |     - Extracting the final required attributes to the Database/Data Warehouse.
51 | 
52 | - CI/CD should be integrated into the GitHub repository:
53 |     - CI that carries out code linting checks to ensure the code written follows best practices.
54 |     - CD that carries out the build and push of the code that extracts and writes to object storage to a cloud-based Container Registry.
55 | - Basically, you need to package the code that does the Extract of the raw data from API and the Code that write the data to Object storage into a Docker image and push the image to a Cloud Based Container Registry. 56 | 57 | - All Cloud Infrastructures like IAM, Object storage, DB/DW resource provisioning has to be Terraformed with the Terraform State File backend managed in the cloud using an Object Storage. 58 | 59 | - Lastly, please you `MUST` leverage DBT to model the data into Fact and Dimension tables. 60 | 61 | ### BONUS (NOT MANDATORY) 62 | Derive any insights from the Data Set. 63 | 64 | 65 | ## SUBMISSION REQUIREMENTS 66 | - A power point or something similar for presentation of the entire project covering the below 67 | - The Data Architecture. 68 | - The choice of tools 🛠️ 69 | - Well documented read me of the project in Github. 70 | - Project submission will close on the 20th November, 2024. 71 | - Github link should be submitted in the link [HERE](https://forms.gle/osnNmo7JyGkQeXnb8) 72 | -------------------------------------------------------------------------------- /airflow/dags/includes/transform_data.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(name)s:%(message)s') 8 | logging.getLogger().setLevel(20) 9 | 10 | 11 | def transform_data(df): 12 | """ 13 | Transforms the given Pandas DataFrame by performing specific data 14 | processing operations. 15 | 16 | Parameters 17 | ---------- 18 | df : pd.DataFrame 19 | A Pandas DataFrame in Parquet format that contains the data to be 20 | transformed. 21 | 22 | Returns 23 | ------- 24 | pd.DataFrame 25 | A new Pandas DataFrame with the transformed data. 
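
    Examples
    --------
    Minimal usage sketch (the input path is illustrative)::

        raw_df = pd.read_parquet("raw_data/data.parquet")
        curated_df = transform_data(raw_df)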
26 | """ 27 | logging.info("Starting data transformation.") 28 | 29 | logging.debug("Extracting currency details.") 30 | currency_details = df['currencies'].apply(extract_currency_details) 31 | df = pd.concat([df, currency_details], axis=1) 32 | 33 | logging.debug("Extracting country names and native names.") 34 | df['country_name'] = df['name'].apply(lambda x: x.get('common')) 35 | df['official_country_name'] = df['name'].apply(lambda x: x.get('official')) 36 | df['common_native_names'] = df['name'].apply(lambda x: 37 | extract_all_common_native_name 38 | (x.get('nativeName'))) 39 | 40 | logging.debug("Extracting languages and country codes.") 41 | df['languages'] = df['languages'].apply(lambda x: extract_languages(x)) 42 | df['country_code'] = df['idd'].apply(lambda x: generate_country_codes(x)) 43 | 44 | logging.debug("Simplifying continent and capital columns.") 45 | df['continents'] = df['continents'].str[0] 46 | df['capital'] = df['capital'].str[0] 47 | 48 | logging.debug("Dropping columns: 'name', 'idd', 'currencies'.") 49 | df = df.drop(columns=['name', 'idd', 'currencies']) 50 | 51 | desired_order = [ 52 | 'country_name', 'independent', 'unMember', 'startOfWeek', 53 | 'official_country_name', 'common_native_names', 54 | 'currency_code', 'currency_name', 'currency_symbol', 55 | 'country_code', 'capital', 'region', 'subregion', 56 | 'languages', 'area', 'population', 'continents' 57 | ] 58 | logging.debug("Reordering columns.") 59 | df = df[desired_order] 60 | 61 | logging.info("Data transformation completed.") 62 | return df 63 | 64 | 65 | def extract_currency_details(row): 66 | if isinstance(row, dict) and len(row) > 0: 67 | valid_entry = {key: value for key, value in row.items() 68 | if value is not None} 69 | if valid_entry: 70 | code = list(valid_entry.keys())[0] 71 | details = valid_entry[code] 72 | logging.debug("Currency details found: code=%s, \ 73 | details=%s", code, details) 74 | return pd.Series({ 75 | 'currency_code': code, 76 | 'currency_name': details.get('name', None), 77 | 'currency_symbol': details.get('symbol', None) 78 | }) 79 | logging.debug("No valid currency details found.") 80 | return pd.Series({'currency_code': None, 'currency_name': None, 81 | 'currency_symbol': None}) 82 | 83 | 84 | def extract_languages(language): 85 | if isinstance(language, dict): 86 | result = ", ".join(str(x) for x in language.values() if x is not None) 87 | logging.debug("Extracted languages: %s", result) 88 | return result 89 | logging.debug("No valid language data found.") 90 | return None 91 | 92 | 93 | def extract_all_common_native_name(native_name): 94 | if isinstance(native_name, dict): 95 | result = ", ".join(entry.get('common', '') for entry in 96 | native_name.values() if isinstance(entry, dict) and 97 | 'common' in entry) 98 | logging.debug("Extracted native names: %s", result) 99 | return result 100 | logging.debug("No valid native names found.") 101 | return None 102 | 103 | 104 | def generate_country_codes(idd): 105 | if isinstance(idd, dict): 106 | root = idd.get('root', '') 107 | suffixes = idd.get('suffixes', []) 108 | if isinstance(suffixes, (list, np.ndarray)): 109 | result = " ".join([root + suffix for suffix in suffixes]) 110 | logging.debug("Generated country codes: %s", result) 111 | return result 112 | logging.debug("No valid country code data found.") 113 | # return None 114 | return "Unknown" 115 | -------------------------------------------------------------------------------- /airflow/dags/includes/s3_utils.py: 
-------------------------------------------------------------------------------- 1 | #################################################################################### 2 | # # Using this version if not working wih airflow 3 | # import boto3 4 | # import os 5 | # from boto3.session import Session # Import Session explicitly 6 | 7 | # from dotenv import load_dotenv 8 | # # Load environment variables from .env file 9 | # load_dotenv() 10 | 11 | # def create_session(): 12 | # session = Session( 13 | # aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), 14 | # aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), 15 | # region_name="af-south-1" 16 | # ) 17 | 18 | # print("Ran successfully") 19 | 20 | # return session 21 | 22 | # print(create_session()) 23 | 24 | 25 | #################################################################################### 26 | # CREATING A SESSION TO CONNECT TO AWS 27 | # Import necessary libraries 28 | import boto3 29 | from airflow.models import Variable 30 | 31 | def create_session(): 32 | """Initialize and return a Boto3 session using Airflow variables.""" 33 | aws_access_key_id = Variable.get("aws_access_key_id") 34 | aws_secret_access_key = Variable.get("aws_secret_access_key") 35 | region_name = "af-south-1" 36 | 37 | session = boto3.Session( 38 | aws_access_key_id=aws_access_key_id, 39 | aws_secret_access_key=aws_secret_access_key, 40 | region_name=region_name 41 | ) 42 | 43 | return session 44 | 45 | # print(create_session()) 46 | 47 | 48 | #################################################################################### 49 | # UPLOADS RAW DATA TO S3 BUCKET 50 | import awswrangler as wr 51 | from io import BytesIO 52 | 53 | from includes.extract_data import get_data 54 | 55 | def upload_to_s3(): 56 | """ 57 | Uploads a pandas DataFrame to an S3 bucket with a fixed file name. 58 | """ 59 | data = get_data() 60 | 61 | if data.empty: 62 | print("The DataFrame is empty. 
No data to upload") 63 | return 64 | 65 | bucket_name = "travel-agency-bucket" 66 | file_key = "raw_data/data.parquet" # Specify the exact file name 67 | 68 | # Convert DataFrame to bytes 69 | buffer = BytesIO() 70 | data.to_parquet(buffer, index=False) 71 | buffer.seek(0) 72 | 73 | # Upload file to S3 74 | s3_client = create_session().client ('s3') 75 | s3_client.put_object(Bucket=bucket_name, Key=file_key, Body=buffer.getvalue()) 76 | 77 | print(f"Data successfully uploaded to s3://{bucket_name}/{file_key}") 78 | 79 | upload_to_s3() 80 | 81 | 82 | ############################################################################### 83 | # RETRIEVS RAW DATA, TRANSFROMS IT AND LOADS IT BACK TO S3 BUCKET 84 | import pandas as pd 85 | from io import BytesIO 86 | import logging 87 | 88 | from includes.transform_data import transform_data 89 | 90 | # Configure logging 91 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s") 92 | 93 | # Define S3 bucket and file key 94 | bucket_name = "travel-agency-bucket" 95 | file_key = "raw_data/data.parquet" 96 | 97 | def retrieve_and_process_data(): 98 | """Retrieve a Parquet file from S3 and process it into a Pandas DataFrame.""" 99 | try: 100 | s3_client = create_session().client("s3") 101 | response = s3_client.get_object(Bucket=bucket_name, Key=file_key) 102 | df = pd.read_parquet(BytesIO(response["Body"].read())) 103 | logging.info(f"Successfully retrieved and read the Parquet file from s3://{bucket_name}/{file_key}") 104 | return df 105 | except Exception as e: 106 | logging.error(f"Error retrieving or processing the Parquet file: {e}") 107 | # raise 108 | return pd.DataFrame() 109 | 110 | def save_parquet_to_s3(): 111 | """Reads raw data from S3, transforms it, and saves the transformed data back to S3.""" 112 | try: 113 | # Retrieve raw data from S3 114 | raw_df = retrieve_and_process_data() 115 | 116 | # Apply transformation using extract_country_info 117 | # transformed_df = raw_df.apply(extract_and_rename_columns, axis=1, result_type="expand") 118 | transformed_df = transform_data(raw_df) 119 | 120 | # Print transformed data to the terminal 121 | # print("\nTransformed Data Preview:") 122 | print(transformed_df.dtypes) 123 | max_length = transformed_df['country_code'].astype(str).apply(len).max() 124 | print(f"Max length in Parquet file: {max_length}") 125 | 126 | # Define S3 target path for transformed data 127 | processed_file_key = "processed_data/processed_data.parquet" 128 | s3_client = create_session().client("s3") 129 | 130 | # Convert DataFrame to bytes 131 | buffer = BytesIO() 132 | transformed_df.to_parquet(buffer, index=False, engine="pyarrow") # Edited here 133 | buffer.seek(0) 134 | 135 | # Upload file to S3 136 | s3_client.put_object(Bucket=bucket_name, Key=processed_file_key, Body=buffer.getvalue()) 137 | 138 | logging.info(f"Processed data successfully saved to s3://{bucket_name}/{processed_file_key}") 139 | 140 | except Exception as e: 141 | logging.error(f"Error saving processed data to S3: {e}") 142 | raise 143 | 144 | # Call the function to save processed data to S3 145 | save_parquet_to_s3() 146 | 147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Travel Agency Data Platform 2 | 3 | ## Introduction 4 | In response to the **CDE cohort-1** capstone project requirement, I created this project which integrates all the key concepts and tools learned during the Bootcamp program. 
5 | 6 | The project demonstrates the development of a robust Data Platform capable of ingesting, transforming, and storing data for predictive analytics. The tools and services utilized in this project includes `Docker`, `Airflow`, `Terraform`, `AWS Services` (like Amazon s3. Redshift, ECR), and `dbt`. 7 | 8 | ## Overview 9 | 10 | The goal of this project is to build a scalable and efficient data pipeline for a travel agency, enabling their Data Science team to analyze curated data for predictive analytics. The pipeline is designed to: 11 | 12 | * Extract raw data from the Country [REST API](https://restcountries.com/v3.1/all). 13 | 14 | * Load the data into an AWS S3 Data Lake in Parquet format. 15 | 16 | * Transform the data into a curated dataset containing specific fields relevant for analysis. 17 | 18 | * Load the transformed data into Amazon Redshift table. 19 | 20 | * Use dbt to model the data into Fact and Dimension tables for efficient querying. 21 | 22 | * incorporate CI/CD pipelines to automate code quality checks, build processes, and deployments, ensuring best practices and streamlined workflows. 23 | 24 | 25 | ## Methodology 26 | 27 | Having carefully assessed the requirements, **Docker** was used to host **Airflow*, which served as the orchestration tool for this project. 28 | The dataset was extracted from the Country REST API and stored in Parquet format in **Amazon S3** to ensure future extensibility. 29 | Relevant columns were then selected from the raw data and loaded into a **Redshift** table, 30 | which functioned as the Data Warehouse. **dbt** was utilized to model the transformed data into `Fact` and `Dimension tables`, enabling efficient querying. 31 | Additionally, **Terraform** was employed as an Infrastructure as Code (IaC) tool to provision all necessary AWS resources. 32 | 33 | ## Project Architecture 34 | ### Overview 35 | The data pipeline is designed to seamlessly integrate data ingestion, transformation, and storage. Below is a high-level description of the architecture: 36 | 37 | * **Data Ingestion**: Data is stored in Amazon S3 bucket. 38 | 39 | * **Infrastructure Setup**: Terraform provisions the AWS VPC, IAM roles, Redshift clusters, and other necessary resources. 40 | 41 | * **ETL Orchestration**: Apache Airflow orchestrates the ETL process, including loading data into Redshift and triggering dbt models. 42 | 43 | * **Data Transformation**: dbt structures raw data into analytics-ready formats. 44 | 45 | * **Output**: Transformed data is stored in Redshift and can be accessed by BI tools or queried directly. 46 | 47 | ## Architectural Diagram 48 | 49 | ![image](https://github.com/Chisomnwa/CDE_Capstone_Project/blob/main/Travel%20Agency%20Platform%20Architecture.png) 50 | 51 | 52 | ## Tools and Services Used and their Functions 53 | 54 | * **Terraform**: To avoid manually creating the resources which was used for this project, Terraform was used for Infrastructure as Code (IaC) to provision and manage cloud resources like AWS S3, Redshift, IAM roles, and VPC. This ensures scalability, consistency, and reproducibility in infrastructure deployment. 55 | 56 | * **Docker**: Docker was used to containerize Airflow by building from an Apache Airflow Image found [here](https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html). A `Dockerfile` and a `requirement.txt` file were used to enhance the base image provided by Airflow to include `awswrangler` package. 
57 | 58 | * **Airflow**: Airflow which is an open-source technology was used as an orchestration tool to automate and manage the ETL pipeline. It had dag dependencies on the worker nodes set up to trigger at a scheduled interval which can be accesses on the web server GUI. 59 | 60 | * **AWS Service**: Because of its accessibility and compatibility with terraform, and ease of use, AWS cloud was chosen as the preferred cloud provider. An IAM with the necessary policies (using the principle of least-privilege) was provisoned and was used by Airflow for accessing the Data Lake (s3 bucket) and Data Warehouse (Redshift), using the access keys and secret access key parameters saved in the SSM parameter store. 61 | 62 | Under AWS Services also, **Amazon ECR** was used to store the Docker image, which contains the codes for extracting the data from the API and uploading the raw data to the s3 bucket Data Lake. 63 | 64 | * **dbt**: which stands for Data Build Tool was utilized for transforming the raw data and modeling it into Fact and Dimension tables in Redshift. It also helps to ensure data quality, modularity, and version control, making it easier to mantain analytics-ready datasets. 65 | 66 | # Setup Instructions 67 | ## Prerequisites: 68 | * AWS Account with sufficient IAM permissions. 69 | 70 | * Installed tools: Terraform, Apache Airflow, dbt, AWS CLI, Python (with virtualenv). 71 | ---- 72 | 73 | 1. Clone the Repository: 74 | 75 | `git clone ` 76 | 77 | `cd ` 78 | 79 | 2. Configure AWS Credentials: 80 | 81 | Ensure your AWS CLI is configured with the required credentials: 82 | 83 | `aws configure` 84 | 85 | 3. Infrastructure Provisioning: 86 | 87 | Navigate to the **Architecture directory** and run the following commands: 88 | 89 | `terraform init` 90 | `terraform fmt` 91 | `terraform validate` 92 | `terraform apply` 93 | 94 | 4. Start Airflow: 95 | 96 | Set up Airflow and deploy the DAGs: 97 | 98 | * cd into the **airflow directory**. 99 | 100 | * Download docker-compose.yaml file using 101 | 102 | `curl -LfO 'https://airflow.apache.org/docs/apache-airflow/2.10.3/docker-compose.yaml'` 103 | 104 | * Create your Dockerfile, .env file, and your requirements.txt file. 105 | 106 | * set up your DAG. 107 | 108 | * In the terminal run `docker-compose up -d` to start up your containers. 109 | 110 | * Go to your localhost:8080 to view and run your dag. 111 | 112 | PS: Place the DAG files in the dags folder and configure Airflow connections for Redshift and S3. 113 | 114 | 5. Run dbt Models: 115 | 116 | Navigate to your dbt project folder and execute: 117 | 118 | `dbt run` 119 | 120 | # Challenges and Solutions 121 | 122 | * **Challenge 1**: Terraform Variable Errors 123 | 124 | **Solution**: Refactored variable definitions and ensured proper passing of values between modules. 125 | 126 | * **Challenge 2**: Airflow Redshift Connection 127 | 128 | **Solution**: Configured Redshift connection settings in Airflow with accurate credentials and endpoint details. 129 | 130 | * **Challenge 3**: dbt Execution Errors 131 | 132 | **Solution**: Debugged errors using dbt logs and ensured proper schema configurations in dbt_project.yml. 133 | 134 | # Future Improvements 135 | **Monitoring**: Implement AWS CloudWatch for pipeline monitoring and alerting. 
136 | 137 | **Scaling**: Extend the pipeline to integrate AWS Glue for more complex ETL processes 138 | 139 | 140 | # Power Point presentation 141 | To have a view of the project's data architecture and understand the **Why** for my choice of tools 🛠️, 142 | you can access the power point slides [here](https://github.com/Chisomnwa/CDE_Capstone_Project/blob/main/Travel_Agency_Project_Slides.pdf). 143 | 144 | # Medium Article 145 | Here's a [medium article](https://medium.com/towards-data-engineering/how-i-built-a-travel-agency-data-platform-27e81a5dd668) that I have written on this project. It gives the full view and the detailed steps on how to complete 146 | this project. You will enjoy it. 147 | 148 | -------------------------------------------------------------------------------- /airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.10.5 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. 31 | # Default: . 32 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 33 | # 34 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 35 | # Default: airflow 36 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 37 | # Default: airflow 38 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 39 | # Use this option ONLY for quick checks. Installing requirements at container 40 | # startup is done EVERY TIME the service is started. 41 | # A better way is to build a custom image or extend the official image 42 | # as described in https://airflow.apache.org/docs/docker-stack/build.html. 43 | # Default: '' 44 | # 45 | # Feel free to modify this file to suit your needs. 46 | --- 47 | x-airflow-common: 48 | &airflow-common 49 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 
50 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 51 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 52 | image: ${AIRFLOW_IMAGE_NAME:-airflow_awswrangler} 53 | # build: . 54 | environment: 55 | &airflow-common-env 56 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 57 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 58 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 59 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 60 | AIRFLOW__CORE__FERNET_KEY: '' 61 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 62 | AIRFLOW__CORE__LOAD_EXAMPLES: 'true' 63 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 64 | # yamllint disable rule:line-length 65 | # Use simple http server on scheduler for health checks 66 | # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server 67 | # yamllint enable rule:line-length 68 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' 69 | # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks 70 | # for other purpose (development, test and especially production usage) build/extend Airflow image. 71 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 72 | # The following line can be used to set a custom config file, stored in the local config folder 73 | # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file 74 | # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg' 75 | volumes: 76 | - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags 77 | - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs 78 | - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config 79 | - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins 80 | user: "${AIRFLOW_UID:-50000}:0" 81 | depends_on: 82 | &airflow-common-depends-on 83 | redis: 84 | condition: service_healthy 85 | postgres: 86 | condition: service_healthy 87 | 88 | services: 89 | postgres: 90 | image: postgres:13 91 | environment: 92 | POSTGRES_USER: airflow 93 | POSTGRES_PASSWORD: airflow 94 | POSTGRES_DB: airflow 95 | volumes: 96 | - postgres-db-volume:/var/lib/postgresql/data 97 | healthcheck: 98 | test: ["CMD", "pg_isready", "-U", "airflow"] 99 | interval: 10s 100 | retries: 5 101 | start_period: 5s 102 | restart: always 103 | 104 | redis: 105 | # Redis is limited to 7.2-bookworm due to licencing change 106 | # https://redis.io/blog/redis-adopts-dual-source-available-licensing/ 107 | image: redis:7.2-bookworm 108 | expose: 109 | - 6379 110 | healthcheck: 111 | test: ["CMD", "redis-cli", "ping"] 112 | interval: 10s 113 | timeout: 30s 114 | retries: 50 115 | start_period: 30s 116 | restart: always 117 | 118 | airflow-webserver: 119 | <<: *airflow-common 120 | command: webserver 121 | ports: 122 | - "8080:8080" 123 | healthcheck: 124 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 125 | interval: 30s 126 | timeout: 10s 127 | retries: 5 128 | start_period: 30s 129 | restart: always 130 | depends_on: 131 | <<: *airflow-common-depends-on 132 | airflow-init: 133 | condition: service_completed_successfully 134 | 135 | airflow-scheduler: 136 | <<: *airflow-common 137 | command: scheduler 138 | healthcheck: 139 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] 140 | interval: 30s 141 | timeout: 10s 142 | retries: 5 143 | 
start_period: 30s 144 | restart: always 145 | depends_on: 146 | <<: *airflow-common-depends-on 147 | airflow-init: 148 | condition: service_completed_successfully 149 | 150 | airflow-worker: 151 | <<: *airflow-common 152 | command: celery worker 153 | healthcheck: 154 | # yamllint disable rule:line-length 155 | test: 156 | - "CMD-SHELL" 157 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 158 | interval: 30s 159 | timeout: 10s 160 | retries: 5 161 | start_period: 30s 162 | environment: 163 | <<: *airflow-common-env 164 | # Required to handle warm shutdown of the celery workers properly 165 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 166 | DUMB_INIT_SETSID: "0" 167 | restart: always 168 | depends_on: 169 | <<: *airflow-common-depends-on 170 | airflow-init: 171 | condition: service_completed_successfully 172 | 173 | airflow-triggerer: 174 | <<: *airflow-common 175 | command: triggerer 176 | healthcheck: 177 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 178 | interval: 30s 179 | timeout: 10s 180 | retries: 5 181 | start_period: 30s 182 | restart: always 183 | depends_on: 184 | <<: *airflow-common-depends-on 185 | airflow-init: 186 | condition: service_completed_successfully 187 | 188 | airflow-init: 189 | <<: *airflow-common 190 | entrypoint: /bin/bash 191 | # yamllint disable rule:line-length 192 | command: 193 | - -c 194 | - | 195 | if [[ -z "${AIRFLOW_UID}" ]]; then 196 | echo 197 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 198 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 199 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 200 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 201 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 202 | echo 203 | fi 204 | one_meg=1048576 205 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 206 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 207 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 208 | warning_resources="false" 209 | if (( mem_available < 4000 )) ; then 210 | echo 211 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 212 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 213 | echo 214 | warning_resources="true" 215 | fi 216 | if (( cpus_available < 2 )); then 217 | echo 218 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 219 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 220 | echo 221 | warning_resources="true" 222 | fi 223 | if (( disk_available < one_meg * 10 )); then 224 | echo 225 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 226 | echo "At least 10 GBs recommended. 
You have $$(numfmt --to iec $$((disk_available * 1024 )))" 227 | echo 228 | warning_resources="true" 229 | fi 230 | if [[ $${warning_resources} == "true" ]]; then 231 | echo 232 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 233 | echo "Please follow the instructions to increase amount of resources available:" 234 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 235 | echo 236 | fi 237 | mkdir -p /sources/logs /sources/dags /sources/plugins 238 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 239 | exec /entrypoint airflow version 240 | # yamllint enable rule:line-length 241 | environment: 242 | <<: *airflow-common-env 243 | _AIRFLOW_DB_MIGRATE: 'true' 244 | _AIRFLOW_WWW_USER_CREATE: 'true' 245 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 246 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 247 | _PIP_ADDITIONAL_REQUIREMENTS: '' 248 | user: "0:0" 249 | volumes: 250 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 251 | 252 | airflow-cli: 253 | <<: *airflow-common 254 | profiles: 255 | - debug 256 | environment: 257 | <<: *airflow-common-env 258 | CONNECTION_CHECK_MAX_COUNT: "0" 259 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 260 | command: 261 | - bash 262 | - -c 263 | - airflow 264 | 265 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up 266 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 267 | # See: https://docs.docker.com/compose/profiles/ 268 | flower: 269 | <<: *airflow-common 270 | command: celery flower 271 | profiles: 272 | - flower 273 | ports: 274 | - "5555:5555" 275 | healthcheck: 276 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 277 | interval: 30s 278 | timeout: 10s 279 | retries: 5 280 | start_period: 30s 281 | restart: always 282 | depends_on: 283 | <<: *airflow-common-depends-on 284 | airflow-init: 285 | condition: service_completed_successfully 286 | 287 | volumes: 288 | postgres-db-volume: 289 | --------------------------------------------------------------------------------