├── .github
│   └── workflows
│       ├── ci_cd.yml
│       └── destroy-infra.yml
├── .gitignore
├── Infra
│   ├── locals.tf
│   ├── main.tf
│   ├── modules
│   │   ├── eventbridge
│   │   │   ├── main.tf
│   │   │   ├── output.tf
│   │   │   └── variables.tf
│   │   ├── glue_catalog_database
│   │   │   ├── main.tf
│   │   │   ├── output.tf
│   │   │   └── variables.tf
│   │   ├── glue_catalog_table
│   │   │   ├── main.tf
│   │   │   ├── output.tf
│   │   │   └── variables.tf
│   │   ├── glue_classifier
│   │   │   ├── main.tf
│   │   │   ├── output.tf
│   │   │   └── variables.tf
│   │   ├── glue_crawler
│   │   │   ├── main.tf
│   │   │   ├── output.tf
│   │   │   └── variables.tf
│   │   ├── glue_iam
│   │   │   ├── main.tf
│   │   │   └── output.tf
│   │   ├── glue_job
│   │   │   ├── main.tf
│   │   │   ├── output.tf
│   │   │   └── variables.tf
│   │   ├── glue_trigger
│   │   │   ├── main.tf
│   │   │   ├── output.tf
│   │   │   └── variables.tf
│   │   ├── lambda
│   │   │   ├── main.tf
│   │   │   ├── output.tf
│   │   │   └── variables.tf
│   │   ├── request_layer
│   │   │   ├── main.tf
│   │   │   ├── output.tf
│   │   │   └── variables.tf
│   │   └── s3
│   │       ├── main.tf
│   │       ├── output.tf
│   │       └── variables.tf
│   └── providers.tf
├── Makefile
├── README.md
├── env
│   └── base.env
├── etl
│   ├── extract
│   │   ├── System
│   │   │   ├── LocalLocation.py
│   │   │   ├── location.py
│   │   │   └── workspace.py
│   │   ├── __init__.py
│   │   ├── extract_data.py
│   │   └── system_config.yml
│   ├── glue_etl_job
│   │   ├── __init__.py
│   │   └── transform_data.py
│   └── load
│       ├── __init__.py
│       └── load_data.py
├── images
│   └── architecture.png
├── requirements.txt
└── terraform.tfstate
/.github/workflows/ci_cd.yml:
--------------------------------------------------------------------------------
1 | name: "Terraform action"
2 | on:
3 |   push:
4 |     branches:
5 |       - main
6 |   pull_request:
7 | permissions:
8 |   id-token: write # This is required for aws oidc connection
9 |   contents: read # This is required for actions/checkout
10 |   pull-requests: write # This is required for gh bot to comment PR
11 | env:
12 |   TF_LOG: INFO
13 |   AWS_REGION: ${{ secrets.AWS_REGION }}
14 | jobs:
15 |   deploy:
16 |     runs-on: ubuntu-latest
17 |     defaults:
18 |       run:
19 |         shell: bash
20 |         working-directory: .
21 |     steps:
22 |       - name: Git checkout
23 |         uses: actions/checkout@v3
24 |
25 |       - name: Configure AWS credentials from AWS account
26 |         uses: aws-actions/configure-aws-credentials@v1
27 |         with:
28 |           role-to-assume: ${{ secrets.AWS_ROLE }}
29 |           aws-region: ${{ secrets.AWS_REGION }}
30 |           role-session-name: GitHub-OIDC-TERRAFORM
31 |
32 |       - name: Setup Terraform
33 |         uses: hashicorp/setup-terraform@v2
34 |         with:
35 |           terraform_version: 1.7.5
36 |
37 |       - name: Terraform fmt
38 |         id: fmt
39 |         run: terraform fmt -check
40 |         continue-on-error: true
41 |
42 |       - name: Terraform Init
43 |         id: init
44 |         env:
45 |           AWS_BUCKET_NAME: ${{ secrets.AWS_BUCKET_NAME }}
46 |           AWS_BUCKET_KEY_NAME: ${{ secrets.AWS_BUCKET_KEY_NAME }}
47 |         run: make terraform-init
48 |
49 |       - name: Terraform Validate
50 |         id: validate
51 |         run: make terraform-validate
52 |
53 |       - name: Terraform Plan
54 |         id: plan
55 |         run: make terraform-plan
56 |         if: github.event_name == 'pull_request'
57 |         continue-on-error: true
58 |
59 |       - uses: actions/github-script@v6
60 |         if: github.event_name == 'pull_request'
61 |         env:
62 |           PLAN: "terraform\n${{ steps.plan.outputs.stdout }}"
63 |         with:
64 |           github-token: ${{ secrets.GITHUB_TOKEN }}
65 |           script: |
66 |             const output = `#### Terraform Format and Style 🖌\`${{ steps.fmt.outcome }}\`
67 |             #### Terraform Initialization ⚙️\`${{ steps.init.outcome }}\`
68 |             #### Terraform Validation 🤖\`${{ steps.validate.outcome }}\`
69 |             <details><summary>Validation Output</summary>
70 |
71 |             \`\`\`\n
72 |             ${{ steps.validate.outputs.stdout }}
73 |             \`\`\`
74 |
75 |             </details>
76 |
77 |             #### Terraform Plan 📖\`${{ steps.plan.outcome }}\`
78 |
79 |             <details><summary>Show Plan</summary>
80 |
81 |             \`\`\`\n
82 |             ${process.env.PLAN}
83 |             \`\`\`
84 |
85 |             </details>
86 |
87 |             *Pushed by: @${{ github.actor }}, Action: \`${{ github.event_name }}\`*`;
88 |
89 |             github.rest.issues.createComment({
90 |               issue_number: context.issue.number,
91 |               owner: context.repo.owner,
92 |               repo: context.repo.repo,
93 |               body: output
94 |             })
95 |
96 |       - name: Terraform Plan Status
97 |         if: steps.plan.outcome == 'failure'
98 |         run: exit 1
99 |
100 |       - name: Terraform Apply
101 |         if: github.ref == 'refs/heads/main' && github.event_name == 'push'
102 |         run: make terraform-apply
--------------------------------------------------------------------------------
/.github/workflows/destroy-infra.yml:
--------------------------------------------------------------------------------
1 | name: "Terraform destroy"
2 | on:
3 |   push:
4 |     branches:
5 |       - feat*/*
6 |   workflow_dispatch:
7 | permissions:
8 |   id-token: write # This is required for aws oidc connection
9 |   contents: read # This is required for actions/checkout
10 |   pull-requests: write # This is required for gh bot to comment PR
11 | env:
12 |   TF_LOG: INFO
13 |   AWS_REGION: ${{ secrets.AWS_REGION }}
14 | jobs:
15 |   deploy:
16 |     runs-on: ubuntu-latest
17 |     defaults:
18 |       run:
19 |         shell: bash
20 |         working-directory: .
21 |     steps:
22 |       - name: Git checkout
23 |         uses: actions/checkout@v3
24 |
25 |       - name: Configure AWS credentials from AWS account
26 |         uses: aws-actions/configure-aws-credentials@v1
27 |         with:
28 |           role-to-assume: ${{ secrets.AWS_ROLE }}
29 |           aws-region: ${{ secrets.AWS_REGION }}
30 |           role-session-name: GitHub-OIDC-TERRAFORM
31 |
32 |       - name: Setup Terraform
33 |         uses: hashicorp/setup-terraform@v2
34 |         with:
35 |           terraform_version: 1.7.5
36 |
37 |       - name: Terraform fmt
38 |         id: fmt
39 |         run: terraform fmt -check
40 |         continue-on-error: true
41 |
42 |       - name: Terraform Init
43 |         id: init
44 |         env:
45 |           AWS_BUCKET_NAME: ${{ secrets.AWS_BUCKET_NAME }}
46 |           AWS_BUCKET_KEY_NAME: ${{ secrets.AWS_BUCKET_KEY_NAME }}
47 |         run: make terraform-init
48 |
49 |       - name: Terraform Destroy
50 |         id: destroy
51 |         run: make terraform-destroy
52 |
53 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | venv/*
3 |
4 | Infra/requirements
5 | Infra/requirements/*
6 |
7 | Infra/.terraform
8 | Infra/.terraform/*
9 |
10 | Infra/lambda_function_extract_data.zip
11 | Infra/requirements.zip
12 |
13 | .DS_Store
14 | Infra/.terraform.lock.hcl
15 | Infra/terraform.tfstate.backup
16 | Infra/terraform.tfstate
17 |
18 | # Local .terraform directories
19 | **/.terraform/*
20 |
21 | # .tfstate files
22 | *.tfstate
23 | *.tfstate.*
24 |
25 | # Crash log files
26 | crash.log
27 | crash.*.log
28 |
29 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as
30 | # password, private keys, and other secrets. These should not be part of version
31 | # control as they are data points which are potentially sensitive and subject
32 | # to change depending on the environment.
33 | *.tfvars
34 | *.tfvars.json
35 |
36 | # Ignore override files as they are usually used to override resources locally and so
37 | # are not checked in
38 | override.tf
39 | override.tf.json
40 | *_override.tf
41 | *_override.tf.json
42 |
43 | # Include override files you do wish to add to version control using negated pattern
44 | # !example_override.tf
45 |
46 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
47 | # example: *tfplan*
48 |
49 | # Ignore CLI configuration files
50 | .terraformrc
51 | terraform.rc
52 |
--------------------------------------------------------------------------------
/Infra/locals.tf:
--------------------------------------------------------------------------------
1 | locals {
2 |
3 |   #buckets
4 |   lambda_layer_bucket_name = "my-lambda-layer-bucket-001"
5 |   lambda_layer = "lambda_layer"
6 |   rapid_api_host = "zillow56.p.rapidapi.com"
7 |   rapid_api_key = "XXXX"
8 |   bucket_name = "real-estate-etl-101"
9 |   raw_repertory = "raw_data"
10 |   std_repertory = "std_data"
11 |   aws_region = "eu-west-3"
12 |
13 |   utils_bucket = "real-estate-etl-utils"
14 |   glue_script_key = "script/glue_etl_script.py"
15 |   glue_local_script_path = "../etl/glue_etl_job/transform_data.py"
16 |
17 |   # first method layer
18 |   layer_zip_path = "python.zip"
19 |   layer_name = "my_lambda_requirements_layer"
20 |   requirements_path = "../requirements.txt"
21 |
22 |   path_to_system_folder = "../etl/extract/System"
23 |
24 |   compatible_layer_runtimes = ["python3.10"]
25 |   compatible_architectures = ["x86_64"]
26 |
27 |   # lambda
28 |   path_to_source_folder = "../etl/extract"
29 |   #path_to_source_file = "../etl/extract"
30 |   path_to_output = "lambda_function_extract_data.zip"
31 |   function_name = "lambda_extract_fromAPI"
32 |   function_handler = "extract_data.lambda_handler"
33 |   memory_size = 512
34 |   timeout = 300
35 |   runtime = "python3.10"
36 |
37 |   # Glue catalog
38 |   glue_catalog_database_name = "real-estate-database"
39 |
40 |   # iam
41 |
42 |   # Glue Crawler
43 |   glue_Crawler_Name = "real_estate_crawler"
44 |   houston_crawler_name = "real_estate_houston_crawler"
45 |   panamera_crawler_name = "real_estate_panamera_crawler"
46 |   houston = "houston"
47 |   panamera = "pasadena"
48 |
49 |   # Glue Classifier
50 |   classifier_name = "real_estate_classifier"
51 |   json_path = "$[*]"
52 |
53 |   # Glue Job
54 |   glue_job_name = "real_estate_job"
55 |   glue_version = "4.0"
56 |   worker_type = "G.1X"
57 |   number_of_workers = 2
58 |   time_out = 2880
59 |   script_location = ""
60 |   class = "GlueApp"
61 |   enable-job-insights = "true"
62 |   enable-auto-scaling = "false"
63 |   enable-glue-datacatalog = "true"
64 |   job-language = "python"
65 |   job-bookmark-option = "job-bookmark-disable"
66 |   datalake-formats = "iceberg"
67 |   conf = "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.warehouse=s3://tnt-erp-sql/ --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO"
68 |
69 |   # cloudwatch
70 |   schedule_name = "schedule"
71 |   schedule_value = "cron(0 8 ? * MON-FRI *)"
72 |
73 |   # Glue Trigger
74 |   glue_trigger_name = "realestate-glue-job-trigger"
75 |   glue_trigger_schedule_type = "SCHEDULED"
76 |   glue_trigger_schedule_value = "cron(15 12 * * ? *)"
77 |
78 | }
--------------------------------------------------------------------------------
/Infra/main.tf:
--------------------------------------------------------------------------------
1 | module "s3bucket" {
2 |   source = "./modules/s3"
3 |
4 |   bucket_name = local.bucket_name
5 |   raw_repertory = local.raw_repertory
6 |   std_repertory = local.std_repertory
7 |
8 |   utils_bucket_name = local.utils_bucket
9 |   glue_script_key = local.glue_script_key
10 |   glue_local_script_path = local.glue_local_script_path
11 |
12 | }
13 |
14 | module "lambdaLayer" {
15 |   source = "./modules/request_layer"
16 |
17 |   requirements_path = local.requirements_path
18 |   layer_zip_path = local.layer_zip_path
19 |   layer_name = local.layer_name
20 |
21 |   path_to_system_folder = local.path_to_system_folder
22 |
23 |   lambda_layer_bucket_name = local.lambda_layer_bucket_name
24 |   lambda_layer = local.lambda_layer
25 |
26 |   #path_to_request_layer_source = local.path_to_request_layer_source
27 |   #path_to_request_layer_artifact = local.path_to_request_layer_artifact
28 |
29 |   #path_to_request_layer_filename = local.path_to_request_layer_filename
30 |   #request_layer_name = local.request_layer_name
31 |
32 |
33 |   #path_to_request_layer_source = local.path_to_request_layer_source
34 |   #path_to_request_layer_artifact = local.path_to_request_layer_artifact
35 |
36 |   #path_to_request_layer_filename = local.path_to_request_layer_filename
37 |   #request_layer_name = local.request_layer_name
38 |
39 |   compatible_layer_runtimes = local.compatible_layer_runtimes
40 |   compatible_architectures = local.compatible_architectures
41 |
42 | }
43 |
44 | module "lambdaFunction" {
45 |   source = "./modules/lambda"
46 |
47 |   path_to_source_folder = local.path_to_source_folder
48 |   path_to_output = local.path_to_output
49 |   function_name = local.function_name
50 |   function_handler = local.function_handler
51 |   memory_size = local.memory_size
52 |   timeout = local.timeout
53 |   runtime = local.runtime
54 |   rapid_api_host = local.rapid_api_host
55 |   rapid_api_key = local.rapid_api_key
56 |   bucket_name = local.bucket_name
57 |   raw_repertory = local.raw_repertory
58 |   lambda_layer_arns = [module.lambdaLayer.lamnda_layer_arn]
59 |   aws_region = local.aws_region
60 |   s3_bucket_arn = module.s3bucket.s3_etl_bucket_arn
61 |
62 | }
63 |
64 | module "cloudwatch_schedule_module" {
65 |   source = "./modules/eventbridge"
66 |   schedule_name = local.schedule_name
67 |   schedule_value = local.schedule_value
68 |   aws_lambda_arn = module.lambdaFunction.lambda_function_arn
69 |   aws_lambda_function_name = module.lambdaFunction.lambda_function_name
70 | }
71 |
72 | module "glueCatalogDatabase" {
73 |   source = "./modules/glue_catalog_database"
74 |
75 |   glue_catalog_database_name = local.glue_catalog_database_name
76 | }
77 |
78 | module "glueIamRole" {
79 |   source = "./modules/glue_iam"
80 |
81 | }
82 |
83 | module "glueClassifier" {
84 |   source = "./modules/glue_classifier"
85 |   classifier_name = local.classifier_name
86 |   json_path = local.json_path
87 |
88 | }
89 |
90 | module "glueCrawler" {
91 |   source = "./modules/glue_crawler"
92 |
93 |   database = module.glueCatalogDatabase.database_name
94 |   houston_crawler_name = local.houston_crawler_name
95 |   panamera_crawler_name = local.panamera_crawler_name
96 |
97 |   houston = local.houston
98 |   panamera = local.panamera
99 |
100 |   #name = local.glue_Crawler_Name
101 |   glue_iam_role = module.glueIamRole.glue_iam_arn
102 |
103 |   classifiers = [module.glueClassifier.aws_glue_classifier_id]
104 |   s3_target_path_panamera = module.s3bucket.aws_s3_bucket_uri
105 |   s3_target_path_houston = module.s3bucket.aws_s3_bucket_uri
106 |   #s3_target_path = module.s3bucket.aws_s3_bucket_uri
107 | }
108 |
109 | module "glueJob" {
110 |   source = "./modules/glue_job"
111 |
112 |   name = local.glue_job_name
113 |   iam_glue_arn = module.glueIamRole.glue_iam_arn
114 |   glue_version = local.glue_version
115 |   #worker_type = local.worker_type
116 |   script_location = module.s3bucket.aws_s3_bucket_glue_script_uri
117 |   timeout = local.time_out
118 |   class = local.class
119 |   enable-job-insights = local.enable-job-insights
120 |   enable-auto-scaling = local.enable-auto-scaling
121 |   enable-glue-datacatalog = local.enable-glue-datacatalog
122 |   job-language = local.job-language
123 |   job-bookmark-option = local.job-bookmark-option
124 |   datalake-formats = local.datalake-formats
125 |   conf = local.conf
126 |
127 | }
128 |
129 | module "glueTrigger" {
130 |   source = "./modules/glue_trigger"
131 |
132 |   name = local.glue_trigger_name
133 |   schedule_type = local.glue_trigger_schedule_type
134 |   schedule_value = local.schedule_value
135 |   job_name = module.glueJob.aws_glue_job_name
136 | }
137 |
138 |
139 |
140 |
141 | /*
142 | ### lambda
143 |
144 | data "aws_iam_policy_document" "lambda_assume_role" {
145 |   statement {
146 |
147 |     effect = "Allow"
148 |
149 |     principals {
150 |       type = "Service"
151 |       identifiers = ["lambda.amazonaws.com"]
152 |     }
153 |
154 |     actions = ["sts:AssumeRole"]
155 |   }
156 | }
157 |
158 | #define variables
159 | locals {
160 |   layer_zip_path = "requirements.zip"
161 |   layer_name = "my_lambda_requirements_layer"
162 |   requirements_path = "../requirements.txt"
163 | }
164 |
165 | # create zip file from requirements.txt. Triggers only when the file is updated
166 | resource "null_resource" "lambda_layer" {
167 |   triggers = {
168 |     requirements = filesha1(local.requirements_path)
169 |   }
170 |   # the command to install python and dependencies to the machine and zips
171 |   provisioner "local-exec" {
172 | command = < Workspace:
18 | return Workspace(**loader.construct_mapping(node))
--------------------------------------------------------------------------------
/etl/extract/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/extract/__init__.py
--------------------------------------------------------------------------------
/etl/extract/extract_data.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import pathlib
4 | from datetime import datetime
5 | from pathlib import Path
6 |
7 | import boto3
8 | import requests
9 | import yaml
10 | from System.LocalLocation import LocalLocation
11 | from System.workspace import Workspace
12 |
13 | DST_BUCKET = os.environ.get("DST_BUCKET")
14 | REGION = os.environ.get("REGION")
15 | RAW_FOLDER = os.environ.get("RAW_FOLDER")
16 | API_KEY = os.environ.get("API_KEY")
17 | API_HOST = os.environ.get("API_HOST")
18 | URL = "https://zillow56.p.rapidapi.com/search"
19 |
20 |
21 | # build the list of cities
22 | # country = ["houston", "pasadena", "Katy", "Cypress"]
23 | country = ["houston", "pasadena"]
24 |
25 | s3 = boto3.client("s3", region_name=REGION)
26 |
27 |
28 | def lambda_handler(event, context):
29 |     create_s3_directories_based_on_city(s3, DST_BUCKET, country, RAW_FOLDER)
30 |
31 |     date = get_time()[1]
32 |
33 |     populate_database_table_s3_bucket(s3, DST_BUCKET, date, country, RAW_FOLDER)
34 |
35 |
36 | # create directories based on city name
37 | def create_s3_directories_based_on_city(
38 |     s3, bucket_name, city_name_list, database_name_s3
39 | ):
40 |
41 |     for city_name in city_name_list:
42 |         table_name_s3_prefix = str(database_name_s3) + "/" + str(city_name)
43 |
44 |         # check if the s3 object already exists
45 |         try:
46 |             s3.head_object(Bucket=bucket_name, Key=table_name_s3_prefix)
47 |         except s3.exceptions.ClientError as e:
48 |             if e.response["Error"]["Code"] == "404":
49 |                 # key doesn't exist
50 |                 s3.put_object(Bucket=bucket_name, Key=(table_name_s3_prefix + "/"))
51 |
52 |                 pass
53 |             else:
54 |                 # Key exists, do nothing
55 |                 pass
56 |
57 |
58 | def get_time():
59 |     dt = datetime.now()
60 |     timestamp = str(datetime.timestamp(dt)).replace(".", "_")
61 |     return timestamp, dt.strftime("%Y-%m-%d")
62 |
63 |
64 | def fetch_api_data(url, query):
65 |     headers = {
66 |         # set the API key and host from the environment
67 |         "X-RapidAPI-Key": API_KEY,
68 |         "X-RapidAPI-Host": API_HOST,
69 |     }
70 |     response = requests.get(url, headers=headers, params=query)
71 |
72 |     if response.status_code == 200:
73 |         data = json.loads(response.text)
74 |         return data["results"]
75 |     else:
76 |         raise Exception(f"Error fetching data: {response.text}")
77 |
78 |
79 | def populate_database_table_s3_bucket(
80 |     s3, bucket_name, date, city_name_list, database_name
81 | ):
82 |
83 |     try:
84 |         for table_name in city_name_list:
85 |             file_name = f"{table_name}_{date}.json"
86 |             query = {"location": f"{table_name}, tx"}
87 |             # fetch the data
88 |             data = fetch_api_data(URL, query)
89 |             s3_object_key = f"{database_name}/{table_name}/{date}/{file_name}"
90 |             try:
91 |                 s3.put_object(
92 |                     Bucket=bucket_name, Key=s3_object_key, Body=json.dumps(data)
93 |                 )
94 |             except s3.exceptions.ClientError as e:
95 |                 raise Exception(
96 |                     f"Error uploading data to S3: {e}"
97 |                 ) from e  # Re-raise with more context
98 |
99 |     except Exception as e:
100 |         print(f"Error populating table '{table_name}': {e}")
101 |
102 |
103 | """
104 | def create_workspace_objects(config_file_path="system_config.yml"):
105 |     local = LocalLocation()
106 |     list_workspace_object = []
107 |     if os.path.exists(config_file_path):
108 |         list_workspace = local.readConfigFile(config_file_path)
109 |         for workspace in list_workspace:
110 |             list_workspace_object.append(
111 |                 Workspace(workspace.database, workspace.table_name)
112 |             )
113 |     return list_workspace_object
114 |
115 | def create_s3_directories(bucket_name, workspace_object):
116 |     s3 = boto3.client("s3")
117 |     database_name_s3 = RAW_FOLDER
118 |     # database_name_s3 = workspace_object.get_database()
119 |     table_name_s3 = workspace_object.get_table_name()
120 |     # database_name_s3_prefix = str(database_name_s3)
121 |     table_name_s3_prefix = str(database_name_s3) + "/" + str(table_name_s3)
122 |     try:
123 |         # s3.put_object(Bucket=bucket_name, Key=(database_name_s3_prefix + '/'))
124 |         s3.put_object(Bucket=bucket_name, Key=(table_name_s3_prefix + "/"))
125 |     except s3.exceptions.ClientError as e:
126 |         if e.response["Error"]["Code"] == "404":
127 |             pass
128 |
129 | cpt = 0
130 | while cpt < len(list_workspace_object):
131 |     try:
132 |         workspace_object = None
133 |         workspace_object = list_workspace_object[cpt]
134 |         database_name = RAW_FOLDER
135 |         # database_name = workspace_object.get_database()
136 |         table_name = workspace_object.get_table_name()
137 |         file_name = f"{table_name}_{date}.json"
138 |         query = {"location": f"{table_name}, tx"}
139 |         data = fetch_api_data(URL, query)
140 |
141 |         print(table_name, file_name, query, data)
142 |         # data = {"test": "alla"}
143 |         # s3_object_key = f"{database_name}/{table_name}/{date}/{file_name}"
144 |
145 |         # Convert data to a byte stream (assuming it's serializable)
146 |
147 |
148 |         if isinstance(data, dict):
149 |             data_bytes = json.dumps(data, ensure_ascii=False).encode("utf-8")
150 |         else:
151 |             raise TypeError("Data must be a dictionary serializable to JSON")
152 |         try:
153 |             s3.put_object(Bucket=bucket_name, Key=s3_object_key, Body=data_bytes)
154 |             cpt += 1
155 |         except ClientError as e:
156 |             raise Exception(
157 |                 f"Error uploading data to S3: {e}"
158 |             ) from e  # Re-raise with more context
159 |
160 |     except Exception as e:
161 |         print(f"Error populating table '{table_name}': {e}")
162 |
163 |
164 | def create_local_directories(workspace_object):
165 |     database_name = workspace_object.get_database()
166 |     table_name = workspace_object.get_table_name()
167 |     if not os.path.exists(database_name):
168 |         os.makedirs(database_name)
169 |     table_dir = os.path.join(database_name, table_name)
170 |     if not os.path.exists(table_dir):
171 |         os.makedirs(table_dir)  # Create the table directory
172 |
173 | def populate_database_table_local(data, date, list_workspace_object):
174 |     for workspace_object in list_workspace_object:
175 |         try:
176 |             database_name = workspace_object.get_database()
177 |             table_name = workspace_object.get_table_name()
178 |             file_name = f"{table_name}_{date}.json"
179 |             table_partitioned = pathlib.Path(f"/Users/XXX/Desktop/GLUE/{database_name}/{table_name}/{date}")
180 |             table_partitioned.mkdir(parents=True, exist_ok=True)
181 |             with open(table_partitioned / file_name, 'w') as file:
182 |                 json.dump(data, file)
183 |         except Exception as e:
184 |             print(f"Error populating table '{table_name}': {e}")
185 | """
186 |
--------------------------------------------------------------------------------
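
Note: the extractor above writes one JSON file per city and per run date into the raw zone, building the key inline in populate_database_table_s3_bucket as <RAW_FOLDER>/<city>/<date>/<city>_<date>.json. A minimal, self-contained sketch of that layout follows; the build_raw_key helper is hypothetical and only mirrors the inline f-string in the Lambda code.

from datetime import datetime

# Hypothetical helper mirroring the inline key construction in
# populate_database_table_s3_bucket(): <raw_folder>/<city>/<date>/<city>_<date>.json
def build_raw_key(raw_folder: str, city: str, date: str) -> str:
    file_name = f"{city}_{date}.json"
    return f"{raw_folder}/{city}/{date}/{file_name}"

if __name__ == "__main__":
    today = datetime.now().strftime("%Y-%m-%d")  # same date format as get_time()
    for city in ["houston", "pasadena"]:
        print(build_raw_key("raw_data", city, today))
    # e.g. raw_data/houston/2024-05-18/houston_2024-05-18.json

--------------------------------------------------------------------------------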
/etl/extract/system_config.yml:
--------------------------------------------------------------------------------
1 | Workspaces:
2 |   - !Workspace
3 |     database: raw_data
4 |     table_name: houston
5 |   - !Workspace
6 |     database: raw_data
7 |     table_name: pasadena
8 |   # - !Workspace
9 |   #   database: raw_data
10 |   #   table_name: katy
11 |   # - !Workspace
12 |   #   database: raw_data
13 |   #   table_name: Cypress
14 |
--------------------------------------------------------------------------------
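
Note: system_config.yml uses a custom !Workspace YAML tag. The System/workspace.py module is not included in this excerpt (only the stray `return Workspace(**loader.construct_mapping(node))` fragment survives earlier in the dump), so below is a minimal sketch of how such a tag constructor could be registered with PyYAML. The Workspace class shape and the constructor/function names are assumptions; only the construct_mapping call is taken from the surviving fragment.

import yaml

# Assumed shape of the Workspace class; the real System/workspace.py is not shown here.
class Workspace:
    def __init__(self, database, table_name):
        self.database = database
        self.table_name = table_name

    def get_database(self):
        return self.database

    def get_table_name(self):
        return self.table_name

# Constructor for the !Workspace tag; the return line matches the surviving fragment.
def workspace_constructor(loader, node) -> Workspace:
    return Workspace(**loader.construct_mapping(node))

yaml.SafeLoader.add_constructor("!Workspace", workspace_constructor)

with open("system_config.yml") as f:
    config = yaml.safe_load(f)

for ws in config["Workspaces"]:
    print(ws.get_database(), ws.get_table_name())

--------------------------------------------------------------------------------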
/etl/glue_etl_job/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/glue_etl_job/__init__.py
--------------------------------------------------------------------------------
/etl/glue_etl_job/transform_data.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from awsglue.context import GlueContext
4 | from awsglue.dynamicframe import DynamicFrame
5 | from awsglue.job import Job
6 | from awsglue.transforms import *
7 | from awsglue.utils import getResolvedOptions
8 | from pyspark.context import SparkContext
9 | from pyspark.sql import functions as F
10 | from pyspark.sql.functions import col, expr, first
11 |
12 | sc = SparkContext.getOrCreate()
13 | glueContext = GlueContext(sc)
14 | spark = glueContext.spark_session
15 | job = Job(glueContext)
16 |
17 |
18 | def extract_houston_from_catalog(database, houston_table_name):
19 |     raw_houston_dynamic_frame = glueContext.create_dynamic_frame.from_catalog(
20 |         database=database, table_name=houston_table_name
21 |     )
22 |     df = raw_houston_dynamic_frame.toDF()
23 |     return df
24 |
25 | def drop_columns(df):
26 |     # drop struct-typed columns from the table
27 |     cols = ("listing_sub_type", "open_house_info")
28 |
29 |     df_drops = df.drop(*cols)
30 |     return df_drops
31 |
32 |
33 | def group_data(df):
34 |     # group the data by zipcode
35 |     df_group = (
36 |         df.groupBy("zipcode", "state", "city", "country", "currency")
37 |         .agg(
38 |             F.count("*").alias("Total Zipcodes"),
39 |             F.avg("bathrooms").alias("avg_bathrooms"),
40 |             F.avg("bedrooms").alias("avg_bedrooms"),
41 |             F.mean(col("price") / col("livingArea")).alias("avg_price_per_sqft"),
42 |         )
43 |         .orderBy("zipcode")
44 |     )
45 |     return df_group
46 |
47 |
48 | def load_to_s3(glue_dynamic_frame):
49 |     s3output = glueContext.getSink(
50 |         path="s3://real-estate-etl-101/std_data/",
51 |         connection_type="s3",
52 |         updateBehavior="UPDATE_IN_DATABASE",
53 |         partitionKeys=[],
54 |         compression="snappy",
55 |         enableUpdateCatalog=True,
56 |         transformation_ctx="s3output",
57 |     )
58 |
59 |     s3output.setCatalogInfo(
60 |         catalogDatabase="real-estate-database", catalogTableName="immo_report"
61 |     )
62 |
63 |     s3output.setFormat("glueparquet")
64 |     s3output.writeFrame(glue_dynamic_frame)
65 |
66 |
67 | if __name__ == "__main__":
68 |     database = "real-estate-database"
69 |     houston_table_name = "immo_houston"
70 |     df_houston = extract_houston_from_catalog(database, houston_table_name)
71 |
72 |     df_drops = drop_columns(df_houston)
73 |
74 |     df_final = group_data(df_drops)
75 |
76 |     # going from Spark dataframe to glue dynamic frame
77 |
78 |     glue_dynamic_frame = DynamicFrame.fromDF(df_final, glueContext, "glue_etl")
79 |
80 |     # load to s3
81 |     load_to_s3(glue_dynamic_frame)
82 |
83 |     job.commit()
--------------------------------------------------------------------------------
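
Note: the heart of this Glue job is the group_data aggregation (listing count, average bathrooms and bedrooms, and average price per square foot per zipcode). The sketch below reproduces that aggregation locally with plain PySpark, assuming pyspark is installed; the sample rows are made up for illustration, and the Glue-specific pieces (DynamicFrame, catalog sink) are omitted.

# Local sketch of the aggregation in group_data(), using plain PySpark (no Glue).
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("group_data_sketch").getOrCreate()

# Sample rows mimicking the Zillow fields the job relies on (illustrative only).
rows = [
    ("77002", "TX", "Houston", "USA", "USD", 2.0, 3.0, 350000.0, 1750.0),
    ("77002", "TX", "Houston", "USA", "USD", 3.0, 4.0, 500000.0, 2500.0),
    ("77506", "TX", "Pasadena", "USA", "USD", 2.0, 3.0, 250000.0, 1600.0),
]
cols = ["zipcode", "state", "city", "country", "currency",
        "bathrooms", "bedrooms", "price", "livingArea"]
df = spark.createDataFrame(rows, cols)

# Same groupBy/agg chain as group_data() in transform_data.py.
report = (
    df.groupBy("zipcode", "state", "city", "country", "currency")
    .agg(
        F.count("*").alias("Total Zipcodes"),
        F.avg("bathrooms").alias("avg_bathrooms"),
        F.avg("bedrooms").alias("avg_bedrooms"),
        F.mean(F.col("price") / F.col("livingArea")).alias("avg_price_per_sqft"),
    )
    .orderBy("zipcode")
)
report.show()

--------------------------------------------------------------------------------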
/etl/load/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/load/__init__.py
--------------------------------------------------------------------------------
/etl/load/load_data.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/load/load_data.py
--------------------------------------------------------------------------------
/images/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/images/architecture.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | PyYAML
3 |
--------------------------------------------------------------------------------
/terraform.tfstate:
--------------------------------------------------------------------------------
1 | {
2 |   "version": 4,
3 |   "terraform_version": "1.7.5",
4 |   "serial": 1,
5 |   "lineage": "b41d31ed-d92b-6863-63ef-f1f090b6a6ac",
6 |   "outputs": {},
7 |   "resources": [],
8 |   "check_results": null
9 | }
10 |
--------------------------------------------------------------------------------