├── .github └── workflows │ ├── ci_cd.yml │ └── destroy-infra.yml ├── .gitignore ├── Infra ├── locals.tf ├── main.tf ├── modules │ ├── eventbridge │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_catalog_database │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_catalog_table │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_classifier │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_crawler │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_iam │ │ ├── main.tf │ │ └── output.tf │ ├── glue_job │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_trigger │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── lambda │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── request_layer │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ └── s3 │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf └── providers.tf ├── Makefile ├── README.md ├── env └── base.env ├── etl ├── extract │ ├── System │ │ ├── LocalLocation.py │ │ ├── location.py │ │ └── workspace.py │ ├── __init__.py │ ├── extract_data.py │ └── system_config.yml ├── glue_etl_job │ ├── __init__.py │ └── transform_data.py └── load │ ├── __init__.py │ └── load_data.py ├── images └── architecture.png ├── requirements.txt └── terraform.tfstate /.github/workflows/ci_cd.yml: -------------------------------------------------------------------------------- 1 | name: "Terraform action" 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | permissions: 8 | id-token: write # This is required for aws oidc connection 9 | contents: read # This is required for actions/checkout 10 | pull-requests: write # This is required for gh bot to comment PR 11 | env: 12 | TF_LOG: INFO 13 | AWS_REGION: ${{ secrets.AWS_REGION }} 14 | jobs: 15 | deploy: 16 | runs-on: ubuntu-latest 17 | defaults: 18 | run: 19 | shell: bash 20 | working-directory: . 21 | steps: 22 | - name: Git checkout 23 | uses: actions/checkout@v3 24 | 25 | - name: Configure AWS credentials from AWS account 26 | uses: aws-actions/configure-aws-credentials@v1 27 | with: 28 | role-to-assume: ${{ secrets.AWS_ROLE }} 29 | aws-region: ${{ secrets.AWS_REGION }} 30 | role-session-name: GitHub-OIDC-TERRAFORM 31 | 32 | - name: Setup Terraform 33 | uses: hashicorp/setup-terraform@v2 34 | with: 35 | terraform_version: 1.7.5 36 | 37 | - name: Terraform fmt 38 | id: fmt 39 | run: terraform fmt -check 40 | continue-on-error: true 41 | 42 | - name: Terraform Init 43 | id: init 44 | env: 45 | AWS_BUCKET_NAME: ${{ secrets.AWS_BUCKET_NAME }} 46 | AWS_BUCKET_KEY_NAME: ${{ secrets.AWS_BUCKET_KEY_NAME }} 47 | run: make terraform-init 48 | 49 | - name: Terraform Validate 50 | id: validate 51 | run: make terraform-validate 52 | 53 | - name: Terraform Plan 54 | id: plan 55 | run: make terraform-plan 56 | if: github.event_name == 'pull_request' 57 | continue-on-error: true 58 | 59 | - uses: actions/github-script@v6 60 | if: github.event_name == 'pull_request' 61 | env: 62 | PLAN: "terraform\n${{ steps.plan.outputs.stdout }}" 63 | with: 64 | github-token: ${{ secrets.GITHUB_TOKEN }} 65 | script: | 66 | const output = `#### Terraform Format and Style 🖌\`${{ steps.fmt.outcome }}\` 67 | #### Terraform Initialization ⚙️\`${{ steps.init.outcome }}\` 68 | #### Terraform Validation 🤖\`${{ steps.validate.outcome }}\` 69 |
<details><summary>Validation Output</summary> 70 | 71 | \`\`\`\n 72 | ${{ steps.validate.outputs.stdout }} 73 | \`\`\` 74 | 75 | </details>
76 | 77 | #### Terraform Plan 📖\`${{ steps.plan.outcome }}\` 78 | 79 |
<details><summary>Show Plan</summary> 80 | 81 | \`\`\`\n 82 | ${process.env.PLAN} 83 | \`\`\` 84 | 85 | </details>
86 | 87 | *Pushed by: @${{ github.actor }}, Action: \`${{ github.event_name }}\`*`; 88 | 89 | github.rest.issues.createComment({ 90 | issue_number: context.issue.number, 91 | owner: context.repo.owner, 92 | repo: context.repo.repo, 93 | body: output 94 | }) 95 | 96 | - name: Terraform Plan Status 97 | if: steps.plan.outcome == 'failure' 98 | run: exit 1 99 | 100 | - name: Terraform Apply 101 | if: github.ref == 'refs/heads/main' && github.event_name == 'push' 102 | run: make terraform-apply -------------------------------------------------------------------------------- /.github/workflows/destroy-infra.yml: -------------------------------------------------------------------------------- 1 | name: "Terraform destroy" 2 | on: 3 | push: 4 | branches: 5 | - feat*/* 6 | workflow_dispatch: 7 | permissions: 8 | id-token: write # This is required for aws oidc connection 9 | contents: read # This is required for actions/checkout 10 | pull-requests: write # This is required for gh bot to comment PR 11 | env: 12 | TF_LOG: INFO 13 | AWS_REGION: ${{ secrets.AWS_REGION }} 14 | jobs: 15 | deploy: 16 | runs-on: ubuntu-latest 17 | defaults: 18 | run: 19 | shell: bash 20 | working-directory: . 21 | steps: 22 | - name: Git checkout 23 | uses: actions/checkout@v3 24 | 25 | - name: Configure AWS credentials from AWS account 26 | uses: aws-actions/configure-aws-credentials@v1 27 | with: 28 | role-to-assume: ${{ secrets.AWS_ROLE }} 29 | aws-region: ${{ secrets.AWS_REGION }} 30 | role-session-name: GitHub-OIDC-TERRAFORM 31 | 32 | - name: Setup Terraform 33 | uses: hashicorp/setup-terraform@v2 34 | with: 35 | terraform_version: 1.7.5 36 | 37 | - name: Terraform fmt 38 | id: fmt 39 | run: terraform fmt -check 40 | continue-on-error: true 41 | 42 | - name: Terraform Init 43 | id: init 44 | env: 45 | AWS_BUCKET_NAME: ${{ secrets.AWS_BUCKET_NAME }} 46 | AWS_BUCKET_KEY_NAME: ${{ secrets.AWS_BUCKET_KEY_NAME }} 47 | run: make terraform-init 48 | 49 | - name: Terraform Destroy 50 | id: destroy 51 | run: make terraform-destroy 52 | 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | venv/* 3 | 4 | Infra/requirements 5 | Infra/requirements/* 6 | 7 | Infra/.terraform 8 | Infra/.terraform/* 9 | 10 | Infra/lambda_function_extract_data.zip 11 | Infra/requirements.zip 12 | 13 | .DS_Store 14 | Infra/.terraform.lock.hcl 15 | Infra/terraform.tfstate.backup 16 | Infra/terraform.tfstate 17 | 18 | # Local .terraform directories 19 | **/.terraform/* 20 | 21 | # .tfstate files 22 | *.tfstate 23 | *.tfstate.* 24 | 25 | # Crash log files 26 | crash.log 27 | crash.*.log 28 | 29 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 30 | # password, private keys, and other secrets. These should not be part of version 31 | # control as they are data points which are potentially sensitive and subject 32 | # to change depending on the environment. 
33 | *.tfvars 34 | *.tfvars.json 35 | 36 | # Ignore override files as they are usually used to override resources locally and so 37 | # are not checked in 38 | override.tf 39 | override.tf.json 40 | *_override.tf 41 | *_override.tf.json 42 | 43 | # Include override files you do wish to add to version control using negated pattern 44 | # !example_override.tf 45 | 46 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 47 | # example: *tfplan* 48 | 49 | # Ignore CLI configuration files 50 | .terraformrc 51 | terraform.rc 52 | -------------------------------------------------------------------------------- /Infra/locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | 3 | #buckets 4 | lambda_layer_bucket_name = "my-lambda-layer-bucket-001" 5 | lambda_layer = "lambda_layer" 6 | rapid_api_host = "zillow56.p.rapidapi.com" 7 | rapid_api_key = "XXXX" 8 | bucket_name = "real-estate-etl-101" 9 | raw_repertory = "raw_data" 10 | std_repertory = "std_data" 11 | aws_region = "eu-west-3" 12 | 13 | utils_bucket = "real-estate-etl-utils" 14 | glue_script_key = "script/glue_etl_script.py" 15 | glue_local_script_path = "../etl/glue_etl_job/transform_data.py" 16 | 17 | # first method layer 18 | layer_zip_path = "python.zip" 19 | layer_name = "my_lambda_requirements_layer" 20 | requirements_path = "../requirements.txt" 21 | 22 | path_to_system_folder = "../etl/extract/System" 23 | 24 | compatible_layer_runtimes = ["python3.10"] 25 | compatible_architectures = ["x86_64"] 26 | 27 | # lambda 28 | path_to_source_folder = "../etl/extract" 29 | #path_to_source_file = "../etl/extract" 30 | path_to_output = "lambda_function_extract_data.zip" 31 | function_name = "lambda_extract_fromAPI" 32 | function_handler = "extract_data.lambda_handler" 33 | memory_size = 512 34 | timeout = 300 35 | runtime = "python3.10" 36 | 37 | # Glue catalog 38 | glue_catalog_database_name = "real-estate-database" 39 | 40 | # iam 41 | 42 | # Glue Crawler 43 | glue_Crawler_Name = "real_estate_crawler" 44 | houston_crawler_name = "real_estate_houston_crawler" 45 | panamera_crawler_name = "real_estate_panamera_crawler" 46 | houston = "houston" 47 | panamera = "pasadena" 48 | 49 | # Glue Classifier 50 | classifier_name = "real_estate_classifier" 51 | json_path = "$[*]" 52 | 53 | # Glue Job 54 | glue_job_name = "real_estate_job" 55 | glue_version = "4.0" 56 | worker_type = "G.1X" 57 | number_of_workers = 2 58 | time_out = 2880 59 | script_location = "" 60 | class = "GlueApp" 61 | enable-job-insights = "true" 62 | enable-auto-scaling = "false" 63 | enable-glue-datacatalog = "true" 64 | job-language = "python" 65 | job-bookmark-option = "job-bookmark-disable" 66 | datalake-formats = "iceberg" 67 | conf = "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.warehouse=s3://tnt-erp-sql/ --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO" 68 | 69 | # cloudwatch 70 | schedule_name = "schedule" 71 | schedule_value = "cron(0 8 ? * MON-FRI *)" 72 | 73 | # Glue Trigger 74 | glue_trigger_name = "realestate-glue-job-trigger" 75 | glue_trigger_schedule_type = "SCHEDULED" 76 | glue_trigger_schedule_value = "cron(15 12 * * ? 
*)" 77 | 78 | } -------------------------------------------------------------------------------- /Infra/main.tf: -------------------------------------------------------------------------------- 1 | module "s3bucket" { 2 | source = "./modules/s3" 3 | 4 | bucket_name = local.bucket_name 5 | raw_repertory = local.raw_repertory 6 | std_repertory = local.std_repertory 7 | 8 | utils_bucket_name = local.utils_bucket 9 | glue_script_key = local.glue_script_key 10 | glue_local_script_path = local.glue_local_script_path 11 | 12 | } 13 | 14 | module "lambdaLayer" { 15 | source = "./modules/request_layer" 16 | 17 | requirements_path = local.requirements_path 18 | layer_zip_path = local.layer_zip_path 19 | layer_name = local.layer_name 20 | 21 | path_to_system_folder = local.path_to_system_folder 22 | 23 | lambda_layer_bucket_name = local.lambda_layer_bucket_name 24 | lambda_layer = local.lambda_layer 25 | 26 | #path_to_request_layer_source = local.path_to_request_layer_source 27 | #path_to_request_layer_artifact = local.path_to_request_layer_artifact 28 | 29 | #path_to_request_layer_filename = local.path_to_request_layer_filename 30 | #request_layer_name = local.request_layer_name 31 | 32 | 33 | #path_to_request_layer_source = local.path_to_request_layer_source 34 | #path_to_request_layer_artifact = local.path_to_request_layer_artifact 35 | 36 | #path_to_request_layer_filename = local.path_to_request_layer_filename 37 | #request_layer_name = local.request_layer_name 38 | 39 | compatible_layer_runtimes = local.compatible_layer_runtimes 40 | compatible_architectures = local.compatible_architectures 41 | 42 | } 43 | 44 | module "lambdaFunction" { 45 | source = "./modules/lambda" 46 | 47 | path_to_source_folder = local.path_to_source_folder 48 | path_to_output = local.path_to_output 49 | function_name = local.function_name 50 | function_handler = local.function_handler 51 | memory_size = local.memory_size 52 | timeout = local.timeout 53 | runtime = local.runtime 54 | rapid_api_host = local.rapid_api_host 55 | rapid_api_key = local.rapid_api_key 56 | bucket_name = local.bucket_name 57 | raw_repertory = local.raw_repertory 58 | lambda_layer_arns = [module.lambdaLayer.lamnda_layer_arn] 59 | aws_region = local.aws_region 60 | s3_bucket_arn = module.s3bucket.s3_etl_bucket_arn 61 | 62 | } 63 | 64 | module "cloudwatch_schedule_module" { 65 | source = "./modules/eventbridge" 66 | schedule_name = local.schedule_name 67 | schedule_value = local.schedule_value 68 | aws_lambda_arn = module.lambdaFunction.lambda_function_arn 69 | aws_lambda_function_name = module.lambdaFunction.lambda_function_name 70 | } 71 | 72 | module "glueCatalogDatabase" { 73 | source = "./modules/glue_catalog_database" 74 | 75 | glue_catalog_database_name = local.glue_catalog_database_name 76 | } 77 | 78 | module "glueIamRole" { 79 | source = "./modules/glue_iam" 80 | 81 | } 82 | 83 | module "glueClassifier" { 84 | source = "./modules/glue_classifier" 85 | classifier_name = local.classifier_name 86 | json_path = local.json_path 87 | 88 | } 89 | 90 | module "glueCrawler" { 91 | source = "./modules/glue_crawler" 92 | 93 | database = module.glueCatalogDatabase.database_name 94 | houston_crawler_name = local.houston_crawler_name 95 | panamera_crawler_name = local.panamera_crawler_name 96 | 97 | houston = local.houston 98 | panamera = local.panamera 99 | 100 | #name = local.glue_Crawler_Name 101 | glue_iam_role = module.glueIamRole.glue_iam_arn 102 | 103 | classifiers = [module.glueClassifier.aws_glue_classifier_id] 104 | s3_target_path_panamera = 
module.s3bucket.aws_s3_bucket_uri 105 | s3_target_path_houston = module.s3bucket.aws_s3_bucket_uri 106 | #s3_target_path = module.s3bucket.aws_s3_bucket_uri 107 | } 108 | 109 | module "glueJob" { 110 | source = "./modules/glue_job" 111 | 112 | name = local.glue_job_name 113 | iam_glue_arn = module.glueIamRole.glue_iam_arn 114 | glue_version = local.glue_version 115 | #worker_type = local.worker_type 116 | script_location = module.s3bucket.aws_s3_bucket_glue_script_uri 117 | timeout = local.time_out 118 | class = local.class 119 | enable-job-insights = local.enable-job-insights 120 | enable-auto-scaling = local.enable-auto-scaling 121 | enable-glue-datacatalog = local.enable-glue-datacatalog 122 | job-language = local.job-language 123 | job-bookmark-option = local.job-bookmark-option 124 | datalake-formats = local.datalake-formats 125 | conf = local.conf 126 | 127 | } 128 | 129 | module "glueTrigger" { 130 | source = "./modules/glue_trigger" 131 | 132 | name = local.glue_trigger_name 133 | schedule_type = local.glue_trigger_schedule_type 134 | schedule_value = local.schedule_value 135 | job_name = module.glueJob.aws_glue_job_name 136 | } 137 | 138 | 139 | 140 | 141 | /* 142 | ### lambda 143 | 144 | data "aws_iam_policy_document" "lambda_assume_role" { 145 | statement { 146 | 147 | effect = "Allow" 148 | 149 | principals { 150 | type = "Service" 151 | identifiers = ["lambda.amazonaws.com"] 152 | } 153 | 154 | actions = ["sts:AssumeRole"] 155 | } 156 | } 157 | 158 | #define variables 159 | locals { 160 | layer_zip_path = "requirements.zip" 161 | layer_name = "my_lambda_requirements_layer" 162 | requirements_path = "../requirements.txt" 163 | } 164 | 165 | # create zip file from requirements.txt. Triggers only when the file is updated 166 | resource "null_resource" "lambda_layer" { 167 | triggers = { 168 | requirements = filesha1(local.requirements_path) 169 | } 170 | # the command to install python and dependencies to the machine and zips 171 | provisioner "local-exec" { 172 | command = < Workspace: 18 | return Workspace(**loader.construct_mapping(node)) -------------------------------------------------------------------------------- /etl/extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/extract/__init__.py -------------------------------------------------------------------------------- /etl/extract/extract_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | from datetime import datetime 5 | from pathlib import Path 6 | 7 | import boto3 8 | import requests 9 | import yaml 10 | from System.LocalLocation import LocalLocation 11 | from System.workspace import Workspace 12 | 13 | DST_BUCKET = os.environ.get("DST_BUCKET") 14 | REGION = os.environ.get("REGION") 15 | RAW_FOLDER = os.environ.get("RAW_FOLDER") 16 | API_KEY = os.environ.get("API_KEY") 17 | API_HOST = os.environ.get("API_HOST") 18 | URL = "https://zillow56.p.rapidapi.com/search" 19 | 20 | 21 | # creer la liste des villes 22 | # country = ["houston", "pasadena", "Katy", "Cypress"] 23 | country = ["houston", "pasadena"] 24 | 25 | s3 = boto3.client("s3", region_name=REGION) 26 | 27 | 28 | def lambda_handler(event, context): 29 | create_s3_directories_based_on_city(s3, DST_BUCKET, country, RAW_FOLDER) 30 | 31 | date = get_time()[1] 32 | 33 | populate_database_table_s3_bucket(s3, DST_BUCKET, date, 
country, RAW_FOLDER) 34 | 35 | 36 | # create directories based on city name 37 | def create_s3_directories_based_on_city( 38 | s3, bucket_name, city_name_list, database_name_s3 39 | ): 40 | 41 | for city_name in city_name_list: 42 | table_name_s3_prefix = str(database_name_s3) + "/" + str(city_name) 43 | 44 | # check if s3 object already exists 45 | try: 46 | s3.head_object(Bucket=bucket_name, Key=table_name_s3_prefix) 47 | except s3.exceptions.ClientError as e: 48 | if e.response["Error"]["Code"] == "404": 49 | # key doesn't exist 50 | s3.put_object(Bucket=bucket_name, Key=(table_name_s3_prefix + "/")) 51 | 52 | pass 53 | else: 54 | # Key exists, do nothing 55 | pass 56 | 57 | 58 | def get_time(): 59 | dt = datetime.now() 60 | timestamp = str(datetime.timestamp(dt)).replace(".", "_") 61 | return timestamp, dt.strftime("%Y-%m-%d") 62 | 63 | 64 | def fetch_api_data(url, query): 65 | headers = { 66 | # set the API key and API host 67 | "X-RapidAPI-Key": API_KEY, 68 | "X-RapidAPI-Host": API_HOST, 69 | } 70 | response = requests.get(url, headers=headers, params=query) 71 | 72 | if response.status_code == 200: 73 | data = json.loads(response.text) 74 | return data["results"] 75 | else: 76 | raise Exception(f"Error fetching data: {response.text}") 77 | 78 | 79 | def populate_database_table_s3_bucket( 80 | s3, bucket_name, date, city_name_list, database_name 81 | ): 82 | 83 | try: 84 | for table_name in city_name_list: 85 | file_name = f"{table_name}_{date}.json" 86 | query = {"location": f"{table_name}, tx"} 87 | # fetching data 88 | data = fetch_api_data(URL, query) 89 | s3_object_key = f"{database_name}/{table_name}/{date}/{file_name}" 90 | try: 91 | s3.put_object( 92 | Bucket=bucket_name, Key=s3_object_key, Body=json.dumps(data) 93 | ) 94 | except s3.exceptions.ClientError as e: 95 | raise Exception( 96 | f"Error uploading data to S3: {e}" 97 | ) from e # Re-raise with more context 98 | 99 | except Exception as e: 100 | print(f"Error populating table '{table_name}': {e}") 101 | 102 | 103 | """ 104 | def create_workspace_objects(config_file_path="system_config.yml"): 105 | local = LocalLocation() 106 | list_workspace_object = [] 107 | if os.path.exists(config_file_path): 108 | list_workspace = local.readConfigFile(config_file_path) 109 | for workspace in list_workspace: 110 | list_workspace_object.append( 111 | Workspace(workspace.database, workspace.table_name) 112 | ) 113 | return list_workspace_object 114 | 115 | def create_s3_directories(bucket_name, workspace_object): 116 | s3 = boto3.client("s3") 117 | database_name_s3 = RAW_FOLDER 118 | # database_name_s3 = workspace_object.get_database() 119 | table_name_s3 = workspace_object.get_table_name() 120 | # database_name_s3_prefix = str(database_name_s3) 121 | table_name_s3_prefix = str(database_name_s3) + "/" + str(table_name_s3) 122 | try: 123 | # s3.put_object(Bucket=bucket_name, Key=(database_name_s3_prefix + '/')) 124 | s3.put_object(Bucket=bucket_name, Key=(table_name_s3_prefix + "/")) 125 | except s3.exceptions.ClientError as e: 126 | if e.response["Error"]["Code"] == "404": 127 | pass 128 | 129 | cpt = 0 130 | while cpt < len(list_workspace_object): 131 | try: 132 | workspace_object = None 133 | workspace_object = list_workspace_object[cpt] 134 | database_name = RAW_FOLDER 135 | # database_name = workspace_object.get_database() 136 | table_name = workspace_object.get_table_name() 137 | file_name = f"{table_name}_{date}.json" 138 | query = {"location": f"{table_name}, tx"} 139 | data = fetch_api_data(URL, query) 140 | 141 | print(table_name,
file_name, query, data) 142 | # data = {"test": "alla"} 143 | # s3_object_key = f"{database_name}/{table_name}/{date}/{file_name}" 144 | 145 | # Convert data to a byte stream (assuming it's serializable) 146 | 147 | 148 | if isinstance(data, dict): 149 | data_bytes = json.dumps(data, ensure_ascii=False).encode("utf-8") 150 | else: 151 | raise TypeError("Data must be a dictionary serializable to JSON") 152 | try: 153 | s3.put_object(Bucket=bucket_name, Key=s3_object_key, Body=data_bytes) 154 | cpt += 1 155 | except ClientError as e: 156 | raise Exception( 157 | f"Error uploading data to S3: {e}" 158 | ) from e # Re-raise with more context 159 | 160 | except Exception as e: 161 | print(f"Error populating table '{table_name}': {e}") 162 | 163 | 164 | def create_local_directories(workspace_object): 165 | database_name = workspace_object.get_database() 166 | table_name = workspace_object.get_table_name() 167 | if not os.path.exists(database_name): 168 | os.makedirs(database_name) 169 | table_dir = os.path.join(database_name, table_name) 170 | if not os.path.exists(table_dir): 171 | os.makedirs(table_dir) # Create the table directory 172 | 173 | def populate_database_table_local(data,date,list_workspace_object): 174 | for workspace_object in list_workspace_object: 175 | try: 176 | database_name = workspace_object.get_database() 177 | table_name = workspace_object.get_table_name() 178 | file_name = f"{table_name}_{date}.json" 179 | table_partitioned = pathlib.Path(f"/Users/XXX/Desktop/GLUE/{database_name}/{table_name}/{date}") 180 | table_partitioned.mkdir(parents=True, exist_ok=True) 181 | with open(table_partitioned / file_name, 'w') as file: 182 | json.dump(data, file) 183 | except Exception as e: 184 | print(f"Error populating table '{table_name}': {e}") 185 | """ 186 | -------------------------------------------------------------------------------- /etl/extract/system_config.yml: -------------------------------------------------------------------------------- 1 | Workspaces: 2 | - !Workspace 3 | database: raw_data 4 | table_name: houston 5 | - !Workspace 6 | database: raw_data 7 | table_name: pasadena 8 | # - !Workspace 9 | # database: raw_data 10 | # table_name: katy 11 | # - !Workspace 12 | # database: raw_data 13 | # table_name: Cypress 14 | -------------------------------------------------------------------------------- /etl/glue_etl_job/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/glue_etl_job/__init__.py -------------------------------------------------------------------------------- /etl/glue_etl_job/transform_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from awsglue.context import GlueContext 4 | from awsglue.dynamicframe import DynamicFrame 5 | from awsglue.job import Job 6 | from awsglue.transforms import * 7 | from awsglue.utils import getResolvedOptions 8 | from pyspark.context import SparkContext 9 | from pyspark.sql import functions as F 10 | from pyspark.sql.functions import col, expr, first 11 | 12 | sc = SparkContext.getOrCreate() 13 | glueContext = GlueContext(sc) 14 | spark = glueContext.spark_session 15 | job = Job(glueContext) 16 | 17 | 18 | def extract_houston_from_catalog(database, houston_table_name): 19 | raw_houston_dynamic_frame = glueContext.create_dynamic_frame.from_catalog( 20 | database=database, table_name=houston_table_name 21 | ) 22 | df = 
raw_houston_dynamic_frame.toDF() 23 | return df 24 | 25 | def drop_columns(df): 26 | # drop from table columns with struct type 27 | cols = ("listing_sub_type", "open_house_info") 28 | 29 | df_drops= df.drop(*cols) 30 | return df_drops 31 | 32 | 33 | def group_data(df): 34 | #"group the data by zipcode" 35 | df_group = ( 36 | df.groupBy("zipcode","state","city","country","currency") 37 | .agg( 38 | F.count("*").alias("Total Zipcodes"), 39 | F.avg("bathrooms").alias("avg_bathrooms"), 40 | F.avg("bedrooms").alias("avg_bedrooms"), 41 | F.mean(col("price") / col("livingArea")).alias("avg_price_per_sqft"), 42 | ) 43 | .orderBy("zipcode") 44 | ) 45 | return df_group 46 | 47 | 48 | def load_to_s3(glue_dynamic_frame): 49 | s3output = glueContext.getSink( 50 | path="s3://real-estate-etl-101/std_data/", 51 | connection_type="s3", 52 | updateBehavior="UPDATE_IN_DATABASE", 53 | partitionKeys=[], 54 | compression="snappy", 55 | enableUpdateCatalog=True, 56 | transformation_ctx="s3output", 57 | ) 58 | 59 | s3output.setCatalogInfo( 60 | catalogDatabase="real-estate-database", catalogTableName="immo_report" 61 | ) 62 | 63 | s3output.setFormat("glueparquet") 64 | s3output.writeFrame(glue_dynamic_frame) 65 | 66 | 67 | if __name__ == "__main__": 68 | database = "real-estate-database" 69 | houston_table_name = "immo_houston" 70 | df_houston = extract_houston_from_catalog(database, houston_table_name) 71 | 72 | df_drops = drop_columns(df_houston) 73 | 74 | df_final = group_data(df_drops) 75 | 76 | # going from Spark dataframe to glue dynamic frame 77 | 78 | glue_dynamic_frame = DynamicFrame.fromDF(df_final, glueContext, "glue_etl") 79 | 80 | # load to s3 81 | load_to_s3(glue_dynamic_frame) 82 | 83 | job.commit() -------------------------------------------------------------------------------- /etl/load/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/load/__init__.py -------------------------------------------------------------------------------- /etl/load/load_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/load/load_data.py -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/images/architecture.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | PyYAML 3 | -------------------------------------------------------------------------------- /terraform.tfstate: -------------------------------------------------------------------------------- 1 | { 2 | "version": 4, 3 | "terraform_version": "1.7.5", 4 | "serial": 1, 5 | "lineage": "b41d31ed-d92b-6863-63ef-f1f090b6a6ac", 6 | "outputs": {}, 7 | "resources": [], 8 | "check_results": null 9 | } 10 | --------------------------------------------------------------------------------