├── .github └── workflows │ ├── ci_cd.yml │ └── destroy-infra.yml ├── .gitignore ├── Infra ├── locals.tf ├── main.tf ├── modules │ ├── eventbridge │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_catalog_database │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_catalog_table │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_classifier │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_crawler │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_iam │ │ ├── main.tf │ │ └── output.tf │ ├── glue_job │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── glue_trigger │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── lambda │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── request_layer │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf │ └── s3 │ │ ├── main.tf │ │ ├── output.tf │ │ └── variables.tf └── providers.tf ├── Makefile ├── README.md ├── env └── base.env ├── etl ├── extract │ ├── System │ │ ├── LocalLocation.py │ │ ├── location.py │ │ └── workspace.py │ ├── __init__.py │ ├── extract_data.py │ └── system_config.yml ├── glue_etl_job │ ├── __init__.py │ └── transform_data.py └── load │ ├── __init__.py │ └── load_data.py ├── images └── architecture.png ├── requirements.txt └── terraform.tfstate /.github/workflows/ci_cd.yml: -------------------------------------------------------------------------------- 1 | name: "Terraform action" 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | permissions: 8 | id-token: write # This is required for aws oidc connection 9 | contents: read # This is required for actions/checkout 10 | pull-requests: write # This is required for gh bot to comment PR 11 | env: 12 | TF_LOG: INFO 13 | AWS_REGION: ${{ secrets.AWS_REGION }} 14 | jobs: 15 | deploy: 16 | runs-on: ubuntu-latest 17 | defaults: 18 | run: 19 | shell: bash 20 | working-directory: . 21 | steps: 22 | - name: Git checkout 23 | uses: actions/checkout@v3 24 | 25 | - name: Configure AWS credentials from AWS account 26 | uses: aws-actions/configure-aws-credentials@v1 27 | with: 28 | role-to-assume: ${{ secrets.AWS_ROLE }} 29 | aws-region: ${{ secrets.AWS_REGION }} 30 | role-session-name: GitHub-OIDC-TERRAFORM 31 | 32 | - name: Setup Terraform 33 | uses: hashicorp/setup-terraform@v2 34 | with: 35 | terraform_version: 1.7.5 36 | 37 | - name: Terraform fmt 38 | id: fmt 39 | run: terraform fmt -check 40 | continue-on-error: true 41 | 42 | - name: Terraform Init 43 | id: init 44 | env: 45 | AWS_BUCKET_NAME: ${{ secrets.AWS_BUCKET_NAME }} 46 | AWS_BUCKET_KEY_NAME: ${{ secrets.AWS_BUCKET_KEY_NAME }} 47 | run: make terraform-init 48 | 49 | - name: Terraform Validate 50 | id: validate 51 | run: make terraform-validate 52 | 53 | - name: Terraform Plan 54 | id: plan 55 | run: make terraform-plan 56 | if: github.event_name == 'pull_request' 57 | continue-on-error: true 58 | 59 | - uses: actions/github-script@v6 60 | if: github.event_name == 'pull_request' 61 | env: 62 | PLAN: "terraform\n${{ steps.plan.outputs.stdout }}" 63 | with: 64 | github-token: ${{ secrets.GITHUB_TOKEN }} 65 | script: | 66 | const output = `#### Terraform Format and Style 🖌\`${{ steps.fmt.outcome }}\` 67 | #### Terraform Initialization ⚙️\`${{ steps.init.outcome }}\` 68 | #### Terraform Validation 🤖\`${{ steps.validate.outcome }}\` 69 |
<details><summary>Validation Output</summary> 70 | 71 | \`\`\`\n 72 | ${{ steps.validate.outputs.stdout }} 73 | \`\`\` 74 | 75 | </details>
76 | 77 | #### Terraform Plan 📖\`${{ steps.plan.outcome }}\` 78 | 79 |
<details><summary>Show Plan</summary> 80 | 81 | \`\`\`\n 82 | ${process.env.PLAN} 83 | \`\`\` 84 | 85 | </details>
86 | 87 | *Pushed by: @${{ github.actor }}, Action: \`${{ github.event_name }}\`*`; 88 | 89 | github.rest.issues.createComment({ 90 | issue_number: context.issue.number, 91 | owner: context.repo.owner, 92 | repo: context.repo.repo, 93 | body: output 94 | }) 95 | 96 | - name: Terraform Plan Status 97 | if: steps.plan.outcome == 'failure' 98 | run: exit 1 99 | 100 | - name: Terraform Apply 101 | if: github.ref == 'refs/heads/main' && github.event_name == 'push' 102 | run: make terraform-apply -------------------------------------------------------------------------------- /.github/workflows/destroy-infra.yml: -------------------------------------------------------------------------------- 1 | name: "Terraform destroy" 2 | on: 3 | push: 4 | branches: 5 | - feat*/* 6 | workflow_dispatch: 7 | permissions: 8 | id-token: write # This is required for aws oidc connection 9 | contents: read # This is required for actions/checkout 10 | pull-requests: write # This is required for gh bot to comment PR 11 | env: 12 | TF_LOG: INFO 13 | AWS_REGION: ${{ secrets.AWS_REGION }} 14 | jobs: 15 | deploy: 16 | runs-on: ubuntu-latest 17 | defaults: 18 | run: 19 | shell: bash 20 | working-directory: . 21 | steps: 22 | - name: Git checkout 23 | uses: actions/checkout@v3 24 | 25 | - name: Configure AWS credentials from AWS account 26 | uses: aws-actions/configure-aws-credentials@v1 27 | with: 28 | role-to-assume: ${{ secrets.AWS_ROLE }} 29 | aws-region: ${{ secrets.AWS_REGION }} 30 | role-session-name: GitHub-OIDC-TERRAFORM 31 | 32 | - name: Setup Terraform 33 | uses: hashicorp/setup-terraform@v2 34 | with: 35 | terraform_version: 1.7.5 36 | 37 | - name: Terraform fmt 38 | id: fmt 39 | run: terraform fmt -check 40 | continue-on-error: true 41 | 42 | - name: Terraform Init 43 | id: init 44 | env: 45 | AWS_BUCKET_NAME: ${{ secrets.AWS_BUCKET_NAME }} 46 | AWS_BUCKET_KEY_NAME: ${{ secrets.AWS_BUCKET_KEY_NAME }} 47 | run: make terraform-init 48 | 49 | - name: Terraform Destroy 50 | id: destroy 51 | run: make terraform-destroy 52 | 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | venv/* 3 | 4 | Infra/requirements 5 | Infra/requirements/* 6 | 7 | Infra/.terraform 8 | Infra/.terraform/* 9 | 10 | Infra/lambda_function_extract_data.zip 11 | Infra/requirements.zip 12 | 13 | .DS_Store 14 | Infra/.terraform.lock.hcl 15 | Infra/terraform.tfstate.backup 16 | Infra/terraform.tfstate 17 | 18 | # Local .terraform directories 19 | **/.terraform/* 20 | 21 | # .tfstate files 22 | *.tfstate 23 | *.tfstate.* 24 | 25 | # Crash log files 26 | crash.log 27 | crash.*.log 28 | 29 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 30 | # password, private keys, and other secrets. These should not be part of version 31 | # control as they are data points which are potentially sensitive and subject 32 | # to change depending on the environment. 
33 | *.tfvars 34 | *.tfvars.json 35 | 36 | # Ignore override files as they are usually used to override resources locally and so 37 | # are not checked in 38 | override.tf 39 | override.tf.json 40 | *_override.tf 41 | *_override.tf.json 42 | 43 | # Include override files you do wish to add to version control using negated pattern 44 | # !example_override.tf 45 | 46 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 47 | # example: *tfplan* 48 | 49 | # Ignore CLI configuration files 50 | .terraformrc 51 | terraform.rc 52 | -------------------------------------------------------------------------------- /Infra/locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | 3 | #buckets 4 | lambda_layer_bucket_name = "my-lambda-layer-bucket-001" 5 | lambda_layer = "lambda_layer" 6 | rapid_api_host = "zillow56.p.rapidapi.com" 7 | rapid_api_key = "XXXX" 8 | bucket_name = "real-estate-etl-101" 9 | raw_repertory = "raw_data" 10 | std_repertory = "std_data" 11 | aws_region = "eu-west-3" 12 | 13 | utils_bucket = "real-estate-etl-utils" 14 | glue_script_key = "script/glue_etl_script.py" 15 | glue_local_script_path = "../etl/glue_etl_job/transform_data.py" 16 | 17 | # first method layer 18 | layer_zip_path = "python.zip" 19 | layer_name = "my_lambda_requirements_layer" 20 | requirements_path = "../requirements.txt" 21 | 22 | path_to_system_folder = "../etl/extract/System" 23 | 24 | compatible_layer_runtimes = ["python3.10"] 25 | compatible_architectures = ["x86_64"] 26 | 27 | # lambda 28 | path_to_source_folder = "../etl/extract" 29 | #path_to_source_file = "../etl/extract" 30 | path_to_output = "lambda_function_extract_data.zip" 31 | function_name = "lambda_extract_fromAPI" 32 | function_handler = "extract_data.lambda_handler" 33 | memory_size = 512 34 | timeout = 300 35 | runtime = "python3.10" 36 | 37 | # Glue catalog 38 | glue_catalog_database_name = "real-estate-database" 39 | 40 | # iam 41 | 42 | # Glue Crawler 43 | glue_Crawler_Name = "real_estate_crawler" 44 | houston_crawler_name = "real_estate_houston_crawler" 45 | panamera_crawler_name = "real_estate_panamera_crawler" 46 | houston = "houston" 47 | panamera = "pasadena" 48 | 49 | # Glue Classifier 50 | classifier_name = "real_estate_classifier" 51 | json_path = "$[*]" 52 | 53 | # Glue Job 54 | glue_job_name = "real_estate_job" 55 | glue_version = "4.0" 56 | worker_type = "G.1X" 57 | number_of_workers = 2 58 | time_out = 2880 59 | script_location = "" 60 | class = "GlueApp" 61 | enable-job-insights = "true" 62 | enable-auto-scaling = "false" 63 | enable-glue-datacatalog = "true" 64 | job-language = "python" 65 | job-bookmark-option = "job-bookmark-disable" 66 | datalake-formats = "iceberg" 67 | conf = "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.warehouse=s3://tnt-erp-sql/ --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO" 68 | 69 | # cloudwatch 70 | schedule_name = "schedule" 71 | schedule_value = "cron(0 8 ? * MON-FRI *)" 72 | 73 | # Glue Trigger 74 | glue_trigger_name = "realestate-glue-job-trigger" 75 | glue_trigger_schedule_type = "SCHEDULED" 76 | glue_trigger_schedule_value = "cron(15 12 * * ? 
*)" 77 | 78 | } -------------------------------------------------------------------------------- /Infra/main.tf: -------------------------------------------------------------------------------- 1 | module "s3bucket" { 2 | source = "./modules/s3" 3 | 4 | bucket_name = local.bucket_name 5 | raw_repertory = local.raw_repertory 6 | std_repertory = local.std_repertory 7 | 8 | utils_bucket_name = local.utils_bucket 9 | glue_script_key = local.glue_script_key 10 | glue_local_script_path = local.glue_local_script_path 11 | 12 | } 13 | 14 | module "lambdaLayer" { 15 | source = "./modules/request_layer" 16 | 17 | requirements_path = local.requirements_path 18 | layer_zip_path = local.layer_zip_path 19 | layer_name = local.layer_name 20 | 21 | path_to_system_folder = local.path_to_system_folder 22 | 23 | lambda_layer_bucket_name = local.lambda_layer_bucket_name 24 | lambda_layer = local.lambda_layer 25 | 26 | #path_to_request_layer_source = local.path_to_request_layer_source 27 | #path_to_request_layer_artifact = local.path_to_request_layer_artifact 28 | 29 | #path_to_request_layer_filename = local.path_to_request_layer_filename 30 | #request_layer_name = local.request_layer_name 31 | 32 | 33 | #path_to_request_layer_source = local.path_to_request_layer_source 34 | #path_to_request_layer_artifact = local.path_to_request_layer_artifact 35 | 36 | #path_to_request_layer_filename = local.path_to_request_layer_filename 37 | #request_layer_name = local.request_layer_name 38 | 39 | compatible_layer_runtimes = local.compatible_layer_runtimes 40 | compatible_architectures = local.compatible_architectures 41 | 42 | } 43 | 44 | module "lambdaFunction" { 45 | source = "./modules/lambda" 46 | 47 | path_to_source_folder = local.path_to_source_folder 48 | path_to_output = local.path_to_output 49 | function_name = local.function_name 50 | function_handler = local.function_handler 51 | memory_size = local.memory_size 52 | timeout = local.timeout 53 | runtime = local.runtime 54 | rapid_api_host = local.rapid_api_host 55 | rapid_api_key = local.rapid_api_key 56 | bucket_name = local.bucket_name 57 | raw_repertory = local.raw_repertory 58 | lambda_layer_arns = [module.lambdaLayer.lamnda_layer_arn] 59 | aws_region = local.aws_region 60 | s3_bucket_arn = module.s3bucket.s3_etl_bucket_arn 61 | 62 | } 63 | 64 | module "cloudwatch_schedule_module" { 65 | source = "./modules/eventbridge" 66 | schedule_name = local.schedule_name 67 | schedule_value = local.schedule_value 68 | aws_lambda_arn = module.lambdaFunction.lambda_function_arn 69 | aws_lambda_function_name = module.lambdaFunction.lambda_function_name 70 | } 71 | 72 | module "glueCatalogDatabase" { 73 | source = "./modules/glue_catalog_database" 74 | 75 | glue_catalog_database_name = local.glue_catalog_database_name 76 | } 77 | 78 | module "glueIamRole" { 79 | source = "./modules/glue_iam" 80 | 81 | } 82 | 83 | module "glueClassifier" { 84 | source = "./modules/glue_classifier" 85 | classifier_name = local.classifier_name 86 | json_path = local.json_path 87 | 88 | } 89 | 90 | module "glueCrawler" { 91 | source = "./modules/glue_crawler" 92 | 93 | database = module.glueCatalogDatabase.database_name 94 | houston_crawler_name = local.houston_crawler_name 95 | panamera_crawler_name = local.panamera_crawler_name 96 | 97 | houston = local.houston 98 | panamera = local.panamera 99 | 100 | #name = local.glue_Crawler_Name 101 | glue_iam_role = module.glueIamRole.glue_iam_arn 102 | 103 | classifiers = [module.glueClassifier.aws_glue_classifier_id] 104 | s3_target_path_panamera = 
module.s3bucket.aws_s3_bucket_uri 105 | s3_target_path_houston = module.s3bucket.aws_s3_bucket_uri 106 | #s3_target_path = module.s3bucket.aws_s3_bucket_uri 107 | } 108 | 109 | module "glueJob" { 110 | source = "./modules/glue_job" 111 | 112 | name = local.glue_job_name 113 | iam_glue_arn = module.glueIamRole.glue_iam_arn 114 | glue_version = local.glue_version 115 | #worker_type = local.worker_type 116 | script_location = module.s3bucket.aws_s3_bucket_glue_script_uri 117 | timeout = local.time_out 118 | class = local.class 119 | enable-job-insights = local.enable-job-insights 120 | enable-auto-scaling = local.enable-auto-scaling 121 | enable-glue-datacatalog = local.enable-glue-datacatalog 122 | job-language = local.job-language 123 | job-bookmark-option = local.job-bookmark-option 124 | datalake-formats = local.datalake-formats 125 | conf = local.conf 126 | 127 | } 128 | 129 | module "glueTrigger" { 130 | source = "./modules/glue_trigger" 131 | 132 | name = local.glue_trigger_name 133 | schedule_type = local.glue_trigger_schedule_type 134 | schedule_value = local.schedule_value 135 | job_name = module.glueJob.aws_glue_job_name 136 | } 137 | 138 | 139 | 140 | 141 | /* 142 | ### lambda 143 | 144 | data "aws_iam_policy_document" "lambda_assume_role" { 145 | statement { 146 | 147 | effect = "Allow" 148 | 149 | principals { 150 | type = "Service" 151 | identifiers = ["lambda.amazonaws.com"] 152 | } 153 | 154 | actions = ["sts:AssumeRole"] 155 | } 156 | } 157 | 158 | #define variables 159 | locals { 160 | layer_zip_path = "requirements.zip" 161 | layer_name = "my_lambda_requirements_layer" 162 | requirements_path = "../requirements.txt" 163 | } 164 | 165 | # create zip file from requirements.txt. Triggers only when the file is updated 166 | resource "null_resource" "lambda_layer" { 167 | triggers = { 168 | requirements = filesha1(local.requirements_path) 169 | } 170 | # the command to install python and dependencies to the machine and zips 171 | provisioner "local-exec" { 172 | command = < Workspace: 18 | return Workspace(**loader.construct_mapping(node)) -------------------------------------------------------------------------------- /etl/extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/extract/__init__.py -------------------------------------------------------------------------------- /etl/extract/extract_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | from datetime import datetime 5 | from pathlib import Path 6 | 7 | import boto3 8 | import requests 9 | import yaml 10 | from System.LocalLocation import LocalLocation 11 | from System.workspace import Workspace 12 | 13 | DST_BUCKET = os.environ.get("DST_BUCKET") 14 | REGION = os.environ.get("REGION") 15 | RAW_FOLDER = os.environ.get("RAW_FOLDER") 16 | API_KEY = os.environ.get("API_KEY") 17 | API_HOST = os.environ.get("API_HOST") 18 | URL = "https://zillow56.p.rapidapi.com/search" 19 | 20 | 21 | # creer la liste des villes 22 | # country = ["houston", "pasadena", "Katy", "Cypress"] 23 | country = ["houston", "pasadena"] 24 | 25 | s3 = boto3.client("s3", region_name=REGION) 26 | 27 | 28 | def lambda_handler(event, context): 29 | create_s3_directories_based_on_city(s3, DST_BUCKET, country, RAW_FOLDER) 30 | 31 | date = get_time()[1] 32 | 33 | populate_database_table_s3_bucket(s3, DST_BUCKET, date, 
country, RAW_FOLDER) 34 | 35 | 36 | # create directories based on city name 37 | def create_s3_directories_based_on_city( 38 | s3, bucket_name, city_name_list, database_name_s3 39 | ): 40 | 41 | for city_name in city_name_list: 42 | table_name_s3_prefix = str(database_name_s3) + "/" + str(city_name) 43 | 44 | # check if s3 object already exists 45 | try: 46 | s3.head_object(Bucket=bucket_name, Key=table_name_s3_prefix) 47 | except s3.exceptions.ClientError as e: 48 | if e.response["Error"]["Code"] == "404": 49 | # key doesn't exist 50 | s3.put_object(Bucket=bucket_name, Key=(table_name_s3_prefix + "/")) 51 | 52 | pass 53 | else: 54 | # Key exists, do nothing 55 | pass 56 | 57 | 58 | def get_time(): 59 | dt = datetime.now() 60 | timestamp = str(datetime.timestamp(dt)).replace(".", "_") 61 | return timestamp, dt.strftime("%Y-%m-%d") 62 | 63 | 64 | def fetch_api_data(url, query): 65 | headers = { 66 | # set the API key and API host 67 | "X-RapidAPI-Key": API_KEY, 68 | "X-RapidAPI-Host": API_HOST, 69 | } 70 | response = requests.get(url, headers=headers, params=query) 71 | 72 | if response.status_code == 200: 73 | data = json.loads(response.text) 74 | return data["results"] 75 | else: 76 | raise Exception(f"Error fetching data: {response.text}") 77 | 78 | 79 | def populate_database_table_s3_bucket( 80 | s3, bucket_name, date, city_name_list, database_name 81 | ): 82 | 83 | try: 84 | for table_name in city_name_list: 85 | file_name = f"{table_name}_{date}.json" 86 | query = {"location": f"{table_name}, tx"} 87 | # fetching data 88 | data = fetch_api_data(URL, query) 89 | s3_object_key = f"{database_name}/{table_name}/{date}/{file_name}" 90 | try: 91 | s3.put_object( 92 | Bucket=bucket_name, Key=s3_object_key, Body=json.dumps(data) 93 | ) 94 | except s3.exceptions.ClientError as e: 95 | raise Exception( 96 | f"Error uploading data to S3: {e}" 97 | ) from e # Re-raise with more context 98 | 99 | except Exception as e: 100 | print(f"Error populating table '{table_name}': {e}") 101 | 102 | 103 | """ 104 | def create_workspace_objects(config_file_path="system_config.yml"): 105 | local = LocalLocation() 106 | list_workspace_object = [] 107 | if os.path.exists(config_file_path): 108 | list_workspace = local.readConfigFile(config_file_path) 109 | for workspace in list_workspace: 110 | list_workspace_object.append( 111 | Workspace(workspace.database, workspace.table_name) 112 | ) 113 | return list_workspace_object 114 | 115 | def create_s3_directories(bucket_name, workspace_object): 116 | s3 = boto3.client("s3") 117 | database_name_s3 = RAW_FOLDER 118 | # database_name_s3 = workspace_object.get_database() 119 | table_name_s3 = workspace_object.get_table_name() 120 | # database_name_s3_prefix = str(database_name_s3) 121 | table_name_s3_prefix = str(database_name_s3) + "/" + str(table_name_s3) 122 | try: 123 | # s3.put_object(Bucket=bucket_name, Key=(database_name_s3_prefix + '/')) 124 | s3.put_object(Bucket=bucket_name, Key=(table_name_s3_prefix + "/")) 125 | except s3.exceptions.ClientError as e: 126 | if e.response["Error"]["Code"] == "404": 127 | pass 128 | 129 | cpt = 0 130 | while cpt < len(list_workspace_object): 131 | try: 132 | workspace_object = None 133 | workspace_object = list_workspace_object[cpt] 134 | database_name = RAW_FOLDER 135 | # database_name = workspace_object.get_database() 136 | table_name = workspace_object.get_table_name() 137 | file_name = f"{table_name}_{date}.json" 138 | query = {"location": f"{table_name}, tx"} 139 | data = fetch_api_data(URL, query) 140 | 141 | print(table_name,
file_name, query, data) 142 | # data = {"test": "alla"} 143 | # s3_object_key = f"{database_name}/{table_name}/{date}/{file_name}" 144 | 145 | # Convert data to a byte stream (assuming it's serializable) 146 | 147 | 148 | if isinstance(data, dict): 149 | data_bytes = json.dumps(data, ensure_ascii=False).encode("utf-8") 150 | else: 151 | raise TypeError("Data must be a dictionary serializable to JSON") 152 | try: 153 | s3.put_object(Bucket=bucket_name, Key=s3_object_key, Body=data_bytes) 154 | cpt += 1 155 | except ClientError as e: 156 | raise Exception( 157 | f"Error uploading data to S3: {e}" 158 | ) from e # Re-raise with more context 159 | 160 | except Exception as e: 161 | print(f"Error populating table '{table_name}': {e}") 162 | 163 | 164 | def create_local_directories(workspace_object): 165 | database_name = workspace_object.get_database() 166 | table_name = workspace_object.get_table_name() 167 | if not os.path.exists(database_name): 168 | os.makedirs(database_name) 169 | table_dir = os.path.join(database_name, table_name) 170 | if not os.path.exists(table_dir): 171 | os.makedirs(table_dir) # Create the table directory 172 | 173 | def populate_database_table_local(data,date,list_workspace_object): 174 | for workspace_object in list_workspace_object: 175 | try: 176 | database_name = workspace_object.get_database() 177 | table_name = workspace_object.get_table_name() 178 | file_name = f"{table_name}_{date}.json" 179 | table_partitioned = pathlib.Path(f"/Users/XXX/Desktop/GLUE/{database_name}/{table_name}/{date}") 180 | table_partitioned.mkdir(parents=True, exist_ok=True) 181 | with open(table_partitioned / file_name, 'w') as file: 182 | json.dump(data, file) 183 | except Exception as e: 184 | print(f"Error populating table '{table_name}': {e}") 185 | """ 186 | -------------------------------------------------------------------------------- /etl/extract/system_config.yml: -------------------------------------------------------------------------------- 1 | Workspaces: 2 | - !Workspace 3 | database: raw_data 4 | table_name: houston 5 | - !Workspace 6 | database: raw_data 7 | table_name: pasadena 8 | # - !Workspace 9 | # database: raw_data 10 | # table_name: katy 11 | # - !Workspace 12 | # database: raw_data 13 | # table_name: Cypress 14 | -------------------------------------------------------------------------------- /etl/glue_etl_job/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/glue_etl_job/__init__.py -------------------------------------------------------------------------------- /etl/glue_etl_job/transform_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from awsglue.context import GlueContext 4 | from awsglue.dynamicframe import DynamicFrame 5 | from awsglue.job import Job 6 | from awsglue.transforms import * 7 | from awsglue.utils import getResolvedOptions 8 | from pyspark.context import SparkContext 9 | from pyspark.sql import functions as F 10 | from pyspark.sql.functions import col, expr, first 11 | 12 | sc = SparkContext.getOrCreate() 13 | glueContext = GlueContext(sc) 14 | spark = glueContext.spark_session 15 | job = Job(glueContext) 16 | 17 | 18 | def extract_houston_from_catalog(database, houston_table_name): 19 | raw_houston_dynamic_frame = glueContext.create_dynamic_frame.from_catalog( 20 | database=database, table_name=houston_table_name 21 | ) 22 | df = 
raw_houston_dynamic_frame.toDF() 23 | return df 24 | 25 | def drop_columns(df): 26 | # drop from table columns with struct type 27 | cols = ("listing_sub_type", "open_house_info") 28 | 29 | df_drops= df.drop(*cols) 30 | return df_drops 31 | 32 | 33 | def group_data(df): 34 | #"group the data by zipcode" 35 | df_group = ( 36 | df.groupBy("zipcode","state","city","country","currency") 37 | .agg( 38 | F.count("*").alias("Total Zipcodes"), 39 | F.avg("bathrooms").alias("avg_bathrooms"), 40 | F.avg("bedrooms").alias("avg_bedrooms"), 41 | F.mean(col("price") / col("livingArea")).alias("avg_price_per_sqft"), 42 | ) 43 | .orderBy("zipcode") 44 | ) 45 | return df_group 46 | 47 | 48 | def load_to_s3(glue_dynamic_frame): 49 | s3output = glueContext.getSink( 50 | path="s3://real-estate-etl-101/std_data/", 51 | connection_type="s3", 52 | updateBehavior="UPDATE_IN_DATABASE", 53 | partitionKeys=[], 54 | compression="snappy", 55 | enableUpdateCatalog=True, 56 | transformation_ctx="s3output", 57 | ) 58 | 59 | s3output.setCatalogInfo( 60 | catalogDatabase="real-estate-database", catalogTableName="immo_report" 61 | ) 62 | 63 | s3output.setFormat("glueparquet") 64 | s3output.writeFrame(glue_dynamic_frame) 65 | 66 | 67 | if __name__ == "__main__": 68 | database = "real-estate-database" 69 | houston_table_name = "immo_houston" 70 | df_houston = extract_houston_from_catalog(database, houston_table_name) 71 | 72 | df_drops = drop_columns(df_houston) 73 | 74 | df_final = group_data(df_drops) 75 | 76 | # going from Spark dataframe to glue dynamic frame 77 | 78 | glue_dynamic_frame = DynamicFrame.fromDF(df_final, glueContext, "glue_etl") 79 | 80 | # load to s3 81 | load_to_s3(glue_dynamic_frame) 82 | 83 | job.commit() -------------------------------------------------------------------------------- /etl/load/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/load/__init__.py -------------------------------------------------------------------------------- /etl/load/load_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/etl/load/load_data.py -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g-lorena/aws_etl_pipeline/e4dbe8938c38a9d79c01729c4fc9256199f8e2c3/images/architecture.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | PyYAML 3 | -------------------------------------------------------------------------------- /terraform.tfstate: -------------------------------------------------------------------------------- 1 | { 2 | "version": 4, 3 | "terraform_version": "1.7.5", 4 | "serial": 1, 5 | "lineage": "b41d31ed-d92b-6863-63ef-f1f090b6a6ac", 6 | "outputs": {}, 7 | "resources": [], 8 | "check_results": null 9 | } 10 | --------------------------------------------------------------------------------