├── dags
│   ├── sql
│   │   ├── trunc_target_tables.sql
│   │   ├── init_db_schema.sql
│   │   ├── full_load.sql
│   │   └── incremental_load.sql
│   ├── full_load_dag.py
│   ├── incremental_load_dag.py
│   └── operators
│       ├── soda_to_s3_operator.py
│       └── s3_to_postgres_operator.py
├── airflow_installation.txt
└── README.md
/dags/sql/trunc_target_tables.sql: -------------------------------------------------------------------------------- 1 | -- echo "" > /home/airflow/airflow/dags/sql/trunc_target_tables.sql 2 | -- nano /home/airflow/airflow/dags/sql/trunc_target_tables.sql 3 | TRUNCATE TABLE raw.soda_evictions; 4 | TRUNCATE TABLE raw.district_data; 5 | TRUNCATE TABLE raw.neighborhood_data; 6 | -------------------------------------------------------------------------------- /dags/full_load_dag.py: -------------------------------------------------------------------------------- 1 | # echo "" > /home/airflow/airflow/dags/full_load_dag.py 2 | # nano /home/airflow/airflow/dags/full_load_dag.py 3 | 4 | from airflow import DAG 5 | from airflow.operators.postgres_operator import PostgresOperator 6 | from operators.soda_to_s3_operator import SodaToS3Operator 7 | from operators.s3_to_postgres_operator import S3ToPostgresOperator 8 | from airflow.utils.dates import days_ago 9 | from datetime import timedelta 10 | 11 | soda_headers = { 12 | 'keyId':'############', 13 | 'keySecret':'#################', 14 | 'Accept':'application/json' 15 | } 16 | 17 | default_args = { 18 | 'owner': 'airflow', 19 | 'depends_on_past': False, 20 | 'start_date': days_ago(2), 21 | 'email': ['airflow@example.com'], 22 | 'email_on_failure': False, 23 | 'email_on_retry': False, 24 | 'retries': 1, 25 | 'retry_delay': timedelta(seconds=30) 26 | } 27 | 28 | with DAG('eviction-tracker-full_load', 29 | default_args=default_args, 30 | description='Executes full load from SODA API to Production DW.', 31 | max_active_runs=1, 32 | schedule_interval=None) as dag: 33 | 34 | op1 = SodaToS3Operator( 35 | task_id='get_evictions_data', 36 | http_conn_id='API_Evictions', 37 | headers=soda_headers, 38 | s3_conn_id='S3_Evictions', 39 | s3_bucket='sf-evictionmeter', 40 | s3_directory='soda_jsons', 41 | size_check=True, 42 | max_bytes=500000000, 43 | dag=dag 44 | ) 45 | 46 | op2 = PostgresOperator( 47 | task_id='initialize_target_db', 48 | postgres_conn_id='RDS_Evictions', 49 | sql='sql/init_db_schema.sql', 50 | dag=dag 51 | ) 52 | 53 | op3 = S3ToPostgresOperator( 54 | task_id='load_evictions_data', 55 | s3_conn_id='S3_Evictions', 56 | s3_bucket='sf-evictionmeter', 57 | s3_prefix='soda_jsons/soda_evictions_import', 58 | source_data_type='json', 59 | postgres_conn_id='RDS_Evictions', 60 | schema='raw', 61 | table='soda_evictions', 62 | get_latest=True, 63 | dag=dag 64 | ) 65 | 66 | op4 = S3ToPostgresOperator( 67 | task_id='load_neighborhood_data', 68 | s3_conn_id='S3_Evictions', 69 | s3_bucket='sf-evictionmeter', 70 | s3_prefix='census_csv/sf_by_neighborhood', 71 | source_data_type='csv', 72 | header=True, 73 | postgres_conn_id='RDS_Evictions', 74 | schema='raw', 75 | table='neighborhood_data', 76 | get_latest=True, 77 | dag=dag 78 | ) 79 | 80 | op5 = S3ToPostgresOperator( 81 | task_id='load_district_data', 82 | s3_conn_id='S3_Evictions', 83 | s3_bucket='sf-evictionmeter', 84 | s3_prefix='census_csv/sf_by_district', 85 | source_data_type='csv', 86 | header=True, 87 | postgres_conn_id='RDS_Evictions', 88 | schema='raw', 89 | table='district_data', 90 | get_latest=True, 91 | dag=dag 92 | ) 93 | 94 | op6 = PostgresOperator( 95 | task_id='execute_full_load', 96 |
postgres_conn_id='RDS_Evictions', 97 | sql='sql/full_load.sql', 98 | dag=dag 99 | ) 100 | 101 | op1 >> op2 >> [op3, op4, op5] >> op6 102 | -------------------------------------------------------------------------------- /dags/incremental_load_dag.py: -------------------------------------------------------------------------------- 1 | # echo "" > /home/airflow/airflow/dags/incremental_load_dag.py 2 | # nano /home/airflow/airflow/dags/incremental_load_dag.py 3 | 4 | from airflow import DAG 5 | from airflow.operators.postgres_operator import PostgresOperator 6 | from airflow.operators.python_operator import ShortCircuitOperator 7 | from operators.soda_to_s3_operator import SodaToS3Operator 8 | from operators.s3_to_postgres_operator import S3ToPostgresOperator 9 | from airflow.utils.dates import days_ago 10 | from datetime import timedelta 11 | 12 | soda_headers = { 13 | 'keyId':'############', 14 | 'keySecret':'#################', 15 | 'Accept':'application/json' 16 | } 17 | 18 | default_args = { 19 | 'owner': 'airflow', 20 | 'depends_on_past': False, 21 | 'start_date': days_ago(2), 22 | 'email': ['airflow@example.com'], 23 | 'email_on_failure': False, 24 | 'email_on_retry': False, 25 | 'retries': 1, 26 | 'retry_delay': timedelta(seconds=30) 27 | } 28 | 29 | def get_size(**context): 30 | val = context['ti'].xcom_pull(key='obj_len') 31 | return val > 0 32 | 33 | 34 | with DAG('eviction-tracker-incremental_load', 35 | default_args=default_args, 36 | description='Executes incremental load from SODA API & S3-hosted CSVs into Production DW.', 37 | max_active_runs=1, 38 | schedule_interval=None) as dag: 39 | 40 | op1 = SodaToS3Operator( 41 | task_id='get_evictions_data', 42 | http_conn_id='API_Evictions', 43 | headers=soda_headers, 44 | days_ago=31, 45 | s3_conn_id='S3_Evictions', 46 | s3_bucket='sf-evictionmeter', 47 | s3_directory='soda_jsons', 48 | size_check=True, 49 | max_bytes=500000000, 50 | dag=dag 51 | ) 52 | 53 | op2 = ShortCircuitOperator( 54 | task_id='check_get_results', 55 | python_callable=get_size, 56 | provide_context=True, 57 | dag=dag 58 | ) 59 | 60 | op3 = PostgresOperator( 61 | task_id='truncate_target_tables', 62 | postgres_conn_id='RDS_Evictions', 63 | sql='sql/trunc_target_tables.sql', 64 | dag=dag 65 | ) 66 | 67 | op4 = S3ToPostgresOperator( 68 | task_id='load_evictions_data', 69 | s3_conn_id='S3_Evictions', 70 | s3_bucket='sf-evictionmeter', 71 | s3_prefix='soda_jsons/soda_evictions_import', 72 | source_data_type='json', 73 | postgres_conn_id='RDS_Evictions', 74 | schema='raw', 75 | table='soda_evictions', 76 | get_latest=True, 77 | dag=dag 78 | ) 79 | 80 | op5 = S3ToPostgresOperator( 81 | task_id='load_neighborhood_data', 82 | s3_conn_id='S3_Evictions', 83 | s3_bucket='sf-evictionmeter', 84 | s3_prefix='census_csv/sf_by_neighborhood', 85 | source_data_type='csv', 86 | header=True, 87 | postgres_conn_id='RDS_Evictions', 88 | schema='raw', 89 | table='neighborhood_data', 90 | get_latest=True, 91 | dag=dag 92 | ) 93 | 94 | op6 = S3ToPostgresOperator( 95 | task_id='load_district_data', 96 | s3_conn_id='S3_Evictions', 97 | s3_bucket='sf-evictionmeter', 98 | s3_prefix='census_csv/sf_by_district', 99 | source_data_type='csv', 100 | header=True, 101 | postgres_conn_id='RDS_Evictions', 102 | schema='raw', 103 | table='district_data', 104 | get_latest=True, 105 | dag=dag 106 | ) 107 | 108 | op7 = PostgresOperator( 109 | task_id='execute_incremental_load', 110 | postgres_conn_id='RDS_Evictions', 111 | sql='sql/incremental_load.sql', 112 | dag=dag 113 | ) 114 |
115 | op1 >> op2 >> op3 >> [op4, op5, op6] >> op7 116 | -------------------------------------------------------------------------------- /airflow_installation.txt: -------------------------------------------------------------------------------- 1 | 2 | STEP 1 - Launch EC2 Instance: 3 | - t3.medium 4 | - 12 GB storage 5 | - launch-wizard-3 security group to open TCP Port 8080 6 | - associate elastic IP 7 | 8 | 9 | STEP 2 - Install Postgres Server on EC2: 10 | run: 11 | sudo apt-get update 12 | sudo apt-get install python-psycopg2 13 | sudo apt-get install postgresql postgresql-contrib 14 | 15 | 16 | Step 3 - Create OS User airflow 17 | run: 18 | sudo adduser airflow 19 | sudo usermod -aG sudo airflow 20 | su - airflow 21 | 22 | Note: From here on, make sure you are logged in as the airflow user. 23 | 24 | 25 | Step 4 - Create Postgres Metadatabase and User Access 26 | run: 27 | sudo -u postgres psql 28 | 29 | in postgres prompt: 30 | CREATE USER airflow PASSWORD 'password'; 31 | CREATE DATABASE airflow; 32 | GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO airflow; 33 | \q 34 | 35 | 36 | Step 5 - Change Postgres Connection Config 37 | run: 38 | sudo nano /etc/postgresql/10/main/pg_hba.conf 39 | 40 | Change this line - 41 | # IPv4 local connections: 42 | host all all 127.0.0.1/32 md5 43 | To this line - 44 | # IPv4 local connections: 45 | host all all 0.0.0.0/0 trust 46 | 47 | run: 48 | sudo nano /etc/postgresql/10/main/postgresql.conf 49 | 50 | Change this line - 51 | #listen_addresses = 'localhost' # what IP address(es) to listen on 52 | To this line - 53 | listen_addresses = '*' # what IP address(es) to listen on 54 | 55 | restart postgres server: 56 | sudo service postgresql restart 57 | 58 | 59 | Step 6 - Install Airflow 60 | 61 | run: 62 | su - airflow 63 | sudo apt-get install python3-pip 64 | sudo python3 -m pip install 'apache-airflow[postgres,s3,aws]' 65 | 66 | run: 67 | airflow initdb 68 | 69 | 70 | Step 7 - Connect Airflow to Postgres 71 | 72 | run: 73 | nano /home/airflow/airflow/airflow.cfg 74 | 75 | Change lines - 76 | sql_alchemy_conn = postgresql+psycopg2://airflow:password@localhost:5432/airflow 77 | executor = LocalExecutor 78 | load_examples = False 79 | 80 | run: 81 | airflow initdb 82 | 83 | 84 | Step 8 - Add DAGs: 85 | mkdir /home/airflow/airflow/dags/ 86 | cd /home/airflow/airflow/dags/ 87 | touch tutorial.py 88 | nano tutorial.py 89 | 90 | 91 | Step 9 - Setup Airflow Webserver and Scheduler to start automatically 92 | We are almost there. The final thing we need to do is to ensure airflow starts up when your ec2 instance starts.
93 | 94 | sudo nano /etc/systemd/system/airflow-webserver.service 95 | 96 | Paste the following into the file created above 97 | 98 | [Unit] 99 | Description=Airflow webserver daemon 100 | After=network.target postgresql.service 101 | Wants=postgresql.service 102 | [Service] 103 | EnvironmentFile=/etc/environment 104 | User=airflow 105 | Group=airflow 106 | Type=simple 107 | ExecStart=/usr/local/bin/airflow webserver 108 | Restart=on-failure 109 | RestartSec=5s 110 | PrivateTmp=true 111 | [Install] 112 | WantedBy=multi-user.target 113 | 114 | Next we will create the following file to enable the scheduler service 115 | 116 | sudo nano /etc/systemd/system/airflow-scheduler.service 117 | 118 | Paste the following 119 | 120 | [Unit] 121 | Description=Airflow scheduler daemon 122 | After=network.target postgresql.service 123 | Wants=postgresql.service 124 | [Service] 125 | EnvironmentFile=/etc/environment 126 | User=airflow 127 | Group=airflow 128 | Type=simple 129 | ExecStart=/usr/local/bin/airflow scheduler 130 | Restart=always 131 | RestartSec=5s 132 | [Install] 133 | WantedBy=multi-user.target 134 | 135 | Next enable these services and check their status 136 | 137 | sudo systemctl enable airflow-webserver.service 138 | sudo systemctl enable airflow-scheduler.service 139 | sudo systemctl start airflow-scheduler 140 | sudo systemctl start airflow-webserver 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SF-EvictionTracker 2 | 3 | Tracking eviction trends in San Francisco across filing reasons, districts, neighborhoods, and demographics in the months following COVID-19. Data warehouse infrastructure is housed in the AWS ecosystem and uses Apache Airflow for orchestration, with public-facing dashboards created using Metabase. 4 | 5 | Questions? Feel free to reach me at ilya.glprn@gmail.com. 6 | 7 | Public Dashboard Link: http://sf-evictiontracker-metabase.us-east-1.elasticbeanstalk.com/public/dashboard/f637e470-8ea9-4b03-af80-53988e5b6a9b 8 | 9 | 10 |

ARCHITECTURE:

11 | 12 | ![Architecture](https://i.imgur.com/s2gLBZt.png) 13 | Data is sourced from San Francisco Open Data's API and from CSVs containing San Francisco district and neighborhood aggregate census results. Airflow orchestrates the data's movement to an S3 bucket and into a data warehouse hosted in RDS. SQL scripts are then run to transform the data from its raw form through a staging schema and into production target tables. The presentation layer is created using Metabase, an open-source data visualization tool, and deployed using Elastic Beanstalk. 14 | 15 |
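The transform step is driven entirely by SQL files checked into the repo. A minimal sketch of how a transform task is wired (this mirrors the PostgresOperator usage in full_load_dag.py above; the DAG id here is illustrative):

```python
# Minimal sketch (illustrative DAG id): the raw -> staging -> prod transform
# is a single task pointed at a SQL file, as in full_load_dag.py above.
from airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.dates import days_ago

with DAG('transform_sketch', start_date=days_ago(1), schedule_interval=None) as dag:
    raw_to_prod = PostgresOperator(
        task_id='raw_to_prod',
        postgres_conn_id='RDS_Evictions',  # project connection ID
        sql='sql/full_load.sql',           # runs raw -> staging -> prod
    )
```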

DATA MODEL:

16 | 17 | Dimension Tables: 18 | `dim_district` 19 | `dim_neighborhood` 20 | `dim_location` 21 | `dim_reason` 22 | `dim_date` 23 | `br_reason_group` 24 | 25 | Fact Tables: 26 | `fact_evictions` 27 | 28 | The data model is implemented using a star schema with a bridge table to accommodate any new combination of reasons cited on an eviction filing. More information on bridge tables can be found here: https://www.kimballgroup.com/2012/02/design-tip-142-building-bridges/ 29 | 30 | ![Model](https://i.imgur.com/uInBlzR.png) 31 |
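To make the bridge concrete, here is a query sketch (the table and column names come from `init_db_schema.sql` later in this repo; the connection string is a placeholder): each fact row carries a `reason_group_key`, and the bridge fans it out to one row per reason, so an eviction citing several reasons is counted once under each.

```python
# Sketch: counting evictions by reason through the bridge table.
# The DSN below is a placeholder; tables come from dags/sql/init_db_schema.sql.
import psycopg2

BRIDGE_QUERY = """
    SELECT r.reason_desc, COUNT(DISTINCT f.eviction_key) AS evictions
    FROM prod.fact_evictions f
    JOIN prod.br_reason_group b ON b.reason_group_key = f.reason_group_key
    JOIN prod.dim_reason r ON r.reason_key = b.reason_key
    GROUP BY r.reason_desc
    ORDER BY evictions DESC;
"""

with psycopg2.connect('postgresql://user:password@host:5432/dw') as conn:
    with conn.cursor() as cur:
        cur.execute(BRIDGE_QUERY)
        for reason_desc, evictions in cur.fetchall():
            print(reason_desc, evictions)
```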

ETL FLOW:

32 | 33 | General Overview - 34 | - Evictions data is collected from the SODA API and moved into an S3 Bucket 35 | - Neighborhood/district census data is stored as a CSV in S3 36 | - Once the API load to S3 is complete, data is moved into RDS into a "raw" schema and moves through a staging schema for processing 37 | - ETL job execution is complete once data is moved from the staging schema into the final production tables 38 | 39 | DAGs and Custom Airflow Operators - 40 | 41 | ![Ops](https://i.imgur.com/WTOUiGU.jpg) 42 | ![Dag](https://i.imgur.com/yJb3DKT.jpg) 43 | 44 | Two DAGs (Directed Acyclic Graphs) are used for this project: a full load, which should be executed on initial setup, and an incremental load, which is scheduled to run daily and pulls new data from the Socrata Open Data API. 45 | 46 | The incremental load DAG uses XCom to pass the filesize of the load between the API call task and a ShortCircuitOperator, which skips downstream tasks if the API call produces no results. 47 | 48 | The DAGs use two custom operators. They have been purpose-built for this project but are easily extended for use in other data pipelines. A usage sketch follows the descriptions below. 49 | 50 | 1. soda_to_s3_operator: Queries the Socrata Open Data API using a SoQL string and uploads the results to an S3 bucket. Includes an optional check that aborts the ETL run if the source data size exceeds a user-defined limit. 51 | 52 | 2. s3_to_postgres_operator: Collects data from a file hosted on AWS S3 and loads it into a Postgres table. The current version supports JSON and CSV source data types. 53 | 54 | 55 |
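A usage sketch chaining the two operators (connection IDs, bucket, and table names match the DAGs above; the SoQL string passed via `data` is a hypothetical override of the operator's default query):

```python
from airflow import DAG
from airflow.utils.dates import days_ago
from operators.soda_to_s3_operator import SodaToS3Operator
from operators.s3_to_postgres_operator import S3ToPostgresOperator

with DAG('operator_sketch', start_date=days_ago(1), schedule_interval=None) as dag:

    get_data = SodaToS3Operator(
        task_id='get_data',
        http_conn_id='API_Evictions',
        # hypothetical SoQL override; omit `data` to use the operator's default query
        data="$query=SELECT:*,* WHERE :updated_at > '2020-01-01' ORDER BY :id LIMIT 10000",
        s3_conn_id='S3_Evictions',
        s3_bucket='sf-evictionmeter',
        s3_directory='soda_jsons',
        size_check=True,       # raise SizeExceededError past max_bytes
        max_bytes=500000000,
    )

    load_data = S3ToPostgresOperator(
        task_id='load_data',
        s3_conn_id='S3_Evictions',
        s3_bucket='sf-evictionmeter',
        s3_prefix='soda_jsons/soda_evictions_import',
        source_data_type='json',
        postgres_conn_id='RDS_Evictions',
        schema='raw',
        table='soda_evictions',
        get_latest=True,       # load the most recently modified upload
    )

    get_data >> load_data
```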

INFRASTRUCTURE:

56 | 57 | This project is hosted in the AWS ecosystem and uses the following resources: 58 | 59 | ![EC2](https://i.imgur.com/jB2X1jI.png) 60 | 61 | EC2 - 62 | - t2.medium - dedicated resource for Airflow, managed by AWS Instance Scheduler to complete the daily DAG run and shut off after execution 63 | - t2.small - used to host Metabase, always online 64 | 65 | RDS - 66 | - t2.small - hosts the application database for Metabase and the data warehouse 67 | 68 | Elastic Beanstalk is used to deploy the Metabase web application. 69 | 70 | 71 |

DASHBOARD:

72 | 73 | The dashboard is publicly accessible here: http://sf-evictiontracker-metabase.us-east-1.elasticbeanstalk.com/public/dashboard/f637e470-8ea9-4b03-af80-53988e5b6a9b 74 | 75 | Some example screengrabs below! 76 | 77 | ![Dash1](https://i.imgur.com/MZ325PT.jpg) 78 | ![Dash2](https://i.imgur.com/OeyOVp0.jpg) 79 | ![Dash3](https://i.imgur.com/v6Nwz9l.jpg) 80 | -------------------------------------------------------------------------------- /dags/operators/soda_to_s3_operator.py: -------------------------------------------------------------------------------- 1 | # echo "" > /home/airflow/airflow/dags/operators/soda_to_s3_operator.py 2 | # nano /home/airflow/airflow/dags/operators/soda_to_s3_operator.py 3 | 4 | from airflow.models.baseoperator import BaseOperator 5 | from airflow.utils.decorators import apply_defaults 6 | from airflow.hooks.http_hook import HttpHook 7 | from airflow.hooks.S3_hook import S3Hook 8 | 9 | from datetime import datetime, timedelta 10 | import json 11 | import sys 12 | 13 | 14 | class SizeExceededError(Exception): 15 | """Raised when max file size is exceeded""" 16 | def __init__(self): 17 | self.message = 'Max file size exceeded' 18 | 19 | def __str__(self): 20 | return f'SizeExceededError, {self.message}' 21 | 22 | 23 | class SodaToS3Operator(BaseOperator): 24 | """ 25 | Queries the Socrata Open Data API using a SoQL string and uploads the results to an S3 bucket. 26 | 27 | :param endpoint: Optional API connection endpoint 28 | :param data: Custom Socrata SoQL string used to query API, overrides default get request 29 | :param days_ago: Restricts get request to updated/created records from specified date onward 30 | :param headers: Dictionary containing optional API connection keys (keyId, keySecret, Accept) 31 | :param s3_conn_id: S3 Connection ID 32 | :param s3_bucket: S3 Bucket Destination 33 | :param s3_directory: S3 Directory Destination 34 | :param method: Request type for API 35 | :param http_conn_id: SODA API Connection ID 36 | :param size_check: Boolean indicating whether to run a size check prior to upload to S3 37 | :param max_bytes: Maximum number of bytes to allow for a single S3 upload 38 | """ 39 | 40 | @apply_defaults 41 | def __init__(self, 42 | endpoint=None, 43 | data=None, 44 | days_ago=None, 45 | headers=None, 46 | s3_conn_id=None, 47 | s3_bucket=None, 48 | s3_directory='', 49 | method='GET', 50 | http_conn_id='http_default', 51 | size_check=False, 52 | max_bytes=5000000000, 53 | *args, 54 | **kwargs) -> None: 55 | 56 | super().__init__(*args, **kwargs) 57 | 58 | self.endpoint = endpoint 59 | self.data = data 60 | self.days_ago = days_ago 61 | self.s3_conn_id = s3_conn_id 62 | self.s3_bucket = s3_bucket 63 | self.s3_directory = s3_directory 64 | self.headers = headers 65 | self.method = method 66 | self.http_conn_id = http_conn_id 67 | self.size_check = size_check 68 | self.max_bytes = max_bytes 69 | 70 | 71 | def get_size(self, obj, seen=None): 72 | """ 73 | Recursively finds size of object. 
74 | """ 75 | 76 | size = sys.getsizeof(obj) 77 | if seen is None: 78 | seen = set() 79 | obj_id = id(obj) 80 | if obj_id in seen: 81 | return 0 82 | seen.add(obj_id) 83 | if isinstance(obj, dict): 84 | size += sum([self.get_size(v, seen) for v in obj.values()]) 85 | size += sum([self.get_size(k, seen) for k in obj.keys()]) 86 | elif hasattr(obj, '__dict__'): 87 | size += self.get_size(obj.__dict__, seen) 88 | elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)): 89 | size += sum([self.get_size(i, seen) for i in obj]) 90 | return size 91 | 92 | 93 | def parse_metadata(self, header): 94 | """ 95 | Parses metadata from API response. 96 | """ 97 | 98 | try: 99 | metadata = { 100 | 'api-call-date': header['Date'], 101 | 'content-type': header['Content-Type'], 102 | 'source-last-modified': header['X-SODA2-Truth-Last-Modified'], 103 | 'fields': header['X-SODA2-Fields'], 104 | 'types': header['X-SODA2-Types'] 105 | } 106 | except KeyError: 107 | metadata = {'KeyError': 'Metadata missing from header, see error log.'} 108 | 109 | return metadata 110 | 111 | 112 | def execute(self, context): 113 | """ 114 | Executes the operator, including running a max filesize check if enabled. 115 | 116 | The SODA API maxes out the # of returned results so we use paging to query 117 | the endpoint multiple times and continuously move the offset. 118 | 119 | Metadata is parsed and saved separately in a /logs/ subfolder along with 120 | the JSON results from API call. 121 | """ 122 | 123 | soda = HttpHook(method=self.method, http_conn_id=self.http_conn_id) 124 | 125 | if self.data: 126 | soql_filter = self.data 127 | elif self.days_ago: 128 | current_dt = datetime.now() 129 | target_dt = current_dt - timedelta(self.days_ago) 130 | format_dt = target_dt.strftime("%Y-%m-%d") 131 | soql_filter = f"""$query=SELECT:*,* WHERE :created_at > '{format_dt}' OR :updated_at > '{format_dt}' 132 | ORDER BY :id LIMIT 10000""" 133 | else: 134 | soql_filter = """$query=SELECT:*,* ORDER BY :id LIMIT 10000""" 135 | 136 | print('getting... ' + soql_filter) 137 | 138 | #soql_filter = f"""$query=SELECT:*,* WHERE :created_at < '2020-04-01' ORDER BY :id LIMIT 10000""" 139 | 140 | offset, counter = 0, 1 141 | combined = [] 142 | while True: 143 | soql_filter_offset = soql_filter + f' OFFSET {offset}' 144 | response = soda.run(endpoint=self.endpoint, data=soql_filter_offset, headers=self.headers) 145 | if response.status_code != 200: 146 | break 147 | captured = response.json() 148 | if len(captured) == 0: 149 | break 150 | combined.extend(captured) 151 | offset = 10000 * counter 152 | counter += 1 153 | 154 | if self.size_check == True: 155 | print('actual size... ' + str(self.get_size(combined))) 156 | print('max size... 
' + str(self.max_bytes)) 157 | if self.get_size(combined) > self.max_bytes: 158 | raise SizeExceededError 159 | 160 | dest_s3 = S3Hook(self.s3_conn_id) 161 | 162 | body_obj = 'soda_evictions_import_' + datetime.now().strftime("%Y-%m-%dT%H%M%S") + '.json' 163 | 164 | metadata = self.parse_metadata(response.headers) 165 | meta_obj = 'logs/soda_evictions_import_log_' + datetime.now().strftime("%Y-%m-%dT%H%M%S") 166 | 167 | dest_s3.load_string(json.dumps(combined), key=self.s3_directory+'/'+body_obj, bucket_name=self.s3_bucket) 168 | dest_s3.load_string(json.dumps(metadata), key=self.s3_directory+'/'+meta_obj, bucket_name=self.s3_bucket) 169 | 170 | # XCom used to skip downstream tasks if body object size is 0 171 | self.xcom_push(context=context, key='obj_len', value=len(combined)) 172 | -------------------------------------------------------------------------------- /dags/operators/s3_to_postgres_operator.py: -------------------------------------------------------------------------------- 1 | # echo "" > /home/airflow/airflow/dags/operators/s3_to_postgres_operator.py 2 | # nano /home/airflow/airflow/dags/operators/s3_to_postgres_operator.py 3 | 4 | from airflow.models.baseoperator import BaseOperator 5 | from airflow.utils.decorators import apply_defaults 6 | from airflow.hooks.S3_hook import S3Hook 7 | from airflow.hooks.postgres_hook import PostgresHook 8 | 9 | import json 10 | import io 11 | from contextlib import closing 12 | 13 | 14 | class S3ToPostgresOperator(BaseOperator): 15 | """ 16 | Collects data from a file hosted on AWS S3 and loads it into a Postgres table. 17 | Current version supports JSON and CSV sources but requires pre-defined data model. 18 | 19 | :param s3_conn_id: S3 Connection ID 20 | :param s3_bucket: S3 Bucket Destination 21 | :param s3_prefix: S3 File Prefix 22 | :param source_data_type: S3 Source File data type 23 | :param header: Toggles ignore header for CSV source type 24 | :param postgres_conn_id: Postgres Connection ID 25 | :param schema: Postgres Target Schema 26 | :param table: Postgres Target Table 27 | :param get_latest: if True, pulls from last modified file in S3 path 28 | """ 29 | 30 | @apply_defaults 31 | def __init__(self, 32 | s3_conn_id=None, 33 | s3_bucket=None, 34 | s3_prefix='', 35 | source_data_type='', 36 | postgres_conn_id='postgres_default', 37 | header=False, 38 | schema='public', 39 | table='raw_load', 40 | get_latest=False, 41 | *args, 42 | **kwargs) -> None: 43 | 44 | super().__init__(*args, **kwargs) 45 | 46 | self.s3_conn_id = s3_conn_id 47 | self.s3_bucket = s3_bucket 48 | self.s3_prefix = s3_prefix 49 | self.source_data_type = source_data_type 50 | self.postgres_conn_id = postgres_conn_id 51 | self.header = header 52 | self.schema = schema 53 | self.table = table 54 | self.get_latest = get_latest 55 | 56 | 57 | def execute(self, context): 58 | """ 59 | Executes the operator. 
60 | """ 61 | s3_hook = S3Hook(self.s3_conn_id) 62 | s3_session = s3_hook.get_session() 63 | s3_client = s3_session.client('s3') 64 | 65 | if self.get_latest == True: 66 | objects = s3_client.list_objects_v2(Bucket=self.s3_bucket, Prefix=self.s3_prefix)['Contents'] 67 | latest = max(objects, key=lambda x: x['LastModified']) 68 | s3_obj = s3_client.get_object(Bucket=self.s3_bucket, Key=latest['Key']) 69 | 70 | file_content = s3_obj['Body'].read().decode('utf-8') 71 | 72 | pg_hook = PostgresHook(self.postgres_conn_id) 73 | 74 | if self.source_data_type == 'json': 75 | 76 | print('inserting json object...') 77 | 78 | json_content = json.loads(file_content) 79 | 80 | schema = self.schema 81 | if isinstance(self.schema, tuple): 82 | schema = self.schema[0] 83 | 84 | table = self.table 85 | if isinstance(self.table, tuple): 86 | table = self.table[0] 87 | 88 | target_fields = ['raw_id','created_at','updated_at','eviction_id','address','city','state', 89 | 'zip','file_date','non_payment','breach','nuisance','illegal_use','failure_to_sign_renewal', 90 | 'access_denial','unapproved_subtenant','owner_move_in','demolition','capital_improvement', 91 | 'substantial_rehab','ellis_act_withdrawal','condo_conversion','roommate_same_unit', 92 | 'other_cause','late_payments','lead_remediation','development','good_samaritan_ends', 93 | 'constraints_date','supervisor_district','neighborhood'] 94 | target_fields = ','.join(target_fields) 95 | 96 | with closing(pg_hook.get_conn()) as conn: 97 | with closing(conn.cursor()) as cur: 98 | cur.executemany( 99 | f"""INSERT INTO {schema}.{table} ({target_fields}) 100 | VALUES( 101 | %(:id)s, %(:created_at)s, %(:updated_at)s, %(eviction_id)s, %(address)s, %(city)s, %(state)s, %(zip)s, 102 | %(file_date)s, %(non_payment)s, %(breach)s, %(nuisance)s, %(illegal_use)s, %(failure_to_sign_renewal)s, 103 | %(access_denial)s, %(unapproved_subtenant)s, %(owner_move_in)s, %(demolition)s, %(capital_improvement)s, 104 | %(substantial_rehab)s, %(ellis_act_withdrawal)s, %(condo_conversion)s, %(roommate_same_unit)s, 105 | %(other_cause)s, %(late_payments)s, %(lead_remediation)s, %(development)s, %(good_samaritan_ends)s, 106 | %(constraints_date)s, %(supervisor_district)s, %(neighborhood)s 107 | ); 108 | """,({ 109 | ':id': line[':id'], ':created_at': line[':created_at'], ':updated_at': line[':updated_at'], 110 | 'eviction_id': line['eviction_id'], 'address': line.get('address', None), 'city': line.get('city', None), 111 | 'state': line.get('state', None),'zip': line.get('zip', None),'file_date': line.get('file_date', None), 112 | 'non_payment': line.get('non_payment', None),'breach': line.get('breach', None), 113 | 'nuisance': line.get('nuisance', None),'illegal_use': line.get('illegal_use', None), 114 | 'failure_to_sign_renewal': line.get('failure_to_sign_renewal', None), 115 | 'access_denial': line.get('access_denial', None),'unapproved_subtenant': line.get('unapproved_subtenant', None), 116 | 'owner_move_in': line.get('owner_move_in', None),'demolition': line.get('demolition', None), 117 | 'capital_improvement': line.get('capital_improvement', None), 118 | 'substantial_rehab': line.get('substantial_rehab', None),'ellis_act_withdrawal': line.get('ellis_act_withdrawal', None), 119 | 'condo_conversion': line.get('condo_conversion', None),'roommate_same_unit': line.get('roommate_same_unit', None), 120 | 'other_cause': line.get('other_cause', None),'late_payments': line.get('late_payments', None), 121 | 'lead_remediation': line.get('lead_remediation', None),'development': 
line.get('development', None), 122 | 'good_samaritan_ends': line.get('good_samaritan_ends', None),'constraints_date': line.get('constraints_date', None), 123 | 'supervisor_district': line.get('supervisor_district', None),'neighborhood': line.get('neighborhood', None) 124 | } for line in json_content)) 125 | conn.commit() 126 | 127 | 128 | if self.source_data_type == 'csv': 129 | 130 | print('inserting csv...') 131 | 132 | file = io.StringIO(file_content) 133 | 134 | sql = "COPY %s FROM STDIN DELIMITER ','" 135 | if self.header == True: 136 | sql = "COPY %s FROM STDIN DELIMITER ',' CSV HEADER" 137 | 138 | schema = self.schema 139 | if isinstance(self.schema, tuple): 140 | schema = self.schema[0] 141 | 142 | table = self.table 143 | if isinstance(self.table, tuple): 144 | table = self.table[0] 145 | 146 | table = f'{schema}.{table}' 147 | 148 | with closing(pg_hook.get_conn()) as conn: 149 | with closing(conn.cursor()) as cur: 150 | cur.copy_expert(sql=sql % table, file=file) 151 | conn.commit() 152 | 153 | print('inserting complete...') 154 | -------------------------------------------------------------------------------- /dags/sql/init_db_schema.sql: -------------------------------------------------------------------------------- 1 | -- echo "" > /home/airflow/airflow/dags/sql/init_db_schema.sql 2 | -- nano /home/airflow/airflow/dags/sql/init_db_schema.sql 3 | 4 | DROP SCHEMA IF EXISTS raw CASCADE; 5 | DROP SCHEMA IF EXISTS staging CASCADE; 6 | DROP SCHEMA IF EXISTS prod CASCADE; 7 | 8 | CREATE SCHEMA raw; 9 | CREATE SCHEMA staging; 10 | CREATE SCHEMA prod; 11 | 12 | 13 | -- Raw 14 | CREATE UNLOGGED TABLE raw.soda_evictions ( 15 | raw_id text, 16 | created_at timestamp, 17 | updated_at timestamp, 18 | eviction_id text, 19 | address text, 20 | city text, 21 | state text, 22 | zip text, 23 | file_date timestamp, 24 | non_payment boolean, 25 | breach boolean, 26 | nuisance boolean, 27 | illegal_use boolean, 28 | failure_to_sign_renewal boolean, 29 | access_denial boolean, 30 | unapproved_subtenant boolean, 31 | owner_move_in boolean, 32 | demolition boolean, 33 | capital_improvement boolean, 34 | substantial_rehab boolean, 35 | ellis_act_withdrawal boolean, 36 | condo_conversion boolean, 37 | roommate_same_unit boolean, 38 | other_cause boolean, 39 | late_payments boolean, 40 | lead_remediation boolean, 41 | development boolean, 42 | good_samaritan_ends boolean, 43 | constraints_date timestamp, 44 | supervisor_district text, 45 | neighborhood text 46 | ); 47 | 48 | CREATE UNLOGGED TABLE raw.district_data ( 49 | district text, 50 | population text, 51 | households text, 52 | perc_asian text, 53 | perc_black text, 54 | perc_white text, 55 | perc_nat_am text, 56 | perc_nat_pac text, 57 | perc_other text, 58 | perc_latin text, 59 | median_age text, 60 | total_units text, 61 | perc_owner_occupied text, 62 | perc_renter_occupied text, 63 | median_rent_as_perc_of_income text, 64 | median_household_income text, 65 | median_family_income text, 66 | per_capita_income text, 67 | perc_in_poverty text 68 | ); 69 | CREATE UNIQUE INDEX district_name_uniq_idx ON raw.district_data (district); 70 | 71 | CREATE UNLOGGED TABLE raw.neighborhood_data ( 72 | acs_name text, 73 | db_name text, 74 | population text, 75 | households text, 76 | perc_asian text, 77 | perc_black text, 78 | perc_white text, 79 | perc_nat_am text, 80 | perc_nat_pac text, 81 | perc_other text, 82 | perc_latin text, 83 | median_age text, 84 | total_units text, 85 | perc_owner_occupied text, 86 | perc_renter_occupied text, 87 | 
median_rent_as_perc_of_income text, 88 | median_household_income text, 89 | median_family_income text, 90 | per_capita_income text, 91 | perc_in_poverty text 92 | ); 93 | CREATE UNIQUE INDEX neighborhood_name_uniq_idx ON raw.neighborhood_data (acs_name); 94 | 95 | -- Staging 96 | CREATE TABLE staging.dim_district ( 97 | district_key serial PRIMARY KEY, 98 | district text, 99 | population integer, 100 | households integer, 101 | percent_asian numeric, 102 | percent_black numeric, 103 | percent_white numeric, 104 | percent_native_am numeric, 105 | percent_pacific_isle numeric, 106 | percent_other_race numeric, 107 | percent_latin numeric, 108 | median_age numeric, 109 | total_units integer, 110 | percent_owner_occupied numeric, 111 | percent_renter_occupied numeric, 112 | median_rent_as_perc_of_income numeric, 113 | median_household_income numeric, 114 | median_family_income numeric, 115 | per_capita_income numeric, 116 | percent_in_poverty numeric 117 | ); 118 | CREATE UNIQUE INDEX district_name_uniq_idx ON staging.dim_district (district); 119 | 120 | CREATE TABLE staging.dim_neighborhood ( 121 | neighborhood_key serial PRIMARY KEY, 122 | neighborhood text, 123 | neighborhood_alt_name text, 124 | population integer, 125 | households integer, 126 | percent_asian numeric, 127 | percent_black numeric, 128 | percent_white numeric, 129 | percent_native_am numeric, 130 | percent_pacific_isle numeric, 131 | percent_other_race numeric, 132 | percent_latin numeric, 133 | median_age numeric, 134 | total_units integer, 135 | percent_owner_occupied numeric, 136 | percent_renter_occupied numeric, 137 | median_rent_as_perc_of_income numeric, 138 | median_household_income numeric, 139 | median_family_income numeric, 140 | per_capita_income numeric, 141 | percent_in_poverty numeric 142 | ); 143 | CREATE UNIQUE INDEX neighborhood_name_uniq_idx ON staging.dim_neighborhood (neighborhood); 144 | 145 | CREATE TABLE staging.dim_location ( 146 | location_key serial PRIMARY KEY, 147 | city text, 148 | state text, 149 | zip_code text 150 | ); 151 | 152 | CREATE TABLE staging.dim_reason ( 153 | reason_key serial PRIMARY KEY, 154 | reason_code text, 155 | reason_desc text 156 | ); 157 | 158 | CREATE TABLE staging.br_reason_group ( 159 | reason_group_key int, 160 | reason_key int 161 | ); 162 | CREATE INDEX reason_group_key_idx ON staging.br_reason_group (reason_group_key); 163 | CREATE INDEX reason_key_idx ON staging.br_reason_group (reason_key); 164 | 165 | CREATE TABLE staging.dim_date ( 166 | date_key int PRIMARY KEY, 167 | date date, 168 | year int, 169 | month int, 170 | month_name text, 171 | day int, 172 | day_of_year int, 173 | weekday_name text, 174 | calendar_week int, 175 | formatted_date text, 176 | quartal text, 177 | year_quartal text, 178 | year_month text, 179 | year_calendar_week text, 180 | weekend text, 181 | us_holiday text, 182 | period text, 183 | cw_start date, 184 | cw_end date, 185 | month_start date, 186 | month_end date 187 | ); 188 | 189 | CREATE TABLE staging.fact_evictions ( 190 | eviction_key text PRIMARY KEY, 191 | location_key int, 192 | district_key int, 193 | neighborhood_key int, 194 | reason_group_key int, 195 | file_date_key int, 196 | constraints_date_key int, 197 | street_address text 198 | ); 199 | 200 | 201 | -- Prod 202 | CREATE TABLE prod.dim_district ( 203 | district_key serial PRIMARY KEY, 204 | district text, 205 | population integer, 206 | households integer, 207 | percent_asian numeric, 208 | percent_black numeric, 209 | percent_white numeric, 210 | percent_native_am 
numeric, 211 | percent_pacific_isle numeric, 212 | percent_other_race numeric, 213 | percent_latin numeric, 214 | median_age numeric, 215 | total_units integer, 216 | percent_owner_occupied numeric, 217 | percent_renter_occupied numeric, 218 | median_rent_as_perc_of_income numeric, 219 | median_household_income numeric, 220 | median_family_income numeric, 221 | per_capita_income numeric, 222 | percent_in_poverty numeric 223 | ); 224 | 225 | CREATE TABLE prod.dim_neighborhood ( 226 | neighborhood_key serial PRIMARY KEY, 227 | neighborhood text, 228 | neighborhood_alt_name text, 229 | population integer, 230 | households integer, 231 | percent_asian numeric, 232 | percent_black numeric, 233 | percent_white numeric, 234 | percent_native_am numeric, 235 | percent_pacific_isle numeric, 236 | percent_other_race numeric, 237 | percent_latin numeric, 238 | median_age numeric, 239 | total_units integer, 240 | percent_owner_occupied numeric, 241 | percent_renter_occupied numeric, 242 | median_rent_as_perc_of_income numeric, 243 | median_household_income numeric, 244 | median_family_income numeric, 245 | per_capita_income numeric, 246 | percent_in_poverty numeric 247 | ); 248 | 249 | CREATE TABLE prod.dim_location ( 250 | location_key serial PRIMARY KEY, 251 | city text, 252 | state text, 253 | zip_code text 254 | ); 255 | 256 | CREATE TABLE prod.dim_reason ( 257 | reason_key serial PRIMARY KEY, 258 | reason_code text, 259 | reason_desc text 260 | ); 261 | 262 | CREATE TABLE prod.br_reason_group ( 263 | reason_group_key int, 264 | reason_key int 265 | ); 266 | CREATE INDEX reason_group_key_idx ON prod.br_reason_group (reason_group_key); 267 | CREATE INDEX reason_key_idx ON prod.br_reason_group (reason_key); 268 | 269 | CREATE TABLE prod.dim_date ( 270 | date_key int PRIMARY KEY, 271 | date date, 272 | year int, 273 | month int, 274 | month_name text, 275 | day int, 276 | day_of_year int, 277 | weekday_name text, 278 | calendar_week int, 279 | formatted_date text, 280 | quartal text, 281 | year_quartal text, 282 | year_month text, 283 | year_calendar_week text, 284 | weekend text, 285 | us_holiday text, 286 | period text, 287 | cw_start date, 288 | cw_end date, 289 | month_start date, 290 | month_end date 291 | ); 292 | 293 | CREATE TABLE prod.fact_evictions ( 294 | eviction_key text PRIMARY KEY, 295 | location_key int, 296 | district_key int, 297 | neighborhood_key int, 298 | reason_group_key int, 299 | file_date_key int, 300 | constraints_date_key int, 301 | street_address text 302 | ); 303 | -------------------------------------------------------------------------------- /dags/sql/full_load.sql: -------------------------------------------------------------------------------- 1 | -- echo "" > /home/airflow/airflow/dags/sql/full_load.sql 2 | -- nano /home/airflow/airflow/dags/sql/full_load.sql 3 | 4 | -- Populate District Dimension 5 | 6 | INSERT INTO staging.dim_district (district_key, district) 7 | SELECT -1, 'Unknown'; 8 | 9 | INSERT INTO staging.dim_district (district, population, households, percent_asian, percent_black, percent_white, percent_native_am, 10 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 11 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 12 | median_family_income, per_capita_income, percent_in_poverty) 13 | SELECT 14 | district, 15 | population::int, 16 | households::int, 17 | perc_asian::numeric as percent_asian, 18 | perc_black::numeric as percent_black, 19 | perc_white::numeric as 
percent_white, 20 | perc_nat_am::numeric as percent_native_am, 21 | perc_nat_pac::numeric as percent_pacific_isle, 22 | perc_other::numeric as percent_other_race, 23 | perc_latin::numeric as percent_latin, 24 | median_age::numeric, 25 | total_units::int, 26 | perc_owner_occupied::numeric as percent_owner_occupied, 27 | perc_renter_occupied::numeric as percent_renter_occupied, 28 | median_rent_as_perc_of_income::numeric, 29 | median_household_income::numeric, 30 | median_family_income::numeric, 31 | per_capita_income::numeric, 32 | perc_in_poverty::numeric as percent_in_poverty 33 | FROM raw.district_data; 34 | 35 | 36 | -- Populate Neighborhood Dimension 37 | 38 | INSERT INTO staging.dim_neighborhood (neighborhood_key, neighborhood) 39 | SELECT -1, 'Unknown'; 40 | 41 | INSERT INTO staging.dim_neighborhood (neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white, 42 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 43 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 44 | median_family_income, per_capita_income, percent_in_poverty) 45 | SELECT 46 | acs_name as neighborhood, 47 | db_name as neighborhood_alt_name, 48 | population::int, 49 | households::int, 50 | perc_asian::numeric as percent_asian, 51 | perc_black::numeric as percent_black, 52 | perc_white::numeric as percent_white, 53 | perc_nat_am::numeric as percent_native_am, 54 | perc_nat_pac::numeric as percent_pacific_isle, 55 | perc_other::numeric as percent_other_race, 56 | perc_latin::numeric as percent_latin, 57 | median_age::numeric, 58 | total_units::int, 59 | perc_owner_occupied::numeric as percent_owner_occupied, 60 | perc_renter_occupied::numeric as percent_renter_occupied, 61 | median_rent_as_perc_of_income::numeric, 62 | median_household_income::numeric, 63 | median_family_income::numeric, 64 | per_capita_income::numeric, 65 | perc_in_poverty::numeric as percent_in_poverty 66 | FROM raw.neighborhood_data; 67 | 68 | 69 | -- Populate Location Dimension 70 | 71 | INSERT INTO staging.dim_location (location_key, city, state, zip_code) 72 | SELECT -1, 'Unknown', 'Unknown', 'Unknown'; 73 | 74 | INSERT INTO staging.dim_location (city, state, zip_code) 75 | SELECT DISTINCT 76 | COALESCE(city, 'Unknown') as city, 77 | COALESCE(state, 'Unknown') as state, 78 | COALESCE(zip, 'Unknown') as zip_code 79 | FROM raw.soda_evictions 80 | WHERE 81 | city IS NOT NULL OR state IS NOT NULL OR zip IS NOT NULL; 82 | 83 | 84 | -- Populate Reason Dimension 85 | 86 | INSERT INTO staging.dim_reason (reason_key, reason_code, reason_desc) 87 | VALUES (-1, 'Unknown', 'Unknown'); 88 | 89 | INSERT INTO staging.dim_reason (reason_code, reason_desc) 90 | VALUES ('non_payment', 'Non-Payment'), 91 | ('breach', 'Breach'), 92 | ('nuisance', 'Nuisance'), 93 | ('illegal_use', 'Illegal Use'), 94 | ('failure_to_sign_renewal', 'Failure to Sign Renewal'), 95 | ('access_denial', 'Access Denial'), 96 | ('unapproved_subtenant', 'Unapproved Subtenant'), 97 | ('owner_move_in', 'Owner Move-In'), 98 | ('demolition', 'Demolition'), 99 | ('capital_improvement', 'Capital Improvement'), 100 | ('substantial_rehab', 'Substantial Rehab'), 101 | ('ellis_act_withdrawal', 'Ellis Act Withdrawal'), 102 | ('condo_conversion', 'Condo Conversion'), 103 | ('roommate_same_unit', 'Roommate Same Unit'), 104 | ('other_cause', 'Other Cause'), 105 | ('late_payments', 'Late Payments'), 106 | ('lead_remediation', 'Lead Remediation'), 107 | 
('development', 'Development'), 108 | ('good_samaritan_ends', 'Good Samaritan Ends'); 109 | 110 | 111 | -- Populate Reason Bridge Table 112 | 113 | SELECT 114 | ROW_NUMBER() OVER(ORDER BY concat_reason) as group_key, 115 | string_to_array(concat_reason, '|') as reason_array, 116 | concat_reason 117 | INTO TEMP tmp_reason_group 118 | FROM ( 119 | SELECT DISTINCT 120 | TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)) as concat_reason 121 | FROM ( 122 | SELECT 123 | eviction_id, 124 | CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END|| 125 | CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END|| 126 | CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END|| 127 | CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END|| 128 | CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END|| 129 | CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END|| 130 | CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END|| 131 | CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END|| 132 | CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END|| 133 | CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END|| 134 | CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END|| 135 | CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END|| 136 | CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END|| 137 | CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END|| 138 | CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END|| 139 | CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END|| 140 | CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END|| 141 | CASE WHEN development = 'true' THEN 'development|' ELSE '' END|| 142 | CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END 143 | as concat_reason 144 | FROM raw.soda_evictions 145 | ) f1 146 | ) f2; 147 | 148 | INSERT INTO staging.br_reason_group (reason_group_key, reason_key) 149 | SELECT DISTINCT 150 | group_key as reason_group_key, 151 | reason_key 152 | FROM (SELECT group_key, unnest(reason_array) unnested FROM tmp_reason_group) grp 153 | JOIN staging.dim_Reason r ON r.reason_code = grp.unnested; 154 | 155 | 156 | -- Populate Date Dimension Table 157 | 158 | INSERT INTO staging.dim_date (date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week, 159 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, us_holiday, 160 | period, cw_start, cw_end, month_start, month_end) 161 | SELECT -1, '1900-01-01', -1, -1, 'Unknown', -1, -1, 'Unknown', -1, 'Unknown', 'Unknown', 'Unknown', 'Unknown', 162 | 'Unknown', 'Unknown', 'Unknown', 'Unknown', '1900-01-01', '1900-01-01', '1900-01-01', '1900-01-01'; 163 | 164 | 165 | INSERT INTO staging.dim_date (date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week, 166 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, us_holiday, 167 | period, cw_start, cw_end, month_start, month_end) 168 | SELECT 169 | TO_CHAR(datum, 'yyyymmdd')::int as date_key, 170 | datum as date, 171 | EXTRACT(YEAR FROM datum) as year, 172 | EXTRACT(MONTH FROM datum) as month, 173 | TO_CHAR(datum, 'TMMonth') as month_name, 174 | EXTRACT(DAY FROM datum) as day, 175 | EXTRACT(doy FROM datum) as day_of_year, 176 | TO_CHAR(datum, 
'TMDay') as weekday_name, 177 | EXTRACT(week FROM datum) as calendar_week, 178 | TO_CHAR(datum, 'dd. mm. yyyy') as formatted_date, 179 | 'Q' || TO_CHAR(datum, 'Q') as quartal, 180 | TO_CHAR(datum, 'yyyy/"Q"Q') as year_quartal, 181 | TO_CHAR(datum, 'yyyy/mm') as year_month, 182 | TO_CHAR(datum, 'iyyy/IW') as year_calendar_week, 183 | CASE WHEN EXTRACT(isodow FROM datum) IN (6, 7) THEN 'Weekend' ELSE 'Weekday' END as weekend, 184 | CASE WHEN TO_CHAR(datum, 'MMDD') IN ('0101', '0704', '1225', '1226') THEN 'Holiday' ELSE 'No holiday' END 185 | as us_holiday, 186 | CASE WHEN TO_CHAR(datum, 'MMDD') BETWEEN '0701' AND '0831' THEN 'Summer break' 187 | WHEN TO_CHAR(datum, 'MMDD') BETWEEN '1115' AND '1225' THEN 'Christmas season' 188 | WHEN TO_CHAR(datum, 'MMDD') > '1225' OR TO_CHAR(datum, 'MMDD') <= '0106' THEN 'Winter break' 189 | ELSE 'Normal' END 190 | as period, 191 | datum + (1 - EXTRACT(isodow FROM datum))::int as cw_start, 192 | datum + (7 - EXTRACT(isodow FROM datum))::int as cw_end, 193 | datum + (1 - EXTRACT(DAY FROM datum))::int as month_start, 194 | (datum + (1 - EXTRACT(DAY FROM datum))::int + '1 month'::interval)::date - '1 day'::interval as month_end 195 | FROM ( 196 | SELECT '1997-01-01'::date + SEQUENCE.DAY as datum 197 | FROM generate_series(0,10956) as SEQUENCE(DAY) 198 | GROUP BY SEQUENCE.DAY 199 | ) DQ; 200 | 201 | 202 | -- Populate Evictions Fact Table 203 | 204 | SELECT 205 | eviction_id, 206 | group_key as reason_group_key 207 | INTO tmp_reason_facts 208 | FROM ( 209 | SELECT 210 | eviction_id, 211 | TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)) as concat_reason 212 | FROM ( 213 | SELECT 214 | eviction_id, 215 | CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END|| 216 | CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END|| 217 | CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END|| 218 | CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END|| 219 | CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END|| 220 | CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END|| 221 | CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END|| 222 | CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END|| 223 | CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END|| 224 | CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END|| 225 | CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END|| 226 | CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END|| 227 | CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END|| 228 | CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END|| 229 | CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END|| 230 | CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END|| 231 | CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END|| 232 | CASE WHEN development = 'true' THEN 'development|' ELSE '' END|| 233 | CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END 234 | as concat_reason 235 | FROM raw.soda_evictions 236 | ) grp 237 | ) f_grp 238 | JOIN tmp_reason_group t_grp ON f_grp.concat_reason = t_grp.concat_reason; 239 | 240 | 241 | INSERT INTO staging.fact_evictions (eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key, 242 | constraints_date_key, street_address) 243 | SELECT 244 | 
f.eviction_id as eviction_key, 245 | COALESCE(d.district_key, -1) as district_key, 246 | COALESCE(n.neighborhood_key, -1) as neighborhood_key, 247 | COALESCE(l.location_key, -1) as location_key, 248 | reason_group_key, 249 | COALESCE(dt1.date_key, -1) as file_date_key, 250 | COALESCE(dt2.date_key, -1) as constraints_date_key, 251 | f.address as street_address 252 | FROM raw.soda_evictions f 253 | LEFT JOIN tmp_reason_facts r ON f.eviction_id = r.eviction_id 254 | LEFT JOIN staging.dim_district d ON f.supervisor_district = d.district 255 | LEFT JOIN staging.dim_neighborhood n ON f.neighborhood = n.neighborhood_alt_name 256 | LEFT JOIN staging.dim_location l 257 | ON COALESCE(f.city, 'Unknown') = l.city 258 | AND COALESCE(f.state, 'Unknown') = l.state 259 | AND COALESCE(f.zip, 'Unknown') = l.zip_code 260 | LEFT JOIN staging.dim_date dt1 ON f.file_date = dt1.date 261 | LEFT JOIN staging.dim_date dt2 ON f.constraints_date = dt2.date; 262 | 263 | DROP TABLE tmp_reason_group; 264 | DROP TABLE tmp_reason_facts; 265 | 266 | 267 | -- Migrate to Production Schema 268 | 269 | INSERT INTO prod.dim_district 270 | (district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am, 271 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 272 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 273 | median_family_income, per_capita_income, percent_in_poverty) 274 | SELECT 275 | district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am, 276 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 277 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 278 | median_family_income, per_capita_income, percent_in_poverty 279 | FROM staging.dim_district; 280 | 281 | INSERT INTO prod.dim_neighborhood 282 | (neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white, 283 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 284 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 285 | median_family_income, per_capita_income, percent_in_poverty) 286 | SELECT 287 | neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white, 288 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 289 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 290 | median_family_income, per_capita_income, percent_in_poverty 291 | FROM staging.dim_neighborhood; 292 | 293 | INSERT INTO prod.dim_location (location_key, city, state, zip_code) 294 | SELECT location_key, city, state, zip_code 295 | FROM staging.dim_location; 296 | 297 | INSERT INTO prod.dim_reason (reason_key, reason_code, reason_desc) 298 | SELECT reason_key, reason_code, reason_desc 299 | FROM staging.dim_reason; 300 | 301 | INSERT INTO prod.br_reason_group (reason_group_key, reason_key) 302 | SELECT reason_group_key, reason_key 303 | FROM staging.br_reason_group; 304 | 305 | INSERT INTO prod.dim_date 306 | (date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week, 307 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, 
us_holiday, 308 | period, cw_start, cw_end, month_start, month_end) 309 | SELECT 310 | date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week, 311 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, us_holiday, period, 312 | cw_start, cw_end, month_start, month_end 313 | FROM staging.dim_date; 314 | 315 | INSERT INTO prod.fact_evictions 316 | (eviction_key, district_key, neighborhood_key, location_key, reason_group_key, 317 | file_date_key, constraints_date_key, street_address) 318 | SELECT 319 | eviction_key, district_key, neighborhood_key, location_key, reason_group_key, 320 | file_date_key, constraints_date_key, street_address 321 | FROM staging.fact_evictions; 322 | -------------------------------------------------------------------------------- /dags/sql/incremental_load.sql: -------------------------------------------------------------------------------- 1 | -- echo "" > /home/airflow/airflow/dags/sql/incremental_load.sql 2 | -- nano /home/airflow/airflow/dags/sql/incremental_load.sql 3 | 4 | -- Populate District Dimension 5 | 6 | INSERT INTO staging.dim_district 7 | (district, population, households, percent_asian, percent_black, percent_white, percent_native_am, 8 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 9 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 10 | median_family_income, per_capita_income, percent_in_poverty) 11 | SELECT 12 | district, 13 | population::int, 14 | households::int, 15 | perc_asian::numeric as percent_asian, 16 | perc_black::numeric as percent_black, 17 | perc_white::numeric as percent_white, 18 | perc_nat_am::numeric as percent_native_am, 19 | perc_nat_pac::numeric as percent_pacific_isle, 20 | perc_other::numeric as percent_other_race, 21 | perc_latin::numeric as percent_latin, 22 | median_age::numeric, 23 | total_units::int, 24 | perc_owner_occupied::numeric as percent_owner_occupied, 25 | perc_renter_occupied::numeric as percent_renter_occupied, 26 | median_rent_as_perc_of_income::numeric, 27 | median_household_income::numeric, 28 | median_family_income::numeric, 29 | per_capita_income::numeric, 30 | perc_in_poverty::numeric as percent_in_poverty 31 | FROM raw.district_data 32 | ON CONFLICT (district) DO UPDATE SET 33 | population = EXCLUDED.population, 34 | households = EXCLUDED.households, 35 | percent_asian = EXCLUDED.percent_asian, 36 | percent_black = EXCLUDED.percent_black, 37 | percent_white = EXCLUDED.percent_white, 38 | percent_native_am = EXCLUDED.percent_native_am, 39 | percent_pacific_isle = EXCLUDED.percent_pacific_isle, 40 | percent_other_race = EXCLUDED.percent_other_race, 41 | percent_latin = EXCLUDED.percent_latin, 42 | median_age = EXCLUDED.median_age, 43 | total_units = EXCLUDED.total_units, 44 | percent_owner_occupied = EXCLUDED.percent_owner_occupied, 45 | percent_renter_occupied = EXCLUDED.percent_renter_occupied, 46 | median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income, 47 | median_household_income = EXCLUDED.median_household_income, 48 | median_family_income = EXCLUDED.median_family_income, 49 | per_capita_income = EXCLUDED.per_capita_income, 50 | percent_in_poverty = EXCLUDED.percent_in_poverty; 51 | 52 | 53 | -- Populate Neighborhood Dimension 54 | 55 | INSERT INTO staging.dim_neighborhood 56 | (neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white, 57 | percent_native_am, percent_pacific_isle, 
percent_other_race, percent_latin, median_age, total_units, 58 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 59 | median_family_income, per_capita_income, percent_in_poverty) 60 | SELECT 61 | acs_name as neighborhood, 62 | db_name as neighborhood_alt_name, 63 | population::int, 64 | households::int, 65 | perc_asian::numeric as percent_asian, 66 | perc_black::numeric as percent_black, 67 | perc_white::numeric as percent_white, 68 | perc_nat_am::numeric as percent_native_am, 69 | perc_nat_pac::numeric as percent_pacific_isle, 70 | perc_other::numeric as percent_other_race, 71 | perc_latin::numeric as percent_latin, 72 | median_age::numeric, 73 | total_units::int, 74 | perc_owner_occupied::numeric as percent_owner_occupied, 75 | perc_renter_occupied::numeric as percent_renter_occupied, 76 | median_rent_as_perc_of_income::numeric, 77 | median_household_income::numeric, 78 | median_family_income::numeric, 79 | per_capita_income::numeric, 80 | perc_in_poverty::numeric as percent_in_poverty 81 | FROM raw.neighborhood_data 82 | ON CONFLICT (neighborhood) DO UPDATE SET 83 | neighborhood_alt_name = EXCLUDED.neighborhood_alt_name, 84 | population = EXCLUDED.population, 85 | households = EXCLUDED.households, 86 | percent_asian = EXCLUDED.percent_asian, 87 | percent_black = EXCLUDED.percent_black, 88 | percent_white = EXCLUDED.percent_white, 89 | percent_native_am = EXCLUDED.percent_native_am, 90 | percent_pacific_isle = EXCLUDED.percent_pacific_isle, 91 | percent_other_race = EXCLUDED.percent_other_race, 92 | percent_latin = EXCLUDED.percent_latin, 93 | median_age = EXCLUDED.median_age, 94 | total_units = EXCLUDED.total_units, 95 | percent_owner_occupied = EXCLUDED.percent_owner_occupied, 96 | percent_renter_occupied = EXCLUDED.percent_renter_occupied, 97 | median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income, 98 | median_household_income = EXCLUDED.median_household_income, 99 | median_family_income = EXCLUDED.median_family_income, 100 | per_capita_income = EXCLUDED.per_capita_income, 101 | percent_in_poverty = EXCLUDED.percent_in_poverty; 102 | 103 | 104 | -- Populate Location Dimension 105 | 106 | INSERT INTO staging.dim_location (city, state, zip_code) 107 | SELECT 108 | se.city, 109 | se.state, 110 | se.zip_code 111 | FROM ( 112 | SELECT DISTINCT 113 | COALESCE(city, 'Unknown') as city, 114 | COALESCE(state, 'Unknown') as state, 115 | COALESCE(zip, 'Unknown') as zip_code 116 | FROM raw.soda_evictions 117 | ) se 118 | LEFT JOIN staging.dim_location dl 119 | ON se.city = dl.city 120 | AND se.state = dl.state 121 | AND se.zip_code = dl.zip_code 122 | WHERE 123 | dl.location_key IS NULL; 124 | 125 | 126 | -- Populate Reason Bridge Table 127 | 128 | SELECT DISTINCT 129 | reason_group_key, 130 | ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array 131 | INTO TEMP tmp_existing_reason_groups 132 | FROM staging.br_reason_group 133 | GROUP BY reason_group_key; 134 | 135 | SELECT 136 | concat_reason, 137 | ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array 138 | INTO TEMP tmp_new_reason_groups 139 | FROM ( 140 | SELECT DISTINCT 141 | string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|') as concat_reason, 142 | unnest(string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|')) unnested_reason 143 | FROM ( 144 | SELECT DISTINCT 145 | CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END|| 146 | 

-- Populate Reason Bridge Table

SELECT DISTINCT
    reason_group_key,
    ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
INTO TEMP tmp_existing_reason_groups
FROM staging.br_reason_group
GROUP BY reason_group_key;

SELECT
    concat_reason,
    ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
INTO TEMP tmp_new_reason_groups
FROM (
    SELECT DISTINCT
        string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|') as concat_reason,
        unnest(string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|')) unnested_reason
    FROM (
        SELECT DISTINCT
            CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END||
            CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END||
            CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END||
            CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END||
            CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END||
            CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END||
            CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END||
            CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END||
            CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END||
            CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END||
            CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END||
            CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END||
            CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END||
            CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END||
            CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END||
            CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END||
            CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END||
            CASE WHEN development = 'true' THEN 'development|' ELSE '' END||
            CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END
            as concat_reason
        FROM raw.soda_evictions
    ) se1
    GROUP BY concat_reason
) se2
JOIN staging.dim_reason r ON se2.unnested_reason = r.reason_code
GROUP BY concat_reason;

INSERT INTO staging.br_reason_group (reason_group_key, reason_key)
SELECT
    -- COALESCE guards the first run, when staging.br_reason_group is empty and MAX() is NULL
    COALESCE(final_grp.max_key, 0) + new_grp.tmp_group_key as reason_group_key,
    new_grp.reason_key as reason_key
FROM (
    SELECT DISTINCT
        ROW_NUMBER() OVER(ORDER BY concat_reason) as tmp_group_key,
        concat_reason,
        unnest(n.rk_array) as reason_key
    FROM tmp_new_reason_groups n
    LEFT JOIN tmp_existing_reason_groups e ON n.rk_array = e.rk_array
    WHERE e.reason_group_key IS NULL
) new_grp
LEFT JOIN (SELECT MAX(reason_group_key) max_key FROM staging.br_reason_group) final_grp ON 1=1
ORDER BY reason_group_key, reason_key;

DROP TABLE tmp_existing_reason_groups;
DROP TABLE tmp_new_reason_groups;
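-- NOTE (editor's aside): matching reason groups via ARRAY_AGG(... ORDER BY reason_key) works
-- because PostgreSQL compares arrays element by element, so both sides must be built in the
-- same order:
--
--   -- SELECT ARRAY[2,5,9] = ARRAY[2,5,9];   -- true
--   -- SELECT ARRAY[2,5,9] = ARRAY[9,5,2];   -- false: same members, different order
--
-- Dropping the ORDER BY inside either ARRAY_AGG would let n.rk_array = e.rk_array silently
-- miss existing groups and mint duplicate reason_group_keys.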

-- Populate Staging Fact Table

SELECT DISTINCT
    reason_group_key,
    ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
INTO TEMP tmp_existing_reason_groups
FROM staging.br_reason_group
GROUP BY reason_group_key;

SELECT
    eviction_id,
    ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
INTO TEMP tmp_fct_reason_groups
FROM (
    SELECT
        eviction_id,
        unnest(string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|')) as unnested_reason
    FROM (
        SELECT
            eviction_id,
            CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END||
            CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END||
            CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END||
            CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END||
            CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END||
            CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END||
            CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END||
            CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END||
            CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END||
            CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END||
            CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END||
            CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END||
            CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END||
            CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END||
            CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END||
            CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END||
            CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END||
            CASE WHEN development = 'true' THEN 'development|' ELSE '' END||
            CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END
            as concat_reason
        FROM raw.soda_evictions
    ) se1
) se2
JOIN staging.dim_reason r ON se2.unnested_reason = r.reason_code
GROUP BY se2.eviction_id;

-- TEMP added so this lookup is session-local like the other scratch tables,
-- rather than a permanent table left behind if the load aborts before the DROP below
SELECT
    eviction_id,
    reason_group_key
INTO TEMP tmp_reason_group_lookup
FROM tmp_fct_reason_groups f
JOIN tmp_existing_reason_groups d ON f.rk_array = d.rk_array;


TRUNCATE TABLE staging.fact_evictions;

INSERT INTO staging.fact_evictions
    (eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key, constraints_date_key, street_address)
SELECT
    f.eviction_id as eviction_key,
    COALESCE(d.district_key, -1) as district_key,
    COALESCE(n.neighborhood_key, -1) as neighborhood_key,
    COALESCE(l.location_key, -1) as location_key,
    r.reason_group_key as reason_group_key,
    COALESCE(dt1.date_key, -1) as file_date_key,
    COALESCE(dt2.date_key, -1) as constraints_date_key,
    f.address as street_address
FROM raw.soda_evictions f
JOIN tmp_reason_group_lookup r ON f.eviction_id = r.eviction_id
LEFT JOIN staging.dim_district d ON f.supervisor_district = d.district
LEFT JOIN staging.dim_neighborhood n ON f.neighborhood = n.neighborhood_alt_name
LEFT JOIN staging.dim_location l
    ON COALESCE(f.city, 'Unknown') = l.city
    AND COALESCE(f.state, 'Unknown') = l.state
    AND COALESCE(f.zip, 'Unknown') = l.zip_code
LEFT JOIN staging.dim_date dt1 ON f.file_date = dt1.date
LEFT JOIN staging.dim_date dt2 ON f.constraints_date = dt2.date;

DROP TABLE tmp_existing_reason_groups;
DROP TABLE tmp_fct_reason_groups;
DROP TABLE tmp_reason_group_lookup;
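-- NOTE (editor's aside): the COALESCE(..., -1) defaults assume each dimension carries a
-- key -1 "Unknown" member, presumably seeded by init_db_schema.sql. If that row is missing,
-- unmatched facts would violate the foreign keys. A hypothetical seed for one dimension:
--
--   -- INSERT INTO staging.dim_location (location_key, city, state, zip_code)
--   -- VALUES (-1, 'Unknown', 'Unknown', 'Unknown')
--   -- ON CONFLICT (location_key) DO NOTHING;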

-- Merge Into Production Schema

INSERT INTO prod.dim_district
    (district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am,
     percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
     percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
     median_family_income, per_capita_income, percent_in_poverty)
SELECT
    district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am,
    percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
    percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
    median_family_income, per_capita_income, percent_in_poverty
FROM staging.dim_district
ON CONFLICT (district_key) DO UPDATE SET
    district = EXCLUDED.district,
    population = EXCLUDED.population,
    households = EXCLUDED.households,
    percent_asian = EXCLUDED.percent_asian,
    percent_black = EXCLUDED.percent_black,
    percent_white = EXCLUDED.percent_white,
    percent_native_am = EXCLUDED.percent_native_am,
    percent_pacific_isle = EXCLUDED.percent_pacific_isle,
    percent_other_race = EXCLUDED.percent_other_race,
    percent_latin = EXCLUDED.percent_latin,
    median_age = EXCLUDED.median_age,
    total_units = EXCLUDED.total_units,
    percent_owner_occupied = EXCLUDED.percent_owner_occupied,
    percent_renter_occupied = EXCLUDED.percent_renter_occupied,
    median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income,
    median_household_income = EXCLUDED.median_household_income,
    median_family_income = EXCLUDED.median_family_income,
    per_capita_income = EXCLUDED.per_capita_income,
    percent_in_poverty = EXCLUDED.percent_in_poverty;


INSERT INTO prod.dim_neighborhood
    (neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white,
     percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
     percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
     median_family_income, per_capita_income, percent_in_poverty)
SELECT
    neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white,
    percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
    percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
    median_family_income, per_capita_income, percent_in_poverty
FROM staging.dim_neighborhood
ON CONFLICT (neighborhood_key) DO UPDATE SET
    neighborhood = EXCLUDED.neighborhood,
    neighborhood_alt_name = EXCLUDED.neighborhood_alt_name,
    population = EXCLUDED.population,
    households = EXCLUDED.households,
    percent_asian = EXCLUDED.percent_asian,
    percent_black = EXCLUDED.percent_black,
    percent_white = EXCLUDED.percent_white,
    percent_native_am = EXCLUDED.percent_native_am,
    percent_pacific_isle = EXCLUDED.percent_pacific_isle,
    percent_other_race = EXCLUDED.percent_other_race,
    percent_latin = EXCLUDED.percent_latin,
    median_age = EXCLUDED.median_age,
    total_units = EXCLUDED.total_units,
    percent_owner_occupied = EXCLUDED.percent_owner_occupied,
    percent_renter_occupied = EXCLUDED.percent_renter_occupied,
    median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income,
    median_household_income = EXCLUDED.median_household_income,
    median_family_income = EXCLUDED.median_family_income,
    per_capita_income = EXCLUDED.per_capita_income,
    percent_in_poverty = EXCLUDED.percent_in_poverty;


INSERT INTO prod.dim_location (location_key, city, state, zip_code)
SELECT location_key, city, state, zip_code
FROM staging.dim_location
ON CONFLICT (location_key) DO NOTHING;
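-- NOTE (editor's aside): DO NOTHING fits dim_location because its attributes (city, state,
-- zip_code) are the identity of the row and never change under a given key, whereas the
-- census dimensions above use DO UPDATE so refreshed figures overwrite stale ones. As a
-- quick drift check between staging and prod, a hypothetical query that should return
-- zero rows:
--
--   -- SELECT s.location_key FROM staging.dim_location s
--   -- JOIN prod.dim_location p USING (location_key)
--   -- WHERE ROW(s.city, s.state, s.zip_code) IS DISTINCT FROM ROW(p.city, p.state, p.zip_code);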

INSERT INTO prod.br_reason_group (reason_group_key, reason_key)
SELECT stg.reason_group_key, stg.reason_key
FROM staging.br_reason_group stg
LEFT JOIN prod.br_reason_group prd
    ON stg.reason_group_key = prd.reason_group_key
    AND stg.reason_key = prd.reason_key
WHERE
    prd.reason_group_key IS NULL;


INSERT INTO prod.fact_evictions
    (eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key, constraints_date_key, street_address)
SELECT eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key, constraints_date_key, street_address
FROM staging.fact_evictions
ON CONFLICT (eviction_key) DO UPDATE SET
    district_key = EXCLUDED.district_key,
    neighborhood_key = EXCLUDED.neighborhood_key,
    location_key = EXCLUDED.location_key,
    reason_group_key = EXCLUDED.reason_group_key,
    file_date_key = EXCLUDED.file_date_key,
    constraints_date_key = EXCLUDED.constraints_date_key,
    street_address = EXCLUDED.street_address;
--------------------------------------------------------------------------------