├── dags
│   ├── sql
│   │   ├── trunc_target_tables.sql
│   │   ├── init_db_schema.sql
│   │   ├── full_load.sql
│   │   └── incremental_load.sql
│   ├── full_load_dag.py
│   ├── incremental_load_dag.py
│   └── operators
│       ├── soda_to_s3_operator.py
│       └── s3_to_postgres_operator.py
├── airflow_installation.txt
└── README.md
/dags/sql/trunc_target_tables.sql: -------------------------------------------------------------------------------- 1 | -- echo "" > /home/airflow/airflow/dags/sql/trunc_target_tables.sql 2 | -- nano /home/airflow/airflow/dags/sql/trunc_target_tables.sql 3 | TRUNCATE TABLE raw.soda_evictions; 4 | TRUNCATE TABLE raw.district_data; 5 | TRUNCATE TABLE raw.neighborhood_data; 6 | -------------------------------------------------------------------------------- /dags/full_load_dag.py: -------------------------------------------------------------------------------- 1 | # echo "" > /home/airflow/airflow/dags/full_load_dag.py 2 | # nano /home/airflow/airflow/dags/full_load_dag.py 3 | 4 | from airflow import DAG 5 | from airflow.operators.postgres_operator import PostgresOperator 6 | from operators.soda_to_s3_operator import SodaToS3Operator 7 | from operators.s3_to_postgres_operator import S3ToPostgresOperator 8 | from airflow.utils.dates import days_ago 9 | from datetime import timedelta 10 | 11 | soda_headers = { 12 | 'keyId':'############', 13 | 'keySecret':'#################', 14 | 'Accept':'application/json' 15 | } 16 | 17 | default_args = { 18 | 'owner': 'airflow', 19 | 'depends_on_past': False, 20 | 'start_date': days_ago(2), 21 | 'email': ['airflow@example.com'], 22 | 'email_on_failure': False, 23 | 'email_on_retry': False, 24 | 'retries': 1, 25 | 'retry_delay': timedelta(seconds=30) 26 | } 27 | 28 | with DAG('eviction-tracker-full_load', 29 | default_args=default_args, 30 | description='Executes full load from SODA API to Production DW.', 31 | max_active_runs=1, 32 | schedule_interval=None) as dag: 33 | 34 | op1 = SodaToS3Operator( 35 | task_id='get_evictions_data', 36 | http_conn_id='API_Evictions', 37 | headers=soda_headers, 38 | s3_conn_id='S3_Evictions', 39 | s3_bucket='sf-evictionmeter', 40 | s3_directory='soda_jsons', 41 | size_check=True, 42 | max_bytes=500000000, 43 | dag=dag 44 | ) 45 | 46 | op2 = PostgresOperator( 47 | task_id='initialize_target_db', 48 | postgres_conn_id='RDS_Evictions', 49 | sql='sql/init_db_schema.sql', 50 | dag=dag 51 | ) 52 | 53 | op3 = S3ToPostgresOperator( 54 | task_id='load_evictions_data', 55 | s3_conn_id='S3_Evictions', 56 | s3_bucket='sf-evictionmeter', 57 | s3_prefix='soda_jsons/soda_evictions_import', 58 | source_data_type='json', 59 | postgres_conn_id='RDS_Evictions', 60 | schema='raw', 61 | table='soda_evictions', 62 | get_latest=True, 63 | dag=dag 64 | ) 65 | 66 | op4 = S3ToPostgresOperator( 67 | task_id='load_neighborhood_data', 68 | s3_conn_id='S3_Evictions', 69 | s3_bucket='sf-evictionmeter', 70 | s3_prefix='census_csv/sf_by_neighborhood', 71 | source_data_type='csv', 72 | header=True, 73 | postgres_conn_id='RDS_Evictions', 74 | schema='raw', 75 | table='neighborhood_data', 76 | get_latest=True, 77 | dag=dag 78 | ) 79 | 80 | op5 = S3ToPostgresOperator( 81 | task_id='load_district_data', 82 | s3_conn_id='S3_Evictions', 83 | s3_bucket='sf-evictionmeter', 84 | s3_prefix='census_csv/sf_by_district', 85 | source_data_type='csv', 86 | header=True, 87 | postgres_conn_id='RDS_Evictions', 88 | schema='raw', 89 | table='district_data', 90 | get_latest=True, 91 | dag=dag 92 | ) 93 | 94 | op6 = PostgresOperator( 95 | task_id='execute_full_load', 96 |
postgres_conn_id='RDS_Evictions', 97 | sql='sql/full_load.sql', 98 | dag=dag 99 | ) 100 | 101 | op1 >> op2 >> [op3, op4, op5] >> op6 102 | -------------------------------------------------------------------------------- /dags/incremental_load_dag.py: -------------------------------------------------------------------------------- 1 | # echo "" > /home/airflow/airflow/dags/incremental_load_dag.py 2 | # nano /home/airflow/airflow/dags/incremental_load_dag.py 3 | 4 | from airflow import DAG 5 | from airflow.operators.postgres_operator import PostgresOperator 6 | from airflow.operators.python_operator import ShortCircuitOperator 7 | from operators.soda_to_s3_operator import SodaToS3Operator 8 | from operators.s3_to_postgres_operator import S3ToPostgresOperator 9 | from airflow.utils.dates import days_ago 10 | from datetime import timedelta 11 | 12 | soda_headers = { 13 | 'keyId':'############', 14 | 'keySecret':'#################', 15 | 'Accept':'application/json' 16 | } 17 | 18 | default_args = { 19 | 'owner': 'airflow', 20 | 'depends_on_past': False, 21 | 'start_date': days_ago(2), 22 | 'email': ['airflow@example.com'], 23 | 'email_on_failure': False, 24 | 'email_on_retry': False, 25 | 'retries': 1, 26 | 'retry_delay': timedelta(seconds=30) 27 | } 28 | 29 | def get_size(**context): 30 | val = context['ti'].xcom_pull(key='obj_len') 31 | return val > 0 32 | 33 | 34 | with DAG('eviction-tracker-incremental_load', 35 | default_args=default_args, 36 | description='Executes incremental load from SODA API & S3-hosted CSVs into Production DW.', 37 | max_active_runs=1, 38 | schedule_interval=None) as dag: 39 | 40 | op1 = SodaToS3Operator( 41 | task_id='get_evictions_data', 42 | http_conn_id='API_Evictions', 43 | headers=soda_headers, 44 | days_ago=31, 45 | s3_conn_id='S3_Evictions', 46 | s3_bucket='sf-evictionmeter', 47 | s3_directory='soda_jsons', 48 | size_check=True, 49 | max_bytes=500000000, 50 | dag=dag 51 | ) 52 | 53 | op2 = ShortCircuitOperator( 54 | task_id='check_get_results', 55 | python_callable=get_size, 56 | provide_context=True, 57 | dag=dag 58 | ) 59 | 60 | op3 = PostgresOperator( 61 | task_id='truncate_target_tables', 62 | postgres_conn_id='RDS_Evictions', 63 | sql='sql/trunc_target_tables.sql', 64 | dag=dag 65 | ) 66 | 67 | op4 = S3ToPostgresOperator( 68 | task_id='load_evictions_data', 69 | s3_conn_id='S3_Evictions', 70 | s3_bucket='sf-evictionmeter', 71 | s3_prefix='soda_jsons/soda_evictions_import', 72 | source_data_type='json', 73 | postgres_conn_id='RDS_Evictions', 74 | schema='raw', 75 | table='soda_evictions', 76 | get_latest=True, 77 | dag=dag 78 | ) 79 | 80 | op5 = S3ToPostgresOperator( 81 | task_id='load_neighborhood_data', 82 | s3_conn_id='S3_Evictions', 83 | s3_bucket='sf-evictionmeter', 84 | s3_prefix='census_csv/sf_by_neighborhood', 85 | source_data_type='csv', 86 | header=True, 87 | postgres_conn_id='RDS_Evictions', 88 | schema='raw', 89 | table='neighborhood_data', 90 | get_latest=True, 91 | dag=dag 92 | ) 93 | 94 | op6 = S3ToPostgresOperator( 95 | task_id='load_district_data', 96 | s3_conn_id='S3_Evictions', 97 | s3_bucket='sf-evictionmeter', 98 | s3_prefix='census_csv/sf_by_district', 99 | source_data_type='csv', 100 | header=True, 101 | postgres_conn_id='RDS_Evictions', 102 | schema='raw', 103 | table='district_data', 104 | get_latest=True, 105 | dag=dag 106 | ) 107 | 108 | op7 = PostgresOperator( 109 | task_id='execute_incremental_load', 110 | postgres_conn_id='RDS_Evictions', 111 | sql='sql/incremental_load.sql', 112 | dag=dag 113 | ) 114 |
115 | op1 >> op2 >> op3 >> [op4, op5, op6] >> op7 116 | -------------------------------------------------------------------------------- /airflow_installation.txt: -------------------------------------------------------------------------------- 1 | 2 | STEP 1 - Launch EC2 Instance: 3 | - t3.medium 4 | - 12 GB storage 5 | - launch-wizard-3 security group to open TCP Port 8080 6 | - associate elastic IP 7 | 8 | 9 | STEP 2 - Install Postgres Server on EC2: 10 | run: 11 | sudo apt-get update 12 | sudo apt-get install python-psycopg2 13 | sudo apt-get install postgresql postgresql-contrib 14 | 15 | 16 | Step 3 - Create OS User airflow 17 | run: 18 | sudo adduser airflow 19 | sudo usermod -aG sudo airflow 20 | su - airflow 21 | 22 | Note: From here on, make sure you are logged in as the airflow user. 23 | 24 | 25 | Step 4 - Create Postgres Metadatabase and User Access 26 | run: 27 | sudo -u postgres psql 28 | 29 | in postgres prompt: 30 | CREATE USER airflow PASSWORD 'password'; 31 | CREATE DATABASE airflow; 32 | GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO airflow; 33 | \q 34 | 35 | 36 | Step 5 - Change Postgres Connection Config 37 | run: 38 | sudo nano /etc/postgresql/10/main/pg_hba.conf 39 | 40 | Change this line - 41 | # IPv4 local connections: 42 | host all all 127.0.0.1/32 md5 43 | To this line - 44 | # IPv4 local connections: 45 | host all all 0.0.0.0/0 trust 46 | 47 | run: 48 | sudo nano /etc/postgresql/10/main/postgresql.conf 49 | 50 | Change this line - 51 | #listen_addresses = 'localhost' # what IP address(es) to listen on 52 | To this line - 53 | listen_addresses = '*' # what IP address(es) to listen on 54 | 55 | restart postgres server: 56 | sudo service postgresql restart 57 | 58 | 59 | Step 6 - Install Airflow 60 | 61 | run: 62 | su - airflow 63 | sudo apt-get install python3-pip 64 | sudo python3 -m pip install 'apache-airflow[postgres,s3,aws]' 65 | 66 | run: 67 | airflow initdb 68 | 69 | 70 | Step 7 - Connect Airflow to Postgres 71 | 72 | run: 73 | nano /home/airflow/airflow/airflow.cfg 74 | 75 | Change lines - 76 | sql_alchemy_conn = postgresql+psycopg2://airflow:password@localhost:5432/airflow 77 | executor = LocalExecutor 78 | load_examples = False 79 | 80 | run: 81 | airflow initdb 82 | 83 | 84 | Step 8 - Add DAGs: 85 | mkdir /home/airflow/airflow/dags/ 86 | cd /home/airflow/airflow/dags/ 87 | touch tutorial.py 88 | nano tutorial.py 89 | 90 | 91 | Step 9 - Setup Airflow Webserver and Scheduler to start automatically 92 | We are almost there. The final thing we need to do is to ensure airflow starts up when your ec2 instance starts.
93 | 94 | sudo nano /etc/systemd/system/airflow-webserver.service 95 | 96 | Paste the following into the file created above 97 | 98 | [Unit] 99 | Description=Airflow webserver daemon 100 | After=network.target postgresql.service 101 | Wants=postgresql.service 102 | [Service] 103 | EnvironmentFile=/etc/environment 104 | User=airflow 105 | Group=airflow 106 | Type=simple 107 | ExecStart=/usr/local/bin/airflow webserver 108 | Restart=on-failure 109 | RestartSec=5s 110 | PrivateTmp=true 111 | [Install] 112 | WantedBy=multi-user.target 113 | 114 | Next we will create the following file to enable the scheduler service 115 | 116 | sudo nano /etc/systemd/system/airflow-scheduler.service 117 | 118 | Paste the following 119 | 120 | [Unit] 121 | Description=Airflow scheduler daemon 122 | After=network.target postgresql.service 123 | Wants=postgresql.service 124 | [Service] 125 | EnvironmentFile=/etc/environment 126 | User=airflow 127 | Group=airflow 128 | Type=simple 129 | ExecStart=/usr/local/bin/airflow scheduler 130 | Restart=always 131 | RestartSec=5s 132 | [Install] 133 | WantedBy=multi-user.target 134 | 135 | Next enable these services and check their status 136 | 137 | sudo systemctl enable airflow-webserver.service 138 | sudo systemctl enable airflow-scheduler.service 139 | sudo systemctl start airflow-scheduler 140 | sudo systemctl start airflow-webserver 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SF-EvictionTracker 2 | 3 | Tracking eviction trends in San Francisco across filing reasons, districts, neighborhoods, and demographics in the months following COVID-19. Data warehouse infrastructure is housed in the AWS ecosystem and uses Apache Airflow for orchestration, with public-facing dashboards created using Metabase. 4 | 5 | Questions? Feel free to reach me at ilya.glprn@gmail.com. 6 | 7 | Public Dashboard Link: http://sf-evictiontracker-metabase.us-east-1.elasticbeanstalk.com/public/dashboard/f637e470-8ea9-4b03-af80-53988e5b6a9b 8 | 9 | 10 |

ARCHITECTURE:

11 | 12 | ![Architecture](https://i.imgur.com/s2gLBZt.png) 13 | Data is sourced from San Francisco Open Data's API and from CSVs containing San Francisco district and neighborhood aggregate census results. Airflow orchestrates the data's movement to an S3 bucket and into a data warehouse hosted in RDS. SQL scripts are then run to transform the data from its raw form through a staging schema and into production target tables. The presentation layer is created using Metabase, an open-source data visualization tool, and deployed using Elastic Beanstalk. 14 | 15 |
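The transform step is driven entirely by SQL files checked into the repo. A minimal sketch of how a transform task is wired (this mirrors the PostgresOperator usage in full_load_dag.py above; the DAG id here is illustrative):

```python
# Minimal sketch (illustrative DAG id): the raw -> staging -> prod transform
# is a single task pointed at a SQL file, as in full_load_dag.py above.
from airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.dates import days_ago

with DAG('transform_sketch', start_date=days_ago(1), schedule_interval=None) as dag:
    raw_to_prod = PostgresOperator(
        task_id='raw_to_prod',
        postgres_conn_id='RDS_Evictions',  # project connection ID
        sql='sql/full_load.sql',           # runs raw -> staging -> prod
    )
```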

DATA MODEL:

16 | 17 | Dimension Tables: 18 | `dim_district` 19 | `dim_neighborhood` 20 | `dim_location` 21 | `dim_reason` 22 | `dim_date` 23 | `br_reason_group` 24 | 25 | Fact Tables: 26 | `fact_evictions` 27 | 28 | The data model is implemented using a star schema with a bridge table to accommodate any new combination of reasons cited on an eviction filing. More information on bridge tables can be found here: https://www.kimballgroup.com/2012/02/design-tip-142-building-bridges/ 29 | 30 | ![Model](https://i.imgur.com/uInBlzR.png) 31 |
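To make the bridge concrete, here is a query sketch (the table and column names come from `init_db_schema.sql` later in this repo; the connection string is a placeholder): each fact row carries a `reason_group_key`, and the bridge fans it out to one row per reason, so an eviction citing several reasons is counted once under each.

```python
# Sketch: counting evictions by reason through the bridge table.
# The DSN below is a placeholder; tables come from dags/sql/init_db_schema.sql.
import psycopg2

BRIDGE_QUERY = """
    SELECT r.reason_desc, COUNT(DISTINCT f.eviction_key) AS evictions
    FROM prod.fact_evictions f
    JOIN prod.br_reason_group b ON b.reason_group_key = f.reason_group_key
    JOIN prod.dim_reason r ON r.reason_key = b.reason_key
    GROUP BY r.reason_desc
    ORDER BY evictions DESC;
"""

with psycopg2.connect('postgresql://user:password@host:5432/dw') as conn:
    with conn.cursor() as cur:
        cur.execute(BRIDGE_QUERY)
        for reason_desc, evictions in cur.fetchall():
            print(reason_desc, evictions)
```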

ETL FLOW:

32 | 33 | General Overview - 34 | - Evictions data is collected from the SODA API and moved into an S3 Bucket 35 | - Neighborhood/district census data is stored as a CSV in S3 36 | - Once the API load to S3 is complete, data is moved into RDS into a "raw" schema and moves through a staging schema for processing 37 | - ETL job execution is complete once data is moved from the staging schema into the final production tables 38 | 39 | DAGs and Custom Airflow Operators - 40 | 41 | ![Ops](https://i.imgur.com/WTOUiGU.jpg) 42 | ![Dag](https://i.imgur.com/yJb3DKT.jpg) 43 | 44 | Two DAGs (Directed Acyclic Graphs) are used for this project: a full load, which should be executed on initial setup, and an incremental load, which is scheduled to run daily and pulls new data from the Socrata Open Data API. 45 | 46 | The incremental load DAG uses XCom to pass the filesize of the load between the API call task and a ShortCircuitOperator, which skips downstream tasks if the API call produces no results. 47 | 48 | The DAGs use two custom operators. They have been purpose-built for this project but are easily extended for use in other data pipelines. A usage sketch follows the descriptions below. 49 | 50 | 1. soda_to_s3_operator: Queries the Socrata Open Data API using a SoQL string and uploads the results to an S3 bucket. Includes an optional check that aborts the ETL run if the source data size exceeds a user-defined limit. 51 | 52 | 2. s3_to_postgres_operator: Collects data from a file hosted on AWS S3 and loads it into a Postgres table. The current version supports JSON and CSV source data types. 53 | 54 | 55 |
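A usage sketch chaining the two operators (connection IDs, bucket, and table names match the DAGs above; the SoQL string passed via `data` is a hypothetical override of the operator's default query):

```python
from airflow import DAG
from airflow.utils.dates import days_ago
from operators.soda_to_s3_operator import SodaToS3Operator
from operators.s3_to_postgres_operator import S3ToPostgresOperator

with DAG('operator_sketch', start_date=days_ago(1), schedule_interval=None) as dag:

    get_data = SodaToS3Operator(
        task_id='get_data',
        http_conn_id='API_Evictions',
        # hypothetical SoQL override; omit `data` to use the operator's default query
        data="$query=SELECT:*,* WHERE :updated_at > '2020-01-01' ORDER BY :id LIMIT 10000",
        s3_conn_id='S3_Evictions',
        s3_bucket='sf-evictionmeter',
        s3_directory='soda_jsons',
        size_check=True,       # raise SizeExceededError past max_bytes
        max_bytes=500000000,
    )

    load_data = S3ToPostgresOperator(
        task_id='load_data',
        s3_conn_id='S3_Evictions',
        s3_bucket='sf-evictionmeter',
        s3_prefix='soda_jsons/soda_evictions_import',
        source_data_type='json',
        postgres_conn_id='RDS_Evictions',
        schema='raw',
        table='soda_evictions',
        get_latest=True,       # load the most recently modified upload
    )

    get_data >> load_data
```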

INFRASTRUCTURE:

56 | 57 | This project is hosted in the AWS ecosystem and uses the following resources: 58 | 59 | ![EC2](https://i.imgur.com/jB2X1jI.png) 60 | 61 | EC2 - 62 | - t2.medium - dedicated resource for Airflow, managed by AWS Instance Scheduler to complete the daily DAG run and shut off after execution 63 | - t2.small - used to host Metabase, always online 64 | 65 | RDS - 66 | - t2.small - hosts the application database for Metabase and the data warehouse 67 | 68 | Elastic Beanstalk is used to deploy the Metabase web application. 69 | 70 | 71 |

DASHBOARD:

72 | 73 | The dashboard is publicly accessible here: http://sf-evictiontracker-metabase.us-east-1.elasticbeanstalk.com/public/dashboard/f637e470-8ea9-4b03-af80-53988e5b6a9b 74 | 75 | Some example screengrabs below! 76 | 77 | ![Dash1](https://i.imgur.com/MZ325PT.jpg) 78 | ![Dash2](https://i.imgur.com/OeyOVp0.jpg) 79 | ![Dash3](https://i.imgur.com/v6Nwz9l.jpg) 80 | -------------------------------------------------------------------------------- /dags/operators/soda_to_s3_operator.py: -------------------------------------------------------------------------------- 1 | # echo "" > /home/airflow/airflow/dags/operators/soda_to_s3_operator.py 2 | # nano /home/airflow/airflow/dags/operators/soda_to_s3_operator.py 3 | 4 | from airflow.models.baseoperator import BaseOperator 5 | from airflow.utils.decorators import apply_defaults 6 | from airflow.hooks.http_hook import HttpHook 7 | from airflow.hooks.S3_hook import S3Hook 8 | 9 | from datetime import datetime, timedelta 10 | import json 11 | import sys 12 | 13 | 14 | class SizeExceededError(Exception): 15 | """Raised when max file size is exceeded""" 16 | def __init__(self): 17 | self.message = 'Max file size exceeded' 18 | 19 | def __str__(self): 20 | return f'SizeExceededError, {self.message}' 21 | 22 | 23 | class SodaToS3Operator(BaseOperator): 24 | """ 25 | Queries the Socrata Open Data API using a SoQL string and uploads the results to an S3 bucket. 26 | 27 | :param endpoint: Optional API connection endpoint 28 | :param data: Custom Socrata SoQL string used to query API, overrides default get request 29 | :param days_ago: Restricts get request to updated/created records from specified date onward 30 | :param headers: Dictionary containing optional API connection keys (keyId, keySecret, Accept) 31 | :param s3_conn_id: S3 Connection ID 32 | :param s3_bucket: S3 Bucket Destination 33 | :param s3_directory: S3 Directory Destination 34 | :param method: Request type for API 35 | :param http_conn_id: SODA API Connection ID 36 | :param size_check: Boolean indicating whether to run a size check prior to upload to S3 37 | :param max_bytes: Maximum number of bytes to allow for a single S3 upload 38 | """ 39 | 40 | @apply_defaults 41 | def __init__(self, 42 | endpoint=None, 43 | data=None, 44 | days_ago=None, 45 | headers=None, 46 | s3_conn_id=None, 47 | s3_bucket=None, 48 | s3_directory='', 49 | method='GET', 50 | http_conn_id='http_default', 51 | size_check=False, 52 | max_bytes=5000000000, 53 | *args, 54 | **kwargs) -> None: 55 | 56 | super().__init__(*args, **kwargs) 57 | 58 | self.endpoint = endpoint 59 | self.data = data 60 | self.days_ago = days_ago 61 | self.s3_conn_id = s3_conn_id 62 | self.s3_bucket = s3_bucket 63 | self.s3_directory = s3_directory 64 | self.headers = headers 65 | self.method = method 66 | self.http_conn_id = http_conn_id 67 | self.size_check = size_check 68 | self.max_bytes = max_bytes 69 | 70 | 71 | def get_size(self, obj, seen=None): 72 | """ 73 | Recursively finds size of object. 
74 | """ 75 | 76 | size = sys.getsizeof(obj) 77 | if seen is None: 78 | seen = set() 79 | obj_id = id(obj) 80 | if obj_id in seen: 81 | return 0 82 | seen.add(obj_id) 83 | if isinstance(obj, dict): 84 | size += sum([self.get_size(v, seen) for v in obj.values()]) 85 | size += sum([self.get_size(k, seen) for k in obj.keys()]) 86 | elif hasattr(obj, '__dict__'): 87 | size += self.get_size(obj.__dict__, seen) 88 | elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)): 89 | size += sum([self.get_size(i, seen) for i in obj]) 90 | return size 91 | 92 | 93 | def parse_metadata(self, header): 94 | """ 95 | Parses metadata from API response. 96 | """ 97 | 98 | try: 99 | metadata = { 100 | 'api-call-date': header['Date'], 101 | 'content-type': header['Content-Type'], 102 | 'source-last-modified': header['X-SODA2-Truth-Last-Modified'], 103 | 'fields': header['X-SODA2-Fields'], 104 | 'types': header['X-SODA2-Types'] 105 | } 106 | except KeyError: 107 | metadata = {'KeyError': 'Metadata missing from header, see error log.'} 108 | 109 | return metadata 110 | 111 | 112 | def execute(self, context): 113 | """ 114 | Executes the operator, including running a max filesize check if enabled. 115 | 116 | The SODA API maxes out the # of returned results so we use paging to query 117 | the endpoint multiple times and continuously move the offset. 118 | 119 | Metadata is parsed and saved separately in a /logs/ subfolder along with 120 | the JSON results from API call. 121 | """ 122 | 123 | soda = HttpHook(method=self.method, http_conn_id=self.http_conn_id) 124 | 125 | if self.data: 126 | soql_filter = self.data 127 | elif self.days_ago: 128 | current_dt = datetime.now() 129 | target_dt = current_dt - timedelta(self.days_ago) 130 | format_dt = target_dt.strftime("%Y-%m-%d") 131 | soql_filter = f"""$query=SELECT:*,* WHERE :created_at > '{format_dt}' OR :updated_at > '{format_dt}' 132 | ORDER BY :id LIMIT 10000""" 133 | else: 134 | soql_filter = """$query=SELECT:*,* ORDER BY :id LIMIT 10000""" 135 | 136 | print('getting... ' + soql_filter) 137 | 138 | #soql_filter = f"""$query=SELECT:*,* WHERE :created_at < '2020-04-01' ORDER BY :id LIMIT 10000""" 139 | 140 | offset, counter = 0, 1 141 | combined = [] 142 | while True: 143 | soql_filter_offset = soql_filter + f' OFFSET {offset}' 144 | response = soda.run(endpoint=self.endpoint, data=soql_filter_offset, headers=self.headers) 145 | if response.status_code != 200: 146 | break 147 | captured = response.json() 148 | if len(captured) == 0: 149 | break 150 | combined.extend(captured) 151 | offset = 10000 * counter 152 | counter += 1 153 | 154 | if self.size_check == True: 155 | print('actual size... ' + str(self.get_size(combined))) 156 | print('max size... 
' + str(self.max_bytes)) 157 | if self.get_size(combined) > self.max_bytes: 158 | raise SizeExceededError 159 | 160 | dest_s3 = S3Hook(self.s3_conn_id) 161 | 162 | body_obj = 'soda_evictions_import_' + datetime.now().strftime("%Y-%m-%dT%H%M%S") + '.json' 163 | 164 | metadata = self.parse_metadata(response.headers) 165 | meta_obj = 'logs/soda_evictions_import_log_' + datetime.now().strftime("%Y-%m-%dT%H%M%S") 166 | 167 | dest_s3.load_string(json.dumps(combined), key=self.s3_directory+'/'+body_obj, bucket_name=self.s3_bucket) 168 | dest_s3.load_string(json.dumps(metadata), key=self.s3_directory+'/'+meta_obj, bucket_name=self.s3_bucket) 169 | 170 | # XCom used to skip downstream tasks if body object size is 0 171 | self.xcom_push(context=context, key='obj_len', value=len(combined)) 172 | -------------------------------------------------------------------------------- /dags/operators/s3_to_postgres_operator.py: -------------------------------------------------------------------------------- 1 | # echo "" > /home/airflow/airflow/dags/operators/s3_to_postgres_operator.py 2 | # nano /home/airflow/airflow/dags/operators/s3_to_postgres_operator.py 3 | 4 | from airflow.models.baseoperator import BaseOperator 5 | from airflow.utils.decorators import apply_defaults 6 | from airflow.hooks.S3_hook import S3Hook 7 | from airflow.hooks.postgres_hook import PostgresHook 8 | 9 | import json 10 | import io 11 | from contextlib import closing 12 | 13 | 14 | class S3ToPostgresOperator(BaseOperator): 15 | """ 16 | Collects data from a file hosted on AWS S3 and loads it into a Postgres table. 17 | Current version supports JSON and CSV sources but requires pre-defined data model. 18 | 19 | :param s3_conn_id: S3 Connection ID 20 | :param s3_bucket: S3 Bucket Destination 21 | :param s3_prefix: S3 File Prefix 22 | :param source_data_type: S3 Source File data type 23 | :param header: Toggles ignore header for CSV source type 24 | :param postgres_conn_id: Postgres Connection ID 25 | :param schema: Postgres Target Schema 26 | :param table: Postgres Target Table 27 | :param get_latest: if True, pulls from last modified file in S3 path 28 | """ 29 | 30 | @apply_defaults 31 | def __init__(self, 32 | s3_conn_id=None, 33 | s3_bucket=None, 34 | s3_prefix='', 35 | source_data_type='', 36 | postgres_conn_id='postgres_default', 37 | header=False, 38 | schema='public', 39 | table='raw_load', 40 | get_latest=False, 41 | *args, 42 | **kwargs) -> None: 43 | 44 | super().__init__(*args, **kwargs) 45 | 46 | self.s3_conn_id = s3_conn_id 47 | self.s3_bucket = s3_bucket 48 | self.s3_prefix = s3_prefix 49 | self.source_data_type = source_data_type 50 | self.postgres_conn_id = postgres_conn_id 51 | self.header = header 52 | self.schema = schema 53 | self.table = table 54 | self.get_latest = get_latest 55 | 56 | 57 | def execute(self, context): 58 | """ 59 | Executes the operator. 
60 | """ 61 | s3_hook = S3Hook(self.s3_conn_id) 62 | s3_session = s3_hook.get_session() 63 | s3_client = s3_session.client('s3') 64 | 65 | if self.get_latest == True: 66 | objects = s3_client.list_objects_v2(Bucket=self.s3_bucket, Prefix=self.s3_prefix)['Contents'] 67 | latest = max(objects, key=lambda x: x['LastModified']) 68 | s3_obj = s3_client.get_object(Bucket=self.s3_bucket, Key=latest['Key']) 69 | 70 | file_content = s3_obj['Body'].read().decode('utf-8') 71 | 72 | pg_hook = PostgresHook(self.postgres_conn_id) 73 | 74 | if self.source_data_type == 'json': 75 | 76 | print('inserting json object...') 77 | 78 | json_content = json.loads(file_content) 79 | 80 | schema = self.schema 81 | if isinstance(self.schema, tuple): 82 | schema = self.schema[0] 83 | 84 | table = self.table 85 | if isinstance(self.table, tuple): 86 | table = self.table[0] 87 | 88 | target_fields = ['raw_id','created_at','updated_at','eviction_id','address','city','state', 89 | 'zip','file_date','non_payment','breach','nuisance','illegal_use','failure_to_sign_renewal', 90 | 'access_denial','unapproved_subtenant','owner_move_in','demolition','capital_improvement', 91 | 'substantial_rehab','ellis_act_withdrawal','condo_conversion','roommate_same_unit', 92 | 'other_cause','late_payments','lead_remediation','development','good_samaritan_ends', 93 | 'constraints_date','supervisor_district','neighborhood'] 94 | target_fields = ','.join(target_fields) 95 | 96 | with closing(pg_hook.get_conn()) as conn: 97 | with closing(conn.cursor()) as cur: 98 | cur.executemany( 99 | f"""INSERT INTO {schema}.{table} ({target_fields}) 100 | VALUES( 101 | %(:id)s, %(:created_at)s, %(:updated_at)s, %(eviction_id)s, %(address)s, %(city)s, %(state)s, %(zip)s, 102 | %(file_date)s, %(non_payment)s, %(breach)s, %(nuisance)s, %(illegal_use)s, %(failure_to_sign_renewal)s, 103 | %(access_denial)s, %(unapproved_subtenant)s, %(owner_move_in)s, %(demolition)s, %(capital_improvement)s, 104 | %(substantial_rehab)s, %(ellis_act_withdrawal)s, %(condo_conversion)s, %(roommate_same_unit)s, 105 | %(other_cause)s, %(late_payments)s, %(lead_remediation)s, %(development)s, %(good_samaritan_ends)s, 106 | %(constraints_date)s, %(supervisor_district)s, %(neighborhood)s 107 | ); 108 | """,({ 109 | ':id': line[':id'], ':created_at': line[':created_at'], ':updated_at': line[':updated_at'], 110 | 'eviction_id': line['eviction_id'], 'address': line.get('address', None), 'city': line.get('city', None), 111 | 'state': line.get('state', None),'zip': line.get('zip', None),'file_date': line.get('file_date', None), 112 | 'non_payment': line.get('non_payment', None),'breach': line.get('breach', None), 113 | 'nuisance': line.get('nuisance', None),'illegal_use': line.get('illegal_use', None), 114 | 'failure_to_sign_renewal': line.get('failure_to_sign_renewal', None), 115 | 'access_denial': line.get('access_denial', None),'unapproved_subtenant': line.get('unapproved_subtenant', None), 116 | 'owner_move_in': line.get('owner_move_in', None),'demolition': line.get('demolition', None), 117 | 'capital_improvement': line.get('capital_improvement', None), 118 | 'substantial_rehab': line.get('substantial_rehab', None),'ellis_act_withdrawal': line.get('ellis_act_withdrawal', None), 119 | 'condo_conversion': line.get('condo_conversion', None),'roommate_same_unit': line.get('roommate_same_unit', None), 120 | 'other_cause': line.get('other_cause', None),'late_payments': line.get('late_payments', None), 121 | 'lead_remediation': line.get('lead_remediation', None),'development': 
line.get('development', None), 122 | 'good_samaritan_ends': line.get('good_samaritan_ends', None),'constraints_date': line.get('constraints_date', None), 123 | 'supervisor_district': line.get('supervisor_district', None),'neighborhood': line.get('neighborhood', None) 124 | } for line in json_content)) 125 | conn.commit() 126 | 127 | 128 | if self.source_data_type == 'csv': 129 | 130 | print('inserting csv...') 131 | 132 | file = io.StringIO(file_content) 133 | 134 | sql = "COPY %s FROM STDIN DELIMITER ','" 135 | if self.header == True: 136 | sql = "COPY %s FROM STDIN DELIMITER ',' CSV HEADER" 137 | 138 | schema = self.schema 139 | if isinstance(self.schema, tuple): 140 | schema = self.schema[0] 141 | 142 | table = self.table 143 | if isinstance(self.table, tuple): 144 | table = self.table[0] 145 | 146 | table = f'{schema}.{table}' 147 | 148 | with closing(pg_hook.get_conn()) as conn: 149 | with closing(conn.cursor()) as cur: 150 | cur.copy_expert(sql=sql % table, file=file) 151 | conn.commit() 152 | 153 | print('inserting complete...') 154 | -------------------------------------------------------------------------------- /dags/sql/init_db_schema.sql: -------------------------------------------------------------------------------- 1 | -- echo "" > /home/airflow/airflow/dags/sql/init_db_schema.sql 2 | -- nano /home/airflow/airflow/dags/sql/init_db_schema.sql 3 | 4 | DROP SCHEMA IF EXISTS raw CASCADE; 5 | DROP SCHEMA IF EXISTS staging CASCADE; 6 | DROP SCHEMA IF EXISTS prod CASCADE; 7 | 8 | CREATE SCHEMA raw; 9 | CREATE SCHEMA staging; 10 | CREATE SCHEMA prod; 11 | 12 | 13 | -- Raw 14 | CREATE UNLOGGED TABLE raw.soda_evictions ( 15 | raw_id text, 16 | created_at timestamp, 17 | updated_at timestamp, 18 | eviction_id text, 19 | address text, 20 | city text, 21 | state text, 22 | zip text, 23 | file_date timestamp, 24 | non_payment boolean, 25 | breach boolean, 26 | nuisance boolean, 27 | illegal_use boolean, 28 | failure_to_sign_renewal boolean, 29 | access_denial boolean, 30 | unapproved_subtenant boolean, 31 | owner_move_in boolean, 32 | demolition boolean, 33 | capital_improvement boolean, 34 | substantial_rehab boolean, 35 | ellis_act_withdrawal boolean, 36 | condo_conversion boolean, 37 | roommate_same_unit boolean, 38 | other_cause boolean, 39 | late_payments boolean, 40 | lead_remediation boolean, 41 | development boolean, 42 | good_samaritan_ends boolean, 43 | constraints_date timestamp, 44 | supervisor_district text, 45 | neighborhood text 46 | ); 47 | 48 | CREATE UNLOGGED TABLE raw.district_data ( 49 | district text, 50 | population text, 51 | households text, 52 | perc_asian text, 53 | perc_black text, 54 | perc_white text, 55 | perc_nat_am text, 56 | perc_nat_pac text, 57 | perc_other text, 58 | perc_latin text, 59 | median_age text, 60 | total_units text, 61 | perc_owner_occupied text, 62 | perc_renter_occupied text, 63 | median_rent_as_perc_of_income text, 64 | median_household_income text, 65 | median_family_income text, 66 | per_capita_income text, 67 | perc_in_poverty text 68 | ); 69 | CREATE UNIQUE INDEX district_name_uniq_idx ON raw.district_data (district); 70 | 71 | CREATE UNLOGGED TABLE raw.neighborhood_data ( 72 | acs_name text, 73 | db_name text, 74 | population text, 75 | households text, 76 | perc_asian text, 77 | perc_black text, 78 | perc_white text, 79 | perc_nat_am text, 80 | perc_nat_pac text, 81 | perc_other text, 82 | perc_latin text, 83 | median_age text, 84 | total_units text, 85 | perc_owner_occupied text, 86 | perc_renter_occupied text, 87 | 
median_rent_as_perc_of_income text, 88 | median_household_income text, 89 | median_family_income text, 90 | per_capita_income text, 91 | perc_in_poverty text 92 | ); 93 | CREATE UNIQUE INDEX neighborhood_name_uniq_idx ON raw.neighborhood_data (acs_name); 94 | 95 | -- Staging 96 | CREATE TABLE staging.dim_district ( 97 | district_key serial PRIMARY KEY, 98 | district text, 99 | population integer, 100 | households integer, 101 | percent_asian numeric, 102 | percent_black numeric, 103 | percent_white numeric, 104 | percent_native_am numeric, 105 | percent_pacific_isle numeric, 106 | percent_other_race numeric, 107 | percent_latin numeric, 108 | median_age numeric, 109 | total_units integer, 110 | percent_owner_occupied numeric, 111 | percent_renter_occupied numeric, 112 | median_rent_as_perc_of_income numeric, 113 | median_household_income numeric, 114 | median_family_income numeric, 115 | per_capita_income numeric, 116 | percent_in_poverty numeric 117 | ); 118 | CREATE UNIQUE INDEX district_name_uniq_idx ON staging.dim_district (district); 119 | 120 | CREATE TABLE staging.dim_neighborhood ( 121 | neighborhood_key serial PRIMARY KEY, 122 | neighborhood text, 123 | neighborhood_alt_name text, 124 | population integer, 125 | households integer, 126 | percent_asian numeric, 127 | percent_black numeric, 128 | percent_white numeric, 129 | percent_native_am numeric, 130 | percent_pacific_isle numeric, 131 | percent_other_race numeric, 132 | percent_latin numeric, 133 | median_age numeric, 134 | total_units integer, 135 | percent_owner_occupied numeric, 136 | percent_renter_occupied numeric, 137 | median_rent_as_perc_of_income numeric, 138 | median_household_income numeric, 139 | median_family_income numeric, 140 | per_capita_income numeric, 141 | percent_in_poverty numeric 142 | ); 143 | CREATE UNIQUE INDEX neighborhood_name_uniq_idx ON staging.dim_neighborhood (neighborhood); 144 | 145 | CREATE TABLE staging.dim_location ( 146 | location_key serial PRIMARY KEY, 147 | city text, 148 | state text, 149 | zip_code text 150 | ); 151 | 152 | CREATE TABLE staging.dim_reason ( 153 | reason_key serial PRIMARY KEY, 154 | reason_code text, 155 | reason_desc text 156 | ); 157 | 158 | CREATE TABLE staging.br_reason_group ( 159 | reason_group_key int, 160 | reason_key int 161 | ); 162 | CREATE INDEX reason_group_key_idx ON staging.br_reason_group (reason_group_key); 163 | CREATE INDEX reason_key_idx ON staging.br_reason_group (reason_key); 164 | 165 | CREATE TABLE staging.dim_date ( 166 | date_key int PRIMARY KEY, 167 | date date, 168 | year int, 169 | month int, 170 | month_name text, 171 | day int, 172 | day_of_year int, 173 | weekday_name text, 174 | calendar_week int, 175 | formatted_date text, 176 | quartal text, 177 | year_quartal text, 178 | year_month text, 179 | year_calendar_week text, 180 | weekend text, 181 | us_holiday text, 182 | period text, 183 | cw_start date, 184 | cw_end date, 185 | month_start date, 186 | month_end date 187 | ); 188 | 189 | CREATE TABLE staging.fact_evictions ( 190 | eviction_key text PRIMARY KEY, 191 | location_key int, 192 | district_key int, 193 | neighborhood_key int, 194 | reason_group_key int, 195 | file_date_key int, 196 | constraints_date_key int, 197 | street_address text 198 | ); 199 | 200 | 201 | -- Prod 202 | CREATE TABLE prod.dim_district ( 203 | district_key serial PRIMARY KEY, 204 | district text, 205 | population integer, 206 | households integer, 207 | percent_asian numeric, 208 | percent_black numeric, 209 | percent_white numeric, 210 | percent_native_am 
numeric, 211 | percent_pacific_isle numeric, 212 | percent_other_race numeric, 213 | percent_latin numeric, 214 | median_age numeric, 215 | total_units integer, 216 | percent_owner_occupied numeric, 217 | percent_renter_occupied numeric, 218 | median_rent_as_perc_of_income numeric, 219 | median_household_income numeric, 220 | median_family_income numeric, 221 | per_capita_income numeric, 222 | percent_in_poverty numeric 223 | ); 224 | 225 | CREATE TABLE prod.dim_neighborhood ( 226 | neighborhood_key serial PRIMARY KEY, 227 | neighborhood text, 228 | neighborhood_alt_name text, 229 | population integer, 230 | households integer, 231 | percent_asian numeric, 232 | percent_black numeric, 233 | percent_white numeric, 234 | percent_native_am numeric, 235 | percent_pacific_isle numeric, 236 | percent_other_race numeric, 237 | percent_latin numeric, 238 | median_age numeric, 239 | total_units integer, 240 | percent_owner_occupied numeric, 241 | percent_renter_occupied numeric, 242 | median_rent_as_perc_of_income numeric, 243 | median_household_income numeric, 244 | median_family_income numeric, 245 | per_capita_income numeric, 246 | percent_in_poverty numeric 247 | ); 248 | 249 | CREATE TABLE prod.dim_location ( 250 | location_key serial PRIMARY KEY, 251 | city text, 252 | state text, 253 | zip_code text 254 | ); 255 | 256 | CREATE TABLE prod.dim_reason ( 257 | reason_key serial PRIMARY KEY, 258 | reason_code text, 259 | reason_desc text 260 | ); 261 | 262 | CREATE TABLE prod.br_reason_group ( 263 | reason_group_key int, 264 | reason_key int 265 | ); 266 | CREATE INDEX reason_group_key_idx ON prod.br_reason_group (reason_group_key); 267 | CREATE INDEX reason_key_idx ON prod.br_reason_group (reason_key); 268 | 269 | CREATE TABLE prod.dim_date ( 270 | date_key int PRIMARY KEY, 271 | date date, 272 | year int, 273 | month int, 274 | month_name text, 275 | day int, 276 | day_of_year int, 277 | weekday_name text, 278 | calendar_week int, 279 | formatted_date text, 280 | quartal text, 281 | year_quartal text, 282 | year_month text, 283 | year_calendar_week text, 284 | weekend text, 285 | us_holiday text, 286 | period text, 287 | cw_start date, 288 | cw_end date, 289 | month_start date, 290 | month_end date 291 | ); 292 | 293 | CREATE TABLE prod.fact_evictions ( 294 | eviction_key text PRIMARY KEY, 295 | location_key int, 296 | district_key int, 297 | neighborhood_key int, 298 | reason_group_key int, 299 | file_date_key int, 300 | constraints_date_key int, 301 | street_address text 302 | ); 303 | -------------------------------------------------------------------------------- /dags/sql/full_load.sql: -------------------------------------------------------------------------------- 1 | -- echo "" > /home/airflow/airflow/dags/sql/full_load.sql 2 | -- nano /home/airflow/airflow/dags/sql/full_load.sql 3 | 4 | -- Populate District Dimension 5 | 6 | INSERT INTO staging.dim_district (district_key, district) 7 | SELECT -1, 'Unknown'; 8 | 9 | INSERT INTO staging.dim_district (district, population, households, percent_asian, percent_black, percent_white, percent_native_am, 10 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 11 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 12 | median_family_income, per_capita_income, percent_in_poverty) 13 | SELECT 14 | district, 15 | population::int, 16 | households::int, 17 | perc_asian::numeric as percent_asian, 18 | perc_black::numeric as percent_black, 19 | perc_white::numeric as 
percent_white, 20 | perc_nat_am::numeric as percent_native_am, 21 | perc_nat_pac::numeric as percent_pacific_isle, 22 | perc_other::numeric as percent_other_race, 23 | perc_latin::numeric as percent_latin, 24 | median_age::numeric, 25 | total_units::int, 26 | perc_owner_occupied::numeric as percent_owner_occupied, 27 | perc_renter_occupied::numeric as percent_renter_occupied, 28 | median_rent_as_perc_of_income::numeric, 29 | median_household_income::numeric, 30 | median_family_income::numeric, 31 | per_capita_income::numeric, 32 | perc_in_poverty::numeric as percent_in_poverty 33 | FROM raw.district_data; 34 | 35 | 36 | -- Populate Neighborhood Dimension 37 | 38 | INSERT INTO staging.dim_neighborhood (neighborhood_key, neighborhood) 39 | SELECT -1, 'Unknown'; 40 | 41 | INSERT INTO staging.dim_neighborhood (neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white, 42 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 43 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 44 | median_family_income, per_capita_income, percent_in_poverty) 45 | SELECT 46 | acs_name as neighborhood, 47 | db_name as neighborhood_alt_name, 48 | population::int, 49 | households::int, 50 | perc_asian::numeric as percent_asian, 51 | perc_black::numeric as percent_black, 52 | perc_white::numeric as percent_white, 53 | perc_nat_am::numeric as percent_native_am, 54 | perc_nat_pac::numeric as percent_pacific_isle, 55 | perc_other::numeric as percent_other_race, 56 | perc_latin::numeric as percent_latin, 57 | median_age::numeric, 58 | total_units::int, 59 | perc_owner_occupied::numeric as percent_owner_occupied, 60 | perc_renter_occupied::numeric as percent_renter_occupied, 61 | median_rent_as_perc_of_income::numeric, 62 | median_household_income::numeric, 63 | median_family_income::numeric, 64 | per_capita_income::numeric, 65 | perc_in_poverty::numeric as percent_in_poverty 66 | FROM raw.neighborhood_data; 67 | 68 | 69 | -- Populate Location Dimension 70 | 71 | INSERT INTO staging.dim_location (location_key, city, state, zip_code) 72 | SELECT -1, 'Unknown', 'Unknown', 'Unknown'; 73 | 74 | INSERT INTO staging.dim_location (city, state, zip_code) 75 | SELECT DISTINCT 76 | COALESCE(city, 'Unknown') as city, 77 | COALESCE(state, 'Unknown') as state, 78 | COALESCE(zip, 'Unknown') as zip_code 79 | FROM raw.soda_evictions 80 | WHERE 81 | city IS NOT NULL OR state IS NOT NULL OR zip IS NOT NULL; 82 | 83 | 84 | -- Populate Reason Dimension 85 | 86 | INSERT INTO staging.dim_reason (reason_key, reason_code, reason_desc) 87 | VALUES (-1, 'Unknown', 'Unknown'); 88 | 89 | INSERT INTO staging.dim_reason (reason_code, reason_desc) 90 | VALUES ('non_payment', 'Non-Payment'), 91 | ('breach', 'Breach'), 92 | ('nuisance', 'Nuisance'), 93 | ('illegal_use', 'Illegal Use'), 94 | ('failure_to_sign_renewal', 'Failure to Sign Renewal'), 95 | ('access_denial', 'Access Denial'), 96 | ('unapproved_subtenant', 'Unapproved Subtenant'), 97 | ('owner_move_in', 'Owner Move-In'), 98 | ('demolition', 'Demolition'), 99 | ('capital_improvement', 'Capital Improvement'), 100 | ('substantial_rehab', 'Substantial Rehab'), 101 | ('ellis_act_withdrawal', 'Ellis Act Withdrawal'), 102 | ('condo_conversion', 'Condo Conversion'), 103 | ('roommate_same_unit', 'Roommate Same Unit'), 104 | ('other_cause', 'Other Cause'), 105 | ('late_payments', 'Late Payments'), 106 | ('lead_remediation', 'Lead Remediation'), 107 | 
('development', 'Development'), 108 | ('good_samaritan_ends', 'Good Samaritan Ends'); 109 | 110 | 111 | -- Populate Reason Bridge Table 112 | 113 | SELECT 114 | ROW_NUMBER() OVER(ORDER BY concat_reason) as group_key, 115 | string_to_array(concat_reason, '|') as reason_array, 116 | concat_reason 117 | INTO TEMP tmp_reason_group 118 | FROM ( 119 | SELECT DISTINCT 120 | TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)) as concat_reason 121 | FROM ( 122 | SELECT 123 | eviction_id, 124 | CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END|| 125 | CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END|| 126 | CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END|| 127 | CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END|| 128 | CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END|| 129 | CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END|| 130 | CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END|| 131 | CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END|| 132 | CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END|| 133 | CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END|| 134 | CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END|| 135 | CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END|| 136 | CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END|| 137 | CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END|| 138 | CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END|| 139 | CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END|| 140 | CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END|| 141 | CASE WHEN development = 'true' THEN 'development|' ELSE '' END|| 142 | CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END 143 | as concat_reason 144 | FROM raw.soda_evictions 145 | ) f1 146 | ) f2; 147 | 148 | INSERT INTO staging.br_reason_group (reason_group_key, reason_key) 149 | SELECT DISTINCT 150 | group_key as reason_group_key, 151 | reason_key 152 | FROM (SELECT group_key, unnest(reason_array) unnested FROM tmp_reason_group) grp 153 | JOIN staging.dim_Reason r ON r.reason_code = grp.unnested; 154 | 155 | 156 | -- Populate Date Dimension Table 157 | 158 | INSERT INTO staging.dim_date (date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week, 159 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, us_holiday, 160 | period, cw_start, cw_end, month_start, month_end) 161 | SELECT -1, '1900-01-01', -1, -1, 'Unknown', -1, -1, 'Unknown', -1, 'Unknown', 'Unknown', 'Unknown', 'Unknown', 162 | 'Unknown', 'Unknown', 'Unknown', 'Unknown', '1900-01-01', '1900-01-01', '1900-01-01', '1900-01-01'; 163 | 164 | 165 | INSERT INTO staging.dim_date (date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week, 166 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, us_holiday, 167 | period, cw_start, cw_end, month_start, month_end) 168 | SELECT 169 | TO_CHAR(datum, 'yyyymmdd')::int as date_key, 170 | datum as date, 171 | EXTRACT(YEAR FROM datum) as year, 172 | EXTRACT(MONTH FROM datum) as month, 173 | TO_CHAR(datum, 'TMMonth') as month_name, 174 | EXTRACT(DAY FROM datum) as day, 175 | EXTRACT(doy FROM datum) as day_of_year, 176 | TO_CHAR(datum, 
'TMDay') as weekday_name, 177 | EXTRACT(week FROM datum) as calendar_week, 178 | TO_CHAR(datum, 'dd. mm. yyyy') as formatted_date, 179 | 'Q' || TO_CHAR(datum, 'Q') as quartal, 180 | TO_CHAR(datum, 'yyyy/"Q"Q') as year_quartal, 181 | TO_CHAR(datum, 'yyyy/mm') as year_month, 182 | TO_CHAR(datum, 'iyyy/IW') as year_calendar_week, 183 | CASE WHEN EXTRACT(isodow FROM datum) IN (6, 7) THEN 'Weekend' ELSE 'Weekday' END as weekend, 184 | CASE WHEN TO_CHAR(datum, 'MMDD') IN ('0101', '0704', '1225', '1226') THEN 'Holiday' ELSE 'No holiday' END 185 | as us_holiday, 186 | CASE WHEN TO_CHAR(datum, 'MMDD') BETWEEN '0701' AND '0831' THEN 'Summer break' 187 | WHEN TO_CHAR(datum, 'MMDD') BETWEEN '1115' AND '1225' THEN 'Christmas season' 188 | WHEN TO_CHAR(datum, 'MMDD') > '1225' OR TO_CHAR(datum, 'MMDD') <= '0106' THEN 'Winter break' 189 | ELSE 'Normal' END 190 | as period, 191 | datum + (1 - EXTRACT(isodow FROM datum))::int as cw_start, 192 | datum + (7 - EXTRACT(isodow FROM datum))::int as cw_end, 193 | datum + (1 - EXTRACT(DAY FROM datum))::int as month_start, 194 | (datum + (1 - EXTRACT(DAY FROM datum))::int + '1 month'::interval)::date - '1 day'::interval as month_end 195 | FROM ( 196 | SELECT '1997-01-01'::date + SEQUENCE.DAY as datum 197 | FROM generate_series(0,10956) as SEQUENCE(DAY) 198 | GROUP BY SEQUENCE.DAY 199 | ) DQ; 200 | 201 | 202 | -- Populate Evictions Fact Table 203 | 204 | SELECT 205 | eviction_id, 206 | group_key as reason_group_key 207 | INTO tmp_reason_facts 208 | FROM ( 209 | SELECT 210 | eviction_id, 211 | TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)) as concat_reason 212 | FROM ( 213 | SELECT 214 | eviction_id, 215 | CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END|| 216 | CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END|| 217 | CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END|| 218 | CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END|| 219 | CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END|| 220 | CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END|| 221 | CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END|| 222 | CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END|| 223 | CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END|| 224 | CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END|| 225 | CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END|| 226 | CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END|| 227 | CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END|| 228 | CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END|| 229 | CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END|| 230 | CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END|| 231 | CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END|| 232 | CASE WHEN development = 'true' THEN 'development|' ELSE '' END|| 233 | CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END 234 | as concat_reason 235 | FROM raw.soda_evictions 236 | ) grp 237 | ) f_grp 238 | JOIN tmp_reason_group t_grp ON f_grp.concat_reason = t_grp.concat_reason; 239 | 240 | 241 | INSERT INTO staging.fact_evictions (eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key, 242 | constraints_date_key, street_address) 243 | SELECT 244 | 
f.eviction_id as eviction_key, 245 | COALESCE(d.district_key, -1) as district_key, 246 | COALESCE(n.neighborhood_key, -1) as neighborhood_key, 247 | COALESCE(l.location_key, -1) as location_key, 248 | reason_group_key, 249 | COALESCE(dt1.date_key, -1) as file_date_key, 250 | COALESCE(dt2.date_key, -1) as constraints_date_key, 251 | f.address as street_address 252 | FROM raw.soda_evictions f 253 | LEFT JOIN tmp_reason_facts r ON f.eviction_id = r.eviction_id 254 | LEFT JOIN staging.dim_district d ON f.supervisor_district = d.district 255 | LEFT JOIN staging.dim_neighborhood n ON f.neighborhood = n.neighborhood_alt_name 256 | LEFT JOIN staging.dim_location l 257 | ON COALESCE(f.city, 'Unknown') = l.city 258 | AND COALESCE(f.state, 'Unknown') = l.state 259 | AND COALESCE(f.zip, 'Unknown') = l.zip_code 260 | LEFT JOIN staging.dim_date dt1 ON f.file_date = dt1.date 261 | LEFT JOIN staging.dim_date dt2 ON f.constraints_date = dt2.date; 262 | 263 | DROP TABLE tmp_reason_group; 264 | DROP TABLE tmp_reason_facts; 265 | 266 | 267 | -- Migrate to Production Schema 268 | 269 | INSERT INTO prod.dim_district 270 | (district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am, 271 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 272 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 273 | median_family_income, per_capita_income, percent_in_poverty) 274 | SELECT 275 | district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am, 276 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 277 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 278 | median_family_income, per_capita_income, percent_in_poverty 279 | FROM staging.dim_district; 280 | 281 | INSERT INTO prod.dim_neighborhood 282 | (neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white, 283 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 284 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 285 | median_family_income, per_capita_income, percent_in_poverty) 286 | SELECT 287 | neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white, 288 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 289 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 290 | median_family_income, per_capita_income, percent_in_poverty 291 | FROM staging.dim_neighborhood; 292 | 293 | INSERT INTO prod.dim_location (location_key, city, state, zip_code) 294 | SELECT location_key, city, state, zip_code 295 | FROM staging.dim_location; 296 | 297 | INSERT INTO prod.dim_reason (reason_key, reason_code, reason_desc) 298 | SELECT reason_key, reason_code, reason_desc 299 | FROM staging.dim_reason; 300 | 301 | INSERT INTO prod.br_reason_group (reason_group_key, reason_key) 302 | SELECT reason_group_key, reason_key 303 | FROM staging.br_reason_group; 304 | 305 | INSERT INTO prod.dim_date 306 | (date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week, 307 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, 
us_holiday, 308 | period, cw_start, cw_end, month_start, month_end) 309 | SELECT 310 | date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week, 311 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, us_holiday, period, 312 | cw_start, cw_end, month_start, month_end 313 | FROM staging.dim_date; 314 | 315 | INSERT INTO prod.fact_evictions 316 | (eviction_key, district_key, neighborhood_key, location_key, reason_group_key, 317 | file_date_key, constraints_date_key, street_address) 318 | SELECT 319 | eviction_key, district_key, neighborhood_key, location_key, reason_group_key, 320 | file_date_key, constraints_date_key, street_address 321 | FROM staging.fact_evictions; 322 | -------------------------------------------------------------------------------- /dags/sql/incremental_load.sql: -------------------------------------------------------------------------------- 1 | -- echo "" > /home/airflow/airflow/dags/sql/incremental_load.sql 2 | -- nano /home/airflow/airflow/dags/sql/incremental_load.sql 3 | 4 | -- Populate District Dimension 5 | 6 | INSERT INTO staging.dim_district 7 | (district, population, households, percent_asian, percent_black, percent_white, percent_native_am, 8 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units, 9 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 10 | median_family_income, per_capita_income, percent_in_poverty) 11 | SELECT 12 | district, 13 | population::int, 14 | households::int, 15 | perc_asian::numeric as percent_asian, 16 | perc_black::numeric as percent_black, 17 | perc_white::numeric as percent_white, 18 | perc_nat_am::numeric as percent_native_am, 19 | perc_nat_pac::numeric as percent_pacific_isle, 20 | perc_other::numeric as percent_other_race, 21 | perc_latin::numeric as percent_latin, 22 | median_age::numeric, 23 | total_units::int, 24 | perc_owner_occupied::numeric as percent_owner_occupied, 25 | perc_renter_occupied::numeric as percent_renter_occupied, 26 | median_rent_as_perc_of_income::numeric, 27 | median_household_income::numeric, 28 | median_family_income::numeric, 29 | per_capita_income::numeric, 30 | perc_in_poverty::numeric as percent_in_poverty 31 | FROM raw.district_data 32 | ON CONFLICT (district) DO UPDATE SET 33 | population = EXCLUDED.population, 34 | households = EXCLUDED.households, 35 | percent_asian = EXCLUDED.percent_asian, 36 | percent_black = EXCLUDED.percent_black, 37 | percent_white = EXCLUDED.percent_white, 38 | percent_native_am = EXCLUDED.percent_native_am, 39 | percent_pacific_isle = EXCLUDED.percent_pacific_isle, 40 | percent_other_race = EXCLUDED.percent_other_race, 41 | percent_latin = EXCLUDED.percent_latin, 42 | median_age = EXCLUDED.median_age, 43 | total_units = EXCLUDED.total_units, 44 | percent_owner_occupied = EXCLUDED.percent_owner_occupied, 45 | percent_renter_occupied = EXCLUDED.percent_renter_occupied, 46 | median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income, 47 | median_household_income = EXCLUDED.median_household_income, 48 | median_family_income = EXCLUDED.median_family_income, 49 | per_capita_income = EXCLUDED.per_capita_income, 50 | percent_in_poverty = EXCLUDED.percent_in_poverty; 51 | 52 | 53 | -- Populate Neighborhood Dimension 54 | 55 | INSERT INTO staging.dim_neighborhood 56 | (neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white, 57 | percent_native_am, percent_pacific_isle, 
percent_other_race, percent_latin, median_age, total_units, 58 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income, 59 | median_family_income, per_capita_income, percent_in_poverty) 60 | SELECT 61 | acs_name as neighborhood, 62 | db_name as neighborhood_alt_name, 63 | population::int, 64 | households::int, 65 | perc_asian::numeric as percent_asian, 66 | perc_black::numeric as percent_black, 67 | perc_white::numeric as percent_white, 68 | perc_nat_am::numeric as percent_native_am, 69 | perc_nat_pac::numeric as percent_pacific_isle, 70 | perc_other::numeric as percent_other_race, 71 | perc_latin::numeric as percent_latin, 72 | median_age::numeric, 73 | total_units::int, 74 | perc_owner_occupied::numeric as percent_owner_occupied, 75 | perc_renter_occupied::numeric as percent_renter_occupied, 76 | median_rent_as_perc_of_income::numeric, 77 | median_household_income::numeric, 78 | median_family_income::numeric, 79 | per_capita_income::numeric, 80 | perc_in_poverty::numeric as percent_in_poverty 81 | FROM raw.neighborhood_data 82 | ON CONFLICT (neighborhood) DO UPDATE SET 83 | neighborhood_alt_name = EXCLUDED.neighborhood_alt_name, 84 | population = EXCLUDED.population, 85 | households = EXCLUDED.households, 86 | percent_asian = EXCLUDED.percent_asian, 87 | percent_black = EXCLUDED.percent_black, 88 | percent_white = EXCLUDED.percent_white, 89 | percent_native_am = EXCLUDED.percent_native_am, 90 | percent_pacific_isle = EXCLUDED.percent_pacific_isle, 91 | percent_other_race = EXCLUDED.percent_other_race, 92 | percent_latin = EXCLUDED.percent_latin, 93 | median_age = EXCLUDED.median_age, 94 | total_units = EXCLUDED.total_units, 95 | percent_owner_occupied = EXCLUDED.percent_owner_occupied, 96 | percent_renter_occupied = EXCLUDED.percent_renter_occupied, 97 | median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income, 98 | median_household_income = EXCLUDED.median_household_income, 99 | median_family_income = EXCLUDED.median_family_income, 100 | per_capita_income = EXCLUDED.per_capita_income, 101 | percent_in_poverty = EXCLUDED.percent_in_poverty; 102 | 103 | 104 | -- Populate Location Dimension 105 | 106 | INSERT INTO staging.dim_location (city, state, zip_code) 107 | SELECT 108 | se.city, 109 | se.state, 110 | se.zip_code 111 | FROM ( 112 | SELECT DISTINCT 113 | COALESCE(city, 'Unknown') as city, 114 | COALESCE(state, 'Unknown') as state, 115 | COALESCE(zip, 'Unknown') as zip_code 116 | FROM raw.soda_evictions 117 | ) se 118 | LEFT JOIN staging.dim_location dl 119 | ON se.city = dl.city 120 | AND se.state = dl.state 121 | AND se.zip_code = dl.zip_code 122 | WHERE 123 | dl.location_key IS NULL; 124 | 125 | 126 | -- Populate Reason Bridge Table 127 | 128 | SELECT DISTINCT 129 | reason_group_key, 130 | ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array 131 | INTO TEMP tmp_existing_reason_groups 132 | FROM staging.br_reason_group 133 | GROUP BY reason_group_key; 134 | 135 | SELECT 136 | concat_reason, 137 | ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array 138 | INTO TEMP tmp_new_reason_groups 139 | FROM ( 140 | SELECT DISTINCT 141 | string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|') as concat_reason, 142 | unnest(string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|')) unnested_reason 143 | FROM ( 144 | SELECT DISTINCT 145 | CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END|| 146 | 

-- Populate Reason Bridge Table

SELECT DISTINCT
    reason_group_key,
    ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
INTO TEMP tmp_existing_reason_groups
FROM staging.br_reason_group
GROUP BY reason_group_key;

SELECT
    concat_reason,
    ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
INTO TEMP tmp_new_reason_groups
FROM (
    SELECT DISTINCT
        string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|') as concat_reason,
        unnest(string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|')) unnested_reason
    FROM (
        SELECT DISTINCT
            CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END||
            CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END||
            CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END||
            CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END||
            CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END||
            CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END||
            CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END||
            CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END||
            CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END||
            CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END||
            CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END||
            CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END||
            CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END||
            CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END||
            CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END||
            CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END||
            CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END||
            CASE WHEN development = 'true' THEN 'development|' ELSE '' END||
            CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END
            as concat_reason
        FROM raw.soda_evictions
    ) se1
    GROUP BY concat_reason
) se2
JOIN staging.dim_reason r ON se2.unnested_reason = r.reason_code
GROUP BY concat_reason;

INSERT INTO staging.br_reason_group (reason_group_key, reason_key)
SELECT
    -- COALESCE guards the first run, when staging.br_reason_group is empty and MAX() is NULL
    COALESCE(final_grp.max_key, 0) + new_grp.tmp_group_key as reason_group_key,
    new_grp.reason_key as reason_key
FROM (
    SELECT DISTINCT
        ROW_NUMBER() OVER(ORDER BY concat_reason) as tmp_group_key,
        concat_reason,
        unnest(n.rk_array) as reason_key
    FROM tmp_new_reason_groups n
    LEFT JOIN tmp_existing_reason_groups e ON n.rk_array = e.rk_array
    WHERE e.reason_group_key IS NULL
) new_grp
LEFT JOIN (SELECT MAX(reason_group_key) max_key FROM staging.br_reason_group) final_grp ON 1=1
ORDER BY reason_group_key, reason_key;

DROP TABLE tmp_existing_reason_groups;
DROP TABLE tmp_new_reason_groups;
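-- NOTE (editor's aside): matching reason groups via ARRAY_AGG(... ORDER BY reason_key) works
-- because PostgreSQL compares arrays element by element, so both sides must be built in the
-- same order:
--
--   -- SELECT ARRAY[2,5,9] = ARRAY[2,5,9];   -- true
--   -- SELECT ARRAY[2,5,9] = ARRAY[9,5,2];   -- false: same members, different order
--
-- Dropping the ORDER BY inside either ARRAY_AGG would let n.rk_array = e.rk_array silently
-- miss existing groups and mint duplicate reason_group_keys.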

-- Populate Staging Fact Table

SELECT DISTINCT
    reason_group_key,
    ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
INTO TEMP tmp_existing_reason_groups
FROM staging.br_reason_group
GROUP BY reason_group_key;

SELECT
    eviction_id,
    ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
INTO TEMP tmp_fct_reason_groups
FROM (
    SELECT
        eviction_id,
        unnest(string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|')) as unnested_reason
    FROM (
        SELECT
            eviction_id,
            CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END||
            CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END||
            CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END||
            CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END||
            CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END||
            CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END||
            CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END||
            CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END||
            CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END||
            CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END||
            CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END||
            CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END||
            CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END||
            CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END||
            CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END||
            CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END||
            CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END||
            CASE WHEN development = 'true' THEN 'development|' ELSE '' END||
            CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END
            as concat_reason
        FROM raw.soda_evictions
    ) se1
) se2
JOIN staging.dim_reason r ON se2.unnested_reason = r.reason_code
GROUP BY se2.eviction_id;

-- TEMP added so this lookup is session-local like the other scratch tables,
-- rather than a permanent table left behind if the load aborts before the DROP below
SELECT
    eviction_id,
    reason_group_key
INTO TEMP tmp_reason_group_lookup
FROM tmp_fct_reason_groups f
JOIN tmp_existing_reason_groups d ON f.rk_array = d.rk_array;


TRUNCATE TABLE staging.fact_evictions;

INSERT INTO staging.fact_evictions
    (eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key, constraints_date_key, street_address)
SELECT
    f.eviction_id as eviction_key,
    COALESCE(d.district_key, -1) as district_key,
    COALESCE(n.neighborhood_key, -1) as neighborhood_key,
    COALESCE(l.location_key, -1) as location_key,
    r.reason_group_key as reason_group_key,
    COALESCE(dt1.date_key, -1) as file_date_key,
    COALESCE(dt2.date_key, -1) as constraints_date_key,
    f.address as street_address
FROM raw.soda_evictions f
JOIN tmp_reason_group_lookup r ON f.eviction_id = r.eviction_id
LEFT JOIN staging.dim_district d ON f.supervisor_district = d.district
LEFT JOIN staging.dim_neighborhood n ON f.neighborhood = n.neighborhood_alt_name
LEFT JOIN staging.dim_location l
    ON COALESCE(f.city, 'Unknown') = l.city
    AND COALESCE(f.state, 'Unknown') = l.state
    AND COALESCE(f.zip, 'Unknown') = l.zip_code
LEFT JOIN staging.dim_date dt1 ON f.file_date = dt1.date
LEFT JOIN staging.dim_date dt2 ON f.constraints_date = dt2.date;

DROP TABLE tmp_existing_reason_groups;
DROP TABLE tmp_fct_reason_groups;
DROP TABLE tmp_reason_group_lookup;
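-- NOTE (editor's aside): the COALESCE(..., -1) defaults assume each dimension carries a
-- key -1 "Unknown" member, presumably seeded by init_db_schema.sql. If that row is missing,
-- unmatched facts would violate the foreign keys. A hypothetical seed for one dimension:
--
--   -- INSERT INTO staging.dim_location (location_key, city, state, zip_code)
--   -- VALUES (-1, 'Unknown', 'Unknown', 'Unknown')
--   -- ON CONFLICT (location_key) DO NOTHING;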

-- Merge Into Production Schema

INSERT INTO prod.dim_district
    (district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am,
     percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
     percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
     median_family_income, per_capita_income, percent_in_poverty)
SELECT
    district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am,
    percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
    percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
    median_family_income, per_capita_income, percent_in_poverty
FROM staging.dim_district
ON CONFLICT (district_key) DO UPDATE SET
    district = EXCLUDED.district,
    population = EXCLUDED.population,
    households = EXCLUDED.households,
    percent_asian = EXCLUDED.percent_asian,
    percent_black = EXCLUDED.percent_black,
    percent_white = EXCLUDED.percent_white,
    percent_native_am = EXCLUDED.percent_native_am,
    percent_pacific_isle = EXCLUDED.percent_pacific_isle,
    percent_other_race = EXCLUDED.percent_other_race,
    percent_latin = EXCLUDED.percent_latin,
    median_age = EXCLUDED.median_age,
    total_units = EXCLUDED.total_units,
    percent_owner_occupied = EXCLUDED.percent_owner_occupied,
    percent_renter_occupied = EXCLUDED.percent_renter_occupied,
    median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income,
    median_household_income = EXCLUDED.median_household_income,
    median_family_income = EXCLUDED.median_family_income,
    per_capita_income = EXCLUDED.per_capita_income,
    percent_in_poverty = EXCLUDED.percent_in_poverty;


INSERT INTO prod.dim_neighborhood
    (neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white,
     percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
     percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
     median_family_income, per_capita_income, percent_in_poverty)
SELECT
    neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white,
    percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
    percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
    median_family_income, per_capita_income, percent_in_poverty
FROM staging.dim_neighborhood
ON CONFLICT (neighborhood_key) DO UPDATE SET
    neighborhood = EXCLUDED.neighborhood,
    neighborhood_alt_name = EXCLUDED.neighborhood_alt_name,
    population = EXCLUDED.population,
    households = EXCLUDED.households,
    percent_asian = EXCLUDED.percent_asian,
    percent_black = EXCLUDED.percent_black,
    percent_white = EXCLUDED.percent_white,
    percent_native_am = EXCLUDED.percent_native_am,
    percent_pacific_isle = EXCLUDED.percent_pacific_isle,
    percent_other_race = EXCLUDED.percent_other_race,
    percent_latin = EXCLUDED.percent_latin,
    median_age = EXCLUDED.median_age,
    total_units = EXCLUDED.total_units,
    percent_owner_occupied = EXCLUDED.percent_owner_occupied,
    percent_renter_occupied = EXCLUDED.percent_renter_occupied,
    median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income,
    median_household_income = EXCLUDED.median_household_income,
    median_family_income = EXCLUDED.median_family_income,
    per_capita_income = EXCLUDED.per_capita_income,
    percent_in_poverty = EXCLUDED.percent_in_poverty;


INSERT INTO prod.dim_location (location_key, city, state, zip_code)
SELECT location_key, city, state, zip_code
FROM staging.dim_location
ON CONFLICT (location_key) DO NOTHING;
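-- NOTE (editor's aside): DO NOTHING fits dim_location because its attributes (city, state,
-- zip_code) are the identity of the row and never change under a given key, whereas the
-- census dimensions above use DO UPDATE so refreshed figures overwrite stale ones. As a
-- quick drift check between staging and prod, a hypothetical query that should return
-- zero rows:
--
--   -- SELECT s.location_key FROM staging.dim_location s
--   -- JOIN prod.dim_location p USING (location_key)
--   -- WHERE ROW(s.city, s.state, s.zip_code) IS DISTINCT FROM ROW(p.city, p.state, p.zip_code);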

INSERT INTO prod.br_reason_group (reason_group_key, reason_key)
SELECT stg.reason_group_key, stg.reason_key
FROM staging.br_reason_group stg
LEFT JOIN prod.br_reason_group prd
    ON stg.reason_group_key = prd.reason_group_key
    AND stg.reason_key = prd.reason_key
WHERE
    prd.reason_group_key IS NULL;


INSERT INTO prod.fact_evictions
    (eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key, constraints_date_key, street_address)
SELECT eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key, constraints_date_key, street_address
FROM staging.fact_evictions
ON CONFLICT (eviction_key) DO UPDATE SET
    district_key = EXCLUDED.district_key,
    neighborhood_key = EXCLUDED.neighborhood_key,
    location_key = EXCLUDED.location_key,
    reason_group_key = EXCLUDED.reason_group_key,
    file_date_key = EXCLUDED.file_date_key,
    constraints_date_key = EXCLUDED.constraints_date_key,
    street_address = EXCLUDED.street_address;
--------------------------------------------------------------------------------