├── dags
│   ├── sql
│   │   ├── trunc_target_tables.sql
│   │   ├── init_db_schema.sql
│   │   ├── full_load.sql
│   │   └── incremental_load.sql
│   ├── operators
│   │   ├── soda_to_s3_operator.py
│   │   └── s3_to_postgres_operator.py
│   ├── full_load_dag.py
│   └── incremental_load_dag.py
├── airflow_installation.txt
└── README.md
/dags/sql/trunc_target_tables.sql:
--------------------------------------------------------------------------------
1 | -- echo "" > /home/airflow/airflow/dags/sql/trunc_target_tables.sql
2 | -- nano /home/airflow/airflow/dags/sql/trunc_target_tables.sql
3 | TRUNCATE TABLE raw.soda_evictions;
4 | TRUNCATE TABLE raw.district_data;
5 | TRUNCATE TABLE raw.neighborhood_data;
6 |
--------------------------------------------------------------------------------
/dags/full_load_dag.py:
--------------------------------------------------------------------------------
1 | # echo "" > /home/airflow/airflow/dags/full_load_dag.py
2 | # nano /home/airflow/airflow/dags/full_load_dag.py
3 |
4 | from airflow import DAG
5 | from airflow.operators.postgres_operator import PostgresOperator
6 | from operators.soda_to_s3_operator import SodaToS3Operator
7 | from operators.s3_to_postgres_operator import S3ToPostgresOperator
8 | from airflow.utils.dates import days_ago
9 | from datetime import timedelta
10 |
11 | soda_headers = {
12 | 'keyId':'############',
13 | 'keySecret':'#################',
14 | 'Accept':'application/json'
15 | }
16 |
17 | default_args = {
18 | 'owner': 'airflow',
19 | 'depends_on_past': False,
20 | 'start_date': days_ago(2),
21 | 'email': ['airflow@example.com'],
22 | 'email_on_failure': False,
23 | 'email_on_retry': False,
24 | 'retries': 1,
25 | 'retry_delay': timedelta(seconds=30)
26 | }
27 |
28 | with DAG('eviction-tracker-full_load',
29 | default_args=default_args,
30 | description='Executes full load from SODA API to Production DW.',
31 | max_active_runs=1,
32 | schedule_interval=None) as dag:
33 |
34 | op1 = SodaToS3Operator(
35 | task_id='get_evictions_data',
36 | http_conn_id='API_Evictions',
37 | headers=soda_headers,
38 | s3_conn_id='S3_Evictions',
39 | s3_bucket='sf-evictionmeter',
40 | s3_directory='soda_jsons',
41 | size_check=True,
42 | max_bytes=500000000,
43 | dag=dag
44 | )
45 |
46 | op2 = PostgresOperator(
47 | task_id='initialize_target_db',
48 | postgres_conn_id='RDS_Evictions',
49 | sql='sql/init_db_schema.sql',
50 | dag=dag
51 | )
52 |
53 | op3 = S3ToPostgresOperator(
54 | task_id='load_evictions_data',
55 | s3_conn_id='S3_Evictions',
56 | s3_bucket='sf-evictionmeter',
57 | s3_prefix='soda_jsons/soda_evictions_import',
58 | source_data_type='json',
59 | postgres_conn_id='RDS_Evictions',
60 | schema='raw',
61 | table='soda_evictions',
62 | get_latest=True,
63 | dag=dag
64 | )
65 |
66 | op4 = S3ToPostgresOperator(
67 | task_id='load_neighborhood_data',
68 | s3_conn_id='S3_Evictions',
69 | s3_bucket='sf-evictionmeter',
70 | s3_prefix='census_csv/sf_by_neighborhood',
71 | source_data_type='csv',
72 | header=True,
73 | postgres_conn_id='RDS_Evictions',
74 | schema='raw',
75 | table='neighborhood_data',
76 | get_latest=True,
77 | dag=dag
78 | )
79 |
80 | op5 = S3ToPostgresOperator(
81 | task_id='load_district_data',
82 | s3_conn_id='S3_Evictions',
83 | s3_bucket='sf-evictionmeter',
84 | s3_prefix='census_csv/sf_by_district',
85 | source_data_type='csv',
86 | header=True,
87 | postgres_conn_id='RDS_Evictions',
88 | schema='raw',
89 | table='district_data',
90 | get_latest=True,
91 | dag=dag
92 | )
93 |
94 | op6 = PostgresOperator(
95 | task_id='execute_full_load',
96 | postgres_conn_id='RDS_Evictions',
97 | sql='sql/full_load.sql',
98 | dag=dag
99 | )
100 |
101 | op1 >> op2 >> [op3, op4, op5] >> op6  # a list (not a tuple) fans out to parallel tasks
102 |
--------------------------------------------------------------------------------
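Note: this DAG is defined with schedule_interval=None, so it is meant to be triggered manually on initial setup, for example with the Airflow 1.10 CLI:

    airflow trigger_dag eviction-tracker-full_load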
/dags/incremental_load_dag.py:
--------------------------------------------------------------------------------
1 | # echo "" > /home/airflow/airflow/dags/incremental_load_dag.py
2 | # nano /home/airflow/airflow/dags/incremental_load_dag.py
3 |
4 | from airflow import DAG
5 | from airflow.operators.postgres_operator import PostgresOperator
6 | from airflow.operators.python_operator import ShortCircuitOperator
7 | from operators.soda_to_s3_operator import SodaToS3Operator
8 | from operators.s3_to_postgres_operator import S3ToPostgresOperator
9 | from airflow.utils.dates import days_ago
10 | from datetime import timedelta
11 |
12 | soda_headers = {
13 | 'keyId':'############',
14 | 'keySecret':'#################',
15 | 'Accept':'application/json'
16 | }
17 |
18 | default_args = {
19 | 'owner': 'airflow',
20 | 'depends_on_past': False,
21 | 'start_date': days_ago(2),
22 | 'email': ['airflow@example.com'],
23 | 'email_on_failure': False,
24 | 'email_on_retry': False,
25 | 'retries': 1,
26 | 'retry_delay': timedelta(seconds=30)
27 | }
28 |
29 | def get_size(**context):
30 | val = context['ti'].xcom_pull(key='obj_len')  # record count pushed by SodaToS3Operator
31 | return val is not None and val > 0  # guard against a missing XCom before comparing
32 |
33 |
34 | with DAG('eviction-tracker-incremental_load',
35 | default_args=default_args,
36 | description='Executes incremental load from SODA API & S3-hosted CSVs into Production DW.',
37 | max_active_runs=1,
38 | schedule_interval=None) as dag:
39 |
40 | op1 = SodaToS3Operator(
41 | task_id='get_evictions_data',
42 | http_conn_id='API_Evictions',
43 | headers=soda_headers,
44 | days_ago=31,
45 | s3_conn_id='S3_Evictions',
46 | s3_bucket='sf-evictionmeter',
47 | s3_directory='soda_jsons',
48 | size_check=True,
49 | max_bytes=500000000,
50 | dag=dag
51 | )
52 |
53 | op2 = ShortCircuitOperator(
54 | task_id='check_get_results',
55 | python_callable=get_size,
56 | provide_context=True,
57 | dag=dag
58 | )
59 |
60 | op3 = PostgresOperator(
61 | task_id='truncate_target_tables',
62 | postgres_conn_id='RDS_Evictions',
63 | sql='sql/trunc_target_tables.sql',
64 | dag=dag
65 | )
66 |
67 | op4 = S3ToPostgresOperator(
68 | task_id='load_evictions_data',
69 | s3_conn_id='S3_Evictions',
70 | s3_bucket='sf-evictionmeter',
71 | s3_prefix='soda_jsons/soda_evictions_import',
72 | source_data_type='json',
73 | postgres_conn_id='RDS_Evictions',
74 | schema='raw',
75 | table='soda_evictions',
76 | get_latest=True,
77 | dag=dag
78 | )
79 |
80 | op5 = S3ToPostgresOperator(
81 | task_id='load_neighborhood_data',
82 | s3_conn_id='S3_Evictions',
83 | s3_bucket='sf-evictionmeter',
84 | s3_prefix='census_csv/sf_by_neighborhood',
85 | source_data_type='csv',
86 | header=True,
87 | postgres_conn_id='RDS_Evictions',
88 | schema='raw',
89 | table='neighborhood_data',
90 | get_latest=True,
91 | dag=dag
92 | )
93 |
94 | op6 = S3ToPostgresOperator(
95 | task_id='load_district_data',
96 | s3_conn_id='S3_Evictions',
97 | s3_bucket='sf-evictionmeter',
98 | s3_prefix='census_csv/sf_by_district',
99 | source_data_type='csv',
100 | header=True,
101 | postgres_conn_id='RDS_Evictions',
102 | schema='raw',
103 | table='district_data',
104 | get_latest=True,
105 | dag=dag
106 | )
107 |
108 | op7 = PostgresOperator(
109 | task_id='execute_incremental_load',
110 | postgres_conn_id='RDS_Evictions',
111 | sql='sql/incremental_load.sql',
112 | dag=dag
113 | )
114 |
115 | op1 >> op2 >> op3 >> [op4, op5, op6] >> op7  # a list (not a tuple) fans out to parallel tasks
116 |
--------------------------------------------------------------------------------
/airflow_installation.txt:
--------------------------------------------------------------------------------
1 |
Step 1 - Launch EC2 Instance:
- t3.medium
- 12 GB storage
- security group (e.g. launch-wizard-3) with TCP port 8080 open
- associate an Elastic IP
7 |
8 |
Step 2 - Install Postgres Server on EC2:
10 | run:
11 | sudo apt-get update
12 | sudo apt-get install python-psycopg2
13 | sudo apt-get install postgresql postgresql-contrib
14 |
15 |
16 | Step 3 - Create OS User airflow
17 | run:
18 | sudo adduser airflow
19 | sudo usermod -aG sudo airflow
20 | su - airflow
21 |
22 | Note: From here on, make sure you are logged in as airflow user.
23 |
24 |
25 | Step 4 - Create Postgres Metadatabase and User Access
26 | run:
27 | sudo -u postgres psql
28 |
29 | in postgres prompt:
30 | CREATE USER airflow PASSWORD 'password';
31 | CREATE DATABASE airflow;
32 | GRANT ALL PRIVILEGES ON DATABASE airflow TO airflow;
33 | \q
34 |
35 |
36 | Step 5 - Change Postgres Connection Config
37 | run:
38 | sudo nano /etc/postgresql/10/main/pg_hba.conf
39 |
40 | Change this line -
41 | # IPv4 local connections:
42 | host all all 127.0.0.1/32 md5
43 | To this line -
44 | # IPv4 local connections:
45 | host all all 0.0.0.0/0 trust
46 |
47 | run:
48 | sudo nano /etc/postgresql/10/main/postgresql.conf
49 |
50 | Change this line -
51 | #listen_addresses = 'localhost' # what IP address(es) to listen on
52 | To this line -
53 | listen_addresses = '*' # what IP address(es) to listen on
54 |
55 | restart postgres server:
56 | sudo service postgresql restart
57 |
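optionally, verify the airflow user can connect over TCP (trust auth, so no password prompt):
    psql -h 127.0.0.1 -U airflow -d airflow -c 'SELECT 1;'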
58 |
59 | Step 6 - Install Airflow
60 |
61 | run:
62 | su - airflow
63 | sudo apt-get install python3-pip
64 | sudo python3 -m pip install apache-airflow[postgres,s3,aws]
65 |
66 | run:
67 | airflow initdb
68 |
69 |
70 | Step 7 - Connect Airflow to Postgres
71 |
72 | run:
73 | nano /home/airflow/airflow/airflow.cfg
74 |
75 | Change lines -
76 | sql_alchemy_conn = postgresql+psycopg2://airflow:password@localhost:5432/airflow
77 | executor = LocalExecutor
78 | load_examples = False
79 |
80 | run:
81 | airflow initdb
82 |
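optionally, confirm Airflow can reach the metadata DB and parse DAGs:
    airflow list_dags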
83 |
84 | Step 8 - Add DAGs:
85 | mkdir /home/airflow/airflow/dags/
86 | cd /home/airflow/airflow/dags/
87 | touch tutorial.py
88 | nano tutorial.py
89 |
90 |
91 | Step 9 - Set up Airflow Webserver and Scheduler to start automatically
92 | We are almost there. The final step is to ensure Airflow starts up whenever your EC2 instance starts.
93 |
94 | sudo nano /etc/systemd/system/airflow-webserver.service
95 |
96 | Paste the following into the file created above
97 |
98 | [Unit]
99 | Description=Airflow webserver daemon
100 | After=network.target postgresql.service
101 | Wants=postgresql.service
102 | [Service]
103 | EnvironmentFile=/etc/environment
104 | User=airflow
105 | Group=airflow
106 | Type=simple
107 | ExecStart= /usr/local/bin/airflow webserver
108 | Restart=on-failure
109 | RestartSec=5s
110 | PrivateTmp=true
111 | [Install]
112 | WantedBy=multi-user.target
113 |
114 | Next we will create the following file to enable scheduler service
115 |
116 | sudo nano /etc/systemd/system/airflow-scheduler.service
117 |
118 | Paste the following
119 |
120 | [Unit]
121 | Description=Airflow scheduler daemon
122 | After=network.target postgresql.service
123 | Wants=postgresql.service
124 | [Service]
125 | EnvironmentFile=/etc/environment
126 | User=airflow
127 | Group=airflow
128 | Type=simple
129 | ExecStart=/usr/local/bin/airflow scheduler
130 | Restart=always
131 | RestartSec=5s
132 | [Install]
133 | WantedBy=multi-user.target
134 |
135 | Next, enable and start these services and check their status
136 |
137 | sudo systemctl enable airflow-webserver.service
138 | sudo systemctl enable airflow-scheduler.service
139 | sudo systemctl start airflow-scheduler
140 | sudo systemctl start airflow-webserver
141 |
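check status:
    sudo systemctl status airflow-scheduler
    sudo systemctl status airflow-webserver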
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SF-EvictionTracker
2 |
3 | Tracking eviction trends in San Francisco across filing reasons, districts, neighborhoods, and demographics in the months following COVID-19. The data warehouse infrastructure is housed in the AWS ecosystem, uses Apache Airflow for orchestration, and serves public-facing dashboards built with Metabase.
4 |
5 | Questions? Feel free to reach me at ilya.glprn@gmail.com.
6 |
7 | Public Dashboard Link: http://sf-evictiontracker-metabase.us-east-1.elasticbeanstalk.com/public/dashboard/f637e470-8ea9-4b03-af80-53988e5b6a9b
8 |
9 |
10 | ARCHITECTURE:
11 |
12 | 
13 | Data is sourced from San Francisco Open Data's API and from CSVs containing San Francisco district and neighborhood aggregate census results. Airflow orchestrates its movement to an S3 bucket and into a data warehouse hosted in RDS. SQL scripts are then run to transform the data from its raw form through a staging schema and into production target tables. The presentation layer is created using Metabase, an open-source data visualization tool, and deployed using Elastic Beanstalk.
14 |
15 | DATA MODEL:
16 |
17 | Dimension Tables:
18 | `dim_district`
19 | `dim_neighborhood`
20 | `dim_location`
21 | `dim_reason`
22 | `dim_date`
23 | `br_reason_group`
24 |
25 | Fact Tables:
26 | `fact_evictions`
27 |
28 | The data model is implemented using a star schema with a bridge table to accommodate any new permutations of the reason dimension. More information on bridge tables can be found here: https://www.kimballgroup.com/2012/02/design-tip-142-building-bridges/
29 |
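As an illustrative sketch (not part of the ETL itself), this is how the bridge resolves a fact row's reason group into its member reasons, using the tables defined in `init_db_schema.sql`:

```sql
-- Evictions per filing reason: fact -> bridge -> reason dimension.
-- An eviction filed for multiple reasons fans out to one row per reason.
SELECT r.reason_desc,
       COUNT(DISTINCT f.eviction_key) AS evictions
FROM prod.fact_evictions f
JOIN prod.br_reason_group b ON b.reason_group_key = f.reason_group_key
JOIN prod.dim_reason r ON r.reason_key = b.reason_key
GROUP BY r.reason_desc
ORDER BY evictions DESC;
```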
30 | 
31 | ETL FLOW:
32 |
33 | General Overview -
34 | - Evictions data is collected from the SODA API and moved into an S3 Bucket
35 | - Neighborhood/district census data is stored as a CSV in S3
36 | - Once the API load to S3 is complete, data is loaded into a "raw" schema in RDS and then moves through a staging schema for processing
37 | - ETL job execution is complete once data is moved from the staging schema into the final production tables
38 |
39 | DAGs and Custom Airflow Operators -
40 |
41 | 
42 | 
43 |
44 | There are two DAGs (Directed Acyclic Graphs) in this project: a full load, which should be executed on initial setup, and an incremental load, which is scheduled to run daily and pull new data from the Socrata Open Data API.
45 |
46 | The incremental load DAG uses XCom to pass the record count of the load from the API call task to a ShortCircuitOperator, which skips all downstream tasks if the API call produces no results. The pattern is sketched below.
47 |
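A minimal sketch of that check, mirroring `incremental_load_dag.py` (Airflow 1.10-era imports):

```python
from airflow.operators.python_operator import ShortCircuitOperator

def check_get_results(**context):
    # The upstream SodaToS3Operator pushes the record count to XCom under 'obj_len'.
    # Returning False short-circuits (skips) every downstream task.
    val = context['ti'].xcom_pull(key='obj_len')
    return val is not None and val > 0

# Inside the DAG definition:
check = ShortCircuitOperator(
    task_id='check_get_results',
    python_callable=check_get_results,
    provide_context=True,  # Airflow 1.10 style: pass the task context as kwargs
)
```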
48 | The DAGs use two custom operators. They were purpose-built for this project but are easily extended for use in other data pipelines.
49 |
50 | 1. soda_to_s3_operator: Queries the Socrata Open Data API using a SoQL string and uploads the results to an S3 bucket. Includes an optional size check that aborts the ETL if the source data exceeds a user-defined byte limit.
51 |
52 | 2. s3_to_postgres_operator: Collects data from a file hosted on AWS S3 and loads it into a Postgres table. The current version supports JSON and CSV source data types.
53 |
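A condensed usage sketch of the two operators chained together, using this project's connection IDs and bucket (see `full_load_dag.py` for the complete DAG):

```python
from operators.soda_to_s3_operator import SodaToS3Operator
from operators.s3_to_postgres_operator import S3ToPostgresOperator

# Pull from the SODA API into S3, aborting if the payload exceeds max_bytes
get_data = SodaToS3Operator(
    task_id='get_evictions_data',
    http_conn_id='API_Evictions',
    s3_conn_id='S3_Evictions',
    s3_bucket='sf-evictionmeter',
    s3_directory='soda_jsons',
    size_check=True,
    max_bytes=500000000,
)

# Load the most recently modified JSON file under the prefix into the raw schema
load_data = S3ToPostgresOperator(
    task_id='load_evictions_data',
    s3_conn_id='S3_Evictions',
    s3_bucket='sf-evictionmeter',
    s3_prefix='soda_jsons/soda_evictions_import',
    source_data_type='json',
    postgres_conn_id='RDS_Evictions',
    schema='raw',
    table='soda_evictions',
    get_latest=True,
)

get_data >> load_data
```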
54 |
55 | INFRASTRUCTURE:
56 |
57 | This project is hosted in the AWS ecosystem and uses the following resources:
58 |
59 | 
60 |
61 | EC2 -
62 | - t2.medium - dedicated resource for Airflow, managed by AWS Instance Scheduler to complete the daily DAG run and shut off after execution
63 | - t2.small - used to host Metabase, always online
64 |
65 | RDS -
66 | - t2.small - hosts application database for Metabase and the data warehouse
67 |
68 | Elastic Beanstalk is used to deploy the Metabase web application.
69 |
70 |
71 | DASHBOARD:
72 |
73 | The dashboard is publicly accessible here: http://sf-evictiontracker-metabase.us-east-1.elasticbeanstalk.com/public/dashboard/f637e470-8ea9-4b03-af80-53988e5b6a9b
74 |
75 | Some example screengrabs below!
76 |
77 | 
78 | 
79 | 
80 |
--------------------------------------------------------------------------------
/dags/operators/soda_to_s3_operator.py:
--------------------------------------------------------------------------------
1 | # echo "" > /home/airflow/airflow/dags/operators/soda_to_s3_operator.py
2 | # nano /home/airflow/airflow/dags/operators/soda_to_s3_operator.py
3 |
4 | from airflow.models.baseoperator import BaseOperator
5 | from airflow.utils.decorators import apply_defaults
6 | from airflow.hooks.http_hook import HttpHook
7 | from airflow.hooks.S3_hook import S3Hook
8 |
9 | from datetime import datetime, timedelta
10 | import json
11 | import sys
12 |
13 |
14 | class SizeExceededError(Exception):
15 | """Raised when max file size is exceeded"""
16 | def __init__(self):
17 | self.message = 'Max file size exceeded'
18 |
19 | def __str__(self):
20 | return f'SizeExceededError, {self.message}'
21 |
22 |
23 | class SodaToS3Operator(BaseOperator):
24 | """
25 | Queries the Socrata Open Data API using a SoQL string and uploads the results to an S3 bucket.
26 |
27 | :param endpoint: Optional API connection endpoint
28 | :param data: Custom Socrata SoQL string used to query API, overrides default get request
29 | :param days_ago: Restricts get request to updated/created records from specified date onward
30 | :param headers: Dictionary containing optional API connection keys (keyId, keySecret, Accept)
31 | :param s3_conn_id: S3 Connection ID
32 | :param s3_bucket: S3 Bucket Destination
33 | :param s3_directory: S3 Directory Destination
34 | :param method: Request type for API
35 | :param http_conn_id: SODA API Connection ID
36 | :param size_check: Boolean indicating whether to run a size check prior to upload to S3
37 | :param max_bytes: Maximum number of bytes to allow for a single S3 upload
38 | """
39 |
40 | @apply_defaults
41 | def __init__(self,
42 | endpoint=None,
43 | data=None,
44 | days_ago=None,
45 | headers=None,
46 | s3_conn_id=None,
47 | s3_bucket=None,
48 | s3_directory='',
49 | method='GET',
50 | http_conn_id='http_default',
51 | size_check=False,
52 | max_bytes=5000000000,
53 | *args,
54 | **kwargs) -> None:
55 |
56 | super().__init__(*args, **kwargs)
57 |
58 | self.endpoint = endpoint
59 | self.data = data
60 | self.days_ago = days_ago
61 | self.s3_conn_id = s3_conn_id
62 | self.s3_bucket = s3_bucket
63 | self.s3_directory = s3_directory
64 | self.headers = headers
65 | self.method = method
66 | self.http_conn_id = http_conn_id
67 | self.size_check = size_check
68 | self.max_bytes = max_bytes
69 |
70 |
71 | def get_size(self, obj, seen=None):
72 | """
73 | Recursively finds size of object.
74 | """
75 |
76 | size = sys.getsizeof(obj)
77 | if seen is None:
78 | seen = set()
79 | obj_id = id(obj)
80 | if obj_id in seen:
81 | return 0
82 | seen.add(obj_id)
83 | if isinstance(obj, dict):
84 | size += sum([self.get_size(v, seen) for v in obj.values()])
85 | size += sum([self.get_size(k, seen) for k in obj.keys()])
86 | elif hasattr(obj, '__dict__'):
87 | size += self.get_size(obj.__dict__, seen)
88 | elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
89 | size += sum([self.get_size(i, seen) for i in obj])
90 | return size
91 |
92 |
93 | def parse_metadata(self, header):
94 | """
95 | Parses metadata from API response.
96 | """
97 |
98 | try:
99 | metadata = {
100 | 'api-call-date': header['Date'],
101 | 'content-type': header['Content-Type'],
102 | 'source-last-modified': header['X-SODA2-Truth-Last-Modified'],
103 | 'fields': header['X-SODA2-Fields'],
104 | 'types': header['X-SODA2-Types']
105 | }
106 | except KeyError:
107 | metadata = {'KeyError': 'Metadata missing from header, see error log.'}
108 |
109 | return metadata
110 |
111 |
112 | def execute(self, context):
113 | """
114 | Executes the operator, including running a max filesize check if enabled.
115 |
116 | The SODA API maxes out the # of returned results so we use paging to query
117 | the endpoint multiple times and continuously move the offset.
118 |
119 | Metadata is parsed and saved separately in a /logs/ subfolder along with
120 | the JSON results from API call.
121 | """
122 |
123 | soda = HttpHook(method=self.method, http_conn_id=self.http_conn_id)
124 |
125 | if self.data:
126 | soql_filter = self.data
127 | elif self.days_ago:
128 | current_dt = datetime.now()
129 | target_dt = current_dt - timedelta(self.days_ago)
130 | format_dt = target_dt.strftime("%Y-%m-%d")
131 | soql_filter = f"""$query=SELECT:*,* WHERE :created_at > '{format_dt}' OR :updated_at > '{format_dt}'
132 | ORDER BY :id LIMIT 10000"""
133 | else:
134 | soql_filter = """$query=SELECT:*,* ORDER BY :id LIMIT 10000"""
135 |
136 | print('getting... ' + soql_filter)
137 |
138 | #soql_filter = f"""$query=SELECT:*,* WHERE :created_at < '2020-04-01' ORDER BY :id LIMIT 10000"""
139 |
140 | offset, counter = 0, 1
141 | combined = []
142 | while True:
143 | soql_filter_offset = soql_filter + f' OFFSET {offset}'  # page through results 10,000 rows at a time
144 | response = soda.run(endpoint=self.endpoint, data=soql_filter_offset, headers=self.headers)
145 | if response.status_code != 200:
146 | break
147 | captured = response.json()
148 | if len(captured) == 0:
149 | break  # no more pages
150 | combined.extend(captured)
151 | offset = 10000 * counter  # advance the offset by one page
152 | counter += 1
153 |
154 | if self.size_check:
155 | actual_size = self.get_size(combined)  # compute once, reuse for the log and the check
156 | print(f'actual size... {actual_size} / max size... {self.max_bytes}')
157 | if actual_size > self.max_bytes:
158 | raise SizeExceededError
159 |
160 | dest_s3 = S3Hook(self.s3_conn_id)
161 |
162 | body_obj = 'soda_evictions_import_' + datetime.now().strftime("%Y-%m-%dT%H%M%S") + '.json'
163 |
164 | metadata = self.parse_metadata(response.headers)
165 | meta_obj = 'logs/soda_evictions_import_log_' + datetime.now().strftime("%Y-%m-%dT%H%M%S")
166 |
167 | dest_s3.load_string(json.dumps(combined), key=self.s3_directory+'/'+body_obj, bucket_name=self.s3_bucket)
168 | dest_s3.load_string(json.dumps(metadata), key=self.s3_directory+'/'+meta_obj, bucket_name=self.s3_bucket)
169 |
170 | # XCom used to skip downstream tasks if body object size is 0
171 | self.xcom_push(context=context, key='obj_len', value=len(combined))
172 |
--------------------------------------------------------------------------------
/dags/operators/s3_to_postgres_operator.py:
--------------------------------------------------------------------------------
1 | # echo "" > /home/airflow/airflow/dags/operators/s3_to_postgres_operator.py
2 | # nano /home/airflow/airflow/dags/operators/s3_to_postgres_operator.py
3 |
4 | from airflow.models.baseoperator import BaseOperator
5 | from airflow.utils.decorators import apply_defaults
6 | from airflow.hooks.S3_hook import S3Hook
7 | from airflow.hooks.postgres_hook import PostgresHook
8 |
9 | import json
10 | import io
11 | from contextlib import closing
12 |
13 |
14 | class S3ToPostgresOperator(BaseOperator):
15 | """
16 | Collects data from a file hosted on AWS S3 and loads it into a Postgres table.
17 | Current version supports JSON and CSV sources but requires pre-defined data model.
18 |
19 | :param s3_conn_id: S3 Connection ID
20 | :param s3_bucket: S3 Bucket Destination
21 | :param s3_prefix: S3 File Prefix
22 | :param source_data_type: S3 Source File data type
23 | :param header: Toggles ignore header for CSV source type
24 | :param postgres_conn_id: Postgres Connection ID
25 | :param schema: Postgres Target Schema
26 | :param table: Postgres Target Table
27 | :param get_latest: if True, pulls from last modified file in S3 path
28 | """
29 |
30 | @apply_defaults
31 | def __init__(self,
32 | s3_conn_id=None,
33 | s3_bucket=None,
34 | s3_prefix='',
35 | source_data_type='',
36 | postgres_conn_id='postgres_default',
37 | header=False,
38 | schema='public',
39 | table='raw_load',
40 | get_latest=False,
41 | *args,
42 | **kwargs) -> None:
43 |
44 | super().__init__(*args, **kwargs)
45 |
46 | self.s3_conn_id = s3_conn_id
47 | self.s3_bucket = s3_bucket
48 | self.s3_prefix = s3_prefix
49 | self.source_data_type = source_data_type
50 | self.postgres_conn_id = postgres_conn_id
51 | self.header = header
52 | self.schema = schema
53 | self.table = table
54 | self.get_latest = get_latest
55 |
56 |
57 | def execute(self, context):
58 | """
59 | Executes the operator.
60 | """
61 | s3_hook = S3Hook(self.s3_conn_id)
62 | s3_session = s3_hook.get_session()
63 | s3_client = s3_session.client('s3')
64 |
65 | key = self.s3_prefix  # default: treat the prefix as the exact object key
66 | if self.get_latest:
67 | objects = s3_client.list_objects_v2(Bucket=self.s3_bucket, Prefix=self.s3_prefix)['Contents']
68 | key = max(objects, key=lambda x: x['LastModified'])['Key']  # most recently modified object under the prefix
69 | s3_obj = s3_client.get_object(Bucket=self.s3_bucket, Key=key)
70 | file_content = s3_obj['Body'].read().decode('utf-8')
71 |
72 | pg_hook = PostgresHook(self.postgres_conn_id)
73 |
74 | if self.source_data_type == 'json':
75 |
76 | print('inserting json object...')
77 |
78 | json_content = json.loads(file_content)
79 |
80 | schema = self.schema
81 | if isinstance(self.schema, tuple):
82 | schema = self.schema[0]
83 |
84 | table = self.table
85 | if isinstance(self.table, tuple):
86 | table = self.table[0]
87 |
88 | target_fields = ['raw_id','created_at','updated_at','eviction_id','address','city','state',
89 | 'zip','file_date','non_payment','breach','nuisance','illegal_use','failure_to_sign_renewal',
90 | 'access_denial','unapproved_subtenant','owner_move_in','demolition','capital_improvement',
91 | 'substantial_rehab','ellis_act_withdrawal','condo_conversion','roommate_same_unit',
92 | 'other_cause','late_payments','lead_remediation','development','good_samaritan_ends',
93 | 'constraints_date','supervisor_district','neighborhood']
94 | target_fields = ','.join(target_fields)
95 |
96 | with closing(pg_hook.get_conn()) as conn:
97 | with closing(conn.cursor()) as cur:
98 | cur.executemany(
99 | f"""INSERT INTO {schema}.{table} ({target_fields})
100 | VALUES(
101 | %(:id)s, %(:created_at)s, %(:updated_at)s, %(eviction_id)s, %(address)s, %(city)s, %(state)s, %(zip)s,
102 | %(file_date)s, %(non_payment)s, %(breach)s, %(nuisance)s, %(illegal_use)s, %(failure_to_sign_renewal)s,
103 | %(access_denial)s, %(unapproved_subtenant)s, %(owner_move_in)s, %(demolition)s, %(capital_improvement)s,
104 | %(substantial_rehab)s, %(ellis_act_withdrawal)s, %(condo_conversion)s, %(roommate_same_unit)s,
105 | %(other_cause)s, %(late_payments)s, %(lead_remediation)s, %(development)s, %(good_samaritan_ends)s,
106 | %(constraints_date)s, %(supervisor_district)s, %(neighborhood)s
107 | );
108 | """,({
109 | ':id': line[':id'], ':created_at': line[':created_at'], ':updated_at': line[':updated_at'],
110 | 'eviction_id': line['eviction_id'], 'address': line.get('address', None), 'city': line.get('city', None),
111 | 'state': line.get('state', None),'zip': line.get('zip', None),'file_date': line.get('file_date', None),
112 | 'non_payment': line.get('non_payment', None),'breach': line.get('breach', None),
113 | 'nuisance': line.get('nuisance', None),'illegal_use': line.get('illegal_use', None),
114 | 'failure_to_sign_renewal': line.get('failure_to_sign_renewal', None),
115 | 'access_denial': line.get('access_denial', None),'unapproved_subtenant': line.get('unapproved_subtenant', None),
116 | 'owner_move_in': line.get('owner_move_in', None),'demolition': line.get('demolition', None),
117 | 'capital_improvement': line.get('capital_improvement', None),
118 | 'substantial_rehab': line.get('substantial_rehab', None),'ellis_act_withdrawal': line.get('ellis_act_withdrawal', None),
119 | 'condo_conversion': line.get('condo_conversion', None),'roommate_same_unit': line.get('roommate_same_unit', None),
120 | 'other_cause': line.get('other_cause', None),'late_payments': line.get('late_payments', None),
121 | 'lead_remediation': line.get('lead_remediation', None),'development': line.get('development', None),
122 | 'good_samaritan_ends': line.get('good_samaritan_ends', None),'constraints_date': line.get('constraints_date', None),
123 | 'supervisor_district': line.get('supervisor_district', None),'neighborhood': line.get('neighborhood', None)
124 | } for line in json_content))
125 | conn.commit()
126 |
127 |
128 | if self.source_data_type == 'csv':
129 |
130 | print('inserting csv...')
131 |
132 | file = io.StringIO(file_content)
133 |
134 | sql = "COPY %s FROM STDIN DELIMITER ','"
135 | if self.header == True:
136 | sql = "COPY %s FROM STDIN DELIMITER ',' CSV HEADER"
137 |
138 | schema = self.schema
139 | if isinstance(self.schema, tuple):
140 | schema = self.schema[0]
141 |
142 | table = self.table
143 | if isinstance(self.table, tuple):
144 | table = self.table[0]
145 |
146 | table = f'{schema}.{table}'
147 |
148 | with closing(pg_hook.get_conn()) as conn:
149 | with closing(conn.cursor()) as cur:
150 | cur.copy_expert(sql=sql % table, file=file)
151 | conn.commit()
152 |
153 | print('inserting complete...')
154 |
--------------------------------------------------------------------------------
/dags/sql/init_db_schema.sql:
--------------------------------------------------------------------------------
1 | -- echo "" > /home/airflow/airflow/dags/sql/init_db_schema.sql
2 | -- nano /home/airflow/airflow/dags/sql/init_db_schema.sql
3 |
4 | DROP SCHEMA IF EXISTS raw CASCADE;
5 | DROP SCHEMA IF EXISTS staging CASCADE;
6 | DROP SCHEMA IF EXISTS prod CASCADE;
7 |
8 | CREATE SCHEMA raw;
9 | CREATE SCHEMA staging;
10 | CREATE SCHEMA prod;
11 |
12 |
13 | -- Raw
14 | CREATE UNLOGGED TABLE raw.soda_evictions (
15 | raw_id text,
16 | created_at timestamp,
17 | updated_at timestamp,
18 | eviction_id text,
19 | address text,
20 | city text,
21 | state text,
22 | zip text,
23 | file_date timestamp,
24 | non_payment boolean,
25 | breach boolean,
26 | nuisance boolean,
27 | illegal_use boolean,
28 | failure_to_sign_renewal boolean,
29 | access_denial boolean,
30 | unapproved_subtenant boolean,
31 | owner_move_in boolean,
32 | demolition boolean,
33 | capital_improvement boolean,
34 | substantial_rehab boolean,
35 | ellis_act_withdrawal boolean,
36 | condo_conversion boolean,
37 | roommate_same_unit boolean,
38 | other_cause boolean,
39 | late_payments boolean,
40 | lead_remediation boolean,
41 | development boolean,
42 | good_samaritan_ends boolean,
43 | constraints_date timestamp,
44 | supervisor_district text,
45 | neighborhood text
46 | );
47 |
48 | CREATE UNLOGGED TABLE raw.district_data (
49 | district text,
50 | population text,
51 | households text,
52 | perc_asian text,
53 | perc_black text,
54 | perc_white text,
55 | perc_nat_am text,
56 | perc_nat_pac text,
57 | perc_other text,
58 | perc_latin text,
59 | median_age text,
60 | total_units text,
61 | perc_owner_occupied text,
62 | perc_renter_occupied text,
63 | median_rent_as_perc_of_income text,
64 | median_household_income text,
65 | median_family_income text,
66 | per_capita_income text,
67 | perc_in_poverty text
68 | );
69 | CREATE UNIQUE INDEX district_name_uniq_idx ON raw.district_data (district);
70 |
71 | CREATE UNLOGGED TABLE raw.neighborhood_data (
72 | acs_name text,
73 | db_name text,
74 | population text,
75 | households text,
76 | perc_asian text,
77 | perc_black text,
78 | perc_white text,
79 | perc_nat_am text,
80 | perc_nat_pac text,
81 | perc_other text,
82 | perc_latin text,
83 | median_age text,
84 | total_units text,
85 | perc_owner_occupied text,
86 | perc_renter_occupied text,
87 | median_rent_as_perc_of_income text,
88 | median_household_income text,
89 | median_family_income text,
90 | per_capita_income text,
91 | perc_in_poverty text
92 | );
93 | CREATE UNIQUE INDEX neighborhood_name_uniq_idx ON raw.neighborhood_data (acs_name);
94 |
95 | -- Staging
96 | CREATE TABLE staging.dim_district (
97 | district_key serial PRIMARY KEY,
98 | district text,
99 | population integer,
100 | households integer,
101 | percent_asian numeric,
102 | percent_black numeric,
103 | percent_white numeric,
104 | percent_native_am numeric,
105 | percent_pacific_isle numeric,
106 | percent_other_race numeric,
107 | percent_latin numeric,
108 | median_age numeric,
109 | total_units integer,
110 | percent_owner_occupied numeric,
111 | percent_renter_occupied numeric,
112 | median_rent_as_perc_of_income numeric,
113 | median_household_income numeric,
114 | median_family_income numeric,
115 | per_capita_income numeric,
116 | percent_in_poverty numeric
117 | );
118 | CREATE UNIQUE INDEX district_name_uniq_idx ON staging.dim_district (district);
119 |
120 | CREATE TABLE staging.dim_neighborhood (
121 | neighborhood_key serial PRIMARY KEY,
122 | neighborhood text,
123 | neighborhood_alt_name text,
124 | population integer,
125 | households integer,
126 | percent_asian numeric,
127 | percent_black numeric,
128 | percent_white numeric,
129 | percent_native_am numeric,
130 | percent_pacific_isle numeric,
131 | percent_other_race numeric,
132 | percent_latin numeric,
133 | median_age numeric,
134 | total_units integer,
135 | percent_owner_occupied numeric,
136 | percent_renter_occupied numeric,
137 | median_rent_as_perc_of_income numeric,
138 | median_household_income numeric,
139 | median_family_income numeric,
140 | per_capita_income numeric,
141 | percent_in_poverty numeric
142 | );
143 | CREATE UNIQUE INDEX neighborhood_name_uniq_idx ON staging.dim_neighborhood (neighborhood);
144 |
145 | CREATE TABLE staging.dim_location (
146 | location_key serial PRIMARY KEY,
147 | city text,
148 | state text,
149 | zip_code text
150 | );
151 |
152 | CREATE TABLE staging.dim_reason (
153 | reason_key serial PRIMARY KEY,
154 | reason_code text,
155 | reason_desc text
156 | );
157 |
158 | CREATE TABLE staging.br_reason_group (
159 | reason_group_key int,
160 | reason_key int
161 | );
162 | CREATE INDEX reason_group_key_idx ON staging.br_reason_group (reason_group_key);
163 | CREATE INDEX reason_key_idx ON staging.br_reason_group (reason_key);
164 |
165 | CREATE TABLE staging.dim_date (
166 | date_key int PRIMARY KEY,
167 | date date,
168 | year int,
169 | month int,
170 | month_name text,
171 | day int,
172 | day_of_year int,
173 | weekday_name text,
174 | calendar_week int,
175 | formatted_date text,
176 | quartal text,
177 | year_quartal text,
178 | year_month text,
179 | year_calendar_week text,
180 | weekend text,
181 | us_holiday text,
182 | period text,
183 | cw_start date,
184 | cw_end date,
185 | month_start date,
186 | month_end date
187 | );
188 |
189 | CREATE TABLE staging.fact_evictions (
190 | eviction_key text PRIMARY KEY,
191 | location_key int,
192 | district_key int,
193 | neighborhood_key int,
194 | reason_group_key int,
195 | file_date_key int,
196 | constraints_date_key int,
197 | street_address text
198 | );
199 |
200 |
201 | -- Prod
202 | CREATE TABLE prod.dim_district (
203 | district_key serial PRIMARY KEY,
204 | district text,
205 | population integer,
206 | households integer,
207 | percent_asian numeric,
208 | percent_black numeric,
209 | percent_white numeric,
210 | percent_native_am numeric,
211 | percent_pacific_isle numeric,
212 | percent_other_race numeric,
213 | percent_latin numeric,
214 | median_age numeric,
215 | total_units integer,
216 | percent_owner_occupied numeric,
217 | percent_renter_occupied numeric,
218 | median_rent_as_perc_of_income numeric,
219 | median_household_income numeric,
220 | median_family_income numeric,
221 | per_capita_income numeric,
222 | percent_in_poverty numeric
223 | );
224 |
225 | CREATE TABLE prod.dim_neighborhood (
226 | neighborhood_key serial PRIMARY KEY,
227 | neighborhood text,
228 | neighborhood_alt_name text,
229 | population integer,
230 | households integer,
231 | percent_asian numeric,
232 | percent_black numeric,
233 | percent_white numeric,
234 | percent_native_am numeric,
235 | percent_pacific_isle numeric,
236 | percent_other_race numeric,
237 | percent_latin numeric,
238 | median_age numeric,
239 | total_units integer,
240 | percent_owner_occupied numeric,
241 | percent_renter_occupied numeric,
242 | median_rent_as_perc_of_income numeric,
243 | median_household_income numeric,
244 | median_family_income numeric,
245 | per_capita_income numeric,
246 | percent_in_poverty numeric
247 | );
248 |
249 | CREATE TABLE prod.dim_location (
250 | location_key serial PRIMARY KEY,
251 | city text,
252 | state text,
253 | zip_code text
254 | );
255 |
256 | CREATE TABLE prod.dim_reason (
257 | reason_key serial PRIMARY KEY,
258 | reason_code text,
259 | reason_desc text
260 | );
261 |
262 | CREATE TABLE prod.br_reason_group (
263 | reason_group_key int,
264 | reason_key int
265 | );
266 | CREATE INDEX reason_group_key_idx ON prod.br_reason_group (reason_group_key);
267 | CREATE INDEX reason_key_idx ON prod.br_reason_group (reason_key);
268 |
269 | CREATE TABLE prod.dim_date (
270 | date_key int PRIMARY KEY,
271 | date date,
272 | year int,
273 | month int,
274 | month_name text,
275 | day int,
276 | day_of_year int,
277 | weekday_name text,
278 | calendar_week int,
279 | formatted_date text,
280 | quartal text,
281 | year_quartal text,
282 | year_month text,
283 | year_calendar_week text,
284 | weekend text,
285 | us_holiday text,
286 | period text,
287 | cw_start date,
288 | cw_end date,
289 | month_start date,
290 | month_end date
291 | );
292 |
293 | CREATE TABLE prod.fact_evictions (
294 | eviction_key text PRIMARY KEY,
295 | location_key int,
296 | district_key int,
297 | neighborhood_key int,
298 | reason_group_key int,
299 | file_date_key int,
300 | constraints_date_key int,
301 | street_address text
302 | );
303 |
--------------------------------------------------------------------------------
/dags/sql/full_load.sql:
--------------------------------------------------------------------------------
1 | -- echo "" > /home/airflow/airflow/dags/sql/full_load.sql
2 | -- nano /home/airflow/airflow/dags/sql/full_load.sql
3 |
4 | -- Populate District Dimension
5 |
6 | INSERT INTO staging.dim_district (district_key, district)
7 | SELECT -1, 'Unknown';
8 |
9 | INSERT INTO staging.dim_district (district, population, households, percent_asian, percent_black, percent_white, percent_native_am,
10 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
11 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
12 | median_family_income, per_capita_income, percent_in_poverty)
13 | SELECT
14 | district,
15 | population::int,
16 | households::int,
17 | perc_asian::numeric as percent_asian,
18 | perc_black::numeric as percent_black,
19 | perc_white::numeric as percent_white,
20 | perc_nat_am::numeric as percent_native_am,
21 | perc_nat_pac::numeric as percent_pacific_isle,
22 | perc_other::numeric as percent_other_race,
23 | perc_latin::numeric as percent_latin,
24 | median_age::numeric,
25 | total_units::int,
26 | perc_owner_occupied::numeric as percent_owner_occupied,
27 | perc_renter_occupied::numeric as percent_renter_occupied,
28 | median_rent_as_perc_of_income::numeric,
29 | median_household_income::numeric,
30 | median_family_income::numeric,
31 | per_capita_income::numeric,
32 | perc_in_poverty::numeric as percent_in_poverty
33 | FROM raw.district_data;
34 |
35 |
36 | -- Populate Neighborhood Dimension
37 |
38 | INSERT INTO staging.dim_neighborhood (neighborhood_key, neighborhood)
39 | SELECT -1, 'Unknown';
40 |
41 | INSERT INTO staging.dim_neighborhood (neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white,
42 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
43 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
44 | median_family_income, per_capita_income, percent_in_poverty)
45 | SELECT
46 | acs_name as neighborhood,
47 | db_name as neighborhood_alt_name,
48 | population::int,
49 | households::int,
50 | perc_asian::numeric as percent_asian,
51 | perc_black::numeric as percent_black,
52 | perc_white::numeric as percent_white,
53 | perc_nat_am::numeric as percent_native_am,
54 | perc_nat_pac::numeric as percent_pacific_isle,
55 | perc_other::numeric as percent_other_race,
56 | perc_latin::numeric as percent_latin,
57 | median_age::numeric,
58 | total_units::int,
59 | perc_owner_occupied::numeric as percent_owner_occupied,
60 | perc_renter_occupied::numeric as percent_renter_occupied,
61 | median_rent_as_perc_of_income::numeric,
62 | median_household_income::numeric,
63 | median_family_income::numeric,
64 | per_capita_income::numeric,
65 | perc_in_poverty::numeric as percent_in_poverty
66 | FROM raw.neighborhood_data;
67 |
68 |
69 | -- Populate Location Dimension
70 |
71 | INSERT INTO staging.dim_location (location_key, city, state, zip_code)
72 | SELECT -1, 'Unknown', 'Unknown', 'Unknown';
73 |
74 | INSERT INTO staging.dim_location (city, state, zip_code)
75 | SELECT DISTINCT
76 | COALESCE(city, 'Unknown') as city,
77 | COALESCE(state, 'Unknown') as state,
78 | COALESCE(zip, 'Unknown') as zip_code
79 | FROM raw.soda_evictions
80 | WHERE
81 | city IS NOT NULL OR state IS NOT NULL OR zip IS NOT NULL;
82 |
83 |
84 | -- Populate Reason Dimension
85 |
86 | INSERT INTO staging.dim_reason (reason_key, reason_code, reason_desc)
87 | VALUES (-1, 'Unknown', 'Unknown');
88 |
89 | INSERT INTO staging.dim_reason (reason_code, reason_desc)
90 | VALUES ('non_payment', 'Non-Payment'),
91 | ('breach', 'Breach'),
92 | ('nuisance', 'Nuisance'),
93 | ('illegal_use', 'Illegal Use'),
94 | ('failure_to_sign_renewal', 'Failure to Sign Renewal'),
95 | ('access_denial', 'Access Denial'),
96 | ('unapproved_subtenant', 'Unapproved Subtenant'),
97 | ('owner_move_in', 'Owner Move-In'),
98 | ('demolition', 'Demolition'),
99 | ('capital_improvement', 'Capital Improvement'),
100 | ('substantial_rehab', 'Substantial Rehab'),
101 | ('ellis_act_withdrawal', 'Ellis Act Withdrawal'),
102 | ('condo_conversion', 'Condo Conversion'),
103 | ('roommate_same_unit', 'Roommate Same Unit'),
104 | ('other_cause', 'Other Cause'),
105 | ('late_payments', 'Late Payments'),
106 | ('lead_remediation', 'Lead Remediation'),
107 | ('development', 'Development'),
108 | ('good_samaritan_ends', 'Good Samaritan Ends');
109 |
110 |
111 | -- Populate Reason Bridge Table
112 |
113 | SELECT
114 | ROW_NUMBER() OVER(ORDER BY concat_reason) as group_key,  -- surrogate key for each distinct reason combination
115 | string_to_array(concat_reason, '|') as reason_array,
116 | concat_reason
117 | INTO TEMP tmp_reason_group
118 | FROM (
119 | SELECT DISTINCT
120 | TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)) as concat_reason
121 | FROM (
122 | SELECT
123 | eviction_id,
124 | CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END||
125 | CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END||
126 | CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END||
127 | CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END||
128 | CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END||
129 | CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END||
130 | CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END||
131 | CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END||
132 | CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END||
133 | CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END||
134 | CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END||
135 | CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END||
136 | CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END||
137 | CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END||
138 | CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END||
139 | CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END||
140 | CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END||
141 | CASE WHEN development = 'true' THEN 'development|' ELSE '' END||
142 | CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END
143 | as concat_reason
144 | FROM raw.soda_evictions
145 | ) f1
146 | ) f2;
147 |
148 | INSERT INTO staging.br_reason_group (reason_group_key, reason_key)
149 | SELECT DISTINCT
150 | group_key as reason_group_key,
151 | reason_key
152 | FROM (SELECT group_key, unnest(reason_array) unnested FROM tmp_reason_group) grp
153 | JOIN staging.dim_reason r ON r.reason_code = grp.unnested;
154 |
155 |
156 | -- Populate Date Dimension Table
157 |
158 | INSERT INTO staging.dim_date (date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week,
159 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, us_holiday,
160 | period, cw_start, cw_end, month_start, month_end)
161 | SELECT -1, '1900-01-01', -1, -1, 'Unknown', -1, -1, 'Unknown', -1, 'Unknown', 'Unknown', 'Unknown', 'Unknown',
162 | 'Unknown', 'Unknown', 'Unknown', 'Unknown', '1900-01-01', '1900-01-01', '1900-01-01', '1900-01-01';
163 |
164 |
165 | INSERT INTO staging.dim_date (date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week,
166 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, us_holiday,
167 | period, cw_start, cw_end, month_start, month_end)
168 | SELECT
169 | TO_CHAR(datum, 'yyyymmdd')::int as date_key,
170 | datum as date,
171 | EXTRACT(YEAR FROM datum) as year,
172 | EXTRACT(MONTH FROM datum) as month,
173 | TO_CHAR(datum, 'TMMonth') as month_name,
174 | EXTRACT(DAY FROM datum) as day,
175 | EXTRACT(doy FROM datum) as day_of_year,
176 | TO_CHAR(datum, 'TMDay') as weekday_name,
177 | EXTRACT(week FROM datum) as calendar_week,
178 | TO_CHAR(datum, 'dd. mm. yyyy') as formatted_date,
179 | 'Q' || TO_CHAR(datum, 'Q') as quartal,
180 | TO_CHAR(datum, 'yyyy/"Q"Q') as year_quartal,
181 | TO_CHAR(datum, 'yyyy/mm') as year_month,
182 | TO_CHAR(datum, 'iyyy/IW') as year_calendar_week,
183 | CASE WHEN EXTRACT(isodow FROM datum) IN (6, 7) THEN 'Weekend' ELSE 'Weekday' END as weekend,
184 | CASE WHEN TO_CHAR(datum, 'MMDD') IN ('0101', '0704', '1225', '1226') THEN 'Holiday' ELSE 'No holiday' END
185 | as us_holiday,
186 | CASE WHEN TO_CHAR(datum, 'MMDD') BETWEEN '0701' AND '0831' THEN 'Summer break'
187 | WHEN TO_CHAR(datum, 'MMDD') BETWEEN '1115' AND '1225' THEN 'Christmas season'
188 | WHEN TO_CHAR(datum, 'MMDD') > '1225' OR TO_CHAR(datum, 'MMDD') <= '0106' THEN 'Winter break'
189 | ELSE 'Normal' END
190 | as period,
191 | datum + (1 - EXTRACT(isodow FROM datum))::int as cw_start,
192 | datum + (7 - EXTRACT(isodow FROM datum))::int as cw_end,
193 | datum + (1 - EXTRACT(DAY FROM datum))::int as month_start,
194 | (datum + (1 - EXTRACT(DAY FROM datum))::int + '1 month'::interval)::date - '1 day'::interval as month_end
195 | FROM (
196 | SELECT '1997-01-01'::date + SEQUENCE.DAY as datum
197 | FROM generate_series(0,10956) as SEQUENCE(DAY)  -- one row per day from 1997-01-01 onward (~30 years)
198 | GROUP BY SEQUENCE.DAY
199 | ) DQ;
200 |
201 |
202 | -- Populate Evictions Fact Table
203 |
204 | SELECT
205 | eviction_id,
206 | group_key as reason_group_key
207 | INTO tmp_reason_facts
208 | FROM (
209 | SELECT
210 | eviction_id,
211 | TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)) as concat_reason
212 | FROM (
213 | SELECT
214 | eviction_id,
215 | CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END||
216 | CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END||
217 | CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END||
218 | CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END||
219 | CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END||
220 | CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END||
221 | CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END||
222 | CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END||
223 | CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END||
224 | CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END||
225 | CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END||
226 | CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END||
227 | CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END||
228 | CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END||
229 | CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END||
230 | CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END||
231 | CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END||
232 | CASE WHEN development = 'true' THEN 'development|' ELSE '' END||
233 | CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END
234 | as concat_reason
235 | FROM raw.soda_evictions
236 | ) grp
237 | ) f_grp
238 | JOIN tmp_reason_group t_grp ON f_grp.concat_reason = t_grp.concat_reason;
239 |
240 |
241 | INSERT INTO staging.fact_evictions (eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key,
242 | constraints_date_key, street_address)
243 | SELECT
244 | f.eviction_id as eviction_key,
245 | COALESCE(d.district_key, -1) as district_key,
246 | COALESCE(n.neighborhood_key, -1) as neighborhood_key,
247 | COALESCE(l.location_key, -1) as location_key,
248 | reason_group_key,
249 | COALESCE(dt1.date_key, -1) as file_date_key,
250 | COALESCE(dt2.date_key, -1) as constraints_date_key,
251 | f.address as street_address
252 | FROM raw.soda_evictions f
253 | LEFT JOIN tmp_reason_facts r ON f.eviction_id = r.eviction_id
254 | LEFT JOIN staging.dim_district d ON f.supervisor_district = d.district
255 | LEFT JOIN staging.dim_neighborhood n ON f.neighborhood = n.neighborhood_alt_name
256 | LEFT JOIN staging.dim_location l
257 | ON COALESCE(f.city, 'Unknown') = l.city
258 | AND COALESCE(f.state, 'Unknown') = l.state
259 | AND COALESCE(f.zip, 'Unknown') = l.zip_code
260 | LEFT JOIN staging.dim_date dt1 ON f.file_date = dt1.date
261 | LEFT JOIN staging.dim_date dt2 ON f.constraints_date = dt2.date;
262 |
263 | DROP TABLE tmp_reason_group;
264 | DROP TABLE tmp_reason_facts;
265 |
266 |
267 | -- Migrate to Production Schema
268 |
269 | INSERT INTO prod.dim_district
270 | (district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am,
271 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
272 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
273 | median_family_income, per_capita_income, percent_in_poverty)
274 | SELECT
275 | district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am,
276 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
277 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
278 | median_family_income, per_capita_income, percent_in_poverty
279 | FROM staging.dim_district;
280 |
281 | INSERT INTO prod.dim_neighborhood
282 | (neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white,
283 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
284 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
285 | median_family_income, per_capita_income, percent_in_poverty)
286 | SELECT
287 | neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white,
288 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
289 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
290 | median_family_income, per_capita_income, percent_in_poverty
291 | FROM staging.dim_neighborhood;
292 |
293 | INSERT INTO prod.dim_location (location_key, city, state, zip_code)
294 | SELECT location_key, city, state, zip_code
295 | FROM staging.dim_location;
296 |
297 | INSERT INTO prod.dim_reason (reason_key, reason_code, reason_desc)
298 | SELECT reason_key, reason_code, reason_desc
299 | FROM staging.dim_reason;
300 |
301 | INSERT INTO prod.br_reason_group (reason_group_key, reason_key)
302 | SELECT reason_group_key, reason_key
303 | FROM staging.br_reason_group;
304 |
305 | INSERT INTO prod.dim_date
306 | (date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week,
307 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, us_holiday,
308 | period, cw_start, cw_end, month_start, month_end)
309 | SELECT
310 | date_key, date, year, month, month_name, day, day_of_year, weekday_name, calendar_week,
311 | formatted_date, quartal, year_quartal, year_month, year_calendar_week, weekend, us_holiday, period,
312 | cw_start, cw_end, month_start, month_end
313 | FROM staging.dim_date;
314 |
315 | INSERT INTO prod.fact_evictions
316 | (eviction_key, district_key, neighborhood_key, location_key, reason_group_key,
317 | file_date_key, constraints_date_key, street_address)
318 | SELECT
319 | eviction_key, district_key, neighborhood_key, location_key, reason_group_key,
320 | file_date_key, constraints_date_key, street_address
321 | FROM staging.fact_evictions;
322 |
--------------------------------------------------------------------------------
/dags/sql/incremental_load.sql:
--------------------------------------------------------------------------------
1 | -- echo "" > /home/airflow/airflow/dags/sql/incremental_load.sql
2 | -- nano /home/airflow/airflow/dags/sql/incremental_load.sql
3 |
4 | -- Populate District Dimension
5 |
6 | INSERT INTO staging.dim_district
7 | (district, population, households, percent_asian, percent_black, percent_white, percent_native_am,
8 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
9 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
10 | median_family_income, per_capita_income, percent_in_poverty)
11 | SELECT
12 | district,
13 | population::int,
14 | households::int,
15 | perc_asian::numeric as percent_asian,
16 | perc_black::numeric as percent_black,
17 | perc_white::numeric as percent_white,
18 | perc_nat_am::numeric as percent_native_am,
19 | perc_nat_pac::numeric as percent_pacific_isle,
20 | perc_other::numeric as percent_other_race,
21 | perc_latin::numeric as percent_latin,
22 | median_age::numeric,
23 | total_units::int,
24 | perc_owner_occupied::numeric as percent_owner_occupied,
25 | perc_renter_occupied::numeric as percent_renter_occupied,
26 | median_rent_as_perc_of_income::numeric,
27 | median_household_income::numeric,
28 | median_family_income::numeric,
29 | per_capita_income::numeric,
30 | perc_in_poverty::numeric as percent_in_poverty
31 | FROM raw.district_data
32 | ON CONFLICT (district) DO UPDATE SET
33 | population = EXCLUDED.population,
34 | households = EXCLUDED.households,
35 | percent_asian = EXCLUDED.percent_asian,
36 | percent_black = EXCLUDED.percent_black,
37 | percent_white = EXCLUDED.percent_white,
38 | percent_native_am = EXCLUDED.percent_native_am,
39 | percent_pacific_isle = EXCLUDED.percent_pacific_isle,
40 | percent_other_race = EXCLUDED.percent_other_race,
41 | percent_latin = EXCLUDED.percent_latin,
42 | median_age = EXCLUDED.median_age,
43 | total_units = EXCLUDED.total_units,
44 | percent_owner_occupied = EXCLUDED.percent_owner_occupied,
45 | percent_renter_occupied = EXCLUDED.percent_renter_occupied,
46 | median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income,
47 | median_household_income = EXCLUDED.median_household_income,
48 | median_family_income = EXCLUDED.median_family_income,
49 | per_capita_income = EXCLUDED.per_capita_income,
50 | percent_in_poverty = EXCLUDED.percent_in_poverty;
51 |
52 |
53 | -- Populate Neighborhood Dimension
54 |
55 | INSERT INTO staging.dim_neighborhood
56 | (neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white,
57 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
58 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
59 | median_family_income, per_capita_income, percent_in_poverty)
60 | SELECT
61 | acs_name as neighborhood,
62 | db_name as neighborhood_alt_name,
63 | population::int,
64 | households::int,
65 | perc_asian::numeric as percent_asian,
66 | perc_black::numeric as percent_black,
67 | perc_white::numeric as percent_white,
68 | perc_nat_am::numeric as percent_native_am,
69 | perc_nat_pac::numeric as percent_pacific_isle,
70 | perc_other::numeric as percent_other_race,
71 | perc_latin::numeric as percent_latin,
72 | median_age::numeric,
73 | total_units::int,
74 | perc_owner_occupied::numeric as percent_owner_occupied,
75 | perc_renter_occupied::numeric as percent_renter_occupied,
76 | median_rent_as_perc_of_income::numeric,
77 | median_household_income::numeric,
78 | median_family_income::numeric,
79 | per_capita_income::numeric,
80 | perc_in_poverty::numeric as percent_in_poverty
81 | FROM raw.neighborhood_data
82 | ON CONFLICT (neighborhood) DO UPDATE SET
83 | neighborhood_alt_name = EXCLUDED.neighborhood_alt_name,
84 | population = EXCLUDED.population,
85 | households = EXCLUDED.households,
86 | percent_asian = EXCLUDED.percent_asian,
87 | percent_black = EXCLUDED.percent_black,
88 | percent_white = EXCLUDED.percent_white,
89 | percent_native_am = EXCLUDED.percent_native_am,
90 | percent_pacific_isle = EXCLUDED.percent_pacific_isle,
91 | percent_other_race = EXCLUDED.percent_other_race,
92 | percent_latin = EXCLUDED.percent_latin,
93 | median_age = EXCLUDED.median_age,
94 | total_units = EXCLUDED.total_units,
95 | percent_owner_occupied = EXCLUDED.percent_owner_occupied,
96 | percent_renter_occupied = EXCLUDED.percent_renter_occupied,
97 | median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income,
98 | median_household_income = EXCLUDED.median_household_income,
99 | median_family_income = EXCLUDED.median_family_income,
100 | per_capita_income = EXCLUDED.per_capita_income,
101 | percent_in_poverty = EXCLUDED.percent_in_poverty;
102 |
103 |
104 | -- Populate Location Dimension
105 |
106 | INSERT INTO staging.dim_location (city, state, zip_code)
107 | SELECT
108 | se.city,
109 | se.state,
110 | se.zip_code
111 | FROM (
112 | SELECT DISTINCT
113 | COALESCE(city, 'Unknown') as city,
114 | COALESCE(state, 'Unknown') as state,
115 | COALESCE(zip, 'Unknown') as zip_code
116 | FROM raw.soda_evictions
117 | ) se
118 | LEFT JOIN staging.dim_location dl
119 | ON se.city = dl.city
120 | AND se.state = dl.state
121 | AND se.zip_code = dl.zip_code
122 | WHERE
123 | dl.location_key IS NULL;
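-- dim_location uses an anti-join (LEFT JOIN ... WHERE location_key IS NULL)
-- rather than ON CONFLICT: location_key is presumably a generated surrogate
-- key and (city, state, zip_code) may carry no unique constraint, so only
-- genuinely new combinations are inserted and existing keys stay stable.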
124 |
125 |
126 | -- Populate Reason Bridge Table
127 |
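-- An eviction can cite several reasons at once, so reasons attach to the fact
-- table through a group/bridge pattern: each distinct combination of reason
-- flags gets one reason_group_key in staging.br_reason_group, with one row per
-- member reason_key. The steps below (1) snapshot existing groups as sorted
-- reason_key arrays, (2) derive the combinations present in raw.soda_evictions,
-- and (3) assign keys to combinations not seen before.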
128 | SELECT
129 | reason_group_key,
130 | ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
131 | INTO TEMP tmp_existing_reason_groups
132 | FROM staging.br_reason_group
133 | GROUP BY reason_group_key;
134 |
135 | SELECT
136 | concat_reason,
137 | ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
138 | INTO TEMP tmp_new_reason_groups
139 | FROM (
140 | SELECT DISTINCT
141 | string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|') as concat_reason,
142 | unnest(string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|')) unnested_reason
143 | FROM (
144 | SELECT DISTINCT
145 | CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END||
146 | CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END||
147 | CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END||
148 | CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END||
149 | CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END||
150 | CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END||
151 | CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END||
152 | CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END||
153 | CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END||
154 | CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END||
155 | CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END||
156 | CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END||
157 | CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END||
158 | CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END||
159 | CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END||
160 | CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END||
161 | CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END||
162 | CASE WHEN development = 'true' THEN 'development|' ELSE '' END||
163 | CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END
164 | as concat_reason
165 | FROM raw.soda_evictions
166 | ) se1
167 | GROUP BY concat_reason
168 | ) se2
169 | JOIN staging.dim_reason r ON se2.unnested_reason = r.reason_code
170 | GROUP BY concat_reason;
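-- Illustrative example (hypothetical row): an eviction with non_payment and
-- breach set to 'true' concatenates to 'non_payment|breach|', which is trimmed
-- and split to {non_payment,breach}; the join to staging.dim_reason maps each
-- code to its reason_key before ARRAY_AGG collapses the combination into one
-- sorted array such as {1,2}.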
171 |
172 | INSERT INTO staging.br_reason_group (reason_group_key, reason_key)
173 | SELECT
174 |     COALESCE(final_grp.max_key, 0) + new_grp.tmp_group_key as reason_group_key,  -- COALESCE covers the case where no groups exist yet
175 | new_grp.reason_key as reason_key
176 | FROM (
177 |     SELECT
178 | ROW_NUMBER() OVER(ORDER BY concat_reason) as tmp_group_key,
179 | concat_reason,
180 | unnest(n.rk_array) as reason_key
181 | FROM tmp_new_reason_groups n
182 | LEFT JOIN tmp_existing_reason_groups e ON n.rk_array = e.rk_array
183 | WHERE e.reason_group_key IS NULL
184 | ) new_grp
185 | LEFT JOIN (SELECT MAX(reason_group_key) max_key FROM staging.br_reason_group) final_grp ON 1=1
186 | ORDER BY reason_group_key, reason_key;
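-- New combinations are numbered with ROW_NUMBER() and offset by the current
-- MAX(reason_group_key), so freshly minted group keys continue the existing
-- sequence; matching on the sorted rk_array keeps already-known combinations
-- on their original keys.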
187 |
188 | DROP TABLE tmp_existing_reason_groups;
189 | DROP TABLE tmp_new_reason_groups;
190 |
191 |
192 | -- Populate Staging Fact Table
193 |
194 | SELECT
195 | reason_group_key,
196 | ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
197 | INTO TEMP tmp_existing_reason_groups
198 | FROM staging.br_reason_group
199 | GROUP BY reason_group_key;
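-- Rebuilt here so the snapshot also contains the groups inserted just above.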
200 |
201 | SELECT
202 | eviction_id,
203 | ARRAY_AGG(reason_key ORDER BY reason_key ASC) as rk_array
204 | INTO TEMP tmp_fct_reason_groups
205 | FROM (
206 | SELECT
207 | eviction_id,
208 | unnest(string_to_array(TRIM(TRAILING '|' FROM (CASE WHEN concat_reason = '' THEN 'Unknown' ELSE concat_reason END)), '|')) as unnested_reason
209 | FROM (
210 | SELECT
211 | eviction_id,
212 | CASE WHEN non_payment = 'true' THEN 'non_payment|' ELSE '' END||
213 | CASE WHEN breach = 'true' THEN 'breach|' ELSE '' END||
214 | CASE WHEN nuisance = 'true' THEN 'nuisance|' ELSE '' END||
215 | CASE WHEN illegal_use = 'true' THEN 'illegal_use|' ELSE '' END||
216 | CASE WHEN failure_to_sign_renewal = 'true' THEN 'failure_to_sign_renewal|' ELSE '' END||
217 | CASE WHEN access_denial = 'true' THEN 'access_denial|' ELSE '' END||
218 | CASE WHEN unapproved_subtenant = 'true' THEN 'unapproved_subtenant|' ELSE '' END||
219 | CASE WHEN owner_move_in = 'true' THEN 'owner_move_in|' ELSE '' END||
220 | CASE WHEN demolition = 'true' THEN 'demolition|' ELSE '' END||
221 | CASE WHEN capital_improvement = 'true' THEN 'capital_improvement|' ELSE '' END||
222 | CASE WHEN substantial_rehab = 'true' THEN 'substantial_rehab|' ELSE '' END||
223 | CASE WHEN ellis_act_withdrawal = 'true' THEN 'ellis_act_withdrawal|' ELSE '' END||
224 | CASE WHEN condo_conversion = 'true' THEN 'condo_conversion|' ELSE '' END||
225 | CASE WHEN roommate_same_unit = 'true' THEN 'roommate_same_unit|' ELSE '' END||
226 | CASE WHEN other_cause = 'true' THEN 'other_cause|' ELSE '' END||
227 | CASE WHEN late_payments = 'true' THEN 'late_payments|' ELSE '' END||
228 | CASE WHEN lead_remediation = 'true' THEN 'lead_remediation|' ELSE '' END||
229 | CASE WHEN development = 'true' THEN 'development|' ELSE '' END||
230 | CASE WHEN good_samaritan_ends = 'true' THEN 'good_samaritan_ends|' ELSE '' END
231 | as concat_reason
232 | FROM raw.soda_evictions
233 | ) se1
234 | ) se2
235 | JOIN staging.dim_reason r ON se2.unnested_reason = r.reason_code
236 | GROUP BY se2.eviction_id;
237 |
238 | SELECT
239 | eviction_id,
240 | reason_group_key
241 | INTO TEMP tmp_reason_group_lookup
242 | FROM tmp_fct_reason_groups f
243 | JOIN tmp_existing_reason_groups d ON f.rk_array = d.rk_array;
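-- Both arrays were aggregated with ORDER BY reason_key ASC, so equality on
-- rk_array is an exact set match and every eviction resolves to one group key.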
244 |
245 |
246 | TRUNCATE TABLE staging.fact_evictions;
247 |
248 | INSERT INTO staging.fact_evictions
249 | (eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key, constraints_date_key, street_address)
250 | SELECT
251 | f.eviction_id as eviction_key,
252 | COALESCE(d.district_key, -1) as district_key,
253 | COALESCE(n.neighborhood_key, -1) as neighborhood_key,
254 | COALESCE(l.location_key, -1) as location_key,
255 | r.reason_group_key as reason_group_key,
256 | COALESCE(dt1.date_key, -1) as file_date_key,
257 | COALESCE(dt2.date_key, -1) as constraints_date_key,
258 | f.address as street_address
259 | FROM raw.soda_evictions f
260 | JOIN tmp_reason_group_lookup r ON f.eviction_id = r.eviction_id
261 | LEFT JOIN staging.dim_district d ON f.supervisor_district = d.district
262 | LEFT JOIN staging.dim_neighborhood n ON f.neighborhood = n.neighborhood_alt_name
263 | LEFT JOIN staging.dim_location l
264 | ON COALESCE(f.city, 'Unknown') = l.city
265 | AND COALESCE(f.state, 'Unknown') = l.state
266 | AND COALESCE(f.zip, 'Unknown') = l.zip_code
267 | LEFT JOIN staging.dim_date dt1 ON f.file_date = dt1.date
268 | LEFT JOIN staging.dim_date dt2 ON f.constraints_date = dt2.date;
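-- COALESCE(..., -1) points unmatched dimension lookups at the 'Unknown' member
-- (the schema is presumed to seed each dimension with a -1 row); the reason
-- lookup can stay an inner join because rows with no flags set were mapped to
-- the 'Unknown' reason earlier in the pipeline.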
269 |
270 | DROP TABLE tmp_existing_reason_groups;
271 | DROP TABLE tmp_fct_reason_groups;
272 | DROP TABLE tmp_reason_group_lookup;
273 |
274 |
275 | -- Merge Into Production Schema
276 |
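-- Staging carries the surrogate keys forward, so the prod merge is a plain
-- upsert on each table's primary key and the whole script stays re-runnable.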
277 | INSERT INTO prod.dim_district
278 | (district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am,
279 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
280 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
281 | median_family_income, per_capita_income, percent_in_poverty)
282 | SELECT
283 | district_key, district, population, households, percent_asian, percent_black, percent_white, percent_native_am,
284 | percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
285 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
286 | median_family_income, per_capita_income, percent_in_poverty
287 | FROM staging.dim_district
288 | ON CONFLICT(district_key) DO UPDATE SET
289 | district = EXCLUDED.district,
290 | population = EXCLUDED.population,
291 | households = EXCLUDED.households,
292 | percent_asian = EXCLUDED.percent_asian,
293 | percent_black = EXCLUDED.percent_black,
294 | percent_white = EXCLUDED.percent_white,
295 | percent_native_am = EXCLUDED.percent_native_am,
296 | percent_pacific_isle = EXCLUDED.percent_pacific_isle,
297 | percent_other_race = EXCLUDED.percent_other_race,
298 | percent_latin = EXCLUDED.percent_latin,
299 | median_age = EXCLUDED.median_age,
300 | total_units = EXCLUDED.total_units,
301 | percent_owner_occupied = EXCLUDED.percent_owner_occupied,
302 | percent_renter_occupied = EXCLUDED.percent_renter_occupied,
303 | median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income,
304 | median_household_income = EXCLUDED.median_household_income,
305 | median_family_income = EXCLUDED.median_family_income,
306 | per_capita_income = EXCLUDED.per_capita_income,
307 | percent_in_poverty = EXCLUDED.percent_in_poverty;
308 |
309 |
310 | INSERT INTO prod.dim_neighborhood
311 | (neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white,
312 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
313 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
314 | median_family_income, per_capita_income, percent_in_poverty)
315 | SELECT
316 | neighborhood_key, neighborhood, neighborhood_alt_name, population, households, percent_asian, percent_black, percent_white,
317 | percent_native_am, percent_pacific_isle, percent_other_race, percent_latin, median_age, total_units,
318 | percent_owner_occupied, percent_renter_occupied, median_rent_as_perc_of_income, median_household_income,
319 | median_family_income, per_capita_income, percent_in_poverty
320 | FROM staging.dim_neighborhood
321 | ON CONFLICT (neighborhood_key) DO UPDATE SET
322 | neighborhood = EXCLUDED.neighborhood,
323 | neighborhood_alt_name = EXCLUDED.neighborhood_alt_name,
324 | population = EXCLUDED.population,
325 | households = EXCLUDED.households,
326 | percent_asian = EXCLUDED.percent_asian,
327 | percent_black = EXCLUDED.percent_black,
328 | percent_white = EXCLUDED.percent_white,
329 | percent_native_am = EXCLUDED.percent_native_am,
330 | percent_pacific_isle = EXCLUDED.percent_pacific_isle,
331 | percent_other_race = EXCLUDED.percent_other_race,
332 | percent_latin = EXCLUDED.percent_latin,
333 | median_age = EXCLUDED.median_age,
334 | total_units = EXCLUDED.total_units,
335 | percent_owner_occupied = EXCLUDED.percent_owner_occupied,
336 | percent_renter_occupied = EXCLUDED.percent_renter_occupied,
337 | median_rent_as_perc_of_income = EXCLUDED.median_rent_as_perc_of_income,
338 | median_household_income = EXCLUDED.median_household_income,
339 | median_family_income = EXCLUDED.median_family_income,
340 | per_capita_income = EXCLUDED.per_capita_income,
341 | percent_in_poverty = EXCLUDED.percent_in_poverty;
342 |
343 |
344 | INSERT INTO prod.dim_location (location_key, city, state, zip_code)
345 | SELECT location_key, city, state, zip_code
346 | FROM staging.dim_location
347 | ON CONFLICT (location_key) DO NOTHING;
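-- DO NOTHING suffices here: city, state, and zip_code are the only attributes,
-- so an existing location_key has nothing to update.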
348 |
349 |
350 | INSERT INTO prod.br_reason_group (reason_group_key, reason_key)
351 | SELECT stg.reason_group_key, stg.reason_key
352 | FROM staging.br_reason_group stg
353 | LEFT JOIN prod.br_reason_group prd
354 | ON stg.reason_group_key = prd.reason_group_key
355 | AND stg.reason_key = prd.reason_key
356 | WHERE
357 | prd.reason_group_key IS NULL;
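-- Bridge rows are append-only, and (reason_group_key, reason_key) may not be
-- declared unique, so the anti-join stands in for ON CONFLICT DO NOTHING.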
358 |
359 |
360 | INSERT INTO prod.fact_evictions
361 | (eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key, constraints_date_key, street_address)
362 | SELECT eviction_key, district_key, neighborhood_key, location_key, reason_group_key, file_date_key, constraints_date_key, street_address
363 | FROM staging.fact_evictions
364 | ON CONFLICT (eviction_key) DO UPDATE SET
365 | district_key = EXCLUDED.district_key,
366 | neighborhood_key = EXCLUDED.neighborhood_key,
367 | location_key = EXCLUDED.location_key,
368 | reason_group_key = EXCLUDED.reason_group_key,
369 | file_date_key = EXCLUDED.file_date_key,
370 | constraints_date_key = EXCLUDED.constraints_date_key,
371 | street_address = EXCLUDED.street_address;
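-- End-to-end, every prod write is an upsert or an anti-join insert, so the
-- full load can be replayed without manual cleanup of the production schema.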
372 |
--------------------------------------------------------------------------------