├── Airbnb Project ├── README.md ├── analyses │ └── full_moon_no_sleep.sql ├── assets │ └── input_schema.png ├── dbt_project.yml ├── environment-setup.md ├── logs │ └── dbt.log ├── macros │ ├── no_nulls_in_columns.sql │ └── positive_value.sql ├── models │ ├── dashboard.yml │ ├── dim │ │ ├── dim_hosts_cleansed.sql │ │ ├── dim_listings_cleansed.sql │ │ └── dim_listings_w_hosts.sql │ ├── docs.md │ ├── fct │ │ └── fct_reviews.sql │ ├── mart │ │ └── mart_fullmoon_reviews.sql │ ├── overview.md │ ├── schema.yml │ ├── sources.yml │ └── src │ │ ├── src_hosts.sql │ │ ├── src_listings.sql │ │ └── src_reviews.sql ├── packages.yml ├── seeds │ └── seed_full_moon_dates.csv ├── snapshots │ └── scd_raw_listings.sql ├── target │ ├── assets │ │ └── input_schema.png │ ├── catalog.json │ ├── compiled │ │ └── dbtlearn │ │ │ ├── analyses │ │ │ └── full_moon_no_sleep.sql │ │ │ ├── models │ │ │ ├── dim │ │ │ │ ├── dim_hosts_cleansed.sql │ │ │ │ ├── dim_listings_cleansed.sql │ │ │ │ └── dim_listings_w_hosts.sql │ │ │ ├── fct │ │ │ │ └── fct_reviews.sql │ │ │ ├── mart │ │ │ │ └── mart_fullmoon_reviews.sql │ │ │ ├── schema.yml │ │ │ │ ├── accepted_values_dim_hosts_cleansed_is_superhost__t__f.sql │ │ │ │ ├── accepted_values_dim_listings_c_1ca6148a08c62a5218f2a162f9d2a9a6.sql │ │ │ │ ├── dbt_expectations_expect_column_07e7a515218ef6e3a17e164c642c7d18.sql │ │ │ │ ├── dbt_expectations_expect_column_39596d790161761077ff1592b68943f6.sql │ │ │ │ ├── dbt_expectations_expect_column_8e138814a11b6202811546795bffca5d.sql │ │ │ │ ├── dbt_expectations_expect_column_c59e300e0dddb335c4211147100ac1c6.sql │ │ │ │ ├── dbt_expectations_expect_table__fbda7436ebe2ffe341acf0622c76d629.sql │ │ │ │ ├── not_null_dim_hosts_cleansed_host_id.sql │ │ │ │ ├── not_null_dim_hosts_cleansed_host_name.sql │ │ │ │ ├── not_null_dim_listings_cleansed_host_id.sql │ │ │ │ ├── not_null_dim_listings_cleansed_listing_id.sql │ │ │ │ ├── positive_value_dim_listings_cleansed_minimum_nights.sql │ │ │ │ ├── 
relationships_dim_listings_cle_05e2397b186a7b9306fc747b3cc4ef83.sql │ │ │ │ ├── unique_dim_hosts_cleansed_host_id.sql │ │ │ │ └── unique_dim_listings_cleansed_listing_id.sql │ │ │ ├── sources.yml │ │ │ │ ├── dbt_expectations_source_expect_a60b59a84fbc4577a11df360c50013bb.sql │ │ │ │ └── dbt_expectations_source_expect_d9770018e28873e7be74335902d9e4e5.sql │ │ │ └── src │ │ │ │ ├── src_hosts.sql │ │ │ │ ├── src_listings.sql │ │ │ │ └── src_reviews.sql │ │ │ └── tests │ │ │ ├── consistent_created_at.sql │ │ │ ├── dim_listings_minimum_nights.sql │ │ │ └── no_nulls_in_dim_listings.sql │ ├── graph.gpickle │ ├── index.html │ ├── manifest.json │ ├── partial_parse.msgpack │ ├── run │ │ └── dbtlearn │ │ │ ├── models │ │ │ ├── dim │ │ │ │ ├── dim_hosts_cleansed.sql │ │ │ │ ├── dim_listings_cleansed.sql │ │ │ │ └── dim_listings_w_hosts.sql │ │ │ ├── fct │ │ │ │ └── fct_reviews.sql │ │ │ ├── mart │ │ │ │ └── mart_fullmoon_reviews.sql │ │ │ ├── schema.yml │ │ │ │ ├── accepted_values_dim_listings_c_1ca6148a08c62a5218f2a162f9d2a9a6.sql │ │ │ │ ├── accepted_values_dim_listings_c_9fcd3cfe888517e67ec95c75be12c62a.sql │ │ │ │ ├── dbt_expectations_expect_column_07e7a515218ef6e3a17e164c642c7d18.sql │ │ │ │ ├── dbt_expectations_expect_column_39596d790161761077ff1592b68943f6.sql │ │ │ │ ├── dbt_expectations_expect_column_68f998c0da11ce3e6e806b41ab34f533.sql │ │ │ │ ├── dbt_expectations_expect_column_8e138814a11b6202811546795bffca5d.sql │ │ │ │ ├── dbt_expectations_expect_column_c59e300e0dddb335c4211147100ac1c6.sql │ │ │ │ ├── dbt_expectations_expect_table__fbda7436ebe2ffe341acf0622c76d629.sql │ │ │ │ ├── not_null_dim_listings_cleansed_host_id.sql │ │ │ │ ├── not_null_dim_listings_cleansed_listing_id.sql │ │ │ │ ├── positive_value_dim_listings_cleansed_minimum_nights.sql │ │ │ │ ├── relationships_dim_listings_cle_05e2397b186a7b9306fc747b3cc4ef83.sql │ │ │ │ └── unique_dim_listings_cleansed_listing_id.sql │ │ │ ├── sources.yml │ │ │ │ ├── 
dbt_expectations_source_expect_a60b59a84fbc4577a11df360c50013bb.sql │ │ │ │ └── dbt_expectations_source_expect_d9770018e28873e7be74335902d9e4e5.sql │ │ │ └── src │ │ │ │ ├── src_hosts.sql │ │ │ │ ├── src_listings.sql │ │ │ │ └── src_reviews.sql │ │ │ ├── seeds │ │ │ └── seed_full_moon_dates.csv │ │ │ ├── snapshots │ │ │ └── scd_raw_listings.sql │ │ │ └── tests │ │ │ ├── consistent_created_at.sql │ │ │ ├── dim_listings_minimum_nights.sql │ │ │ └── no_nulls_in_dim_listings.sql │ ├── run_results.json │ └── sources.json └── tests │ ├── consistent_created_at.sql │ ├── dim_listings_minimum_nights.sql │ └── no_nulls_in_dim_listings.sql ├── Dagster └── README.md ├── Dog Adoption ├── Datasets │ ├── description.csv │ ├── location.csv │ └── travel.csv ├── README.md └── dog_adoption_clean.ipynb ├── README.md ├── Uber Project ├── GCP │ └── gcp compute machine_vm installation commands.txt ├── Mage │ ├── uber_gbq_load.py │ ├── uber_load_data.py │ └── uber_transformation.py ├── README.md ├── Uber Data Engineering.ipynb ├── Uber_Dashboard.pdf ├── sql_script.sql └── uber_data.csv ├── dbt └── jaffle shop.md ├── notes.md └── resources.md /Airbnb Project/README.md: -------------------------------------------------------------------------------- 1 | # 🛌 Airbnb Project 2 | 3 | ## Objective 4 | 5 | Work in progress. 6 | 7 | ## Data Pipeline 8 | 9 | Data Warehouse (Snowflake) -> Transformation (dbt) -> Dashboard (Prefect) 10 | 11 | Steps: 12 | 1. Ingest data into Snowflake. 13 | 2. Perform transformation of data in dbt in which the cleansed models are materialised in Snowflake 14 | 3. 15 | 16 | **DAG** 17 | 18 | image 19 | 20 | ** 21 | 22 | ## Setup virtual environment and dbt 23 | 24 | Create a virtual environment by running the commands below: 25 | ```bash 26 | # Create the virtual environment. Ensure that you have "cd" in the desired project's folder. 
27 | python3 -m venv venv # venv is the name of the virtual environment 28 | 29 | # Activate the newly created virtual environment "venv" 30 | source venv/bin/activate 31 | ``` 32 | 33 | Install dbt-snowflake library: 34 | ```bash 35 | pip3 install dbt-snowflake 36 | dbt # Run dbt to confirm that dbt is installed and working. You'll see a list of help guides. 37 | ``` 38 | 39 | Create a dbt project 40 | ```bash 41 | dbt init dbtlearn # dbtlearn is the project name 42 | 43 | type: snowflake 44 | account: https://.snowflakecomputing.com 45 | user: 46 | password: 47 | role: 48 | warehouse: # Uppercase/lowercase-sensitive 49 | database: # Uppercase/lowercase-sensitive 50 | schema: # Uppercase/lowercase-sensitive 51 | 52 | # To check that the project has all the files it needs 53 | dbt debug # Ensure it says "All checks passed!" 54 | ``` 55 | 56 | ## Create dbt Models 57 | 58 | Before we start, the models or SQL files in dbt are separated into layers: 59 | 60 | **1) Staging Models (Src)** 61 | - Stored in `models/s 62 | - Purpose: Ingest raw or lightly cleansed data from source (ie. Snowflake). 63 | - Work done on data: Minimal (change column name) or no transformation. 64 | 65 | **2) Dimension Models (Dim)** 66 | - Purpose: Models contain descriptive information about data ie. products, location, time. 67 | - Typically denormalized. 68 | - Work done on data: Filtering, grouping, aggregating, type casting, joins, defining categories (using CASE statement). 69 | 70 | **3) Fact Models (Fct)** 71 | - Purpose: Models store quantitative and analytical data ie. events, transactions, orders. 72 | - Contain measures like sales, revenue, quantity, etc to perform analytics or metrics calculations. 73 | - Work done on data: Aggregations, calculations 74 | 75 | ‼️ Do not end the models (.sql) with `;` in dbt. It'll throw an error. 
😮‍💨 76 | 77 | ## 1) Create Staging Models 78 | 79 | **`src_listings.sql`** 80 | 81 | Create the `src_listings` model (.sql) in the `models/src` folder with the following SELECT statement. Click [here](https://discourse.getdbt.com/t/why-the-fishtown-sql-style-guide-uses-so-many-ctes/1091) to understand why we are "importing" the upstream data in CTEs. 82 | 83 | ```sql 84 | -- File path: models/src/src_listings.sql 85 | WITH raw_listings AS ( 86 | SELECT * 87 | FROM AIRBNB.RAW.RAW_LISTINGS 88 | ) 89 | 90 | SELECT 91 | id AS listing_id, 92 | name AS listing_name, 93 | listing_url, 94 | room_type, 95 | minimum_nights, 96 | host_id, 97 | price AS price_str, 98 | created_at, 99 | updated_at 100 | FROM 101 | raw_listings 102 | ``` 103 | 104 | To materialise the models in Snowflake, run `dbt run` in the terminal, or run a specific model with `dbt run -m src_listings`. 105 | 106 | Screenshot 2023-02-27 at 11 19 35 AM 107 | 108 | Do the same with the `src_reviews` and `src_hosts` models. 109 | 110 | **`src_reviews.sql`** 111 | 112 | ```sql 113 | -- File path: models/src/src_reviews.sql 114 | WITH raw_reviews AS ( 115 | SELECT * 116 | FROM AIRBNB.RAW.RAW_REVIEWS 117 | ) 118 | 119 | SELECT 120 | listing_id, 121 | date AS review_date, 122 | reviewer_name, 123 | comments AS review_text, 124 | sentiment AS review_sentiment 125 | FROM raw_reviews 126 | ``` 127 | 128 | **`src_hosts.sql`** 129 | 130 | ```sql 131 | -- File path: models/src/src_hosts.sql 132 | WITH raw_hosts AS ( 133 | SELECT * 134 | FROM AIRBNB.RAW.RAW_HOSTS 135 | ) 136 | 137 | SELECT 138 | id AS host_id, 139 | name AS host_name, 140 | is_superhost, 141 | created_at, 142 | updated_at 143 | FROM raw_hosts 144 | ``` 145 | 146 | Then, run `dbt run` to materialise the new models in Snowflake. 147 | image 148 | 149 | In Snowflake, the **DEV** folder is created to contain all the dbt materialisations. All the newly created models are contained in the **Views** folder.
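To sanity-check a materialised view, you can query it directly from a Snowflake worksheet. A minimal sketch, assuming the profile's target schema is `DEV` in the `AIRBNB` database (adjust to your own target):

```sql
-- Run in a Snowflake worksheet; AIRBNB.DEV is an assumed target, not fixed by the project
SELECT *
FROM AIRBNB.DEV.SRC_LISTINGS
LIMIT 10;
```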
Screenshot 2023-02-27 at 11 42 14 AM 151 | 152 | ## 2) Create Dim Models 153 | 154 | Create a `models/dim` folder for the dim models, which are the cleansed versions of the `src` (staging) models. This keeps all the models well organised. 155 | 156 | **`dim_listings_cleansed.sql`** 157 | 158 | ```sql 159 | -- File path: models/dim/dim_listings_cleansed.sql 160 | WITH src_listings AS ( 161 | SELECT * 162 | FROM {{ref("src_listings")}} 163 | ) 164 | 165 | SELECT 166 | listing_id, 167 | listing_name, 168 | room_type, 169 | CASE 170 | WHEN minimum_nights = 0 THEN 1 171 | ELSE minimum_nights 172 | END AS minimum_nights, -- ensure that the minimum stay is 1 night 173 | host_id, 174 | LTRIM(price_str,'$')::NUMBER(10,2) AS price_per_night, -- remove "$" and cast into number with 2 decimals 175 | created_at, 176 | updated_at 177 | FROM src_listings 178 | ``` 179 | 180 | **`dim_hosts_cleansed.sql`** 181 | 182 | ```sql 183 | -- File path: models/dim/dim_hosts_cleansed.sql 184 | WITH src_hosts AS ( 185 | SELECT * 186 | FROM {{ref("src_hosts")}} 187 | ) 188 | 189 | SELECT 190 | host_id, 191 | NVL(host_name,'Anonymous') AS host_name, 192 | is_superhost, 193 | created_at, 194 | updated_at 195 | FROM src_hosts 196 | ``` 197 | 198 | This is what the folder structure should look like now: 199 | Screenshot 2023-10-19 at 3 06 43 PM 200 | 201 | ## 3) Create Fct Models 202 | 203 | **`fct_reviews.sql`** 204 | 205 | The `fct_reviews` model updates and appends new data to the existing table in an incremental fashion. Click [here](https://docs.getdbt.com/docs/build/incremental-models#using-incremental-materializations) for the official dbt documentation on incremental models.
206 | 207 | ```sql 208 | -- File path: models/fct/fct_reviews.sql 209 | 210 | -- Config to materialise the model incrementally: 211 | {{ 212 | config( 213 | materialized='incremental', 214 | on_schema_change='fail' 215 | ) 216 | }} 217 | 218 | 219 | WITH src_reviews AS ( 220 | SELECT * 221 | FROM {{ref("src_reviews")}} 222 | ) 223 | 224 | SELECT * 225 | FROM src_reviews 226 | WHERE review_text IS NOT NULL 227 | {% if is_incremental() %} 228 | -- This filter will only be applied on an incremental run. 229 | -- If a row's review_date is greater than the most recent review_date in this model, select the row and add it into the {{ this }} model. 230 | -- This condition ensures that only new or updated records since the last dbt run are included. 231 | AND review_date > (SELECT MAX(review_date) FROM {{ this }}) 232 | {% endif %} 233 | ``` 234 | 235 | image 236 | 237 | **`dim_listings_w_hosts.sql`** 238 | 239 | ```sql 240 | -- File path: models/dim/dim_listings_w_hosts.sql 241 | 242 | WITH 243 | 244 | listings AS ( 245 | SELECT * 246 | FROM {{ ref("dim_listings_cleansed")}} 247 | ), 248 | 249 | hosts AS ( 250 | SELECT * 251 | FROM {{ ref("dim_hosts_cleansed")}} 252 | ) 253 | 254 | SELECT 255 | l.listing_id, 256 | l.listing_name, 257 | l.room_type, 258 | l.minimum_nights, 259 | l.price_per_night, 260 | l.host_id, 261 | h.host_name, 262 | h.is_superhost AS host_is_superhost, 263 | l.created_at, 264 | GREATEST(l.updated_at, h.updated_at) AS updated_at -- Keep the most recent updated date 265 | 266 | FROM listings AS l 267 | LEFT JOIN hosts AS h 268 | ON l.host_id = h.host_id 269 | ``` 270 | 271 | Now that all the models are created, we change the materialisation of the source models (i.e. `src_hosts`, `src_listings`, and `src_reviews`) to ephemeral in `dbt_project.yml` so that they are compiled as CTEs and no longer appear in the **Dev** folder in Snowflake.
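As a sketch of what ephemeral materialisation does, a downstream model such as `dim_hosts_cleansed` compiles with the src model injected as a CTE instead of referencing a view (the `__dbt__cte__` prefix is dbt's naming convention for inlined ephemeral models; treat the exact compiled shape as illustrative):

```sql
-- Sketch: compiled dim_hosts_cleansed.sql once src_hosts is ephemeral
WITH __dbt__cte__src_hosts AS (
    -- body of src_hosts inlined by dbt
    SELECT
        id AS host_id,
        name AS host_name,
        is_superhost,
        created_at,
        updated_at
    FROM AIRBNB.RAW.RAW_HOSTS
), src_hosts AS (
    SELECT * FROM __dbt__cte__src_hosts
)

SELECT
    host_id,
    NVL(host_name, 'Anonymous') AS host_name,
    is_superhost,
    created_at,
    updated_at
FROM src_hosts
```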
Screenshot 2023-10-19 at 4 15 05 PM 273 | 274 | dbt does not remove the previously materialised views, so we'll need to drop them in Snowflake ourselves. 275 | image 276 | 277 | ## 4) Uploading CSV from S3 278 | 279 | Run the command below to copy the `seed_full_moon_dates.csv` file from S3 to the project's seed folder. 280 | 281 | ```bash 282 | curl https://dbtlearn.s3.us-east-2.amazonaws.com/seed_full_moon_dates.csv -o seeds/seed_full_moon_dates.csv 283 | ``` 284 | 285 | Then, run `dbt seed` to load the CSV as a table in Snowflake. 286 | 287 | image 288 | 289 | The `seed_full_moon_dates.csv` is now loaded into Snowflake. 290 | image 291 | 292 | *** 293 | 294 | ## Change Source 295 | 296 | Instead of referencing the exact table in Snowflake, we create a `sources.yml` which maps each Snowflake table to an alias we can use in dbt. 297 | 298 | Create a `sources.yml` in the `/models` folder. 299 | image 300 | 301 | Update the models which are using the exact table reference. For example, in the `src_hosts.sql` model, I update 302 | 303 | from: 304 | ```sql 305 | WITH RAW_HOSTS AS ( 306 | SELECT * 307 | FROM AIRBNB.RAW.RAW_HOSTS 308 | ) 309 | 310 | SELECT 311 | id AS host_id, 312 | name AS host_name, 313 | is_superhost, 314 | created_at, 315 | updated_at 316 | 317 | FROM RAW_HOSTS 318 | ``` 319 | 320 | to: 321 | ```sql 322 | WITH RAW_HOSTS AS ( 323 | SELECT * 324 | FROM {{ source('airbnb', 'hosts') }} 325 | ) 326 | 327 | SELECT 328 | id AS host_id, 329 | name AS host_name, 330 | is_superhost, 331 | created_at, 332 | updated_at 333 | 334 | FROM RAW_HOSTS 335 | ``` 336 | ### Compiled Models in Target 337 | 338 | To view the compiled models (.sql), go to `target/compiled/dbtlearn/models/[dim/fct/src]`. 339 | 340 | Here's a sample of a compiled model (`dim_hosts_cleansed.sql`): 341 | Screenshot 2023-10-19 at 4 21 48 PM 342 | 343 | You can copy and run the compiled SQL in Snowflake for debugging purposes.
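For instance, once `src_hosts.sql` uses the `source()` call, its compiled version under `target/compiled` resolves the alias back to the fully qualified table. A sketch (the exact qualification follows `sources.yml` and your profile):

```sql
-- target/compiled/dbtlearn/models/src/src_hosts.sql (sketch of compiled output)
WITH RAW_HOSTS AS (
    SELECT *
    FROM AIRBNB.RAW.RAW_HOSTS  -- {{ source('airbnb', 'hosts') }} resolved
)

SELECT
    id AS host_id,
    name AS host_name,
    is_superhost,
    created_at,
    updated_at

FROM RAW_HOSTS
```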
344 | 345 | *** 346 | 347 | ### Final Materialisation for Models 348 | 349 | Once all the models have been cleansed, we'll set the final materialisations in `dbt_project.yml`. 350 | ```yml 351 | models: 352 | dbtlearn: 353 | +materialized: view # Applied to all models except dim and src 354 | dim: 355 | +materialized: table # Applied to dim 356 | src: 357 | +materialized: ephemeral # Applied to src 358 | ``` 359 | 360 | Run `dbt run` and refresh Snowflake for the changes to take effect. 361 | 362 | image 363 | 364 | You will not see the src models in Snowflake, as ephemeral models are compiled into CTEs rather than materialised. 365 | Screenshot 2023-10-19 at 5 45 46 PM 366 | 367 | ### Snapshots 368 | 369 | Snapshots are created by wrapping a SELECT statement in a snapshot block with its config. 370 | 371 | Here's what a snapshot looks like: 372 | 373 | image 374 | 375 | A snapshot automatically includes the columns `dbt_scd_id`, `dbt_updated_at`, `dbt_valid_from` and `dbt_valid_to`. In the first snapshot, `dbt_valid_to` is `null` because this column holds the timestamp of the next change, which hasn't happened yet. 376 | 377 | image 378 | 379 | Let's say I make a change to id=3176 and run `dbt snapshot` again. You'll see that `dbt_valid_to` now contains the current timestamp, and a new row for id 3176 is added with a null `dbt_valid_to`. 380 | 381 | image 382 | 383 | *** 384 | 385 | ### Tests 386 | 387 | Creating singular tests: 388 | 389 | ```sql 390 | -- tests/dim_listings_minimum_nights.sql 391 | SELECT * 392 | FROM {{ref ('dim_listings_cleansed')}} 393 | WHERE minimum_nights < 1 394 | LIMIT 10 395 | ``` 396 | 397 | Instead of writing one-off singular tests, I can convert them into a custom generic test like the one below so that I can reuse it elsewhere.
398 | 399 | ```sql 400 | -- macros/positive_value.sql 401 | -- A generic test that fails when any value of column_name is less than 1 402 | 403 | {% test positive_value(model, column_name) %} 404 | 405 | SELECT * FROM {{ model }} 406 | WHERE {{ column_name }} < 1 407 | 408 | {% endtest %} 409 | ``` 410 | 411 | To apply it, I add the `positive_value` test to `schema.yml` under the `minimum_nights` column. 412 | 413 | Screenshot 2023-03-02 at 1 12 51 PM 414 | 415 | *** 416 | 417 | ### dbt Packages 418 | 419 | I'm using the [`dbt_utils`](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) package, specifically [`generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils/tree/1.0.0/#generate_surrogate_key-source), which generates a unique ID based on the specified columns. 420 | 421 | image 422 | 423 | 424 | I created a `packages.yml` file with the following package and ran `dbt deps` to install the package. 425 | 426 | ```yml 427 | # packages.yml 428 | packages: 429 | - package: dbt-labs/dbt_utils 430 | version: 1.0.0 431 | ``` 432 | 433 | image 434 | 435 | Then, I use this function in `fct_reviews.sql` to create a unique `review_id` based on the `listing_id`, `review_date`, `reviewer_name`, and `review_text` fields. 436 | 437 | image 438 | 439 | As `fct_reviews.sql` is an incremental model, I ran `dbt run --full-refresh --select fct_reviews` instead of `dbt run`, because adding the new `review_id` column changes the schema, and the model's `on_schema_change='fail'` config would make a normal incremental run fail. A full refresh rebuilds the table from scratch with the new column. 440 | 441 | *** 442 | 443 | ### Documentation 444 | 445 | I ran `dbt docs serve` to open the documentation website.
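For reference, the docs site is generated before it is served. A sketch of the usual two-step workflow, assuming the default `target/` path:

```bash
# Build the docs artifacts (manifest.json, catalog.json) into target/
dbt docs generate

# Serve the generated site locally (defaults to http://localhost:8080)
dbt docs serve
```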
446 | 447 | image 448 | -------------------------------------------------------------------------------- /Airbnb Project/analyses/full_moon_no_sleep.sql: -------------------------------------------------------------------------------- 1 | WITH mart_fullmoon_reviews AS ( 2 | SELECT * FROM {{ ref('mart_fullmoon_reviews') }} 3 | ) 4 | 5 | SELECT 6 | is_full_moon, 7 | review_sentiment, 8 | COUNT(*) AS reviews 9 | FROM mart_fullmoon_reviews 10 | GROUP BY 11 | is_full_moon, 12 | review_sentiment 13 | ORDER BY 14 | is_full_moon, 15 | review_sentiment -------------------------------------------------------------------------------- /Airbnb Project/assets/input_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/katiehuangx/data-engineering/aace791c2df1ba2899624d7d51f3d5606e90d79d/Airbnb Project/assets/input_schema.png -------------------------------------------------------------------------------- /Airbnb Project/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'dbtlearn' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'dbtlearn' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 
15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | asset-paths: ["assets"] 22 | 23 | target-path: "target" # directory which will store compiled SQL files 24 | clean-targets: # directories to be removed by `dbt clean` 25 | - "target" 26 | - "dbt_packages" 27 | 28 | 29 | # Configuring models 30 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 31 | 32 | # In this example config, we tell dbt to build all models in the example/ 33 | # directory as views. These settings can be overridden in the individual model 34 | # files using the `{{ config(...) }}` macro. 35 | models: 36 | dbtlearn: 37 | # Use the materialized config to ensure that models are materialised as views (not tables) 38 | +materialized: view # Models in /dbtlearn are materialised as views 39 | +post-hook: 40 | - "GRANT SELECT ON {{ this }} TO ROLE REPORTER" 41 | dim: 42 | +materialized: table # Models in /dim are materialised as tables 43 | src: 44 | +materialized: ephemeral # Models in /src are materialised as ephemeral models -------------------------------------------------------------------------------- /Airbnb Project/environment-setup.md: -------------------------------------------------------------------------------- 1 | # Introduction and Environment Setup 2 | 3 | ## Snowflake user creation 4 | 5 | Copy these SQL statements into a Snowflake Worksheet, select all and execute them (i.e. pressing the play button).
6 | 7 | ```sql 8 | -- Use an admin role 9 | USE ROLE ACCOUNTADMIN; 10 | 11 | -- Create the `transform` role 12 | CREATE ROLE IF NOT EXISTS transform; 13 | GRANT ROLE TRANSFORM TO ROLE ACCOUNTADMIN; 14 | 15 | -- Create the default warehouse if necessary 16 | CREATE WAREHOUSE IF NOT EXISTS COMPUTE_WH; 17 | GRANT OPERATE ON WAREHOUSE COMPUTE_WH TO ROLE TRANSFORM; 18 | 19 | -- Create the `dbt` user and assign to role 20 | CREATE USER IF NOT EXISTS dbt 21 | PASSWORD='dbtPassword123' 22 | LOGIN_NAME='dbt' 23 | MUST_CHANGE_PASSWORD=FALSE 24 | DEFAULT_WAREHOUSE='COMPUTE_WH' 25 | DEFAULT_ROLE='transform' 26 | DEFAULT_NAMESPACE='AIRBNB.RAW' 27 | COMMENT='DBT user used for data transformation'; 28 | GRANT ROLE transform to USER dbt; 29 | 30 | -- Create our database and schemas 31 | CREATE DATABASE IF NOT EXISTS AIRBNB; 32 | CREATE SCHEMA IF NOT EXISTS AIRBNB.RAW; 33 | 34 | -- Set up permissions to role `transform` 35 | GRANT ALL ON WAREHOUSE COMPUTE_WH TO ROLE transform; 36 | GRANT ALL ON DATABASE AIRBNB to ROLE transform; 37 | GRANT ALL ON ALL SCHEMAS IN DATABASE AIRBNB to ROLE transform; 38 | GRANT ALL ON FUTURE SCHEMAS IN DATABASE AIRBNB to ROLE transform; 39 | GRANT ALL ON ALL TABLES IN SCHEMA AIRBNB.RAW to ROLE transform; 40 | GRANT ALL ON FUTURE TABLES IN SCHEMA AIRBNB.RAW to ROLE transform; 41 | ``` 42 | 43 | *** 44 | 45 | ## Snowflake data import 46 | 47 | Copy these SQL statements into a Snowflake Worksheet, select all and execute them (i.e. pressing the play button). 
48 | 49 | ```sql 50 | -- Set up the defaults 51 | USE WAREHOUSE COMPUTE_WH; 52 | USE DATABASE airbnb; 53 | USE SCHEMA RAW; 54 | 55 | -- Create our three tables and import the data from S3 56 | CREATE OR REPLACE TABLE raw_listings 57 | (id integer, 58 | listing_url string, 59 | name string, 60 | room_type string, 61 | minimum_nights integer, 62 | host_id integer, 63 | price string, 64 | created_at datetime, 65 | updated_at datetime); 66 | 67 | COPY INTO raw_listings (id, 68 | listing_url, 69 | name, 70 | room_type, 71 | minimum_nights, 72 | host_id, 73 | price, 74 | created_at, 75 | updated_at) 76 | from 's3://dbtlearn/listings.csv' 77 | FILE_FORMAT = (type = 'CSV' skip_header = 1 78 | FIELD_OPTIONALLY_ENCLOSED_BY = '"'); 79 | 80 | 81 | CREATE OR REPLACE TABLE raw_reviews 82 | (listing_id integer, 83 | date datetime, 84 | reviewer_name string, 85 | comments string, 86 | sentiment string); 87 | 88 | COPY INTO raw_reviews (listing_id, date, reviewer_name, comments, sentiment) 89 | from 's3://dbtlearn/reviews.csv' 90 | FILE_FORMAT = (type = 'CSV' skip_header = 1 91 | FIELD_OPTIONALLY_ENCLOSED_BY = '"'); 92 | 93 | 94 | CREATE OR REPLACE TABLE raw_hosts 95 | (id integer, 96 | name string, 97 | is_superhost string, 98 | created_at datetime, 99 | updated_at datetime); 100 | 101 | COPY INTO raw_hosts (id, name, is_superhost, created_at, updated_at) 102 | from 's3://dbtlearn/hosts.csv' 103 | FILE_FORMAT = (type = 'CSV' skip_header = 1 104 | FIELD_OPTIONALLY_ENCLOSED_BY = '"'); 105 | ``` 106 | 107 | -------------------------------------------------------------------------------- /Airbnb Project/macros/no_nulls_in_columns.sql: -------------------------------------------------------------------------------- 1 | -- Doc | adapter.get_columns_in_relation: https://docs.getdbt.com/reference/dbt-jinja-functions/adapter#get_columns_in_relation 2 | -- adapter.get_columns_in_relation iterates through every column in the model and checks whether column name is null. 
3 | 4 | {% macro no_nulls_in_columns (model)%} 5 | 6 | SELECT * 7 | FROM {{ model }} 8 | WHERE 9 | {% for col in adapter.get_columns_in_relation(model) -%} 10 | -- Interested in columns where column name is null OR... iterates to next column 11 | {{ col.column }} IS NULL OR 12 | {% endfor %} 13 | FALSE -- To terminate the iteration 14 | {% endmacro %} -------------------------------------------------------------------------------- /Airbnb Project/macros/positive_value.sql: -------------------------------------------------------------------------------- 1 | -- A singular test that fails when column_name is less than 1 2 | 3 | {% test positive_value(model, column_name) %} 4 | 5 | SELECT * FROM {{ model }} 6 | WHERE {{ column_name }} < 1 7 | 8 | {% endtest %} -------------------------------------------------------------------------------- /Airbnb Project/models/dashboard.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | exposures: 4 | - name: Executive Dashboard 5 | type: dashboard 6 | maturity: low 7 | url: https://09837540.us1a.app.preset.io/superset/dashboard/p/xQrpBKxae3K/ 8 | description: Executive Dashboard about Airbnb listings and hosts 9 | 10 | 11 | depends_on: 12 | - ref('dim_listings_w_hosts') 13 | - ref('mart_fullmoon_reviews') 14 | 15 | owner: 16 | name: Katie Huang 17 | email: xieminee@gmail.com -------------------------------------------------------------------------------- /Airbnb Project/models/dim/dim_hosts_cleansed.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'view' 4 | ) 5 | }} 6 | 7 | WITH src_hosts AS ( 8 | SELECT * FROM {{ref ('src_hosts')}} 9 | ) 10 | 11 | SELECT 12 | host_id, 13 | NVL(host_name, 'Anonymous') AS host_name, 14 | is_superhost, 15 | created_at, 16 | updated_at 17 | FROM src_hosts -------------------------------------------------------------------------------- /Airbnb 
Project/models/dim/dim_listings_cleansed.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'view' 4 | ) 5 | }} 6 | 7 | 8 | WITH src_listings AS ( 9 | SELECT * FROM {{ ref ('src_listings')}} 10 | ) 11 | 12 | SELECT 13 | listing_id, 14 | listing_name, 15 | room_type, 16 | CASE 17 | WHEN minimum_nights = 0 THEN 1 -- 0 night = 1 night, so we assign the value of 1 to indicate 1 night 18 | ELSE minimum_nights 19 | END AS minimum_nights, 20 | host_id, 21 | REPLACE( -- Parse string value into numerical form 22 | price_str, '$' -- Replace '$' with price_str value. In other words, to remove '$' from string. 23 | ) :: NUMBER (10, 2 -- Convert string type to numerical with 2 decimal places 24 | ) AS price, 25 | created_at, 26 | updated_at 27 | 28 | FROM src_listings -------------------------------------------------------------------------------- /Airbnb Project/models/dim/dim_listings_w_hosts.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | listings AS ( 3 | SELECT * FROM {{ref ('dim_listings_cleansed')}} 4 | ), 5 | hosts AS ( 6 | SELECT * FROM {{ref ('dim_hosts_cleansed')}} 7 | ) 8 | 9 | SELECT 10 | listings.listing_id, 11 | listings.listing_name, 12 | listings.room_type, 13 | listings.minimum_nights, 14 | listings.price, 15 | listings.host_id, 16 | hosts.host_name, 17 | hosts.is_superhost AS host_is_superhost, 18 | listings.created_at, 19 | GREATEST(listings.updated_at, hosts.updated_at) AS updated_at -- Keep most recent updated_at 20 | FROM listings 21 | LEFT JOIN hosts 22 | ON listings.host_id = hosts.host_id 23 | -------------------------------------------------------------------------------- /Airbnb Project/models/docs.md: -------------------------------------------------------------------------------- 1 | {% docs dim_listing_cleansed__minimum_nights %} 2 | Minimum number of nights required to rent this property. 
3 | 4 | Keep in mind that old listings might have `minimum_nights` set to `0` in the source tables. 5 | Our cleansing algorithm updates this to `1`. 6 | 7 | {% enddocs %} -------------------------------------------------------------------------------- /Airbnb Project/models/fct/fct_reviews.sql: -------------------------------------------------------------------------------- 1 | -- Doc | Config : https://docs.getdbt.com/reference/dbt-jinja-functions/config 2 | 3 | {{ 4 | config( 5 | materialized = 'incremental', 6 | on_schema_change = 'fail' 7 | ) 8 | }} 9 | -- dbt materializes the model as an incremental table. 10 | -- Read for difference btw table vs. incremental: https://docs.getdbt.com/docs/build/materializations 11 | 12 | WITH src_reviews AS ( 13 | SELECT * FROM {{ref ('src_reviews')}} 14 | ) 15 | 16 | SELECT 17 | -- Create a unique review_id based on the following columns 18 | {{ dbt_utils.generate_surrogate_key(['listing_id', 'review_date','reviewer_name', 'review_text']) }} as review_id, 19 | listing_id, 20 | review_date, 21 | NVL(reviewer_name, 'Anonymous') AS reviewer_name, 22 | review_text, 23 | review_sentiment 24 | FROM src_reviews 25 | WHERE review_text IS NOT NULL 26 | 27 | {% if is_incremental() %} 28 | and review_date > (select max(review_date) from {{ this }}) 29 | {% endif %} -------------------------------------------------------------------------------- /Airbnb Project/models/mart/mart_fullmoon_reviews.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'table' 4 | ) 5 | }} 6 | 7 | WITH fct_reviews AS ( 8 | 9 | SELECT * FROM {{ref ('fct_reviews')}} 10 | 11 | ), 12 | full_moon_dates AS ( 13 | 14 | SELECT * FROM {{ref ('seed_full_moon_dates')}} 15 | ) 16 | 17 | SELECT 18 | reviews.*, 19 | CASE 20 | WHEN fullmoon.full_moon_date IS NULL THEN 'not full moon' 21 | ELSE 'full moon' END AS is_full_moon 22 | FROM fct_reviews AS reviews 23 | LEFT JOIN full_moon_dates AS fullmoon 24 | 
ON (TO_DATE(reviews.review_date) = DATEADD(DAY, 1, fullmoon.full_moon_date)) 25 | -------------------------------------------------------------------------------- /Airbnb Project/models/overview.md: -------------------------------------------------------------------------------- 1 | {% docs __overview__ %} 2 | 3 | # Airbnb pipeline 4 | 5 | Hey, welcome to our Airbnb pipeline documentation! 6 | 7 | Here is the schema of our input data: 8 | ![input schema](assets/input_schema.png) 9 | 10 | {% enddocs %} -------------------------------------------------------------------------------- /Airbnb Project/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | # dim_listings_cleansed model 5 | - name: dim_listings_cleansed 6 | description: Cleansed table containing Airbnb listings 7 | columns: 8 | 9 | - name: listing_id 10 | description: Primary key for the listing 11 | tests: # Built-in generic tests 12 | - unique 13 | - not_null 14 | 15 | - name: host_id 16 | description: Foreign key references the host_id table 17 | tests: 18 | - not_null 19 | - relationships: 20 | to: ref('dim_hosts_cleansed') 21 | field: host_id 22 | # Ensure referential integrity. 
23 | # Ensures that every host_id in dim_listings_cleansed exists as a host_id in dim_hosts_cleansed 24 | 25 | - name: room_type 26 | description: Type of the apartment / room 27 | tests: 28 | - accepted_values: 29 | values: ['Entire home/apt', 30 | 'Private room', 31 | 'Shared room', 32 | 'Hotel room'] 33 | 34 | - name: minimum_nights 35 | description: '{{ doc("dim_listing_cleansed__minimum_nights") }}' 36 | tests: 37 | - positive_value 38 | 39 | 40 | # dim_hosts_cleansed model 41 | - name: dim_hosts_cleansed 42 | description: Cleansed table containing Airbnb hosts 43 | columns: 44 | 45 | - name: host_id 46 | description: Primary key for the hosts 47 | tests: 48 | - not_null 49 | - unique 50 | 51 | - name: host_name 52 | description: Name of the host 53 | tests: 54 | - not_null 55 | 56 | - name: is_superhost 57 | description: Defines whether the host is a superhost 58 | tests: 59 | - accepted_values: 60 | values: ['t', 'f'] 61 | # dim_listings_w_hosts model 62 | - name: dim_listings_w_hosts 63 | description: Cleansed table containing Airbnb listings and hosts 64 | tests: 65 | - dbt_expectations.expect_table_row_count_to_equal_other_table: 66 | compare_model: source('airbnb', 'listings') # Take note of the indent 67 | 68 | columns: 69 | - name: price 70 | tests: 71 | # Doc: https://github.com/calogica/dbt-expectations/tree/0.8.2/#expect_column_values_to_be_of_type 72 | 73 | # Test ensures that the price column is always a number type 74 | - dbt_expectations.expect_column_values_to_be_of_type: 75 | column_type: number # Refer to the data warehouse's definition of the column 76 | 77 | # Test ensures that the 99th-percentile price is between $50 and $500 78 | - dbt_expectations.expect_column_quantile_values_to_be_between: 79 | quantile: .99 80 | min_value: 50 81 | max_value: 500 82 | 83 | # Test ensures that the max price is at most $5,000 (severity warn, so a breach only raises a warning) 84 | - dbt_expectations.expect_column_max_to_be_between: 85 | max_value: 5000 86 | config: 87 | severity: warn 88 |
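All of the generic and package tests above compile down to SELECT statements that return the failing rows — an empty result set means the test passes (the compiled versions live under `target/compiled`). As a minimal Python sketch of the `accepted_values` logic — the helper name and sample rows here are illustrative, not part of the project:

```python
def accepted_values_failures(rows, column, allowed):
    """Mimic dbt's accepted_values test: return the distinct values of
    `column` that fall outside the allowed set. An empty list means the
    test passes."""
    return sorted({row[column] for row in rows} - set(allowed))

rows = [
    {"room_type": "Private room"},
    {"room_type": "Entire home/apt"},
    {"room_type": "Castle"},  # unexpected value -> test failure
]
failures = accepted_values_failures(
    rows,
    "room_type",
    ["Entire home/apt", "Private room", "Shared room", "Hotel room"],
)
# failures -> ["Castle"]
```

Just like the compiled SQL, the check groups the column's distinct values and flags anything outside the accepted list.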
-------------------------------------------------------------------------------- /Airbnb Project/models/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: airbnb 5 | schema: raw 6 | tables: 7 | - name: listings 8 | identifier: raw_listings 9 | columns: 10 | - name: room_type 11 | tests: 12 | # Test to ensure that the COUNT(DISTINCT room_type) is always 4 13 | - dbt_expectations.expect_column_distinct_count_to_equal: 14 | value: 4 15 | - name: price 16 | tests: 17 | - dbt_expectations.expect_column_values_to_match_regex: 18 | regex: "^\\\\$[0-9][0-9\\\\.]+$" 19 | 20 | - name: hosts 21 | identifier: raw_hosts 22 | 23 | - name: reviews 24 | identifier: raw_reviews 25 | loaded_at_field: date # Specify the original date name 26 | # Doc | Freshness: https://docs.getdbt.com/reference/resource-properties/freshness 27 | # A freshness block is used to define the acceptable amount of time between the most recent record and now, for a 28 | # table to be considered "fresh". 
29 | freshness: 30 | warn_after: {count: 1, period: hour} 31 | error_after: {count: 24, period: hour} -------------------------------------------------------------------------------- /Airbnb Project/models/src/src_hosts.sql: -------------------------------------------------------------------------------- 1 | WITH raw_hosts AS ( 2 | SELECT * FROM {{ source('airbnb', 'hosts') }} 3 | ) 4 | 5 | SELECT 6 | id AS host_id, 7 | name AS host_name, 8 | is_superhost, 9 | created_at, 10 | updated_at 11 | FROM raw_hosts -------------------------------------------------------------------------------- /Airbnb Project/models/src/src_listings.sql: -------------------------------------------------------------------------------- 1 | WITH raw_listings AS ( 2 | SELECT * FROM {{ source('airbnb', 'listings') }} 3 | ) 4 | 5 | SELECT 6 | id AS listing_id, 7 | name AS listing_name, 8 | listing_url, 9 | room_type, 10 | minimum_nights, 11 | host_id, 12 | price AS price_str, 13 | created_at, 14 | updated_at 15 | FROM 16 | raw_listings -------------------------------------------------------------------------------- /Airbnb Project/models/src/src_reviews.sql: -------------------------------------------------------------------------------- 1 | WITH raw_reviews AS ( 2 | SELECT * FROM {{ source('airbnb', 'reviews') }} 3 | ) 4 | 5 | SELECT 6 | listing_id, 7 | date AS review_date, 8 | reviewer_name, 9 | comments AS review_text, 10 | sentiment AS review_sentiment 11 | FROM raw_reviews -------------------------------------------------------------------------------- /Airbnb Project/packages.yml: -------------------------------------------------------------------------------- 1 | # Doc | dbt Hub: https://hub.getdbt.com 2 | # dbt utils: https://hub.getdbt.com/dbt-labs/dbt_utils/latest/ 3 | # dbt expectations: https://hub.getdbt.com/calogica/dbt_expectations/latest/ | https://github.com/calogica/dbt-expectations 4 | 5 | packages: 6 | - package: dbt-labs/dbt_utils 7 | version: 1.0.0 8 | 9 | 10 | -
package: calogica/dbt_expectations 11 | version: [">=0.8.0", "<0.9.0"] -------------------------------------------------------------------------------- /Airbnb Project/seeds/seed_full_moon_dates.csv: -------------------------------------------------------------------------------- 1 | full_moon_date 2 | 2009-01-11 3 | 2009-02-09 4 | 2009-03-11 5 | 2009-04-09 6 | 2009-05-09 7 | 2009-06-07 8 | 2009-07-07 9 | 2009-08-06 10 | 2009-09-04 11 | 2009-10-04 12 | 2009-11-02 13 | 2009-12-02 14 | 2009-12-31 15 | 2010-01-30 16 | 2010-02-28 17 | 2010-03-30 18 | 2010-04-28 19 | 2010-05-28 20 | 2010-06-26 21 | 2010-07-26 22 | 2010-08-24 23 | 2010-09-23 24 | 2010-10-23 25 | 2010-11-21 26 | 2010-12-21 27 | 2011-01-19 28 | 2011-02-18 29 | 2011-03-19 30 | 2011-04-18 31 | 2011-05-17 32 | 2011-06-15 33 | 2011-07-15 34 | 2011-08-13 35 | 2011-09-12 36 | 2011-10-12 37 | 2011-11-10 38 | 2011-12-10 39 | 2012-01-09 40 | 2012-02-07 41 | 2012-03-08 42 | 2012-04-06 43 | 2012-05-06 44 | 2012-06-04 45 | 2012-07-03 46 | 2012-08-02 47 | 2012-08-31 48 | 2012-09-30 49 | 2012-10-29 50 | 2012-11-28 51 | 2012-12-28 52 | 2013-01-27 53 | 2013-02-25 54 | 2013-03-27 55 | 2013-04-25 56 | 2013-05-25 57 | 2013-06-23 58 | 2013-07-22 59 | 2013-08-21 60 | 2013-09-19 61 | 2013-10-19 62 | 2013-11-17 63 | 2013-12-17 64 | 2014-01-16 65 | 2014-02-15 66 | 2014-03-16 67 | 2014-04-15 68 | 2014-05-14 69 | 2014-06-13 70 | 2014-07-12 71 | 2014-08-10 72 | 2014-09-09 73 | 2014-10-08 74 | 2014-11-06 75 | 2014-12-06 76 | 2015-01-05 77 | 2015-02-04 78 | 2015-03-05 79 | 2015-04-04 80 | 2015-05-04 81 | 2015-06-02 82 | 2015-07-02 83 | 2015-07-31 84 | 2015-08-29 85 | 2015-09-28 86 | 2015-10-27 87 | 2015-11-25 88 | 2015-12-25 89 | 2016-01-24 90 | 2016-02-22 91 | 2016-03-23 92 | 2016-04-22 93 | 2016-05-21 94 | 2016-06-20 95 | 2016-07-20 96 | 2016-08-18 97 | 2016-09-16 98 | 2016-10-16 99 | 2016-11-14 100 | 2016-12-14 101 | 2017-01-12 102 | 2017-02-11 103 | 2017-03-12 104 | 2017-04-11 105 | 2017-05-10 106 | 2017-06-09 107 | 2017-07-09 108 
| 2017-08-07 109 | 2017-09-06 110 | 2017-10-05 111 | 2017-11-04 112 | 2017-12-03 113 | 2018-01-02 114 | 2018-01-31 115 | 2018-03-02 116 | 2018-03-31 117 | 2018-04-30 118 | 2018-05-29 119 | 2018-06-28 120 | 2018-07-27 121 | 2018-08-26 122 | 2018-09-25 123 | 2018-10-24 124 | 2018-11-23 125 | 2018-12-22 126 | 2019-01-21 127 | 2019-02-19 128 | 2019-03-21 129 | 2019-04-19 130 | 2019-05-18 131 | 2019-06-17 132 | 2019-07-16 133 | 2019-08-15 134 | 2019-09-14 135 | 2019-10-13 136 | 2019-11-12 137 | 2019-12-12 138 | 2020-01-10 139 | 2020-02-09 140 | 2020-03-09 141 | 2020-04-08 142 | 2020-05-07 143 | 2020-06-05 144 | 2020-07-05 145 | 2020-08-03 146 | 2020-09-02 147 | 2020-10-01 148 | 2020-10-31 149 | 2020-11-30 150 | 2020-12-30 151 | 2021-01-28 152 | 2021-02-27 153 | 2021-03-28 154 | 2021-04-27 155 | 2021-05-26 156 | 2021-06-24 157 | 2021-07-24 158 | 2021-08-22 159 | 2021-09-21 160 | 2021-10-20 161 | 2021-11-19 162 | 2021-12-19 163 | 2022-01-18 164 | 2022-02-16 165 | 2022-03-18 166 | 2022-04-16 167 | 2022-05-16 168 | 2022-06-14 169 | 2022-07-13 170 | 2022-08-12 171 | 2022-09-10 172 | 2022-10-09 173 | 2022-11-08 174 | 2022-12-08 175 | 2023-01-07 176 | 2023-02-05 177 | 2023-03-07 178 | 2023-04-06 179 | 2023-05-05 180 | 2023-06-04 181 | 2023-07-03 182 | 2023-08-01 183 | 2023-08-31 184 | 2023-09-29 185 | 2023-10-28 186 | 2023-11-27 187 | 2023-12-27 188 | 2024-01-25 189 | 2024-02-24 190 | 2024-03-25 191 | 2024-04-24 192 | 2024-05-23 193 | 2024-06-22 194 | 2024-07-21 195 | 2024-08-19 196 | 2024-09-18 197 | 2024-10-17 198 | 2024-11-15 199 | 2024-12-15 200 | 2025-01-13 201 | 2025-02-12 202 | 2025-03-14 203 | 2025-04-13 204 | 2025-05-12 205 | 2025-06-11 206 | 2025-07-10 207 | 2025-08-09 208 | 2025-09-07 209 | 2025-10-07 210 | 2025-11-05 211 | 2025-12-05 212 | 2026-01-03 213 | 2026-02-01 214 | 2026-03-03 215 | 2026-04-02 216 | 2026-05-01 217 | 2026-05-31 218 | 2026-06-30 219 | 2026-07-29 220 | 2026-08-28 221 | 2026-09-26 222 | 2026-10-26 223 | 2026-11-24 224 | 2026-12-24 225 | 
2027-01-22 226 | 2027-02-21 227 | 2027-03-22 228 | 2027-04-21 229 | 2027-05-20 230 | 2027-06-19 231 | 2027-07-18 232 | 2027-08-17 233 | 2027-09-16 234 | 2027-10-15 235 | 2027-11-14 236 | 2027-12-13 237 | 2028-01-12 238 | 2028-02-10 239 | 2028-03-11 240 | 2028-04-09 241 | 2028-05-08 242 | 2028-06-07 243 | 2028-07-06 244 | 2028-08-05 245 | 2028-09-04 246 | 2028-10-03 247 | 2028-11-02 248 | 2028-12-02 249 | 2028-12-31 250 | 2029-01-30 251 | 2029-02-28 252 | 2029-03-30 253 | 2029-04-28 254 | 2029-05-27 255 | 2029-06-26 256 | 2029-07-25 257 | 2029-08-24 258 | 2029-09-22 259 | 2029-10-22 260 | 2029-11-21 261 | 2029-12-20 262 | 2030-01-19 263 | 2030-02-18 264 | 2030-03-19 265 | 2030-04-18 266 | 2030-05-17 267 | 2030-06-15 268 | 2030-07-15 269 | 2030-08-13 270 | 2030-09-11 271 | 2030-10-11 272 | 2030-11-10 273 | 2030-12-09 274 | -------------------------------------------------------------------------------- /Airbnb Project/snapshots/scd_raw_listings.sql: -------------------------------------------------------------------------------- 1 | {% snapshot scd_raw_listings %} 2 | 3 | {{ 4 | config( 5 | target_schema="dev", 6 | unique_key="id", 7 | strategy="timestamp", 8 | updated_at="updated_at", 9 | invalidate_hard_deletes=True 10 | ) 11 | }} 12 | 13 | -- Base of a snapshot is ALWAYS a SELECT * statement 14 | -- Doc | Snapshots: https://docs.getdbt.com/docs/build/snapshots 15 | 16 | SELECT * FROM {{source ('airbnb', 'listings')}} 17 | 18 | {% endsnapshot %} -------------------------------------------------------------------------------- /Airbnb Project/target/assets/input_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/katiehuangx/data-engineering/aace791c2df1ba2899624d7d51f3d5606e90d79d/Airbnb Project/target/assets/input_schema.png -------------------------------------------------------------------------------- /Airbnb Project/target/catalog.json: 
-------------------------------------------------------------------------------- 1 | {"metadata": {"dbt_schema_version": "https://schemas.getdbt.com/dbt/catalog/v1.json", "dbt_version": "1.4.3", "generated_at": "2023-03-07T08:55:13.990601Z", "invocation_id": "f154e3a2-cc49-45ce-affe-fa7b412907db", "env": {}}, "nodes": {"model.dbtlearn.fct_reviews": {"metadata": {"type": "BASE TABLE", "schema": "DEV", "name": "FCT_REVIEWS", "database": "AIRBNB", "comment": null, "owner": "TRANSFORM"}, "columns": {"REVIEW_ID": {"type": "TEXT", "index": 1, "name": "REVIEW_ID", "comment": null}, "LISTING_ID": {"type": "NUMBER", "index": 2, "name": "LISTING_ID", "comment": null}, "REVIEW_DATE": {"type": "TIMESTAMP_NTZ", "index": 3, "name": "REVIEW_DATE", "comment": null}, "REVIEWER_NAME": {"type": "TEXT", "index": 4, "name": "REVIEWER_NAME", "comment": null}, "REVIEW_TEXT": {"type": "TEXT", "index": 5, "name": "REVIEW_TEXT", "comment": null}, "REVIEW_SENTIMENT": {"type": "TEXT", "index": 6, "name": "REVIEW_SENTIMENT", "comment": null}}, "stats": {"last_modified": {"id": "last_modified", "label": "Last Modified", "value": "2023-03-02 05:29UTC", "include": true, "description": "The timestamp for last update/change"}, "bytes": {"id": "bytes", "label": "Approximate Size", "value": 47417344.0, "include": true, "description": "Approximate size of the table as reported by Snowflake"}, "row_count": {"id": "row_count", "label": "Row Count", "value": 409697.0, "include": true, "description": "An approximate count of rows in this table"}, "has_stats": {"id": "has_stats", "label": "Has Stats?", "value": true, "include": false, "description": "Indicates whether there are statistics for this table"}}, "unique_id": "model.dbtlearn.fct_reviews"}, "seed.dbtlearn.seed_full_moon_dates": {"metadata": {"type": "BASE TABLE", "schema": "DEV", "name": "SEED_FULL_MOON_DATES", "database": "AIRBNB", "comment": null, "owner": "TRANSFORM"}, "columns": {"FULL_MOON_DATE": {"type": "DATE", "index": 1, "name": 
"FULL_MOON_DATE", "comment": null}}, "stats": {"last_modified": {"id": "last_modified", "label": "Last Modified", "value": "2023-02-27 08:20UTC", "include": true, "description": "The timestamp for last update/change"}, "bytes": {"id": "bytes", "label": "Approximate Size", "value": 1536.0, "include": true, "description": "Approximate size of the table as reported by Snowflake"}, "row_count": {"id": "row_count", "label": "Row Count", "value": 272.0, "include": true, "description": "An approximate count of rows in this table"}, "has_stats": {"id": "has_stats", "label": "Has Stats?", "value": true, "include": false, "description": "Indicates whether there are statistics for this table"}}, "unique_id": "seed.dbtlearn.seed_full_moon_dates"}, "model.dbtlearn.dim_hosts_cleansed": {"metadata": {"type": "VIEW", "schema": "DEV", "name": "DIM_HOSTS_CLEANSED", "database": "AIRBNB", "comment": null, "owner": "TRANSFORM"}, "columns": {"HOST_ID": {"type": "NUMBER", "index": 1, "name": "HOST_ID", "comment": null}, "HOST_NAME": {"type": "TEXT", "index": 2, "name": "HOST_NAME", "comment": null}, "IS_SUPERHOST": {"type": "TEXT", "index": 3, "name": "IS_SUPERHOST", "comment": null}, "CREATED_AT": {"type": "TIMESTAMP_NTZ", "index": 4, "name": "CREATED_AT", "comment": null}, "UPDATED_AT": {"type": "TIMESTAMP_NTZ", "index": 5, "name": "UPDATED_AT", "comment": null}}, "stats": {"has_stats": {"id": "has_stats", "label": "Has Stats?", "value": false, "include": false, "description": "Indicates whether there are statistics for this table"}}, "unique_id": "model.dbtlearn.dim_hosts_cleansed"}, "model.dbtlearn.dim_listings_w_hosts": {"metadata": {"type": "BASE TABLE", "schema": "DEV", "name": "DIM_LISTINGS_W_HOSTS", "database": "AIRBNB", "comment": null, "owner": "TRANSFORM"}, "columns": {"LISTING_ID": {"type": "NUMBER", "index": 1, "name": "LISTING_ID", "comment": null}, "LISTING_NAME": {"type": "TEXT", "index": 2, "name": "LISTING_NAME", "comment": null}, "ROOM_TYPE": {"type": "TEXT", "index": 
3, "name": "ROOM_TYPE", "comment": null}, "MINIMUM_NIGHTS": {"type": "NUMBER", "index": 4, "name": "MINIMUM_NIGHTS", "comment": null}, "PRICE": {"type": "NUMBER", "index": 5, "name": "PRICE", "comment": null}, "HOST_ID": {"type": "NUMBER", "index": 6, "name": "HOST_ID", "comment": null}, "HOST_NAME": {"type": "TEXT", "index": 7, "name": "HOST_NAME", "comment": null}, "HOST_IS_SUPERHOST": {"type": "TEXT", "index": 8, "name": "HOST_IS_SUPERHOST", "comment": null}, "CREATED_AT": {"type": "TIMESTAMP_NTZ", "index": 9, "name": "CREATED_AT", "comment": null}, "UPDATED_AT": {"type": "TIMESTAMP_NTZ", "index": 10, "name": "UPDATED_AT", "comment": null}}, "stats": {"last_modified": {"id": "last_modified", "label": "Last Modified", "value": "2023-03-07 07:45UTC", "include": true, "description": "The timestamp for last update/change"}, "bytes": {"id": "bytes", "label": "Approximate Size", "value": 866304.0, "include": true, "description": "Approximate size of the table as reported by Snowflake"}, "row_count": {"id": "row_count", "label": "Row Count", "value": 17499.0, "include": true, "description": "An approximate count of rows in this table"}, "has_stats": {"id": "has_stats", "label": "Has Stats?", "value": true, "include": false, "description": "Indicates whether there are statistics for this table"}}, "unique_id": "model.dbtlearn.dim_listings_w_hosts"}, "model.dbtlearn.mart_fullmoon_reviews": {"metadata": {"type": "BASE TABLE", "schema": "DEV", "name": "MART_FULLMOON_REVIEWS", "database": "AIRBNB", "comment": null, "owner": "TRANSFORM"}, "columns": {"REVIEW_ID": {"type": "TEXT", "index": 1, "name": "REVIEW_ID", "comment": null}, "LISTING_ID": {"type": "NUMBER", "index": 2, "name": "LISTING_ID", "comment": null}, "REVIEW_DATE": {"type": "TIMESTAMP_NTZ", "index": 3, "name": "REVIEW_DATE", "comment": null}, "REVIEWER_NAME": {"type": "TEXT", "index": 4, "name": "REVIEWER_NAME", "comment": null}, "REVIEW_TEXT": {"type": "TEXT", "index": 5, "name": "REVIEW_TEXT", "comment": 
null}, "REVIEW_SENTIMENT": {"type": "TEXT", "index": 6, "name": "REVIEW_SENTIMENT", "comment": null}, "IS_FULL_MOON": {"type": "TEXT", "index": 7, "name": "IS_FULL_MOON", "comment": null}}, "stats": {"last_modified": {"id": "last_modified", "label": "Last Modified", "value": "2023-03-07 07:45UTC", "include": true, "description": "The timestamp for last update/change"}, "bytes": {"id": "bytes", "label": "Approximate Size", "value": 51119104.0, "include": true, "description": "Approximate size of the table as reported by Snowflake"}, "row_count": {"id": "row_count", "label": "Row Count", "value": 409697.0, "include": true, "description": "An approximate count of rows in this table"}, "has_stats": {"id": "has_stats", "label": "Has Stats?", "value": true, "include": false, "description": "Indicates whether there are statistics for this table"}}, "unique_id": "model.dbtlearn.mart_fullmoon_reviews"}, "snapshot.dbtlearn.scd_raw_listings": {"metadata": {"type": "BASE TABLE", "schema": "DEV", "name": "SCD_RAW_LISTINGS", "database": "AIRBNB", "comment": null, "owner": "TRANSFORM"}, "columns": {"ID": {"type": "NUMBER", "index": 1, "name": "ID", "comment": null}, "LISTING_URL": {"type": "TEXT", "index": 2, "name": "LISTING_URL", "comment": null}, "NAME": {"type": "TEXT", "index": 3, "name": "NAME", "comment": null}, "ROOM_TYPE": {"type": "TEXT", "index": 4, "name": "ROOM_TYPE", "comment": null}, "MINIMUM_NIGHTS": {"type": "NUMBER", "index": 5, "name": "MINIMUM_NIGHTS", "comment": null}, "HOST_ID": {"type": "NUMBER", "index": 6, "name": "HOST_ID", "comment": null}, "PRICE": {"type": "TEXT", "index": 7, "name": "PRICE", "comment": null}, "CREATED_AT": {"type": "TIMESTAMP_NTZ", "index": 8, "name": "CREATED_AT", "comment": null}, "UPDATED_AT": {"type": "TIMESTAMP_NTZ", "index": 9, "name": "UPDATED_AT", "comment": null}, "DBT_SCD_ID": {"type": "TEXT", "index": 10, "name": "DBT_SCD_ID", "comment": null}, "DBT_UPDATED_AT": {"type": "TIMESTAMP_NTZ", "index": 11, "name": 
"DBT_UPDATED_AT", "comment": null}, "DBT_VALID_FROM": {"type": "TIMESTAMP_NTZ", "index": 12, "name": "DBT_VALID_FROM", "comment": null}, "DBT_VALID_TO": {"type": "TIMESTAMP_NTZ", "index": 13, "name": "DBT_VALID_TO", "comment": null}}, "stats": {"last_modified": {"id": "last_modified", "label": "Last Modified", "value": "2023-02-28 07:17UTC", "include": true, "description": "The timestamp for last update/change"}, "bytes": {"id": "bytes", "label": "Approximate Size", "value": 1713152.0, "include": true, "description": "Approximate size of the table as reported by Snowflake"}, "row_count": {"id": "row_count", "label": "Row Count", "value": 17500.0, "include": true, "description": "An approximate count of rows in this table"}, "has_stats": {"id": "has_stats", "label": "Has Stats?", "value": true, "include": false, "description": "Indicates whether there are statistics for this table"}}, "unique_id": "snapshot.dbtlearn.scd_raw_listings"}, "model.dbtlearn.dim_listings_cleansed": {"metadata": {"type": "VIEW", "schema": "DEV", "name": "DIM_LISTINGS_CLEANSED", "database": "AIRBNB", "comment": null, "owner": "TRANSFORM"}, "columns": {"LISTING_ID": {"type": "NUMBER", "index": 1, "name": "LISTING_ID", "comment": null}, "LISTING_NAME": {"type": "TEXT", "index": 2, "name": "LISTING_NAME", "comment": null}, "ROOM_TYPE": {"type": "TEXT", "index": 3, "name": "ROOM_TYPE", "comment": null}, "MINIMUM_NIGHTS": {"type": "NUMBER", "index": 4, "name": "MINIMUM_NIGHTS", "comment": null}, "HOST_ID": {"type": "NUMBER", "index": 5, "name": "HOST_ID", "comment": null}, "PRICE": {"type": "NUMBER", "index": 6, "name": "PRICE", "comment": null}, "CREATED_AT": {"type": "TIMESTAMP_NTZ", "index": 7, "name": "CREATED_AT", "comment": null}, "UPDATED_AT": {"type": "TIMESTAMP_NTZ", "index": 8, "name": "UPDATED_AT", "comment": null}}, "stats": {"has_stats": {"id": "has_stats", "label": "Has Stats?", "value": false, "include": false, "description": "Indicates whether there are statistics for this 
table"}}, "unique_id": "model.dbtlearn.dim_listings_cleansed"}}, "sources": {"source.dbtlearn.airbnb.reviews": {"metadata": {"type": "BASE TABLE", "schema": "RAW", "name": "RAW_REVIEWS", "database": "AIRBNB", "comment": null, "owner": "ACCOUNTADMIN"}, "columns": {"LISTING_ID": {"type": "NUMBER", "index": 1, "name": "LISTING_ID", "comment": null}, "DATE": {"type": "TIMESTAMP_NTZ", "index": 2, "name": "DATE", "comment": null}, "REVIEWER_NAME": {"type": "TEXT", "index": 3, "name": "REVIEWER_NAME", "comment": null}, "COMMENTS": {"type": "TEXT", "index": 4, "name": "COMMENTS", "comment": null}, "SENTIMENT": {"type": "TEXT", "index": 5, "name": "SENTIMENT", "comment": null}}, "stats": {"last_modified": {"id": "last_modified", "label": "Last Modified", "value": "2023-02-27 01:28UTC", "include": true, "description": "The timestamp for last update/change"}, "bytes": {"id": "bytes", "label": "Approximate Size", "value": 38968320.0, "include": true, "description": "Approximate size of the table as reported by Snowflake"}, "row_count": {"id": "row_count", "label": "Row Count", "value": 410284.0, "include": true, "description": "An approximate count of rows in this table"}, "has_stats": {"id": "has_stats", "label": "Has Stats?", "value": true, "include": false, "description": "Indicates whether there are statistics for this table"}}, "unique_id": "source.dbtlearn.airbnb.reviews"}, "source.dbtlearn.airbnb.listings": {"metadata": {"type": "BASE TABLE", "schema": "RAW", "name": "RAW_LISTINGS", "database": "AIRBNB", "comment": null, "owner": "ACCOUNTADMIN"}, "columns": {"ID": {"type": "NUMBER", "index": 1, "name": "ID", "comment": null}, "LISTING_URL": {"type": "TEXT", "index": 2, "name": "LISTING_URL", "comment": null}, "NAME": {"type": "TEXT", "index": 3, "name": "NAME", "comment": null}, "ROOM_TYPE": {"type": "TEXT", "index": 4, "name": "ROOM_TYPE", "comment": null}, "MINIMUM_NIGHTS": {"type": "NUMBER", "index": 5, "name": "MINIMUM_NIGHTS", "comment": null}, "HOST_ID": {"type": 
"NUMBER", "index": 6, "name": "HOST_ID", "comment": null}, "PRICE": {"type": "TEXT", "index": 7, "name": "PRICE", "comment": null}, "CREATED_AT": {"type": "TIMESTAMP_NTZ", "index": 8, "name": "CREATED_AT", "comment": null}, "UPDATED_AT": {"type": "TIMESTAMP_NTZ", "index": 9, "name": "UPDATED_AT", "comment": null}}, "stats": {"last_modified": {"id": "last_modified", "label": "Last Modified", "value": "2023-02-28 07:15UTC", "include": true, "description": "The timestamp for last update/change"}, "bytes": {"id": "bytes", "label": "Approximate Size", "value": 927744.0, "include": true, "description": "Approximate size of the table as reported by Snowflake"}, "row_count": {"id": "row_count", "label": "Row Count", "value": 17499.0, "include": true, "description": "An approximate count of rows in this table"}, "has_stats": {"id": "has_stats", "label": "Has Stats?", "value": true, "include": false, "description": "Indicates whether there are statistics for this table"}}, "unique_id": "source.dbtlearn.airbnb.listings"}, "source.dbtlearn.airbnb.hosts": {"metadata": {"type": "BASE TABLE", "schema": "RAW", "name": "RAW_HOSTS", "database": "AIRBNB", "comment": null, "owner": "ACCOUNTADMIN"}, "columns": {"ID": {"type": "NUMBER", "index": 1, "name": "ID", "comment": null}, "NAME": {"type": "TEXT", "index": 2, "name": "NAME", "comment": null}, "IS_SUPERHOST": {"type": "TEXT", "index": 3, "name": "IS_SUPERHOST", "comment": null}, "CREATED_AT": {"type": "TIMESTAMP_NTZ", "index": 4, "name": "CREATED_AT", "comment": null}, "UPDATED_AT": {"type": "TIMESTAMP_NTZ", "index": 5, "name": "UPDATED_AT", "comment": null}}, "stats": {"last_modified": {"id": "last_modified", "label": "Last Modified", "value": "2023-02-27 01:28UTC", "include": true, "description": "The timestamp for last update/change"}, "bytes": {"id": "bytes", "label": "Approximate Size", "value": 338432.0, "include": true, "description": "Approximate size of the table as reported by Snowflake"}, "row_count": {"id": 
"row_count", "label": "Row Count", "value": 14111.0, "include": true, "description": "An approximate count of rows in this table"}, "has_stats": {"id": "has_stats", "label": "Has Stats?", "value": true, "include": false, "description": "Indicates whether there are statistics for this table"}}, "unique_id": "source.dbtlearn.airbnb.hosts"}}, "errors": null} -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/analyses/full_moon_no_sleep.sql: -------------------------------------------------------------------------------- 1 | WITH mart_fullmoon_reviews AS ( 2 | SELECT * FROM airbnb.dev.mart_fullmoon_reviews 3 | ) 4 | 5 | SELECT 6 | is_full_moon, 7 | review_sentiment, 8 | COUNT(*) AS reviews 9 | FROM mart_fullmoon_reviews 10 | GROUP BY 11 | is_full_moon, 12 | review_sentiment 13 | ORDER BY 14 | is_full_moon, 15 | review_sentiment -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/dim/dim_hosts_cleansed.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | WITH __dbt__cte__src_hosts as ( 4 | WITH raw_hosts AS ( 5 | SELECT * FROM airbnb.raw.raw_hosts 6 | ) 7 | 8 | SELECT 9 | id AS host_id, 10 | name AS host_name, 11 | is_superhost, 12 | created_at, 13 | updated_at 14 | FROM raw_hosts 15 | ),src_hosts AS ( 16 | SELECT * FROM __dbt__cte__src_hosts 17 | ) 18 | 19 | SELECT 20 | host_id, 21 | NVL(host_name, 'Anonymous') AS host_name, 22 | is_superhost, 23 | created_at, 24 | updated_at 25 | FROM src_hosts -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/dim/dim_listings_cleansed.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | WITH __dbt__cte__src_listings as ( 5 | WITH raw_listings AS ( 6 | SELECT * FROM airbnb.raw.raw_listings 7 | ) 8 | 9 | SELECT 10 | id 
AS listing_id, 11 | name AS listing_name, 12 | listing_url, 13 | room_type, 14 | minimum_nights, 15 | host_id, 16 | price AS price_str, 17 | created_at, 18 | updated_at 19 | FROM 20 | raw_listings 21 | ),src_listings AS ( 22 | SELECT * FROM __dbt__cte__src_listings 23 | ) 24 | 25 | SELECT 26 | listing_id, 27 | listing_name, 28 | room_type, 29 | CASE 30 | WHEN minimum_nights = 0 THEN 1 -- 0 night = 1 night, so we assign the value of 1 to indicate 1 night 31 | ELSE minimum_nights 32 | END AS minimum_nights, 33 | host_id, 34 | REPLACE( -- Parse string value into numerical form 35 | price_str, '$' -- Replace '$' with price_str value. In other words, to remove '$' from string. 36 | ) :: NUMBER (10, 2 -- Convert string type to numerical with 2 decimal places 37 | ) AS price, 38 | created_at, 39 | updated_at 40 | 41 | FROM src_listings -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/dim/dim_listings_w_hosts.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | listings AS ( 3 | SELECT * FROM airbnb.dev.dim_listings_cleansed 4 | ), 5 | hosts AS ( 6 | SELECT * FROM airbnb.dev.dim_hosts_cleansed 7 | ) 8 | 9 | SELECT 10 | listings.listing_id, 11 | listings.listing_name, 12 | listings.room_type, 13 | listings.minimum_nights, 14 | listings.price, 15 | listings.host_id, 16 | hosts.host_name, 17 | hosts.is_superhost AS host_is_superhost, 18 | listings.created_at, 19 | GREATEST(listings.updated_at, hosts.updated_at) AS updated_at -- Keep most recent updated_at 20 | FROM listings 21 | LEFT JOIN hosts 22 | ON listings.host_id = hosts.host_id -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/fct/fct_reviews.sql: -------------------------------------------------------------------------------- 1 | -- Doc | Config : https://docs.getdbt.com/reference/dbt-jinja-functions/config 2 | 3 | 4 
| -- dbt materializes the model as an incremental table. 5 | -- Read for difference btw table vs. incremental: https://docs.getdbt.com/docs/build/materializations 6 | 7 | WITH __dbt__cte__src_reviews as ( 8 | WITH raw_reviews AS ( 9 | SELECT * FROM airbnb.raw.raw_reviews 10 | ) 11 | 12 | SELECT 13 | listing_id, 14 | date AS review_date, 15 | reviewer_name, 16 | comments AS review_text, 17 | sentiment AS review_sentiment 18 | FROM raw_reviews 19 | ),src_reviews AS ( 20 | SELECT * FROM __dbt__cte__src_reviews 21 | ) 22 | 23 | SELECT 24 | -- Create a unique review_id based on the following columns 25 | 26 | 27 | md5(cast(coalesce(cast(listing_id as TEXT), '_dbt_utils_surrogate_key_null_') || '-' || coalesce(cast(review_date as TEXT), '_dbt_utils_surrogate_key_null_') || '-' || coalesce(cast(reviewer_name as TEXT), '_dbt_utils_surrogate_key_null_') || '-' || coalesce(cast(review_text as TEXT), '_dbt_utils_surrogate_key_null_') as TEXT)) as review_id, 28 | listing_id, 29 | review_date, 30 | NVL(reviewer_name, 'Anonymous') AS reviewer_name, 31 | review_text, 32 | review_sentiment 33 | FROM src_reviews 34 | WHERE review_text IS NOT NULL 35 | 36 | 37 | and review_date > (select max(review_date) from airbnb.dev.fct_reviews) 38 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/mart/mart_fullmoon_reviews.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | WITH fct_reviews AS ( 4 | 5 | SELECT * FROM airbnb.dev.fct_reviews 6 | 7 | ), 8 | full_moon_dates AS ( 9 | 10 | SELECT * FROM airbnb.dev.seed_full_moon_dates 11 | ) 12 | 13 | SELECT 14 | reviews.*, 15 | CASE 16 | WHEN fullmoon.full_moon_date IS NULL THEN 'not full moon' 17 | ELSE 'full moon' END AS is_full_moon 18 | FROM fct_reviews AS reviews 19 | LEFT JOIN full_moon_dates AS fullmoon 20 | ON (TO_DATE(reviews.review_date) = DATEADD(DAY, 1, fullmoon.full_moon_date)) 
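The compiled mart above tags a review as `'full moon'` when its date falls exactly one day after a seeded full-moon date (`TO_DATE(review_date) = DATEADD(DAY, 1, full_moon_date)`). A small Python sketch of that join condition — the helper name and sample dates are illustrative, though the two dates do appear in seed_full_moon_dates.csv:

```python
from datetime import date, timedelta

def label_full_moon(review_date, full_moon_dates):
    """Mimic the mart_fullmoon_reviews LEFT JOIN: a review is tagged
    'full moon' when it was written exactly one day after a full moon,
    otherwise 'not full moon'."""
    if review_date - timedelta(days=1) in full_moon_dates:
        return "full moon"
    return "not full moon"

full_moons = {date(2020, 1, 10), date(2020, 2, 9)}

label_full_moon(date(2020, 1, 11), full_moons)  # -> "full moon"
label_full_moon(date(2020, 1, 10), full_moons)  # -> "not full moon"
```

Note the deliberate one-day offset: the analysis asks whether reviews written the night *after* a full moon differ in sentiment, so the night of the full moon itself is not tagged.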
-------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/accepted_values_dim_hosts_cleansed_is_superhost__t__f.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | with all_values as ( 6 | 7 | select 8 | is_superhost as value_field, 9 | count(*) as n_records 10 | 11 | from airbnb.dev.dim_hosts_cleansed 12 | group by is_superhost 13 | 14 | ) 15 | 16 | select * 17 | from all_values 18 | where value_field not in ( 19 | 't','f' 20 | ) 21 | 22 | 23 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/accepted_values_dim_listings_c_1ca6148a08c62a5218f2a162f9d2a9a6.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | with all_values as ( 6 | 7 | select 8 | room_type as value_field, 9 | count(*) as n_records 10 | 11 | from airbnb.dev.dim_listings_cleansed 12 | group by room_type 13 | 14 | ) 15 | 16 | select * 17 | from all_values 18 | where value_field not in ( 19 | 'Entire home/apt','Private room','Shared room','Hotel room' 20 | ) 21 | 22 | 23 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/dbt_expectations_expect_column_07e7a515218ef6e3a17e164c642c7d18.sql: -------------------------------------------------------------------------------- 1 | with relation_columns as ( 2 | 3 | 4 | select 5 | cast('LISTING_ID' as TEXT) as relation_column, 6 | cast('NUMBER' as TEXT) as relation_column_type 7 | union all 8 | 9 | select 10 | cast('LISTING_NAME' as TEXT) as relation_column, 11 | cast('VARCHAR' as TEXT) as relation_column_type 12 | union all 13 | 14 | select 15 | cast('ROOM_TYPE' as TEXT) as relation_column, 16 | cast('VARCHAR' as TEXT) as relation_column_type 17 | union all 18 | 19 | select 20 | 
cast('MINIMUM_NIGHTS' as TEXT) as relation_column, 21 | cast('NUMBER' as TEXT) as relation_column_type 22 | union all 23 | 24 | select 25 | cast('PRICE' as TEXT) as relation_column, 26 | cast('NUMBER' as TEXT) as relation_column_type 27 | union all 28 | 29 | select 30 | cast('HOST_ID' as TEXT) as relation_column, 31 | cast('NUMBER' as TEXT) as relation_column_type 32 | union all 33 | 34 | select 35 | cast('HOST_NAME' as TEXT) as relation_column, 36 | cast('VARCHAR' as TEXT) as relation_column_type 37 | union all 38 | 39 | select 40 | cast('HOST_IS_SUPERHOST' as TEXT) as relation_column, 41 | cast('VARCHAR' as TEXT) as relation_column_type 42 | union all 43 | 44 | select 45 | cast('CREATED_AT' as TEXT) as relation_column, 46 | cast('TIMESTAMP_NTZ' as TEXT) as relation_column_type 47 | union all 48 | 49 | select 50 | cast('UPDATED_AT' as TEXT) as relation_column, 51 | cast('TIMESTAMP_NTZ' as TEXT) as relation_column_type 52 | 53 | 54 | ), 55 | test_data as ( 56 | 57 | select 58 | * 59 | from 60 | relation_columns 61 | where 62 | relation_column = 'PRICE' 63 | and 64 | relation_column_type not in ('NUMBER') 65 | 66 | ) 67 | select * 68 | from test_data -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/dbt_expectations_expect_column_39596d790161761077ff1592b68943f6.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | with grouped_expression as ( 8 | select 9 | 10 | 11 | 12 | 13 | ( 1=1 and percentile_cont(0.99) within group (order by price) >= 50 and percentile_cont(0.99) within group (order by price) <= 500 14 | ) 15 | as expression 16 | 17 | 18 | from airbnb.dev.dim_listings_w_hosts 19 | 20 | 21 | ), 22 | validation_errors as ( 23 | 24 | select 25 | * 26 | from 27 | grouped_expression 28 | where 29 | not(expression = true) 30 | 31 | ) 32 | 33 | select * 34 | from validation_errors 35 | 36 | 37 | 38 | 39 | 40 | 
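The `dbt_expectations` test compiled above returns a row when the 99th-percentile price falls outside [50, 500]. The same check can be run by hand against the warehouse; a minimal equivalent, with the table name and thresholds taken from the compiled predicate (Snowflake syntax assumed):

```sql
-- Manual equivalent of the compiled quantile expectation:
-- a returned row means the expectation fails.
WITH p AS (
    SELECT PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY price) AS p99_price
    FROM airbnb.dev.dim_listings_w_hosts
)
SELECT * FROM p
WHERE p99_price < 50 OR p99_price > 500
```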
-------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/dbt_expectations_expect_column_8e138814a11b6202811546795bffca5d.sql: -------------------------------------------------------------------------------- 1 | with relation_columns as ( 2 | 3 | 4 | select 5 | cast('LISTING_ID' as TEXT) as relation_column, 6 | cast('NUMBER' as TEXT) as relation_column_type 7 | union all 8 | 9 | select 10 | cast('LISTING_NAME' as TEXT) as relation_column, 11 | cast('VARCHAR' as TEXT) as relation_column_type 12 | union all 13 | 14 | select 15 | cast('ROOM_TYPE' as TEXT) as relation_column, 16 | cast('VARCHAR' as TEXT) as relation_column_type 17 | union all 18 | 19 | select 20 | cast('MINIMUM_NIGHTS' as TEXT) as relation_column, 21 | cast('NUMBER' as TEXT) as relation_column_type 22 | union all 23 | 24 | select 25 | cast('PRICE' as TEXT) as relation_column, 26 | cast('NUMBER' as TEXT) as relation_column_type 27 | union all 28 | 29 | select 30 | cast('HOST_ID' as TEXT) as relation_column, 31 | cast('NUMBER' as TEXT) as relation_column_type 32 | union all 33 | 34 | select 35 | cast('HOST_NAME' as TEXT) as relation_column, 36 | cast('VARCHAR' as TEXT) as relation_column_type 37 | union all 38 | 39 | select 40 | cast('HOST_IS_SUPERHOST' as TEXT) as relation_column, 41 | cast('VARCHAR' as TEXT) as relation_column_type 42 | union all 43 | 44 | select 45 | cast('CREATED_AT' as TEXT) as relation_column, 46 | cast('TIMESTAMP_NTZ' as TEXT) as relation_column_type 47 | union all 48 | 49 | select 50 | cast('UPDATED_AT' as TEXT) as relation_column, 51 | cast('TIMESTAMP_NTZ' as TEXT) as relation_column_type 52 | 53 | 54 | ), 55 | test_data as ( 56 | 57 | select 58 | * 59 | from 60 | relation_columns 61 | where 62 | relation_column = 'PRICE' 63 | and 64 | relation_column_type not in ('NUMBER(10,2)') 65 | 66 | ) 67 | select * 68 | from test_data 
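Several compiled tests in this directory come from custom generic tests rather than dbt built-ins. For example, `positive_value_dim_listings_cleansed_minimum_nights.sql` further down renders to `SELECT * FROM ... WHERE minimum_nights < 1`. A plausible sketch of `macros/positive_value.sql` that would produce it — the macro body is an assumption, since only the rendered SQL appears in this dump:

```sql
-- Hypothetical macros/positive_value.sql: a custom generic test that
-- fails on any row where the tested column is below 1.
{% test positive_value(model, column_name) %}
SELECT *
FROM {{ model }}
WHERE {{ column_name }} < 1
{% endtest %}
```

It would be attached to the `minimum_nights` column in `schema.yml` as `tests: [positive_value]`; dbt then wraps the rendered query in the `count(*) as failures` envelope seen under `target/run`.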
-------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/dbt_expectations_expect_column_c59e300e0dddb335c4211147100ac1c6.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | with grouped_expression as ( 7 | select 8 | 9 | 10 | 11 | 12 | ( 1=1 and max(price) <= 5000 13 | ) 14 | as expression 15 | 16 | 17 | from airbnb.dev.dim_listings_w_hosts 18 | 19 | 20 | ), 21 | validation_errors as ( 22 | 23 | select 24 | * 25 | from 26 | grouped_expression 27 | where 28 | not(expression = true) 29 | 30 | ) 31 | 32 | select * 33 | from validation_errors 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/dbt_expectations_expect_table__fbda7436ebe2ffe341acf0622c76d629.sql: -------------------------------------------------------------------------------- 1 | 2 | with a as ( 3 | 4 | select 5 | 6 | count(*) as expression 7 | from 8 | airbnb.dev.dim_listings_w_hosts 9 | 10 | 11 | ), 12 | b as ( 13 | 14 | select 15 | 16 | count(*) * 1 as expression 17 | from 18 | airbnb.raw.raw_listings 19 | 20 | 21 | ), 22 | final as ( 23 | 24 | select 25 | 26 | a.expression, 27 | b.expression as compare_expression, 28 | abs(coalesce(a.expression, 0) - coalesce(b.expression, 0)) as expression_difference, 29 | abs(coalesce(a.expression, 0) - coalesce(b.expression, 0))/ 30 | nullif(a.expression * 1.0, 0) as expression_difference_percent 31 | from 32 | 33 | a cross join b 34 | 35 | ) 36 | -- DEBUG: 37 | -- select * from final 38 | select 39 | * 40 | from final 41 | where 42 | 43 | expression_difference > 0.0 44 | 45 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/not_null_dim_hosts_cleansed_host_id.sql: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | select host_id 8 | from airbnb.dev.dim_hosts_cleansed 9 | where host_id is null 10 | 11 | 12 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/not_null_dim_hosts_cleansed_host_name.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | select host_name 8 | from airbnb.dev.dim_hosts_cleansed 9 | where host_name is null 10 | 11 | 12 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/not_null_dim_listings_cleansed_host_id.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | select host_id 8 | from airbnb.dev.dim_listings_cleansed 9 | where host_id is null 10 | 11 | 12 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/not_null_dim_listings_cleansed_listing_id.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | select listing_id 8 | from airbnb.dev.dim_listings_cleansed 9 | where listing_id is null 10 | 11 | 12 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/positive_value_dim_listings_cleansed_minimum_nights.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | SELECT * FROM airbnb.dev.dim_listings_cleansed 4 | WHERE minimum_nights < 1 5 | 6 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/relationships_dim_listings_cle_05e2397b186a7b9306fc747b3cc4ef83.sql: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | with child as ( 6 | select host_id as from_field 7 | from airbnb.dev.dim_listings_cleansed 8 | where host_id is not null 9 | ), 10 | 11 | parent as ( 12 | select host_id as to_field 13 | from airbnb.dev.dim_hosts_cleansed 14 | ) 15 | 16 | select 17 | from_field 18 | 19 | from child 20 | left join parent 21 | on child.from_field = parent.to_field 22 | 23 | where parent.to_field is null 24 | 25 | 26 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/unique_dim_hosts_cleansed_host_id.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | select 6 | host_id as unique_field, 7 | count(*) as n_records 8 | 9 | from airbnb.dev.dim_hosts_cleansed 10 | where host_id is not null 11 | group by host_id 12 | having count(*) > 1 13 | 14 | 15 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/schema.yml/unique_dim_listings_cleansed_listing_id.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | select 6 | listing_id as unique_field, 7 | count(*) as n_records 8 | 9 | from airbnb.dev.dim_listings_cleansed 10 | where listing_id is not null 11 | group by listing_id 12 | having count(*) > 1 13 | 14 | 15 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/sources.yml/dbt_expectations_source_expect_a60b59a84fbc4577a11df360c50013bb.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | with grouped_expression as ( 7 | select 8 | 9 | 10 | 11 | 12 | 13 | 14 | regexp_instr(price, '^\\$[0-9][0-9\\.]+$', 1, 1) 15 | 16 | 17 | > 0 18 | as expression 19 | 20 | 21 | from airbnb.raw.raw_listings 22 
| 23 | 24 | ), 25 | validation_errors as ( 26 | 27 | select 28 | * 29 | from 30 | grouped_expression 31 | where 32 | not(expression = true) 33 | 34 | ) 35 | 36 | select * 37 | from validation_errors 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/sources.yml/dbt_expectations_source_expect_d9770018e28873e7be74335902d9e4e5.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | with grouped_expression as ( 5 | select 6 | 7 | 8 | 9 | 10 | count(distinct room_type) = 4 11 | as expression 12 | 13 | 14 | from airbnb.raw.raw_listings 15 | 16 | 17 | ), 18 | validation_errors as ( 19 | 20 | select 21 | * 22 | from 23 | grouped_expression 24 | where 25 | not(expression = true) 26 | 27 | ) 28 | 29 | select * 30 | from validation_errors 31 | 32 | 33 | -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/src/src_hosts.sql: -------------------------------------------------------------------------------- 1 | WITH raw_hosts AS ( 2 | SELECT * FROM airbnb.raw.raw_hosts 3 | ) 4 | 5 | SELECT 6 | id AS host_id, 7 | name AS host_name, 8 | is_superhost, 9 | created_at, 10 | updated_at 11 | FROM raw_hosts -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/src/src_listings.sql: -------------------------------------------------------------------------------- 1 | WITH raw_listings AS ( 2 | SELECT * FROM airbnb.raw.raw_listings 3 | ) 4 | 5 | SELECT 6 | id AS listing_id, 7 | name AS listing_name, 8 | listing_url, 9 | room_type, 10 | minimum_nights, 11 | host_id, 12 | price AS price_str, 13 | created_at, 14 | updated_at 15 | FROM 16 | raw_listings -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/models/src/src_reviews.sql: 
-------------------------------------------------------------------------------- 1 | WITH raw_reviews AS ( 2 | SELECT * FROM airbnb.raw.raw_reviews 3 | ) 4 | 5 | SELECT 6 | listing_id, 7 | date AS review_date, 8 | reviewer_name, 9 | comments AS review_text, 10 | sentiment AS review_sentiment 11 | FROM raw_reviews -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/tests/consistent_created_at.sql: -------------------------------------------------------------------------------- 1 | -- Checks that no review is dated earlier than its listing's creation date 2 | 3 | SELECT * 4 | FROM airbnb.dev.dim_listings_cleansed AS listings 5 | INNER JOIN airbnb.dev.fct_reviews AS reviews 6 | USING (listing_id) 7 | WHERE reviews.review_date <= listings.created_at -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/tests/dim_listings_minimum_nights.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM airbnb.dev.dim_listings_cleansed 3 | WHERE minimum_nights < 1 4 | LIMIT 10 5 | 6 | -- The query must return zero rows for the test to PASS. -------------------------------------------------------------------------------- /Airbnb Project/target/compiled/dbtlearn/tests/no_nulls_in_dim_listings.sql: -------------------------------------------------------------------------------- 1 | -- Tests the `no_nulls_in_columns` macro on the `dim_listings_cleansed` table 2 | -- Run: dbt test --select dim_listings_cleansed 3 | 4 | 5 | 6 | SELECT * 7 | FROM airbnb.dev.dim_listings_cleansed 8 | WHERE 9 | -- Interested in columns where column name is null OR... iterates to next column 10 | LISTING_ID IS NULL OR 11 | -- Interested in columns where column name is null OR... 
iterates to next column 12 | LISTING_NAME IS NULL OR 13 | -- Interested in columns where column name is null OR... iterates to next column 14 | ROOM_TYPE IS NULL OR 15 | -- Interested in columns where column name is null OR... iterates to next column 16 | MINIMUM_NIGHTS IS NULL OR 17 | -- Interested in columns where column name is null OR... iterates to next column 18 | HOST_ID IS NULL OR 19 | -- Interested in columns where column name is null OR... iterates to next column 20 | PRICE IS NULL OR 21 | -- Interested in columns where column name is null OR... iterates to next column 22 | CREATED_AT IS NULL OR 23 | -- Interested in columns where column name is null OR... iterates to next column 24 | UPDATED_AT IS NULL OR 25 | 26 | FALSE -- To terminate the iteration 27 | -------------------------------------------------------------------------------- /Airbnb Project/target/graph.gpickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/katiehuangx/data-engineering/aace791c2df1ba2899624d7d51f3d5606e90d79d/Airbnb Project/target/graph.gpickle -------------------------------------------------------------------------------- /Airbnb Project/target/partial_parse.msgpack: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/katiehuangx/data-engineering/aace791c2df1ba2899624d7d51f3d5606e90d79d/Airbnb Project/target/partial_parse.msgpack -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/dim/dim_hosts_cleansed.sql: -------------------------------------------------------------------------------- 1 | 2 | create or replace view airbnb.dev.dim_hosts_cleansed 3 | 4 | as ( 5 | 6 | 7 | WITH __dbt__cte__src_hosts as ( 8 | WITH raw_hosts AS ( 9 | SELECT * FROM airbnb.raw.raw_hosts 10 | ) 11 | 12 | SELECT 13 | id AS host_id, 14 | name AS host_name, 15 | is_superhost, 16 | created_at, 17 | 
updated_at 18 | FROM raw_hosts 19 | ),src_hosts AS ( 20 | SELECT * FROM __dbt__cte__src_hosts 21 | ) 22 | 23 | SELECT 24 | host_id, 25 | NVL(host_name, 'Anonymous') AS host_name, 26 | is_superhost, 27 | created_at, 28 | updated_at 29 | FROM src_hosts 30 | ); 31 | 32 | -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/dim/dim_listings_cleansed.sql: -------------------------------------------------------------------------------- 1 | 2 | create or replace view airbnb.dev.dim_listings_cleansed 3 | 4 | as ( 5 | 6 | 7 | 8 | WITH __dbt__cte__src_listings as ( 9 | WITH raw_listings AS ( 10 | SELECT * FROM airbnb.raw.raw_listings 11 | ) 12 | 13 | SELECT 14 | id AS listing_id, 15 | name AS listing_name, 16 | listing_url, 17 | room_type, 18 | minimum_nights, 19 | host_id, 20 | price AS price_str, 21 | created_at, 22 | updated_at 23 | FROM 24 | raw_listings 25 | ),src_listings AS ( 26 | SELECT * FROM __dbt__cte__src_listings 27 | ) 28 | 29 | SELECT 30 | listing_id, 31 | listing_name, 32 | room_type, 33 | CASE 34 | WHEN minimum_nights = 0 THEN 1 -- Treat a 0-night minimum as a 1-night minimum 35 | ELSE minimum_nights 36 | END AS minimum_nights, 37 | host_id, 38 | REPLACE( -- Parse the string price into numerical form 39 | price_str, '$' -- Remove '$' from price_str (two-argument REPLACE deletes the matched string) 
40 | ) :: NUMBER (10, 2 -- Convert string type to numerical with 2 decimal places 41 | ) AS price, 42 | created_at, 43 | updated_at 44 | 45 | FROM src_listings 46 | ); 47 | 48 | -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/dim/dim_listings_w_hosts.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | create or replace transient table airbnb.dev.dim_listings_w_hosts as 6 | (WITH 7 | listings AS ( 8 | SELECT * FROM airbnb.dev.dim_listings_cleansed 9 | ), 10 | hosts AS ( 11 | SELECT * FROM airbnb.dev.dim_hosts_cleansed 12 | ) 13 | 14 | SELECT 15 | listings.listing_id, 16 | listings.listing_name, 17 | listings.room_type, 18 | listings.minimum_nights, 19 | listings.price, 20 | listings.host_id, 21 | hosts.host_name, 22 | hosts.is_superhost AS host_is_superhost, 23 | listings.created_at, 24 | GREATEST(listings.updated_at, hosts.updated_at) AS updated_at -- Keep most recent updated_at 25 | FROM listings 26 | LEFT JOIN hosts 27 | ON listings.host_id = hosts.host_id 28 | ); 29 | 30 | -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/fct/fct_reviews.sql: -------------------------------------------------------------------------------- 1 | -- back compat for old kwarg name 2 | 3 | begin; 4 | 5 | 6 | insert into airbnb.dev.fct_reviews ("REVIEW_ID", "LISTING_ID", "REVIEW_DATE", "REVIEWER_NAME", "REVIEW_TEXT", "REVIEW_SENTIMENT") 7 | ( 8 | select "REVIEW_ID", "LISTING_ID", "REVIEW_DATE", "REVIEWER_NAME", "REVIEW_TEXT", "REVIEW_SENTIMENT" 9 | from airbnb.dev.fct_reviews__dbt_tmp 10 | ); 11 | commit; -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/mart/mart_fullmoon_reviews.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | create or replace transient 
table airbnb.dev.mart_fullmoon_reviews as 6 | ( 7 | 8 | WITH fct_reviews AS ( 9 | 10 | SELECT * FROM airbnb.dev.fct_reviews 11 | 12 | ), 13 | full_moon_dates AS ( 14 | 15 | SELECT * FROM airbnb.dev.seed_full_moon_dates 16 | ) 17 | 18 | SELECT 19 | reviews.*, 20 | CASE 21 | WHEN fullmoon.full_moon_date IS NULL THEN 'not full moon' 22 | ELSE 'full moon' END AS is_full_moon 23 | FROM fct_reviews AS reviews 24 | LEFT JOIN full_moon_dates AS fullmoon 25 | ON (TO_DATE(reviews.review_date) = DATEADD(DAY, 1, fullmoon.full_moon_date)) 26 | ); 27 | 28 | -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/accepted_values_dim_listings_c_1ca6148a08c62a5218f2a162f9d2a9a6.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | 9 | 10 | with all_values as ( 11 | 12 | select 13 | room_type as value_field, 14 | count(*) as n_records 15 | 16 | from airbnb.dev.dim_listings_cleansed 17 | group by room_type 18 | 19 | ) 20 | 21 | select * 22 | from all_values 23 | where value_field not in ( 24 | 'Entire home/apt','Private room','Shared room','Hotel room' 25 | ) 26 | 27 | 28 | 29 | 30 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/accepted_values_dim_listings_c_9fcd3cfe888517e67ec95c75be12c62a.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | 9 | 10 | with all_values as ( 11 | 12 | select 13 | room_type as value_field, 14 | count(*) as n_records 15 | 16 | from airbnb.dev.dim_listings_cleansed 17 | group by room_type 18 | 19 | ) 20 | 21 | select * 22 | from all_values 23 | where 
value_field not in ( 24 | 'Entire room/apt','Private room','Shared room','Hotel room' 25 | ) 26 | 27 | 28 | 29 | 30 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/dbt_expectations_expect_column_07e7a515218ef6e3a17e164c642c7d18.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | with relation_columns as ( 7 | 8 | 9 | select 10 | cast('LISTING_ID' as TEXT) as relation_column, 11 | cast('NUMBER' as TEXT) as relation_column_type 12 | union all 13 | 14 | select 15 | cast('LISTING_NAME' as TEXT) as relation_column, 16 | cast('VARCHAR' as TEXT) as relation_column_type 17 | union all 18 | 19 | select 20 | cast('ROOM_TYPE' as TEXT) as relation_column, 21 | cast('VARCHAR' as TEXT) as relation_column_type 22 | union all 23 | 24 | select 25 | cast('MINIMUM_NIGHTS' as TEXT) as relation_column, 26 | cast('NUMBER' as TEXT) as relation_column_type 27 | union all 28 | 29 | select 30 | cast('PRICE' as TEXT) as relation_column, 31 | cast('NUMBER' as TEXT) as relation_column_type 32 | union all 33 | 34 | select 35 | cast('HOST_ID' as TEXT) as relation_column, 36 | cast('NUMBER' as TEXT) as relation_column_type 37 | union all 38 | 39 | select 40 | cast('HOST_NAME' as TEXT) as relation_column, 41 | cast('VARCHAR' as TEXT) as relation_column_type 42 | union all 43 | 44 | select 45 | cast('HOST_IS_SUPERHOST' as TEXT) as relation_column, 46 | cast('VARCHAR' as TEXT) as relation_column_type 47 | union all 48 | 49 | select 50 | cast('CREATED_AT' as TEXT) as relation_column, 51 | cast('TIMESTAMP_NTZ' as TEXT) as relation_column_type 52 | union all 53 | 54 | select 55 | cast('UPDATED_AT' as TEXT) as relation_column, 56 | cast('TIMESTAMP_NTZ' as TEXT) as relation_column_type 57 | 58 | 59 | ), 60 | test_data as ( 61 | 62 | select 63 | * 
64 | from 65 | relation_columns 66 | where 67 | relation_column = 'PRICE' 68 | and 69 | relation_column_type not in ('NUMBER') 70 | 71 | ) 72 | select * 73 | from test_data 74 | 75 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/dbt_expectations_expect_column_39596d790161761077ff1592b68943f6.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | 9 | 10 | 11 | 12 | with grouped_expression as ( 13 | select 14 | 15 | 16 | 17 | 18 | ( 1=1 and percentile_cont(0.99) within group (order by price) >= 50 and percentile_cont(0.99) within group (order by price) <= 500 19 | ) 20 | as expression 21 | 22 | 23 | from airbnb.dev.dim_listings_w_hosts 24 | 25 | 26 | ), 27 | validation_errors as ( 28 | 29 | select 30 | * 31 | from 32 | grouped_expression 33 | where 34 | not(expression = true) 35 | 36 | ) 37 | 38 | select * 39 | from validation_errors 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/dbt_expectations_expect_column_68f998c0da11ce3e6e806b41ab34f533.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | 9 | 10 | 11 | 12 | with grouped_expression as ( 13 | select 14 | 15 | 16 | 17 | 18 | ( 1=1 and percentile_cont(0.99) within group (order by price) >= 50 and percentile_cont(0.99) within group (order by price) <= 100 19 | ) 20 | as expression 21 | 22 | 23 | from airbnb.dev.dim_listings_w_hosts 24 | 25 | 26 | ), 27 | validation_errors as ( 28 | 29 | select 30 | * 31 | from 32 | grouped_expression 33 | where 34 | 
not(expression = true) 35 | 36 | ) 37 | 38 | select * 39 | from validation_errors 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/dbt_expectations_expect_column_8e138814a11b6202811546795bffca5d.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | with relation_columns as ( 7 | 8 | 9 | select 10 | cast('LISTING_ID' as TEXT) as relation_column, 11 | cast('NUMBER' as TEXT) as relation_column_type 12 | union all 13 | 14 | select 15 | cast('LISTING_NAME' as TEXT) as relation_column, 16 | cast('VARCHAR' as TEXT) as relation_column_type 17 | union all 18 | 19 | select 20 | cast('ROOM_TYPE' as TEXT) as relation_column, 21 | cast('VARCHAR' as TEXT) as relation_column_type 22 | union all 23 | 24 | select 25 | cast('MINIMUM_NIGHTS' as TEXT) as relation_column, 26 | cast('NUMBER' as TEXT) as relation_column_type 27 | union all 28 | 29 | select 30 | cast('PRICE' as TEXT) as relation_column, 31 | cast('NUMBER' as TEXT) as relation_column_type 32 | union all 33 | 34 | select 35 | cast('HOST_ID' as TEXT) as relation_column, 36 | cast('NUMBER' as TEXT) as relation_column_type 37 | union all 38 | 39 | select 40 | cast('HOST_NAME' as TEXT) as relation_column, 41 | cast('VARCHAR' as TEXT) as relation_column_type 42 | union all 43 | 44 | select 45 | cast('HOST_IS_SUPERHOST' as TEXT) as relation_column, 46 | cast('VARCHAR' as TEXT) as relation_column_type 47 | union all 48 | 49 | select 50 | cast('CREATED_AT' as TEXT) as relation_column, 51 | cast('TIMESTAMP_NTZ' as TEXT) as relation_column_type 52 | union all 53 | 54 | select 55 | cast('UPDATED_AT' as TEXT) as relation_column, 56 | cast('TIMESTAMP_NTZ' as TEXT) as relation_column_type 57 | 58 | 59 | ), 60 | test_data as ( 61 | 62 | select 63 
| * 64 | from 65 | relation_columns 66 | where 67 | relation_column = 'PRICE' 68 | and 69 | relation_column_type not in ('NUMBER(10,2)') 70 | 71 | ) 72 | select * 73 | from test_data 74 | 75 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/dbt_expectations_expect_column_c59e300e0dddb335c4211147100ac1c6.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | 9 | 10 | 11 | with grouped_expression as ( 12 | select 13 | 14 | 15 | 16 | 17 | ( 1=1 and max(price) <= 5000 18 | ) 19 | as expression 20 | 21 | 22 | from airbnb.dev.dim_listings_w_hosts 23 | 24 | 25 | ), 26 | validation_errors as ( 27 | 28 | select 29 | * 30 | from 31 | grouped_expression 32 | where 33 | not(expression = true) 34 | 35 | ) 36 | 37 | select * 38 | from validation_errors 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/dbt_expectations_expect_table__fbda7436ebe2ffe341acf0622c76d629.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | with a as ( 8 | 9 | select 10 | 11 | count(*) as expression 12 | from 13 | airbnb.dev.dim_listings_w_hosts 14 | 15 | 16 | ), 17 | b as ( 18 | 19 | select 20 | 21 | count(*) * 1 as expression 22 | from 23 | airbnb.raw.raw_listings 24 | 25 | 26 | ), 27 | final as ( 28 | 29 | select 30 | 31 | a.expression, 32 | b.expression as compare_expression, 33 | abs(coalesce(a.expression, 0) - coalesce(b.expression, 0)) as expression_difference, 34 | abs(coalesce(a.expression, 0) - coalesce(b.expression, 0))/ 35 | 
nullif(a.expression * 1.0, 0) as expression_difference_percent 36 | from 37 | 38 | a cross join b 39 | 40 | ) 41 | -- DEBUG: 42 | -- select * from final 43 | select 44 | * 45 | from final 46 | where 47 | 48 | expression_difference > 0.0 49 | 50 | 51 | 52 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/not_null_dim_listings_cleansed_host_id.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | 9 | 10 | 11 | 12 | select host_id 13 | from airbnb.dev.dim_listings_cleansed 14 | where host_id is null 15 | 16 | 17 | 18 | 19 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/not_null_dim_listings_cleansed_listing_id.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | 9 | 10 | 11 | 12 | select listing_id 13 | from airbnb.dev.dim_listings_cleansed 14 | where listing_id is null 15 | 16 | 17 | 18 | 19 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/positive_value_dim_listings_cleansed_minimum_nights.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | SELECT * FROM airbnb.dev.dim_listings_cleansed 9 | WHERE minimum_nights < 1 10 | 11 | 12 | 13 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb 
Project/target/run/dbtlearn/models/schema.yml/relationships_dim_listings_cle_05e2397b186a7b9306fc747b3cc4ef83.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | 9 | 10 | with child as ( 11 | select host_id as from_field 12 | from airbnb.dev.dim_listings_cleansed 13 | where host_id is not null 14 | ), 15 | 16 | parent as ( 17 | select host_id as to_field 18 | from airbnb.dev.dim_hosts_cleansed 19 | ) 20 | 21 | select 22 | from_field 23 | 24 | from child 25 | left join parent 26 | on child.from_field = parent.to_field 27 | 28 | where parent.to_field is null 29 | 30 | 31 | 32 | 33 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/schema.yml/unique_dim_listings_cleansed_listing_id.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | 9 | 10 | select 11 | listing_id as unique_field, 12 | count(*) as n_records 13 | 14 | from airbnb.dev.dim_listings_cleansed 15 | where listing_id is not null 16 | group by listing_id 17 | having count(*) > 1 18 | 19 | 20 | 21 | 22 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/sources.yml/dbt_expectations_source_expect_a60b59a84fbc4577a11df360c50013bb.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | 9 | 10 | 11 | with grouped_expression as ( 12 | select 13 | 14 | 15 | 16 | 17 | 18 | 19 | regexp_instr(price, '^\\$[0-9][0-9\\.]+$', 1, 1) 20 | 21 | 22 | > 0 23 | as 
expression 24 | 25 | 26 | from airbnb.raw.raw_listings 27 | 28 | 29 | ), 30 | validation_errors as ( 31 | 32 | select 33 | * 34 | from 35 | grouped_expression 36 | where 37 | not(expression = true) 38 | 39 | ) 40 | 41 | select * 42 | from validation_errors 43 | 44 | 45 | 46 | 47 | 48 | 49 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/sources.yml/dbt_expectations_source_expect_d9770018e28873e7be74335902d9e4e5.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | 7 | 8 | 9 | with grouped_expression as ( 10 | select 11 | 12 | 13 | 14 | 15 | count(distinct room_type) = 4 16 | as expression 17 | 18 | 19 | from airbnb.raw.raw_listings 20 | 21 | 22 | ), 23 | validation_errors as ( 24 | 25 | select 26 | * 27 | from 28 | grouped_expression 29 | where 30 | not(expression = true) 31 | 32 | ) 33 | 34 | select * 35 | from validation_errors 36 | 37 | 38 | 39 | 40 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/src/src_hosts.sql: -------------------------------------------------------------------------------- 1 | 2 | create or replace view airbnb.dev.src_hosts 3 | 4 | as ( 5 | WITH raw_hosts AS ( 6 | SELECT * FROM AIRBNB.RAW.RAW_HOSTS 7 | ) 8 | 9 | SELECT 10 | id AS host_id, 11 | name AS host_name, 12 | is_superhost, 13 | created_at, 14 | updated_at 15 | FROM raw_hosts 16 | ); 17 | 18 | -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/src/src_listings.sql: -------------------------------------------------------------------------------- 1 | 2 | create or replace view airbnb.dev.src_listings 3 | 4 | as ( 5 | WITH raw_listings AS ( 6 | SELECT * FROM 
AIRBNB.RAW.RAW_LISTINGS 7 | ) 8 | 9 | SELECT 10 | id AS listing_id, 11 | name AS listing_name, 12 | listing_url, 13 | room_type, 14 | minimum_nights, 15 | host_id, 16 | price AS price_str, 17 | created_at, 18 | updated_at 19 | FROM 20 | raw_listings 21 | ); 22 | 23 | -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/models/src/src_reviews.sql: -------------------------------------------------------------------------------- 1 | 2 | create or replace view airbnb.dev.src_reviews 3 | 4 | as ( 5 | WITH raw_reviews AS ( 6 | SELECT * FROM AIRBNB.RAW.RAW_REVIEWS 7 | ) 8 | 9 | SELECT 10 | listing_id, 11 | date AS review_date, 12 | reviewer_name, 13 | comments AS review_text, 14 | sentiment AS review_sentiment 15 | FROM raw_reviews 16 | ); 17 | 18 | -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/seeds/seed_full_moon_dates.csv: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | create table airbnb.dev.seed_full_moon_dates (full_moon_date date) 6 | ; 7 | -- dbt seed -- 8 | 9 | insert into airbnb.dev.seed_full_moon_dates (full_moon_date) values 10 | 
(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s),(%s) 11 | 12 | 13 | ; 14 | -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/snapshots/scd_raw_listings.sql: -------------------------------------------------------------------------------- 1 | 2 | begin; 3 | merge into "AIRBNB"."DEV"."SCD_RAW_LISTINGS" as DBT_INTERNAL_DEST 4 | using "AIRBNB"."DEV"."SCD_RAW_LISTINGS__dbt_tmp" as DBT_INTERNAL_SOURCE 5 | on DBT_INTERNAL_SOURCE.dbt_scd_id = DBT_INTERNAL_DEST.dbt_scd_id 6 | 7 | when matched 8 | and DBT_INTERNAL_DEST.dbt_valid_to is null 9 | and DBT_INTERNAL_SOURCE.dbt_change_type in ('update', 'delete') 10 | then update 11 | 
set dbt_valid_to = DBT_INTERNAL_SOURCE.dbt_valid_to 12 | 13 | when not matched 14 | and DBT_INTERNAL_SOURCE.dbt_change_type = 'insert' 15 | then insert ("ID", "LISTING_URL", "NAME", "ROOM_TYPE", "MINIMUM_NIGHTS", "HOST_ID", "PRICE", "CREATED_AT", "UPDATED_AT", "DBT_UPDATED_AT", "DBT_VALID_FROM", "DBT_VALID_TO", "DBT_SCD_ID") 16 | values ("ID", "LISTING_URL", "NAME", "ROOM_TYPE", "MINIMUM_NIGHTS", "HOST_ID", "PRICE", "CREATED_AT", "UPDATED_AT", "DBT_UPDATED_AT", "DBT_VALID_FROM", "DBT_VALID_TO", "DBT_SCD_ID") 17 | 18 | ; 19 | commit; 20 | -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/tests/consistent_created_at.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | -- Checks that there is no review date that is submitted before its listing was created 7 | 8 | SELECT * 9 | FROM airbnb.dev.dim_listings_cleansed AS listings 10 | INNER JOIN airbnb.dev.fct_reviews AS reviews 11 | USING (listing_id) 12 | WHERE reviews.review_date <= listings.created_at 13 | 14 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/tests/dim_listings_minimum_nights.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | SELECT * 7 | FROM airbnb.dev.dim_listings_cleansed 8 | WHERE minimum_nights < 1 9 | LIMIT 10 10 | 11 | -- Query is written in such a way that the results returned must be zero to PASS the test. 
12 | 13 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run/dbtlearn/tests/no_nulls_in_dim_listings.sql: -------------------------------------------------------------------------------- 1 | select 2 | count(*) as failures, 3 | count(*) != 0 as should_warn, 4 | count(*) != 0 as should_error 5 | from ( 6 | -- Testing `no_nulls_in_columns` macro on `dim_listings_cleansed table` 7 | 8 | 9 | 10 | SELECT * 11 | FROM airbnb.dev.dim_listings_cleansed 12 | WHERE 13 | -- Interested in columns where column name is null OR... iterates to next column 14 | LISTING_ID IS NULL OR 15 | -- Interested in columns where column name is null OR... iterates to next column 16 | LISTING_NAME IS NULL OR 17 | -- Interested in columns where column name is null OR... iterates to next column 18 | ROOM_TYPE IS NULL OR 19 | -- Interested in columns where column name is null OR... iterates to next column 20 | MINIMUM_NIGHTS IS NULL OR 21 | -- Interested in columns where column name is null OR... iterates to next column 22 | HOST_ID IS NULL OR 23 | -- Interested in columns where column name is null OR... iterates to next column 24 | PRICE IS NULL OR 25 | -- Interested in columns where column name is null OR... iterates to next column 26 | CREATED_AT IS NULL OR 27 | -- Interested in columns where column name is null OR... 
iterates to next column 28 | UPDATED_AT IS NULL OR 29 | 30 | FALSE -- To terminate the iteration 31 | 32 | 33 | ) dbt_internal_test -------------------------------------------------------------------------------- /Airbnb Project/target/run_results.json: -------------------------------------------------------------------------------- 1 | {"metadata": {"dbt_schema_version": "https://schemas.getdbt.com/dbt/run-results/v4.json", "dbt_version": "1.4.3", "generated_at": "2023-03-08T02:57:25.654975Z", "invocation_id": "9d4b08ad-049d-449c-8cab-3957edae46e9", "env": {}}, "results": [{"status": "pass", "timing": [{"name": "compile", "started_at": "2023-03-08T02:57:22.600469Z", "completed_at": "2023-03-08T02:57:22.614499Z"}, {"name": "execute", "started_at": "2023-03-08T02:57:22.615806Z", "completed_at": "2023-03-08T02:57:24.937562Z"}], "thread_id": "Thread-2", "execution_time": 2.9193646907806396, "adapter_response": {}, "message": null, "failures": 0, "unique_id": "test.dbtlearn.dbt_expectations_source_expect_column_distinct_count_to_equal_airbnb_listings_room_type__4.1e56c20a65"}, {"status": "pass", "timing": [{"name": "compile", "started_at": "2023-03-08T02:57:22.589982Z", "completed_at": "2023-03-08T02:57:22.614768Z"}, {"name": "execute", "started_at": "2023-03-08T02:57:22.628354Z", "completed_at": "2023-03-08T02:57:24.980654Z"}], "thread_id": "Thread-1", "execution_time": 2.988661050796509, "adapter_response": {}, "message": null, "failures": 0, "unique_id": "test.dbtlearn.dbt_expectations_expect_table_row_count_to_equal_other_table_dim_listings_w_hosts_source_airbnb_listings_.637b6229da"}, {"status": "pass", "timing": [{"name": "compile", "started_at": "2023-03-08T02:57:22.607203Z", "completed_at": "2023-03-08T02:57:22.614970Z"}, {"name": "execute", "started_at": "2023-03-08T02:57:22.653828Z", "completed_at": "2023-03-08T02:57:25.030154Z"}], "thread_id": "Thread-3", "execution_time": 3.0564730167388916, "adapter_response": {}, "message": null, "failures": 0, 
"unique_id": "test.dbtlearn.dbt_expectations_source_expect_column_values_to_match_regex_airbnb_listings_price___0_9_0_9_.09375076a9"}], "elapsed_time": 7.358556747436523, "args": {"write_json": true, "use_colors": true, "printer_width": 80, "version_check": true, "partial_parse": true, "static_parser": true, "profiles_dir": "/Users/katiehuang/.dbt", "send_anonymous_usage_stats": true, "quiet": false, "no_print": false, "cache_selected_only": false, "indirect_selection": "eager", "select": ["source:airbnb.listings"], "which": "test", "rpc_method": "test"}} -------------------------------------------------------------------------------- /Airbnb Project/target/sources.json: -------------------------------------------------------------------------------- 1 | {"metadata": {"dbt_schema_version": "https://schemas.getdbt.com/dbt/sources/v3.json", "dbt_version": "1.4.3", "generated_at": "2023-02-27T09:08:06.458531Z", "invocation_id": "7ca2cbc9-0f34-47e9-bbc5-411d4ea2403a", "env": {}}, "results": [{"unique_id": "source.dbtlearn.airbnb.reviews", "max_loaded_at": "2021-10-22T00:00:00+00:00", "snapshotted_at": "2023-02-27T09:08:04.828000+00:00", "max_loaded_at_time_ago_in_s": 42628084.828, "status": "error", "criteria": {"warn_after": {"count": 1, "period": "hour"}, "error_after": {"count": 24, "period": "hour"}, "filter": null}, "adapter_response": {}, "timing": [{"name": "compile", "started_at": "2023-02-27T09:08:02.780437Z", "completed_at": "2023-02-27T09:08:02.780445Z"}, {"name": "execute", "started_at": "2023-02-27T09:08:02.781093Z", "completed_at": "2023-02-27T09:08:06.450707Z"}], "thread_id": "Thread-1", "execution_time": 3.6726982593536377}], "elapsed_time": 8.946863889694214} -------------------------------------------------------------------------------- /Airbnb Project/tests/consistent_created_at.sql: -------------------------------------------------------------------------------- 1 | -- Checks that there is no review date that is submitted before its listing was 
created 2 | 3 | SELECT * 4 | FROM {{ ref('dim_listings_cleansed') }} AS listings 5 | INNER JOIN {{ ref('fct_reviews') }} AS reviews 6 | USING (listing_id) 7 | WHERE reviews.review_date <= listings.created_at 8 | 9 | -------------------------------------------------------------------------------- /Airbnb Project/tests/dim_listings_minimum_nights.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM {{ ref('dim_listings_cleansed') }} 3 | WHERE minimum_nights < 1 4 | LIMIT 10 5 | 6 | -- Query is written in such a way that the results returned must be zero to PASS the test. -------------------------------------------------------------------------------- /Airbnb Project/tests/no_nulls_in_dim_listings.sql: -------------------------------------------------------------------------------- 1 | -- Testing `no_nulls_in_columns` macro on the `dim_listings_cleansed` table 2 | -- Run: dbt test --select dim_listings_cleansed 3 | 4 | {{ no_nulls_in_columns(ref('dim_listings_cleansed')) }} 5 | 6 | -------------------------------------------------------------------------------- /Dagster/README.md: -------------------------------------------------------------------------------- 1 | # Learn Dagster 2 | 3 | Create and activate a Python environment 4 | 5 | ```bash 6 | # Create environment 7 | python3 -m venv dagster_env 8 | # Activate environment 9 | source dagster_env/bin/activate 10 | ``` 11 | 12 | 13 | -------------------------------------------------------------------------------- /Dog Adoption/Datasets/location.csv: -------------------------------------------------------------------------------- 1 | index,location,exported,imported,total,inUS 2 | 0,Texas,635,,566,TRUE 3 | 1,Alabama,268,2,1428,TRUE 4 | 2,North Carolina,158,14,2627,TRUE 5 | 3,South Carolina,139,12,1618,TRUE 6 | 4,Georgia,137,19,3479,TRUE 7 | 5,Puerto Rico,131,,,FALSE 8 | 6,California,130,3,1664,TRUE 9 | 7,South Korea,76,,,FALSE 10 | 8,Tennessee,66,20,1769,TRUE 11 | 9,Kentucky,57,4,1123,TRUE 12 | 10,Mississippi,55,,510,TRUE 13 | 
11,Mexico,54,,,FALSE 14 | 12,Louisiana,53,,912,TRUE 15 | 13,West Virginia,50,,565,TRUE 16 | 14,Florida,49,32,2659,TRUE 17 | 15,Arkansas,47,1,695,TRUE 18 | 16,China,28,,,FALSE 19 | 17,New Mexico,27,,636,TRUE 20 | 18,Thailand,20,,,FALSE 21 | 19,Virginia,20,183,3058,TRUE 22 | 20,India,18,,,FALSE 23 | 21,Taiwan,18,,,FALSE 24 | 22,New York,17,390,4002,TRUE 25 | 23,Egypt,15,,,FALSE 26 | 24,Arizona,14,14,2248,TRUE 27 | 25,Bahamas,13,,,FALSE 28 | 26,Indiana,10,20,1877,TRUE 29 | 27,Missouri,9,4,920,TRUE 30 | 28,Ohio,9,33,2670,TRUE 31 | 29,Oklahoma,9,5,1636,TRUE 32 | 30,Pennsylvania,9,228,2821,TRUE 33 | 31,Spain,8,,,FALSE 34 | 32,Haiti,7,,,FALSE 35 | 33,New Jersey,7,270,3022,TRUE 36 | 34,Oman,7,,,FALSE 37 | 35,Illinois,6,8,1114,TRUE 38 | 36,Kuwait,5,,,FALSE 39 | 37,Maryland,4,123,1493,TRUE 40 | 38,Nebraska,4,,120,TRUE 41 | 39,Qatar,4,,,FALSE 42 | 40,Utah,4,56,485,TRUE 43 | 41,Connecticut,3,51,1422,TRUE 44 | 42,Costa Rica,3,,,FALSE 45 | 43,Greece,3,,,FALSE 46 | 44,Idaho,3,,49,TRUE 47 | 45,Kansas,3,1,470,TRUE 48 | 46,Russia,3,,,FALSE 49 | 47,Wisconsin,3,74,542,TRUE 50 | 48,Afghanistan,2,,,FALSE 51 | 49,Colorado,2,71,1773,TRUE 52 | 50,England,2,,,FALSE 53 | 51,Hawaii,2,,69,TRUE 54 | 52,Iowa,2,14,485,TRUE 55 | 53,Massachusetts,2,77,946,TRUE 56 | 54,North Dakota,2,,64,TRUE 57 | 55,South Dakota,2,,24,TRUE 58 | 56,Washington,2,334,1277,TRUE 59 | 57,Aruba,1,,,FALSE 60 | 58,Azerbaijan,1,,,FALSE 61 | 59,Bahrain,1,,,FALSE 62 | 60,Bosnia,1,,,FALSE 63 | 61,British Virgin Islands,1,,,FALSE 64 | 62,Canada,1,,,FALSE 65 | 63,Cayman Islands,1,,,FALSE 66 | 64,Finland,1,,,FALSE 67 | 65,Honduras,1,,,FALSE 68 | 66,Indianapolis,1,,,FALSE 69 | 67,Iran,1,,,FALSE 70 | 68,Ireland,1,,,FALSE 71 | 69,Maine,1,87,545,TRUE 72 | 70,Nevada,1,10,857,TRUE 73 | 71,Nevis,1,,,FALSE 74 | 72,Oregon,1,16,91,TRUE 75 | 73,Saudi Arabia,1,,,FALSE 76 | 74,St. Croix,1,,,FALSE 77 | 75,St. Maarten,1,,,FALSE 78 | 76,St. Simon,1,,,FALSE 79 | 77,St. 
Thomas,1,,,FALSE 80 | 78,Turkey,1,,,FALSE 81 | 79,United Arab Emirates,1,,,FALSE 82 | 80,Wyoming,1,,52,TRUE 83 | 81,Rhode Island,,87,607,TRUE 84 | 82,Minnesota,,67,958,TRUE 85 | 83,Vermont,,45,510,TRUE 86 | 84,New Hampshire,,33,335,TRUE 87 | 85,Delaware,,29,296,TRUE 88 | 86,Michigan,,12,673,TRUE 89 | 87,Washington DC,,11,336,TRUE 90 | 88,Montana,,,18,TRUE 91 | 89,Alaska,,,15,TRUE -------------------------------------------------------------------------------- /Dog Adoption/README.md: -------------------------------------------------------------------------------- 1 | # 🐶 Dog Adoption Project 2 | 3 | ## Objective 4 | 5 | This project builds data models for dog adoption data and deploys them to a database. 6 | 7 | ## Technologies 8 | - Database: PostgreSQL 9 | - Language: Python 10 | 11 | ## Data Architecture 12 | 13 | *(architecture diagram)* 14 | 15 | 1. Data is extracted from [Kaggle](https://www.kaggle.com/datasets/whenamancodes/dog-adoption) and saved locally. 16 | 2. The [dog_adoption.ipynb script](https://github.com/katiehuangx/data-engineering/blob/main/Dog%20Adoption/dog_adoption_clean.ipynb) opens a connection to PostgreSQL and creates the database with autocommit enabled. 17 | 3. Tables are created with the appropriate columns and data types. 18 | 4. Data is loaded into the newly created tables. 
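The four-step flow above amounts to: connect with psycopg2, create the database, build `CREATE TABLE` / `INSERT` statements, and execute them. A minimal sketch of that flow follows — the `build_statements` helper and the inline sample rows are illustrative (not from the notebook), and since the psycopg2 calls need a live server with real credentials, they are left commented out:

```python
import csv
from io import StringIO

def build_statements(table, header):
    # Derive a CREATE TABLE and a parameterised INSERT from a CSV header.
    # Everything is typed VARCHAR here for simplicity; the notebook picks
    # column types by hand.
    cols = ", ".join(f"{c} VARCHAR" for c in header)
    create = f"CREATE TABLE IF NOT EXISTS {table} ({cols})"
    placeholders = ", ".join(["%s"] * len(header))
    insert = f"INSERT INTO {table} ({', '.join(header)}) VALUES ({placeholders})"
    return create, insert

# Tiny stand-in for Datasets/location.csv
sample = "index,location,exported,imported,total,inUS\n0,Texas,635,,566,TRUE\n"
rows = list(csv.reader(StringIO(sample)))
create_sql, insert_sql = build_statements("location", rows[0])

# With a running PostgreSQL, the notebook-style load would be roughly:
# import psycopg2
# conn = psycopg2.connect("host=127.0.0.1 dbname=dog_adoption user=postgres password=...")
# conn.set_session(autocommit=True)
# cur = conn.cursor()
# cur.execute(create_sql)
# for row in rows[1:]:
#     cur.execute(insert_sql, row)

print(create_sql)
print(insert_sql)
```

Pointing `csv.reader` at the real `Datasets/location.csv` and uncommenting the psycopg2 block reproduces steps 2–4 of the pipeline.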
19 | 20 | 21 | -------------------------------------------------------------------------------- /Dog Adoption/dog_adoption_clean.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dog Adoption: Build a data model and deploy it to a database with Python and PostgreSQL" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Requirement already satisfied: psycopg2-binary in /opt/anaconda3/lib/python3.8/site-packages (2.9.5)\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "# Import library\n", 25 | "!pip3 install psycopg2-binary\n", 26 | "import psycopg2\n", 27 | "import pandas as pd" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Create function to create database\n", 37 | "\n", 38 | "def create_database():\n", 39 | " # Connect to database\n", 40 | " conn = psycopg2.connect(\"host=127.0.0.1 dbname=postgres user=postgres password=********\")\n", 41 | " # Use connection to get cursor to execute queries\n", 42 | " cur = conn.cursor()\n", 43 | " # Set automatic commit\n", 44 | " conn.set_session(autocommit=True)\n", 45 | " \n", 46 | " # Create database\n", 47 | " cur.execute(\"DROP DATABASE IF EXISTS dog_adoption\")\n", 48 | " cur.execute(\"CREATE DATABASE dog_adoption\")\n", 49 | " \n", 50 | " # Close connection to default database\n", 51 | " conn.close()\n", 52 | " \n", 53 | " # Connect to newly created database\n", 54 | " conn = psycopg2.connect(\"host=127.0.0.1 dbname=dog_adoption user=postgres password=********\")\n", 55 | " # Use connection to get cursor to execute queries\n", 56 | " cur = conn.cursor()\n", 57 | " # Set automatic commit\n", 58 | " conn.set_session(autocommit=True)\n", 59 | " \n", 60
| " return cur, conn" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# Create function to drop tables\n", 70 | "\n", 71 | "def drop_tables(cur, conn):\n", 72 | " for query in drop_table_queries:\n", 73 | " cur.execute(query)\n", 74 | " conn.commit()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# Create function to create tables\n", 84 | "\n", 85 | "def create_table(cur, conn):\n", 86 | " for query in create_table_queries:\n", 87 | " cur.execute(query)\n", 88 | " conn.commit()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "### Read CSV files" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/html": [ 106 | "
\n", 107 | "\n", 120 | "\n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | "
indexidorg_idurlbreed_primarybreed_secondarycolor_primarycolor_secondarycolor_tertiaryage...sizecoatnamestatuspostedcontact_citycontact_statecontact_zipcontact_countrystateQ
0046042150NV163https://www.petfinder.com/dog/harley-46042150/...American Staffordshire TerrierMixed BreedWhite / CreamYellow / Tan / Blond / FawnNaNSenior...MediumShortHARLEYadoptable2019-09-20T16:37:59+0000Las VegasNV89147.0US89009
1146042002NV163https://www.petfinder.com/dog/biggie-46042002/...Pit Bull TerrierMixed BreedBrown / ChocolateWhite / CreamNaNAdult...LargeShortBIGGIEadoptable2019-09-20T16:24:57+0000Las VegasNV89147.0US89009
2246040898NV99https://www.petfinder.com/dog/ziggy-46040898/n...ShepherdNaNBrindleNaNNaNAdult...LargeShortZiggyadoptable2019-09-20T14:10:11+0000MesquiteNV89027.0US89009
3346039877NV202https://www.petfinder.com/dog/gypsy-46039877/n...German Shepherd DogNaNNaNNaNNaNBaby...LargeNaNGypsyadoptable2019-09-20T10:08:22+0000PahrumpNV89048.0US89009
4446039306NV184https://www.petfinder.com/dog/theo-46039306/nv...DachshundNaNNaNNaNNaNYoung...SmallLongTheoadoptable2019-09-20T06:48:30+0000HendersonNV89052.0US89009
\n", 270 | "

5 rows × 21 columns

\n", 271 | "
" 272 | ], 273 | "text/plain": [ 274 | " index id org_id url \\\n", 275 | "0 0 46042150 NV163 https://www.petfinder.com/dog/harley-46042150/... \n", 276 | "1 1 46042002 NV163 https://www.petfinder.com/dog/biggie-46042002/... \n", 277 | "2 2 46040898 NV99 https://www.petfinder.com/dog/ziggy-46040898/n... \n", 278 | "3 3 46039877 NV202 https://www.petfinder.com/dog/gypsy-46039877/n... \n", 279 | "4 4 46039306 NV184 https://www.petfinder.com/dog/theo-46039306/nv... \n", 280 | "\n", 281 | " breed_primary breed_secondary color_primary \\\n", 282 | "0 American Staffordshire Terrier Mixed Breed White / Cream \n", 283 | "1 Pit Bull Terrier Mixed Breed Brown / Chocolate \n", 284 | "2 Shepherd NaN Brindle \n", 285 | "3 German Shepherd Dog NaN NaN \n", 286 | "4 Dachshund NaN NaN \n", 287 | "\n", 288 | " color_secondary color_tertiary age ... size coat \\\n", 289 | "0 Yellow / Tan / Blond / Fawn NaN Senior ... Medium Short \n", 290 | "1 White / Cream NaN Adult ... Large Short \n", 291 | "2 NaN NaN Adult ... Large Short \n", 292 | "3 NaN NaN Baby ... Large NaN \n", 293 | "4 NaN NaN Young ... 
Small Long \n", 294 | "\n", 295 | " name status posted contact_city contact_state \\\n", 296 | "0 HARLEY adoptable 2019-09-20T16:37:59+0000 Las Vegas NV \n", 297 | "1 BIGGIE adoptable 2019-09-20T16:24:57+0000 Las Vegas NV \n", 298 | "2 Ziggy adoptable 2019-09-20T14:10:11+0000 Mesquite NV \n", 299 | "3 Gypsy adoptable 2019-09-20T10:08:22+0000 Pahrump NV \n", 300 | "4 Theo adoptable 2019-09-20T06:48:30+0000 Henderson NV \n", 301 | "\n", 302 | " contact_zip contact_country stateQ \n", 303 | "0 89147.0 US 89009 \n", 304 | "1 89147.0 US 89009 \n", 305 | "2 89027.0 US 89009 \n", 306 | "3 89048.0 US 89009 \n", 307 | "4 89052.0 US 89009 \n", 308 | "\n", 309 | "[5 rows x 21 columns]" 310 | ] 311 | }, 312 | "execution_count": 5, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "# description.csv - File containing the description and details of the dog breed and more\n", 319 | "description = pd.read_csv(\"/Users/katiehuang/Documents/Data Science/Projects/Data Engineering/Dog Adoption/Datasets/description.csv\", low_memory=False)\n", 320 | "description.head()\n" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 6, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "Index(['index', 'id', 'org_id', 'url', 'breed_primary', 'breed_secondary',\n", 332 | " 'color_primary', 'color_secondary', 'color_tertiary', 'age', 'sex',\n", 333 | " 'size', 'coat', 'name', 'status', 'posted', 'contact_city',\n", 334 | " 'contact_state', 'contact_zip', 'contact_country', 'stateQ'],\n", 335 | " dtype='object')" 336 | ] 337 | }, 338 | "execution_count": 6, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "description.columns" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 7, 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/html": [ 355 | "
\n", 356 | "\n", 369 | "\n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | "
indexidcontact_citycontact_statefoundmanualremovestill_there
0044520267AnokaMNArkansasNaNNaNNaN
1144698509GrovelandFLAbacosBahamasNaNNaN
2245983838AdamstownMDAdamMarylandNaNNaN
3344475904Saint CloudMNAdaptilNaNTrueNaN
4443877389PuebloCOAfghanistanNaNNaNNaN
\n", 441 | "
" 442 | ], 443 | "text/plain": [ 444 | " index id contact_city contact_state found manual remove \\\n", 445 | "0 0 44520267 Anoka MN Arkansas NaN NaN \n", 446 | "1 1 44698509 Groveland FL Abacos Bahamas NaN \n", 447 | "2 2 45983838 Adamstown MD Adam Maryland NaN \n", 448 | "3 3 44475904 Saint Cloud MN Adaptil NaN True \n", 449 | "4 4 43877389 Pueblo CO Afghanistan NaN NaN \n", 450 | "\n", 451 | " still_there \n", 452 | "0 NaN \n", 453 | "1 NaN \n", 454 | "2 NaN \n", 455 | "3 NaN \n", 456 | "4 NaN " 457 | ] 458 | }, 459 | "execution_count": 7, 460 | "metadata": {}, 461 | "output_type": "execute_result" 462 | } 463 | ], 464 | "source": [ 465 | "# travel.csv - File containing where each dog was found and its travel details\n", 466 | "travel = pd.read_csv(\"/Users/katiehuang/Documents/Data Science/Projects/Data Engineering/Dog Adoption/Datasets/travel.csv\")\n", 467 | "travel.head()" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 8, 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "data": { 477 | "text/plain": [ 478 | "Index(['index', 'id', 'contact_city', 'contact_state', 'found', 'manual',\n", 479 | " 'remove', 'still_there'],\n", 480 | " dtype='object')" 481 | ] 482 | }, 483 | "execution_count": 8, 484 | "metadata": {}, 485 | "output_type": "execute_result" 486 | } 487 | ], 488 | "source": [ 489 | "travel.columns" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 9, 495 | "metadata": {}, 496 | "outputs": [ 497 | { 498 | "data": { 499 | "text/html": [ 500 | "
\n", 501 | "\n", 514 | "\n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | "
indexlocationexportedimportedtotalinUS
00Texas635.0NaN566.0True
11Alabama268.02.01428.0True
22North Carolina158.014.02627.0True
33South Carolina139.012.01618.0True
44Georgia137.019.03479.0True
\n", 574 | "
" 575 | ], 576 | "text/plain": [ 577 | " index location exported imported total inUS\n", 578 | "0 0 Texas 635.0 NaN 566.0 True\n", 579 | "1 1 Alabama 268.0 2.0 1428.0 True\n", 580 | "2 2 North Carolina 158.0 14.0 2627.0 True\n", 581 | "3 3 South Carolina 139.0 12.0 1618.0 True\n", 582 | "4 4 Georgia 137.0 19.0 3479.0 True" 583 | ] 584 | }, 585 | "execution_count": 9, 586 | "metadata": {}, 587 | "output_type": "execute_result" 588 | } 589 | ], 590 | "source": [ 591 | "# location.csv - File containing the dogs' current location\n", 592 | "location = pd.read_csv(\"/Users/katiehuang/Documents/Data Science/Projects/Data Engineering/Dog Adoption/Datasets/location.csv\")\n", 593 | "location.head()" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 10, 599 | "metadata": {}, 600 | "outputs": [ 601 | { 602 | "data": { 603 | "text/plain": [ 604 | "Index(['index', 'location', 'exported', 'imported', 'total', 'inUS'], dtype='object')" 605 | ] 606 | }, 607 | "execution_count": 10, 608 | "metadata": {}, 609 | "output_type": "execute_result" 610 | } 611 | ], 612 | "source": [ 613 | "location.columns" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 11, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "cur, conn = create_database()" 623 | ] 624 | }, 625 | { 626 | "cell_type": "markdown", 627 | "metadata": {}, 628 | "source": [ 629 | "If you run into the following error while creating the database:\n", 630 | "\n", 631 | "`ObjectInUse: database \"dog_adoption\" is being accessed by other users\n", 632 | "DETAIL: There is 1 other session using the database.`\n", 633 | "\n", 634 | "Then run this in the psql terminal:\n", 635 | "\n", 636 | "`REVOKE CONNECT ON DATABASE dog_adoption FROM public;`\n", 637 | "\n", 638 | "`SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE pg_stat_activity.datname = 'dog_adoption';`" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": 12, 644 | 
"metadata": {}, 645 | "outputs": [ 646 | { 647 | "data": { 648 | "text/plain": [ 649 | "index int64\n", 650 | "id int64\n", 651 | "org_id object\n", 652 | "url object\n", 653 | "breed_primary object\n", 654 | "breed_secondary object\n", 655 | "color_primary object\n", 656 | "color_secondary object\n", 657 | "color_tertiary object\n", 658 | "age object\n", 659 | "sex object\n", 660 | "size object\n", 661 | "coat object\n", 662 | "name object\n", 663 | "status object\n", 664 | "posted object\n", 665 | "contact_city object\n", 666 | "contact_state object\n", 667 | "contact_zip float64\n", 668 | "contact_country object\n", 669 | "stateQ object\n", 670 | "dtype: object" 671 | ] 672 | }, 673 | "execution_count": 12, 674 | "metadata": {}, 675 | "output_type": "execute_result" 676 | } 677 | ], 678 | "source": [ 679 | "description.dtypes" 680 | ] 681 | }, 682 | { 683 | "cell_type": "markdown", 684 | "metadata": {}, 685 | "source": [ 686 | "### Create tables\n", 687 | "\n", 688 | "This process creates empty table only. 
" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 13, 694 | "metadata": {}, 695 | "outputs": [], 696 | "source": [ 697 | "create_description_table = (\"\"\"CREATE TABLE IF NOT EXISTS description \\\n", 698 | " (index INT PRIMARY KEY, id INT, org_id VARCHAR, url VARCHAR, breed_primary VARCHAR, \\\n", 699 | " breed_secondary VARCHAR, color_primary VARCHAR, color_secondary VARCHAR, color_tertiary VARCHAR, \\\n", 700 | " age VARCHAR, sex VARCHAR, size VARCHAR, coat VARCHAR, name VARCHAR, status VARCHAR, \\\n", 701 | " posted VARCHAR, contact_city VARCHAR, contact_state VARCHAR, contact_zip VARCHAR, \\\n", 702 | " contact_country VARCHAR, stateQ VARCHAR)\"\"\")" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 14, 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "cur.execute(create_description_table)\n", 712 | "conn.commit()" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 15, 718 | "metadata": {}, 719 | "outputs": [ 720 | { 721 | "data": { 722 | "text/plain": [ 723 | "index int64\n", 724 | "id int64\n", 725 | "contact_city object\n", 726 | "contact_state object\n", 727 | "found object\n", 728 | "manual object\n", 729 | "remove object\n", 730 | "still_there object\n", 731 | "dtype: object" 732 | ] 733 | }, 734 | "execution_count": 15, 735 | "metadata": {}, 736 | "output_type": "execute_result" 737 | } 738 | ], 739 | "source": [ 740 | "travel.dtypes" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 16, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "create_travel_table = (\"\"\"CREATE TABLE IF NOT EXISTS travel \\\n", 750 | " (index INT PRIMARY KEY, id INT, contact_city VARCHAR, contact_state VARCHAR, \\\n", 751 | " found VARCHAR, manual VARCHAR, remove VARCHAR, still_there VARCHAR)\"\"\")\n", 752 | "\n", 753 | "cur.execute(create_travel_table)\n", 754 | "conn.commit()" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 
759 | "execution_count": 17, 760 | "metadata": {}, 761 | "outputs": [ 762 | { 763 | "data": { 764 | "text/plain": [ 765 | "index int64\n", 766 | "location object\n", 767 | "exported float64\n", 768 | "imported float64\n", 769 | "total float64\n", 770 | "inUS bool\n", 771 | "dtype: object" 772 | ] 773 | }, 774 | "execution_count": 17, 775 | "metadata": {}, 776 | "output_type": "execute_result" 777 | } 778 | ], 779 | "source": [ 780 | "location.dtypes" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": 18, 786 | "metadata": {}, 787 | "outputs": [ 788 | { 789 | "data": { 790 | "text/html": [ 791 | "
\n", 792 | "\n", 805 | "\n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | "
indexlocationexportedimportedtotalinUS
00Texas635.0NaN566.0True
11Alabama268.02.01428.0True
22North Carolina158.014.02627.0True
33South Carolina139.012.01618.0True
44Georgia137.019.03479.0True
\n", 865 | "
" 866 | ], 867 | "text/plain": [ 868 | " index location exported imported total inUS\n", 869 | "0 0 Texas 635.0 NaN 566.0 True\n", 870 | "1 1 Alabama 268.0 2.0 1428.0 True\n", 871 | "2 2 North Carolina 158.0 14.0 2627.0 True\n", 872 | "3 3 South Carolina 139.0 12.0 1618.0 True\n", 873 | "4 4 Georgia 137.0 19.0 3479.0 True" 874 | ] 875 | }, 876 | "execution_count": 18, 877 | "metadata": {}, 878 | "output_type": "execute_result" 879 | } 880 | ], 881 | "source": [ 882 | "location.head()" 883 | ] 884 | }, 885 | { 886 | "cell_type": "code", 887 | "execution_count": 19, 888 | "metadata": {}, 889 | "outputs": [], 890 | "source": [ 891 | "create_location_table = (\"\"\"CREATE TABLE IF NOT EXISTS location \\\n", 892 | " (index INT PRIMARY KEY, location VARCHAR, exported FLOAT, imported FLOAT, \\\n", 893 | " total FLOAT, inUS BOOL)\"\"\")\n", 894 | "\n", 895 | "cur.execute(create_location_table)\n", 896 | "conn.commit()" 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "metadata": {}, 902 | "source": [ 903 | "### Insert rows into table" 904 | ] 905 | }, 906 | { 907 | "cell_type": "code", 908 | "execution_count": 20, 909 | "metadata": { 910 | "tags": [] 911 | }, 912 | "outputs": [], 913 | "source": [ 914 | "insert_description_table = (\"\"\"INSERT INTO description \\\n", 915 | " (index, id, org_id, url, breed_primary, breed_secondary, \\\n", 916 | " color_primary, color_secondary, color_tertiary, \\\n", 917 | " age, sex, size, coat, name, status, \\\n", 918 | " posted, contact_city, contact_state, contact_zip, \\\n", 919 | " contact_country, stateQ)\n", 920 | " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, \\\n", 921 | " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)\"\"\")\n", 922 | "\n", 923 | "\n", 924 | "# Iterating through index (i) and rows in description table\n", 925 | "for i, row in description.iterrows():\n", 926 | " # Pass INSERT INTO query in cur.execute() and list of data\n", 927 | " cur.execute(insert_description_table, list(row))" 928 | ] 929 | }, 
930 | { 931 | "cell_type": "code", 932 | "execution_count": 22, 933 | "metadata": { 934 | "tags": [] 935 | }, 936 | "outputs": [], 937 | "source": [ 938 | "insert_travel_table = (\"\"\"INSERT INTO travel \\\n", 939 | " (index, id, contact_city, contact_state, \\\n", 940 | " found, manual, remove, still_there)\n", 941 | " VALUES (%s, %s, %s, %s, %s, %s, %s, %s)\"\"\")\n", 942 | "\n", 943 | "for i, row in travel.iterrows(): # Iterating row by row in travel table\n", 944 | " cur.execute(insert_travel_table, list(row))" 945 | ] 946 | }, 947 | { 948 | "cell_type": "code", 949 | "execution_count": 23, 950 | "metadata": {}, 951 | "outputs": [], 952 | "source": [ 953 | "insert_location_table = (\"\"\"INSERT INTO location \\\n", 954 | " (index, location, exported, imported, \\\n", 955 | " total, inUS)\n", 956 | " VALUES (%s, %s, %s, %s, %s, %s)\"\"\")\n", 957 | "\n", 958 | "for i, row in location.iterrows(): # Iterating row by row in location table\n", 959 | " cur.execute(insert_location_table, list(row))" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": null, 965 | "metadata": {}, 966 | "outputs": [], 967 | "source": [] 968 | } 969 | ], 970 | "metadata": { 971 | "kernelspec": { 972 | "display_name": "Python 3", 973 | "language": "python", 974 | "name": "python3" 975 | }, 976 | "language_info": { 977 | "codemirror_mode": { 978 | "name": "ipython", 979 | "version": 3 980 | }, 981 | "file_extension": ".py", 982 | "mimetype": "text/x-python", 983 | "name": "python", 984 | "nbconvert_exporter": "python", 985 | "pygments_lexer": "ipython3", 986 | "version": "3.8.11" 987 | } 988 | }, 989 | "nbformat": 4, 990 | "nbformat_minor": 4 991 | } 992 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering 2 | 3 | This repo contains projects related to data engineering. 
4 | 5 | ## Projects 6 | 7 | - 🚗 [Uber](https://github.com/katiehuangx/data-engineering/tree/main/Uber%20Project): Built a data pipeline to perform analytics on Uber taxi data using Python, GCP Storage, Compute Engine, Mage (orchestration), BigQuery, and Looker Studio. 8 | - 🏡 [Airbnb](https://github.com/katiehuangx/data-engineering/tree/main/Airbnb%20Project): [Work in progress] Designed and implemented a data warehouse (Snowflake) and performed transformations using the dbt CLI. 9 | - 🐶 [Dog Adoption](https://github.com/katiehuangx/data-engineering/tree/main/Dog%20Adoption): Created and built a data model and deployed it to a database with Python and PostgreSQL. 10 | -------------------------------------------------------------------------------- /Uber Project/GCP/gcp compute machine_vm installation commands.txt: -------------------------------------------------------------------------------- 1 | # Install Python and pip 2 | sudo apt-get update 3 | 4 | sudo apt-get install python3-distutils 5 | 6 | sudo apt-get install python3-apt 7 | 8 | sudo apt-get install wget 9 | 10 | wget https://bootstrap.pypa.io/get-pip.py 11 | 12 | sudo python3 get-pip.py 13 | 14 | 15 | # Install Mage 16 | sudo pip3 install mage-ai 17 | 18 | # Install Pandas 19 | sudo pip3 install pandas 20 | 21 | # Install Google Cloud Library 22 | sudo pip3 install google-cloud 23 | 24 | sudo pip3 install google-cloud-bigquery 25 | -------------------------------------------------------------------------------- /Uber Project/Mage/uber_gbq_load.py: -------------------------------------------------------------------------------- 1 | from mage_ai.data_preparation.repo_manager import get_repo_path 2 | from mage_ai.io.bigquery import BigQuery 3 | from mage_ai.io.config import ConfigFileLoader 4 | from pandas import DataFrame 5 | from os import path 6 | 7 | if 'data_exporter' not in globals(): 8 | from mage_ai.data_preparation.decorators import data_exporter 9 | 10 | 11 | @data_exporter 12 | def 
export_data_to_big_query(data, **kwargs) -> None: 13 | """ 14 | Template for exporting data to a BigQuery warehouse. 15 | Specify your configuration settings in 'io_config.yaml'. 16 | 17 | Docs: https://docs.mage.ai/design/data-loading#bigquery 18 | """ 19 | 20 | config_path = path.join(get_repo_path(), 'io_config.yaml') 21 | config_profile = 'default' 22 | 23 | for key, value in data.items(): 24 | 25 | table_id = 'uber-project-385706.uber_data_engineering.{}'.format(key) 26 | 27 | BigQuery.with_config(ConfigFileLoader(config_path, config_profile)).export( 28 | DataFrame(value), 29 | table_id, 30 | if_exists='replace', # Specify resolution policy if table name already exists 31 | ) 32 | -------------------------------------------------------------------------------- /Uber Project/Mage/uber_load_data.py: -------------------------------------------------------------------------------- 1 | import io 2 | import pandas as pd 3 | import requests 4 | if 'data_loader' not in globals(): 5 | from mage_ai.data_preparation.decorators import data_loader 6 | if 'test' not in globals(): 7 | from mage_ai.data_preparation.decorators import test 8 | 9 | 10 | @data_loader 11 | def load_data_from_api(*args, **kwargs): 12 | """ 13 | Template for loading data from API 14 | """ 15 | url = 'https://storage.googleapis.com/uber-data-engineering-project-khuang/uber_data.csv' 16 | response = requests.get(url) 17 | 18 | return pd.read_csv(io.StringIO(response.text), sep=',') 19 | 20 | 21 | @test 22 | def test_output(output, *args) -> None: 23 | """ 24 | Template code for testing the output of the block. 
25 | """ 26 | assert output is not None, 'The output is undefined' 27 | -------------------------------------------------------------------------------- /Uber Project/Mage/uber_transformation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | if 'transformer' not in globals(): 3 | from mage_ai.data_preparation.decorators import transformer 4 | if 'test' not in globals(): 5 | from mage_ai.data_preparation.decorators import test 6 | 7 | 8 | @transformer 9 | def transform(df, *args, **kwargs): 10 | """ 11 | Template code for a transformer block. 12 | 13 | Add more parameters to this function if this block has multiple parent blocks. 14 | There should be one parameter for each output variable from each parent block. 15 | 16 | Args: 17 | data: The output from the upstream parent block 18 | args: The output from any additional upstream blocks (if applicable) 19 | 20 | Returns: 21 | Anything (e.g. data frame, dictionary, array, int, str, etc.) 
22 | """ 23 | 24 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 25 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 26 | 27 | df = df.drop_duplicates().reset_index(drop=True) 28 | df['trip_id'] = df.index 29 | 30 | # Create new columns for datetime elements 31 | datetime_dim = df[['tpep_pickup_datetime','tpep_dropoff_datetime']].reset_index(drop=True) 32 | datetime_dim['tpep_pickup_datetime'] = datetime_dim['tpep_pickup_datetime'] 33 | datetime_dim['pickup_hour'] = datetime_dim['tpep_pickup_datetime'].dt.hour 34 | datetime_dim['pickup_day'] = datetime_dim['tpep_pickup_datetime'].dt.day 35 | datetime_dim['pickup_month'] = datetime_dim['tpep_pickup_datetime'].dt.month 36 | datetime_dim['pickup_year'] = datetime_dim['tpep_pickup_datetime'].dt.year 37 | datetime_dim['pickup_weekday'] = datetime_dim['tpep_pickup_datetime'].dt.weekday 38 | 39 | datetime_dim['tpep_dropoff_datetime'] = datetime_dim['tpep_dropoff_datetime'] 40 | datetime_dim['dropoff_hour'] = datetime_dim['tpep_dropoff_datetime'].dt.hour 41 | datetime_dim['dropoff_day'] = datetime_dim['tpep_dropoff_datetime'].dt.day 42 | datetime_dim['dropoff_month'] = datetime_dim['tpep_dropoff_datetime'].dt.month 43 | datetime_dim['dropoff_year'] = datetime_dim['tpep_dropoff_datetime'].dt.year 44 | datetime_dim['dropoff_weekday'] = datetime_dim['tpep_dropoff_datetime'].dt.weekday 45 | 46 | datetime_dim['datetime_id'] = datetime_dim.index 47 | 48 | # datetime_dim = datetime_dim.rename(columns={'tpep_pickup_datetime': 'datetime_id'}).reset_index(drop=True) 49 | datetime_dim = datetime_dim[['datetime_id', 'tpep_pickup_datetime', 'pickup_hour', 'pickup_day', 'pickup_month', 'pickup_year', 'pickup_weekday', 50 | 'tpep_dropoff_datetime', 'dropoff_hour', 'dropoff_day', 'dropoff_month', 'dropoff_year', 'dropoff_weekday']] 51 | 52 | passenger_count_dim = df[['passenger_count']].reset_index(drop=True) 53 | passenger_count_dim['passenger_count_id'] = passenger_count_dim.index 54 
| passenger_count_dim = passenger_count_dim[['passenger_count_id','passenger_count']] 55 | 56 | trip_distance_dim = df[['trip_distance']].reset_index(drop=True) 57 | trip_distance_dim['trip_distance_id'] = trip_distance_dim.index 58 | trip_distance_dim = trip_distance_dim[['trip_distance_id','trip_distance']] 59 | 60 | rate_code_type = { 61 | 1:"Standard rate", 62 | 2:"JFK", 63 | 3:"Newark", 64 | 4:"Nassau or Westchester", 65 | 5:"Negotiated fare", 66 | 6:"Group ride" 67 | } 68 | 69 | rate_code_dim = df[['RatecodeID']].reset_index(drop=True) 70 | rate_code_dim['rate_code_id'] = rate_code_dim.index 71 | rate_code_dim['rate_code_name'] = rate_code_dim['RatecodeID'].map(rate_code_type) 72 | rate_code_dim = rate_code_dim[['rate_code_id','RatecodeID','rate_code_name']] 73 | 74 | pickup_location_dim = df[['pickup_longitude', 'pickup_latitude']].reset_index(drop=True) 75 | pickup_location_dim['pickup_location_id'] = pickup_location_dim.index 76 | pickup_location_dim = pickup_location_dim[['pickup_location_id','pickup_latitude','pickup_longitude']] 77 | 78 | dropoff_location_dim = df[['dropoff_longitude', 'dropoff_latitude']].reset_index(drop=True) 79 | dropoff_location_dim['dropoff_location_id'] = dropoff_location_dim.index 80 | dropoff_location_dim = dropoff_location_dim[['dropoff_location_id','dropoff_latitude','dropoff_longitude']] 81 | 82 | payment_type_name = { 83 | 1:"Credit card", 84 | 2:"Cash", 85 | 3:"No charge", 86 | 4:"Dispute", 87 | 5:"Unknown", 88 | 6:"Voided trip" 89 | } 90 | payment_type_dim = df[['payment_type']].reset_index(drop=True) 91 | payment_type_dim['payment_type_id'] = payment_type_dim.index 92 | payment_type_dim['payment_type_name'] = payment_type_dim['payment_type'].map(payment_type_name) 93 | payment_type_dim = payment_type_dim[['payment_type_id','payment_type','payment_type_name']] 94 | 95 | fact_table = df.merge(passenger_count_dim, left_on='trip_id', right_on='passenger_count_id') \ 96 | .merge(trip_distance_dim, left_on='trip_id', 
right_on='trip_distance_id') \ 97 | .merge(rate_code_dim, left_on='trip_id', right_on='rate_code_id') \ 98 | .merge(pickup_location_dim, left_on='trip_id', right_on='pickup_location_id') \ 99 | .merge(dropoff_location_dim, left_on='trip_id', right_on='dropoff_location_id')\ 100 | .merge(datetime_dim, left_on='trip_id', right_on='datetime_id') \ 101 | .merge(payment_type_dim, left_on='trip_id', right_on='payment_type_id') \ 102 | [['trip_id','VendorID', 'datetime_id', 'passenger_count_id', 103 | 'trip_distance_id', 'rate_code_id', 'store_and_fwd_flag', 'pickup_location_id', 'dropoff_location_id', 104 | 'payment_type_id', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 105 | 'improvement_surcharge', 'total_amount']] 106 | 107 | # Return DataFrames in dictionary/json format 108 | return { 109 | 'datetime_dim': datetime_dim.to_dict(orient='dict'), 110 | 'passenger_count_dim': passenger_count_dim.to_dict(orient='dict'), 111 | 'trip_distance_dim': trip_distance_dim.to_dict(orient='dict'), 112 | 'rate_code_dim': rate_code_dim.to_dict(orient='dict'), 113 | 'payment_type_dim': payment_type_dim.to_dict(orient='dict'), 114 | 'pickup_location_dim': pickup_location_dim.to_dict(orient='dict'), 115 | 'dropoff_location_dim': dropoff_location_dim.to_dict(orient='dict'), 116 | 'fact_table': fact_table.to_dict(orient='dict') 117 | } 118 | 119 | @test 120 | def test_output(output, *args) -> None: 121 | """ 122 | Template code for testing the output of the block. 123 | """ 124 | assert output is not None, 'The output is undefined' 125 | -------------------------------------------------------------------------------- /Uber Project/README.md: -------------------------------------------------------------------------------- 1 | # 🚗 Uber Data Engineering End-to-End Project 2 | 3 | ## Objective 4 | 5 | In this project, I designed and implemented an end-to-end data pipeline that consists of several stages: 6 | 1. 
Extracted data from the NYC Trip Record Data website and loaded it into Google Cloud Storage for further processing. 7 | 2. Transformed and modeled the data by applying fact and dimensional data modeling concepts in Python on Jupyter Notebook. 8 | 3. Using the ETL concept, I orchestrated the data pipeline on Mage AI and loaded the transformed data into Google BigQuery. 9 | 4. Developed a dashboard on Looker Studio. 10 | 11 | As this is a data engineering project, my emphasis is primarily on the engineering aspect, with less emphasis on analytics and dashboard development. 12 | 13 | The sections below explain the technologies and files utilized in more detail. 14 | 15 | ## Table of Contents 16 | 17 | - [Dataset Used](#dataset-used) 18 | - [Technologies](#technologies) 19 | - [Data Pipeline Architecture](#data-pipeline-architecture) 20 | - [Data Modeling](#data-modeling) 21 | - [Step 1: Cleaning and Transformation](#step-1-cleaning-and-transformation) 22 | - [Step 2: Storage](#step-2-storage) 23 | - [Step 3: ETL / Orchestration](#step-3-etl--orchestration) 24 | - [Step 4: Analytics](#step-4-analytics) 25 | - [Step 5: Dashboard](#step-5-dashboard) 26 | 27 | ## Dataset Used 28 | 29 | This project uses the TLC Trip Record Data, which includes fields capturing pick-up and drop-off dates/times, pick-up and drop-off locations, trip distances, itemized fares, rate types, payment types, and driver-reported passenger counts. 
30 | 31 | More info about the dataset can be found at the following links: 32 | - Website: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page 33 | - Data Dictionary: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf 34 | - Raw Data (CSV): https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/uber_data.csv 35 | 36 | ## Technologies 37 | 38 | The following technologies are used to build this project: 39 | - Languages: Python, SQL 40 | - Extraction and transformation: Jupyter Notebook, Google BigQuery 41 | - Storage: Google Cloud Storage 42 | - Orchestration: [Mage AI](https://www.mage.ai) 43 | - Dashboard: [Looker Studio](https://lookerstudio.google.com) 44 | 45 | ## Data Pipeline Architecture 46 | 47 | Screenshot 2023-05-08 at 11 49 09 AM 48 | 49 | Files used in the following stages: 50 | - Step 1: Cleaning and transformation - [Uber Data Engineering.ipynb](https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/Uber%20Data%20Engineering.ipynb) 51 | - Step 2: Storage 52 | - Step 3: ETL, Orchestration - Mage: [Extract](https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/Mage/uber_load_data.py), [Transform](https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/Mage/uber_transformation.py), [Load](https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/Mage/uber_gbq_load.py) 53 | - Step 4: Analytics - [SQL script](https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/sql_script.sql) 54 | - Step 5: [Dashboard](https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/Uber_Dashboard.pdf) 55 | 56 | ## Data Modeling 57 | 58 | The datasets are designed using fact and dimension (dim) data modeling principles. 
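As a concrete illustration of the dim-table pattern used here, below is a minimal pandas sketch of how a dimension such as `rate_code_dim` gets a surrogate key from the dataframe index and a descriptive name from a lookup. The sample rows are invented for illustration; the authoritative logic lives in `uber_transformation.py`.

```python
import pandas as pd

# Hypothetical sample input; the real pipeline reads uber_data.csv.
df = pd.DataFrame({"RatecodeID": [1, 2, 1, 5]})

# Rate code lookup from the TLC data dictionary, as in uber_transformation.py.
rate_code_type = {
    1: "Standard rate",
    2: "JFK",
    3: "Newark",
    4: "Nassau or Westchester",
    5: "Negotiated fare",
    6: "Group ride",
}

# Build the dimension: surrogate key from the row index, then reorder columns.
rate_code_dim = df[["RatecodeID"]].reset_index(drop=True)
rate_code_dim["rate_code_id"] = rate_code_dim.index
rate_code_dim["rate_code_name"] = rate_code_dim["RatecodeID"].map(rate_code_type)
rate_code_dim = rate_code_dim[["rate_code_id", "RatecodeID", "rate_code_name"]]

print(rate_code_dim.to_dict(orient="list"))
```

The fact table then stores only `rate_code_id` and joins back to this dimension whenever the descriptive name is needed.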
59 | 60 | ![Data Model](https://user-images.githubusercontent.com/81607668/236725688-995b6049-26c1-440f-b523-7c6c10d507ba.png) 61 | 62 | ## Step 1: Cleaning and Transformation 63 | 64 | In this step, I loaded the CSV file into Jupyter Notebook and carried out data cleaning and transformation activities prior to organizing them into fact and dim tables. 65 | 66 | Here are the specific cleaning and transformation tasks performed: 67 | 1. Converted the `tpep_pickup_datetime` and `tpep_dropoff_datetime` columns into datetime format. 68 | 2. Removed duplicates and reset the index. 69 | 70 | Link to the script: [https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/Uber%20Data%20Engineering.ipynb](https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/Uber%20Data%20Engineering.ipynb) 71 | 72 | image 73 | 74 | After completing the above steps, I created the following fact and dimension tables: 75 | 76 | Screenshot 2023-09-03 at 4 05 21 PM 77 | 78 | 79 | Screenshot 2023-09-03 at 4 05 29 PM 80 | 81 | 82 | Screenshot 2023-09-03 at 4 05 35 PM 83 | 84 | 85 | Screenshot 2023-09-03 at 4 05 40 PM 86 | 87 | 88 | Screenshot 2023-09-03 at 4 05 44 PM 89 | 90 | 91 | Screenshot 2023-09-03 at 4 05 53 PM 92 | 93 | ## Step 2: Storage 94 | 95 | image 96 | 97 | ## Step 3: ETL / Orchestration 98 | 99 | 1. Begin by launching the SSH instance and running the commands below to install the required libraries. 
100 | 101 | Screenshot 2023-09-03 at 4 10 39 PM 102 | 103 | ```bash 104 | # Install Python and pip 105 | sudo apt-get update 106 | 107 | sudo apt-get install python3-distutils 108 | 109 | sudo apt-get install python3-apt 110 | 111 | sudo apt-get install wget 112 | 113 | wget https://bootstrap.pypa.io/get-pip.py 114 | 115 | sudo python3 get-pip.py 116 | 117 | # Install Google Cloud Library 118 | sudo pip3 install google-cloud 119 | 120 | sudo pip3 install google-cloud-bigquery 121 | 122 | # Install Pandas 123 | sudo pip3 install pandas 124 | ``` 125 | 126 | image 127 | 128 | 2. After that, I installed the Mage AI library from the [Mage AI GitHub](https://github.com/mage-ai/mage-ai#using-pip-or-conda). Then, I created a new project called "uber_de_project". 129 | 130 | ```bash 131 | # Install Mage library 132 | sudo pip3 install mage-ai 133 | 134 | # Create new project 135 | mage start uber_de_project 136 | ``` 137 | 138 | Screenshot 2023-09-03 at 3 43 27 PM 139 | 140 | 3. Next, I conducted orchestration in Mage by accessing the external IP address through a new browser tab. The link format is: `<external IP address>:<port>`. 141 | 142 | After that, I created a new pipeline with the following stages: 143 | - Extract: [load_uber_data](https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/Mage/uber_load_data.py) 144 | - Transform: [transform_uber](https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/Mage/uber_transformation.py) 145 | - Load: [load_gbq](https://github.com/katiehuangx/data-engineering/blob/main/Uber%20Project/Mage/uber_gbq_load.py) 146 | 147 | image 148 | 149 | Before executing the Load pipeline, I downloaded credentials from Google API & Credentials and then updated them accordingly in the `io_config.yaml` file within the same pipeline. This step is essential for granting authorization to access and load data into Google BigQuery. 
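As a rough sketch (not copied from this project), the `default` profile in `io_config.yaml` ends up looking something like the fragment below. `GOOGLE_SERVICE_ACC_KEY_FILEPATH` and `GOOGLE_LOCATION` are the key names in recent Mage versions, but verify them against your generated file; the path is a placeholder:

```yml
version: 0.1.1
default:
  # Path to the service account key downloaded from Google API & Credentials
  GOOGLE_SERVICE_ACC_KEY_FILEPATH: "/home/src/uber_de_project/service_account_key.json"
  GOOGLE_LOCATION: US  # Optional
```

With this in place, `ConfigFileLoader(config_path, 'default')` in `uber_gbq_load.py` picks up the credentials automatically.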
150 | 151 | ## Step 4: Analytics 152 | 153 | After running the Load pipeline in Mage, the fact and dim tables are generated in Google BigQuery. 154 | 155 | Screenshot 2023-09-03 at 3 41 57 PM 156 | 157 | Here are the additional analyses I performed: 158 | 1. Find the top 10 pickup locations based on the number of trips: 159 | Screenshot 2023-09-03 at 3 46 17 PM 160 | 161 | 2. Find the total number of trips by passenger count: 162 | Screenshot 2023-09-03 at 3 47 48 PM 163 | 164 | 3. Find the average fare amount by hour of the day: 165 | Screenshot 2023-09-03 at 3 48 52 PM 166 | 167 | ## Step 5: Dashboard 168 | 169 | After completing the analysis, I loaded the relevant tables into Looker Studio and created a dashboard, which you can view [here](https://lookerstudio.google.com/s/s2Cv9HZiz_I). 170 | 171 | ![Dashboard Pg 1](https://user-images.githubusercontent.com/81607668/236729944-0a66f699-689e-4cbb-a12a-860abdef2cf4.png) 172 | 173 | ![Dashboard Pg 2](https://user-images.githubusercontent.com/81607668/236729954-cecba4a6-fc90-4944-b27f-cfb9473422bf.png) 174 | 175 | *** 176 | -------------------------------------------------------------------------------- /Uber Project/Uber_Dashboard.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/katiehuangx/data-engineering/aace791c2df1ba2899624d7d51f3d5606e90d79d/Uber Project/Uber_Dashboard.pdf -------------------------------------------------------------------------------- /Uber Project/sql_script.sql: -------------------------------------------------------------------------------- 1 | -- Create analytics_table 2 | CREATE OR REPLACE TABLE `uber-project-385706.uber_data_engineering.analytics_table` AS ( 3 | SELECT 4 | f.trip_id, 5 | f.VendorID, 6 | d.tpep_pickup_datetime, 7 | d.tpep_dropoff_datetime, 8 | p.passenger_count, 9 | t.trip_distance, 10 | r.rate_code_name, 11 | pick.pickup_latitude, 12 | pick.pickup_longitude, 13 | drop.dropoff_latitude, 14 | 
drop.dropoff_longitude, 15 | pay.payment_type_name, 16 | f.fare_amount, 17 | f.extra, 18 | f.mta_tax, 19 | f.tip_amount, 20 | f.tolls_amount, 21 | f.improvement_surcharge, 22 | f.total_amount 23 | FROM `uber-project-385706.uber_data_engineering.fact_table` AS f 24 | INNER JOIN `uber-project-385706.uber_data_engineering.datetime_dim` AS d 25 | ON f.datetime_id = d.datetime_id 26 | INNER JOIN `uber-project-385706.uber_data_engineering.passenger_count_dim` AS p 27 | ON p.passenger_count_id = f.passenger_count_id 28 | INNER JOIN `uber-project-385706.uber_data_engineering.trip_distance_dim` AS t 29 | ON t.trip_distance_id = f.trip_distance_id 30 | INNER JOIN `uber-project-385706.uber_data_engineering.rate_code_dim` AS r 31 | ON r.rate_code_id = f.rate_code_id 32 | INNER JOIN `uber-project-385706.uber_data_engineering.pickup_location_dim` AS pick 33 | ON pick.pickup_location_id = f.pickup_location_id 34 | INNER JOIN `uber-project-385706.uber_data_engineering.dropoff_location_dim` AS drop 35 | ON drop.dropoff_location_id = f.dropoff_location_id 36 | INNER JOIN `uber-project-385706.uber_data_engineering.payment_type_dim` AS pay 37 | ON pay.payment_type_id = f.payment_type_id 38 | ); 39 | 40 | -- ASSIGNMENT QUESTIONS 41 | -- Find the top 10 pickup location based on the number of trips 42 | SELECT 43 | pickup_latitude, 44 | pickup_longitude, 45 | COUNT(*) AS pickup_location_count 46 | FROM `uber-project-385706.uber_data_engineering.analytics_table` 47 | GROUP BY pickup_latitude, pickup_longitude 48 | ORDER BY pickup_location_count DESC 49 | LIMIT 10; 50 | 51 | -- Find the total number of trips by passenger count 52 | SELECT 53 | p.passenger_count, 54 | COUNT(f.trip_id) AS trip_count 55 | FROM `uber-project-385706.uber_data_engineering.fact_table` AS f 56 | LEFT JOIN `uber-project-385706.uber_data_engineering.passenger_count_dim` AS p 57 | ON f.passenger_count_id = p.passenger_count_id 58 | GROUP BY p.passenger_count 59 | ORDER BY trip_count DESC; 60 | 61 | -- Find the 
average fare amount by hour of the day 62 | SELECT 63 | d.pickup_hour, 64 | AVG(f.fare_amount) AS avg_fare 65 | FROM `uber-project-385706.uber_data_engineering.fact_table` AS f 66 | LEFT JOIN `uber-project-385706.uber_data_engineering.datetime_dim` AS d 67 | ON f.datetime_id = d.datetime_id 68 | GROUP BY d.pickup_hour 69 | ORDER BY d.pickup_hour ASC; 70 | -------------------------------------------------------------------------------- /dbt/jaffle shop.md: -------------------------------------------------------------------------------- 1 | # Jaffle Shop 2 | 3 | ## Refactoring SQL for Modularity [Tutorial] 4 | 5 | ### 1. Migrating Legacy Code 1:1 6 | 7 | **Objectives:** 8 | - Transfer your legacy code to your dbt project as a `.sql` file in the models folder. 9 | - Ensure that it can run and build in your data warehouse by running `dbt run`. 10 | - Depending on the system you're migrating between, you may need to adjust the flavour of SQL in your existing code to successfully build the model. 11 | 12 | 13 | In the dbt project, under the models folder, I create a subfolder called `legacy`. Within the `legacy` folder, I create a file called `customer_orders.sql` with the following query provided by the tutorial. 
14 | 15 | _Note: Model and YAML files built for this tutorial are affixed with `_tt`._ 16 | 17 | ```sql 18 | -- customer_orders.sql 19 | WITH paid_orders as (select Orders.ID as order_id, 20 | Orders.USER_ID as customer_id, 21 | Orders.ORDER_DATE AS order_placed_at, 22 | Orders.STATUS AS order_status, 23 | p.total_amount_paid, 24 | p.payment_finalized_date, 25 | C.FIRST_NAME as customer_first_name, 26 | C.LAST_NAME as customer_last_name 27 | FROM raw.jaffle_shop.orders as Orders 28 | left join (select ORDERID as order_id, max(CREATED) as payment_finalized_date, sum(AMOUNT) / 100.0 as total_amount_paid 29 | from raw.stripe.payment 30 | where STATUS <> 'fail' 31 | group by 1) p ON orders.ID = p.order_id 32 | left join raw.jaffle_shop.customers C on orders.USER_ID = C.ID ), 33 | 34 | customer_orders 35 | as (select C.ID as customer_id 36 | , min(ORDER_DATE) as first_order_date 37 | , max(ORDER_DATE) as most_recent_order_date 38 | , count(ORDERS.ID) AS number_of_orders 39 | from raw.jaffle_shop.customers C 40 | left join raw.jaffle_shop.orders as Orders 41 | on orders.USER_ID = C.ID 42 | group by 1) 43 | 44 | select 45 | p.*, 46 | ROW_NUMBER() OVER (ORDER BY p.order_id) as transaction_seq, 47 | ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY p.order_id) as customer_sales_seq, 48 | CASE WHEN c.first_order_date = p.order_placed_at 49 | THEN 'new' 50 | ELSE 'return' END as nvsr, 51 | x.clv_bad as customer_lifetime_value, 52 | c.first_order_date as fdos 53 | FROM paid_orders p 54 | left join customer_orders as c USING (customer_id) 55 | LEFT OUTER JOIN 56 | ( 57 | select 58 | p.order_id, 59 | sum(t2.total_amount_paid) as clv_bad 60 | from paid_orders p 61 | left join paid_orders t2 on p.customer_id = t2.customer_id and p.order_id >= t2.order_id 62 | group by 1 63 | order by p.order_id 64 | ) x on x.order_id = p.order_id 65 | ORDER BY order_id 66 | ``` 67 | 68 | image 69 | 70 | Conduct a `dbt run -m customer_orders` to ensure the model is built in the BigQuery 
warehouse under `dbt_khuang > customer_orders_tt` (i.e., `dbt_khuang.customer_orders_tt`). 71 | 72 | Screenshot 2023-02-13 at 9 50 59 AM 73 | 74 | Under the models folder, create a subfolder called marts. Within the marts folder, create a file called `fct_customer_orders_tt.sql` with the following query. 75 | 76 | ```sql 77 | -- fct_customer_orders_tt.sql 78 | WITH paid_orders as (select Orders.ID as order_id, 79 | Orders.USER_ID as customer_id, 80 | Orders.ORDER_DATE AS order_placed_at, 81 | Orders.STATUS AS order_status, 82 | p.total_amount_paid, 83 | p.payment_finalized_date, 84 | C.FIRST_NAME as customer_first_name, 85 | C.LAST_NAME as customer_last_name 86 | FROM raw.jaffle_shop.orders as Orders 87 | left join (select ORDERID as order_id, max(CREATED) as payment_finalized_date, sum(AMOUNT) / 100.0 as total_amount_paid 88 | from raw.stripe.payment 89 | where STATUS <> 'fail' 90 | group by 1) p ON orders.ID = p.order_id 91 | left join raw.jaffle_shop.customers C on orders.USER_ID = C.ID ), 92 | 93 | customer_orders 94 | as (select C.ID as customer_id 95 | , min(ORDER_DATE) as first_order_date 96 | , max(ORDER_DATE) as most_recent_order_date 97 | , count(ORDERS.ID) AS number_of_orders 98 | from raw.jaffle_shop.customers C 99 | left join raw.jaffle_shop.orders as Orders 100 | on orders.USER_ID = C.ID 101 | group by 1) 102 | 103 | select 104 | p.*, 105 | ROW_NUMBER() OVER (ORDER BY p.order_id) as transaction_seq, 106 | ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY p.order_id) as customer_sales_seq, 107 | CASE WHEN c.first_order_date = p.order_placed_at 108 | THEN 'new' 109 | ELSE 'return' END as nvsr, 110 | x.clv_bad as customer_lifetime_value, 111 | c.first_order_date as fdos 112 | FROM paid_orders p 113 | left join customer_orders as c USING (customer_id) 114 | LEFT OUTER JOIN 115 | ( 116 | select 117 | p.order_id, 118 | sum(t2.total_amount_paid) as clv_bad 119 | from paid_orders p 120 | left join paid_orders t2 on p.customer_id = t2.customer_id and
p.order_id >= t2.order_id 121 | group by 1 122 | order by p.order_id 123 | ) x on x.order_id = p.order_id 124 | ORDER BY order_id 125 | ``` 126 | 127 | image 128 | 129 | Conduct a `dbt run -m fct_customer_orders_tt` to ensure the model is built in the BigQuery warehouse under `dbt_khuang > fct_customer_orders_tt` (i.e., `dbt_khuang.fct_customer_orders_tt`). 130 | 131 | In the root folder, I create a file `packages.yml` to utilise the audit_helper package. I obtained the updated package [here](https://hub.getdbt.com/dbt-labs/audit_helper/latest/). 132 | 133 | ```yml 134 | packages: 135 | - package: dbt-labs/audit_helper 136 | version: 0.7.0 137 | ``` 138 | 139 | image 140 | 141 | image 142 | 143 | Hit the Preview button. The one-line output shows a 100% match between the two models, because they are exactly the same. 144 | 145 | image 146 | 147 | *** 148 | 149 | ### 2 and 3. Implementing Sources and Choosing a Refactoring Strategy 150 | 151 | **Objectives of Implementing Sources:** 152 | - For each of the raw tables referenced in the new model, configure a source to map to those tables. 153 | - Replace all the explicit table references in your query using the source macro. 154 | 155 | **Objectives of Choosing a Refactoring Strategy:** 156 | Decide on your refactoring strategy: 157 | - Refactor on top of the existing model - Create a new branch and refactor directly on the model that you created in the steps above. 158 | - Refactor alongside the existing model - Rename the existing model by prepending it with legacy. Then copy the code into a new file with the original file name. 159 | 160 | In this tutorial, we are selecting the second option. 161 | 162 | Create a subfolder under the models folder called staging. 163 | 164 | Under your models > staging folder, create two subfolders - one for each source schema that the original query pulls from. These subfolders are stripe and jaffle_shop.
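With sources configured, each hard-coded table reference in the query can be swapped for the `{{ source() }}` macro, so dbt resolves the raw tables and tracks them in the DAG. A sketch of the substitution (source and table names follow the raw schemas used above):

```sql
-- before: explicit table references
from raw.jaffle_shop.orders as Orders
left join raw.jaffle_shop.customers C on orders.USER_ID = C.ID

-- after: references mapped through the source macro
from {{ source('jaffle_shop', 'orders') }} as Orders
left join {{ source('jaffle_shop', 'customers') }} C on orders.USER_ID = C.ID
```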
165 | 166 | Then, I create a file under models > staging > jaffle_shop called `src_jaffle_shop.yml`. The other configurations in the file were from the dbt Fundamentals course's project. 167 | 168 | image 169 | 170 | Create a file under models > staging > stripe called `src_stripe.yml`. 171 | 172 | image 173 | 174 | Conduct a `dbt run -m fct_customer_orders` to ensure that your sources are configured properly and your model rebuilds in the warehouse. 175 | 176 | Run `dbt docs generate` and inspect the DAG. 177 | 178 | image 179 | 180 | *** 181 | 182 | ### 4. Cosmetic Cleanups and CTE Groupings 183 | 184 | **Objectives:** 185 | - Create one CTE for each source referenced at the top of your model. 186 | - Reimplement subqueries as CTEs beneath the source CTEs. 187 | - Update code to follow your style guide. 188 | 189 | **Import CTEs at the top** 190 | 191 | 1. Refactor the cosmetics of the `fct_customer_orders.sql` model using the following guidelines: 192 | 193 | - Add whitespace 194 | - No lines over 80 characters 195 | - Lowercase keywords 196 | 197 | 2. Refactor the code to follow this structure: 198 | 199 | ```sql 200 | -- WITH statement 201 | -- Import CTEs 202 | -- Logical CTEs 203 | -- Final CTE 204 | -- Simple Select Statement 205 | ``` 206 | 207 | Steps to achieve this: 208 | a. Add a WITH statement to the top of the `fct_customer_orders` model 209 | b. Add import CTEs after the WITH statement for each source table used in the query 210 | c. Ensure subsequent FROM statements reference the named CTEs instead of {{ source() }}. 211 | 212 | ### 5. Centralizing Transformations and Splitting Up Models 213 | 214 | **Objectives:** 215 | - Structure your SQL into layers of modeling via staging models, intermediate models and final models. 216 | 217 | **Staging models** 218 | - Capture light transformations of source data in staging models, e.g. renaming columns, concatenating fields, converting data types. 219 | - Update aliases with purposeful names, e.g.
from `p` to `paid_customers`. 220 | - Scan for redundant transformations in the code and migrate them into staging models. 221 | - Build dependencies between the existing model and the newly created staging models. 222 | 223 | **CTEs or intermediate models** 224 | - Inspect the grain of the transformations in the latest version of the model and look for opportunities to move filters and aggregations into earlier CTEs. 225 | - Break CTEs into intermediate models if the model is too lengthy or the logic could be reusable. 226 | 227 | **a. Staging Models** 228 | 229 | 1. Create new staging files for each source: 230 | - `stg_jaffle_shop__customers.sql` & `stg_jaffle_shop__orders.sql` under `models > staging > jaffle_shop` 231 | - `stg_stripe__payments.sql` under `models > staging > stripe` 232 | 233 | 2. Make necessary changes in each file: 234 | - Rename `id` fields to entity-specific keys, e.g. `id` becomes `customer_id` 235 | - Make potentially clashing fields more specific, e.g. `status` becomes `order_status` 236 | - Apply rounding or simple transformations, e.g.
change `amount` to `round(amount / 100.0, 2) as payment_amount` 237 | 238 | ```sql 239 | -- models/staging/jaffle_shop/stg_jaffle_shop_customers.sql 240 | with 241 | 242 | source as ( 243 | 244 | select * from {{ source('jaffle_shop', 'customers') }} 245 | 246 | ), 247 | 248 | transformed as ( 249 | 250 | select 251 | 252 | id as customer_id, 253 | last_name as surname, 254 | first_name as givenname, 255 | first_name || ' ' || last_name as full_name 256 | 257 | from source 258 | 259 | ) 260 | 261 | select * from transformed 262 | ``` 263 | 264 | ```sql 265 | -- models/staging/jaffle_shop/stg_jaffle_shop_orders.sql 266 | with 267 | 268 | source as ( 269 | 270 | select * from {{ source('jaffle_shop', 'orders') }} 271 | 272 | ), 273 | 274 | transformed as ( 275 | select 276 | id as order_id, 277 | user_id as customer_id, 278 | status as order_status, 279 | order_date as order_placed_at, 280 | 281 | case 282 | when status not in ('returned', 'return_pending') 283 | then order_date 284 | end as valid_order_date, 285 | 286 | row_number() over ( 287 | partition by user_id 288 | order by order_date, id 289 | ) as user_order_seq 290 | 291 | from source 292 | ) 293 | 294 | select * from transformed 295 | ``` 296 | 297 | ```sql 298 | -- models/staging/stripe/stg_stripe__payments.sql 299 | with 300 | 301 | source as ( 302 | 303 | select * from {{ source('stripe', 'payment') }} 304 | 305 | ), 306 | 307 | transformed as ( 308 | select 309 | id as payment_id, 310 | orderid as order_id, 311 | created as payment_created_at, 312 | status as payment_status, 313 | round(amount/100.0, 2) as payment_amount 314 | 315 | from source 316 | 317 | ) 318 | 319 | select * from transformed 320 | ``` 321 | 322 | 3. Update references in `fct_customer_orders.sql` to point to the new staging models using the `{{ ref('') }}` function. 323 | 324 | 4. Change any column reference from the original column names to the new column names, for example, change `id` to `customer_id`. 325 | 326 | **b.
Intermediate Models / Additional CTEs** 327 | 328 | 1. Create a new intermediate model to store reusable logic. 329 | 330 | 2. Add a new file `int_orders_tt.sql` under the `marts/intermediate` folder. 331 | 332 | ```sql 333 | -- models/marts/intermediate/int_orders_tt.sql 334 | with 335 | 336 | customers as ( 337 | 338 | select * from {{ ref('stg_jaffle_shop_customers') }} 339 | 340 | ), 341 | 342 | orders as ( 343 | 344 | select * from {{ ref('stg_jaffle_shop_orders') }} 345 | 346 | ), 347 | 348 | payments as ( 349 | 350 | select * from {{ ref('stg_stripe__payments') }} 351 | 352 | ), 353 | 354 | completed_payments as ( 355 | 356 | select 357 | 358 | order_id, 359 | max(payment_created_at) as payment_finalized_date, 360 | sum(payment_amount) as total_amount_paid 361 | 362 | from payments 363 | 364 | where payment_status <> 'fail' 365 | group by 1 366 | 367 | ), 368 | 369 | paid_orders as ( 370 | 371 | select 372 | 373 | orders.order_id, 374 | orders.customer_id, 375 | orders.order_placed_at, 376 | orders.order_status, 377 | completed_payments.total_amount_paid, 378 | completed_payments.payment_finalized_date, 379 | customers.givenname, 380 | customers.surname 381 | 382 | from orders 383 | left join completed_payments 384 | on orders.order_id = completed_payments.order_id 385 | 386 | left join customers 387 | on orders.customer_id = customers.customer_id 388 | 389 | ) 390 | 391 | select * from paid_orders 392 | ``` 393 | 394 | 3. Remove the helper comments we added for the course (e.g. `-- Import CTEs`) 395 | 396 | **c. Final Model** 397 | 398 | In the final model, make the ordering of each window function's `order by` subclause explicit. This guards against a latent bug: if one customer places multiple orders on the same day, ordering by date alone is indeterminate, so `order_id` is added as a tie-breaker.
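Concretely, the tie-break shows up in the ranking window used to flag new vs returning customers. A before/after sketch of the clause:

```sql
-- indeterminate when a customer places two orders on the same day
rank() over (partition by customer_id order by order_placed_at)

-- deterministic: order_id breaks the tie
rank() over (partition by customer_id order by order_placed_at, order_id)
```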
399 | 400 | ```sql 401 | -- models/marts/fct_customer_orders_tt.sql 402 | with 403 | 404 | customers as ( 405 | 406 | select * from {{ ref('stg_jaffle_shop_customers') }} 407 | 408 | ), 409 | 410 | paid_orders as ( 411 | 412 | select * from {{ ref('int_orders_tt') }} 413 | 414 | ), 415 | 416 | final as ( 417 | 418 | select 419 | order_id, 420 | customer_id, 421 | order_placed_at, 422 | order_status, 423 | total_amount_paid, 424 | payment_finalized_date, 425 | givenname, 426 | surname, 427 | 428 | -- sales transaction sequence 429 | row_number() over ( 430 | order by order_id) as transaction_seq, 431 | 432 | -- customer sales sequence 433 | row_number() over ( 434 | partition by customer_id 435 | order by order_id) as customer_sales_seq, 436 | 437 | -- new vs returning customers 438 | case when ( 439 | rank() over ( 440 | partition by customer_id 441 | order by order_placed_at, order_id) 442 | ) = 1 443 | then 'new' 444 | else 'return' 445 | end as nvsr, 446 | 447 | -- customer lifetime value 448 | sum(total_amount_paid) over ( 449 | partition by customer_id 450 | order by order_placed_at 451 | ) as customer_lifetime_value, 452 | 453 | -- first day of sales 454 | first_value(order_placed_at) over ( 455 | partition by customer_id 456 | order by order_placed_at 457 | ) as fdos 458 | 459 | from paid_orders 460 | 461 | ) 462 | 463 | -- Simple Select Statement 464 | 465 | select * from final 466 | ``` 467 | 468 | ### 6. Auditing 469 | 470 | **Objectives:** 471 | - Audit your new model against your old query to ensure that none of the changes you implemented changed the results of the modelling. 472 | - The goal is for both the original code and your final model to produce the same results.
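One way to run this audit is with the `audit_helper` package installed earlier. A sketch using its `compare_relations` macro (the analysis file name is hypothetical; the relations follow the models built above, and rows are matched on the `order_id` primary key):

```sql
-- analyses/compare_customer_orders.sql (hypothetical file name)
{{ audit_helper.compare_relations(
    a_relation=ref('customer_orders'),
    b_relation=ref('fct_customer_orders_tt'),
    primary_key='order_id'
) }}
```

Previewing this in the IDE should again report a 100% match once the refactor is complete.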
473 | -------------------------------------------------------------------------------- /notes.md: -------------------------------------------------------------------------------- 1 | # Katie's Notes 2 | 3 | ## Run postgres in Docker 4 | 5 | Reference: https://www.docker.com/blog/how-to-use-the-postgres-docker-official-image/ 6 | 7 | Enter the following docker run command to start a new Postgres instance or container: 8 | 9 | ``` 10 | (base) katiehuang@Katies-MacBook-Air ~ % docker run --name some-postgres -e POSTGRES_PASSWORD=mysecretpassword -d postgres 11 | Unable to find image 'postgres:latest' locally 12 | latest: Pulling from library/postgres 13 | ebc3dc5a2d72: Already exists 14 | 3911649e4bca: Already exists 15 | c4ddc1b927db: Already exists 16 | 34a1b68eb94e: Already exists 17 | 94797b7742f7: Already exists 18 | 7778eb70742d: Already exists 19 | 84a383c97c40: Already exists 20 | 56567f60de78: Already exists 21 | 3663d28ad11d: Pull complete 22 | 1ac7d1542da0: Pull complete 23 | 0abcab7ee629: Pull complete 24 | b4a505fe257b: Pull complete 25 | 63257b07ec0c: Pull complete 26 | Digest: sha256:6cc97262444f1c45171081bc5a1d4c28b883ea46a6e0d1a45a8eac4a7f4767ab 27 | Status: Downloaded newer image for postgres:latest 28 | be1f885325793532ef8e91ab6c50308363fd18a8ab7a2f50956cd54d88e04aa8 29 | (base) katiehuang@Katies-MacBook-Air ~ % 30 | ``` 31 | 32 | ``` 33 | (base) katiehuang@Katies-MacBook-Air ~ % docker ps 34 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 35 | be1f88532579 postgres "docker-entrypoint.s…" 5 minutes ago Up 5 minutes 5432/tcp some-postgres 36 | ``` 37 | 38 | ``` 39 | docker ps -- list running containers 40 | docker images -- list all images locally 41 | docker ps -a -- list running and previously run containers 42 | docker run -- create a new container from an image 43 | 44 | docker run -d -- run the container detached (in the background) 45 | docker run -d -p6000:6379 -- run the container and bind container port 6379 to host port 6000 46 | 47 | docker start -- restart a stopped container 48 | 49 | docker stop -- stop a running container 50 | docker rmi -- remove an image from the local machine 51 | ``` 52 | 53 | Run containers with explicit names 54 | ``` 55 | (base) katiehuang@Katies-MacBook-Air ~ % docker ps 56 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 57 | 6bddb0f95edb redis:4.0 "docker-entrypoint.s…" 3 days ago Up 3 days 0.0.0.0:6001->6379/tcp strange_keller 58 | 916d08075eda redis "docker-entrypoint.s…" 3 days ago Up 3 days 0.0.0.0:6000->6379/tcp modest_kowalevski 59 | 60 | (base) katiehuang@Katies-MacBook-Air ~ % docker run -d -p6001:6379 --name redis-older redis:4.0 61 | 4e5c0fdb2e584ece759e2e167c6794d67dea9af6b31d45f56afaaa7401555222 62 | 63 | (base) katiehuang@Katies-MacBook-Air ~ % docker run -d -p6000:6379 --name redis-latest redis 64 | 45cb9c30e35a2905fd6e2ea34b62346a048e07001b5d15e1abc0df7cae8d2989 65 | 66 | (base) katiehuang@Katies-MacBook-Air ~ % docker ps 67 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 68 | 45cb9c30e35a redis "docker-entrypoint.s…" 3 seconds ago Up 2 seconds 0.0.0.0:6000->6379/tcp redis-latest 69 | 4e5c0fdb2e58 redis:4.0 "docker-entrypoint.s…" 2 minutes ago Up 2 minutes 0.0.0.0:6001->6379/tcp redis-older 70 | ``` 71 | 72 | Open a terminal inside a running container to inspect its logs, configuration, and environment for debugging 73 | ```
(base) katiehuang@Katies-MacBook-Air ~ % docker exec -it 45cb9c30e35a /bin/bash 74 | root@45cb9c30e35a:/data# ls 75 | root@45cb9c30e35a:/data# pwd 76 | /data 77 | root@45cb9c30e35a:/data# cd / 78 | root@45cb9c30e35a:/# ls 79 | bin boot data dev etc home lib media mnt opt proc root run sbin srv sys tmp usr var 80 | root@45cb9c30e35a:/# env 81 | HOSTNAME=45cb9c30e35a 82 | REDIS_DOWNLOAD_SHA=1dee4c6487341cae7bd6432ff7590906522215a061fdef87c7d040a0cb600131 83 | PWD=/ 84 | HOME=/root 85 | REDIS_VERSION=7.0.10 86 | GOSU_VERSION=1.16 87 | TERM=xterm 88 | REDIS_DOWNLOAD_URL=http://download.redis.io/releases/redis-7.0.10.tar.gz 89 | SHLVL=1 90 | PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin 91 |
_=/usr/bin/env 92 | OLDPWD=/data 93 | root@45cb9c30e35a:/# exit 94 | exit 95 | (base) katiehuang@Katies-MacBook-Air ~ % 96 | ``` 97 | 98 | -------------------------------------------------------------------------------- /resources.md: -------------------------------------------------------------------------------- 1 | # Data Engineering Study Resources 2 | 3 | ## Programming Languages 4 | 5 | ### 1. Python 6 | 1. HarvardX: CS50's Introduction to Programming with Python - [link](https://www.edx.org/learn/python/harvard-university-cs50-s-introduction-to-programming-with-python) 7 | 2. Udemy: 100 Days of Code - [link](https://www.udemy.com/course/100-days-of-code) 8 | 3. DataCamp Data Engineer track - [link](https://www.datacamp.com/tracks/data-engineer) 9 | 10 | ### 2. SQL 11 | 12 | Refer to my list of SQL resources under ["Where to Learn SQL"](https://github.com/katiehuangx/Transition-into-Data-Analytics?tab=readme-ov-file#-where-to-learn-sql). 13 | 14 | ## Data Processing and ETL 15 | 16 | ### 1. Orchestration of Data Pipelines 17 | 1. Udemy: The Complete Hands-On Introduction to Apache Airflow - [link](https://www.udemy.com/course/the-complete-hands-on-course-to-master-apache-airflow) 18 | 2. Youtube: Airflow tutorial 1: Introduction to Apache Airflow - [link](https://www.youtube.com/watch?v=AHMm1wfGuHE) 19 | 3. Dagster: Dagster Essentials Course - [link](https://courses.dagster.io/courses/dagster-essentials) 20 | 21 | ## Data Modeling and Warehousing 22 | 23 | ### 1. Dimensional Modeling and Data Warehousing Concepts 24 | 25 | Designing data models optimized for analytical queries (e.g., star schema, snowflake schema) and understanding of data warehouse architecture and design principles. 26 | 27 | 1. Udemy: Data Warehouse Fundamentals for Beginners - [link](https://www.udemy.com/course/data-warehouse-fundamentals-for-beginners/) 28 | 2. freeCodeCamp: Database Design Course - [link](https://www.youtube.com/watch?v=ztHopE5Wnpc) 29 | 3. 
Guru99: Data Warehouse Tutorial for Beginners - [link](https://www.guru99.com/data-warehousing-tutorial.html) 30 | 31 | ### 2. Transformation and Modeling 32 | 33 | Transforming and modeling data within data warehouses using SQL. 34 | 35 | 1. Udemy: The Complete dbt (Data Build Tool) Bootcamp: Zero to Hero - [link](https://www.udemy.com/course/complete-dbt-data-build-tool-bootcamp-zero-to-hero-learn-dbt) 36 | 2. dbt: dbt Fundamentals Courses - [link](https://courses.getdbt.com/collections) 37 | 38 | ## Database Technologies 39 | 40 | ### 1. Relational Databases 41 | 1. Maven Analytics: Advanced MySQL Database Admin - [link](https://courses.mavenanalytics.io/courses/take/advanced-mysql-dba) 42 | 43 | ### 2. Data Warehousing 44 | 1. Snowflake: Hands-on Essentials Track - [link](https://learn.snowflake.com/en/pages/hands-on-essentials-track/) 45 | 2. Databricks: Introduction to Azure Databricks Training Series - [link](https://www.databricks.com/resources/webinar/azure-databricks-free-training-series-track) 46 | 47 | ## Version Control 48 | 1. Udacity: Version Control with Git - [link](https://www.udacity.com/course/version-control-with-git--ud123) 49 | 50 | ## Cloud Platforms 51 | 1. GCP: Data Engineer - [link](https://www.cloudskillsboost.google/paths/16) 52 | 2. GCP: Google Cloud Computing Foundations - [link](https://www.cloudskillsboost.google/paths/36) 53 | 54 | ## Docker 55 | 56 | Containerization technology for packaging applications and dependencies. 57 | 58 | 1. TechWorld with Nana Docker Tutorial for Beginners - [link](https://www.youtube.com/watch?v=3c-iBn73dDE) 59 | 60 | *** 61 | 62 | ## Projects 63 | 64 | 1. Darshil Parmar: End-to-end DE Project [Python, SQL] - links [#1](https://www.youtube.com/watch?v=2xyoz0T47Bs), [#2](https://www.youtube.com/watch?v=K45k-gNNzGo&t=0s), [#3](https://www.youtube.com/watch?v=POjDCe-_G8k&t=426s) 65 | 2. Darshil Parmar: End-to-end Youtube Project [AWS] - [link](https://www.youtube.com/watch?v=yZKJFKu49Dk) 66 | 3. 
Darshil Parmar: DE Project Cloud Series [GCP] - [link](https://www.youtube.com/playlist?list=PLBJe2dFI4sgt-9GR2j-rTeKtimE9pfqyt) 67 | 4. Darshil Parmar: Uber Data Analytics [GCP, Mage, BigQuery, Looker] - [link](https://www.youtube.com/watch?v=WpQECq5Hx9g) 68 | 5. Darshil Parmar: Olympic Data Analytics [Azure] - [link](https://www.youtube.com/watch?v=IaA9YNlg5hM) 69 | 6. Thu Vu: Youtube API for Python [Python] - [link](https://www.youtube.com/watch?v=D56_Cx36oGY) 70 | 7. SQL Database Design Tutorial for Beginners - [link](https://www.youtube.com/watch?v=MiAl2mQ718s) 71 | 72 | 73 | --------------------------------------------------------------------------------