├── README.md
├── Uber Data Pipeline (Fixed Version).ipynb
├── Uber Data Pipeline (Video Version).ipynb
├── analytics_query.sql
├── architecture.jpg
├── commands.txt
├── data
│   └── uber_data.csv
├── data_model.jpeg
└── mage-files
    ├── extract.py
    ├── load.py
    └── transform.py
/README.md:
--------------------------------------------------------------------------------
1 | # Uber Data Analytics | Modern Data Engineering GCP Project
2 |
3 | ## Introduction
4 |
5 | The goal of this project is to perform data analytics on Uber data using various tools and technologies, including GCP Storage, Python, Compute Instance, Mage Data Pipeline Tool, BigQuery, and Looker Studio.
6 |
7 | ## Architecture
8 | ![Architecture](architecture.jpg)
9 |
10 | ## Technology Used
11 | - Programming Language - Python
12 |
13 | Google Cloud Platform
14 | 1. Google Storage
15 | 2. Compute Instance
16 | 3. BigQuery
17 | 4. Looker Studio
18 |
19 | Modern Data Pipeline Tool - https://www.mage.ai/
20 |
21 | Contribute to this open-source project - https://github.com/mage-ai/mage-ai
22 |
23 |
24 | ## Dataset Used
25 | TLC Trip Record Data
26 | Yellow and green taxi trip records include fields capturing pick-up and drop-off dates/times, pick-up and drop-off locations, trip distances, itemized fares, rate types, payment types, and driver-reported passenger counts.
27 |
28 | Here is the dataset used in the video - https://github.com/darshilparmar/uber-etl-pipeline-data-engineering-project/blob/main/data/uber_data.csv
29 |
30 | More info about the dataset can be found here:
31 | 1. Website - https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
32 | 2. Data Dictionary - https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf
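
For a quick local look at the data, here is a minimal pandas sketch; it mirrors the first cells of the notebooks, and the `data/uber_data.csv` path assumes this repo's layout:

```python
import pandas as pd

# Load the sample trip records shipped with this repo
df = pd.read_csv("data/uber_data.csv")

# Pickup/dropoff columns arrive as strings; parse them into datetimes
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

print(df.shape)  # (100000, 19) for the sample file
```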
33 |
34 | ## Data Model
35 | ![Data Model](data_model.jpeg)
36 |
37 | ## Complete Video Tutorial
38 | Video Link - https://youtu.be/WpQECq5Hx9g
39 |
--------------------------------------------------------------------------------
/Uber Data Pipeline (Fixed Version).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "id": "368d2580",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import io\n",
11 | "import pandas as pd\n",
12 | "import requests"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 3,
18 | "id": "500446cc",
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "url = 'https://storage.googleapis.com/uber-data-engineering-project/uber_data.csv'\n",
23 | "response = requests.get(url)"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 4,
29 | "id": "b37a1ace",
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "df = pd.read_csv(io.StringIO(response.text), sep=',')"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 5,
39 | "id": "edc6d495",
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])\n",
44 | "df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 6,
50 | "id": "4d0c8281",
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "df = df.drop_duplicates().reset_index(drop=True)\n",
55 | "df['trip_id'] = df.index"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 7,
61 | "id": "a35f32dd",
62 | "metadata": {},
63 | "outputs": [
64 | {
65 | "data": {
66 | "text/html": [
67 | "
\n",
68 | "\n",
81 | "
\n",
82 | " \n",
83 | " \n",
84 | " | \n",
85 | " VendorID | \n",
86 | " tpep_pickup_datetime | \n",
87 | " tpep_dropoff_datetime | \n",
88 | " passenger_count | \n",
89 | " trip_distance | \n",
90 | " pickup_longitude | \n",
91 | " pickup_latitude | \n",
92 | " RatecodeID | \n",
93 | " store_and_fwd_flag | \n",
94 | " dropoff_longitude | \n",
95 | " dropoff_latitude | \n",
96 | " payment_type | \n",
97 | " fare_amount | \n",
98 | " extra | \n",
99 | " mta_tax | \n",
100 | " tip_amount | \n",
101 | " tolls_amount | \n",
102 | " improvement_surcharge | \n",
103 | " total_amount | \n",
104 | " trip_id | \n",
105 | "
\n",
106 | " \n",
107 | " \n",
108 | " \n",
109 | " 0 | \n",
110 | " 1 | \n",
111 | " 2016-03-01 | \n",
112 | " 2016-03-01 00:07:55 | \n",
113 | " 1 | \n",
114 | " 2.50 | \n",
115 | " -73.976746 | \n",
116 | " 40.765152 | \n",
117 | " 1 | \n",
118 | " N | \n",
119 | " -74.004265 | \n",
120 | " 40.746128 | \n",
121 | " 1 | \n",
122 | " 9.0 | \n",
123 | " 0.5 | \n",
124 | " 0.5 | \n",
125 | " 2.05 | \n",
126 | " 0.00 | \n",
127 | " 0.3 | \n",
128 | " 12.35 | \n",
129 | " 0 | \n",
130 | "
\n",
131 | " \n",
132 | " 1 | \n",
133 | " 1 | \n",
134 | " 2016-03-01 | \n",
135 | " 2016-03-01 00:11:06 | \n",
136 | " 1 | \n",
137 | " 2.90 | \n",
138 | " -73.983482 | \n",
139 | " 40.767925 | \n",
140 | " 1 | \n",
141 | " N | \n",
142 | " -74.005943 | \n",
143 | " 40.733166 | \n",
144 | " 1 | \n",
145 | " 11.0 | \n",
146 | " 0.5 | \n",
147 | " 0.5 | \n",
148 | " 3.05 | \n",
149 | " 0.00 | \n",
150 | " 0.3 | \n",
151 | " 15.35 | \n",
152 | " 1 | \n",
153 | "
\n",
154 | " \n",
155 | " 2 | \n",
156 | " 2 | \n",
157 | " 2016-03-01 | \n",
158 | " 2016-03-01 00:31:06 | \n",
159 | " 2 | \n",
160 | " 19.98 | \n",
161 | " -73.782021 | \n",
162 | " 40.644810 | \n",
163 | " 1 | \n",
164 | " N | \n",
165 | " -73.974541 | \n",
166 | " 40.675770 | \n",
167 | " 1 | \n",
168 | " 54.5 | \n",
169 | " 0.5 | \n",
170 | " 0.5 | \n",
171 | " 8.00 | \n",
172 | " 0.00 | \n",
173 | " 0.3 | \n",
174 | " 63.80 | \n",
175 | " 2 | \n",
176 | "
\n",
177 | " \n",
178 | " 3 | \n",
179 | " 2 | \n",
180 | " 2016-03-01 | \n",
181 | " 2016-03-01 00:00:00 | \n",
182 | " 3 | \n",
183 | " 10.78 | \n",
184 | " -73.863419 | \n",
185 | " 40.769814 | \n",
186 | " 1 | \n",
187 | " N | \n",
188 | " -73.969650 | \n",
189 | " 40.757767 | \n",
190 | " 1 | \n",
191 | " 31.5 | \n",
192 | " 0.0 | \n",
193 | " 0.5 | \n",
194 | " 3.78 | \n",
195 | " 5.54 | \n",
196 | " 0.3 | \n",
197 | " 41.62 | \n",
198 | " 3 | \n",
199 | "
\n",
200 | " \n",
201 | " 4 | \n",
202 | " 2 | \n",
203 | " 2016-03-01 | \n",
204 | " 2016-03-01 00:00:00 | \n",
205 | " 5 | \n",
206 | " 30.43 | \n",
207 | " -73.971741 | \n",
208 | " 40.792183 | \n",
209 | " 3 | \n",
210 | " N | \n",
211 | " -74.177170 | \n",
212 | " 40.695053 | \n",
213 | " 1 | \n",
214 | " 98.0 | \n",
215 | " 0.0 | \n",
216 | " 0.0 | \n",
217 | " 0.00 | \n",
218 | " 15.50 | \n",
219 | " 0.3 | \n",
220 | " 113.80 | \n",
221 | " 4 | \n",
222 | "
\n",
223 | " \n",
224 | "
\n",
225 | "
"
226 | ],
227 | "text/plain": [
228 | " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n",
229 | "0 1 2016-03-01 2016-03-01 00:07:55 1 \n",
230 | "1 1 2016-03-01 2016-03-01 00:11:06 1 \n",
231 | "2 2 2016-03-01 2016-03-01 00:31:06 2 \n",
232 | "3 2 2016-03-01 2016-03-01 00:00:00 3 \n",
233 | "4 2 2016-03-01 2016-03-01 00:00:00 5 \n",
234 | "\n",
235 | " trip_distance pickup_longitude pickup_latitude RatecodeID \\\n",
236 | "0 2.50 -73.976746 40.765152 1 \n",
237 | "1 2.90 -73.983482 40.767925 1 \n",
238 | "2 19.98 -73.782021 40.644810 1 \n",
239 | "3 10.78 -73.863419 40.769814 1 \n",
240 | "4 30.43 -73.971741 40.792183 3 \n",
241 | "\n",
242 | " store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type \\\n",
243 | "0 N -74.004265 40.746128 1 \n",
244 | "1 N -74.005943 40.733166 1 \n",
245 | "2 N -73.974541 40.675770 1 \n",
246 | "3 N -73.969650 40.757767 1 \n",
247 | "4 N -74.177170 40.695053 1 \n",
248 | "\n",
249 | " fare_amount extra mta_tax tip_amount tolls_amount \\\n",
250 | "0 9.0 0.5 0.5 2.05 0.00 \n",
251 | "1 11.0 0.5 0.5 3.05 0.00 \n",
252 | "2 54.5 0.5 0.5 8.00 0.00 \n",
253 | "3 31.5 0.0 0.5 3.78 5.54 \n",
254 | "4 98.0 0.0 0.0 0.00 15.50 \n",
255 | "\n",
256 | " improvement_surcharge total_amount trip_id \n",
257 | "0 0.3 12.35 0 \n",
258 | "1 0.3 15.35 1 \n",
259 | "2 0.3 63.80 2 \n",
260 | "3 0.3 41.62 3 \n",
261 | "4 0.3 113.80 4 "
262 | ]
263 | },
264 | "execution_count": 7,
265 | "metadata": {},
266 | "output_type": "execute_result"
267 | }
268 | ],
269 | "source": [
270 | "df.head()"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 10,
276 | "id": "4a148bc6",
277 | "metadata": {},
278 | "outputs": [
279 | {
280 | "data": {
281 | "text/html": [
282 | "\n",
283 | "\n",
296 | "
\n",
297 | " \n",
298 | " \n",
299 | " | \n",
300 | " datetime_id | \n",
301 | " tpep_pickup_datetime | \n",
302 | " pick_hour | \n",
303 | " pick_day | \n",
304 | " pick_month | \n",
305 | " pick_year | \n",
306 | " pick_weekday | \n",
307 | " tpep_dropoff_datetime | \n",
308 | " drop_hour | \n",
309 | " drop_day | \n",
310 | " drop_month | \n",
311 | " drop_year | \n",
312 | " drop_weekday | \n",
313 | "
\n",
314 | " \n",
315 | " \n",
316 | " \n",
317 | " 0 | \n",
318 | " 0 | \n",
319 | " 2016-03-01 | \n",
320 | " 0 | \n",
321 | " 1 | \n",
322 | " 3 | \n",
323 | " 2016 | \n",
324 | " 1 | \n",
325 | " 2016-03-01 00:07:55 | \n",
326 | " 0 | \n",
327 | " 1 | \n",
328 | " 3 | \n",
329 | " 2016 | \n",
330 | " 1 | \n",
331 | "
\n",
332 | " \n",
333 | " 1 | \n",
334 | " 1 | \n",
335 | " 2016-03-01 | \n",
336 | " 0 | \n",
337 | " 1 | \n",
338 | " 3 | \n",
339 | " 2016 | \n",
340 | " 1 | \n",
341 | " 2016-03-01 00:11:06 | \n",
342 | " 0 | \n",
343 | " 1 | \n",
344 | " 3 | \n",
345 | " 2016 | \n",
346 | " 1 | \n",
347 | "
\n",
348 | " \n",
349 | " 2 | \n",
350 | " 2 | \n",
351 | " 2016-03-01 | \n",
352 | " 0 | \n",
353 | " 1 | \n",
354 | " 3 | \n",
355 | " 2016 | \n",
356 | " 1 | \n",
357 | " 2016-03-01 00:31:06 | \n",
358 | " 0 | \n",
359 | " 1 | \n",
360 | " 3 | \n",
361 | " 2016 | \n",
362 | " 1 | \n",
363 | "
\n",
364 | " \n",
365 | " 3 | \n",
366 | " 3 | \n",
367 | " 2016-03-01 | \n",
368 | " 0 | \n",
369 | " 1 | \n",
370 | " 3 | \n",
371 | " 2016 | \n",
372 | " 1 | \n",
373 | " 2016-03-01 00:00:00 | \n",
374 | " 0 | \n",
375 | " 1 | \n",
376 | " 3 | \n",
377 | " 2016 | \n",
378 | " 1 | \n",
379 | "
\n",
380 | " \n",
381 | " 4 | \n",
382 | " 4 | \n",
383 | " 2016-03-01 | \n",
384 | " 0 | \n",
385 | " 1 | \n",
386 | " 3 | \n",
387 | " 2016 | \n",
388 | " 1 | \n",
389 | " 2016-03-01 00:00:00 | \n",
390 | " 0 | \n",
391 | " 1 | \n",
392 | " 3 | \n",
393 | " 2016 | \n",
394 | " 1 | \n",
395 | "
\n",
396 | " \n",
397 | "
\n",
398 | "
"
399 | ],
400 | "text/plain": [
401 | " datetime_id tpep_pickup_datetime pick_hour pick_day pick_month \\\n",
402 | "0 0 2016-03-01 0 1 3 \n",
403 | "1 1 2016-03-01 0 1 3 \n",
404 | "2 2 2016-03-01 0 1 3 \n",
405 | "3 3 2016-03-01 0 1 3 \n",
406 | "4 4 2016-03-01 0 1 3 \n",
407 | "\n",
408 | " pick_year pick_weekday tpep_dropoff_datetime drop_hour drop_day \\\n",
409 | "0 2016 1 2016-03-01 00:07:55 0 1 \n",
410 | "1 2016 1 2016-03-01 00:11:06 0 1 \n",
411 | "2 2016 1 2016-03-01 00:31:06 0 1 \n",
412 | "3 2016 1 2016-03-01 00:00:00 0 1 \n",
413 | "4 2016 1 2016-03-01 00:00:00 0 1 \n",
414 | "\n",
415 | " drop_month drop_year drop_weekday \n",
416 | "0 3 2016 1 \n",
417 | "1 3 2016 1 \n",
418 | "2 3 2016 1 \n",
419 | "3 3 2016 1 \n",
420 | "4 3 2016 1 "
421 | ]
422 | },
423 | "execution_count": 10,
424 | "metadata": {},
425 | "output_type": "execute_result"
426 | }
427 | ],
428 | "source": [
429 | "datetime_dim = df[['tpep_pickup_datetime','tpep_dropoff_datetime']].reset_index(drop=True)\n",
430 | "datetime_dim['tpep_pickup_datetime'] = datetime_dim['tpep_pickup_datetime']\n",
431 | "datetime_dim['pick_hour'] = datetime_dim['tpep_pickup_datetime'].dt.hour\n",
432 | "datetime_dim['pick_day'] = datetime_dim['tpep_pickup_datetime'].dt.day\n",
433 | "datetime_dim['pick_month'] = datetime_dim['tpep_pickup_datetime'].dt.month\n",
434 | "datetime_dim['pick_year'] = datetime_dim['tpep_pickup_datetime'].dt.year\n",
435 | "datetime_dim['pick_weekday'] = datetime_dim['tpep_pickup_datetime'].dt.weekday\n",
436 | "\n",
437 | "datetime_dim['tpep_dropoff_datetime'] = datetime_dim['tpep_dropoff_datetime']\n",
438 | "datetime_dim['drop_hour'] = datetime_dim['tpep_dropoff_datetime'].dt.hour\n",
439 | "datetime_dim['drop_day'] = datetime_dim['tpep_dropoff_datetime'].dt.day\n",
440 | "datetime_dim['drop_month'] = datetime_dim['tpep_dropoff_datetime'].dt.month\n",
441 | "datetime_dim['drop_year'] = datetime_dim['tpep_dropoff_datetime'].dt.year\n",
442 | "datetime_dim['drop_weekday'] = datetime_dim['tpep_dropoff_datetime'].dt.weekday\n",
443 | "\n",
444 | "\n",
445 | "datetime_dim['datetime_id'] = datetime_dim.index\n",
446 | "\n",
447 | "# datetime_dim = datetime_dim.rename(columns={'tpep_pickup_datetime': 'datetime_id'}).reset_index(drop=True)\n",
448 | "datetime_dim = datetime_dim[['datetime_id', 'tpep_pickup_datetime', 'pick_hour', 'pick_day', 'pick_month', 'pick_year', 'pick_weekday',\n",
449 | " 'tpep_dropoff_datetime', 'drop_hour', 'drop_day', 'drop_month', 'drop_year', 'drop_weekday']]\n",
450 | "#\n",
451 | "datetime_dim.head()"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": 11,
457 | "id": "ba67912f",
458 | "metadata": {},
459 | "outputs": [],
460 | "source": [
461 | "passenger_count_dim = df[['passenger_count']].reset_index(drop=True)\n",
462 | "passenger_count_dim['passenger_count_id'] = passenger_count_dim.index\n",
463 | "passenger_count_dim = passenger_count_dim[['passenger_count_id','passenger_count']]\n",
464 | "\n",
465 | "trip_distance_dim = df[['trip_distance']].reset_index(drop=True)\n",
466 | "trip_distance_dim['trip_distance_id'] = trip_distance_dim.index\n",
467 | "trip_distance_dim = trip_distance_dim[['trip_distance_id','trip_distance']]\n"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": 12,
473 | "id": "fb7c9704",
474 | "metadata": {},
475 | "outputs": [],
476 | "source": [
477 | "rate_code_type = {\n",
478 | " 1:\"Standard rate\",\n",
479 | " 2:\"JFK\",\n",
480 | " 3:\"Newark\",\n",
481 | " 4:\"Nassau or Westchester\",\n",
482 | " 5:\"Negotiated fare\",\n",
483 | " 6:\"Group ride\"\n",
484 | "}\n",
485 | "\n",
486 | "rate_code_dim = df[['RatecodeID']].reset_index(drop=True)\n",
487 | "rate_code_dim['rate_code_id'] = rate_code_dim.index\n",
488 | "rate_code_dim['rate_code_name'] = rate_code_dim['RatecodeID'].map(rate_code_type)\n",
489 | "rate_code_dim = rate_code_dim[['rate_code_id','RatecodeID','rate_code_name']]\n",
490 | "\n",
491 | "# rate_code_dim.head()"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": 44,
497 | "id": "4db826a1",
498 | "metadata": {},
499 | "outputs": [
500 | {
501 | "data": {
502 | "text/html": [
503 | "\n",
504 | "\n",
517 | "
\n",
518 | " \n",
519 | " \n",
520 | " | \n",
521 | " rate_code_id | \n",
522 | " RatecodeID | \n",
523 | " rate_code_name | \n",
524 | "
\n",
525 | " \n",
526 | " \n",
527 | " \n",
528 | " 0 | \n",
529 | " 0 | \n",
530 | " 1 | \n",
531 | " Standard rate | \n",
532 | "
\n",
533 | " \n",
534 | " 1 | \n",
535 | " 1 | \n",
536 | " 3 | \n",
537 | " Newark | \n",
538 | "
\n",
539 | " \n",
540 | " 2 | \n",
541 | " 2 | \n",
542 | " 2 | \n",
543 | " JFK | \n",
544 | "
\n",
545 | " \n",
546 | " 3 | \n",
547 | " 3 | \n",
548 | " 5 | \n",
549 | " Negotiated fare | \n",
550 | "
\n",
551 | " \n",
552 | " 4 | \n",
553 | " 4 | \n",
554 | " 4 | \n",
555 | " Nassau or Westchester | \n",
556 | "
\n",
557 | " \n",
558 | "
\n",
559 | "
"
560 | ],
561 | "text/plain": [
562 | " rate_code_id RatecodeID rate_code_name\n",
563 | "0 0 1 Standard rate\n",
564 | "1 1 3 Newark\n",
565 | "2 2 2 JFK\n",
566 | "3 3 5 Negotiated fare\n",
567 | "4 4 4 Nassau or Westchester"
568 | ]
569 | },
570 | "execution_count": 44,
571 | "metadata": {},
572 | "output_type": "execute_result"
573 | }
574 | ],
575 | "source": [
576 | "rate_code_dim.head()"
577 | ]
578 | },
579 | {
580 | "cell_type": "code",
581 | "execution_count": 13,
582 | "id": "8048bdef",
583 | "metadata": {},
584 | "outputs": [],
585 | "source": [
586 | "pickup_location_dim = df[['pickup_longitude', 'pickup_latitude']].reset_index(drop=True)\n",
587 | "pickup_location_dim['pickup_location_id'] = pickup_location_dim.index\n",
588 | "pickup_location_dim = pickup_location_dim[['pickup_location_id','pickup_latitude','pickup_longitude']] \n",
589 | "\n",
590 | "\n",
591 | "dropoff_location_dim = df[['dropoff_longitude', 'dropoff_latitude']].reset_index(drop=True)\n",
592 | "dropoff_location_dim['dropoff_location_id'] = dropoff_location_dim.index\n",
593 | "dropoff_location_dim = dropoff_location_dim[['dropoff_location_id','dropoff_latitude','dropoff_longitude']]"
594 | ]
595 | },
596 | {
597 | "cell_type": "code",
598 | "execution_count": 14,
599 | "id": "bfb04993",
600 | "metadata": {},
601 | "outputs": [],
602 | "source": [
603 | "payment_type_name = {\n",
604 | " 1:\"Credit card\",\n",
605 | " 2:\"Cash\",\n",
606 | " 3:\"No charge\",\n",
607 | " 4:\"Dispute\",\n",
608 | " 5:\"Unknown\",\n",
609 | " 6:\"Voided trip\"\n",
610 | "}\n",
611 | "payment_type_dim = df[['payment_type']].reset_index(drop=True)\n",
612 | "payment_type_dim['payment_type_id'] = payment_type_dim.index\n",
613 | "payment_type_dim['payment_type_name'] = payment_type_dim['payment_type'].map(payment_type_name)\n",
614 | "payment_type_dim = payment_type_dim[['payment_type_id','payment_type','payment_type_name']]"
615 | ]
616 | },
617 | {
618 | "cell_type": "code",
619 | "execution_count": 15,
620 | "id": "e747865b",
621 | "metadata": {},
622 | "outputs": [],
623 | "source": [
624 | "\n",
625 | "fact_table = df.merge(passenger_count_dim, left_on='trip_id', right_on='passenger_count_id') \\\n",
626 | " .merge(trip_distance_dim, left_on='trip_id', right_on='trip_distance_id') \\\n",
627 | " .merge(rate_code_dim, left_on='trip_id', right_on='rate_code_id') \\\n",
628 | " .merge(pickup_location_dim, left_on='trip_id', right_on='pickup_location_id') \\\n",
629 | " .merge(dropoff_location_dim, left_on='trip_id', right_on='dropoff_location_id')\\\n",
630 | " .merge(datetime_dim, left_on='trip_id', right_on='datetime_id') \\\n",
631 | " .merge(payment_type_dim, left_on='trip_id', right_on='payment_type_id') \\\n",
632 | " [['trip_id','VendorID', 'datetime_id', 'passenger_count_id',\n",
633 | " 'trip_distance_id', 'rate_code_id', 'store_and_fwd_flag', 'pickup_location_id', 'dropoff_location_id',\n",
634 | " 'payment_type_id', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',\n",
635 | " 'improvement_surcharge', 'total_amount']]"
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": 16,
641 | "id": "9acdb41a",
642 | "metadata": {},
643 | "outputs": [
644 | {
645 | "data": {
646 | "text/plain": [
647 | "Index(['payment_type_id', 'payment_type', 'payment_type_name'], dtype='object')"
648 | ]
649 | },
650 | "execution_count": 16,
651 | "metadata": {},
652 | "output_type": "execute_result"
653 | }
654 | ],
655 | "source": [
656 | "payment_type_dim.columns"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": 17,
662 | "id": "62e05477",
663 | "metadata": {},
664 | "outputs": [
665 | {
666 | "data": {
667 | "text/plain": [
668 | "Index(['trip_id', 'VendorID', 'datetime_id', 'passenger_count_id',\n",
669 | " 'trip_distance_id', 'rate_code_id', 'store_and_fwd_flag',\n",
670 | " 'pickup_location_id', 'dropoff_location_id', 'payment_type_id',\n",
671 | " 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',\n",
672 | " 'improvement_surcharge', 'total_amount'],\n",
673 | " dtype='object')"
674 | ]
675 | },
676 | "execution_count": 17,
677 | "metadata": {},
678 | "output_type": "execute_result"
679 | }
680 | ],
681 | "source": [
682 | "fact_table.columns"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": null,
688 | "id": "6162ee05",
689 | "metadata": {},
690 | "outputs": [],
691 | "source": []
692 | },
693 | {
694 | "cell_type": "code",
695 | "execution_count": 18,
696 | "id": "254ff9ad",
697 | "metadata": {},
698 | "outputs": [
699 | {
700 | "data": {
701 | "text/html": [
702 | "\n",
703 | "\n",
716 | "
\n",
717 | " \n",
718 | " \n",
719 | " | \n",
720 | " trip_id | \n",
721 | " VendorID | \n",
722 | " datetime_id | \n",
723 | " passenger_count_id | \n",
724 | " trip_distance_id | \n",
725 | " rate_code_id | \n",
726 | " store_and_fwd_flag | \n",
727 | " pickup_location_id | \n",
728 | " dropoff_location_id | \n",
729 | " payment_type_id | \n",
730 | " fare_amount | \n",
731 | " extra | \n",
732 | " mta_tax | \n",
733 | " tip_amount | \n",
734 | " tolls_amount | \n",
735 | " improvement_surcharge | \n",
736 | " total_amount | \n",
737 | "
\n",
738 | " \n",
739 | " \n",
740 | " \n",
741 | " 0 | \n",
742 | " 0 | \n",
743 | " 1 | \n",
744 | " 0 | \n",
745 | " 0 | \n",
746 | " 0 | \n",
747 | " 0 | \n",
748 | " N | \n",
749 | " 0 | \n",
750 | " 0 | \n",
751 | " 0 | \n",
752 | " 9.0 | \n",
753 | " 0.5 | \n",
754 | " 0.5 | \n",
755 | " 2.05 | \n",
756 | " 0.00 | \n",
757 | " 0.3 | \n",
758 | " 12.35 | \n",
759 | "
\n",
760 | " \n",
761 | " 1 | \n",
762 | " 1 | \n",
763 | " 1 | \n",
764 | " 1 | \n",
765 | " 1 | \n",
766 | " 1 | \n",
767 | " 1 | \n",
768 | " N | \n",
769 | " 1 | \n",
770 | " 1 | \n",
771 | " 1 | \n",
772 | " 11.0 | \n",
773 | " 0.5 | \n",
774 | " 0.5 | \n",
775 | " 3.05 | \n",
776 | " 0.00 | \n",
777 | " 0.3 | \n",
778 | " 15.35 | \n",
779 | "
\n",
780 | " \n",
781 | " 2 | \n",
782 | " 2 | \n",
783 | " 2 | \n",
784 | " 2 | \n",
785 | " 2 | \n",
786 | " 2 | \n",
787 | " 2 | \n",
788 | " N | \n",
789 | " 2 | \n",
790 | " 2 | \n",
791 | " 2 | \n",
792 | " 54.5 | \n",
793 | " 0.5 | \n",
794 | " 0.5 | \n",
795 | " 8.00 | \n",
796 | " 0.00 | \n",
797 | " 0.3 | \n",
798 | " 63.80 | \n",
799 | "
\n",
800 | " \n",
801 | " 3 | \n",
802 | " 3 | \n",
803 | " 2 | \n",
804 | " 3 | \n",
805 | " 3 | \n",
806 | " 3 | \n",
807 | " 3 | \n",
808 | " N | \n",
809 | " 3 | \n",
810 | " 3 | \n",
811 | " 3 | \n",
812 | " 31.5 | \n",
813 | " 0.0 | \n",
814 | " 0.5 | \n",
815 | " 3.78 | \n",
816 | " 5.54 | \n",
817 | " 0.3 | \n",
818 | " 41.62 | \n",
819 | "
\n",
820 | " \n",
821 | " 4 | \n",
822 | " 4 | \n",
823 | " 2 | \n",
824 | " 4 | \n",
825 | " 4 | \n",
826 | " 4 | \n",
827 | " 4 | \n",
828 | " N | \n",
829 | " 4 | \n",
830 | " 4 | \n",
831 | " 4 | \n",
832 | " 98.0 | \n",
833 | " 0.0 | \n",
834 | " 0.0 | \n",
835 | " 0.00 | \n",
836 | " 15.50 | \n",
837 | " 0.3 | \n",
838 | " 113.80 | \n",
839 | "
\n",
840 | " \n",
841 | " ... | \n",
842 | " ... | \n",
843 | " ... | \n",
844 | " ... | \n",
845 | " ... | \n",
846 | " ... | \n",
847 | " ... | \n",
848 | " ... | \n",
849 | " ... | \n",
850 | " ... | \n",
851 | " ... | \n",
852 | " ... | \n",
853 | " ... | \n",
854 | " ... | \n",
855 | " ... | \n",
856 | " ... | \n",
857 | " ... | \n",
858 | " ... | \n",
859 | "
\n",
860 | " \n",
861 | " 99995 | \n",
862 | " 99995 | \n",
863 | " 1 | \n",
864 | " 99995 | \n",
865 | " 99995 | \n",
866 | " 99995 | \n",
867 | " 99995 | \n",
868 | " N | \n",
869 | " 99995 | \n",
870 | " 99995 | \n",
871 | " 99995 | \n",
872 | " 5.0 | \n",
873 | " 0.0 | \n",
874 | " 0.5 | \n",
875 | " 0.00 | \n",
876 | " 0.00 | \n",
877 | " 0.3 | \n",
878 | " 5.80 | \n",
879 | "
\n",
880 | " \n",
881 | " 99996 | \n",
882 | " 99996 | \n",
883 | " 1 | \n",
884 | " 99996 | \n",
885 | " 99996 | \n",
886 | " 99996 | \n",
887 | " 99996 | \n",
888 | " N | \n",
889 | " 99996 | \n",
890 | " 99996 | \n",
891 | " 99996 | \n",
892 | " 14.0 | \n",
893 | " 0.0 | \n",
894 | " 0.5 | \n",
895 | " 2.00 | \n",
896 | " 0.00 | \n",
897 | " 0.3 | \n",
898 | " 16.80 | \n",
899 | "
\n",
900 | " \n",
901 | " 99997 | \n",
902 | " 99997 | \n",
903 | " 1 | \n",
904 | " 99997 | \n",
905 | " 99997 | \n",
906 | " 99997 | \n",
907 | " 99997 | \n",
908 | " N | \n",
909 | " 99997 | \n",
910 | " 99997 | \n",
911 | " 99997 | \n",
912 | " 29.0 | \n",
913 | " 0.0 | \n",
914 | " 0.5 | \n",
915 | " 8.80 | \n",
916 | " 5.54 | \n",
917 | " 0.3 | \n",
918 | " 44.14 | \n",
919 | "
\n",
920 | " \n",
921 | " 99998 | \n",
922 | " 99998 | \n",
923 | " 2 | \n",
924 | " 99998 | \n",
925 | " 99998 | \n",
926 | " 99998 | \n",
927 | " 99998 | \n",
928 | " N | \n",
929 | " 99998 | \n",
930 | " 99998 | \n",
931 | " 99998 | \n",
932 | " 5.5 | \n",
933 | " 0.5 | \n",
934 | " 0.5 | \n",
935 | " 1.36 | \n",
936 | " 0.00 | \n",
937 | " 0.3 | \n",
938 | " 8.16 | \n",
939 | "
\n",
940 | " \n",
941 | " 99999 | \n",
942 | " 99999 | \n",
943 | " 1 | \n",
944 | " 99999 | \n",
945 | " 99999 | \n",
946 | " 99999 | \n",
947 | " 99999 | \n",
948 | " N | \n",
949 | " 99999 | \n",
950 | " 99999 | \n",
951 | " 99999 | \n",
952 | " 6.0 | \n",
953 | " 0.0 | \n",
954 | " 0.5 | \n",
955 | " 0.00 | \n",
956 | " 0.00 | \n",
957 | " 0.3 | \n",
958 | " 6.80 | \n",
959 | "
\n",
960 | " \n",
961 | "
\n",
962 | "
100000 rows × 17 columns
\n",
963 | "
"
964 | ],
965 | "text/plain": [
966 | " trip_id VendorID datetime_id passenger_count_id trip_distance_id \\\n",
967 | "0 0 1 0 0 0 \n",
968 | "1 1 1 1 1 1 \n",
969 | "2 2 2 2 2 2 \n",
970 | "3 3 2 3 3 3 \n",
971 | "4 4 2 4 4 4 \n",
972 | "... ... ... ... ... ... \n",
973 | "99995 99995 1 99995 99995 99995 \n",
974 | "99996 99996 1 99996 99996 99996 \n",
975 | "99997 99997 1 99997 99997 99997 \n",
976 | "99998 99998 2 99998 99998 99998 \n",
977 | "99999 99999 1 99999 99999 99999 \n",
978 | "\n",
979 | " rate_code_id store_and_fwd_flag pickup_location_id \\\n",
980 | "0 0 N 0 \n",
981 | "1 1 N 1 \n",
982 | "2 2 N 2 \n",
983 | "3 3 N 3 \n",
984 | "4 4 N 4 \n",
985 | "... ... ... ... \n",
986 | "99995 99995 N 99995 \n",
987 | "99996 99996 N 99996 \n",
988 | "99997 99997 N 99997 \n",
989 | "99998 99998 N 99998 \n",
990 | "99999 99999 N 99999 \n",
991 | "\n",
992 | " dropoff_location_id payment_type_id fare_amount extra mta_tax \\\n",
993 | "0 0 0 9.0 0.5 0.5 \n",
994 | "1 1 1 11.0 0.5 0.5 \n",
995 | "2 2 2 54.5 0.5 0.5 \n",
996 | "3 3 3 31.5 0.0 0.5 \n",
997 | "4 4 4 98.0 0.0 0.0 \n",
998 | "... ... ... ... ... ... \n",
999 | "99995 99995 99995 5.0 0.0 0.5 \n",
1000 | "99996 99996 99996 14.0 0.0 0.5 \n",
1001 | "99997 99997 99997 29.0 0.0 0.5 \n",
1002 | "99998 99998 99998 5.5 0.5 0.5 \n",
1003 | "99999 99999 99999 6.0 0.0 0.5 \n",
1004 | "\n",
1005 | " tip_amount tolls_amount improvement_surcharge total_amount \n",
1006 | "0 2.05 0.00 0.3 12.35 \n",
1007 | "1 3.05 0.00 0.3 15.35 \n",
1008 | "2 8.00 0.00 0.3 63.80 \n",
1009 | "3 3.78 5.54 0.3 41.62 \n",
1010 | "4 0.00 15.50 0.3 113.80 \n",
1011 | "... ... ... ... ... \n",
1012 | "99995 0.00 0.00 0.3 5.80 \n",
1013 | "99996 2.00 0.00 0.3 16.80 \n",
1014 | "99997 8.80 5.54 0.3 44.14 \n",
1015 | "99998 1.36 0.00 0.3 8.16 \n",
1016 | "99999 0.00 0.00 0.3 6.80 \n",
1017 | "\n",
1018 | "[100000 rows x 17 columns]"
1019 | ]
1020 | },
1021 | "execution_count": 18,
1022 | "metadata": {},
1023 | "output_type": "execute_result"
1024 | }
1025 | ],
1026 | "source": [
1027 | "fact_table"
1028 | ]
1029 | },
1030 | {
1031 | "cell_type": "code",
1032 | "execution_count": null,
1033 | "id": "a45c58d1",
1034 | "metadata": {},
1035 | "outputs": [],
1036 | "source": [
1037 | "# CREATE OR REPLACE TABLE `data-with-darshil.uber_dataset.tbl_analysis_report` AS (\n",
1038 | "# SELECT\n",
1039 | "# f.VendorID,\n",
1040 | "# f.tpep_pickup_datetime,\n",
1041 | "# f.tpep_dropoff_datetime,\n",
1042 | "# p.passenger_count,\n",
1043 | "# td.trip_distance,\n",
1044 | "# rc.RatecodeID,\n",
1045 | "# f.store_and_fwd_flag,\n",
1046 | "# pl.pickup_latitude,\n",
1047 | "# pl.pickup_longitude,\n",
1048 | "# dl.dropoff_latitude,\n",
1049 | "# dl.dropoff_longitude,\n",
1050 | "# pt.payment_type,\n",
1051 | "# f.fare_amount,\n",
1052 | "# f.extra,\n",
1053 | "# f.mta_tax,\n",
1054 | "# f.tip_amount,\n",
1055 | "# f.tolls_amount,\n",
1056 | "# f.improvement_surcharge,\n",
1057 | "# f.total_amount\n",
1058 | "# FROM\n",
1059 | "# `data-with-darshil.uber_dataset.fact_table` f\n",
1060 | "# JOIN `data-with-darshil.uber_dataset.passenger_count_dim` p ON f.passenger_count_id = p.passenger_count_id\n",
1061 | "# JOIN `data-with-darshil.uber_dataset.trip_distance_dim` td ON f.trip_distance_id = td.trip_distance_id\n",
1062 | "# JOIN `data-with-darshil.uber_dataset.rate_code_dim` rc ON f.rate_code_id = rc.rate_code_id\n",
1063 | "# JOIN `data-with-darshil.uber_dataset.pickup_location_dim` pl ON f.pickup_location_id = pl.pickup_location_id\n",
1064 | "# JOIN `data-with-darshil.uber_dataset.dropoff_location_dim` dl ON f.dropoff_location_id = dl.dropoff_location_id\n",
1065 | "# JOIN `data-with-darshil.uber_dataset.payment_type_dim` pt ON f.payment_type_id = pt.payment_type_id);"
1066 | ]
1067 | }
1068 | ],
1069 | "metadata": {
1070 | "kernelspec": {
1071 | "display_name": "Python 3 (ipykernel)",
1072 | "language": "python",
1073 | "name": "python3"
1074 | },
1075 | "language_info": {
1076 | "codemirror_mode": {
1077 | "name": "ipython",
1078 | "version": 3
1079 | },
1080 | "file_extension": ".py",
1081 | "mimetype": "text/x-python",
1082 | "name": "python",
1083 | "nbconvert_exporter": "python",
1084 | "pygments_lexer": "ipython3",
1085 | "version": "3.10.6"
1086 | }
1087 | },
1088 | "nbformat": 4,
1089 | "nbformat_minor": 5
1090 | }
1091 |
--------------------------------------------------------------------------------
/Uber Data Pipeline (Video Version).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "9fc22827",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "id": "7a84de07",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "df = pd.read_csv(\"data/uber_data.csv\")"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 4,
26 | "id": "7f70bb49",
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "data": {
31 | "text/html": [
32 | "\n",
33 | "\n",
46 | "
\n",
47 | " \n",
48 | " \n",
49 | " | \n",
50 | " VendorID | \n",
51 | " tpep_pickup_datetime | \n",
52 | " tpep_dropoff_datetime | \n",
53 | " passenger_count | \n",
54 | " trip_distance | \n",
55 | " pickup_longitude | \n",
56 | " pickup_latitude | \n",
57 | " RatecodeID | \n",
58 | " store_and_fwd_flag | \n",
59 | " dropoff_longitude | \n",
60 | " dropoff_latitude | \n",
61 | " payment_type | \n",
62 | " fare_amount | \n",
63 | " extra | \n",
64 | " mta_tax | \n",
65 | " tip_amount | \n",
66 | " tolls_amount | \n",
67 | " improvement_surcharge | \n",
68 | " total_amount | \n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " \n",
73 | " 0 | \n",
74 | " 1 | \n",
75 | " 2016-03-01 00:00:00 | \n",
76 | " 2016-03-01 00:07:55 | \n",
77 | " 1 | \n",
78 | " 2.50 | \n",
79 | " -73.976746 | \n",
80 | " 40.765152 | \n",
81 | " 1 | \n",
82 | " N | \n",
83 | " -74.004265 | \n",
84 | " 40.746128 | \n",
85 | " 1 | \n",
86 | " 9.0 | \n",
87 | " 0.5 | \n",
88 | " 0.5 | \n",
89 | " 2.05 | \n",
90 | " 0.00 | \n",
91 | " 0.3 | \n",
92 | " 12.35 | \n",
93 | "
\n",
94 | " \n",
95 | " 1 | \n",
96 | " 1 | \n",
97 | " 2016-03-01 00:00:00 | \n",
98 | " 2016-03-01 00:11:06 | \n",
99 | " 1 | \n",
100 | " 2.90 | \n",
101 | " -73.983482 | \n",
102 | " 40.767925 | \n",
103 | " 1 | \n",
104 | " N | \n",
105 | " -74.005943 | \n",
106 | " 40.733166 | \n",
107 | " 1 | \n",
108 | " 11.0 | \n",
109 | " 0.5 | \n",
110 | " 0.5 | \n",
111 | " 3.05 | \n",
112 | " 0.00 | \n",
113 | " 0.3 | \n",
114 | " 15.35 | \n",
115 | "
\n",
116 | " \n",
117 | " 2 | \n",
118 | " 2 | \n",
119 | " 2016-03-01 00:00:00 | \n",
120 | " 2016-03-01 00:31:06 | \n",
121 | " 2 | \n",
122 | " 19.98 | \n",
123 | " -73.782021 | \n",
124 | " 40.644810 | \n",
125 | " 1 | \n",
126 | " N | \n",
127 | " -73.974541 | \n",
128 | " 40.675770 | \n",
129 | " 1 | \n",
130 | " 54.5 | \n",
131 | " 0.5 | \n",
132 | " 0.5 | \n",
133 | " 8.00 | \n",
134 | " 0.00 | \n",
135 | " 0.3 | \n",
136 | " 63.80 | \n",
137 | "
\n",
138 | " \n",
139 | " 3 | \n",
140 | " 2 | \n",
141 | " 2016-03-01 00:00:00 | \n",
142 | " 2016-03-01 00:00:00 | \n",
143 | " 3 | \n",
144 | " 10.78 | \n",
145 | " -73.863419 | \n",
146 | " 40.769814 | \n",
147 | " 1 | \n",
148 | " N | \n",
149 | " -73.969650 | \n",
150 | " 40.757767 | \n",
151 | " 1 | \n",
152 | " 31.5 | \n",
153 | " 0.0 | \n",
154 | " 0.5 | \n",
155 | " 3.78 | \n",
156 | " 5.54 | \n",
157 | " 0.3 | \n",
158 | " 41.62 | \n",
159 | "
\n",
160 | " \n",
161 | " 4 | \n",
162 | " 2 | \n",
163 | " 2016-03-01 00:00:00 | \n",
164 | " 2016-03-01 00:00:00 | \n",
165 | " 5 | \n",
166 | " 30.43 | \n",
167 | " -73.971741 | \n",
168 | " 40.792183 | \n",
169 | " 3 | \n",
170 | " N | \n",
171 | " -74.177170 | \n",
172 | " 40.695053 | \n",
173 | " 1 | \n",
174 | " 98.0 | \n",
175 | " 0.0 | \n",
176 | " 0.0 | \n",
177 | " 0.00 | \n",
178 | " 15.50 | \n",
179 | " 0.3 | \n",
180 | " 113.80 | \n",
181 | "
\n",
182 | " \n",
183 | "
\n",
184 | "
"
185 | ],
186 | "text/plain": [
187 | " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n",
188 | "0 1 2016-03-01 00:00:00 2016-03-01 00:07:55 1 \n",
189 | "1 1 2016-03-01 00:00:00 2016-03-01 00:11:06 1 \n",
190 | "2 2 2016-03-01 00:00:00 2016-03-01 00:31:06 2 \n",
191 | "3 2 2016-03-01 00:00:00 2016-03-01 00:00:00 3 \n",
192 | "4 2 2016-03-01 00:00:00 2016-03-01 00:00:00 5 \n",
193 | "\n",
194 | " trip_distance pickup_longitude pickup_latitude RatecodeID \\\n",
195 | "0 2.50 -73.976746 40.765152 1 \n",
196 | "1 2.90 -73.983482 40.767925 1 \n",
197 | "2 19.98 -73.782021 40.644810 1 \n",
198 | "3 10.78 -73.863419 40.769814 1 \n",
199 | "4 30.43 -73.971741 40.792183 3 \n",
200 | "\n",
201 | " store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type \\\n",
202 | "0 N -74.004265 40.746128 1 \n",
203 | "1 N -74.005943 40.733166 1 \n",
204 | "2 N -73.974541 40.675770 1 \n",
205 | "3 N -73.969650 40.757767 1 \n",
206 | "4 N -74.177170 40.695053 1 \n",
207 | "\n",
208 | " fare_amount extra mta_tax tip_amount tolls_amount \\\n",
209 | "0 9.0 0.5 0.5 2.05 0.00 \n",
210 | "1 11.0 0.5 0.5 3.05 0.00 \n",
211 | "2 54.5 0.5 0.5 8.00 0.00 \n",
212 | "3 31.5 0.0 0.5 3.78 5.54 \n",
213 | "4 98.0 0.0 0.0 0.00 15.50 \n",
214 | "\n",
215 | " improvement_surcharge total_amount \n",
216 | "0 0.3 12.35 \n",
217 | "1 0.3 15.35 \n",
218 | "2 0.3 63.80 \n",
219 | "3 0.3 41.62 \n",
220 | "4 0.3 113.80 "
221 | ]
222 | },
223 | "execution_count": 4,
224 | "metadata": {},
225 | "output_type": "execute_result"
226 | }
227 | ],
228 | "source": [
229 | "df.head()"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 6,
235 | "id": "b8720809",
236 | "metadata": {},
237 | "outputs": [
238 | {
239 | "name": "stdout",
240 | "output_type": "stream",
241 | "text": [
242 | "\n",
243 | "RangeIndex: 100000 entries, 0 to 99999\n",
244 | "Data columns (total 19 columns):\n",
245 | " # Column Non-Null Count Dtype \n",
246 | "--- ------ -------------- ----- \n",
247 | " 0 VendorID 100000 non-null int64 \n",
248 | " 1 tpep_pickup_datetime 100000 non-null object \n",
249 | " 2 tpep_dropoff_datetime 100000 non-null object \n",
250 | " 3 passenger_count 100000 non-null int64 \n",
251 | " 4 trip_distance 100000 non-null float64\n",
252 | " 5 pickup_longitude 100000 non-null float64\n",
253 | " 6 pickup_latitude 100000 non-null float64\n",
254 | " 7 RatecodeID 100000 non-null int64 \n",
255 | " 8 store_and_fwd_flag 100000 non-null object \n",
256 | " 9 dropoff_longitude 100000 non-null float64\n",
257 | " 10 dropoff_latitude 100000 non-null float64\n",
258 | " 11 payment_type 100000 non-null int64 \n",
259 | " 12 fare_amount 100000 non-null float64\n",
260 | " 13 extra 100000 non-null float64\n",
261 | " 14 mta_tax 100000 non-null float64\n",
262 | " 15 tip_amount 100000 non-null float64\n",
263 | " 16 tolls_amount 100000 non-null float64\n",
264 | " 17 improvement_surcharge 100000 non-null float64\n",
265 | " 18 total_amount 100000 non-null float64\n",
266 | "dtypes: float64(12), int64(4), object(3)\n",
267 | "memory usage: 14.5+ MB\n"
268 | ]
269 | }
270 | ],
271 | "source": [
272 | "df.info()"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 8,
278 | "id": "c0632db9",
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])\n",
283 | "df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 9,
289 | "id": "d682dc12",
290 | "metadata": {},
291 | "outputs": [
292 | {
293 | "name": "stdout",
294 | "output_type": "stream",
295 | "text": [
296 | "\n",
297 | "RangeIndex: 100000 entries, 0 to 99999\n",
298 | "Data columns (total 19 columns):\n",
299 | " # Column Non-Null Count Dtype \n",
300 | "--- ------ -------------- ----- \n",
301 | " 0 VendorID 100000 non-null int64 \n",
302 | " 1 tpep_pickup_datetime 100000 non-null datetime64[ns]\n",
303 | " 2 tpep_dropoff_datetime 100000 non-null datetime64[ns]\n",
304 | " 3 passenger_count 100000 non-null int64 \n",
305 | " 4 trip_distance 100000 non-null float64 \n",
306 | " 5 pickup_longitude 100000 non-null float64 \n",
307 | " 6 pickup_latitude 100000 non-null float64 \n",
308 | " 7 RatecodeID 100000 non-null int64 \n",
309 | " 8 store_and_fwd_flag 100000 non-null object \n",
310 | " 9 dropoff_longitude 100000 non-null float64 \n",
311 | " 10 dropoff_latitude 100000 non-null float64 \n",
312 | " 11 payment_type 100000 non-null int64 \n",
313 | " 12 fare_amount 100000 non-null float64 \n",
314 | " 13 extra 100000 non-null float64 \n",
315 | " 14 mta_tax 100000 non-null float64 \n",
316 | " 15 tip_amount 100000 non-null float64 \n",
317 | " 16 tolls_amount 100000 non-null float64 \n",
318 | " 17 improvement_surcharge 100000 non-null float64 \n",
319 | " 18 total_amount 100000 non-null float64 \n",
320 | "dtypes: datetime64[ns](2), float64(12), int64(4), object(1)\n",
321 | "memory usage: 14.5+ MB\n"
322 | ]
323 | }
324 | ],
325 | "source": [
326 | "df.info()"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 19,
332 | "id": "acbb0a89",
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "datetime_dim = df[['tpep_pickup_datetime','tpep_dropoff_datetime']].drop_duplicates().reset_index(drop=True)\n",
337 | "datetime_dim['pick_hour'] = datetime_dim['tpep_pickup_datetime'].dt.hour\n",
338 | "datetime_dim['pick_day'] = datetime_dim['tpep_pickup_datetime'].dt.day\n",
339 | "datetime_dim['pick_month'] = datetime_dim['tpep_pickup_datetime'].dt.month\n",
340 | "datetime_dim['pick_year'] = datetime_dim['tpep_pickup_datetime'].dt.year\n",
341 | "datetime_dim['pick_weekday'] = datetime_dim['tpep_pickup_datetime'].dt.weekday\n",
342 | "\n",
343 | "datetime_dim['drop_hour'] = datetime_dim['tpep_dropoff_datetime'].dt.hour\n",
344 | "datetime_dim['drop_day'] = datetime_dim['tpep_dropoff_datetime'].dt.day\n",
345 | "datetime_dim['drop_month'] = datetime_dim['tpep_dropoff_datetime'].dt.month\n",
346 | "datetime_dim['drop_year'] = datetime_dim['tpep_dropoff_datetime'].dt.year\n",
347 | "datetime_dim['drop_weekday'] = datetime_dim['tpep_dropoff_datetime'].dt.weekday\n",
348 | "\n"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 22,
354 | "id": "29899c21",
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "datetime_dim['datetime_id'] = datetime_dim.index"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": 24,
364 | "id": "7a59b1da",
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "datetime_dim = datetime_dim[['datetime_id', 'tpep_pickup_datetime', 'pick_hour', 'pick_day', 'pick_month', 'pick_year', 'pick_weekday',\n",
369 | " 'tpep_dropoff_datetime', 'drop_hour', 'drop_day', 'drop_month', 'drop_year', 'drop_weekday']]\n",
370 | "#"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 25,
376 | "id": "97e42f3b",
377 | "metadata": {},
378 | "outputs": [
379 | {
380 | "data": {
381 | "text/html": [
382 | "\n",
383 | "\n",
396 | "
\n",
397 | " \n",
398 | " \n",
399 | " | \n",
400 | " datetime_id | \n",
401 | " tpep_pickup_datetime | \n",
402 | " pick_hour | \n",
403 | " pick_day | \n",
404 | " pick_month | \n",
405 | " pick_year | \n",
406 | " pick_weekday | \n",
407 | " tpep_dropoff_datetime | \n",
408 | " drop_hour | \n",
409 | " drop_day | \n",
410 | " drop_month | \n",
411 | " drop_year | \n",
412 | " drop_weekday | \n",
413 | "
\n",
414 | " \n",
415 | " \n",
416 | " \n",
417 | " 0 | \n",
418 | " 0 | \n",
419 | " 2016-03-01 00:00:00 | \n",
420 | " 0 | \n",
421 | " 1 | \n",
422 | " 3 | \n",
423 | " 2016 | \n",
424 | " 1 | \n",
425 | " 2016-03-01 00:07:55 | \n",
426 | " 0 | \n",
427 | " 1 | \n",
428 | " 3 | \n",
429 | " 2016 | \n",
430 | " 1 | \n",
431 | "
\n",
432 | " \n",
433 | " 1 | \n",
434 | " 1 | \n",
435 | " 2016-03-01 00:00:00 | \n",
436 | " 0 | \n",
437 | " 1 | \n",
438 | " 3 | \n",
439 | " 2016 | \n",
440 | " 1 | \n",
441 | " 2016-03-01 00:11:06 | \n",
442 | " 0 | \n",
443 | " 1 | \n",
444 | " 3 | \n",
445 | " 2016 | \n",
446 | " 1 | \n",
447 | "
\n",
448 | " \n",
449 | " 2 | \n",
450 | " 2 | \n",
451 | " 2016-03-01 00:00:00 | \n",
452 | " 0 | \n",
453 | " 1 | \n",
454 | " 3 | \n",
455 | " 2016 | \n",
456 | " 1 | \n",
457 | " 2016-03-01 00:31:06 | \n",
458 | " 0 | \n",
459 | " 1 | \n",
460 | " 3 | \n",
461 | " 2016 | \n",
462 | " 1 | \n",
463 | "
\n",
464 | " \n",
465 | " 3 | \n",
466 | " 3 | \n",
467 | " 2016-03-01 00:00:00 | \n",
468 | " 0 | \n",
469 | " 1 | \n",
470 | " 3 | \n",
471 | " 2016 | \n",
472 | " 1 | \n",
473 | " 2016-03-01 00:00:00 | \n",
474 | " 0 | \n",
475 | " 1 | \n",
476 | " 3 | \n",
477 | " 2016 | \n",
478 | " 1 | \n",
479 | "
\n",
480 | " \n",
481 | " 4 | \n",
482 | " 4 | \n",
483 | " 2016-03-01 00:00:01 | \n",
484 | " 0 | \n",
485 | " 1 | \n",
486 | " 3 | \n",
487 | " 2016 | \n",
488 | " 1 | \n",
489 | " 2016-03-01 00:16:04 | \n",
490 | " 0 | \n",
491 | " 1 | \n",
492 | " 3 | \n",
493 | " 2016 | \n",
494 | " 1 | \n",
495 | "
\n",
496 | " \n",
497 | " ... | \n",
498 | " ... | \n",
499 | " ... | \n",
500 | " ... | \n",
501 | " ... | \n",
502 | " ... | \n",
503 | " ... | \n",
504 | " ... | \n",
505 | " ... | \n",
506 | " ... | \n",
507 | " ... | \n",
508 | " ... | \n",
509 | " ... | \n",
510 | " ... | \n",
511 | "
\n",
512 | " \n",
513 | " 99848 | \n",
514 | " 99848 | \n",
515 | " 2016-03-01 06:17:10 | \n",
516 | " 6 | \n",
517 | " 1 | \n",
518 | " 3 | \n",
519 | " 2016 | \n",
520 | " 1 | \n",
521 | " 2016-03-01 06:22:15 | \n",
522 | " 6 | \n",
523 | " 1 | \n",
524 | " 3 | \n",
525 | " 2016 | \n",
526 | " 1 | \n",
527 | "
\n",
528 | " \n",
529 | " 99849 | \n",
530 | " 99849 | \n",
531 | " 2016-03-01 06:17:10 | \n",
532 | " 6 | \n",
533 | " 1 | \n",
534 | " 3 | \n",
535 | " 2016 | \n",
536 | " 1 | \n",
537 | " 2016-03-01 06:32:41 | \n",
538 | " 6 | \n",
539 | " 1 | \n",
540 | " 3 | \n",
541 | " 2016 | \n",
542 | " 1 | \n",
543 | "
\n",
544 | " \n",
545 | " 99850 | \n",
546 | " 99850 | \n",
547 | " 2016-03-01 06:17:10 | \n",
548 | " 6 | \n",
549 | " 1 | \n",
550 | " 3 | \n",
551 | " 2016 | \n",
552 | " 1 | \n",
553 | " 2016-03-01 06:37:23 | \n",
554 | " 6 | \n",
555 | " 1 | \n",
556 | " 3 | \n",
557 | " 2016 | \n",
558 | " 1 | \n",
559 | "
\n",
560 | " \n",
561 | " 99851 | \n",
562 | " 99851 | \n",
563 | " 2016-03-01 06:17:10 | \n",
564 | " 6 | \n",
565 | " 1 | \n",
566 | " 3 | \n",
567 | " 2016 | \n",
568 | " 1 | \n",
569 | " 2016-03-01 06:22:09 | \n",
570 | " 6 | \n",
571 | " 1 | \n",
572 | " 3 | \n",
573 | " 2016 | \n",
574 | " 1 | \n",
575 | "
\n",
576 | " \n",
577 | " 99852 | \n",
578 | " 99852 | \n",
579 | " 2016-03-01 06:17:11 | \n",
580 | " 6 | \n",
581 | " 1 | \n",
582 | " 3 | \n",
583 | " 2016 | \n",
584 | " 1 | \n",
585 | " 2016-03-01 06:22:00 | \n",
586 | " 6 | \n",
587 | " 1 | \n",
588 | " 3 | \n",
589 | " 2016 | \n",
590 | " 1 | \n",
591 | "
\n",
592 | " \n",
593 | "
\n",
594 | "
99853 rows × 13 columns
\n",
595 | "
"
596 | ],
597 | "text/plain": [
598 | " datetime_id tpep_pickup_datetime pick_hour pick_day pick_month \\\n",
599 | "0 0 2016-03-01 00:00:00 0 1 3 \n",
600 | "1 1 2016-03-01 00:00:00 0 1 3 \n",
601 | "2 2 2016-03-01 00:00:00 0 1 3 \n",
602 | "3 3 2016-03-01 00:00:00 0 1 3 \n",
603 | "4 4 2016-03-01 00:00:01 0 1 3 \n",
604 | "... ... ... ... ... ... \n",
605 | "99848 99848 2016-03-01 06:17:10 6 1 3 \n",
606 | "99849 99849 2016-03-01 06:17:10 6 1 3 \n",
607 | "99850 99850 2016-03-01 06:17:10 6 1 3 \n",
608 | "99851 99851 2016-03-01 06:17:10 6 1 3 \n",
609 | "99852 99852 2016-03-01 06:17:11 6 1 3 \n",
610 | "\n",
611 | " pick_year pick_weekday tpep_dropoff_datetime drop_hour drop_day \\\n",
612 | "0 2016 1 2016-03-01 00:07:55 0 1 \n",
613 | "1 2016 1 2016-03-01 00:11:06 0 1 \n",
614 | "2 2016 1 2016-03-01 00:31:06 0 1 \n",
615 | "3 2016 1 2016-03-01 00:00:00 0 1 \n",
616 | "4 2016 1 2016-03-01 00:16:04 0 1 \n",
617 | "... ... ... ... ... ... \n",
618 | "99848 2016 1 2016-03-01 06:22:15 6 1 \n",
619 | "99849 2016 1 2016-03-01 06:32:41 6 1 \n",
620 | "99850 2016 1 2016-03-01 06:37:23 6 1 \n",
621 | "99851 2016 1 2016-03-01 06:22:09 6 1 \n",
622 | "99852 2016 1 2016-03-01 06:22:00 6 1 \n",
623 | "\n",
624 | " drop_month drop_year drop_weekday \n",
625 | "0 3 2016 1 \n",
626 | "1 3 2016 1 \n",
627 | "2 3 2016 1 \n",
628 | "3 3 2016 1 \n",
629 | "4 3 2016 1 \n",
630 | "... ... ... ... \n",
631 | "99848 3 2016 1 \n",
632 | "99849 3 2016 1 \n",
633 | "99850 3 2016 1 \n",
634 | "99851 3 2016 1 \n",
635 | "99852 3 2016 1 \n",
636 | "\n",
637 | "[99853 rows x 13 columns]"
638 | ]
639 | },
640 | "execution_count": 25,
641 | "metadata": {},
642 | "output_type": "execute_result"
643 | }
644 | ],
645 | "source": [
646 | "datetime_dim"
647 | ]
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": 26,
652 | "id": "69a20d11",
653 | "metadata": {},
654 | "outputs": [],
655 | "source": [
656 | "passenger_count_dim = df[['passenger_count']].drop_duplicates().reset_index(drop=True)\n",
657 | "passenger_count_dim['passenger_count_id'] = passenger_count_dim.index\n",
658 | "passenger_count_dim = passenger_count_dim[['passenger_count_id','passenger_count']]\n",
659 | "\n",
660 | "trip_distance_dim = df[['trip_distance']].drop_duplicates().reset_index(drop=True)\n",
661 | "trip_distance_dim['trip_distance_id'] = trip_distance_dim.index\n",
662 | "trip_distance_dim = trip_distance_dim[['trip_distance_id','trip_distance']]\n"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": 27,
668 | "id": "f1079250",
669 | "metadata": {},
670 | "outputs": [
671 | {
672 | "data": {
673 | "text/html": [
674 | "\n",
675 | "\n",
688 | "
\n",
689 | " \n",
690 | " \n",
691 | " | \n",
692 | " passenger_count_id | \n",
693 | " passenger_count | \n",
694 | "
\n",
695 | " \n",
696 | " \n",
697 | " \n",
698 | " 0 | \n",
699 | " 0 | \n",
700 | " 1 | \n",
701 | "
\n",
702 | " \n",
703 | " 1 | \n",
704 | " 1 | \n",
705 | " 2 | \n",
706 | "
\n",
707 | " \n",
708 | " 2 | \n",
709 | " 2 | \n",
710 | " 3 | \n",
711 | "
\n",
712 | " \n",
713 | " 3 | \n",
714 | " 3 | \n",
715 | " 5 | \n",
716 | "
\n",
717 | " \n",
718 | " 4 | \n",
719 | " 4 | \n",
720 | " 6 | \n",
721 | "
\n",
722 | " \n",
723 | "
\n",
724 | "
"
725 | ],
726 | "text/plain": [
727 | " passenger_count_id passenger_count\n",
728 | "0 0 1\n",
729 | "1 1 2\n",
730 | "2 2 3\n",
731 | "3 3 5\n",
732 | "4 4 6"
733 | ]
734 | },
735 | "execution_count": 27,
736 | "metadata": {},
737 | "output_type": "execute_result"
738 | }
739 | ],
740 | "source": [
741 | "passenger_count_dim.head()"
742 | ]
743 | },
744 | {
745 | "cell_type": "code",
746 | "execution_count": 28,
747 | "id": "06e72f95",
748 | "metadata": {},
749 | "outputs": [
750 | {
751 | "data": {
752 | "text/html": [
753 | "\n",
754 | "\n",
767 | "
\n",
768 | " \n",
769 | " \n",
770 | " | \n",
771 | " trip_distance_id | \n",
772 | " trip_distance | \n",
773 | "
\n",
774 | " \n",
775 | " \n",
776 | " \n",
777 | " 0 | \n",
778 | " 0 | \n",
779 | " 2.50 | \n",
780 | "
\n",
781 | " \n",
782 | " 1 | \n",
783 | " 1 | \n",
784 | " 2.90 | \n",
785 | "
\n",
786 | " \n",
787 | " 2 | \n",
788 | " 2 | \n",
789 | " 19.98 | \n",
790 | "
\n",
791 | " \n",
792 | " 3 | \n",
793 | " 3 | \n",
794 | " 10.78 | \n",
795 | "
\n",
796 | " \n",
797 | " 4 | \n",
798 | " 4 | \n",
799 | " 30.43 | \n",
800 | "
\n",
801 | " \n",
802 | "
\n",
803 | "
"
804 | ],
805 | "text/plain": [
806 | " trip_distance_id trip_distance\n",
807 | "0 0 2.50\n",
808 | "1 1 2.90\n",
809 | "2 2 19.98\n",
810 | "3 3 10.78\n",
811 | "4 4 30.43"
812 | ]
813 | },
814 | "execution_count": 28,
815 | "metadata": {},
816 | "output_type": "execute_result"
817 | }
818 | ],
819 | "source": [
820 | "trip_distance_dim.head()"
821 | ]
822 | },
823 | {
824 | "cell_type": "code",
825 | "execution_count": 29,
826 | "id": "9d511965",
827 | "metadata": {},
828 | "outputs": [],
829 | "source": [
830 | "rate_code_type = {\n",
831 | " 1:\"Standard rate\",\n",
832 | " 2:\"JFK\",\n",
833 | " 3:\"Newark\",\n",
834 | " 4:\"Nassau or Westchester\",\n",
835 | " 5:\"Negotiated fare\",\n",
836 | " 6:\"Group ride\"\n",
837 | "}\n",
838 | "\n",
839 | "rate_code_dim = df[['RatecodeID']].drop_duplicates().reset_index(drop=True)\n",
840 | "rate_code_dim['rate_code_id'] = rate_code_dim.index\n",
841 | "rate_code_dim['rate_code_name'] = rate_code_dim['RatecodeID'].map(rate_code_type)\n",
842 | "rate_code_dim = rate_code_dim[['rate_code_id','RatecodeID','rate_code_name']]\n"
843 | ]
844 | },
845 | {
846 | "cell_type": "code",
847 | "execution_count": 30,
848 | "id": "400e751a",
849 | "metadata": {},
850 | "outputs": [
851 | {
852 | "data": {
853 | "text/html": [
854 | "\n",
855 | "\n",
868 | "
\n",
869 | " \n",
870 | " \n",
871 | " | \n",
872 | " rate_code_id | \n",
873 | " RatecodeID | \n",
874 | " rate_code_name | \n",
875 | "
\n",
876 | " \n",
877 | " \n",
878 | " \n",
879 | " 0 | \n",
880 | " 0 | \n",
881 | " 1 | \n",
882 | " Standard rate | \n",
883 | "
\n",
884 | " \n",
885 | " 1 | \n",
886 | " 1 | \n",
887 | " 3 | \n",
888 | " Newark | \n",
889 | "
\n",
890 | " \n",
891 | " 2 | \n",
892 | " 2 | \n",
893 | " 2 | \n",
894 | " JFK | \n",
895 | "
\n",
896 | " \n",
897 | " 3 | \n",
898 | " 3 | \n",
899 | " 5 | \n",
900 | " Negotiated fare | \n",
901 | "
\n",
902 | " \n",
903 | " 4 | \n",
904 | " 4 | \n",
905 | " 4 | \n",
906 | " Nassau or Westchester | \n",
907 | "
\n",
908 | " \n",
909 | "
\n",
910 | "
"
911 | ],
912 | "text/plain": [
913 | " rate_code_id RatecodeID rate_code_name\n",
914 | "0 0 1 Standard rate\n",
915 | "1 1 3 Newark\n",
916 | "2 2 2 JFK\n",
917 | "3 3 5 Negotiated fare\n",
918 | "4 4 4 Nassau or Westchester"
919 | ]
920 | },
921 | "execution_count": 30,
922 | "metadata": {},
923 | "output_type": "execute_result"
924 | }
925 | ],
926 | "source": [
927 | "rate_code_dim.head()"
928 | ]
929 | },
930 | {
931 | "cell_type": "code",
932 | "execution_count": 31,
933 | "id": "aaf45842",
934 | "metadata": {},
935 | "outputs": [],
936 | "source": [
937 | "pickup_location_dim = df[['pickup_longitude', 'pickup_latitude']].drop_duplicates().reset_index(drop=True)\n",
938 | "pickup_location_dim['pickup_location_id'] = pickup_location_dim.index\n",
939 | "pickup_location_dim = pickup_location_dim[['pickup_location_id','pickup_latitude','pickup_longitude']] \n",
940 | "\n",
941 | "\n",
942 | "dropoff_location_dim = df[['dropoff_longitude', 'dropoff_latitude']].drop_duplicates().reset_index(drop=True)\n",
943 | "dropoff_location_dim['dropoff_location_id'] = dropoff_location_dim.index\n",
944 | "dropoff_location_dim = dropoff_location_dim[['dropoff_location_id','dropoff_latitude','dropoff_longitude']]"
945 | ]
946 | },
947 | {
948 | "cell_type": "code",
949 | "execution_count": 32,
950 | "id": "998253b7",
951 | "metadata": {},
952 | "outputs": [],
953 | "source": [
954 | "payment_type_name = {\n",
955 | " 1:\"Credit card\",\n",
956 | " 2:\"Cash\",\n",
957 | " 3:\"No charge\",\n",
958 | " 4:\"Dispute\",\n",
959 | " 5:\"Unknown\",\n",
960 | " 6:\"Voided trip\"\n",
961 | "}\n",
962 | "payment_type_dim = df[['payment_type']].drop_duplicates().reset_index(drop=True)\n",
963 | "payment_type_dim['payment_type_id'] = payment_type_dim.index\n",
964 | "payment_type_dim['payment_type_name'] = payment_type_dim['payment_type'].map(payment_type_name)\n",
965 | "payment_type_dim = payment_type_dim[['payment_type_id','payment_type','payment_type_name']]"
966 | ]
967 | },
968 | {
969 | "cell_type": "code",
970 | "execution_count": 34,
971 | "id": "0ab12341",
972 | "metadata": {},
973 | "outputs": [
974 | {
975 | "data": {
976 | "text/html": [
977 | "\n",
978 | "\n",
991 | "
\n",
992 | " \n",
993 | " \n",
994 | " | \n",
995 | " payment_type_id | \n",
996 | " payment_type | \n",
997 | " payment_type_name | \n",
998 | "
\n",
999 | " \n",
1000 | " \n",
1001 | " \n",
1002 | " 0 | \n",
1003 | " 0 | \n",
1004 | " 1 | \n",
1005 | " Credit card | \n",
1006 | "
\n",
1007 | " \n",
1008 | " 1 | \n",
1009 | " 1 | \n",
1010 | " 2 | \n",
1011 | " Cash | \n",
1012 | "
\n",
1013 | " \n",
1014 | " 2 | \n",
1015 | " 2 | \n",
1016 | " 3 | \n",
1017 | " No charge | \n",
1018 | "
\n",
1019 | " \n",
1020 | " 3 | \n",
1021 | " 3 | \n",
1022 | " 4 | \n",
1023 | " Dispute | \n",
1024 | "
\n",
1025 | " \n",
1026 | "
\n",
1027 | "
"
1028 | ],
1029 | "text/plain": [
1030 | " payment_type_id payment_type payment_type_name\n",
1031 | "0 0 1 Credit card\n",
1032 | "1 1 2 Cash\n",
1033 | "2 2 3 No charge\n",
1034 | "3 3 4 Dispute"
1035 | ]
1036 | },
1037 | "execution_count": 34,
1038 | "metadata": {},
1039 | "output_type": "execute_result"
1040 | }
1041 | ],
1042 | "source": [
1043 | "payment_type_dim.head()"
1044 | ]
1045 | },
1046 | {
1047 | "cell_type": "code",
1048 | "execution_count": 37,
1049 | "id": "6f46d41f",
1050 | "metadata": {},
1051 | "outputs": [],
1052 | "source": [
1053 | "fact_table = df.merge(passenger_count_dim, on='passenger_count') \\\n",
1054 | " .merge(trip_distance_dim, on='trip_distance') \\\n",
1055 | " .merge(rate_code_dim, on='RatecodeID') \\\n",
1056 | " .merge(pickup_location_dim, on=['pickup_longitude', 'pickup_latitude']) \\\n",
1057 | " .merge(dropoff_location_dim, on=['dropoff_longitude', 'dropoff_latitude'])\\\n",
1058 | " .merge(datetime_dim, on=['tpep_pickup_datetime','tpep_dropoff_datetime']) \\\n",
1059 | " .merge(payment_type_dim, on='payment_type') \\\n",
1060 | " [['VendorID', 'datetime_id', 'passenger_count_id',\n",
1061 | " 'trip_distance_id', 'rate_code_id', 'store_and_fwd_flag', 'pickup_location_id', 'dropoff_location_id',\n",
1062 | " 'payment_type_id', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',\n",
1063 | " 'improvement_surcharge', 'total_amount']]"
1064 | ]
1065 | },
1066 | {
1067 | "cell_type": "code",
1068 | "execution_count": 39,
1069 | "id": "9d5a38bf",
1070 | "metadata": {},
1071 | "outputs": [
1072 | {
1073 | "data": {
1074 | "text/html": [
1075 | "\n",
1076 | "\n",
1089 | "
\n",
1090 | " \n",
1091 | " \n",
1092 | " | \n",
1093 | " VendorID | \n",
1094 | " datetime_id | \n",
1095 | " passenger_count_id | \n",
1096 | " trip_distance_id | \n",
1097 | " rate_code_id | \n",
1098 | " store_and_fwd_flag | \n",
1099 | " pickup_location_id | \n",
1100 | " dropoff_location_id | \n",
1101 | " payment_type_id | \n",
1102 | " fare_amount | \n",
1103 | " extra | \n",
1104 | " mta_tax | \n",
1105 | " tip_amount | \n",
1106 | " tolls_amount | \n",
1107 | " improvement_surcharge | \n",
1108 | " total_amount | \n",
1109 | "
\n",
1110 | " \n",
1111 | " \n",
1112 | " \n",
1113 | " 0 | \n",
1114 | " 1 | \n",
1115 | " 0 | \n",
1116 | " 0 | \n",
1117 | " 0 | \n",
1118 | " 0 | \n",
1119 | " N | \n",
1120 | " 0 | \n",
1121 | " 0 | \n",
1122 | " 0 | \n",
1123 | " 9.0 | \n",
1124 | " 0.5 | \n",
1125 | " 0.5 | \n",
1126 | " 2.05 | \n",
1127 | " 0.0 | \n",
1128 | " 0.3 | \n",
1129 | " 12.35 | \n",
1130 | "
\n",
1131 | " \n",
1132 | " 1 | \n",
1133 | " 2 | \n",
1134 | " 1491 | \n",
1135 | " 0 | \n",
1136 | " 0 | \n",
1137 | " 0 | \n",
1138 | " N | \n",
1139 | " 1481 | \n",
1140 | " 1484 | \n",
1141 | " 0 | \n",
1142 | " 10.5 | \n",
1143 | " 0.0 | \n",
1144 | " 0.5 | \n",
1145 | " 2.26 | \n",
1146 | " 0.0 | \n",
1147 | " 0.3 | \n",
1148 | " 13.56 | \n",
1149 | "
\n",
1150 | " \n",
1151 | " 2 | \n",
1152 | " 2 | \n",
1153 | " 2834 | \n",
1154 | " 0 | \n",
1155 | " 0 | \n",
1156 | " 0 | \n",
1157 | " N | \n",
1158 | " 2816 | \n",
1159 | " 2819 | \n",
1160 | " 0 | \n",
1161 | " 9.5 | \n",
1162 | " 0.0 | \n",
1163 | " 0.5 | \n",
1164 | " 1.25 | \n",
1165 | " 0.0 | \n",
1166 | " 0.3 | \n",
1167 | " 11.55 | \n",
1168 | "
\n",
1169 | " \n",
1170 | " 3 | \n",
1171 | " 2 | \n",
1172 | " 3488 | \n",
1173 | " 0 | \n",
1174 | " 0 | \n",
1175 | " 0 | \n",
1176 | " N | \n",
1177 | " 3465 | \n",
1178 | " 3470 | \n",
1179 | " 0 | \n",
1180 | " 13.5 | \n",
1181 | " 0.0 | \n",
1182 | " 0.5 | \n",
1183 | " 2.00 | \n",
1184 | " 0.0 | \n",
1185 | " 0.3 | \n",
1186 | " 16.30 | \n",
1187 | "
\n",
1188 | " \n",
1189 | " 4 | \n",
1190 | " 2 | \n",
1191 | " 3923 | \n",
1192 | " 0 | \n",
1193 | " 0 | \n",
1194 | " 0 | \n",
1195 | " N | \n",
1196 | " 3899 | \n",
1197 | " 3903 | \n",
1198 | " 0 | \n",
1199 | " 10.5 | \n",
1200 | " 0.0 | \n",
1201 | " 0.5 | \n",
1202 | " 2.26 | \n",
1203 | " 0.0 | \n",
1204 | " 0.3 | \n",
1205 | " 13.56 | \n",
1206 | "
\n",
1207 | " \n",
1208 | " ... | \n",
1209 | " ... | \n",
1210 | " ... | \n",
1211 | " ... | \n",
1212 | " ... | \n",
1213 | " ... | \n",
1214 | " ... | \n",
1215 | " ... | \n",
1216 | " ... | \n",
1217 | " ... | \n",
1218 | " ... | \n",
1219 | " ... | \n",
1220 | " ... | \n",
1221 | " ... | \n",
1222 | " ... | \n",
1223 | " ... | \n",
1224 | " ... | \n",
1225 | "
\n",
1226 | " \n",
1227 | " 99995 | \n",
1228 | " 1 | \n",
1229 | " 65943 | \n",
1230 | " 0 | \n",
1231 | " 257 | \n",
1232 | " 3 | \n",
1233 | " N | \n",
1234 | " 64896 | \n",
1235 | " 65105 | \n",
1236 | " 3 | \n",
1237 | " 170.0 | \n",
1238 | " 0.0 | \n",
1239 | " 0.0 | \n",
1240 | " 0.00 | \n",
1241 | " 0.0 | \n",
1242 | " 0.3 | \n",
1243 | " 170.30 | \n",
1244 | "
\n",
1245 | " \n",
1246 | " 99996 | \n",
1247 | " 1 | \n",
1248 | " 81651 | \n",
1249 | " 0 | \n",
1250 | " 257 | \n",
1251 | " 3 | \n",
1252 | " N | \n",
1253 | " 80276 | \n",
1254 | " 80547 | \n",
1255 | " 3 | \n",
1256 | " 10.0 | \n",
1257 | " 0.0 | \n",
1258 | " 0.0 | \n",
1259 | " 0.00 | \n",
1260 | " 0.0 | \n",
1261 | " 0.3 | \n",
1262 | " 10.30 | \n",
1263 | "
\n",
1264 | " \n",
1265 | " 99997 | \n",
1266 | " 2 | \n",
1267 | " 87152 | \n",
1268 | " 4 | \n",
1269 | " 257 | \n",
1270 | " 1 | \n",
1271 | " N | \n",
1272 | " 85670 | \n",
1273 | " 85971 | \n",
1274 | " 3 | \n",
1275 | " -20.0 | \n",
1276 | " -0.5 | \n",
1277 | " 0.0 | \n",
1278 | " 0.00 | \n",
1279 | " 0.0 | \n",
1280 | " -0.3 | \n",
1281 | " -20.80 | \n",
1282 | "
\n",
1283 | " \n",
1284 | " 99998 | \n",
1285 | " 2 | \n",
1286 | " 53874 | \n",
1287 | " 4 | \n",
1288 | " 1060 | \n",
1289 | " 1 | \n",
1290 | " N | \n",
1291 | " 53081 | \n",
1292 | " 53222 | \n",
1293 | " 3 | \n",
1294 | " -25.5 | \n",
1295 | " 0.0 | \n",
1296 | " 0.0 | \n",
1297 | " 0.00 | \n",
1298 | " 0.0 | \n",
1299 | " -0.3 | \n",
1300 | " -25.80 | \n",
1301 | "
\n",
1302 | " \n",
1303 | " 99999 | \n",
1304 | " 1 | \n",
1305 | " 88727 | \n",
1306 | " 0 | \n",
1307 | " 1894 | \n",
1308 | " 1 | \n",
1309 | " N | \n",
1310 | " 87206 | \n",
1311 | " 87511 | \n",
1312 | " 3 | \n",
1313 | " 70.5 | \n",
1314 | " 0.5 | \n",
1315 | " 0.0 | \n",
1316 | " 0.00 | \n",
1317 | " 10.5 | \n",
1318 | " 0.3 | \n",
1319 | " 81.80 | \n",
1320 | "
\n",
1321 | " \n",
1322 | "
\n",
1323 | "
100000 rows × 16 columns
\n",
1324 | "
"
1325 | ],
1326 | "text/plain": [
1327 | " VendorID datetime_id passenger_count_id trip_distance_id \\\n",
1328 | "0 1 0 0 0 \n",
1329 | "1 2 1491 0 0 \n",
1330 | "2 2 2834 0 0 \n",
1331 | "3 2 3488 0 0 \n",
1332 | "4 2 3923 0 0 \n",
1333 | "... ... ... ... ... \n",
1334 | "99995 1 65943 0 257 \n",
1335 | "99996 1 81651 0 257 \n",
1336 | "99997 2 87152 4 257 \n",
1337 | "99998 2 53874 4 1060 \n",
1338 | "99999 1 88727 0 1894 \n",
1339 | "\n",
1340 | " rate_code_id store_and_fwd_flag pickup_location_id \\\n",
1341 | "0 0 N 0 \n",
1342 | "1 0 N 1481 \n",
1343 | "2 0 N 2816 \n",
1344 | "3 0 N 3465 \n",
1345 | "4 0 N 3899 \n",
1346 | "... ... ... ... \n",
1347 | "99995 3 N 64896 \n",
1348 | "99996 3 N 80276 \n",
1349 | "99997 1 N 85670 \n",
1350 | "99998 1 N 53081 \n",
1351 | "99999 1 N 87206 \n",
1352 | "\n",
1353 | " dropoff_location_id payment_type_id fare_amount extra mta_tax \\\n",
1354 | "0 0 0 9.0 0.5 0.5 \n",
1355 | "1 1484 0 10.5 0.0 0.5 \n",
1356 | "2 2819 0 9.5 0.0 0.5 \n",
1357 | "3 3470 0 13.5 0.0 0.5 \n",
1358 | "4 3903 0 10.5 0.0 0.5 \n",
1359 | "... ... ... ... ... ... \n",
1360 | "99995 65105 3 170.0 0.0 0.0 \n",
1361 | "99996 80547 3 10.0 0.0 0.0 \n",
1362 | "99997 85971 3 -20.0 -0.5 0.0 \n",
1363 | "99998 53222 3 -25.5 0.0 0.0 \n",
1364 | "99999 87511 3 70.5 0.5 0.0 \n",
1365 | "\n",
1366 | " tip_amount tolls_amount improvement_surcharge total_amount \n",
1367 | "0 2.05 0.0 0.3 12.35 \n",
1368 | "1 2.26 0.0 0.3 13.56 \n",
1369 | "2 1.25 0.0 0.3 11.55 \n",
1370 | "3 2.00 0.0 0.3 16.30 \n",
1371 | "4 2.26 0.0 0.3 13.56 \n",
1372 | "... ... ... ... ... \n",
1373 | "99995 0.00 0.0 0.3 170.30 \n",
1374 | "99996 0.00 0.0 0.3 10.30 \n",
1375 | "99997 0.00 0.0 -0.3 -20.80 \n",
1376 | "99998 0.00 0.0 -0.3 -25.80 \n",
1377 | "99999 0.00 10.5 0.3 81.80 \n",
1378 | "\n",
1379 | "[100000 rows x 16 columns]"
1380 | ]
1381 | },
1382 | "execution_count": 39,
1383 | "metadata": {},
1384 | "output_type": "execute_result"
1385 | }
1386 | ],
1387 | "source": [
1388 | "fact_table"
1389 | ]
1390 | },
1391 | {
1392 | "cell_type": "code",
1393 | "execution_count": null,
1394 | "id": "954c8df4",
1395 | "metadata": {},
1396 | "outputs": [],
1397 | "source": []
1398 | }
1399 | ],
1400 | "metadata": {
1401 | "kernelspec": {
1402 | "display_name": "Python 3 (ipykernel)",
1403 | "language": "python",
1404 | "name": "python3"
1405 | },
1406 | "language_info": {
1407 | "codemirror_mode": {
1408 | "name": "ipython",
1409 | "version": 3
1410 | },
1411 | "file_extension": ".py",
1412 | "mimetype": "text/x-python",
1413 | "name": "python",
1414 | "nbconvert_exporter": "python",
1415 | "pygments_lexer": "ipython3",
1416 | "version": "3.10.6"
1417 | }
1418 | },
1419 | "nbformat": 4,
1420 | "nbformat_minor": 5
1421 | }
1422 |
--------------------------------------------------------------------------------
/analytics_query.sql:
--------------------------------------------------------------------------------
1 | CREATE OR REPLACE TABLE `data-with-darshil.uber_data_engineering_yt.tbl_analytics` AS (
2 | SELECT
3 | f.trip_id,
4 | f.VendorID,
5 | d.tpep_pickup_datetime,
6 | d.tpep_dropoff_datetime,
7 | p.passenger_count,
8 | t.trip_distance,
9 | r.rate_code_name,
10 | pick.pickup_latitude,
11 | pick.pickup_longitude,
12 |   dropoff.dropoff_latitude,
13 |   dropoff.dropoff_longitude,
14 | pay.payment_type_name,
15 | f.fare_amount,
16 | f.extra,
17 | f.mta_tax,
18 | f.tip_amount,
19 | f.tolls_amount,
20 | f.improvement_surcharge,
21 | f.total_amount
22 | FROM
23 |   `data-with-darshil.uber_data_engineering_yt.fact_table` f
24 |   JOIN `data-with-darshil.uber_data_engineering_yt.datetime_dim` d ON f.datetime_id = d.datetime_id
25 |   JOIN `data-with-darshil.uber_data_engineering_yt.passenger_count_dim` p ON p.passenger_count_id = f.passenger_count_id
26 |   JOIN `data-with-darshil.uber_data_engineering_yt.trip_distance_dim` t ON t.trip_distance_id = f.trip_distance_id
27 |   JOIN `data-with-darshil.uber_data_engineering_yt.rate_code_dim` r ON r.rate_code_id = f.rate_code_id
28 |   JOIN `data-with-darshil.uber_data_engineering_yt.pickup_location_dim` pick ON pick.pickup_location_id = f.pickup_location_id
29 |   JOIN `data-with-darshil.uber_data_engineering_yt.dropoff_location_dim` dropoff ON dropoff.dropoff_location_id = f.dropoff_location_id
30 |   JOIN `data-with-darshil.uber_data_engineering_yt.payment_type_dim` pay ON pay.payment_type_id = f.payment_type_id
31 | );
32 |
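33 | -- Illustrative follow-up (not part of the original project): once tbl_analytics
34 | -- is materialized, reports can aggregate the flat table directly instead of
35 | -- re-joining the star schema, e.g. trip volume and revenue per payment type.
36 | SELECT
37 |   payment_type_name,
38 |   COUNT(*) AS trips,
39 |   ROUND(AVG(fare_amount), 2) AS avg_fare,
40 |   ROUND(SUM(total_amount), 2) AS revenue
41 | FROM `data-with-darshil.uber_data_engineering_yt.tbl_analytics`
42 | GROUP BY payment_type_name
43 | ORDER BY revenue DESC;
44 |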
--------------------------------------------------------------------------------
/architecture.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/darshilparmar/uber-etl-pipeline-data-engineering-project/5373ce54237783e15f6fe9b2bdd924b544325bc0/architecture.jpg
--------------------------------------------------------------------------------
/commands.txt:
--------------------------------------------------------------------------------
1 | # Install pip and supporting Python packages
2 | sudo apt-get update
3 |
4 | sudo apt-get install python3-distutils
5 |
6 | sudo apt-get install python3-apt
7 |
8 | sudo apt-get install wget
9 |
10 | wget https://bootstrap.pypa.io/get-pip.py
11 |
12 | sudo python3 get-pip.py
13 |
14 |
15 | # Install Mage
16 | sudo pip3 install mage-ai
17 |
18 | # Install Pandas
19 | sudo pip3 install pandas
20 |
21 | # Install Google Cloud Library
22 | sudo pip3 install google-cloud
23 |
24 | sudo pip3 install google-cloud-bigquery
25 |
26 |
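27 | # Start the Mage project (illustrative; the project name is a placeholder)
28 | mage start uber_de_project
29 |
30 | # Mage's UI listens on port 6789 by default; open http://<vm-external-ip>:6789
31 | # after allowing TCP 6789 through the instance's GCP firewall rules.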
--------------------------------------------------------------------------------
/data_model.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/darshilparmar/uber-etl-pipeline-data-engineering-project/5373ce54237783e15f6fe9b2bdd924b544325bc0/data_model.jpeg
--------------------------------------------------------------------------------
/mage-files/extract.py:
--------------------------------------------------------------------------------
1 | import io
2 | import pandas as pd
3 | import requests
4 | if 'data_loader' not in globals():
5 | from mage_ai.data_preparation.decorators import data_loader
6 | if 'test' not in globals():
7 | from mage_ai.data_preparation.decorators import test
8 |
9 |
10 | @data_loader
11 | def load_data_from_api(*args, **kwargs):
12 | """
13 |     Load the Uber trips CSV from a public GCS bucket over HTTP.
14 | """
15 | url = 'https://storage.googleapis.com/uber-data-engineering-project-darshil/uber_data.csv'
16 |     response = requests.get(url)
17 |     response.raise_for_status()  # surface HTTP errors instead of parsing an error page as CSV
18 | return pd.read_csv(io.StringIO(response.text), sep=',')
19 |
20 |
21 | @test
22 | def test_output(output, *args) -> None:
23 | """
24 |     Basic check that the block produced output.
25 | """
26 | assert output is not None, 'The output is undefined'
27 |
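28 | # Note (illustrative): pandas can fetch the URL directly, so the
29 | # requests + StringIO round-trip above is equivalent to
30 | #   pd.read_csv(url)
31 | # Keeping requests, though, lets the loader call raise_for_status() first.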
--------------------------------------------------------------------------------
/mage-files/load.py:
--------------------------------------------------------------------------------
1 | from mage_ai.data_preparation.repo_manager import get_repo_path
2 | from mage_ai.io.bigquery import BigQuery
3 | from mage_ai.io.config import ConfigFileLoader
4 | from pandas import DataFrame
5 | from os import path
6 |
7 | if 'data_exporter' not in globals():
8 | from mage_ai.data_preparation.decorators import data_exporter
9 |
10 |
11 | @data_exporter
12 | def export_data_to_big_query(data, **kwargs) -> None:
13 | """
14 |     Export each table from the transformer's output dict to BigQuery.
15 | Specify your configuration settings in 'io_config.yaml'.
16 |
17 | Docs: https://docs.mage.ai/design/data-loading#bigquery
18 |
19 |
20 | """
21 | config_path = path.join(get_repo_path(), 'io_config.yaml')
22 | config_profile = 'default'
23 |
24 |     for key, value in data.items():  # one BigQuery table per dict entry
25 | table_id = 'data-with-darshil.uber_data_engineering_yt.{}'.format(key)
26 | BigQuery.with_config(ConfigFileLoader(config_path, config_profile)).export(
27 | DataFrame(value),
28 | table_id,
29 | if_exists='replace', # Specify resolution policy if table name already exists
30 | )
31 |
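32 | # Illustrative sketch of the io_config.yaml profile this exporter reads.
33 | # Key names follow Mage's template; the filepath is a placeholder:
34 | #
35 | # default:
36 | #   GOOGLE_SERVICE_ACC_KEY_FILEPATH: "/home/<user>/service-account-key.json"
37 | #   GOOGLE_LOCATION: US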
--------------------------------------------------------------------------------
/mage-files/transform.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | if 'transformer' not in globals():
3 | from mage_ai.data_preparation.decorators import transformer
4 | if 'test' not in globals():
5 | from mage_ai.data_preparation.decorators import test
6 |
7 |
8 | @transformer
9 | def transform(df, *args, **kwargs):
10 | """
11 | Template code for a transformer block.
12 |
13 | Add more parameters to this function if this block has multiple parent blocks.
14 | There should be one parameter for each output variable from each parent block.
15 |
16 | Args:
17 | data: The output from the upstream parent block
18 | args: The output from any additional upstream blocks (if applicable)
19 |
20 | Returns:
21 | Anything (e.g. data frame, dictionary, array, int, str, etc.)
22 | """
23 |     # Parse timestamps, then de-duplicate trips and assign a surrogate trip_id
24 |     # (mirrors the fixed notebook; the analytics query selects f.trip_id)
25 |     df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
26 |     df = df.assign(tpep_dropoff_datetime=pd.to_datetime(df['tpep_dropoff_datetime'])).drop_duplicates().reset_index(drop=True).assign(trip_id=lambda d: d.index)
27 | datetime_dim = df[['tpep_pickup_datetime','tpep_dropoff_datetime']].drop_duplicates().reset_index(drop=True)
28 | datetime_dim['pick_hour'] = datetime_dim['tpep_pickup_datetime'].dt.hour
29 | datetime_dim['pick_day'] = datetime_dim['tpep_pickup_datetime'].dt.day
30 | datetime_dim['pick_month'] = datetime_dim['tpep_pickup_datetime'].dt.month
31 | datetime_dim['pick_year'] = datetime_dim['tpep_pickup_datetime'].dt.year
32 | datetime_dim['pick_weekday'] = datetime_dim['tpep_pickup_datetime'].dt.weekday
33 |
34 | datetime_dim['drop_hour'] = datetime_dim['tpep_dropoff_datetime'].dt.hour
35 | datetime_dim['drop_day'] = datetime_dim['tpep_dropoff_datetime'].dt.day
36 | datetime_dim['drop_month'] = datetime_dim['tpep_dropoff_datetime'].dt.month
37 | datetime_dim['drop_year'] = datetime_dim['tpep_dropoff_datetime'].dt.year
38 | datetime_dim['drop_weekday'] = datetime_dim['tpep_dropoff_datetime'].dt.weekday
39 |
40 | datetime_dim['datetime_id'] = datetime_dim.index
41 | datetime_dim = datetime_dim[['datetime_id', 'tpep_pickup_datetime', 'pick_hour', 'pick_day', 'pick_month', 'pick_year', 'pick_weekday',
42 | 'tpep_dropoff_datetime', 'drop_hour', 'drop_day', 'drop_month', 'drop_year', 'drop_weekday']]
43 |
44 | passenger_count_dim = df[['passenger_count']].drop_duplicates().reset_index(drop=True)
45 | passenger_count_dim['passenger_count_id'] = passenger_count_dim.index
46 | passenger_count_dim = passenger_count_dim[['passenger_count_id','passenger_count']]
47 |
48 | trip_distance_dim = df[['trip_distance']].drop_duplicates().reset_index(drop=True)
49 | trip_distance_dim['trip_distance_id'] = trip_distance_dim.index
50 | trip_distance_dim = trip_distance_dim[['trip_distance_id','trip_distance']]
51 | rate_code_type = {
52 | 1:"Standard rate",
53 | 2:"JFK",
54 | 3:"Newark",
55 | 4:"Nassau or Westchester",
56 | 5:"Negotiated fare",
57 | 6:"Group ride"
58 | }
59 |
60 | rate_code_dim = df[['RatecodeID']].drop_duplicates().reset_index(drop=True)
61 | rate_code_dim['rate_code_id'] = rate_code_dim.index
62 | rate_code_dim['rate_code_name'] = rate_code_dim['RatecodeID'].map(rate_code_type)
63 | rate_code_dim = rate_code_dim[['rate_code_id','RatecodeID','rate_code_name']]
64 |
65 |
66 | pickup_location_dim = df[['pickup_longitude', 'pickup_latitude']].drop_duplicates().reset_index(drop=True)
67 | pickup_location_dim['pickup_location_id'] = pickup_location_dim.index
68 | pickup_location_dim = pickup_location_dim[['pickup_location_id','pickup_latitude','pickup_longitude']]
69 |
70 |
71 | dropoff_location_dim = df[['dropoff_longitude', 'dropoff_latitude']].drop_duplicates().reset_index(drop=True)
72 | dropoff_location_dim['dropoff_location_id'] = dropoff_location_dim.index
73 | dropoff_location_dim = dropoff_location_dim[['dropoff_location_id','dropoff_latitude','dropoff_longitude']]
74 |
75 | payment_type_name = {
76 | 1:"Credit card",
77 | 2:"Cash",
78 | 3:"No charge",
79 | 4:"Dispute",
80 | 5:"Unknown",
81 | 6:"Voided trip"
82 | }
83 | payment_type_dim = df[['payment_type']].drop_duplicates().reset_index(drop=True)
84 | payment_type_dim['payment_type_id'] = payment_type_dim.index
85 | payment_type_dim['payment_type_name'] = payment_type_dim['payment_type'].map(payment_type_name)
86 | payment_type_dim = payment_type_dim[['payment_type_id','payment_type','payment_type_name']]
87 |     # Assemble the fact table: merge each dimension back on its natural key(s)
88 | fact_table = df.merge(passenger_count_dim, on='passenger_count') \
89 | .merge(trip_distance_dim, on='trip_distance') \
90 | .merge(rate_code_dim, on='RatecodeID') \
91 | .merge(pickup_location_dim, on=['pickup_longitude', 'pickup_latitude']) \
92 | .merge(dropoff_location_dim, on=['dropoff_longitude', 'dropoff_latitude'])\
93 | .merge(datetime_dim, on=['tpep_pickup_datetime','tpep_dropoff_datetime']) \
94 | .merge(payment_type_dim, on='payment_type') \
95 |                  [['trip_id', 'VendorID', 'datetime_id', 'passenger_count_id',
96 | 'trip_distance_id', 'rate_code_id', 'store_and_fwd_flag', 'pickup_location_id', 'dropoff_location_id',
97 | 'payment_type_id', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
98 | 'improvement_surcharge', 'total_amount']]
99 |
100 | return {"datetime_dim":datetime_dim.to_dict(orient="dict"),
101 | "passenger_count_dim":passenger_count_dim.to_dict(orient="dict"),
102 | "trip_distance_dim":trip_distance_dim.to_dict(orient="dict"),
103 | "rate_code_dim":rate_code_dim.to_dict(orient="dict"),
104 | "pickup_location_dim":pickup_location_dim.to_dict(orient="dict"),
105 | "dropoff_location_dim":dropoff_location_dim.to_dict(orient="dict"),
106 | "payment_type_dim":payment_type_dim.to_dict(orient="dict"),
107 | "fact_table":fact_table.to_dict(orient="dict")}
108 |
109 |
110 | @test
111 | def test_output(output, *args) -> None:
112 | """
113 |     Basic check that the block produced output.
114 | """
115 | assert output is not None, 'The output is undefined'
116 |
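117 |
118 | if __name__ == '__main__':
119 |     # Standalone demo (illustrative; toy data, not the Uber dataset) of the
120 |     # surrogate-key pattern used above: de-duplicate a column into a
121 |     # dimension, number its rows, then merge the id back onto the facts.
122 |     toy = pd.DataFrame({'payment_type': [1, 2, 1, 3]})
123 |     dim = toy[['payment_type']].drop_duplicates().reset_index(drop=True)
124 |     dim['payment_type_id'] = dim.index
125 |     print(toy.merge(dim, on='payment_type'))
126 |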
--------------------------------------------------------------------------------