├── README.md ├── Uber Data Pipeline (Fixed Version).ipynb ├── Uber Data Pipeline (Video Version).ipynb ├── analytics_query.sql ├── architecture.jpg ├── commands.txt ├── data └── uber_data.csv ├── data_model.jpeg └── mage-files ├── extract.py ├── load.py └── transform.py /README.md: -------------------------------------------------------------------------------- 1 | # Uber Data Analytics | Modern Data Engineering GCP Project 2 | 3 | ## Introduction 4 | 5 | The goal of this project is to perform data analytics on Uber data using various tools and technologies, including GCP Storage, Python, Compute Instance, Mage Data Pipeline Tool, BigQuery, and Looker Studio. 6 | 7 | ## Architecture 8 | 9 | 10 | ## Technology Used 11 | - Programming Language - Python 12 | 13 | Google Cloud Platform 14 | 1. Google Storage 15 | 2. Compute Instance 16 | 3. BigQuery 17 | 4. Looker Studio 18 | 19 | Modern Data Pipeline Tool - https://www.mage.ai/ 20 | 21 | Contribute to this open-source project - https://github.com/mage-ai/mage-ai 22 | 23 | 24 | ## Dataset Used 25 | TLC Trip Record Data 26 | Yellow and green taxi trip records include fields capturing pick-up and drop-off dates/times, pick-up and drop-off locations, trip distances, itemized fares, rate types, payment types, and driver-reported passenger counts. 27 | 28 | Here is the dataset used in the video - https://github.com/darshilparmar/uber-etl-pipeline-data-engineering-project/blob/main/data/uber_data.csv 29 | 30 | More info about the dataset can be found here: 31 | 1. Website - https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page 32 | 2. Data Dictionary - https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf 33 | 34 | ## Data Model 35 | 36 | 37 | ## Complete Video Tutorial 38 | Video Link - https://youtu.be/WpQECq5Hx9g 39 | -------------------------------------------------------------------------------- /Uber Data Pipeline (Fixed Version).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "368d2580", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import io\n", 11 | "import pandas as pd\n", 12 | "import requests" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 3, 18 | "id": "500446cc", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "url = 'https://storage.googleapis.com/uber-data-engineering-project/uber_data.csv'\n", 23 | "response = requests.get(url)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "id": "b37a1ace", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "df = pd.read_csv(io.StringIO(response.text), sep=',')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 5, 39 | "id": "edc6d495", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])\n", 44 | "df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 6, 50 | "id": "4d0c8281", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df = df.drop_duplicates().reset_index(drop=True)\n", 55 | "df['trip_id'] = df.index" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 7, 61 | "id": "a35f32dd", 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/html": [ 67 | "
" 226 | ], 227 | "text/plain": [ 228 | " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", 229 | "0 1 2016-03-01 2016-03-01 00:07:55 1 \n", 230 | "1 1 2016-03-01 2016-03-01 00:11:06 1 \n", 231 | "2 2 2016-03-01 2016-03-01 00:31:06 2 \n", 232 | "3 2 2016-03-01 2016-03-01 00:00:00 3 \n", 233 | "4 2 2016-03-01 2016-03-01 00:00:00 5 \n", 234 | "\n", 235 | " trip_distance pickup_longitude pickup_latitude RatecodeID \\\n", 236 | "0 2.50 -73.976746 40.765152 1 \n", 237 | "1 2.90 -73.983482 40.767925 1 \n", 238 | "2 19.98 -73.782021 40.644810 1 \n", 239 | "3 10.78 -73.863419 40.769814 1 \n", 240 | "4 30.43 -73.971741 40.792183 3 \n", 241 | "\n", 242 | " store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type \\\n", 243 | "0 N -74.004265 40.746128 1 \n", 244 | "1 N -74.005943 40.733166 1 \n", 245 | "2 N -73.974541 40.675770 1 \n", 246 | "3 N -73.969650 40.757767 1 \n", 247 | "4 N -74.177170 40.695053 1 \n", 248 | "\n", 249 | " fare_amount extra mta_tax tip_amount tolls_amount \\\n", 250 | "0 9.0 0.5 0.5 2.05 0.00 \n", 251 | "1 11.0 0.5 0.5 3.05 0.00 \n", 252 | "2 54.5 0.5 0.5 8.00 0.00 \n", 253 | "3 31.5 0.0 0.5 3.78 5.54 \n", 254 | "4 98.0 0.0 0.0 0.00 15.50 \n", 255 | "\n", 256 | " improvement_surcharge total_amount trip_id \n", 257 | "0 0.3 12.35 0 \n", 258 | "1 0.3 15.35 1 \n", 259 | "2 0.3 63.80 2 \n", 260 | "3 0.3 41.62 3 \n", 261 | "4 0.3 113.80 4 " 262 | ] 263 | }, 264 | "execution_count": 7, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "df.head()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 10, 276 | "id": "4a148bc6", 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/html": [ 282 | "
" 399 | ], 400 | "text/plain": [ 401 | " datetime_id tpep_pickup_datetime pick_hour pick_day pick_month \\\n", 402 | "0 0 2016-03-01 0 1 3 \n", 403 | "1 1 2016-03-01 0 1 3 \n", 404 | "2 2 2016-03-01 0 1 3 \n", 405 | "3 3 2016-03-01 0 1 3 \n", 406 | "4 4 2016-03-01 0 1 3 \n", 407 | "\n", 408 | " pick_year pick_weekday tpep_dropoff_datetime drop_hour drop_day \\\n", 409 | "0 2016 1 2016-03-01 00:07:55 0 1 \n", 410 | "1 2016 1 2016-03-01 00:11:06 0 1 \n", 411 | "2 2016 1 2016-03-01 00:31:06 0 1 \n", 412 | "3 2016 1 2016-03-01 00:00:00 0 1 \n", 413 | "4 2016 1 2016-03-01 00:00:00 0 1 \n", 414 | "\n", 415 | " drop_month drop_year drop_weekday \n", 416 | "0 3 2016 1 \n", 417 | "1 3 2016 1 \n", 418 | "2 3 2016 1 \n", 419 | "3 3 2016 1 \n", 420 | "4 3 2016 1 " 421 | ] 422 | }, 423 | "execution_count": 10, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "datetime_dim = df[['tpep_pickup_datetime','tpep_dropoff_datetime']].reset_index(drop=True)\n", 430 | "datetime_dim['tpep_pickup_datetime'] = datetime_dim['tpep_pickup_datetime']\n", 431 | "datetime_dim['pick_hour'] = datetime_dim['tpep_pickup_datetime'].dt.hour\n", 432 | "datetime_dim['pick_day'] = datetime_dim['tpep_pickup_datetime'].dt.day\n", 433 | "datetime_dim['pick_month'] = datetime_dim['tpep_pickup_datetime'].dt.month\n", 434 | "datetime_dim['pick_year'] = datetime_dim['tpep_pickup_datetime'].dt.year\n", 435 | "datetime_dim['pick_weekday'] = datetime_dim['tpep_pickup_datetime'].dt.weekday\n", 436 | "\n", 437 | "datetime_dim['tpep_dropoff_datetime'] = datetime_dim['tpep_dropoff_datetime']\n", 438 | "datetime_dim['drop_hour'] = datetime_dim['tpep_dropoff_datetime'].dt.hour\n", 439 | "datetime_dim['drop_day'] = datetime_dim['tpep_dropoff_datetime'].dt.day\n", 440 | "datetime_dim['drop_month'] = datetime_dim['tpep_dropoff_datetime'].dt.month\n", 441 | "datetime_dim['drop_year'] = datetime_dim['tpep_dropoff_datetime'].dt.year\n", 442 | "datetime_dim['drop_weekday'] = datetime_dim['tpep_dropoff_datetime'].dt.weekday\n", 443 | "\n", 444 | "\n", 445 | "datetime_dim['datetime_id'] = datetime_dim.index\n", 446 | "\n", 447 | "# datetime_dim = datetime_dim.rename(columns={'tpep_pickup_datetime': 'datetime_id'}).reset_index(drop=True)\n", 448 | "datetime_dim = datetime_dim[['datetime_id', 'tpep_pickup_datetime', 'pick_hour', 'pick_day', 'pick_month', 'pick_year', 'pick_weekday',\n", 449 | " 'tpep_dropoff_datetime', 'drop_hour', 'drop_day', 'drop_month', 'drop_year', 'drop_weekday']]\n", 450 | "#\n", 451 | "datetime_dim.head()" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 11, 457 | "id": "ba67912f", 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "passenger_count_dim = df[['passenger_count']].reset_index(drop=True)\n", 462 | "passenger_count_dim['passenger_count_id'] = passenger_count_dim.index\n", 463 | "passenger_count_dim = passenger_count_dim[['passenger_count_id','passenger_count']]\n", 464 | "\n", 465 | "trip_distance_dim = df[['trip_distance']].reset_index(drop=True)\n", 466 | "trip_distance_dim['trip_distance_id'] = trip_distance_dim.index\n", 467 | "trip_distance_dim = trip_distance_dim[['trip_distance_id','trip_distance']]\n" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 12, 473 | "id": "fb7c9704", 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "rate_code_type = {\n", 478 | " 1:\"Standard rate\",\n", 479 | " 2:\"JFK\",\n", 480 | " 3:\"Newark\",\n", 481 | " 4:\"Nassau or Westchester\",\n", 482 
| " 5:\"Negotiated fare\",\n", 483 | " 6:\"Group ride\"\n", 484 | "}\n", 485 | "\n", 486 | "rate_code_dim = df[['RatecodeID']].reset_index(drop=True)\n", 487 | "rate_code_dim['rate_code_id'] = rate_code_dim.index\n", 488 | "rate_code_dim['rate_code_name'] = rate_code_dim['RatecodeID'].map(rate_code_type)\n", 489 | "rate_code_dim = rate_code_dim[['rate_code_id','RatecodeID','rate_code_name']]\n", 490 | "\n", 491 | "# rate_code_dim.head()" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 44, 497 | "id": "4db826a1", 498 | "metadata": {}, 499 | "outputs": [ 500 | { 501 | "data": { 502 | "text/html": [ 503 | "
" 560 | ], 561 | "text/plain": [ 562 | " rate_code_id RatecodeID rate_code_name\n", 563 | "0 0 1 Standard rate\n", 564 | "1 1 3 Newark\n", 565 | "2 2 2 JFK\n", 566 | "3 3 5 Negotiated fare\n", 567 | "4 4 4 Nassau or Westchester" 568 | ] 569 | }, 570 | "execution_count": 44, 571 | "metadata": {}, 572 | "output_type": "execute_result" 573 | } 574 | ], 575 | "source": [ 576 | "rate_code_dim.head()" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 13, 582 | "id": "8048bdef", 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "pickup_location_dim = df[['pickup_longitude', 'pickup_latitude']].reset_index(drop=True)\n", 587 | "pickup_location_dim['pickup_location_id'] = pickup_location_dim.index\n", 588 | "pickup_location_dim = pickup_location_dim[['pickup_location_id','pickup_latitude','pickup_longitude']] \n", 589 | "\n", 590 | "\n", 591 | "dropoff_location_dim = df[['dropoff_longitude', 'dropoff_latitude']].reset_index(drop=True)\n", 592 | "dropoff_location_dim['dropoff_location_id'] = dropoff_location_dim.index\n", 593 | "dropoff_location_dim = dropoff_location_dim[['dropoff_location_id','dropoff_latitude','dropoff_longitude']]" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 14, 599 | "id": "bfb04993", 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "payment_type_name = {\n", 604 | " 1:\"Credit card\",\n", 605 | " 2:\"Cash\",\n", 606 | " 3:\"No charge\",\n", 607 | " 4:\"Dispute\",\n", 608 | " 5:\"Unknown\",\n", 609 | " 6:\"Voided trip\"\n", 610 | "}\n", 611 | "payment_type_dim = df[['payment_type']].reset_index(drop=True)\n", 612 | "payment_type_dim['payment_type_id'] = payment_type_dim.index\n", 613 | "payment_type_dim['payment_type_name'] = payment_type_dim['payment_type'].map(payment_type_name)\n", 614 | "payment_type_dim = payment_type_dim[['payment_type_id','payment_type','payment_type_name']]" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 15, 620 | "id": "e747865b", 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "\n", 625 | "fact_table = df.merge(passenger_count_dim, left_on='trip_id', right_on='passenger_count_id') \\\n", 626 | " .merge(trip_distance_dim, left_on='trip_id', right_on='trip_distance_id') \\\n", 627 | " .merge(rate_code_dim, left_on='trip_id', right_on='rate_code_id') \\\n", 628 | " .merge(pickup_location_dim, left_on='trip_id', right_on='pickup_location_id') \\\n", 629 | " .merge(dropoff_location_dim, left_on='trip_id', right_on='dropoff_location_id')\\\n", 630 | " .merge(datetime_dim, left_on='trip_id', right_on='datetime_id') \\\n", 631 | " .merge(payment_type_dim, left_on='trip_id', right_on='payment_type_id') \\\n", 632 | " [['trip_id','VendorID', 'datetime_id', 'passenger_count_id',\n", 633 | " 'trip_distance_id', 'rate_code_id', 'store_and_fwd_flag', 'pickup_location_id', 'dropoff_location_id',\n", 634 | " 'payment_type_id', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',\n", 635 | " 'improvement_surcharge', 'total_amount']]" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 16, 641 | "id": "9acdb41a", 642 | "metadata": {}, 643 | "outputs": [ 644 | { 645 | "data": { 646 | "text/plain": [ 647 | "Index(['payment_type_id', 'payment_type', 'payment_type_name'], dtype='object')" 648 | ] 649 | }, 650 | "execution_count": 16, 651 | "metadata": {}, 652 | "output_type": "execute_result" 653 | } 654 | ], 655 | "source": [ 656 | "payment_type_dim.columns" 657 | ] 658 | }, 659 | { 660 | 
"cell_type": "code", 661 | "execution_count": 17, 662 | "id": "62e05477", 663 | "metadata": {}, 664 | "outputs": [ 665 | { 666 | "data": { 667 | "text/plain": [ 668 | "Index(['trip_id', 'VendorID', 'datetime_id', 'passenger_count_id',\n", 669 | " 'trip_distance_id', 'rate_code_id', 'store_and_fwd_flag',\n", 670 | " 'pickup_location_id', 'dropoff_location_id', 'payment_type_id',\n", 671 | " 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',\n", 672 | " 'improvement_surcharge', 'total_amount'],\n", 673 | " dtype='object')" 674 | ] 675 | }, 676 | "execution_count": 17, 677 | "metadata": {}, 678 | "output_type": "execute_result" 679 | } 680 | ], 681 | "source": [ 682 | "fact_table.columns" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "id": "6162ee05", 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 18, 696 | "id": "254ff9ad", 697 | "metadata": {}, 698 | "outputs": [ 699 | { 700 | "data": { 701 | "text/html": [ 702 | "
" 964 | ], 965 | "text/plain": [ 966 | " trip_id VendorID datetime_id passenger_count_id trip_distance_id \\\n", 967 | "0 0 1 0 0 0 \n", 968 | "1 1 1 1 1 1 \n", 969 | "2 2 2 2 2 2 \n", 970 | "3 3 2 3 3 3 \n", 971 | "4 4 2 4 4 4 \n", 972 | "... ... ... ... ... ... \n", 973 | "99995 99995 1 99995 99995 99995 \n", 974 | "99996 99996 1 99996 99996 99996 \n", 975 | "99997 99997 1 99997 99997 99997 \n", 976 | "99998 99998 2 99998 99998 99998 \n", 977 | "99999 99999 1 99999 99999 99999 \n", 978 | "\n", 979 | " rate_code_id store_and_fwd_flag pickup_location_id \\\n", 980 | "0 0 N 0 \n", 981 | "1 1 N 1 \n", 982 | "2 2 N 2 \n", 983 | "3 3 N 3 \n", 984 | "4 4 N 4 \n", 985 | "... ... ... ... \n", 986 | "99995 99995 N 99995 \n", 987 | "99996 99996 N 99996 \n", 988 | "99997 99997 N 99997 \n", 989 | "99998 99998 N 99998 \n", 990 | "99999 99999 N 99999 \n", 991 | "\n", 992 | " dropoff_location_id payment_type_id fare_amount extra mta_tax \\\n", 993 | "0 0 0 9.0 0.5 0.5 \n", 994 | "1 1 1 11.0 0.5 0.5 \n", 995 | "2 2 2 54.5 0.5 0.5 \n", 996 | "3 3 3 31.5 0.0 0.5 \n", 997 | "4 4 4 98.0 0.0 0.0 \n", 998 | "... ... ... ... ... ... \n", 999 | "99995 99995 99995 5.0 0.0 0.5 \n", 1000 | "99996 99996 99996 14.0 0.0 0.5 \n", 1001 | "99997 99997 99997 29.0 0.0 0.5 \n", 1002 | "99998 99998 99998 5.5 0.5 0.5 \n", 1003 | "99999 99999 99999 6.0 0.0 0.5 \n", 1004 | "\n", 1005 | " tip_amount tolls_amount improvement_surcharge total_amount \n", 1006 | "0 2.05 0.00 0.3 12.35 \n", 1007 | "1 3.05 0.00 0.3 15.35 \n", 1008 | "2 8.00 0.00 0.3 63.80 \n", 1009 | "3 3.78 5.54 0.3 41.62 \n", 1010 | "4 0.00 15.50 0.3 113.80 \n", 1011 | "... ... ... ... ... \n", 1012 | "99995 0.00 0.00 0.3 5.80 \n", 1013 | "99996 2.00 0.00 0.3 16.80 \n", 1014 | "99997 8.80 5.54 0.3 44.14 \n", 1015 | "99998 1.36 0.00 0.3 8.16 \n", 1016 | "99999 0.00 0.00 0.3 6.80 \n", 1017 | "\n", 1018 | "[100000 rows x 17 columns]" 1019 | ] 1020 | }, 1021 | "execution_count": 18, 1022 | "metadata": {}, 1023 | "output_type": "execute_result" 1024 | } 1025 | ], 1026 | "source": [ 1027 | "fact_table" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": null, 1033 | "id": "a45c58d1", 1034 | "metadata": {}, 1035 | "outputs": [], 1036 | "source": [ 1037 | "# CREATE OR REPLACE TABLE `data-with-darshil.uber_dataset.tbl_analysis_report` AS (\n", 1038 | "# SELECT\n", 1039 | "# f.VendorID,\n", 1040 | "# f.tpep_pickup_datetime,\n", 1041 | "# f.tpep_dropoff_datetime,\n", 1042 | "# p.passenger_count,\n", 1043 | "# td.trip_distance,\n", 1044 | "# rc.RatecodeID,\n", 1045 | "# f.store_and_fwd_flag,\n", 1046 | "# pl.pickup_latitude,\n", 1047 | "# pl.pickup_longitude,\n", 1048 | "# dl.dropoff_latitude,\n", 1049 | "# dl.dropoff_longitude,\n", 1050 | "# pt.payment_type,\n", 1051 | "# f.fare_amount,\n", 1052 | "# f.extra,\n", 1053 | "# f.mta_tax,\n", 1054 | "# f.tip_amount,\n", 1055 | "# f.tolls_amount,\n", 1056 | "# f.improvement_surcharge,\n", 1057 | "# f.total_amount\n", 1058 | "# FROM\n", 1059 | "# `data-with-darshil.uber_dataset.fact_table` f\n", 1060 | "# JOIN `data-with-darshil.uber_dataset.passenger_count_dim` p ON f.passenger_count_id = p.passenger_count_id\n", 1061 | "# JOIN `data-with-darshil.uber_dataset.trip_distance_dim` td ON f.trip_distance_id = td.trip_distance_id\n", 1062 | "# JOIN `data-with-darshil.uber_dataset.rate_code_dim` rc ON f.rate_code_id = rc.rate_code_id\n", 1063 | "# JOIN `data-with-darshil.uber_dataset.pickup_location_dim` pl ON f.pickup_location_id = pl.pickup_location_id\n", 1064 | "# JOIN 
`data-with-darshil.uber_dataset.dropoff_location_dim` dl ON f.dropoff_location_id = dl.dropoff_location_id\n", 1065 | "# JOIN `data-with-darshil.uber_dataset.payment_type_dim` pt ON f.payment_type_id = pt.payment_type_id);" 1066 | ] 1067 | } 1068 | ], 1069 | "metadata": { 1070 | "kernelspec": { 1071 | "display_name": "Python 3 (ipykernel)", 1072 | "language": "python", 1073 | "name": "python3" 1074 | }, 1075 | "language_info": { 1076 | "codemirror_mode": { 1077 | "name": "ipython", 1078 | "version": 3 1079 | }, 1080 | "file_extension": ".py", 1081 | "mimetype": "text/x-python", 1082 | "name": "python", 1083 | "nbconvert_exporter": "python", 1084 | "pygments_lexer": "ipython3", 1085 | "version": "3.10.6" 1086 | } 1087 | }, 1088 | "nbformat": 4, 1089 | "nbformat_minor": 5 1090 | } 1091 | -------------------------------------------------------------------------------- /Uber Data Pipeline (Video Version).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "9fc22827", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "7a84de07", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "df = pd.read_csv(\"data/uber_data.csv\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 4, 26 | "id": "7f70bb49", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/html": [ 32 | "
" 185 | ], 186 | "text/plain": [ 187 | " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", 188 | "0 1 2016-03-01 00:00:00 2016-03-01 00:07:55 1 \n", 189 | "1 1 2016-03-01 00:00:00 2016-03-01 00:11:06 1 \n", 190 | "2 2 2016-03-01 00:00:00 2016-03-01 00:31:06 2 \n", 191 | "3 2 2016-03-01 00:00:00 2016-03-01 00:00:00 3 \n", 192 | "4 2 2016-03-01 00:00:00 2016-03-01 00:00:00 5 \n", 193 | "\n", 194 | " trip_distance pickup_longitude pickup_latitude RatecodeID \\\n", 195 | "0 2.50 -73.976746 40.765152 1 \n", 196 | "1 2.90 -73.983482 40.767925 1 \n", 197 | "2 19.98 -73.782021 40.644810 1 \n", 198 | "3 10.78 -73.863419 40.769814 1 \n", 199 | "4 30.43 -73.971741 40.792183 3 \n", 200 | "\n", 201 | " store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type \\\n", 202 | "0 N -74.004265 40.746128 1 \n", 203 | "1 N -74.005943 40.733166 1 \n", 204 | "2 N -73.974541 40.675770 1 \n", 205 | "3 N -73.969650 40.757767 1 \n", 206 | "4 N -74.177170 40.695053 1 \n", 207 | "\n", 208 | " fare_amount extra mta_tax tip_amount tolls_amount \\\n", 209 | "0 9.0 0.5 0.5 2.05 0.00 \n", 210 | "1 11.0 0.5 0.5 3.05 0.00 \n", 211 | "2 54.5 0.5 0.5 8.00 0.00 \n", 212 | "3 31.5 0.0 0.5 3.78 5.54 \n", 213 | "4 98.0 0.0 0.0 0.00 15.50 \n", 214 | "\n", 215 | " improvement_surcharge total_amount \n", 216 | "0 0.3 12.35 \n", 217 | "1 0.3 15.35 \n", 218 | "2 0.3 63.80 \n", 219 | "3 0.3 41.62 \n", 220 | "4 0.3 113.80 " 221 | ] 222 | }, 223 | "execution_count": 4, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "df.head()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 6, 235 | "id": "b8720809", 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "\n", 243 | "RangeIndex: 100000 entries, 0 to 99999\n", 244 | "Data columns (total 19 columns):\n", 245 | " # Column Non-Null Count Dtype \n", 246 | "--- ------ -------------- ----- \n", 247 | " 0 VendorID 100000 non-null int64 \n", 248 | " 1 tpep_pickup_datetime 100000 non-null object \n", 249 | " 2 tpep_dropoff_datetime 100000 non-null object \n", 250 | " 3 passenger_count 100000 non-null int64 \n", 251 | " 4 trip_distance 100000 non-null float64\n", 252 | " 5 pickup_longitude 100000 non-null float64\n", 253 | " 6 pickup_latitude 100000 non-null float64\n", 254 | " 7 RatecodeID 100000 non-null int64 \n", 255 | " 8 store_and_fwd_flag 100000 non-null object \n", 256 | " 9 dropoff_longitude 100000 non-null float64\n", 257 | " 10 dropoff_latitude 100000 non-null float64\n", 258 | " 11 payment_type 100000 non-null int64 \n", 259 | " 12 fare_amount 100000 non-null float64\n", 260 | " 13 extra 100000 non-null float64\n", 261 | " 14 mta_tax 100000 non-null float64\n", 262 | " 15 tip_amount 100000 non-null float64\n", 263 | " 16 tolls_amount 100000 non-null float64\n", 264 | " 17 improvement_surcharge 100000 non-null float64\n", 265 | " 18 total_amount 100000 non-null float64\n", 266 | "dtypes: float64(12), int64(4), object(3)\n", 267 | "memory usage: 14.5+ MB\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "df.info()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 8, 278 | "id": "c0632db9", 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])\n", 283 | "df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | 
"execution_count": 9, 289 | "id": "d682dc12", 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "\n", 297 | "RangeIndex: 100000 entries, 0 to 99999\n", 298 | "Data columns (total 19 columns):\n", 299 | " # Column Non-Null Count Dtype \n", 300 | "--- ------ -------------- ----- \n", 301 | " 0 VendorID 100000 non-null int64 \n", 302 | " 1 tpep_pickup_datetime 100000 non-null datetime64[ns]\n", 303 | " 2 tpep_dropoff_datetime 100000 non-null datetime64[ns]\n", 304 | " 3 passenger_count 100000 non-null int64 \n", 305 | " 4 trip_distance 100000 non-null float64 \n", 306 | " 5 pickup_longitude 100000 non-null float64 \n", 307 | " 6 pickup_latitude 100000 non-null float64 \n", 308 | " 7 RatecodeID 100000 non-null int64 \n", 309 | " 8 store_and_fwd_flag 100000 non-null object \n", 310 | " 9 dropoff_longitude 100000 non-null float64 \n", 311 | " 10 dropoff_latitude 100000 non-null float64 \n", 312 | " 11 payment_type 100000 non-null int64 \n", 313 | " 12 fare_amount 100000 non-null float64 \n", 314 | " 13 extra 100000 non-null float64 \n", 315 | " 14 mta_tax 100000 non-null float64 \n", 316 | " 15 tip_amount 100000 non-null float64 \n", 317 | " 16 tolls_amount 100000 non-null float64 \n", 318 | " 17 improvement_surcharge 100000 non-null float64 \n", 319 | " 18 total_amount 100000 non-null float64 \n", 320 | "dtypes: datetime64[ns](2), float64(12), int64(4), object(1)\n", 321 | "memory usage: 14.5+ MB\n" 322 | ] 323 | } 324 | ], 325 | "source": [ 326 | "df.info()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 19, 332 | "id": "acbb0a89", 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "datetime_dim = df[['tpep_pickup_datetime','tpep_dropoff_datetime']].drop_duplicates().reset_index(drop=True)\n", 337 | "datetime_dim['pick_hour'] = datetime_dim['tpep_pickup_datetime'].dt.hour\n", 338 | "datetime_dim['pick_day'] = datetime_dim['tpep_pickup_datetime'].dt.day\n", 339 | "datetime_dim['pick_month'] = datetime_dim['tpep_pickup_datetime'].dt.month\n", 340 | "datetime_dim['pick_year'] = datetime_dim['tpep_pickup_datetime'].dt.year\n", 341 | "datetime_dim['pick_weekday'] = datetime_dim['tpep_pickup_datetime'].dt.weekday\n", 342 | "\n", 343 | "datetime_dim['drop_hour'] = datetime_dim['tpep_dropoff_datetime'].dt.hour\n", 344 | "datetime_dim['drop_day'] = datetime_dim['tpep_dropoff_datetime'].dt.day\n", 345 | "datetime_dim['drop_month'] = datetime_dim['tpep_dropoff_datetime'].dt.month\n", 346 | "datetime_dim['drop_year'] = datetime_dim['tpep_dropoff_datetime'].dt.year\n", 347 | "datetime_dim['drop_weekday'] = datetime_dim['tpep_dropoff_datetime'].dt.weekday\n", 348 | "\n" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 22, 354 | "id": "29899c21", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "datetime_dim['datetime_id'] = datetime_dim.index" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 24, 364 | "id": "7a59b1da", 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "datetime_dim = datetime_dim[['datetime_id', 'tpep_pickup_datetime', 'pick_hour', 'pick_day', 'pick_month', 'pick_year', 'pick_weekday',\n", 369 | " 'tpep_dropoff_datetime', 'drop_hour', 'drop_day', 'drop_month', 'drop_year', 'drop_weekday']]\n", 370 | "#" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 25, 376 | "id": "97e42f3b", 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "data": { 381 | 
"text/html": [ 382 | "
" 596 | ], 597 | "text/plain": [ 598 | " datetime_id tpep_pickup_datetime pick_hour pick_day pick_month \\\n", 599 | "0 0 2016-03-01 00:00:00 0 1 3 \n", 600 | "1 1 2016-03-01 00:00:00 0 1 3 \n", 601 | "2 2 2016-03-01 00:00:00 0 1 3 \n", 602 | "3 3 2016-03-01 00:00:00 0 1 3 \n", 603 | "4 4 2016-03-01 00:00:01 0 1 3 \n", 604 | "... ... ... ... ... ... \n", 605 | "99848 99848 2016-03-01 06:17:10 6 1 3 \n", 606 | "99849 99849 2016-03-01 06:17:10 6 1 3 \n", 607 | "99850 99850 2016-03-01 06:17:10 6 1 3 \n", 608 | "99851 99851 2016-03-01 06:17:10 6 1 3 \n", 609 | "99852 99852 2016-03-01 06:17:11 6 1 3 \n", 610 | "\n", 611 | " pick_year pick_weekday tpep_dropoff_datetime drop_hour drop_day \\\n", 612 | "0 2016 1 2016-03-01 00:07:55 0 1 \n", 613 | "1 2016 1 2016-03-01 00:11:06 0 1 \n", 614 | "2 2016 1 2016-03-01 00:31:06 0 1 \n", 615 | "3 2016 1 2016-03-01 00:00:00 0 1 \n", 616 | "4 2016 1 2016-03-01 00:16:04 0 1 \n", 617 | "... ... ... ... ... ... \n", 618 | "99848 2016 1 2016-03-01 06:22:15 6 1 \n", 619 | "99849 2016 1 2016-03-01 06:32:41 6 1 \n", 620 | "99850 2016 1 2016-03-01 06:37:23 6 1 \n", 621 | "99851 2016 1 2016-03-01 06:22:09 6 1 \n", 622 | "99852 2016 1 2016-03-01 06:22:00 6 1 \n", 623 | "\n", 624 | " drop_month drop_year drop_weekday \n", 625 | "0 3 2016 1 \n", 626 | "1 3 2016 1 \n", 627 | "2 3 2016 1 \n", 628 | "3 3 2016 1 \n", 629 | "4 3 2016 1 \n", 630 | "... ... ... ... \n", 631 | "99848 3 2016 1 \n", 632 | "99849 3 2016 1 \n", 633 | "99850 3 2016 1 \n", 634 | "99851 3 2016 1 \n", 635 | "99852 3 2016 1 \n", 636 | "\n", 637 | "[99853 rows x 13 columns]" 638 | ] 639 | }, 640 | "execution_count": 25, 641 | "metadata": {}, 642 | "output_type": "execute_result" 643 | } 644 | ], 645 | "source": [ 646 | "datetime_dim" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": 26, 652 | "id": "69a20d11", 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "passenger_count_dim = df[['passenger_count']].drop_duplicates().reset_index(drop=True)\n", 657 | "passenger_count_dim['passenger_count_id'] = passenger_count_dim.index\n", 658 | "passenger_count_dim = passenger_count_dim[['passenger_count_id','passenger_count']]\n", 659 | "\n", 660 | "trip_distance_dim = df[['trip_distance']].drop_duplicates().reset_index(drop=True)\n", 661 | "trip_distance_dim['trip_distance_id'] = trip_distance_dim.index\n", 662 | "trip_distance_dim = trip_distance_dim[['trip_distance_id','trip_distance']]\n" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 27, 668 | "id": "f1079250", 669 | "metadata": {}, 670 | "outputs": [ 671 | { 672 | "data": { 673 | "text/html": [ 674 | "
" 725 | ], 726 | "text/plain": [ 727 | " passenger_count_id passenger_count\n", 728 | "0 0 1\n", 729 | "1 1 2\n", 730 | "2 2 3\n", 731 | "3 3 5\n", 732 | "4 4 6" 733 | ] 734 | }, 735 | "execution_count": 27, 736 | "metadata": {}, 737 | "output_type": "execute_result" 738 | } 739 | ], 740 | "source": [ 741 | "passenger_count_dim.head()" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": 28, 747 | "id": "06e72f95", 748 | "metadata": {}, 749 | "outputs": [ 750 | { 751 | "data": { 752 | "text/html": [ 753 | "
" 804 | ], 805 | "text/plain": [ 806 | " trip_distance_id trip_distance\n", 807 | "0 0 2.50\n", 808 | "1 1 2.90\n", 809 | "2 2 19.98\n", 810 | "3 3 10.78\n", 811 | "4 4 30.43" 812 | ] 813 | }, 814 | "execution_count": 28, 815 | "metadata": {}, 816 | "output_type": "execute_result" 817 | } 818 | ], 819 | "source": [ 820 | "trip_distance_dim.head()" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 29, 826 | "id": "9d511965", 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [ 830 | "rate_code_type = {\n", 831 | " 1:\"Standard rate\",\n", 832 | " 2:\"JFK\",\n", 833 | " 3:\"Newark\",\n", 834 | " 4:\"Nassau or Westchester\",\n", 835 | " 5:\"Negotiated fare\",\n", 836 | " 6:\"Group ride\"\n", 837 | "}\n", 838 | "\n", 839 | "rate_code_dim = df[['RatecodeID']].drop_duplicates().reset_index(drop=True)\n", 840 | "rate_code_dim['rate_code_id'] = rate_code_dim.index\n", 841 | "rate_code_dim['rate_code_name'] = rate_code_dim['RatecodeID'].map(rate_code_type)\n", 842 | "rate_code_dim = rate_code_dim[['rate_code_id','RatecodeID','rate_code_name']]\n" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": 30, 848 | "id": "400e751a", 849 | "metadata": {}, 850 | "outputs": [ 851 | { 852 | "data": { 853 | "text/html": [ 854 | "
" 911 | ], 912 | "text/plain": [ 913 | " rate_code_id RatecodeID rate_code_name\n", 914 | "0 0 1 Standard rate\n", 915 | "1 1 3 Newark\n", 916 | "2 2 2 JFK\n", 917 | "3 3 5 Negotiated fare\n", 918 | "4 4 4 Nassau or Westchester" 919 | ] 920 | }, 921 | "execution_count": 30, 922 | "metadata": {}, 923 | "output_type": "execute_result" 924 | } 925 | ], 926 | "source": [ 927 | "rate_code_dim.head()" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": 31, 933 | "id": "aaf45842", 934 | "metadata": {}, 935 | "outputs": [], 936 | "source": [ 937 | "pickup_location_dim = df[['pickup_longitude', 'pickup_latitude']].drop_duplicates().reset_index(drop=True)\n", 938 | "pickup_location_dim['pickup_location_id'] = pickup_location_dim.index\n", 939 | "pickup_location_dim = pickup_location_dim[['pickup_location_id','pickup_latitude','pickup_longitude']] \n", 940 | "\n", 941 | "\n", 942 | "dropoff_location_dim = df[['dropoff_longitude', 'dropoff_latitude']].drop_duplicates().reset_index(drop=True)\n", 943 | "dropoff_location_dim['dropoff_location_id'] = dropoff_location_dim.index\n", 944 | "dropoff_location_dim = dropoff_location_dim[['dropoff_location_id','dropoff_latitude','dropoff_longitude']]" 945 | ] 946 | }, 947 | { 948 | "cell_type": "code", 949 | "execution_count": 32, 950 | "id": "998253b7", 951 | "metadata": {}, 952 | "outputs": [], 953 | "source": [ 954 | "payment_type_name = {\n", 955 | " 1:\"Credit card\",\n", 956 | " 2:\"Cash\",\n", 957 | " 3:\"No charge\",\n", 958 | " 4:\"Dispute\",\n", 959 | " 5:\"Unknown\",\n", 960 | " 6:\"Voided trip\"\n", 961 | "}\n", 962 | "payment_type_dim = df[['payment_type']].drop_duplicates().reset_index(drop=True)\n", 963 | "payment_type_dim['payment_type_id'] = payment_type_dim.index\n", 964 | "payment_type_dim['payment_type_name'] = payment_type_dim['payment_type'].map(payment_type_name)\n", 965 | "payment_type_dim = payment_type_dim[['payment_type_id','payment_type','payment_type_name']]" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": 34, 971 | "id": "0ab12341", 972 | "metadata": {}, 973 | "outputs": [ 974 | { 975 | "data": { 976 | "text/html": [ 977 | "
" 1028 | ], 1029 | "text/plain": [ 1030 | " payment_type_id payment_type payment_type_name\n", 1031 | "0 0 1 Credit card\n", 1032 | "1 1 2 Cash\n", 1033 | "2 2 3 No charge\n", 1034 | "3 3 4 Dispute" 1035 | ] 1036 | }, 1037 | "execution_count": 34, 1038 | "metadata": {}, 1039 | "output_type": "execute_result" 1040 | } 1041 | ], 1042 | "source": [ 1043 | "payment_type_dim.head()" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "code", 1048 | "execution_count": 37, 1049 | "id": "6f46d41f", 1050 | "metadata": {}, 1051 | "outputs": [], 1052 | "source": [ 1053 | "fact_table = df.merge(passenger_count_dim, on='passenger_count') \\\n", 1054 | " .merge(trip_distance_dim, on='trip_distance') \\\n", 1055 | " .merge(rate_code_dim, on='RatecodeID') \\\n", 1056 | " .merge(pickup_location_dim, on=['pickup_longitude', 'pickup_latitude']) \\\n", 1057 | " .merge(dropoff_location_dim, on=['dropoff_longitude', 'dropoff_latitude'])\\\n", 1058 | " .merge(datetime_dim, on=['tpep_pickup_datetime','tpep_dropoff_datetime']) \\\n", 1059 | " .merge(payment_type_dim, on='payment_type') \\\n", 1060 | " [['VendorID', 'datetime_id', 'passenger_count_id',\n", 1061 | " 'trip_distance_id', 'rate_code_id', 'store_and_fwd_flag', 'pickup_location_id', 'dropoff_location_id',\n", 1062 | " 'payment_type_id', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',\n", 1063 | " 'improvement_surcharge', 'total_amount']]" 1064 | ] 1065 | }, 1066 | { 1067 | "cell_type": "code", 1068 | "execution_count": 39, 1069 | "id": "9d5a38bf", 1070 | "metadata": {}, 1071 | "outputs": [ 1072 | { 1073 | "data": { 1074 | "text/html": [ 1075 | "
" 1325 | ], 1326 | "text/plain": [ 1327 | " VendorID datetime_id passenger_count_id trip_distance_id \\\n", 1328 | "0 1 0 0 0 \n", 1329 | "1 2 1491 0 0 \n", 1330 | "2 2 2834 0 0 \n", 1331 | "3 2 3488 0 0 \n", 1332 | "4 2 3923 0 0 \n", 1333 | "... ... ... ... ... \n", 1334 | "99995 1 65943 0 257 \n", 1335 | "99996 1 81651 0 257 \n", 1336 | "99997 2 87152 4 257 \n", 1337 | "99998 2 53874 4 1060 \n", 1338 | "99999 1 88727 0 1894 \n", 1339 | "\n", 1340 | " rate_code_id store_and_fwd_flag pickup_location_id \\\n", 1341 | "0 0 N 0 \n", 1342 | "1 0 N 1481 \n", 1343 | "2 0 N 2816 \n", 1344 | "3 0 N 3465 \n", 1345 | "4 0 N 3899 \n", 1346 | "... ... ... ... \n", 1347 | "99995 3 N 64896 \n", 1348 | "99996 3 N 80276 \n", 1349 | "99997 1 N 85670 \n", 1350 | "99998 1 N 53081 \n", 1351 | "99999 1 N 87206 \n", 1352 | "\n", 1353 | " dropoff_location_id payment_type_id fare_amount extra mta_tax \\\n", 1354 | "0 0 0 9.0 0.5 0.5 \n", 1355 | "1 1484 0 10.5 0.0 0.5 \n", 1356 | "2 2819 0 9.5 0.0 0.5 \n", 1357 | "3 3470 0 13.5 0.0 0.5 \n", 1358 | "4 3903 0 10.5 0.0 0.5 \n", 1359 | "... ... ... ... ... ... \n", 1360 | "99995 65105 3 170.0 0.0 0.0 \n", 1361 | "99996 80547 3 10.0 0.0 0.0 \n", 1362 | "99997 85971 3 -20.0 -0.5 0.0 \n", 1363 | "99998 53222 3 -25.5 0.0 0.0 \n", 1364 | "99999 87511 3 70.5 0.5 0.0 \n", 1365 | "\n", 1366 | " tip_amount tolls_amount improvement_surcharge total_amount \n", 1367 | "0 2.05 0.0 0.3 12.35 \n", 1368 | "1 2.26 0.0 0.3 13.56 \n", 1369 | "2 1.25 0.0 0.3 11.55 \n", 1370 | "3 2.00 0.0 0.3 16.30 \n", 1371 | "4 2.26 0.0 0.3 13.56 \n", 1372 | "... ... ... ... ... \n", 1373 | "99995 0.00 0.0 0.3 170.30 \n", 1374 | "99996 0.00 0.0 0.3 10.30 \n", 1375 | "99997 0.00 0.0 -0.3 -20.80 \n", 1376 | "99998 0.00 0.0 -0.3 -25.80 \n", 1377 | "99999 0.00 10.5 0.3 81.80 \n", 1378 | "\n", 1379 | "[100000 rows x 16 columns]" 1380 | ] 1381 | }, 1382 | "execution_count": 39, 1383 | "metadata": {}, 1384 | "output_type": "execute_result" 1385 | } 1386 | ], 1387 | "source": [ 1388 | "fact_table" 1389 | ] 1390 | }, 1391 | { 1392 | "cell_type": "code", 1393 | "execution_count": null, 1394 | "id": "954c8df4", 1395 | "metadata": {}, 1396 | "outputs": [], 1397 | "source": [] 1398 | } 1399 | ], 1400 | "metadata": { 1401 | "kernelspec": { 1402 | "display_name": "Python 3 (ipykernel)", 1403 | "language": "python", 1404 | "name": "python3" 1405 | }, 1406 | "language_info": { 1407 | "codemirror_mode": { 1408 | "name": "ipython", 1409 | "version": 3 1410 | }, 1411 | "file_extension": ".py", 1412 | "mimetype": "text/x-python", 1413 | "name": "python", 1414 | "nbconvert_exporter": "python", 1415 | "pygments_lexer": "ipython3", 1416 | "version": "3.10.6" 1417 | } 1418 | }, 1419 | "nbformat": 4, 1420 | "nbformat_minor": 5 1421 | } 1422 | -------------------------------------------------------------------------------- /analytics_query.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE `data-with-darshil.uber_data_engineering_yt.tbl_analytics` AS ( 2 | SELECT 3 | f.trip_id, 4 | f.VendorID, 5 | d.tpep_pickup_datetime, 6 | d.tpep_dropoff_datetime, 7 | p.passenger_count, 8 | t.trip_distance, 9 | r.rate_code_name, 10 | pick.pickup_latitude, 11 | pick.pickup_longitude, 12 | drop.dropoff_latitude, 13 | drop.dropoff_longitude, 14 | pay.payment_type_name, 15 | f.fare_amount, 16 | f.extra, 17 | f.mta_tax, 18 | f.tip_amount, 19 | f.tolls_amount, 20 | f.improvement_surcharge, 21 | f.total_amount 22 | FROM 23 | 24 | `data-with-darshil.uber_data_engineering_yt.fact_table` f 25 
| JOIN `data-with-darshil.uber_data_engineering_yt.datetime_dim` d ON f.datetime_id=d.datetime_id 26 | JOIN `data-with-darshil.uber_data_engineering_yt.passenger_count_dim` p ON p.passenger_count_id=f.passenger_count_id 27 | JOIN `data-with-darshil.uber_data_engineering_yt.trip_distance_dim` t ON t.trip_distance_id=f.trip_distance_id 28 | JOIN `data-with-darshil.uber_data_engineering_yt.rate_code_dim` r ON r.rate_code_id=f.rate_code_id 29 | JOIN `data-with-darshil.uber_data_engineering_yt.pickup_location_dim` pick ON pick.pickup_location_id=f.pickup_location_id 30 | JOIN `data-with-darshil.uber_data_engineering_yt.dropoff_location_dim` drop ON drop.dropoff_location_id=f.dropoff_location_id 31 | JOIN `data-with-darshil.uber_data_engineering_yt.payment_type_dim` pay ON pay.payment_type_id=f.payment_type_id) 32 | ; 33 | -------------------------------------------------------------------------------- /architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/darshilparmar/uber-etl-pipeline-data-engineering-project/5373ce54237783e15f6fe9b2bdd924b544325bc0/architecture.jpg -------------------------------------------------------------------------------- /commands.txt: -------------------------------------------------------------------------------- 1 | # Install Python and pip 2 | sudo apt-get update 3 | 4 | sudo apt-get install python3-distutils 5 | 6 | sudo apt-get install python3-apt 7 | 8 | sudo apt-get install wget 9 | 10 | wget https://bootstrap.pypa.io/get-pip.py 11 | 12 | sudo python3 get-pip.py 13 | 14 | 15 | # Install Mage 16 | sudo pip3 install mage-ai 17 | 18 | # Install Pandas 19 | sudo pip3 install pandas 20 | 21 | # Install Google Cloud Library 22 | sudo pip3 install google-cloud 23 | 24 | sudo pip3 install google-cloud-bigquery 25 | 26 | -------------------------------------------------------------------------------- /data_model.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/darshilparmar/uber-etl-pipeline-data-engineering-project/5373ce54237783e15f6fe9b2bdd924b544325bc0/data_model.jpeg -------------------------------------------------------------------------------- /mage-files/extract.py: -------------------------------------------------------------------------------- 1 | import io 2 | import pandas as pd 3 | import requests 4 | if 'data_loader' not in globals(): 5 | from mage_ai.data_preparation.decorators import data_loader 6 | if 'test' not in globals(): 7 | from mage_ai.data_preparation.decorators import test 8 | 9 | 10 | @data_loader 11 | def load_data_from_api(*args, **kwargs): 12 | """ 13 | Template for loading data from API 14 | """ 15 | url = 'https://storage.googleapis.com/uber-data-engineering-project-darshil/uber_data.csv' 16 | response = requests.get(url) 17 | 18 | return pd.read_csv(io.StringIO(response.text), sep=',') 19 | 20 | 21 | @test 22 | def test_output(output, *args) -> None: 23 | """ 24 | Template code for testing the output of the block. 
25 | """ 26 | assert output is not None, 'The output is undefined' 27 | -------------------------------------------------------------------------------- /mage-files/load.py: -------------------------------------------------------------------------------- 1 | from mage_ai.data_preparation.repo_manager import get_repo_path 2 | from mage_ai.io.bigquery import BigQuery 3 | from mage_ai.io.config import ConfigFileLoader 4 | from pandas import DataFrame 5 | from os import path 6 | 7 | if 'data_exporter' not in globals(): 8 | from mage_ai.data_preparation.decorators import data_exporter 9 | 10 | 11 | @data_exporter 12 | def export_data_to_big_query(data, **kwargs) -> None: 13 | """ 14 | Template for exporting data to a BigQuery warehouse. 15 | Specify your configuration settings in 'io_config.yaml'. 16 | 17 | Docs: https://docs.mage.ai/design/data-loading#bigquery 18 | 19 | 20 | """ 21 | config_path = path.join(get_repo_path(), 'io_config.yaml') 22 | config_profile = 'default' 23 | 24 | for key, value in data.items(): 25 | table_id = 'data-with-darshil.uber_data_engineering_yt.{}'.format(key) 26 | BigQuery.with_config(ConfigFileLoader(config_path, config_profile)).export( 27 | DataFrame(value), 28 | table_id, 29 | if_exists='replace', # Specify resolution policy if table name already exists 30 | ) 31 | -------------------------------------------------------------------------------- /mage-files/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | if 'transformer' not in globals(): 3 | from mage_ai.data_preparation.decorators import transformer 4 | if 'test' not in globals(): 5 | from mage_ai.data_preparation.decorators import test 6 | 7 | 8 | @transformer 9 | def transform(df, *args, **kwargs): 10 | """ 11 | Template code for a transformer block. 12 | 13 | Add more parameters to this function if this block has multiple parent blocks. 14 | There should be one parameter for each output variable from each parent block. 15 | 16 | Args: 17 | data: The output from the upstream parent block 18 | args: The output from any additional upstream blocks (if applicable) 19 | 20 | Returns: 21 | Anything (e.g. data frame, dictionary, array, int, str, etc.) 
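            In this pipeline: a dict keyed by datetime_dim, passenger_count_dim, trip_distance_dim,
            rate_code_dim, pickup_location_dim, dropoff_location_dim, payment_type_dim and fact_table,
            each serialized with DataFrame.to_dict(orient="dict") so that load.py can rebuild the
            values as DataFrames and export one BigQuery table per key.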
22 | """ 23 | # Specify your transformation logic here 24 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 25 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 26 | 27 | datetime_dim = df[['tpep_pickup_datetime','tpep_dropoff_datetime']].drop_duplicates().reset_index(drop=True) 28 | datetime_dim['pick_hour'] = datetime_dim['tpep_pickup_datetime'].dt.hour 29 | datetime_dim['pick_day'] = datetime_dim['tpep_pickup_datetime'].dt.day 30 | datetime_dim['pick_month'] = datetime_dim['tpep_pickup_datetime'].dt.month 31 | datetime_dim['pick_year'] = datetime_dim['tpep_pickup_datetime'].dt.year 32 | datetime_dim['pick_weekday'] = datetime_dim['tpep_pickup_datetime'].dt.weekday 33 | 34 | datetime_dim['drop_hour'] = datetime_dim['tpep_dropoff_datetime'].dt.hour 35 | datetime_dim['drop_day'] = datetime_dim['tpep_dropoff_datetime'].dt.day 36 | datetime_dim['drop_month'] = datetime_dim['tpep_dropoff_datetime'].dt.month 37 | datetime_dim['drop_year'] = datetime_dim['tpep_dropoff_datetime'].dt.year 38 | datetime_dim['drop_weekday'] = datetime_dim['tpep_dropoff_datetime'].dt.weekday 39 | 40 | datetime_dim['datetime_id'] = datetime_dim.index 41 | datetime_dim = datetime_dim[['datetime_id', 'tpep_pickup_datetime', 'pick_hour', 'pick_day', 'pick_month', 'pick_year', 'pick_weekday', 42 | 'tpep_dropoff_datetime', 'drop_hour', 'drop_day', 'drop_month', 'drop_year', 'drop_weekday']] 43 | 44 | passenger_count_dim = df[['passenger_count']].drop_duplicates().reset_index(drop=True) 45 | passenger_count_dim['passenger_count_id'] = passenger_count_dim.index 46 | passenger_count_dim = passenger_count_dim[['passenger_count_id','passenger_count']] 47 | 48 | trip_distance_dim = df[['trip_distance']].drop_duplicates().reset_index(drop=True) 49 | trip_distance_dim['trip_distance_id'] = trip_distance_dim.index 50 | trip_distance_dim = trip_distance_dim[['trip_distance_id','trip_distance']] 51 | rate_code_type = { 52 | 1:"Standard rate", 53 | 2:"JFK", 54 | 3:"Newark", 55 | 4:"Nassau or Westchester", 56 | 5:"Negotiated fare", 57 | 6:"Group ride" 58 | } 59 | 60 | rate_code_dim = df[['RatecodeID']].drop_duplicates().reset_index(drop=True) 61 | rate_code_dim['rate_code_id'] = rate_code_dim.index 62 | rate_code_dim['rate_code_name'] = rate_code_dim['RatecodeID'].map(rate_code_type) 63 | rate_code_dim = rate_code_dim[['rate_code_id','RatecodeID','rate_code_name']] 64 | 65 | 66 | pickup_location_dim = df[['pickup_longitude', 'pickup_latitude']].drop_duplicates().reset_index(drop=True) 67 | pickup_location_dim['pickup_location_id'] = pickup_location_dim.index 68 | pickup_location_dim = pickup_location_dim[['pickup_location_id','pickup_latitude','pickup_longitude']] 69 | 70 | 71 | dropoff_location_dim = df[['dropoff_longitude', 'dropoff_latitude']].drop_duplicates().reset_index(drop=True) 72 | dropoff_location_dim['dropoff_location_id'] = dropoff_location_dim.index 73 | dropoff_location_dim = dropoff_location_dim[['dropoff_location_id','dropoff_latitude','dropoff_longitude']] 74 | 75 | payment_type_name = { 76 | 1:"Credit card", 77 | 2:"Cash", 78 | 3:"No charge", 79 | 4:"Dispute", 80 | 5:"Unknown", 81 | 6:"Voided trip" 82 | } 83 | payment_type_dim = df[['payment_type']].drop_duplicates().reset_index(drop=True) 84 | payment_type_dim['payment_type_id'] = payment_type_dim.index 85 | payment_type_dim['payment_type_name'] = payment_type_dim['payment_type'].map(payment_type_name) 86 | payment_type_dim = payment_type_dim[['payment_type_id','payment_type','payment_type_name']] 87 | 88 | 
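    # analytics_query.sql selects f.trip_id from the fact table, so a surrogate
    # trip_id is created on the deduplicated trips before the dimension keys are
    # merged in. Each merge below joins on a dimension's natural key(s); the final
    # selection keeps only the surrogate keys and the trip-level measures.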
    df = df.drop_duplicates().reset_index(drop=True)
 89 |     df['trip_id'] = df.index
 90 | 
 91 |     fact_table = df.merge(passenger_count_dim, on='passenger_count') \
 92 |                 .merge(trip_distance_dim, on='trip_distance') \
 93 |                 .merge(rate_code_dim, on='RatecodeID') \
 94 |                 .merge(pickup_location_dim, on=['pickup_longitude', 'pickup_latitude']) \
 95 |                 .merge(dropoff_location_dim, on=['dropoff_longitude', 'dropoff_latitude'])\
 96 |                 .merge(datetime_dim, on=['tpep_pickup_datetime','tpep_dropoff_datetime']) \
 97 |                 .merge(payment_type_dim, on='payment_type') \
 98 |                 [['trip_id', 'VendorID', 'datetime_id', 'passenger_count_id',
 99 |                 'trip_distance_id', 'rate_code_id', 'store_and_fwd_flag', 'pickup_location_id', 'dropoff_location_id',
100 |                 'payment_type_id', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
101 |                 'improvement_surcharge', 'total_amount']]
102 | 
103 |     return {"datetime_dim":datetime_dim.to_dict(orient="dict"),
104 |     "passenger_count_dim":passenger_count_dim.to_dict(orient="dict"),
105 |     "trip_distance_dim":trip_distance_dim.to_dict(orient="dict"),
106 |     "rate_code_dim":rate_code_dim.to_dict(orient="dict"),
107 |     "pickup_location_dim":pickup_location_dim.to_dict(orient="dict"),
108 |     "dropoff_location_dim":dropoff_location_dim.to_dict(orient="dict"),
109 |     "payment_type_dim":payment_type_dim.to_dict(orient="dict"),
110 |     "fact_table":fact_table.to_dict(orient="dict")}
111 | 
112 | 
113 | @test
114 | def test_output(output, *args) -> None:
115 |     """
116 |     Template code for testing the output of the block.
117 |     """
118 |     assert output is not None, 'The output is undefined'
119 | 
--------------------------------------------------------------------------------
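A quick way to sanity-check the pipeline after load.py and analytics_query.sql have run is to confirm that the inner joins behind tbl_analytics neither drop nor duplicate trips. The sketch below is not part of the repository; it is a minimal check that assumes the project and dataset names used in load.py, reuses the google-cloud-bigquery package that commands.txt already installs, and expects application-default credentials to be configured (gcloud auth application-default login).

from google.cloud import bigquery

client = bigquery.Client(project="data-with-darshil")  # replace with your own project

dataset = "data-with-darshil.uber_data_engineering_yt"
queries = {
    "fact_table": f"SELECT COUNT(*) AS n FROM `{dataset}.fact_table`",
    "tbl_analytics": f"SELECT COUNT(*) AS n FROM `{dataset}.tbl_analytics`",
}

# Run each count and pull the single value out of the one-row result.
counts = {name: list(client.query(sql).result())[0]["n"] for name, sql in queries.items()}
print(counts)

# Every fact row should find exactly one match in each dimension table, so the
# analytics table should have the same row count as the fact table; a mismatch
# usually means the dimensions and the fact table came from different pipeline runs.
assert counts["fact_table"] == counts["tbl_analytics"], counts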