├── Data
│   ├── Brazilian ecommerce EDA .ipynb
│   └── brazilian-ecommerce.zip
├── LICENSE
├── README.md
└── airflow
    ├── dags
    │   └── late_shipments_to_carrier_dag.py
    └── scripts
        ├── s3_download.py
        ├── s3_upload.py
        └── spark_missed_deadline_job.py
/Data/Brazilian ecommerce EDA .ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Super basic EDA\n",
8 | "The concern of this super basic EDA is to determine which order packages were delivered passed the shipping limit date. In other words, the idea is to identify orders where the seller did not deliver the package to the carrier prior to the designated limit that Olist placed on them to do so. "
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [
16 | {
17 | "name": "stdout",
18 | "output_type": "stream",
19 | "text": [
20 | "Untitled.ipynb \u001b[34mbrazilian-ecommerce\u001b[m\u001b[m brazilian-ecommerce.zip\r\n"
21 | ]
22 | }
23 | ],
24 | "source": [
25 | "! ls"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "import pandas as pd\n",
35 | "import zipfile\n",
36 | "\n",
37 | "zf = zipfile.ZipFile('brazilian-ecommerce.zip') "
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 12,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "from zipfile import ZipFile\n",
47 | "\n",
48 | "# Create a ZipFile Object and load sample.zip in it\n",
49 | "with ZipFile('brazilian-ecommerce.zip', 'r') as zipObj:\n",
50 | " # Extract all the contents of zip file in current directory\n",
51 | " zipObj.extractall()"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 13,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "name": "stdout",
61 | "output_type": "stream",
62 | "text": [
63 | "Untitled.ipynb olist_order_reviews_dataset.csv\r\n",
64 | "\u001b[34mbrazilian-ecommerce\u001b[m\u001b[m olist_orders_dataset.csv\r\n",
65 | "brazilian-ecommerce.zip olist_products_dataset.csv\r\n",
66 | "olist_customers_dataset.csv olist_sellers_dataset.csv\r\n",
67 | "olist_geolocation_dataset.csv product_category_name_translation.csv\r\n",
68 | "olist_order_items_dataset.csv \u001b[34mtemp_csv\u001b[m\u001b[m\r\n",
69 | "olist_order_payments_dataset.csv\r\n"
70 | ]
71 | }
72 | ],
73 | "source": [
74 | "! ls"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 33,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "import pandas as pd\n",
84 | " \n",
85 | "df_items = pd.read_csv('olist_order_items_dataset.csv')\n",
86 | "df_orders = pd.read_csv('olist_orders_dataset.csv')\n",
87 | "df_products = pd.read_csv('olist_products_dataset.csv')"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 29,
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "data": {
97 | "text/html": [
98 | "
\n",
99 | "\n",
112 | "
\n",
113 | " \n",
114 | " \n",
115 | " | \n",
116 | " order_id | \n",
117 | " order_item_id | \n",
118 | " product_id | \n",
119 | " seller_id | \n",
120 | " shipping_limit_date | \n",
121 | " price | \n",
122 | " freight_value | \n",
123 | "
\n",
124 | " \n",
125 | " \n",
126 | " \n",
127 | " 0 | \n",
128 | " 00010242fe8c5a6d1ba2dd792cb16214 | \n",
129 | " 1 | \n",
130 | " 4244733e06e7ecb4970a6e2683c13e61 | \n",
131 | " 48436dade18ac8b2bce089ec2a041202 | \n",
132 | " 2017-09-19 09:45:35 | \n",
133 | " 58.90 | \n",
134 | " 13.29 | \n",
135 | "
\n",
136 | " \n",
137 | " 1 | \n",
138 | " 00018f77f2f0320c557190d7a144bdd3 | \n",
139 | " 1 | \n",
140 | " e5f2d52b802189ee658865ca93d83a8f | \n",
141 | " dd7ddc04e1b6c2c614352b383efe2d36 | \n",
142 | " 2017-05-03 11:05:13 | \n",
143 | " 239.90 | \n",
144 | " 19.93 | \n",
145 | "
\n",
146 | " \n",
147 | " 2 | \n",
148 | " 000229ec398224ef6ca0657da4fc703e | \n",
149 | " 1 | \n",
150 | " c777355d18b72b67abbeef9df44fd0fd | \n",
151 | " 5b51032eddd242adc84c38acab88f23d | \n",
152 | " 2018-01-18 14:48:30 | \n",
153 | " 199.00 | \n",
154 | " 17.87 | \n",
155 | "
\n",
156 | " \n",
157 | " 3 | \n",
158 | " 00024acbcdf0a6daa1e931b038114c75 | \n",
159 | " 1 | \n",
160 | " 7634da152a4610f1595efa32f14722fc | \n",
161 | " 9d7a1d34a5052409006425275ba1c2b4 | \n",
162 | " 2018-08-15 10:10:18 | \n",
163 | " 12.99 | \n",
164 | " 12.79 | \n",
165 | "
\n",
166 | " \n",
167 | " 4 | \n",
168 | " 00042b26cf59d7ce69dfabb4e55b4fd9 | \n",
169 | " 1 | \n",
170 | " ac6c3623068f30de03045865e4e10089 | \n",
171 | " df560393f3a51e74553ab94004ba5c87 | \n",
172 | " 2017-02-13 13:57:51 | \n",
173 | " 199.90 | \n",
174 | " 18.14 | \n",
175 | "
\n",
176 | " \n",
177 | "
\n",
178 | "
"
179 | ],
180 | "text/plain": [
181 | " order_id order_item_id \\\n",
182 | "0 00010242fe8c5a6d1ba2dd792cb16214 1 \n",
183 | "1 00018f77f2f0320c557190d7a144bdd3 1 \n",
184 | "2 000229ec398224ef6ca0657da4fc703e 1 \n",
185 | "3 00024acbcdf0a6daa1e931b038114c75 1 \n",
186 | "4 00042b26cf59d7ce69dfabb4e55b4fd9 1 \n",
187 | "\n",
188 | " product_id seller_id \\\n",
189 | "0 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 \n",
190 | "1 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 \n",
191 | "2 c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d \n",
192 | "3 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 \n",
193 | "4 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 \n",
194 | "\n",
195 | " shipping_limit_date price freight_value \n",
196 | "0 2017-09-19 09:45:35 58.90 13.29 \n",
197 | "1 2017-05-03 11:05:13 239.90 19.93 \n",
198 | "2 2018-01-18 14:48:30 199.00 17.87 \n",
199 | "3 2018-08-15 10:10:18 12.99 12.79 \n",
200 | "4 2017-02-13 13:57:51 199.90 18.14 "
201 | ]
202 | },
203 | "execution_count": 29,
204 | "metadata": {},
205 | "output_type": "execute_result"
206 | }
207 | ],
208 | "source": [
209 | "df_items.head()"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 30,
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "data": {
219 | "text/html": [
220 | "\n",
221 | "\n",
234 | "
\n",
235 | " \n",
236 | " \n",
237 | " | \n",
238 | " order_id | \n",
239 | " customer_id | \n",
240 | " order_status | \n",
241 | " order_purchase_timestamp | \n",
242 | " order_approved_at | \n",
243 | " order_delivered_carrier_date | \n",
244 | " order_delivered_customer_date | \n",
245 | " order_estimated_delivery_date | \n",
246 | "
\n",
247 | " \n",
248 | " \n",
249 | " \n",
250 | " 0 | \n",
251 | " e481f51cbdc54678b7cc49136f2d6af7 | \n",
252 | " 9ef432eb6251297304e76186b10a928d | \n",
253 | " delivered | \n",
254 | " 2017-10-02 10:56:33 | \n",
255 | " 2017-10-02 11:07:15 | \n",
256 | " 2017-10-04 19:55:00 | \n",
257 | " 2017-10-10 21:25:13 | \n",
258 | " 2017-10-18 00:00:00 | \n",
259 | "
\n",
260 | " \n",
261 | " 1 | \n",
262 | " 53cdb2fc8bc7dce0b6741e2150273451 | \n",
263 | " b0830fb4747a6c6d20dea0b8c802d7ef | \n",
264 | " delivered | \n",
265 | " 2018-07-24 20:41:37 | \n",
266 | " 2018-07-26 03:24:27 | \n",
267 | " 2018-07-26 14:31:00 | \n",
268 | " 2018-08-07 15:27:45 | \n",
269 | " 2018-08-13 00:00:00 | \n",
270 | "
\n",
271 | " \n",
272 | " 2 | \n",
273 | " 47770eb9100c2d0c44946d9cf07ec65d | \n",
274 | " 41ce2a54c0b03bf3443c3d931a367089 | \n",
275 | " delivered | \n",
276 | " 2018-08-08 08:38:49 | \n",
277 | " 2018-08-08 08:55:23 | \n",
278 | " 2018-08-08 13:50:00 | \n",
279 | " 2018-08-17 18:06:29 | \n",
280 | " 2018-09-04 00:00:00 | \n",
281 | "
\n",
282 | " \n",
283 | " 3 | \n",
284 | " 949d5b44dbf5de918fe9c16f97b45f8a | \n",
285 | " f88197465ea7920adcdbec7375364d82 | \n",
286 | " delivered | \n",
287 | " 2017-11-18 19:28:06 | \n",
288 | " 2017-11-18 19:45:59 | \n",
289 | " 2017-11-22 13:39:59 | \n",
290 | " 2017-12-02 00:28:42 | \n",
291 | " 2017-12-15 00:00:00 | \n",
292 | "
\n",
293 | " \n",
294 | " 4 | \n",
295 | " ad21c59c0840e6cb83a9ceb5573f8159 | \n",
296 | " 8ab97904e6daea8866dbdbc4fb7aad2c | \n",
297 | " delivered | \n",
298 | " 2018-02-13 21:18:39 | \n",
299 | " 2018-02-13 22:20:29 | \n",
300 | " 2018-02-14 19:46:34 | \n",
301 | " 2018-02-16 18:17:02 | \n",
302 | " 2018-02-26 00:00:00 | \n",
303 | "
\n",
304 | " \n",
305 | "
\n",
306 | "
"
307 | ],
308 | "text/plain": [
309 | " order_id customer_id \\\n",
310 | "0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d \n",
311 | "1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef \n",
312 | "2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 \n",
313 | "3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 \n",
314 | "4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c \n",
315 | "\n",
316 | " order_status order_purchase_timestamp order_approved_at \\\n",
317 | "0 delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 \n",
318 | "1 delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 \n",
319 | "2 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 \n",
320 | "3 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 \n",
321 | "4 delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 \n",
322 | "\n",
323 | " order_delivered_carrier_date order_delivered_customer_date \\\n",
324 | "0 2017-10-04 19:55:00 2017-10-10 21:25:13 \n",
325 | "1 2018-07-26 14:31:00 2018-08-07 15:27:45 \n",
326 | "2 2018-08-08 13:50:00 2018-08-17 18:06:29 \n",
327 | "3 2017-11-22 13:39:59 2017-12-02 00:28:42 \n",
328 | "4 2018-02-14 19:46:34 2018-02-16 18:17:02 \n",
329 | "\n",
330 | " order_estimated_delivery_date \n",
331 | "0 2017-10-18 00:00:00 \n",
332 | "1 2018-08-13 00:00:00 \n",
333 | "2 2018-09-04 00:00:00 \n",
334 | "3 2017-12-15 00:00:00 \n",
335 | "4 2018-02-26 00:00:00 "
336 | ]
337 | },
338 | "execution_count": 30,
339 | "metadata": {},
340 | "output_type": "execute_result"
341 | }
342 | ],
343 | "source": [
344 | "df_orders.head()"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 34,
350 | "metadata": {},
351 | "outputs": [
352 | {
353 | "data": {
354 | "text/html": [
355 | "\n",
356 | "\n",
369 | "
\n",
370 | " \n",
371 | " \n",
372 | " | \n",
373 | " product_id | \n",
374 | " product_category_name | \n",
375 | " product_name_lenght | \n",
376 | " product_description_lenght | \n",
377 | " product_photos_qty | \n",
378 | " product_weight_g | \n",
379 | " product_length_cm | \n",
380 | " product_height_cm | \n",
381 | " product_width_cm | \n",
382 | "
\n",
383 | " \n",
384 | " \n",
385 | " \n",
386 | " 0 | \n",
387 | " 1e9e8ef04dbcff4541ed26657ea517e5 | \n",
388 | " perfumaria | \n",
389 | " 40.0 | \n",
390 | " 287.0 | \n",
391 | " 1.0 | \n",
392 | " 225.0 | \n",
393 | " 16.0 | \n",
394 | " 10.0 | \n",
395 | " 14.0 | \n",
396 | "
\n",
397 | " \n",
398 | " 1 | \n",
399 | " 3aa071139cb16b67ca9e5dea641aaa2f | \n",
400 | " artes | \n",
401 | " 44.0 | \n",
402 | " 276.0 | \n",
403 | " 1.0 | \n",
404 | " 1000.0 | \n",
405 | " 30.0 | \n",
406 | " 18.0 | \n",
407 | " 20.0 | \n",
408 | "
\n",
409 | " \n",
410 | " 2 | \n",
411 | " 96bd76ec8810374ed1b65e291975717f | \n",
412 | " esporte_lazer | \n",
413 | " 46.0 | \n",
414 | " 250.0 | \n",
415 | " 1.0 | \n",
416 | " 154.0 | \n",
417 | " 18.0 | \n",
418 | " 9.0 | \n",
419 | " 15.0 | \n",
420 | "
\n",
421 | " \n",
422 | " 3 | \n",
423 | " cef67bcfe19066a932b7673e239eb23d | \n",
424 | " bebes | \n",
425 | " 27.0 | \n",
426 | " 261.0 | \n",
427 | " 1.0 | \n",
428 | " 371.0 | \n",
429 | " 26.0 | \n",
430 | " 4.0 | \n",
431 | " 26.0 | \n",
432 | "
\n",
433 | " \n",
434 | " 4 | \n",
435 | " 9dc1a7de274444849c219cff195d0b71 | \n",
436 | " utilidades_domesticas | \n",
437 | " 37.0 | \n",
438 | " 402.0 | \n",
439 | " 4.0 | \n",
440 | " 625.0 | \n",
441 | " 20.0 | \n",
442 | " 17.0 | \n",
443 | " 13.0 | \n",
444 | "
\n",
445 | " \n",
446 | "
\n",
447 | "
"
448 | ],
449 | "text/plain": [
450 | " product_id product_category_name \\\n",
451 | "0 1e9e8ef04dbcff4541ed26657ea517e5 perfumaria \n",
452 | "1 3aa071139cb16b67ca9e5dea641aaa2f artes \n",
453 | "2 96bd76ec8810374ed1b65e291975717f esporte_lazer \n",
454 | "3 cef67bcfe19066a932b7673e239eb23d bebes \n",
455 | "4 9dc1a7de274444849c219cff195d0b71 utilidades_domesticas \n",
456 | "\n",
457 | " product_name_lenght product_description_lenght product_photos_qty \\\n",
458 | "0 40.0 287.0 1.0 \n",
459 | "1 44.0 276.0 1.0 \n",
460 | "2 46.0 250.0 1.0 \n",
461 | "3 27.0 261.0 1.0 \n",
462 | "4 37.0 402.0 4.0 \n",
463 | "\n",
464 | " product_weight_g product_length_cm product_height_cm product_width_cm \n",
465 | "0 225.0 16.0 10.0 14.0 \n",
466 | "1 1000.0 30.0 18.0 20.0 \n",
467 | "2 154.0 18.0 9.0 15.0 \n",
468 | "3 371.0 26.0 4.0 26.0 \n",
469 | "4 625.0 20.0 17.0 13.0 "
470 | ]
471 | },
472 | "execution_count": 34,
473 | "metadata": {},
474 | "output_type": "execute_result"
475 | }
476 | ],
477 | "source": [
478 | "df_products.head()"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 43,
484 | "metadata": {},
485 | "outputs": [],
486 | "source": [
487 | "# Merge the seller id/shipping limit, order information, and product information\n",
488 | "df = df_items.merge(df_orders, on='order_id').merge(df_products[['product_id', 'product_category_name']], \n",
489 | " on='product_id')\n",
490 | "\n"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 44,
496 | "metadata": {},
497 | "outputs": [
498 | {
499 | "data": {
500 | "text/plain": [
501 | "Index(['order_id', 'order_item_id', 'product_id', 'seller_id',\n",
502 | " 'shipping_limit_date', 'price', 'freight_value', 'customer_id',\n",
503 | " 'order_status', 'order_purchase_timestamp', 'order_approved_at',\n",
504 | " 'order_delivered_carrier_date', 'order_delivered_customer_date',\n",
505 | " 'order_estimated_delivery_date', 'product_category_name'],\n",
506 | " dtype='object')"
507 | ]
508 | },
509 | "execution_count": 44,
510 | "metadata": {},
511 | "output_type": "execute_result"
512 | }
513 | ],
514 | "source": [
515 | "df.columns"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": 84,
521 | "metadata": {},
522 | "outputs": [
523 | {
524 | "data": {
525 | "text/html": [
526 | "\n",
527 | "\n",
540 | "
\n",
541 | " \n",
542 | " \n",
543 | " | \n",
544 | " order_id | \n",
545 | " order_item_id | \n",
546 | " product_id | \n",
547 | " seller_id | \n",
548 | " shipping_limit_date | \n",
549 | " price | \n",
550 | " freight_value | \n",
551 | " customer_id | \n",
552 | " order_status | \n",
553 | " order_purchase_timestamp | \n",
554 | " order_approved_at | \n",
555 | " order_delivered_carrier_date | \n",
556 | " order_delivered_customer_date | \n",
557 | " order_estimated_delivery_date | \n",
558 | " product_category_name | \n",
559 | "
\n",
560 | " \n",
561 | " \n",
562 | " \n",
563 | " 0 | \n",
564 | " 00010242fe8c5a6d1ba2dd792cb16214 | \n",
565 | " 1 | \n",
566 | " 4244733e06e7ecb4970a6e2683c13e61 | \n",
567 | " 48436dade18ac8b2bce089ec2a041202 | \n",
568 | " 2017-09-19 09:45:35 | \n",
569 | " 58.9 | \n",
570 | " 13.29 | \n",
571 | " 3ce436f183e68e07877b285a838db11a | \n",
572 | " delivered | \n",
573 | " 2017-09-13 08:59:02 | \n",
574 | " 2017-09-13 09:45:35 | \n",
575 | " 2017-09-19 18:34:16 | \n",
576 | " 2017-09-20 23:43:48 | \n",
577 | " 2017-09-29 00:00:00 | \n",
578 | " cool_stuff | \n",
579 | "
\n",
580 | " \n",
581 | " 1 | \n",
582 | " 130898c0987d1801452a8ed92a670612 | \n",
583 | " 1 | \n",
584 | " 4244733e06e7ecb4970a6e2683c13e61 | \n",
585 | " 48436dade18ac8b2bce089ec2a041202 | \n",
586 | " 2017-07-05 02:44:11 | \n",
587 | " 55.9 | \n",
588 | " 17.96 | \n",
589 | " e6eecc5a77de221464d1c4eaff0a9b64 | \n",
590 | " delivered | \n",
591 | " 2017-06-28 11:52:20 | \n",
592 | " 2017-06-29 02:44:11 | \n",
593 | " 2017-07-05 12:00:33 | \n",
594 | " 2017-07-13 20:39:29 | \n",
595 | " 2017-07-26 00:00:00 | \n",
596 | " cool_stuff | \n",
597 | "
\n",
598 | " \n",
599 | " 2 | \n",
600 | " 532ed5e14e24ae1f0d735b91524b98b9 | \n",
601 | " 1 | \n",
602 | " 4244733e06e7ecb4970a6e2683c13e61 | \n",
603 | " 48436dade18ac8b2bce089ec2a041202 | \n",
604 | " 2018-05-23 10:56:25 | \n",
605 | " 64.9 | \n",
606 | " 18.33 | \n",
607 | " 4ef55bf80f711b372afebcb7c715344a | \n",
608 | " delivered | \n",
609 | " 2018-05-18 10:25:53 | \n",
610 | " 2018-05-18 12:31:43 | \n",
611 | " 2018-05-23 14:05:00 | \n",
612 | " 2018-06-04 18:34:26 | \n",
613 | " 2018-06-07 00:00:00 | \n",
614 | " cool_stuff | \n",
615 | "
\n",
616 | " \n",
617 | " 9 | \n",
618 | " 00018f77f2f0320c557190d7a144bdd3 | \n",
619 | " 1 | \n",
620 | " e5f2d52b802189ee658865ca93d83a8f | \n",
621 | " dd7ddc04e1b6c2c614352b383efe2d36 | \n",
622 | " 2017-05-03 11:05:13 | \n",
623 | " 239.9 | \n",
624 | " 19.93 | \n",
625 | " f6dd3ec061db4e3987629fe6b26e5cce | \n",
626 | " delivered | \n",
627 | " 2017-04-26 10:53:06 | \n",
628 | " 2017-04-26 11:05:13 | \n",
629 | " 2017-05-04 14:35:00 | \n",
630 | " 2017-05-12 16:04:24 | \n",
631 | " 2017-05-15 00:00:00 | \n",
632 | " pet_shop | \n",
633 | "
\n",
634 | " \n",
635 | " 15 | \n",
636 | " 00042b26cf59d7ce69dfabb4e55b4fd9 | \n",
637 | " 1 | \n",
638 | " ac6c3623068f30de03045865e4e10089 | \n",
639 | " df560393f3a51e74553ab94004ba5c87 | \n",
640 | " 2017-02-13 13:57:51 | \n",
641 | " 199.9 | \n",
642 | " 18.14 | \n",
643 | " 58dbd0b2d70206bf40e62cd34e84d795 | \n",
644 | " delivered | \n",
645 | " 2017-02-04 13:57:51 | \n",
646 | " 2017-02-04 14:10:13 | \n",
647 | " 2017-02-16 09:46:09 | \n",
648 | " 2017-03-01 16:42:31 | \n",
649 | " 2017-03-17 00:00:00 | \n",
650 | " ferramentas_jardim | \n",
651 | "
\n",
652 | " \n",
653 | "
\n",
654 | "
"
655 | ],
656 | "text/plain": [
657 | " order_id order_item_id \\\n",
658 | "0 00010242fe8c5a6d1ba2dd792cb16214 1 \n",
659 | "1 130898c0987d1801452a8ed92a670612 1 \n",
660 | "2 532ed5e14e24ae1f0d735b91524b98b9 1 \n",
661 | "9 00018f77f2f0320c557190d7a144bdd3 1 \n",
662 | "15 00042b26cf59d7ce69dfabb4e55b4fd9 1 \n",
663 | "\n",
664 | " product_id seller_id \\\n",
665 | "0 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 \n",
666 | "1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 \n",
667 | "2 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 \n",
668 | "9 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 \n",
669 | "15 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 \n",
670 | "\n",
671 | " shipping_limit_date price freight_value \\\n",
672 | "0 2017-09-19 09:45:35 58.9 13.29 \n",
673 | "1 2017-07-05 02:44:11 55.9 17.96 \n",
674 | "2 2018-05-23 10:56:25 64.9 18.33 \n",
675 | "9 2017-05-03 11:05:13 239.9 19.93 \n",
676 | "15 2017-02-13 13:57:51 199.9 18.14 \n",
677 | "\n",
678 | " customer_id order_status order_purchase_timestamp \\\n",
679 | "0 3ce436f183e68e07877b285a838db11a delivered 2017-09-13 08:59:02 \n",
680 | "1 e6eecc5a77de221464d1c4eaff0a9b64 delivered 2017-06-28 11:52:20 \n",
681 | "2 4ef55bf80f711b372afebcb7c715344a delivered 2018-05-18 10:25:53 \n",
682 | "9 f6dd3ec061db4e3987629fe6b26e5cce delivered 2017-04-26 10:53:06 \n",
683 | "15 58dbd0b2d70206bf40e62cd34e84d795 delivered 2017-02-04 13:57:51 \n",
684 | "\n",
685 | " order_approved_at order_delivered_carrier_date \\\n",
686 | "0 2017-09-13 09:45:35 2017-09-19 18:34:16 \n",
687 | "1 2017-06-29 02:44:11 2017-07-05 12:00:33 \n",
688 | "2 2018-05-18 12:31:43 2018-05-23 14:05:00 \n",
689 | "9 2017-04-26 11:05:13 2017-05-04 14:35:00 \n",
690 | "15 2017-02-04 14:10:13 2017-02-16 09:46:09 \n",
691 | "\n",
692 | " order_delivered_customer_date order_estimated_delivery_date \\\n",
693 | "0 2017-09-20 23:43:48 2017-09-29 00:00:00 \n",
694 | "1 2017-07-13 20:39:29 2017-07-26 00:00:00 \n",
695 | "2 2018-06-04 18:34:26 2018-06-07 00:00:00 \n",
696 | "9 2017-05-12 16:04:24 2017-05-15 00:00:00 \n",
697 | "15 2017-03-01 16:42:31 2017-03-17 00:00:00 \n",
698 | "\n",
699 | " product_category_name \n",
700 | "0 cool_stuff \n",
701 | "1 cool_stuff \n",
702 | "2 cool_stuff \n",
703 | "9 pet_shop \n",
704 | "15 ferramentas_jardim "
705 | ]
706 | },
707 | "execution_count": 84,
708 | "metadata": {},
709 | "output_type": "execute_result"
710 | }
711 | ],
712 | "source": [
713 | "# Orders where the shipping deadline to the carrier was missed\n",
714 | "df.loc[df.shipping_limit_date < df.order_delivered_carrier_date].head()\n",
715 | "\n"
716 | ]
717 | },
718 | {
719 | "cell_type": "code",
720 | "execution_count": 48,
721 | "metadata": {},
722 | "outputs": [
723 | {
724 | "data": {
725 | "text/html": [
726 | "\n",
727 | "\n",
740 | "
\n",
741 | " \n",
742 | " \n",
743 | " | \n",
744 | " order_id | \n",
745 | " order_item_id | \n",
746 | " product_id | \n",
747 | " seller_id | \n",
748 | " shipping_limit_date | \n",
749 | " price | \n",
750 | " freight_value | \n",
751 | " customer_id | \n",
752 | " order_purchase_timestamp | \n",
753 | " order_approved_at | \n",
754 | " order_delivered_carrier_date | \n",
755 | " order_delivered_customer_date | \n",
756 | " order_estimated_delivery_date | \n",
757 | " product_category_name | \n",
758 | "
\n",
759 | " \n",
760 | " order_status | \n",
761 | " | \n",
762 | " | \n",
763 | " | \n",
764 | " | \n",
765 | " | \n",
766 | " | \n",
767 | " | \n",
768 | " | \n",
769 | " | \n",
770 | " | \n",
771 | " | \n",
772 | " | \n",
773 | " | \n",
774 | " | \n",
775 | "
\n",
776 | " \n",
777 | " \n",
778 | " \n",
779 | " approved | \n",
780 | " 3 | \n",
781 | " 3 | \n",
782 | " 3 | \n",
783 | " 3 | \n",
784 | " 3 | \n",
785 | " 3 | \n",
786 | " 3 | \n",
787 | " 3 | \n",
788 | " 3 | \n",
789 | " 3 | \n",
790 | " 0 | \n",
791 | " 0 | \n",
792 | " 3 | \n",
793 | " 3 | \n",
794 | "
\n",
795 | " \n",
796 | " canceled | \n",
797 | " 542 | \n",
798 | " 542 | \n",
799 | " 542 | \n",
800 | " 542 | \n",
801 | " 542 | \n",
802 | " 542 | \n",
803 | " 542 | \n",
804 | " 542 | \n",
805 | " 542 | \n",
806 | " 542 | \n",
807 | " 76 | \n",
808 | " 7 | \n",
809 | " 542 | \n",
810 | " 528 | \n",
811 | "
\n",
812 | " \n",
813 | " delivered | \n",
814 | " 110197 | \n",
815 | " 110197 | \n",
816 | " 110197 | \n",
817 | " 110197 | \n",
818 | " 110197 | \n",
819 | " 110197 | \n",
820 | " 110197 | \n",
821 | " 110197 | \n",
822 | " 110197 | \n",
823 | " 110182 | \n",
824 | " 110195 | \n",
825 | " 110189 | \n",
826 | " 110197 | \n",
827 | " 108660 | \n",
828 | "
\n",
829 | " \n",
830 | " invoiced | \n",
831 | " 359 | \n",
832 | " 359 | \n",
833 | " 359 | \n",
834 | " 359 | \n",
835 | " 359 | \n",
836 | " 359 | \n",
837 | " 359 | \n",
838 | " 359 | \n",
839 | " 359 | \n",
840 | " 359 | \n",
841 | " 0 | \n",
842 | " 0 | \n",
843 | " 359 | \n",
844 | " 347 | \n",
845 | "
\n",
846 | " \n",
847 | " processing | \n",
848 | " 357 | \n",
849 | " 357 | \n",
850 | " 357 | \n",
851 | " 357 | \n",
852 | " 357 | \n",
853 | " 357 | \n",
854 | " 357 | \n",
855 | " 357 | \n",
856 | " 357 | \n",
857 | " 357 | \n",
858 | " 0 | \n",
859 | " 0 | \n",
860 | " 357 | \n",
861 | " 344 | \n",
862 | "
\n",
863 | " \n",
864 | " shipped | \n",
865 | " 1185 | \n",
866 | " 1185 | \n",
867 | " 1185 | \n",
868 | " 1185 | \n",
869 | " 1185 | \n",
870 | " 1185 | \n",
871 | " 1185 | \n",
872 | " 1185 | \n",
873 | " 1185 | \n",
874 | " 1185 | \n",
875 | " 1185 | \n",
876 | " 0 | \n",
877 | " 1185 | \n",
878 | " 1158 | \n",
879 | "
\n",
880 | " \n",
881 | " unavailable | \n",
882 | " 7 | \n",
883 | " 7 | \n",
884 | " 7 | \n",
885 | " 7 | \n",
886 | " 7 | \n",
887 | " 7 | \n",
888 | " 7 | \n",
889 | " 7 | \n",
890 | " 7 | \n",
891 | " 7 | \n",
892 | " 0 | \n",
893 | " 0 | \n",
894 | " 7 | \n",
895 | " 7 | \n",
896 | "
\n",
897 | " \n",
898 | "
\n",
899 | "
"
900 | ],
901 | "text/plain": [
902 | " order_id order_item_id product_id seller_id \\\n",
903 | "order_status \n",
904 | "approved 3 3 3 3 \n",
905 | "canceled 542 542 542 542 \n",
906 | "delivered 110197 110197 110197 110197 \n",
907 | "invoiced 359 359 359 359 \n",
908 | "processing 357 357 357 357 \n",
909 | "shipped 1185 1185 1185 1185 \n",
910 | "unavailable 7 7 7 7 \n",
911 | "\n",
912 | " shipping_limit_date price freight_value customer_id \\\n",
913 | "order_status \n",
914 | "approved 3 3 3 3 \n",
915 | "canceled 542 542 542 542 \n",
916 | "delivered 110197 110197 110197 110197 \n",
917 | "invoiced 359 359 359 359 \n",
918 | "processing 357 357 357 357 \n",
919 | "shipped 1185 1185 1185 1185 \n",
920 | "unavailable 7 7 7 7 \n",
921 | "\n",
922 | " order_purchase_timestamp order_approved_at \\\n",
923 | "order_status \n",
924 | "approved 3 3 \n",
925 | "canceled 542 542 \n",
926 | "delivered 110197 110182 \n",
927 | "invoiced 359 359 \n",
928 | "processing 357 357 \n",
929 | "shipped 1185 1185 \n",
930 | "unavailable 7 7 \n",
931 | "\n",
932 | " order_delivered_carrier_date order_delivered_customer_date \\\n",
933 | "order_status \n",
934 | "approved 0 0 \n",
935 | "canceled 76 7 \n",
936 | "delivered 110195 110189 \n",
937 | "invoiced 0 0 \n",
938 | "processing 0 0 \n",
939 | "shipped 1185 0 \n",
940 | "unavailable 0 0 \n",
941 | "\n",
942 | " order_estimated_delivery_date product_category_name \n",
943 | "order_status \n",
944 | "approved 3 3 \n",
945 | "canceled 542 528 \n",
946 | "delivered 110197 108660 \n",
947 | "invoiced 359 347 \n",
948 | "processing 357 344 \n",
949 | "shipped 1185 1158 \n",
950 | "unavailable 7 7 "
951 | ]
952 | },
953 | "execution_count": 48,
954 | "metadata": {},
955 | "output_type": "execute_result"
956 | }
957 | ],
958 | "source": [
959 | "df.groupby('order_status').count()"
960 | ]
961 | },
962 | {
963 | "cell_type": "code",
964 | "execution_count": 62,
965 | "metadata": {},
966 | "outputs": [
967 | {
968 | "name": "stdout",
969 | "output_type": "stream",
970 | "text": [
971 | "\n"
972 | ]
973 | }
974 | ],
975 | "source": [
976 | "# Setting up spark\n",
977 | "import findspark\n",
978 | "findspark.init()\n",
979 | "from pyspark import SparkConf, SparkContext\n",
980 | "from pyspark.sql import SparkSession\n",
981 | "from pyspark.sql import *\n",
982 | "from pyspark.sql.functions import *\n",
983 | "conf = SparkConf().setMaster(\"local\").setAppName(\"SparkSQL_NLP\")\n",
984 | "spark = SparkSession.builder.getOrCreate()\n",
985 | "print(spark)"
986 | ]
987 | },
988 | {
989 | "cell_type": "code",
990 | "execution_count": 77,
991 | "metadata": {},
992 | "outputs": [
993 | {
994 | "data": {
995 | "text/plain": [
996 | ""
997 | ]
998 | },
999 | "execution_count": 77,
1000 | "metadata": {},
1001 | "output_type": "execute_result"
1002 | }
1003 | ],
1004 | "source": [
1005 | "findspark"
1006 | ]
1007 | },
1008 | {
1009 | "cell_type": "code",
1010 | "execution_count": 64,
1011 | "metadata": {},
1012 | "outputs": [],
1013 | "source": [
1014 | "# Set sqlContext from the Spark context\n",
1015 | "from pyspark.sql import SQLContext\n",
1016 | "sqlContext = SQLContext(spark)"
1017 | ]
1018 | },
1019 | {
1020 | "cell_type": "code",
1021 | "execution_count": 65,
1022 | "metadata": {},
1023 | "outputs": [],
1024 | "source": [
1025 | "# Setup a Spark SQL context and read in the pandas dataframe to a Spark dataframe\n",
1026 | "spark.conf.set(\"spark.sql.execution.arrow.enabled\", \"true\")"
1027 | ]
1028 | },
1029 | {
1030 | "cell_type": "code",
1031 | "execution_count": 66,
1032 | "metadata": {},
1033 | "outputs": [],
1034 | "source": [
1035 | "spark = SparkSession.builder.getOrCreate()\n",
1036 | "\n",
1037 | "# Load in csv file into spark dataframe\n",
1038 | "df_items = spark.read.format(\"csv\") \\\n",
1039 | " .option(\"header\", \"true\") \\\n",
1040 | " .option(\"inferSchema\", \"true\") \\\n",
1041 | " .load(\"olist_order_items_dataset.csv\")\n",
1042 | "\n",
1043 | "df_orders = spark.read.format(\"csv\") \\\n",
1044 | " .option(\"header\", \"true\") \\\n",
1045 | " .option(\"inferSchema\", \"true\") \\\n",
1046 | " .load(\"olist_orders_dataset.csv\")\n",
1047 | "\n",
1048 | "df_products = spark.read.format(\"csv\") \\\n",
1049 | " .option(\"header\", \"true\") \\\n",
1050 | " .option(\"inferSchema\", \"true\") \\\n",
1051 | " .load(\"olist_products_dataset.csv\")"
1052 | ]
1053 | },
1054 | {
1055 | "cell_type": "code",
1056 | "execution_count": 67,
1057 | "metadata": {},
1058 | "outputs": [],
1059 | "source": [
1060 | "# Create SQL Tables from dfs\n",
1061 | "df_items.createOrReplaceTempView('items')\n",
1062 | "df_orders.createOrReplaceTempView('orders')\n",
1063 | "df_products.createOrReplaceTempView('products')"
1064 | ]
1065 | },
1066 | {
1067 | "cell_type": "code",
1068 | "execution_count": 69,
1069 | "metadata": {},
1070 | "outputs": [
1071 | {
1072 | "data": {
1073 | "text/plain": [
1074 | "['order_id',\n",
1075 | " 'order_item_id',\n",
1076 | " 'product_id',\n",
1077 | " 'seller_id',\n",
1078 | " 'shipping_limit_date',\n",
1079 | " 'price',\n",
1080 | " 'freight_value']"
1081 | ]
1082 | },
1083 | "execution_count": 69,
1084 | "metadata": {},
1085 | "output_type": "execute_result"
1086 | }
1087 | ],
1088 | "source": [
1089 | "# Basic SQL query\n",
1090 | "spark.sql('SELECT * FROM items LIMIT 0').columns"
1091 | ]
1092 | },
1093 | {
1094 | "cell_type": "code",
1095 | "execution_count": 70,
1096 | "metadata": {},
1097 | "outputs": [
1098 | {
1099 | "data": {
1100 | "text/plain": [
1101 | "['product_id',\n",
1102 | " 'product_category_name',\n",
1103 | " 'product_name_lenght',\n",
1104 | " 'product_description_lenght',\n",
1105 | " 'product_photos_qty',\n",
1106 | " 'product_weight_g',\n",
1107 | " 'product_length_cm',\n",
1108 | " 'product_height_cm',\n",
1109 | " 'product_width_cm']"
1110 | ]
1111 | },
1112 | "execution_count": 70,
1113 | "metadata": {},
1114 | "output_type": "execute_result"
1115 | }
1116 | ],
1117 | "source": [
1118 | "spark.sql('SELECT * FROM products LIMIT 0').columns"
1119 | ]
1120 | },
1121 | {
1122 | "cell_type": "code",
1123 | "execution_count": 71,
1124 | "metadata": {},
1125 | "outputs": [
1126 | {
1127 | "data": {
1128 | "text/plain": [
1129 | "['order_id',\n",
1130 | " 'customer_id',\n",
1131 | " 'order_status',\n",
1132 | " 'order_purchase_timestamp',\n",
1133 | " 'order_approved_at',\n",
1134 | " 'order_delivered_carrier_date',\n",
1135 | " 'order_delivered_customer_date',\n",
1136 | " 'order_estimated_delivery_date']"
1137 | ]
1138 | },
1139 | "execution_count": 71,
1140 | "metadata": {},
1141 | "output_type": "execute_result"
1142 | }
1143 | ],
1144 | "source": [
1145 | "spark.sql('SELECT * FROM orders LIMIT 0').columns"
1146 | ]
1147 | },
1148 | {
1149 | "cell_type": "code",
1150 | "execution_count": 76,
1151 | "metadata": {},
1152 | "outputs": [],
1153 | "source": [
1154 | "late_carrier_deliveries = spark.sql(\"\"\"\n",
1155 | "SELECT i.order_id, i.seller_id, i.shipping_limit_date, i.price, i.freight_value,\n",
1156 | " p.product_id, p.product_category_name, \n",
1157 | " o.customer_id, o.order_status, o.order_purchase_timestamp, o.order_delivered_carrier_date,\n",
1158 | " o.order_delivered_customer_date, o.order_estimated_delivery_date\n",
1159 | "FROM items AS i\n",
1160 | "JOIN orders AS o\n",
1161 | "ON i.order_id = o.order_id\n",
1162 | "JOIN products AS p\n",
1163 | "ON i.product_id = p.product_id\n",
1164 | "WHERE i.shipping_limit_date < o.order_delivered_carrier_date\n",
1165 | "\"\"\")"
1166 | ]
1167 | },
1168 | {
1169 | "cell_type": "code",
1170 | "execution_count": 79,
1171 | "metadata": {},
1172 | "outputs": [
1173 | {
1174 | "name": "stdout",
1175 | "output_type": "stream",
1176 | "text": [
1177 | "+--------------------+--------------------+-------------------+------+-------------+--------------------+---------------------+--------------------+------------+------------------------+----------------------------+-----------------------------+-----------------------------+\n",
1178 | "| order_id| seller_id|shipping_limit_date| price|freight_value| product_id|product_category_name| customer_id|order_status|order_purchase_timestamp|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|\n",
1179 | "+--------------------+--------------------+-------------------+------+-------------+--------------------+---------------------+--------------------+------------+------------------------+----------------------------+-----------------------------+-----------------------------+\n",
1180 | "|05afef1c185862cab...|53e4c6e0f4312d4d2...|2017-07-31 11:03:10| 27.99| 22.67|ac7e981115ad47f0e...| utilidades_domest...|296de103322e463a1...| delivered| 2017-07-25 10:47:16| 2017-07-31 15:49:35| 2017-08-21 21:10:53| 2017-09-04 00:00:00|\n",
1181 | "|05afef1c185862cab...|53e4c6e0f4312d4d2...|2017-07-31 11:03:10| 27.99| 22.67|ac7e981115ad47f0e...| utilidades_domest...|296de103322e463a1...| delivered| 2017-07-25 10:47:16| 2017-07-31 15:49:35| 2017-08-21 21:10:53| 2017-09-04 00:00:00|\n",
1182 | "|0b93ff37e8344c601...|b19f3ca2ea4759137...|2018-08-02 11:05:18| 998.9| 222.38|be7af429d53adfab1...| papelaria|41b40faae6aeea7a8...| delivered| 2018-07-22 11:13:05| 2018-08-02 16:31:00| 2018-08-07 22:11:10| 2018-08-21 00:00:00|\n",
1183 | "|110fb617059394261...|9e6229250fedbe058...|2018-08-20 03:24:37| 169.9| 19.49|53a72a0b6feec4a5c...| bebes|4e99e5342705c1e76...| shipped| 2018-08-08 16:37:28| 2018-08-20 10:57:00| null| 2018-08-28 00:00:00|\n",
1184 | "|112a61419198a7ce9...|ffdd9f82b9a447f6f...|2018-04-11 17:09:50| 214.0| 70.89|55aec8e90307dc2a7...| utilidades_domest...|dad5e5f178c09543d...| delivered| 2018-04-05 11:13:05| 2018-04-11 23:33:01| 2018-04-13 23:41:40| 2018-04-24 00:00:00|\n",
1185 | "|184f17bb701af22b8...|827f8f69dfa529c56...|2017-05-11 18:15:15| 29.9| 14.1|de4a736d3f2df1266...| fashion_calcados|d6d1c551a52cf82ad...| delivered| 2017-05-06 18:04:27| 2017-05-15 13:16:14| 2017-05-17 14:44:53| 2017-05-31 00:00:00|\n",
1186 | "|1c7b6d63329bdc850...|88460e8ebdecbfecb...|2018-04-19 13:31:39| 73.9| 7.87|ee57070aa3b24a06f...| informatica_acess...|2b8a8ed9920523186...| delivered| 2018-04-13 12:59:58| 2018-04-26 12:59:00| 2018-04-27 19:22:00| 2018-04-26 00:00:00|\n",
1187 | "|235d0ef0d8ee3f66a...|213b25e6f54661939...|2018-08-03 03:55:19|119.85| 11.86|6f0169f259bb0ff43...| casa_construcao|fa4bc396185415281...| delivered| 2018-07-20 13:58:55| 2018-08-03 14:35:00| 2018-08-06 14:56:49| 2018-08-07 00:00:00|\n",
1188 | "|2b9c1a6934f2dd62c...|88460e8ebdecbfecb...|2018-02-15 20:55:49| 74.9| 12.65|e53e557d5a159f5aa...| informatica_acess...|82c5e6b98e498b415...| delivered| 2018-02-10 20:46:25| 2018-02-21 10:19:22| 2018-02-28 21:49:02| 2018-03-14 00:00:00|\n",
1189 | "|3303092810c37e9ab...|cd843d4cf8ef32827...|2017-08-03 10:50:12| 469.0| 90.26|df872c596e00cd016...| moveis_cozinha_ar...|5db727761e58376d9...| delivered| 2017-07-28 10:36:12| 2017-08-03 17:25:30| 2017-08-15 19:05:13| 2017-08-21 00:00:00|\n",
1190 | "|35f4b9d2da608b318...|88460e8ebdecbfecb...|2018-01-02 10:06:22| 109.9| 34.57|e53e557d5a159f5aa...| informatica_acess...|04b07985c482f53e0...| delivered| 2017-12-26 09:49:11| 2018-01-11 23:49:29| 2018-01-24 00:33:55| 2018-01-26 00:00:00|\n",
1191 | "|38541d08d4eb7d571...|54965bbe3e4f07ae0...|2017-11-30 00:52:38| 69.9| 26.81|9d6c66334a27b9dcf...| cama_mesa_banho|937682b4176cf634c...| shipped| 2017-11-24 21:36:30| 2018-01-04 21:07:51| null| 2017-12-20 00:00:00|\n",
1192 | "|3e5d60bbe5a6016db...|0adac9fbd9a2b63cc...|2017-12-11 09:22:02| 10.9| 11.85|293637be1a5ab3bd1...| utilidades_domest...|46ec73876cebebf34...| delivered| 2017-12-04 22:14:17| 2017-12-12 21:22:34| 2017-12-26 21:36:57| 2017-12-27 00:00:00|\n",
1193 | "|3f090361c1e3266bf...|c1849d4d32d7a6cec...|2018-08-10 11:50:23| 19.99| 8.89|39251fd407f6dc1db...| automotivo|c892d1124348cf471...| delivered| 2018-08-06 11:38:43| 2018-08-10 12:07:00| 2018-08-11 16:12:32| 2018-08-13 00:00:00|\n",
1194 | "|41537821ce113ccef...|b2ba3715d723d2451...|2017-12-18 04:09:14| 79.9| 9.37|8b8981e920b8a923f...| construcao_ferram...|0312590499e722678...| delivered| 2017-12-11 13:56:03| 2017-12-18 21:33:03| 2017-12-20 17:29:03| 2017-12-28 00:00:00|\n",
1195 | "+--------------------+--------------------+-------------------+------+-------------+--------------------+---------------------+--------------------+------------+------------------------+----------------------------+-----------------------------+-----------------------------+\n",
1196 | "only showing top 15 rows\n",
1197 | "\n"
1198 | ]
1199 | }
1200 | ],
1201 | "source": [
1202 | "late_carrier_deliveries.show(15)"
1203 | ]
1204 | },
1205 | {
1206 | "cell_type": "code",
1207 | "execution_count": 83,
1208 | "metadata": {},
1209 | "outputs": [],
1210 | "source": [
1211 | "# Write the results to a single csv file\n",
1212 | "# Note the named output is a directory as opposed to the csv filename\n",
1213 | "# This is because Spark uses HDFS naming conventions for files\n",
1214 | "late_carrier_deliveries.coalesce(1).write.option(\"header\", \"true\").csv(\"missed_shipping_limit_orders\")"
1215 | ]
1216 | },
1217 | {
1218 | "cell_type": "code",
1219 | "execution_count": null,
1220 | "metadata": {},
1221 | "outputs": [],
1222 | "source": []
1223 | }
1224 | ],
1225 | "metadata": {
1226 | "kernelspec": {
1227 | "display_name": "Python 3",
1228 | "language": "python",
1229 | "name": "python3"
1230 | },
1231 | "language_info": {
1232 | "codemirror_mode": {
1233 | "name": "ipython",
1234 | "version": 3
1235 | },
1236 | "file_extension": ".py",
1237 | "mimetype": "text/x-python",
1238 | "name": "python",
1239 | "nbconvert_exporter": "python",
1240 | "pygments_lexer": "ipython3",
1241 | "version": "3.6.5"
1242 | }
1243 | },
1244 | "nbformat": 4,
1245 | "nbformat_minor": 2
1246 | }
1247 |
--------------------------------------------------------------------------------
/Data/brazilian-ecommerce.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ajupton/big-data-engineering-project/c32a6fa746d6053d39898d3881d8a013a6f46689/Data/brazilian-ecommerce.zip
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Andy Upton
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 'Little' Big Data Engineering Project
2 | Hey there! Welcome to this repo where I practice building a data engineering pipeline. Here is a basic rundown of the project:
3 |
4 | # Problem Statement
5 | Retailers in the current landscape are adapting to the digital age. Digital retail behemoths have carved out substantial market shares in the online space at the same time that traditional retail stores are broadly in decline. In this time of digital flux, an omni-channel retail approach is necessary to keep pace. This is especially true for retailers that have invested in an extensive brick-and-mortar store portfolio or have strong relationships with brick-and-mortar partners.
6 |
7 | This data engineering project uses a real-world retail dataset to explore delivery performance at scale. The primary concern of the data engineering effort in this project is to create a strong foundation on which data analytics and modeling can be applied, as well as to provide summary reports for daily ingestion by decision makers.
8 |
9 | A series of ETL jobs are programmed as part of this project using python, SQL, Airflow, and Spark to build pipelines that download data from an AWS S3 bucket, apply some manipulations, and then load the cleaned-up data set into another location on the same AWS S3 bucket for higher level analytics.
10 |
11 | # Dataset of choice
12 | The dataset of choice for this project is a series of tables [provided by the Brazilian Ecommerce company Olist](https://www.kaggle.com/olistbr/brazilian-ecommerce/home#olist_orders_dataset.csv). The table schema is provided here:
13 |
14 | 
15 |
16 | # Methodology:
17 | 1. Construct a mock production data lake in AWS S3 replete with the table schema above
18 | 2. Analyze the tables with an eye toward identifying the delivery performance of Olist orders/sellers
19 | 3. Write a Spark and Spark SQL job to join together tables answering the question, "Which orders/sellers missed the deadline imposed by Olist for when their packages need to be delivered to a carrier?"
20 | 4. Build an ETL pipeline using Airflow that accomplishes the following:
21 | * Downloads data from an AWS S3 bucket
22 | * Runs a Spark/Spark SQL job on the downloaded data, producing a cleaned-up dataset of orders that missed the delivery deadline
23 | * Uploads the cleaned-up dataset back to the same S3 bucket in a folder primed for higher-level analytics
24 |
25 | I'll provide an overview of each of these steps but assume that the reader is already somewhat familiar with python, SQL, Airflow, and Spark.
26 |
27 | ## Step 1: Setup and populate an AWS S3 Bucket
28 | Setting up an S3 bucket is pretty straightforward, and the current 5 GB free tier limit for 12 months is a great way to get started with AWS S3 without having to break the bank. Simply create an AWS account, click on "Services" in the upper left-hand side, navigate to "Storage" and select S3. From there you'll be able to create a new S3 bucket following the prompts and inputting settings based on your unique needs. I largely followed the default options. Once created, simply select the bucket and in the Overview tab, there are options to upload objects/files, create folders, or set permissions. Alternatively, it's possible to install the AWS CLI on your machine in order to transfer files at the command line.
29 |
30 | The method for interacting with S3 taken here uses the [python library boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html). boto3 has all sorts of functions for interacting with products across the AWS landscape using python.
31 |
32 | We'll use boto3 in python scripts both to download the Brazilian ecommerce data from our S3 bucket and to upload the csv file that results from the Spark job back into the S3 bucket.
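
As a rough sketch of what those two calls look like (the bucket name, object keys, and local paths here are placeholders, and credentials are assumed to come from your own AWS configuration rather than the hard-coded keys used in the scripts below):

```python
import boto3

# Placeholder bucket name -- swap in your own
s3_bucket_name = "your_s3_bucket_name"

# boto3 picks up credentials from env vars, ~/.aws/credentials, or an IAM role
client = boto3.client("s3")

# Pull the raw zip file down from the data lake bucket
client.download_file(s3_bucket_name, "brazilian-ecommerce.zip",
                     "/path/to/brazilian-ecommerce.zip")

# Push the cleaned-up csv from the Spark job back up under a separate prefix
client.upload_file("/path/to/missed_shipping_limit_orders.csv", s3_bucket_name,
                   "Clean_Data/missed_shipping_limit_orders.csv")
```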
33 |
34 | In reality, a data lake often consists of various SQL tables or log files, which can be billions of rows long. The Brazilian ecommerce data used here comes in a zip file that contains 9 different csv files, as seen in the schema above. This gets sort of close to a data lake format, but in a way that's amenable to practicing and learning.
35 |
36 | ## Step 2: Analyze the tables with an eye toward identifying the delivery performance of Olist orders/sellers
37 | One of the strongest skillsets needed by a data engineer is communication. That communication needs to be flexed primarily in conversations with stakeholders that manage data sets as well as stakeholders that will consume the data at the end of the ETL and stream pipelines that data engineers construct. Data engineers often need to rely on the domain expertise of analysts, data scientists, program managers, executives, and others in order to understand what specific data features are needed for various analytics applications. That said, data engineers still need to possess strong EDA (exploratory data analysis) chops in order to deliver data sets that meet the needs of various stakeholders. As a result, it's very often worthwhile for data engineers to get down and dirty with the data in the data lake through data visualization, the computing of summary statistics, and other exploratory methods.
38 |
39 | My go-to tool of choice in this regard is a Jupyter notebook using python. A typical workflow involves using SQLAlchemy or the pyspark API to connect to a database and pull data with some pythonic SQL, and then using numpy, pandas, altair/seaborn/matplotlib/plotly, and other python packages for visualization, the computing of summary statistics, maybe running a few statistical models, and other EDA techniques. For data that is truly 'big' and can't fit in memory on a single machine, Spark becomes invaluable. Spark SQL makes pulling and manipulating data a breeze for those with SQL chops as well. Of course, this analytics approach is possible in all sorts of AWS services like Athena and EMR among many others. With massive volumes of data, it can often be worthwhile to take samples of data from the data lake and get some EDA in to get a feel for the tables and schema as well as to understand the quirks of the data. OLTP processes often must follow multiple business rules that make it a major challenge to do things like join disparate datasets, hammer down ground-truth insights, or compare current and historical trends.
40 |
41 | Some very basic EDA is applied to the database tables to answer the question "Which orders/sellers missed the deadline imposed by Olist for when their packages need to be delivered to a carrier?". [See the code here or navigate to `Data` -> Brazilian ecommerce.ipynb](https://github.com/ajupton/big-data-engineering-project/blob/master/Data/Brazilian%20ecommerce%20EDA%20.ipynb)
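
Condensed down, the notebook's check amounts to a merge and a filter. A minimal sketch, assuming the csv files have already been extracted from brazilian-ecommerce.zip into the working directory:

```python
import pandas as pd

# Order items carry the shipping limit; orders carry the actual carrier hand-off date
df_items = pd.read_csv("olist_order_items_dataset.csv",
                       parse_dates=["shipping_limit_date"])
df_orders = pd.read_csv("olist_orders_dataset.csv",
                        parse_dates=["order_delivered_carrier_date"])

# One row per order item with the order's delivery dates attached
df = df_items.merge(df_orders, on="order_id")

# The seller missed the deadline when the carrier hand-off came after the shipping limit
late_to_carrier = df.loc[df.shipping_limit_date < df.order_delivered_carrier_date]
print(late_to_carrier[["order_id", "seller_id", "shipping_limit_date",
                       "order_delivered_carrier_date"]].head())
```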
42 |
43 | ## Step 3: Get hands-on with Airflow
44 | There are many tools out there to run big data ETL jobs that turn messy data lakes into analytics-ready data warehouses. Here, I'll focus on one of the most popular open source ETL tools currently available - Apache Airflow. [Check out the Airflow docs here](https://airflow.apache.org/index.html). First, install Airflow with `pip install apache-airflow`. Make sure to include the 'apache' part there, or you'll download an older version of Airflow that will lead to a whole lot of problems down the line.
45 |
46 | _Note:_ It can often be worthwhile to run Airflow in a Docker container. There are some major advantages to using a Docker container such as:
47 | * creating a fully reproducible data analysis in Jupyter notebooks or other dev platforms (something that's less trivial than you might think)
48 | * having fully documented dependencies (a Dockerfile contains descriptions of all the packages/files/stuff that your container needs to run)
49 | * having an isolated environment to work in that ensures your tools don't conflict with each other AND recreates the conditions in development, testing, staging, or production environments to ensure your code will run as expected
50 | * not having to start from scratch by taking advantage of DockerHub container 'templates'
51 |
52 | Here are a few resources to help you get started with Airflow:
53 | * [Airflow quickstart guide](http://airflow.apache.org/start.html)
54 | * [ETL Best Practices with Airflow](https://gtoonstra.github.io/etl-with-airflow/index.html)
55 | * [Developing Workflows with Apache Airflow](http://michal.karzynski.pl/blog/2017/03/19/developing-workflows-with-apache-airflow/)
56 | * [Getting Started with Airflow Using Docker](https://towardsdatascience.com/getting-started-with-airflow-using-docker-cd8b44dbff98)
57 | * [Basic DAG Configuration](https://adataguru.net/basic-dag-configuration/)
58 |
59 | Once you have Airflow installed and are able to run through a HelloWorld dag, you're ready for the next step.
60 |
61 | Airflow is all about writing python scripts to create data pipelines. There are two primary pieces to write for this purpose: dags and operators. No analytics or processing occurs in the dags themselves, which only describe the steps of a pipeline and the order they run in. The actual analytics live in the scripts that the operators run, as instructed by the dag.
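
As a bare-bones illustration of that split (this toy dag is not part of the repo; the echo command stands in for whatever script an operator would actually run):

```python
from datetime import datetime
from airflow import DAG
from airflow.operators.bash_operator import BashOperator

# The dag only describes the pipeline: a name, a schedule, and which tasks it contains
dag = DAG('hello_world',
          description='Toy dag that runs a single bash command',
          schedule_interval='0 5 * * *',
          start_date=datetime(2019, 7, 10), catchup=False)

# The operator is where work actually happens -- here it just shells out to echo
say_hello = BashOperator(task_id='say_hello',
                         bash_command='echo "Hello, Airflow!"',
                         dag=dag)
```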
62 |
63 | [The late_shipments_to_carrier.py dag includes the three steps needed to complete our pipeline as defined above.]( https://github.com/ajupton/big-data-engineering-project/blob/master/airflow/dags/late_shipments_to_carrier_dag.py)
64 |
65 | The first step is downloading the brazilian-ecommerce.zip file from your S3 bucket. [The script to accomplish this task is found here.](https://github.com/ajupton/big-data-engineering-project/blob/master/airflow/scripts/s3_download.py)
66 |
67 | The next step is to run a Spark SQL job to do a pretty simple join of three relations and filter for orders/sellers that missed the delivery deadline to get their package to a designated carrier for shipment to the consumer. [This script first unzips the dataset, then sets up a Spark session, runs a simple Spark SQL operation, and then writes the results of the Spark SQL operation to a single csv file.](https://github.com/ajupton/big-data-engineering-project/blob/master/airflow/scripts/spark_missed_deadline_job.py)
68 |
69 | Finally, the dataset identifying orders that missed the carrier delivery deadline is uploaded to the same S3 bucket in a different folder. [This script also screens out non-csv files from being uploaded to keep the folder fairly clean.](https://github.com/ajupton/big-data-engineering-project/blob/master/airflow/scripts/s3_upload.py)
70 |
71 | To run the job, first edit the paths in each of the scripts to match where you'd like to run the analysis on your own machine, and of course make sure to include the specific details of your S3 bucket.
72 |
73 | One thing to note about the dag is the `.set_upstream()` method applied to the second two operators. This declares the task dependencies, so that if, for any reason, the initial file download fails, the downstream Spark and upload tasks won't run; the same chain can also be written with the bitshift syntax shown below. Another thing to note about the dag is the schedule: even though a daily schedule is defined, I'm triggering runs manually through the Airflow UI. There's a lot more depth to job scheduling than is covered here.
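
For reference, these are the two equivalent ways the dependencies show up in `late_shipments_to_carrier_dag.py`:

```python
# Explicit form: the Spark job only runs once the S3 download has succeeded
spark_missed_deadline_operator.set_upstream(s3_download_operator)

# Equivalent bitshift form covering the whole chain
s3_download_operator >> spark_missed_deadline_operator >> s3_upload_operator
```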
74 |
75 | But there you have it! This is a pretty simple pipeline, but it shows how powerful Airflow can be in its ability to schedule jobs across a variety of technologies like python and Spark. This only scratches the surface of what's possible with Airflow.
76 |
77 | The next step in this process is incorporating AWS EMR into the pipeline to run the Spark job on a cluster instead of locally on my machine. Stay tuned for updates!
78 |
--------------------------------------------------------------------------------
/airflow/dags/late_shipments_to_carrier_dag.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from airflow import DAG
3 | from airflow.operators.bash_operator import BashOperator
4 |
5 | dag = DAG('late_shipments_to_carrier',
6 | description='Returns list of orders where the seller missed the carrier delivery deadline',
7 | schedule_interval='0 5 * * *',
8 | start_date=datetime(2019, 7, 10), catchup=False)
9 |
10 | # Download the data from S3
11 | s3_download_operator = BashOperator(task_id='s3_download',
12 | bash_command='python /path/to/airflow/scripts/s3_download.py', ##<<<< edit path!!
13 | dag=dag)
14 |
15 | # Run Spark job to return order information where the seller
16 | # missed the deadline to deliver the shipment to the carrier
17 | spark_missed_deadline_operator = BashOperator(task_id='spark_missed_deadline_job',
18 | bash_command='python /path/to/airflow/scripts/spark_missed_deadline_job.py', ##<<<< edit path!!
19 | dag=dag)
20 |
21 | # Specify that the Spark task above depends on the dataset downloading properly
22 | spark_missed_deadline_operator.set_upstream(s3_download_operator)
23 |
24 | # Upload cleaned dataset to S3
25 | s3_upload_operator = BashOperator(task_id='s3_upload',
26 | bash_command='python /path/to/airflow/scripts/s3_upload.py', ##<<<< edit path!!
27 | dag=dag)
28 |
29 | # Specify that the S3 upload task depends on the Spark job running successfully
30 | s3_upload_operator.set_upstream(spark_missed_deadline_operator)
31 |
32 | s3_download_operator >> spark_missed_deadline_operator >> s3_upload_operator
33 |
--------------------------------------------------------------------------------
/airflow/scripts/s3_download.py:
--------------------------------------------------------------------------------
1 | from boto3.s3.transfer import S3Transfer
2 | import boto3
3 |
4 | ########################################
5 | ## Edit the keys/paths for your setup ##
6 | ########################################
7 | access_key = 'your_access_key_here'
8 | secret_key = 'your_secret_key_here'
9 | s3_bucket_name = 'your_s3_bucket_name'
10 | s3_filename = 'brazilian-ecommerce.zip'
11 | download_path = '/path/to/brazilian-ecommerce.zip'
12 |
13 | client = boto3.client('s3',
14 | aws_access_key_id = access_key,
15 | aws_secret_access_key = secret_key)
16 |
17 | print('client')
18 |
19 | client.download_file(s3_bucket_name, s3_filename, download_path)
20 |
--------------------------------------------------------------------------------
/airflow/scripts/s3_upload.py:
--------------------------------------------------------------------------------
1 | import os
2 | from boto3.s3.transfer import S3Transfer
3 | import boto3
4 |
5 | ########################################
6 | ## Edit the keys/paths for your setup ##
7 | ########################################
8 | access_key = 'your_access_key_here'
9 | secret_key = 'your_secret_key_here'
10 | s3_bucket_name = 'your_s3_bucket_name'
11 | # Directory the Spark job writes its csv output into (edit for your setup)
12 | filepath = '/path/to/missed_shipping_limit_orders'
13 | client = boto3.client('s3',
14 | aws_access_key_id = access_key,
15 | aws_secret_access_key = secret_key)
16 |
17 | print('client')
18 |
19 | transfer = S3Transfer(client)
20 |
21 | print('transfer - ' + s3_bucket_name)
22 |
23 | # Define function to scan through the Spark output uploadDirectory,
24 | # identify csv files, and upload them to the S3 bucket
25 | def uploadDirectory(filepath, s3_bucket_name):
26 | for root, dirs, files in os.walk(filepath):
27 | for file in files:
28 | # Transfer only csv files
29 | if file.endswith('csv'):
30 | transfer.upload_file(os.path.join(root, file),
31 | s3_bucket_name,
32 | "Clean_Data/" + file) # File put into Clean-Data folder
33 |
34 | uploadDirectory(filepath = filepath, s3_bucket_name = s3_bucket_name)
35 |
--------------------------------------------------------------------------------
/airflow/scripts/spark_missed_deadline_job.py:
--------------------------------------------------------------------------------
1 | # First, unzip the file with the Olist ecommerce data
2 | from zipfile import ZipFile
3 |
4 | # Create a ZipFile Object and load brazilian-ecommerce.zip in it
5 | with ZipFile('/path/to/brazilian-ecommerce.zip', #<<<<<<<< edit path location where you saved the file!!!
6 | 'r') as zipObj:
7 | # Extract all the contents of zip file in current directory
8 | zipObj.extractall()
9 |
10 | # Setting up spark
11 | import findspark
12 | findspark.init()
13 | from pyspark import SparkConf, SparkContext
14 | from pyspark.sql import SparkSession
15 | from pyspark.sql import *
16 | from pyspark.sql.functions import *
17 | conf = SparkConf().setMaster("local").setAppName("Missed_Deadlines")
18 | spark = SparkSession.builder.getOrCreate()
19 | print(spark)
20 |
21 | # Set sqlContext from the Spark context
22 | from pyspark.sql import SQLContext
23 | sqlContext = SQLContext(spark.sparkContext)
24 |
25 | # Edit Spark SQL context for ease of use with Pandas
26 | spark.conf.set("spark.sql.execution.arrow.enabled", "true")
27 |
28 | # Build a spark session
29 | spark = SparkSession.builder.getOrCreate()
30 |
31 | # Load in csv files into spark dataframes
32 | df_items = spark.read.format("csv") \
33 | .option("header", "true") \
34 | .option("inferSchema", "true") \
35 | .load("olist_order_items_dataset.csv")
36 |
37 | df_orders = spark.read.format("csv") \
38 | .option("header", "true") \
39 | .option("inferSchema", "true") \
40 | .load("olist_orders_dataset.csv")
41 |
42 | df_products = spark.read.format("csv") \
43 | .option("header", "true") \
44 | .option("inferSchema", "true") \
45 | .load("olist_products_dataset.csv")
46 |
47 | # Create SQL Table Views from dfs for SQL querying
48 | df_items.createOrReplaceTempView('items')
49 | df_orders.createOrReplaceTempView('orders')
50 | df_products.createOrReplaceTempView('products')
51 |
52 | # SQL Query to pull order/seller/product info for orders where the
53 | # seller missed the deadline to deliver the shipment to the carrier
54 | late_carrier_deliveries = spark.sql("""
55 | SELECT i.order_id, i.seller_id, i.shipping_limit_date, i.price, i.freight_value,
56 | p.product_id, p.product_category_name,
57 | o.customer_id, o.order_status, o.order_purchase_timestamp, o.order_delivered_carrier_date,
58 | o.order_delivered_customer_date, o.order_estimated_delivery_date
59 | FROM items AS i
60 | JOIN orders AS o
61 | ON i.order_id = o.order_id
62 | JOIN products AS p
63 | ON i.product_id = p.product_id
64 | WHERE i.shipping_limit_date < o.order_delivered_carrier_date
65 | """)
66 |
67 | # Write the results to a single csv file
68 | # coalesce(1) requires that the file is small enough to fit
69 | # in the heap memory of the master Spark node and is therefore
70 | # only recommended for very small datasets
71 | # Alternatives are converting the Spark df to a Pandas df before
72 | # writing to disk.
73 | # Otherwise, it's best practice to maintain the partitions to
74 | # take advantage of HDFS
75 | late_carrier_deliveries.coalesce(1) \
76 | .write \
77 | .option("header", "true") \
78 | .csv("/path/where/you/want/missed_shipping_limit_orders.csv") # <<<<<<< edit path
79 |
--------------------------------------------------------------------------------