├── README.md └── Idiomatic Pandas.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Idiomatic-Pandas-Tutorial 2 | Pandas Training © MetaSnake 2022, CC BY-NC 3 | 4 | ## Install 5 | 6 | * Clone this repo 7 | * Use notebook environment 8 | * Jupyter (local install) - see https://www.metasnake.com/blog/pydata-dev.html for help 9 | * Colab (requires Google access) 10 | 11 | ## For more Pandas help 12 | Check out my book, Effective Pandas, at https://store.metasnake.com 13 | -------------------------------------------------------------------------------- /Idiomatic Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Idiomatic Pandas\n", 8 | "\n", 9 | "© MetaSnake 2022, CC BY-NC" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "lines_to_next_cell": 0 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import glob\n", 21 | "\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "lines_to_next_cell": 2 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "!pip install pandas matplotlib" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "pd.__version__" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "scrolled": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "pd.show_versions()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Loading Data" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "!ls 
*.csv" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "data = [pd.read_csv(f, parse_dates=['time'], na_values='-') for f in glob.glob('tweet_activity_metrics___mharrison___*')]\n", 81 | "df = pd.concat(data, ignore_index=True).sort_values('time')\n", 82 | "df" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "df.to_csv('__mharrison__2020-2021.csv', index=False)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "scrolled": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "pd.read_csv('__mharrison__2020-2021.csv')" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Load data from Web" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "url = 'https://github.com/mattharrison/datasets/raw/master/data/__mharrison__2020-2021.csv'\n", 140 | "df = pd.read_csv(url, parse_dates=['time'])" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "scrolled": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "df" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Load Data Exercise\n", 159 | "\n", 160 | "* Load the data using the cell above.\n", 161 | "* If you can't 
do this please alert!" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## Exploring\n", 197 | "\n", 198 | "Definitions\n", 199 | "\n", 200 | "* *Impressions* - Number of times people saw the tweet\n", 201 | "* *Engagements* - Number of \"interactions\" (clicks, replies, retweets, likes)\n", 202 | "* *Engagement rate* - Engagements divided by impressions" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "scrolled": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "df.T" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "df.shape" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "df.dtypes" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "pd.options.display.max_columns" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "scrolled": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "from IPython.display import display\n", 252 | "with pd.option_context('display.max_columns', 240):\n", 253 | " display(df)" 254 | ] 255 | }, 256 | { 257 | 
"cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "scrolled": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "df.isna().sum()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Explore Exercise\n", 272 | "* Use `.describe` to view the summary statistics\n", 273 | "* Use `.corr` to view column correlations" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "## Types" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": { 315 | "scrolled": true 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "df.dtypes" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": { 326 | "scrolled": true 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "df.memory_usage()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "scrolled": true 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "df.memory_usage(deep=True)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "df.memory_usage(deep=True).sum()" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": { 357 | "scrolled": true 358 | }, 359 | 
"outputs": [], 360 | "source": [ 361 | "(df\n", 362 | " .select_dtypes(int).describe()\n", 363 | ")" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": { 370 | "scrolled": true 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "(df\n", 375 | " #.select_dtypes(float)\n", 376 | " .select_dtypes('float64')\n", 377 | " .describe()\n", 378 | ")" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "(df\n", 388 | " .impressions\n", 389 | " .astype(int))" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "df.assign?" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": { 405 | "scrolled": true 406 | }, 407 | "outputs": [], 408 | "source": [ 409 | "(df\n", 410 | " .assign(impressions=df.impressions.astype(int),\n", 411 | " engagements=df.engagements.astype(int)\n", 412 | " # lots of this here\n", 413 | " )\n", 414 | ")" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": { 421 | "scrolled": true 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "# also note\n", 426 | "(df\n", 427 | " .assign(impressions=df.impressions.astype(int),\n", 428 | " engagement rate=df.engagements rate.astype(int)\n", 429 | " # lots of this here\n", 430 | " )\n", 431 | ")" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": { 438 | "scrolled": true 439 | }, 440 | "outputs": [], 441 | "source": [ 442 | "# fix names\n", 443 | "(df\n", 444 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 445 | ")" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "scrolled": true 453 | }, 454 | "outputs": [], 455 | "source": [ 456 | 
"df.filter(regex=r'promoted')" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": { 463 | "scrolled": true 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "(df\n", 468 | " .drop(columns=[c for c in df.columns if 'promoted' in c])\n", 469 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 470 | " .describe()\n", 471 | ")" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "# be careful with renaming\n", 481 | "(df\n", 482 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 483 | " .drop(columns=[c for c in df.columns if 'promoted' in c])\n", 484 | ")" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "df.drop?" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "scrolled": true 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "def drop_col(df_, pattern):\n", 505 | " return df_.drop(columns=[c for c in df_.columns if pattern in c])\n", 506 | "\n", 507 | "(df\n", 508 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 509 | " #.pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 510 | " .pipe(drop_col, pattern='promoted')\n", 511 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 512 | ")" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": { 519 | "lines_to_next_cell": 0, 520 | "scrolled": true 521 | }, 522 | "outputs": [], 523 | "source": [ 524 | "\n", 525 | "(df\n", 526 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 527 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 528 | " 
.drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 529 | " .memory_usage(deep=True)\n", 530 | " .sum() # 3 megs\n", 531 | ")" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": { 538 | "lines_to_next_cell": 0 539 | }, 540 | "outputs": [], 541 | "source": [ 542 | "df.pipe?" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": {}, 548 | "source": [ 549 | "## Column Cleanup Exercise\n", 550 | "(Please don't mutate here!)\n", 551 | "\n", 552 | "* Use `.loc` to select the *impressions* and *engagements* columns\n", 553 | "* Use `.drop` to remove the *impressions* and *engagements* columns\n", 554 | "* Use `.rename` to rename *impressions* to *imp* and *engagements* to *eng*" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": null, 574 | "metadata": {}, 575 | "outputs": [], 576 | "source": [] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": { 589 | "lines_to_next_cell": 2 590 | }, 591 | "outputs": [], 592 | "source": [] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "## Ok, Types for real" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": { 612 | "scrolled": false 613 | }, 614 | "outputs": [], 615 | "source": [ 616 | "\n", 617 | "(df\n", 618 | 
" .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 619 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 620 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 621 | " .describe()\n", 622 | ")" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "np.iinfo('int64')" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": { 638 | "scrolled": true 639 | }, 640 | "outputs": [], 641 | "source": [ 642 | "for size in ['uint8', 'uint16', 'uint32', 'int8', 'int16', 'int32', 'int64']:\n", 643 | " print(f'{size=} {np.iinfo(size)}')" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": { 650 | "scrolled": true 651 | }, 652 | "outputs": [], 653 | "source": [ 654 | "\n", 655 | "(df\n", 656 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 657 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 658 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 659 | " .assign(impressions=df.impressions.astype('uint32'),\n", 660 | " engagements=df.engagements.astype('uint16'),\n", 661 | " )\n", 662 | " .describe()\n", 663 | ")" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": { 670 | "scrolled": true 671 | }, 672 | "outputs": [], 673 | "source": [ 674 | "kwargs = {}\n", 675 | "for col in df.select_dtypes(float).columns:\n", 676 | " print(col)\n", 677 | " kwargs[col] = df[col].astype(int)\n", 678 | "kwargs" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": { 685 | "scrolled": true 686 | }, 687 | "outputs": [], 688 | "source": [ 689 | "# use dict comp if you don't want to 
type every column\n", 690 | "# assign w/ dict comp. and lambda\n", 691 | "(df\n", 692 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 693 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 694 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 695 | " .assign(impressions=df.impressions.astype('uint32'),\n", 696 | " engagements=df.engagements.astype('uint16'),\n", 697 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']} # less than 255\n", 698 | " )\n", 699 | ")" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "metadata": { 706 | "scrolled": true 707 | }, 708 | "outputs": [], 709 | "source": [ 710 | "# why c=c?\n", 711 | "(df\n", 712 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 713 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 714 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 715 | " .assign(impressions=df.impressions.astype('uint32'),\n", 716 | " engagements=df.engagements.astype('uint16'),\n", 717 | " **{c:lambda df_:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 718 | " **{c:lambda df_:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 719 | " 'detail_expands', 'media_views', 'media_engagements']} # less than 65,535\n", 720 | " )\n", 721 | " #.corr()\n", 722 | " .describe()\n", 723 | ")" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "# https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result\n", 733 | "squares = []\n", 734 | "for x in range(5):\n", 735 | " 
squares.append(lambda: x**2)\n", 736 | "for s in squares:\n", 737 | " print(s())" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": null, 743 | "metadata": {}, 744 | "outputs": [], 745 | "source": [ 746 | "# https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result\n", 747 | "squares = []\n", 748 | "for x in range(5):\n", 749 | " squares.append(lambda x=x: x**2)\n", 750 | "for s in squares:\n", 751 | " print(s())" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": { 758 | "scrolled": true 759 | }, 760 | "outputs": [], 761 | "source": [ 762 | "(df\n", 763 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 764 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 765 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 766 | " .assign(impressions=df.impressions.astype('uint32'),\n", 767 | " engagements=df.engagements.astype('uint16'),\n", 768 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 769 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 770 | " 'detail_expands', 'media_views', 'media_engagements']} # less than 65,535\n", 771 | " )\n", 772 | " .describe()\n", 773 | ")" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": null, 779 | "metadata": { 780 | "scrolled": true 781 | }, 782 | "outputs": [], 783 | "source": [ 784 | "(df\n", 785 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 786 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 787 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 788 | " 
.assign(impressions=df.impressions.astype('uint32'),\n", 789 | " engagements=df.engagements.astype('uint16'),\n", 790 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 791 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 792 | " 'detail_expands', 'media_views', 'media_engagements']} # less than 65,535\n", 793 | " \n", 794 | " )\n", 795 | " .memory_usage(deep=True) \n", 796 | " .sum() # was 3 megs\n", 797 | ")" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": { 804 | "scrolled": false 805 | }, 806 | "outputs": [], 807 | "source": [ 808 | "# most is from text\n", 809 | "(df\n", 810 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 811 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 812 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 813 | " .assign(impressions=df.impressions.astype('uint32'),\n", 814 | " engagements=df.engagements.astype('uint16'),\n", 815 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 816 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 817 | " 'detail_expands', 'media_views', 'media_engagements']} # less than 65,535\n", 818 | " \n", 819 | " )\n", 820 | " .memory_usage(deep=True) \n", 821 | " .pipe(lambda ser: ser/ser.sum()*100)\n", 822 | "# .sum() # was 3 megs\n", 823 | ")" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": null, 829 | "metadata": { 830 | "lines_to_next_cell": 2, 831 | "scrolled": false 832 | }, 833 | "outputs": [], 834 | "source": [ 835 | "# convert first part of permalink to category and add back tweet_id\n", 836 | "(df\n", 837 | " 
.rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 838 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 839 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 840 | " .assign(impressions=df.impressions.astype('uint32'),\n", 841 | " engagements=df.engagements.astype('uint16'),\n", 842 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 843 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 844 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n", 845 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n", 846 | " index=df_.index),\n", 847 | " )\n", 848 | " .memory_usage(deep=True) \n", 849 | " .sum() # was 3 megs\n", 850 | ")" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": null, 856 | "metadata": { 857 | "lines_to_next_cell": 2 858 | }, 859 | "outputs": [], 860 | "source": [] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": null, 865 | "metadata": { 866 | "lines_to_next_cell": 2, 867 | "scrolled": true 868 | }, 869 | "outputs": [], 870 | "source": [ 871 | "# convert first part of permalink to category and add back tweet_id\n", 872 | "(df\n", 873 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 874 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 875 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 876 | " .assign(impressions=df.impressions.astype('uint32'),\n", 877 | " engagements=df.engagements.astype('uint16'),\n", 878 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 879 | " **{c:lambda df_, 
c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 880 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n", 881 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n", 882 | " index=df_.index),\n", 883 | " )\n", 884 | " .describe()\n", 885 | " #.memory_usage(deep=True) \n", 886 | " #.sum() # was 3 megs\n", 887 | ")" 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": null, 893 | "metadata": {}, 894 | "outputs": [], 895 | "source": [] 896 | }, 897 | { 898 | "cell_type": "markdown", 899 | "metadata": {}, 900 | "source": [ 901 | "## Alternate Integer Conversion Exercise\n", 902 | "(Again, no mutation!)\n", 903 | "\n", 904 | "* Use `.select_dtypes` to filter all `int` columns from `df`\n", 905 | "* Use `.astype` with above to convert all columns to `uint8`\n", 906 | "* Use `.assign` with above to create new dataframe with updated integer columns" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": null, 912 | "metadata": {}, 913 | "outputs": [], 914 | "source": [] 915 | }, 916 | { 917 | "cell_type": "code", 918 | "execution_count": null, 919 | "metadata": {}, 920 | "outputs": [], 921 | "source": [] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": null, 926 | "metadata": {}, 927 | "outputs": [], 928 | "source": [] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": null, 933 | "metadata": {}, 934 | "outputs": [], 935 | "source": [] 936 | }, 937 | { 938 | "cell_type": "markdown", 939 | "metadata": {}, 940 | "source": [ 941 | "## Other Types\n", 942 | "Can apply similar logic to floats, and strings.\n", 943 | "\n", 944 | "Converting \"Tweet_text\" to a category doesn't make sense because it is high cardinality" 945 | ] 946 | }, 947 | { 948 | "cell_type": "code", 949 | "execution_count": null, 950 | "metadata": { 951 | "scrolled": false 952 | }, 953 | 
"outputs": [], 954 | "source": [ 955 | "# Uses MORE memory if tweet text is a category!\n", 956 | "(df\n", 957 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 958 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 959 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 960 | " .assign(impressions=df.impressions.astype('uint32'),\n", 961 | " engagements=df.engagements.astype('uint16'),\n", 962 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 963 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 964 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n", 965 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n", 966 | " index=df_.index),\n", 967 | " Tweet_text=lambda df_:df_.Tweet_text.astype('category')\n", 968 | " )\n", 969 | " .memory_usage(deep=True) \n", 970 | " .sum() # was 3 megs\n", 971 | ")" 972 | ] 973 | }, 974 | { 975 | "cell_type": "markdown", 976 | "metadata": {}, 977 | "source": [ 978 | "## Other types Exercise\n", 979 | "* Use the `%%timeit` cell magic to see how long it takes to run `.str.lower()` on the original *Tweet permalink* column\n", 980 | "* Create a new dataframe, `df2`, with our current chain\n", 981 | "* Use the `%%timeit` cell magic to see how long it takes to run `.str.lower()` on the *df2.Tweet_permalink* column" 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": null, 987 | "metadata": {}, 988 | "outputs": [], 989 | "source": [] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": null, 994 | "metadata": {}, 995 | "outputs": [], 996 | "source": [] 997 | }, 998 | { 999 | "cell_type": "code", 1000 | "execution_count": null, 1001 | "metadata": {}, 1002 | "outputs": [], 
1003 | "source": [] 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "execution_count": null, 1008 | "metadata": {}, 1009 | "outputs": [], 1010 | "source": [] 1011 | }, 1012 | { 1013 | "cell_type": "code", 1014 | "execution_count": null, 1015 | "metadata": {}, 1016 | "outputs": [], 1017 | "source": [] 1018 | }, 1019 | { 1020 | "cell_type": "markdown", 1021 | "metadata": {}, 1022 | "source": [ 1023 | "## Dates" 1024 | ] 1025 | }, 1026 | { 1027 | "cell_type": "code", 1028 | "execution_count": null, 1029 | "metadata": { 1030 | "scrolled": true 1031 | }, 1032 | "outputs": [], 1033 | "source": [ 1034 | "(df\n", 1035 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 1036 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 1037 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 1038 | " .assign(impressions=df.impressions.astype('uint32'),\n", 1039 | " engagements=df.engagements.astype('uint16'),\n", 1040 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 1041 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 1042 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n", 1043 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n", 1044 | " index=df_.index),\n", 1045 | " )\n", 1046 | " .time\n", 1047 | ")" 1048 | ] 1049 | }, 1050 | { 1051 | "cell_type": "code", 1052 | "execution_count": null, 1053 | "metadata": { 1054 | "scrolled": false 1055 | }, 1056 | "outputs": [], 1057 | "source": [ 1058 | "# Convert to Local Time (already in UTC)\n", 1059 | "(df\n", 1060 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 1061 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 1062 | " 
.drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 1063 | " .astype({c:'uint8' for c in ['replies', 'hashtag_clicks', 'follows']}) # less than 255)\n", 1064 | " .assign(impressions=df.impressions.astype('uint32'),\n", 1065 | " engagements=df.engagements.astype('uint16'),\n", 1066 | " #**{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 1067 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 1068 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n", 1069 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n", 1070 | " index=df_.index),\n", 1071 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n", 1072 | " )\n", 1073 | " .time\n", 1074 | ")" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "markdown", 1079 | "metadata": {}, 1080 | "source": [ 1081 | "## Dates Exercise\n", 1082 | "* Create a series with the months of the *time* column\n", 1083 | "* Convert the *time* column to UTC\n", 1084 | "* Convert the *time* column to `America/New_York`" 1085 | ] 1086 | }, 1087 | { 1088 | "cell_type": "code", 1089 | "execution_count": null, 1090 | "metadata": {}, 1091 | "outputs": [], 1092 | "source": [] 1093 | }, 1094 | { 1095 | "cell_type": "code", 1096 | "execution_count": null, 1097 | "metadata": {}, 1098 | "outputs": [], 1099 | "source": [] 1100 | }, 1101 | { 1102 | "cell_type": "code", 1103 | "execution_count": null, 1104 | "metadata": {}, 1105 | "outputs": [], 1106 | "source": [] 1107 | }, 1108 | { 1109 | "cell_type": "code", 1110 | "execution_count": null, 1111 | "metadata": {}, 1112 | "outputs": [], 1113 | "source": [] 1114 | }, 1115 | { 1116 | "cell_type": "markdown", 1117 | "metadata": {}, 1118 | "source": [ 1119 | "## Chain\n", 1120 | "\n", 1121 | "Chaining is also called \"flow\" programming. 
Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.\n", 1122 | "\n", 1123 | "The chain should read like a recipe of ordered steps.\n", 1124 | "\n", 1125 | "(BTW, this is actually what we did above.)\n", 1126 | "\n", 1127 | "
\n", 1128 | " Hint: Leverage .pipe if you can't find a way to chain 😉🐼💪\n", 1129 | "
" 1130 | ] 1131 | }, 1132 | { 1133 | "cell_type": "code", 1134 | "execution_count": null, 1135 | "metadata": {}, 1136 | "outputs": [], 1137 | "source": [ 1138 | "# convert to a function\n", 1139 | "def tweak_twitter(df):\n", 1140 | " return (df\n", 1141 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 1142 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 1143 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 1144 | " .assign(impressions=df.impressions.astype('uint32'),\n", 1145 | " engagements=df.engagements.astype('uint16'),\n", 1146 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 1147 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 1148 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n", 1149 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n", 1150 | " index=df_.index),\n", 1151 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n", 1152 | " )\n", 1153 | " )" 1154 | ] 1155 | }, 1156 | { 1157 | "cell_type": "code", 1158 | "execution_count": null, 1159 | "metadata": {}, 1160 | "outputs": [], 1161 | "source": [] 1162 | }, 1163 | { 1164 | "cell_type": "code", 1165 | "execution_count": null, 1166 | "metadata": {}, 1167 | "outputs": [], 1168 | "source": [ 1169 | "# I would want my notebook to start off like this:\n", 1170 | "import glob\n", 1171 | "\n", 1172 | "import numpy as np\n", 1173 | "import pandas as pd\n", 1174 | "\n", 1175 | "data = [pd.read_csv(f, parse_dates=['time'], na_values='-') for f in glob.glob('tweet_activity_metrics___mharrison___*')]\n", 1176 | "df = pd.concat(data, ignore_index=True).sort_values('time')" 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "execution_count": null, 1182 
| "metadata": {}, 1183 | "outputs": [], 1184 | "source": [ 1185 | "def tweak_twitter(df):\n", 1186 | " return (df\n", 1187 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 1188 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 1189 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 1190 | " .assign(impressions=df.impressions.astype('uint32'),\n", 1191 | " engagements=df.engagements.astype('uint16'),\n", 1192 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 1193 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 1194 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n", 1195 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n", 1196 | " index=df_.index),\n", 1197 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n", 1198 | " )\n", 1199 | " )\n", 1200 | "twit_df = tweak_twitter(df)" 1201 | ] 1202 | }, 1203 | { 1204 | "cell_type": "code", 1205 | "execution_count": null, 1206 | "metadata": { 1207 | "lines_to_next_cell": 2, 1208 | "scrolled": true 1209 | }, 1210 | "outputs": [], 1211 | "source": [ 1212 | "# compare with non-chain\n", 1213 | "df1 = df.rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 1214 | "keep = [c for c in df1.columns if 'promoted' not in c]\n", 1215 | "df2 = df1[keep]\n", 1216 | "keep2 = [c for c in df2 if c not in ['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone']]\n", 1217 | "df3 = df2[keep2]\n", 1218 | "imps = df3.impressions.astype('uint32')\n", 1219 | "df3.impressions = imps\n", 1220 | "eng = df3.engagements.astype('uint16')\n", 1221 | "df3['engagements'] = eng\n", 1222 | "df3['replies'] = df3.replies.astype('uint8')\n", 1223 | "df3['hashtag_clicks'] = 
df3.hashtag_clicks.astype('uint8')" 1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "code", 1228 | "execution_count": null, 1229 | "metadata": { 1230 | "scrolled": true 1231 | }, 1232 | "outputs": [], 1233 | "source": [ 1234 | "# easy to debug\n", 1235 | "# - assign to var (renamed_df)\n", 1236 | "# - comment out\n", 1237 | "# - pipe to display\n", 1238 | "\n", 1239 | "from IPython.display import display\n", 1240 | "\n", 1241 | "def get_var(df, var_name):\n", 1242 | " globals()[var_name] = df\n", 1243 | " return df\n", 1244 | "\n", 1245 | "def tweak_twitter(df):\n", 1246 | " return (df\n", 1247 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 1248 | " .pipe(get_var, 'renamed_df')\n", 1249 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 1250 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 1251 | " .pipe(lambda df_:display(df_) or df_)\n", 1252 | " .assign(impressions=df.impressions.astype('uint32'),\n", 1253 | " engagements=df.engagements.astype('uint16'),\n", 1254 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 1255 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 1256 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n", 1257 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n", 1258 | " index=df_.index),\n", 1259 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n", 1260 | " )\n", 1261 | " )\n", 1262 | "twit_df = tweak_twitter(df)" 1263 | ] 1264 | }, 1265 | { 1266 | "cell_type": "code", 1267 | "execution_count": null, 1268 | "metadata": {}, 1269 | "outputs": [], 1270 | "source": [ 1271 | "renamed_df" 1272 | ] 1273 | }, 1274 | { 1275 | "cell_type": "code", 1276 | "execution_count": null, 1277 | "metadata": {}, 
1278 | "outputs": [], 1279 | "source": [] 1280 | }, 1281 | { 1282 | "cell_type": "code", 1283 | "execution_count": null, 1284 | "metadata": {}, 1285 | "outputs": [], 1286 | "source": [] 1287 | }, 1288 | { 1289 | "cell_type": "code", 1290 | "execution_count": null, 1291 | "metadata": { 1292 | "lines_to_next_cell": 2 1293 | }, 1294 | "outputs": [], 1295 | "source": [] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "execution_count": null, 1300 | "metadata": {}, 1301 | "outputs": [], 1302 | "source": [ 1303 | "def tweak_twitter(df):\n", 1304 | " return (df\n", 1305 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 1306 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 1307 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 1308 | " .assign(impressions=df.impressions.astype('uint32'),\n", 1309 | " engagements=df.engagements.astype('uint16'),\n", 1310 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 1311 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 1312 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n", 1313 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n", 1314 | " index=df_.index),\n", 1315 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n", 1316 | " )\n", 1317 | " )\n", 1318 | "twit_df = tweak_twitter(df)" 1319 | ] 1320 | }, 1321 | { 1322 | "cell_type": "markdown", 1323 | "metadata": {}, 1324 | "source": [ 1325 | "## Chain Exercise\n", 1326 | "* Use `.pipe` to print the shape of the dataframe after every step in the chain of the `tweak_twitter` function" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "code", 1331 | "execution_count": null, 1332 | "metadata": {}, 1333 | "outputs": [], 1334 | "source": [] 
1335 | }, 1336 | { 1337 | "cell_type": "code", 1338 | "execution_count": null, 1339 | "metadata": {}, 1340 | "outputs": [], 1341 | "source": [] 1342 | }, 1343 | { 1344 | "cell_type": "code", 1345 | "execution_count": null, 1346 | "metadata": {}, 1347 | "outputs": [], 1348 | "source": [] 1349 | }, 1350 | { 1351 | "cell_type": "code", 1352 | "execution_count": null, 1353 | "metadata": {}, 1354 | "outputs": [], 1355 | "source": [] 1356 | }, 1357 | { 1358 | "cell_type": "markdown", 1359 | "metadata": {}, 1360 | "source": [ 1361 | "## Don't Mutate\n", 1362 | "\n", 1363 | "> \"you are missing the point, inplace rarely actually does something inplace, you are thinking that you are saving memory but you are not.\"\n", 1364 | ">\n", 1365 | "> **jreback** - Pandas core dev\n", 1366 | "\n", 1367 | "\n", 1368 | "\n", 1369 | "https://github.com/pandas-dev/pandas/issues/16529#issuecomment-676518136\n", 1370 | "\n", 1371 | "* In general, no performance benefits\n", 1372 | "* Prohibits chaining\n", 1373 | "* ``SettingWithCopyWarning`` fun" 1374 | ] 1375 | }, 1376 | { 1377 | "cell_type": "code", 1378 | "execution_count": null, 1379 | "metadata": {}, 1380 | "outputs": [], 1381 | "source": [] 1382 | }, 1383 | { 1384 | "cell_type": "code", 1385 | "execution_count": null, 1386 | "metadata": { 1387 | "lines_to_next_cell": 2 1388 | }, 1389 | "outputs": [], 1390 | "source": [] 1391 | }, 1392 | { 1393 | "cell_type": "code", 1394 | "execution_count": null, 1395 | "metadata": {}, 1396 | "outputs": [], 1397 | "source": [] 1398 | }, 1399 | { 1400 | "cell_type": "code", 1401 | "execution_count": null, 1402 | "metadata": {}, 1403 | "outputs": [], 1404 | "source": [] 1405 | }, 1406 | { 1407 | "cell_type": "code", 1408 | "execution_count": null, 1409 | "metadata": { 1410 | "lines_to_next_cell": 2 1411 | }, 1412 | "outputs": [], 1413 | "source": [] 1414 | }, 1415 | { 1416 | "cell_type": "markdown", 1417 | "metadata": {}, 1418 | "source": [ 1419 | "## Don't Apply (if you can)" 1420 | ] 1421 | }, 
1422 | { 1423 | "cell_type": "code", 1424 | "execution_count": null, 1425 | "metadata": {}, 1426 | "outputs": [], 1427 | "source": [ 1428 | "def tweak_twitter(df):\n", 1429 | " return (df\n", 1430 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 1431 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 1432 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 1433 | " .assign(impressions=df.impressions.astype('uint32'),\n", 1434 | " engagements=df.engagements.astype('uint16'),\n", 1435 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 1436 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n", 1437 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n", 1438 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n", 1439 | " index=df_.index),\n", 1440 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n", 1441 | " )\n", 1442 | " )\n", 1443 | "twit_df = tweak_twitter(df)" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "code", 1448 | "execution_count": null, 1449 | "metadata": { 1450 | "scrolled": true 1451 | }, 1452 | "outputs": [], 1453 | "source": [ 1454 | "twit_df" 1455 | ] 1456 | }, 1457 | { 1458 | "cell_type": "code", 1459 | "execution_count": null, 1460 | "metadata": { 1461 | "scrolled": true 1462 | }, 1463 | "outputs": [], 1464 | "source": [ 1465 | "def to_percent(val):\n", 1466 | " return val * 100\n", 1467 | "twit_df.engagement_rate.apply(to_percent)" 1468 | ] 1469 | }, 1470 | { 1471 | "cell_type": "code", 1472 | "execution_count": null, 1473 | "metadata": { 1474 | "lines_to_next_cell": 2 1475 | }, 1476 | "outputs": [], 1477 | "source": [ 1478 | "# same result\n", 1479 | "twit_df.engagement_rate * 100" 1480 | ] 1481 | }, 1482 | { 
1483 | "cell_type": "code", 1484 | "execution_count": null, 1485 | "metadata": {}, 1486 | "outputs": [], 1487 | "source": [ 1488 | "%%timeit\n", 1489 | "# however ...\n", 1490 | "twit_df.engagement_rate.apply(to_percent)" 1491 | ] 1492 | }, 1493 | { 1494 | "cell_type": "code", 1495 | "execution_count": null, 1496 | "metadata": {}, 1497 | "outputs": [], 1498 | "source": [ 1499 | "%%timeit\n", 1500 | "twit_df.engagement_rate * 100" 1501 | ] 1502 | }, 1503 | { 1504 | "cell_type": "code", 1505 | "execution_count": null, 1506 | "metadata": {}, 1507 | "outputs": [], 1508 | "source": [ 1509 | "# 14X slower!\n", 1510 | "1008 / 71" 1511 | ] 1512 | }, 1513 | { 1514 | "cell_type": "code", 1515 | "execution_count": null, 1516 | "metadata": {}, 1517 | "outputs": [], 1518 | "source": [ 1519 | "# How would we check if text had unicode?\n", 1520 | "'Hello \\U0001f600'.encode('ascii', errors='replace').decode('ascii')" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "code", 1525 | "execution_count": null, 1526 | "metadata": {}, 1527 | "outputs": [], 1528 | "source": [ 1529 | "'Hello \\U0001f600'.encode('utf8', errors='replace').decode('utf8')" 1530 | ] 1531 | }, 1532 | { 1533 | "cell_type": "code", 1534 | "execution_count": null, 1535 | "metadata": {}, 1536 | "outputs": [], 1537 | "source": [ 1538 | "# story is a little different with text\n", 1539 | "\n", 1540 | "def is_unicode(val):\n", 1541 | " return val.encode('ascii', errors='replace').decode('ascii') != val" 1542 | ] 1543 | }, 1544 | { 1545 | "cell_type": "code", 1546 | "execution_count": null, 1547 | "metadata": {}, 1548 | "outputs": [], 1549 | "source": [ 1550 | "%lsmagic" 1551 | ] 1552 | }, 1553 | { 1554 | "cell_type": "code", 1555 | "execution_count": null, 1556 | "metadata": {}, 1557 | "outputs": [], 1558 | "source": [ 1559 | "%%timeit?" 
1560 | ] 1561 | }, 1562 | { 1563 | "cell_type": "code", 1564 | "execution_count": null, 1565 | "metadata": {}, 1566 | "outputs": [], 1567 | "source": [ 1568 | "%%timeit\n", 1569 | "twit_df.Tweet_text.apply(is_unicode)" 1570 | ] 1571 | }, 1572 | { 1573 | "cell_type": "code", 1574 | "execution_count": null, 1575 | "metadata": {}, 1576 | "outputs": [], 1577 | "source": [ 1578 | "%%timeit\n", 1579 | "twit_df.Tweet_text.str.encode('ascii', errors='replace').str.decode('ascii') == twit_df.Tweet_text" 1580 | ] 1581 | }, 1582 | { 1583 | "cell_type": "code", 1584 | "execution_count": null, 1585 | "metadata": {}, 1586 | "outputs": [], 1587 | "source": [ 1588 | "%%timeit\n", 1589 | "twit_df.Tweet_text.str.startswith('@')" 1590 | ] 1591 | }, 1592 | { 1593 | "cell_type": "code", 1594 | "execution_count": null, 1595 | "metadata": {}, 1596 | "outputs": [], 1597 | "source": [ 1598 | "def startswith_at(txt):\n", 1599 | " return txt.startswith('@')" 1600 | ] 1601 | }, 1602 | { 1603 | "cell_type": "code", 1604 | "execution_count": null, 1605 | "metadata": {}, 1606 | "outputs": [], 1607 | "source": [ 1608 | "%%timeit\n", 1609 | "twit_df.Tweet_text.apply(startswith_at)" 1610 | ] 1611 | }, 1612 | { 1613 | "cell_type": "code", 1614 | "execution_count": null, 1615 | "metadata": {}, 1616 | "outputs": [], 1617 | "source": [ 1618 | "def tweak_twitter(df):\n", 1619 | " return (df\n", 1620 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n", 1621 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n", 1622 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n", 1623 | " .assign(impressions=df.impressions.astype('uint32'),\n", 1624 | " engagements=df.engagements.astype('uint16'),\n", 1625 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n", 1626 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 
'user_profile_clicks', 'url_clicks', \n", 1627 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n", 1628 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n", 1629 | " index=df_.index),\n", 1630 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver'),\n", 1631 | " is_reply=lambda df_: df_.Tweet_text.str.startswith('@'),\n", 1632 | " length=lambda df_:df_.Tweet_text.str.len(),\n", 1633 | " num_words=lambda df_:df_.Tweet_text.str.split().apply(len),\n", 1634 | " is_unicode=lambda df_:df_.Tweet_text.str.encode('ascii', errors='replace').str.decode('ascii') != df_.Tweet_text,\n", 1635 | " hour=lambda df_:df_.time.dt.hour,\n", 1636 | " dom=lambda df_:df_.time.dt.day, #day of month\n", 1637 | " dow=lambda df_:df_.time.dt.dayofweek, #day of week\n", 1638 | " at_tweet=lambda df_:df_.Tweet_text.str.contains('@'),\n", 1639 | " has_newlines=lambda df_:df_.Tweet_text.str.contains('\\n'),\n", 1640 | " num_lines=lambda df_:df_.Tweet_text.str.count('\\n'),\n", 1641 | " num_mentions=lambda df_:df_.Tweet_text.str.count('@'),\n", 1642 | " has_hashtag=lambda df_:df_.Tweet_text.str.count('#'),\n", 1643 | " )\n", 1644 | " )\n", 1645 | "twit_df = tweak_twitter(df)" 1646 | ] 1647 | }, 1648 | { 1649 | "cell_type": "code", 1650 | "execution_count": null, 1651 | "metadata": {}, 1652 | "outputs": [], 1653 | "source": [ 1654 | "twit_df" 1655 | ] 1656 | }, 1657 | { 1658 | "cell_type": "markdown", 1659 | "metadata": {}, 1660 | "source": [ 1661 | "## Apply Exercise\n", 1662 | "* Calculate engagement ratio by dividing *engagements* by *impressions*\n", 1663 | "* Calculate engagement ratio 2 by dividing the sum of *replies*, *retweets*, *likes*, *user_profile_clicks*, and *detail_expands* by *impressions*" 1664 | ] 1665 | }, 1666 | { 1667 | "cell_type": "code", 1668 | "execution_count": null, 1669 | "metadata": {}, 1670 | "outputs": [], 1671 | "source": [] 1672 | }, 1673 | { 1674 | "cell_type": "code", 1675 
| "execution_count": null, 1676 | "metadata": {}, 1677 | "outputs": [], 1678 | "source": [] 1679 | }, 1680 | { 1681 | "cell_type": "code", 1682 | "execution_count": null, 1683 | "metadata": {}, 1684 | "outputs": [], 1685 | "source": [] 1686 | }, 1687 | { 1688 | "cell_type": "code", 1689 | "execution_count": null, 1690 | "metadata": {}, 1691 | "outputs": [], 1692 | "source": [] 1693 | }, 1694 | { 1695 | "cell_type": "code", 1696 | "execution_count": null, 1697 | "metadata": {}, 1698 | "outputs": [], 1699 | "source": [] 1700 | }, 1701 | { 1702 | "cell_type": "code", 1703 | "execution_count": null, 1704 | "metadata": {}, 1705 | "outputs": [], 1706 | "source": [] 1707 | }, 1708 | { 1709 | "cell_type": "code", 1710 | "execution_count": null, 1711 | "metadata": {}, 1712 | "outputs": [], 1713 | "source": [] 1714 | }, 1715 | { 1716 | "cell_type": "code", 1717 | "execution_count": null, 1718 | "metadata": {}, 1719 | "outputs": [], 1720 | "source": [] 1721 | }, 1722 | { 1723 | "cell_type": "code", 1724 | "execution_count": null, 1725 | "metadata": { 1726 | "lines_to_next_cell": 2 1727 | }, 1728 | "outputs": [], 1729 | "source": [] 1730 | }, 1731 | { 1732 | "cell_type": "markdown", 1733 | "metadata": {}, 1734 | "source": [ 1735 | "## Master Aggregation" 1736 | ] 1737 | }, 1738 | { 1739 | "cell_type": "code", 1740 | "execution_count": null, 1741 | "metadata": { 1742 | "scrolled": true 1743 | }, 1744 | "outputs": [], 1745 | "source": [ 1746 | "(twit_df\n", 1747 | " .groupby(twit_df.time.dt.year)\n", 1748 | " .mean()\n", 1749 | ")" 1750 | ] 1751 | }, 1752 | { 1753 | "cell_type": "code", 1754 | "execution_count": null, 1755 | "metadata": { 1756 | "lines_to_next_cell": 2, 1757 | "scrolled": true 1758 | }, 1759 | "outputs": [], 1760 | "source": [ 1761 | "twit_df.groupby(twit_df.time.dt.year).mean()" 1762 | ] 1763 | }, 1764 | { 1765 | "cell_type": "code", 1766 | "execution_count": null, 1767 | "metadata": { 1768 | "scrolled": false 1769 | }, 1770 | "outputs": [], 1771 | "source": [ 
1772 | "(twit_df\n", 1773 | " .groupby(twit_df.time.dt.year)\n", 1774 | " .impressions\n", 1775 | " .mean()\n", 1776 | ")" 1777 | ] 1778 | }, 1779 | { 1780 | "cell_type": "code", 1781 | "execution_count": null, 1782 | "metadata": { 1783 | "scrolled": true 1784 | }, 1785 | "outputs": [], 1786 | "source": [ 1787 | "%%timeit\n", 1788 | "(twit_df\n", 1789 | " .groupby(twit_df.time.dt.year)\n", 1790 | " .mean()\n", 1791 | " [['impressions', 'replies']] # index operation with a list inside \n", 1792 | ")" 1793 | ] 1794 | }, 1795 | { 1796 | "cell_type": "code", 1797 | "execution_count": null, 1798 | "metadata": { 1799 | "scrolled": true 1800 | }, 1801 | "outputs": [], 1802 | "source": [ 1803 | "%%timeit\n", 1804 | "(twit_df\n", 1805 | " .groupby(twit_df.time.dt.year)\n", 1806 | " [['impressions', 'replies']] # index operation with a list inside \n", 1807 | " .mean()\n", 1808 | ")" 1809 | ] 1810 | }, 1811 | { 1812 | "cell_type": "code", 1813 | "execution_count": null, 1814 | "metadata": {}, 1815 | "outputs": [], 1816 | "source": [ 1817 | "twit_df.Tweet_text.str." 
1818 | ] 1819 | }, 1820 | { 1821 | "cell_type": "code", 1822 | "execution_count": null, 1823 | "metadata": {}, 1824 | "outputs": [], 1825 | "source": [ 1826 | "twit_df.time.dt.year.rename('year')" 1827 | ] 1828 | }, 1829 | { 1830 | "cell_type": "code", 1831 | "execution_count": null, 1832 | "metadata": {}, 1833 | "outputs": [], 1834 | "source": [ 1835 | "pd.options.display.float_format" 1836 | ] 1837 | }, 1838 | { 1839 | "cell_type": "code", 1840 | "execution_count": null, 1841 | "metadata": { 1842 | "scrolled": true 1843 | }, 1844 | "outputs": [], 1845 | "source": [ 1846 | "(twit_df\n", 1847 | " .groupby([twit_df.time.dt.year.rename('year'), twit_df.time.dt.month.rename('month')])\n", 1848 | " [['impressions', 'replies']]\n", 1849 | " .mean()\n", 1850 | " #.round(2)\n", 1851 | " .style\n", 1852 | " .format({'replies': '{:.3f}', 'impressions': '{:e}'})\n", 1853 | " \n", 1854 | ")" 1855 | ] 1856 | }, 1857 | { 1858 | "cell_type": "code", 1859 | "execution_count": null, 1860 | "metadata": { 1861 | "scrolled": true 1862 | }, 1863 | "outputs": [], 1864 | "source": [ 1865 | "(twit_df\n", 1866 | " .groupby([twit_df.time.dt.year, twit_df.time.dt.month])\n", 1867 | " [['impressions', 'replies']]\n", 1868 | " #.mean()\n", 1869 | " .median()\n", 1870 | " .plot()\n", 1871 | ")" 1872 | ] 1873 | }, 1874 | { 1875 | "cell_type": "code", 1876 | "execution_count": null, 1877 | "metadata": { 1878 | "scrolled": true 1879 | }, 1880 | "outputs": [], 1881 | "source": [ 1882 | "(twit_df\n", 1883 | " #.groupby([twit_df.time.dt.year, twit_df.time.dt.month])\n", 1884 | " .groupby(pd.Grouper(key='time', freq='2M'))\n", 1885 | " [['impressions', 'replies']]\n", 1886 | " #.mean()\n", 1887 | " .median()\n", 1888 | " .plot()\n", 1889 | ")" 1890 | ] 1891 | }, 1892 | { 1893 | "cell_type": "code", 1894 | "execution_count": null, 1895 | "metadata": { 1896 | "scrolled": true 1897 | }, 1898 | "outputs": [], 1899 | "source": [ 1900 | "(twit_df\n", 1901 | " #.groupby([twit_df.time.dt.year, 
twit_df.time.dt.month])\n", 1902 | " .groupby(pd.Grouper(key='time', freq='2w'))\n", 1903 | " [['impressions', 'replies']]\n", 1904 | " .mean()\n", 1905 | " .plot()\n", 1906 | ")" 1907 | ] 1908 | }, 1909 | { 1910 | "cell_type": "code", 1911 | "execution_count": null, 1912 | "metadata": { 1913 | "scrolled": false 1914 | }, 1915 | "outputs": [], 1916 | "source": [ 1917 | "(twit_df\n", 1918 | " #.groupby([twit_df.time.dt.year, twit_df.time.dt.month])\n", 1919 | " .groupby(pd.Grouper(key='time', freq='7d5h'))\n", 1920 | " [['impressions', 'replies']]\n", 1921 | " .mean()\n", 1922 | " #.plot()\n", 1923 | ")" 1924 | ] 1925 | }, 1926 | { 1927 | "cell_type": "code", 1928 | "execution_count": null, 1929 | "metadata": { 1930 | "scrolled": false 1931 | }, 1932 | "outputs": [], 1933 | "source": [ 1934 | "(twit_df\n", 1935 | " #.groupby([twit_df.time.dt.year, twit_df.time.dt.month])\n", 1936 | " .groupby([pd.Grouper(key='time', freq='7d5h'), 'is_unicode'])\n", 1937 | " [['impressions', 'replies']]\n", 1938 | " .mean()\n", 1939 | " #.plot()\n", 1940 | ")" 1941 | ] 1942 | }, 1943 | { 1944 | "cell_type": "code", 1945 | "execution_count": null, 1946 | "metadata": { 1947 | "scrolled": true 1948 | }, 1949 | "outputs": [], 1950 | "source": [ 1951 | "# multiple aggregates\n", 1952 | "def second_to_last(ser):\n", 1953 | " try:\n", 1954 | " return ser.iloc[-2]\n", 1955 | " except IndexError:\n", 1956 | " return 0\n", 1957 | "\n", 1958 | "(twit_df\n", 1959 | " .groupby([pd.Grouper(key='time', freq='7d5h'), 'is_unicode'])\n", 1960 | " [['impressions', 'replies']]\n", 1961 | " .agg(['mean', 'median', second_to_last])\n", 1962 | ")" 1963 | ] 1964 | }, 1965 | { 1966 | "cell_type": "code", 1967 | "execution_count": null, 1968 | "metadata": { 1969 | "scrolled": true 1970 | }, 1971 | "outputs": [], 1972 | "source": [ 1973 | "# multiple aggregates\n", 1974 | "\n", 1975 | "(twit_df\n", 1976 | " .groupby([pd.Grouper(key='time', freq='7d5h'), 'is_unicode'])\n", 1977 | " [['impressions', 
'replies']]\n", 1978 | " .agg(['mean', 'median', second_to_last])\n", 1979 | " .plot()\n", 1980 | ")" 1981 | ] 1982 | }, 1983 | { 1984 | "cell_type": "code", 1985 | "execution_count": null, 1986 | "metadata": { 1987 | "scrolled": true 1988 | }, 1989 | "outputs": [], 1990 | "source": [ 1991 | "# multiple aggregates\n", 1992 | "\n", 1993 | "(twit_df\n", 1994 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n", 1995 | " [['impressions', 'replies']]\n", 1996 | " .agg(['mean', 'median', second_to_last])\n", 1997 | " .unstack()\n", 1998 | ")" 1999 | ] 2000 | }, 2001 | { 2002 | "cell_type": "code", 2003 | "execution_count": null, 2004 | "metadata": { 2005 | "scrolled": true 2006 | }, 2007 | "outputs": [], 2008 | "source": [ 2009 | "# multiple aggregates\n", 2010 | "\n", 2011 | "(twit_df\n", 2012 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n", 2013 | " [['impressions', 'replies']]\n", 2014 | " .agg(['mean', 'median', second_to_last])\n", 2015 | " .unstack()\n", 2016 | " .impressions\n", 2017 | ")" 2018 | ] 2019 | }, 2020 | { 2021 | "cell_type": "code", 2022 | "execution_count": null, 2023 | "metadata": { 2024 | "scrolled": true 2025 | }, 2026 | "outputs": [], 2027 | "source": [ 2028 | "# multiple aggregates\n", 2029 | "(twit_df\n", 2030 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n", 2031 | " [['impressions', 'replies']]\n", 2032 | " .agg(['mean', 'median', second_to_last])\n", 2033 | " .unstack()\n", 2034 | " .impressions\n", 2035 | " ['mean'] # note have to use index syntax here\n", 2036 | ")" 2037 | ] 2038 | }, 2039 | { 2040 | "cell_type": "code", 2041 | "execution_count": null, 2042 | "metadata": { 2043 | "scrolled": true 2044 | }, 2045 | "outputs": [], 2046 | "source": [ 2047 | "# multiple aggregates\n", 2048 | "(twit_df\n", 2049 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n", 2050 | " [['impressions', 'replies']]\n", 2051 | " .agg(['mean', 'median', second_to_last])\n", 2052 | " .unstack()\n", 
2053 | " .impressions\n", 2054 | " .mean # attribute access returns the DataFrame.mean method, not the 'mean' column; use ['mean'] instead\n", 2055 | ")" 2056 | ] 2057 | }, 2058 | { 2059 | "cell_type": "code", 2060 | "execution_count": null, 2061 | "metadata": { 2062 | "scrolled": true 2063 | }, 2064 | "outputs": [], 2065 | "source": [ 2066 | "# multiple aggregates\n", 2067 | "(twit_df\n", 2068 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n", 2069 | " [['impressions', 'replies']]\n", 2070 | " .agg(['mean', 'median', second_to_last])\n", 2071 | " .unstack()\n", 2072 | " .impressions\n", 2073 | " ['mean']\n", 2074 | " .plot()\n", 2075 | ")" 2076 | ] 2077 | }, 2078 | { 2079 | "cell_type": "code", 2080 | "execution_count": null, 2081 | "metadata": { 2082 | "scrolled": false 2083 | }, 2084 | "outputs": [], 2085 | "source": [ 2086 | "# multiple aggregates\n", 2087 | "# dealing with missing values\n", 2088 | "(twit_df\n", 2089 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n", 2090 | " [['impressions', 'replies']]\n", 2091 | " .agg(['mean', 'median', second_to_last])\n", 2092 | " .unstack()\n", 2093 | " .impressions\n", 2094 | " ['mean']\n", 2095 | " #.fillna(0)\n", 2096 | " #.interpolate()\n", 2097 | " #.bfill()\n", 2098 | " #.dropna()\n", 2099 | " .loc['2021/07':'2021/08']\n", 2100 | " #.plot()\n", 2101 | ")" 2102 | ] 2103 | }, 2104 | { 2105 | "cell_type": "code", 2106 | "execution_count": null, 2107 | "metadata": { 2108 | "scrolled": true 2109 | }, 2110 | "outputs": [], 2111 | "source": [ 2112 | "# multiple aggregates\n", 2113 | "(twit_df\n", 2114 | " .groupby([pd.Grouper(key='time', freq='3d'), 'is_unicode'])\n", 2115 | " [['impressions', 'replies']]\n", 2116 | " .agg(['mean', 'median', second_to_last])\n", 2117 | " .unstack()\n", 2118 | " .impressions\n", 2119 | " ['mean']\n", 2120 | " .interpolate()\n", 2121 | " .rolling(7)\n", 2122 | " .mean()\n", 2123 | " .plot()\n", 2124 | ")" 2125 | ] 2126 | }, 2127 | { 2128 | "cell_type": "code", 2129 | "execution_count": null, 2130 | "metadata": { 
2131 | "scrolled": true 2132 | }, 2133 | "outputs": [], 2134 | "source": [ 2135 | "# named aggregation\n", 2136 | "\n", 2137 | "(twit_df\n", 2138 | " .groupby([pd.Grouper(key='time', freq='M'), 'is_unicode'])\n", 2139 | " .agg(total_views=('impressions', 'sum'),\n", 2140 | " mean_views=('impressions', 'mean'),\n", 2141 | " profile_clicks=('user_profile_clicks', lambda ser: ser.sum()))\n", 2142 | ")" 2143 | ] 2144 | }, 2145 | { 2146 | "cell_type": "code", 2147 | "execution_count": null, 2148 | "metadata": { 2149 | "scrolled": true 2150 | }, 2151 | "outputs": [], 2152 | "source": [ 2153 | "# named aggregation - fails with resample\n", 2154 | "\n", 2155 | "(twit_df\n", 2156 | " #.groupby([pd.Grouper(key='time', freq='M'), 'is_unicode'])\n", 2157 | " .set_index('time')\n", 2158 | " .resample('M')\n", 2159 | " .agg(total_views=('impressions', 'sum'),\n", 2160 | " mean_views=('impressions', 'mean'),\n", 2161 | " profile_clicks=('user_profile_clicks', lambda ser: ser.sum()))\n", 2162 | ")" 2163 | ] 2164 | }, 2165 | { 2166 | "cell_type": "code", 2167 | "execution_count": null, 2168 | "metadata": { 2169 | "lines_to_next_cell": 0, 2170 | "scrolled": false 2171 | }, 2172 | "outputs": [], 2173 | "source": [ 2174 | "# named aggregation\n", 2175 | "\n", 2176 | "(twit_df\n", 2177 | " .groupby([pd.Grouper(key='time', freq='M'), 'is_unicode'])\n", 2178 | " .agg(total_views=('impressions', 'sum'),\n", 2179 | " mean_views=('impressions', 'mean'),\n", 2180 | " profile_clicks=('user_profile_clicks', lambda ser: ser.sum()))\n", 2181 | " .unstack()\n", 2182 | " .profile_clicks\n", 2183 | " .plot()\n", 2184 | ")" 2185 | ] 2186 | }, 2187 | { 2188 | "cell_type": "markdown", 2189 | "metadata": {}, 2190 | "source": [ 2191 | "## Aggregation Exercise\n", 2192 | "* What were the total impressions for each year?\n", 2193 | "* What were the total impressions for each month?\n", 2194 | "* Plot the previous\n", 2195 | "* What were the total impressions for unicode and non-unicode tweets for each 
month?\n", 2196 | "* Plot the previous\n", 2197 | "* What were the total impressions for reply and non-reply tweets for each month?\n", 2198 | "* Plot the previous" 2199 | ] 2200 | }, 2201 | { 2202 | "cell_type": "code", 2203 | "execution_count": null, 2204 | "metadata": {}, 2205 | "outputs": [], 2206 | "source": [] 2207 | }, 2208 | { 2209 | "cell_type": "code", 2210 | "execution_count": null, 2211 | "metadata": {}, 2212 | "outputs": [], 2213 | "source": [] 2214 | }, 2215 | { 2216 | "cell_type": "code", 2217 | "execution_count": null, 2218 | "metadata": {}, 2219 | "outputs": [], 2220 | "source": [] 2221 | }, 2222 | { 2223 | "cell_type": "code", 2224 | "execution_count": null, 2225 | "metadata": {}, 2226 | "outputs": [], 2227 | "source": [] 2228 | }, 2229 | { 2230 | "cell_type": "markdown", 2231 | "metadata": {}, 2232 | "source": [ 2233 | "## Summary\n", 2234 | "\n", 2235 | "* Correct types save space and enable convenient math, string, and date functionality\n", 2236 | "* Chaining operations will:\n", 2237 | " * Make code readable\n", 2238 | " * Remove bugs\n", 2239 | " * Make code easier to debug\n", 2240 | "* Don't mutate (there's no point). Embrace chaining.\n", 2241 | "* ``.apply`` is slow for math\n", 2242 | "* Aggregations are powerful. 
Play with them until they make sense\n", 2243 | "\n", 2244 | "Connect with me on LinkedIn or Twitter (@\\_\\_mharrison\\_\\_)" 2245 | ] 2246 | }, 2247 | { 2248 | "cell_type": "code", 2249 | "execution_count": null, 2250 | "metadata": {}, 2251 | "outputs": [], 2252 | "source": [] 2253 | } 2254 | ], 2255 | "metadata": { 2256 | "jupytext": { 2257 | "encoding": "# -*- coding: utf-8 -*-", 2258 | "formats": "ipynb,py:light" 2259 | }, 2260 | "kernelspec": { 2261 | "display_name": "Python 3", 2262 | "language": "python", 2263 | "name": "python3" 2264 | }, 2265 | "language_info": { 2266 | "codemirror_mode": { 2267 | "name": "ipython", 2268 | "version": 3 2269 | }, 2270 | "file_extension": ".py", 2271 | "mimetype": "text/x-python", 2272 | "name": "python", 2273 | "nbconvert_exporter": "python", 2274 | "pygments_lexer": "ipython3", 2275 | "version": "3.8.5" 2276 | } 2277 | }, 2278 | "nbformat": 4, 2279 | "nbformat_minor": 4 2280 | } 2281 | --------------------------------------------------------------------------------