├── README.md
└── Idiomatic Pandas.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Idiomatic-Pandas-Tutorial
2 | Pandas Training © MetaSnake 2022, CC BY-NC
3 |
4 | ## Install
5 |
6 | * Clone this repo
7 | * Use a notebook environment:
8 |   * Jupyter (local install) - see https://www.metasnake.com/blog/pydata-dev.html for help
9 |   * Colab (requires Google access)
10 |
11 | ## For more Pandas help
12 | Check out my book, Effective Pandas, at https://store.metasnake.com
13 |
--------------------------------------------------------------------------------
/Idiomatic Pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Idiomatic Pandas\n",
8 | "\n",
9 | "© MetaSnake 2022, CC BY-NC"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {
16 | "lines_to_next_cell": 0
17 | },
18 | "outputs": [],
19 | "source": [
20 | "import glob\n",
21 | "\n",
22 | "import matplotlib.pyplot as plt\n",
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {
31 | "lines_to_next_cell": 2
32 | },
33 | "outputs": [],
34 | "source": [
35 | "!pip install pandas matplotlib"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "pd.__version__"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {
51 | "scrolled": true
52 | },
53 | "outputs": [],
54 | "source": [
55 | "pd.show_versions()"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "## Loading Data"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "!ls *.csv"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "data = [pd.read_csv(f, parse_dates=['time'], na_values='-') for f in glob.glob('tweet_activity_metrics___mharrison___*')]\n",
81 | "df = pd.concat(data, ignore_index=True).sort_values('time')\n",
82 | "df"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "df.to_csv('__mharrison__2020-2021.csv', index=False)"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {
98 | "scrolled": true
99 | },
100 | "outputs": [],
101 | "source": [
102 | "pd.read_csv('__mharrison__2020-2021.csv')"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": []
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": []
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": []
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "## Load data from Web"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "url = 'https://github.com/mattharrison/datasets/raw/master/data/__mharrison__2020-2021.csv'\n",
140 | "df = pd.read_csv(url, parse_dates=['time'])"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {
147 | "scrolled": true
148 | },
149 | "outputs": [],
150 | "source": [
151 | "df"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "## Load Data Exercise\n",
159 | "\n",
160 | "* Load the data using the cell above.\n",
161 | "* If you can't do this, please let the instructor know!"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": []
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": []
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": []
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": []
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "## Exploring\n",
197 | "\n",
198 | "Definitions\n",
199 | "\n",
200 | "* *Impressions* - Number of times people saw the tweet\n",
201 | "* *Engagements* - Number of \"interactions\" (clicks, replies, retweets, likes)\n",
202 | "* *Engagement rate* - Engagements divided by impressions"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": null,
208 | "metadata": {
209 | "scrolled": true
210 | },
211 | "outputs": [],
212 | "source": [
213 | "df.T"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "df.shape"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "df.dtypes"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "pd.options.display.max_columns"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {
247 | "scrolled": true
248 | },
249 | "outputs": [],
250 | "source": [
251 | "from IPython.display import display\n",
252 | "with pd.option_context('display.max_columns', 240):\n",
253 | " display(df)"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {
260 | "scrolled": true
261 | },
262 | "outputs": [],
263 | "source": [
264 | "df.isna().sum()"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "metadata": {},
270 | "source": [
271 | "## Explore Exercise\n",
272 | "* Use `.describe` to view the summary statistics\n",
273 | "* Use `.corr` to view column correlations"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": []
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {},
287 | "outputs": [],
288 | "source": []
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {},
294 | "outputs": [],
295 | "source": []
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": null,
300 | "metadata": {},
301 | "outputs": [],
302 | "source": []
303 | },
304 | {
305 | "cell_type": "markdown",
306 | "metadata": {},
307 | "source": [
308 | "## Types"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {
315 | "scrolled": true
316 | },
317 | "outputs": [],
318 | "source": [
319 | "df.dtypes"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": null,
325 | "metadata": {
326 | "scrolled": true
327 | },
328 | "outputs": [],
329 | "source": [
330 | "df.memory_usage()"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": null,
336 | "metadata": {
337 | "scrolled": true
338 | },
339 | "outputs": [],
340 | "source": [
341 | "df.memory_usage(deep=True)"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": null,
347 | "metadata": {},
348 | "outputs": [],
349 | "source": [
350 | "df.memory_usage(deep=True).sum()"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {
357 | "scrolled": true
358 | },
359 | "outputs": [],
360 | "source": [
361 | "(df\n",
362 | " .select_dtypes(int).describe()\n",
363 | ")"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {
370 | "scrolled": true
371 | },
372 | "outputs": [],
373 | "source": [
374 | "(df\n",
375 | " #.select_dtypes(float)\n",
376 | " .select_dtypes('float64')\n",
377 | " .describe()\n",
378 | ")"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "(df\n",
388 | " .impressions\n",
389 | " .astype(int))"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": null,
395 | "metadata": {},
396 | "outputs": [],
397 | "source": [
398 | "df.assign?"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "metadata": {
405 | "scrolled": true
406 | },
407 | "outputs": [],
408 | "source": [
409 | "(df\n",
410 | " .assign(impressions=df.impressions.astype(int),\n",
411 | " engagements=df.engagements.astype(int)\n",
412 | " # lots of this here\n",
413 | " )\n",
414 | ")"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {
421 | "scrolled": true
422 | },
423 | "outputs": [],
424 | "source": [
425 | "# also note\n",
426 | "(df\n",
427 | " .assign(impressions=df.impressions.astype(int),\n",
428 | " engagement rate=df.engagements rate.astype(int)\n",
429 | " # lots of this here\n",
430 | " )\n",
431 | ")"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": null,
437 | "metadata": {
438 | "scrolled": true
439 | },
440 | "outputs": [],
441 | "source": [
442 | "# fix names\n",
443 | "(df\n",
444 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
445 | ")"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": null,
451 | "metadata": {
452 | "scrolled": true
453 | },
454 | "outputs": [],
455 | "source": [
456 | "df.filter(regex=r'promoted')"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": null,
462 | "metadata": {
463 | "scrolled": true
464 | },
465 | "outputs": [],
466 | "source": [
467 | "(df\n",
468 | " .drop(columns=[c for c in df.columns if 'promoted' in c])\n",
469 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
470 | " .describe()\n",
471 | ")"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "metadata": {},
478 | "outputs": [],
479 | "source": [
480 | "# be careful with renaming\n",
481 | "(df\n",
482 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
483 | " .drop(columns=[c for c in df.columns if 'promoted' in c])\n",
484 | ")"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": [
493 | "df.drop?"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {
500 | "scrolled": true
501 | },
502 | "outputs": [],
503 | "source": [
504 | "def drop_col(df_, pattern):\n",
505 | " return df_.drop(columns=[c for c in df_.columns if pattern in c])\n",
506 | "\n",
507 | "(df\n",
508 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
509 | " #.pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
510 | " .pipe(drop_col, pattern='promoted')\n",
511 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
512 | ")"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": null,
518 | "metadata": {
519 | "lines_to_next_cell": 0,
520 | "scrolled": true
521 | },
522 | "outputs": [],
523 | "source": [
524 | "\n",
525 | "(df\n",
526 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
527 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
528 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
529 | " .memory_usage(deep=True)\n",
530 | " .sum() # 3 megs\n",
531 | ")"
532 | ]
533 | },
534 | {
535 | "cell_type": "code",
536 | "execution_count": null,
537 | "metadata": {
538 | "lines_to_next_cell": 0
539 | },
540 | "outputs": [],
541 | "source": [
542 | "df.pipe?"
543 | ]
544 | },
545 | {
546 | "cell_type": "markdown",
547 | "metadata": {},
548 | "source": [
549 | "## Column Cleanup Exercise\n",
550 | "(Please don't mutate here!)\n",
551 | "\n",
552 | "* Use `.loc` to select the *impressions* and *engagements* columns\n",
553 | "* Use `.drop` to remove the *impressions* and *engagements* columns\n",
554 | "* Use `.rename` to rename *impressions* to *imp* and *engagements* to *eng*"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": null,
560 | "metadata": {},
561 | "outputs": [],
562 | "source": []
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": null,
567 | "metadata": {},
568 | "outputs": [],
569 | "source": []
570 | },
571 | {
572 | "cell_type": "code",
573 | "execution_count": null,
574 | "metadata": {},
575 | "outputs": [],
576 | "source": []
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": null,
581 | "metadata": {},
582 | "outputs": [],
583 | "source": []
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": null,
588 | "metadata": {
589 | "lines_to_next_cell": 2
590 | },
591 | "outputs": [],
592 | "source": []
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": null,
597 | "metadata": {},
598 | "outputs": [],
599 | "source": []
600 | },
601 | {
602 | "cell_type": "markdown",
603 | "metadata": {},
604 | "source": [
605 | "## Ok, Types for real"
606 | ]
607 | },
608 | {
609 | "cell_type": "code",
610 | "execution_count": null,
611 | "metadata": {
612 | "scrolled": false
613 | },
614 | "outputs": [],
615 | "source": [
616 | "\n",
617 | "(df\n",
618 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
619 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
620 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
621 | " .describe()\n",
622 | ")"
623 | ]
624 | },
625 | {
626 | "cell_type": "code",
627 | "execution_count": null,
628 | "metadata": {},
629 | "outputs": [],
630 | "source": [
631 | "np.iinfo('int64')"
632 | ]
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": null,
637 | "metadata": {
638 | "scrolled": true
639 | },
640 | "outputs": [],
641 | "source": [
642 | "for size in ['uint8', 'uint16', 'uint32', 'int8', 'int16', 'int32', 'int64']:\n",
643 | " print(f'{size=} {np.iinfo(size)}')"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": null,
649 | "metadata": {
650 | "scrolled": true
651 | },
652 | "outputs": [],
653 | "source": [
654 | "\n",
655 | "(df\n",
656 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
657 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
658 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
659 | " .assign(impressions=df.impressions.astype('uint32'),\n",
660 | " engagements=df.engagements.astype('uint16'),\n",
661 | " )\n",
662 | " .describe()\n",
663 | ")"
664 | ]
665 | },
666 | {
667 | "cell_type": "code",
668 | "execution_count": null,
669 | "metadata": {
670 | "scrolled": true
671 | },
672 | "outputs": [],
673 | "source": [
674 | "kwargs = {}\n",
675 | "for col in df.select_dtypes(float).columns:\n",
676 | " print(col)\n",
677 | " kwargs[col] = df[col].astype(int)\n",
678 | "kwargs"
679 | ]
680 | },
681 | {
682 | "cell_type": "code",
683 | "execution_count": null,
684 | "metadata": {
685 | "scrolled": true
686 | },
687 | "outputs": [],
688 | "source": [
689 | "# use dict comp if you don't want to type every column\n",
690 | "# assign w/ dict comp. and lambda\n",
691 | "(df\n",
692 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
693 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
694 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
695 | " .assign(impressions=df.impressions.astype('uint32'),\n",
696 | " engagements=df.engagements.astype('uint16'),\n",
697 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']} # less than 255\n",
698 | " )\n",
699 | ")"
700 | ]
701 | },
702 | {
703 | "cell_type": "code",
704 | "execution_count": null,
705 | "metadata": {
706 | "scrolled": true
707 | },
708 | "outputs": [],
709 | "source": [
710 | "# why c=c?\n",
711 | "(df\n",
712 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
713 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
714 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
715 | " .assign(impressions=df.impressions.astype('uint32'),\n",
716 | " engagements=df.engagements.astype('uint16'),\n",
717 | " **{c:lambda df_:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
718 | " **{c:lambda df_:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
719 | " 'detail_expands', 'media_views', 'media_engagements']} # less than 65,535\n",
720 | " )\n",
721 | " #.corr()\n",
722 | " .describe()\n",
723 | ")"
724 | ]
725 | },
726 | {
727 | "cell_type": "code",
728 | "execution_count": null,
729 | "metadata": {},
730 | "outputs": [],
731 | "source": [
732 | "# https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result\n",
733 | "squares = []\n",
734 | "for x in range(5):\n",
735 | " squares.append(lambda: x**2)\n",
736 | "for s in squares:\n",
737 | " print(s())"
738 | ]
739 | },
740 | {
741 | "cell_type": "code",
742 | "execution_count": null,
743 | "metadata": {},
744 | "outputs": [],
745 | "source": [
746 | "# https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result\n",
747 | "squares = []\n",
748 | "for x in range(5):\n",
749 | " squares.append(lambda x=x: x**2)\n",
750 | "for s in squares:\n",
751 | " print(s())"
752 | ]
753 | },
754 | {
755 | "cell_type": "code",
756 | "execution_count": null,
757 | "metadata": {
758 | "scrolled": true
759 | },
760 | "outputs": [],
761 | "source": [
762 | "(df\n",
763 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
764 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
765 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
766 | " .assign(impressions=df.impressions.astype('uint32'),\n",
767 | " engagements=df.engagements.astype('uint16'),\n",
768 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
769 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
770 | " 'detail_expands', 'media_views', 'media_engagements']} # less than 65,535\n",
771 | " )\n",
772 | " .describe()\n",
773 | ")"
774 | ]
775 | },
776 | {
777 | "cell_type": "code",
778 | "execution_count": null,
779 | "metadata": {
780 | "scrolled": true
781 | },
782 | "outputs": [],
783 | "source": [
784 | "(df\n",
785 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
786 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
787 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
788 | " .assign(impressions=df.impressions.astype('uint32'),\n",
789 | " engagements=df.engagements.astype('uint16'),\n",
790 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
791 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
792 | " 'detail_expands', 'media_views', 'media_engagements']} # less than 65,535\n",
793 | " \n",
794 | " )\n",
795 | " .memory_usage(deep=True) \n",
796 | " .sum() # was 3 megs\n",
797 | ")"
798 | ]
799 | },
800 | {
801 | "cell_type": "code",
802 | "execution_count": null,
803 | "metadata": {
804 | "scrolled": false
805 | },
806 | "outputs": [],
807 | "source": [
808 | "# most is from text\n",
809 | "(df\n",
810 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
811 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
812 | " .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
813 | " .assign(impressions=df.impressions.astype('uint32'),\n",
814 | " engagements=df.engagements.astype('uint16'),\n",
815 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
816 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
817 | " 'detail_expands', 'media_views', 'media_engagements']} # less than 65,535\n",
818 | " \n",
819 | " )\n",
820 | " .memory_usage(deep=True) \n",
821 | " .pipe(lambda ser: ser/ser.sum()*100)\n",
822 | "# .sum() # was 3 megs\n",
823 | ")"
824 | ]
825 | },
826 | {
827 | "cell_type": "code",
828 | "execution_count": null,
829 | "metadata": {
830 | "lines_to_next_cell": 2,
831 | "scrolled": false
832 | },
833 | "outputs": [],
834 | "source": [
835 | "# convert first part of permalink to category and add back tweet_id\n",
836 | "(df\n",
837 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
838 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
839 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
840 | " .assign(impressions=df.impressions.astype('uint32'),\n",
841 | " engagements=df.engagements.astype('uint16'),\n",
842 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
843 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
844 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n",
845 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n",
846 | " index=df_.index),\n",
847 | " )\n",
848 | " .memory_usage(deep=True) \n",
849 | " .sum() # was 3 megs\n",
850 | ")"
851 | ]
852 | },
853 | {
854 | "cell_type": "code",
855 | "execution_count": null,
856 | "metadata": {
857 | "lines_to_next_cell": 2
858 | },
859 | "outputs": [],
860 | "source": []
861 | },
862 | {
863 | "cell_type": "code",
864 | "execution_count": null,
865 | "metadata": {
866 | "lines_to_next_cell": 2,
867 | "scrolled": true
868 | },
869 | "outputs": [],
870 | "source": [
871 | "# convert first part of permalink to category and add back tweet_id\n",
872 | "(df\n",
873 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
874 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
875 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
876 | " .assign(impressions=df.impressions.astype('uint32'),\n",
877 | " engagements=df.engagements.astype('uint16'),\n",
878 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
879 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
880 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n",
881 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n",
882 | " index=df_.index),\n",
883 | " )\n",
884 | " .describe()\n",
885 | " #.memory_usage(deep=True) \n",
886 | " #.sum() # was 3 megs\n",
887 | ")"
888 | ]
889 | },
890 | {
891 | "cell_type": "code",
892 | "execution_count": null,
893 | "metadata": {},
894 | "outputs": [],
895 | "source": []
896 | },
897 | {
898 | "cell_type": "markdown",
899 | "metadata": {},
900 | "source": [
901 | "## Alternate Integer Conversion Exercise\n",
902 | "(Again, no mutation!)\n",
903 | "\n",
904 | "* Use `.select_dtypes` to filter all `int` columns from `df`\n",
905 | "* Use `.astype` with above to convert all columns to `uint8`\n",
906 | "* Use `.assign` with above to create new dataframe with updated integer columns"
907 | ]
908 | },
909 | {
910 | "cell_type": "code",
911 | "execution_count": null,
912 | "metadata": {},
913 | "outputs": [],
914 | "source": []
915 | },
916 | {
917 | "cell_type": "code",
918 | "execution_count": null,
919 | "metadata": {},
920 | "outputs": [],
921 | "source": []
922 | },
923 | {
924 | "cell_type": "code",
925 | "execution_count": null,
926 | "metadata": {},
927 | "outputs": [],
928 | "source": []
929 | },
930 | {
931 | "cell_type": "code",
932 | "execution_count": null,
933 | "metadata": {},
934 | "outputs": [],
935 | "source": []
936 | },
937 | {
938 | "cell_type": "markdown",
939 | "metadata": {},
940 | "source": [
941 | "## Other Types\n",
942 | "We can apply similar logic to floats and strings.\n",
943 | "\n",
944 | "Converting \"Tweet_text\" to a category doesn't make sense because the column has high cardinality."
945 | ]
946 | },
947 | {
948 | "cell_type": "code",
949 | "execution_count": null,
950 | "metadata": {
951 | "scrolled": false
952 | },
953 | "outputs": [],
954 | "source": [
955 | "# Uses MORE memory if tweet text is a category!\n",
956 | "(df\n",
957 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
958 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
959 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
960 | " .assign(impressions=df.impressions.astype('uint32'),\n",
961 | " engagements=df.engagements.astype('uint16'),\n",
962 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
963 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
964 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n",
965 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n",
966 | " index=df_.index),\n",
967 | " Tweet_text=lambda df_:df_.Tweet_text.astype('category')\n",
968 | " )\n",
969 | " .memory_usage(deep=True) \n",
970 | " .sum() # was 3 megs\n",
971 | ")"
972 | ]
973 | },
974 | {
975 | "cell_type": "markdown",
976 | "metadata": {},
977 | "source": [
978 | "## Other types Exercise\n",
979 | "* Use the `%%timeit` cell magic to see how long it takes to run `.str.lower()` on the original *Tweet permalink* column\n",
980 | "* Create a new dataframe, `df2`, with our current chain\n",
981 | "* Use the `%%timeit` cell magic to see how long it takes to run `.str.lower()` on the *df2.Tweet_permalink* column"
982 | ]
983 | },
984 | {
985 | "cell_type": "code",
986 | "execution_count": null,
987 | "metadata": {},
988 | "outputs": [],
989 | "source": []
990 | },
991 | {
992 | "cell_type": "code",
993 | "execution_count": null,
994 | "metadata": {},
995 | "outputs": [],
996 | "source": []
997 | },
998 | {
999 | "cell_type": "code",
1000 | "execution_count": null,
1001 | "metadata": {},
1002 | "outputs": [],
1003 | "source": []
1004 | },
1005 | {
1006 | "cell_type": "code",
1007 | "execution_count": null,
1008 | "metadata": {},
1009 | "outputs": [],
1010 | "source": []
1011 | },
1012 | {
1013 | "cell_type": "code",
1014 | "execution_count": null,
1015 | "metadata": {},
1016 | "outputs": [],
1017 | "source": []
1018 | },
1019 | {
1020 | "cell_type": "markdown",
1021 | "metadata": {},
1022 | "source": [
1023 | "## Dates"
1024 | ]
1025 | },
1026 | {
1027 | "cell_type": "code",
1028 | "execution_count": null,
1029 | "metadata": {
1030 | "scrolled": true
1031 | },
1032 | "outputs": [],
1033 | "source": [
1034 | "(df\n",
1035 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
1036 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
1037 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
1038 | " .assign(impressions=df.impressions.astype('uint32'),\n",
1039 | " engagements=df.engagements.astype('uint16'),\n",
1040 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
1041 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
1042 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n",
1043 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n",
1044 | " index=df_.index),\n",
1045 | " )\n",
1046 | " .time\n",
1047 | ")"
1048 | ]
1049 | },
1050 | {
1051 | "cell_type": "code",
1052 | "execution_count": null,
1053 | "metadata": {
1054 | "scrolled": false
1055 | },
1056 | "outputs": [],
1057 | "source": [
1058 | "# Convert to Local Time (already in UTC)\n",
1059 | "(df\n",
1060 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
1061 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
1062 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
1063 | " .astype({c:'uint8' for c in ['replies', 'hashtag_clicks', 'follows']}) # less than 255)\n",
1064 | " .assign(impressions=df.impressions.astype('uint32'),\n",
1065 | " engagements=df.engagements.astype('uint16'),\n",
1066 | " #**{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
1067 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
1068 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n",
1069 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n",
1070 | " index=df_.index),\n",
1071 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n",
1072 | " )\n",
1073 | " .time\n",
1074 | ")"
1075 | ]
1076 | },
1077 | {
1078 | "cell_type": "markdown",
1079 | "metadata": {},
1080 | "source": [
1081 | "## Dates Exercise\n",
1082 | "* Create a series with the months of the *time* column\n",
1083 | "* Convert the *time* column to UTC\n",
1084 | "* Convert the *time* column to `America/New_York`"
1085 | ]
1086 | },
1087 | {
1088 | "cell_type": "code",
1089 | "execution_count": null,
1090 | "metadata": {},
1091 | "outputs": [],
1092 | "source": []
1093 | },
1094 | {
1095 | "cell_type": "code",
1096 | "execution_count": null,
1097 | "metadata": {},
1098 | "outputs": [],
1099 | "source": []
1100 | },
1101 | {
1102 | "cell_type": "code",
1103 | "execution_count": null,
1104 | "metadata": {},
1105 | "outputs": [],
1106 | "source": []
1107 | },
1108 | {
1109 | "cell_type": "code",
1110 | "execution_count": null,
1111 | "metadata": {},
1112 | "outputs": [],
1113 | "source": []
1114 | },
1115 | {
1116 | "cell_type": "markdown",
1117 | "metadata": {},
1118 | "source": [
1119 | "## Chain\n",
1120 | "\n",
1121 | "Chaining is also called \"flow\" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.\n",
1122 | "\n",
1123 | "The chain should read like a recipe of ordered steps.\n",
1124 | "\n",
1125 | "(BTW, this is actually what we did above.)\n",
1126 | "\n",
1127 | "<div class='alert alert-warning'>\n",
1128 | "    Hint: Leverage <code>.pipe</code> if you can't find a way to chain 😉🐼💪\n",
1129 | "</div>"
1130 | ]
1131 | },
1132 | {
1133 | "cell_type": "code",
1134 | "execution_count": null,
1135 | "metadata": {},
1136 | "outputs": [],
1137 | "source": [
1138 | "# convert to a function\n",
1139 | "def tweak_twitter(df):\n",
1140 | " return (df\n",
1141 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
1142 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
1143 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
1144 | " .assign(impressions=df.impressions.astype('uint32'),\n",
1145 | " engagements=df.engagements.astype('uint16'),\n",
1146 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
1147 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
1148 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n",
1149 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n",
1150 | " index=df_.index),\n",
1151 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n",
1152 | " )\n",
1153 | " )"
1154 | ]
1155 | },
1156 | {
1157 | "cell_type": "code",
1158 | "execution_count": null,
1159 | "metadata": {},
1160 | "outputs": [],
1161 | "source": []
1162 | },
1163 | {
1164 | "cell_type": "code",
1165 | "execution_count": null,
1166 | "metadata": {},
1167 | "outputs": [],
1168 | "source": [
1169 | "# I would want my notebook to start off like this:\n",
1170 | "import glob\n",
1171 | "\n",
1172 | "import numpy as np\n",
1173 | "import pandas as pd\n",
1174 | "\n",
1175 | "data = [pd.read_csv(f, parse_dates=['time'], na_values='-') for f in glob.glob('tweet_activity_metrics___mharrison___*')]\n",
1176 | "df = pd.concat(data, ignore_index=True).sort_values('time')"
1177 | ]
1178 | },
1179 | {
1180 | "cell_type": "code",
1181 | "execution_count": null,
1182 | "metadata": {},
1183 | "outputs": [],
1184 | "source": [
1185 | "def tweak_twitter(df):\n",
1186 | " return (df\n",
1187 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
1188 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
1189 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
1190 | " .assign(impressions=df.impressions.astype('uint32'),\n",
1191 | " engagements=df.engagements.astype('uint16'),\n",
1192 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
1193 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
1194 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n",
1195 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n",
1196 | " index=df_.index),\n",
1197 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n",
1198 | " )\n",
1199 | " )\n",
1200 | "twit_df = tweak_twitter(df)"
1201 | ]
1202 | },
1203 | {
1204 | "cell_type": "code",
1205 | "execution_count": null,
1206 | "metadata": {
1207 | "lines_to_next_cell": 2,
1208 | "scrolled": true
1209 | },
1210 | "outputs": [],
1211 | "source": [
1212 | "# compare with non-chain\n",
1213 | "df1 = df.rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
1214 | "keep = [c for c in df1.columns if 'promoted' not in c]\n",
1215 | "df2 = df1[keep]\n",
1216 | "keep2 = [c for c in df2 if c not in ['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone']]\n",
1217 | "df3 = df2[keep2]\n",
1218 | "imps = df3.impressions.astype('uint32')\n",
1219 | "df3.impressions = imps\n",
1220 | "eng = df3.engagements.astype('uint16')\n",
1221 | "df3['engagements'] = eng\n",
1222 | "df3['replies'] = df3.replies.astype('uint8')\n",
1223 | "df3['hashtag_clicks'] = df3.hashtag_clicks.astype('uint8')"
1224 | ]
1225 | },
1226 | {
1227 | "cell_type": "code",
1228 | "execution_count": null,
1229 | "metadata": {
1230 | "scrolled": true
1231 | },
1232 | "outputs": [],
1233 | "source": [
1234 | "# easy to debug\n",
1235 | "# - assign to var (renamed_df)\n",
1236 | "# - comment out\n",
1237 | "# - pipe to display\n",
1238 | "\n",
1239 | "from IPython.display import display\n",
1240 | "\n",
1241 | "def get_var(df, var_name):\n",
1242 | " globals()[var_name] = df\n",
1243 | " return df\n",
1244 | "\n",
1245 | "def tweak_twitter(df):\n",
1246 | " return (df\n",
1247 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
1248 | " .pipe(get_var, 'renamed_df')\n",
1249 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
1250 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
1251 | " .pipe(lambda df_:display(df_) or df_)\n",
1252 | " .assign(impressions=df.impressions.astype('uint32'),\n",
1253 | " engagements=df.engagements.astype('uint16'),\n",
1254 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
1255 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
1256 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n",
1257 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n",
1258 | " index=df_.index),\n",
1259 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n",
1260 | " )\n",
1261 | " )\n",
1262 | "twit_df = tweak_twitter(df)"
1263 | ]
1264 | },
1265 | {
1266 | "cell_type": "code",
1267 | "execution_count": null,
1268 | "metadata": {},
1269 | "outputs": [],
1270 | "source": [
1271 | "renamed_df"
1272 | ]
1273 | },
1274 | {
1275 | "cell_type": "code",
1276 | "execution_count": null,
1277 | "metadata": {},
1278 | "outputs": [],
1279 | "source": []
1280 | },
1281 | {
1282 | "cell_type": "code",
1283 | "execution_count": null,
1284 | "metadata": {},
1285 | "outputs": [],
1286 | "source": []
1287 | },
1288 | {
1289 | "cell_type": "code",
1290 | "execution_count": null,
1291 | "metadata": {
1292 | "lines_to_next_cell": 2
1293 | },
1294 | "outputs": [],
1295 | "source": []
1296 | },
1297 | {
1298 | "cell_type": "code",
1299 | "execution_count": null,
1300 | "metadata": {},
1301 | "outputs": [],
1302 | "source": [
1303 | "def tweak_twitter(df):\n",
1304 | " return (df\n",
1305 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
1306 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
1307 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
1308 | " .assign(impressions=df.impressions.astype('uint32'),\n",
1309 | " engagements=df.engagements.astype('uint16'),\n",
1310 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
1311 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
1312 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n",
1313 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n",
1314 | " index=df_.index),\n",
1315 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n",
1316 | " )\n",
1317 | " )\n",
1318 | "twit_df = tweak_twitter(df)"
1319 | ]
1320 | },
1321 | {
1322 | "cell_type": "markdown",
1323 | "metadata": {},
1324 | "source": [
1325 | "## Chain Exercise\n",
1326 | "* Use `.pipe` to print the shape of the dataframe after every step in the chain of the `tweak_twitter` function"
1327 | ]
1328 | },
1329 | {
1330 | "cell_type": "code",
1331 | "execution_count": null,
1332 | "metadata": {},
1333 | "outputs": [],
1334 | "source": []
1335 | },
1336 | {
1337 | "cell_type": "code",
1338 | "execution_count": null,
1339 | "metadata": {},
1340 | "outputs": [],
1341 | "source": []
1342 | },
1343 | {
1344 | "cell_type": "code",
1345 | "execution_count": null,
1346 | "metadata": {},
1347 | "outputs": [],
1348 | "source": []
1349 | },
1350 | {
1351 | "cell_type": "code",
1352 | "execution_count": null,
1353 | "metadata": {},
1354 | "outputs": [],
1355 | "source": []
1356 | },
1357 | {
1358 | "cell_type": "markdown",
1359 | "metadata": {},
1360 | "source": [
1361 | "## Don't Mutate\n",
1362 | "\n",
1363 | "> \"you are missing the point, inplace rarely actually does something inplace, you are thinking that you are saving memory but you are not.\"\n",
1364 | ">\n",
1365 | "> **jreback** - Pandas core dev\n",
1366 | "\n",
1367 | "\n",
1368 | "\n",
1369 | "https://github.com/pandas-dev/pandas/issues/16529#issuecomment-676518136\n",
1370 | "\n",
1371 | "* In general, no performance benefits\n",
1372 | "* Prohibits chaining\n",
1373 | "* ``SettingWithCopyWarning`` fun"
1374 | ]
1375 | },
1376 | {
1377 | "cell_type": "code",
1378 | "execution_count": null,
1379 | "metadata": {},
1380 | "outputs": [],
1381 | "source": []
1382 | },
1383 | {
1384 | "cell_type": "code",
1385 | "execution_count": null,
1386 | "metadata": {
1387 | "lines_to_next_cell": 2
1388 | },
1389 | "outputs": [],
1390 | "source": []
1391 | },
1392 | {
1393 | "cell_type": "code",
1394 | "execution_count": null,
1395 | "metadata": {},
1396 | "outputs": [],
1397 | "source": []
1398 | },
1399 | {
1400 | "cell_type": "code",
1401 | "execution_count": null,
1402 | "metadata": {},
1403 | "outputs": [],
1404 | "source": []
1405 | },
1406 | {
1407 | "cell_type": "code",
1408 | "execution_count": null,
1409 | "metadata": {
1410 | "lines_to_next_cell": 2
1411 | },
1412 | "outputs": [],
1413 | "source": []
1414 | },
1415 | {
1416 | "cell_type": "markdown",
1417 | "metadata": {},
1418 | "source": [
1419 | "## Don't Apply (if you can)"
1420 | ]
1421 | },
1422 | {
1423 | "cell_type": "code",
1424 | "execution_count": null,
1425 | "metadata": {},
1426 | "outputs": [],
1427 | "source": [
1428 | "def tweak_twitter(df):\n",
1429 | " return (df\n",
1430 | " .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
1431 | " .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
1432 | " .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
1433 | " .assign(impressions=df.impressions.astype('uint32'),\n",
1434 | " engagements=df.engagements.astype('uint16'),\n",
1435 | " **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}, # less than 255\n",
1436 | " **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', \n",
1437 | " 'detail_expands', 'media_views', 'media_engagements']}, # less than 65,535\n",
1438 | " Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', \n",
1439 | " index=df_.index),\n",
1440 | " time=lambda df_: df_.time.dt.tz_convert('America/Denver')\n",
1441 | " )\n",
1442 | " )\n",
1443 | "twit_df = tweak_twitter(df)"
1444 | ]
1445 | },
1446 | {
1447 | "cell_type": "code",
1448 | "execution_count": null,
1449 | "metadata": {
1450 | "scrolled": true
1451 | },
1452 | "outputs": [],
1453 | "source": [
1454 | "twit_df"
1455 | ]
1456 | },
1457 | {
1458 | "cell_type": "code",
1459 | "execution_count": null,
1460 | "metadata": {
1461 | "scrolled": true
1462 | },
1463 | "outputs": [],
1464 | "source": [
1465 | "def to_percent(val):\n",
1466 | " return val * 100\n",
1467 | "twit_df.engagement_rate.apply(to_percent)"
1468 | ]
1469 | },
1470 | {
1471 | "cell_type": "code",
1472 | "execution_count": null,
1473 | "metadata": {
1474 | "lines_to_next_cell": 2
1475 | },
1476 | "outputs": [],
1477 | "source": [
1478 | "# same result\n",
1479 | "twit_df.engagement_rate * 100"
1480 | ]
1481 | },
1482 | {
1483 | "cell_type": "code",
1484 | "execution_count": null,
1485 | "metadata": {},
1486 | "outputs": [],
1487 | "source": [
1488 | "%%timeit\n",
1489 | "# however ...\n",
1490 | "twit_df.engagement_rate.apply(to_percent)"
1491 | ]
1492 | },
1493 | {
1494 | "cell_type": "code",
1495 | "execution_count": null,
1496 | "metadata": {},
1497 | "outputs": [],
1498 | "source": [
1499 | "%%timeit\n",
1500 | "twit_df.engagement_rate * 100"
1501 | ]
1502 | },
1503 | {
1504 | "cell_type": "code",
1505 | "execution_count": null,
1506 | "metadata": {},
1507 | "outputs": [],
1508 | "source": [
1509 | "# 14X slower!\n",
1510 | "1008 / 71"
1511 | ]
1512 | },
1513 | {
1514 | "cell_type": "code",
1515 | "execution_count": null,
1516 | "metadata": {},
1517 | "outputs": [],
1518 | "source": [
1519 | "# How would we check whether text contains non-ASCII characters (e.g. emoji)?\n",
1520 | "'Hello \\U0001f600'.encode('ascii', errors='replace').decode('ascii')"
1521 | ]
1522 | },
1523 | {
1524 | "cell_type": "code",
1525 | "execution_count": null,
1526 | "metadata": {},
1527 | "outputs": [],
1528 | "source": [
1529 | "'Hello \\U0001f600'.encode('utf8', errors='replace').decode('utf8')"
1530 | ]
1531 | },
1532 | {
1533 | "cell_type": "code",
1534 | "execution_count": null,
1535 | "metadata": {},
1536 | "outputs": [],
1537 | "source": [
1538 | "# story is a little different with text\n",
1539 | "def is_unicode(val):\n",
1540 | "    \"\"\"Return True when ``val`` changes after an ASCII encode/decode round trip.\"\"\"\n",
1541 | "    return val != val.encode('ascii', errors='replace').decode('ascii')"
1542 | ]
1543 | },
1544 | {
1545 | "cell_type": "code",
1546 | "execution_count": null,
1547 | "metadata": {},
1548 | "outputs": [],
1549 | "source": [
1550 | "%lsmagic"
1551 | ]
1552 | },
1553 | {
1554 | "cell_type": "code",
1555 | "execution_count": null,
1556 | "metadata": {},
1557 | "outputs": [],
1558 | "source": [
1559 | "%%timeit?"
1560 | ]
1561 | },
1562 | {
1563 | "cell_type": "code",
1564 | "execution_count": null,
1565 | "metadata": {},
1566 | "outputs": [],
1567 | "source": [
1568 | "%%timeit\n",
1569 | "twit_df.Tweet_text.apply(is_unicode)"
1570 | ]
1571 | },
1572 | {
1573 | "cell_type": "code",
1574 | "execution_count": null,
1575 | "metadata": {},
1576 | "outputs": [],
1577 | "source": [
1578 | "%%timeit\n",
1579 | "twit_df.Tweet_text.str.encode('ascii', errors='replace').str.decode('ascii') != twit_df.Tweet_text  # same test as is_unicode"
1580 | ]
1581 | },
1582 | {
1583 | "cell_type": "code",
1584 | "execution_count": null,
1585 | "metadata": {},
1586 | "outputs": [],
1587 | "source": [
1588 | "%%timeit\n",
1589 | "twit_df.Tweet_text.str.startswith('@')"
1590 | ]
1591 | },
1592 | {
1593 | "cell_type": "code",
1594 | "execution_count": null,
1595 | "metadata": {},
1596 | "outputs": [],
1597 | "source": [
1598 | "def startswith_at(txt):\n",
1599 | " return txt.startswith('@')"
1600 | ]
1601 | },
1602 | {
1603 | "cell_type": "code",
1604 | "execution_count": null,
1605 | "metadata": {},
1606 | "outputs": [],
1607 | "source": [
1608 | "%%timeit\n",
1609 | "twit_df.Tweet_text.apply(startswith_at)"
1610 | ]
1611 | },
1612 | {
1613 | "cell_type": "code",
1614 | "execution_count": null,
1615 | "metadata": {},
1616 | "outputs": [],
1617 | "source": [
1618 | "def tweak_twitter(df):\n",
1619 | "    \"\"\"Return a cleaned tweet frame: tidy names, smaller dtypes, Denver time, and text/date features.\"\"\"\n",
1620 | "    return (df\n",
1621 | "     .rename(columns=lambda col_name: col_name.replace(' ', '_'))\n",
1622 | "     .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))\n",
1623 | "     .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])\n",
1624 | "     .assign(impressions=df.impressions.astype('uint32'),\n",
1625 | "             engagements=df.engagements.astype('uint16'),\n",
1626 | "             **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']},  # less than 255\n",
1627 | "             **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks',\n",
1628 | "                 'detail_expands', 'media_views', 'media_engagements']},  # less than 65,535\n",
1629 | "             Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', index=df_.index),\n",
1630 | "             time=lambda df_: df_.time.dt.tz_convert('America/Denver'),\n",
1631 | "             is_reply=lambda df_: df_.Tweet_text.str.startswith('@'),\n",
1632 | "             length=lambda df_: df_.Tweet_text.str.len(),\n",
1633 | "             num_words=lambda df_: df_.Tweet_text.str.split().str.len(),  # vectorized; no .apply(len) needed\n",
1634 | "             is_unicode=lambda df_: df_.Tweet_text.str.encode('ascii', errors='replace').str.decode('ascii') != df_.Tweet_text,\n",
1635 | "             hour=lambda df_: df_.time.dt.hour,\n",
1636 | "             dom=lambda df_: df_.time.dt.day,  # day of month\n",
1637 | "             dow=lambda df_: df_.time.dt.dayofweek,  # day of week\n",
1638 | "             at_tweet=lambda df_: df_.Tweet_text.str.contains('@'),\n",
1639 | "             has_newlines=lambda df_: df_.Tweet_text.str.contains('\\n'),\n",
1640 | "             num_lines=lambda df_: df_.Tweet_text.str.count('\\n'),  # counts newline characters\n",
1641 | "             num_mentions=lambda df_: df_.Tweet_text.str.count('@'),\n",
1642 | "             has_hashtag=lambda df_: df_.Tweet_text.str.contains('#'),  # boolean flag, like has_newlines\n",
1643 | "             )\n",
1644 | "    )\n",
1645 | "twit_df = tweak_twitter(df)"
1646 | ]
1647 | },
1648 | {
1649 | "cell_type": "code",
1650 | "execution_count": null,
1651 | "metadata": {},
1652 | "outputs": [],
1653 | "source": [
1654 | "twit_df"
1655 | ]
1656 | },
1657 | {
1658 | "cell_type": "markdown",
1659 | "metadata": {},
1660 | "source": [
1661 | "## Apply Exercise\n",
1662 | "* Calculate engagement ratio by dividing *engagements* by *impressions*\n",
1663 | "* Calculate engagement ratio 2 by dividing the sum of *replies*, *retweets*, *likes*, *user_profile_clicks*, and *detail_expands* by *impressions*"
1664 | ]
1665 | },
1666 | {
1667 | "cell_type": "code",
1668 | "execution_count": null,
1669 | "metadata": {},
1670 | "outputs": [],
1671 | "source": []
1672 | },
1673 | {
1674 | "cell_type": "code",
1675 | "execution_count": null,
1676 | "metadata": {},
1677 | "outputs": [],
1678 | "source": []
1679 | },
1680 | {
1681 | "cell_type": "code",
1682 | "execution_count": null,
1683 | "metadata": {},
1684 | "outputs": [],
1685 | "source": []
1686 | },
1687 | {
1688 | "cell_type": "code",
1689 | "execution_count": null,
1690 | "metadata": {},
1691 | "outputs": [],
1692 | "source": []
1693 | },
1694 | {
1695 | "cell_type": "code",
1696 | "execution_count": null,
1697 | "metadata": {},
1698 | "outputs": [],
1699 | "source": []
1700 | },
1701 | {
1702 | "cell_type": "code",
1703 | "execution_count": null,
1704 | "metadata": {},
1705 | "outputs": [],
1706 | "source": []
1707 | },
1708 | {
1709 | "cell_type": "code",
1710 | "execution_count": null,
1711 | "metadata": {},
1712 | "outputs": [],
1713 | "source": []
1714 | },
1715 | {
1716 | "cell_type": "code",
1717 | "execution_count": null,
1718 | "metadata": {},
1719 | "outputs": [],
1720 | "source": []
1721 | },
1722 | {
1723 | "cell_type": "code",
1724 | "execution_count": null,
1725 | "metadata": {
1726 | "lines_to_next_cell": 2
1727 | },
1728 | "outputs": [],
1729 | "source": []
1730 | },
1731 | {
1732 | "cell_type": "markdown",
1733 | "metadata": {},
1734 | "source": [
1735 | "## Master Aggregation"
1736 | ]
1737 | },
1738 | {
1739 | "cell_type": "code",
1740 | "execution_count": null,
1741 | "metadata": {
1742 | "scrolled": true
1743 | },
1744 | "outputs": [],
1745 | "source": [
1746 | "(twit_df\n",
1747 | " .groupby(twit_df.time.dt.year)\n",
1748 | " .mean()\n",
1749 | ")"
1750 | ]
1751 | },
1752 | {
1753 | "cell_type": "code",
1754 | "execution_count": null,
1755 | "metadata": {
1756 | "lines_to_next_cell": 2,
1757 | "scrolled": true
1758 | },
1759 | "outputs": [],
1760 | "source": [
1761 | "twit_df.groupby(twit_df.time.dt.year).mean()"
1762 | ]
1763 | },
1764 | {
1765 | "cell_type": "code",
1766 | "execution_count": null,
1767 | "metadata": {
1768 | "scrolled": false
1769 | },
1770 | "outputs": [],
1771 | "source": [
1772 | "(twit_df\n",
1773 | " .groupby(twit_df.time.dt.year)\n",
1774 | " .impressions\n",
1775 | " .mean()\n",
1776 | ")"
1777 | ]
1778 | },
1779 | {
1780 | "cell_type": "code",
1781 | "execution_count": null,
1782 | "metadata": {
1783 | "scrolled": true
1784 | },
1785 | "outputs": [],
1786 | "source": [
1787 | "%%timeit\n",
1788 | "(twit_df\n",
1789 | " .groupby(twit_df.time.dt.year)\n",
1790 | " .mean()\n",
1791 | " [['impressions', 'replies']] # index operation with a list inside \n",
1792 | ")"
1793 | ]
1794 | },
1795 | {
1796 | "cell_type": "code",
1797 | "execution_count": null,
1798 | "metadata": {
1799 | "scrolled": true
1800 | },
1801 | "outputs": [],
1802 | "source": [
1803 | "%%timeit\n",
1804 | "(twit_df\n",
1805 | " .groupby(twit_df.time.dt.year)\n",
1806 | " [['impressions', 'replies']] # index operation with a list inside \n",
1807 | " .mean()\n",
1808 | ")"
1809 | ]
1810 | },
1811 | {
1812 | "cell_type": "code",
1813 | "execution_count": null,
1814 | "metadata": {},
1815 | "outputs": [],
1816 | "source": [
1817 | "[name for name in dir(twit_df.Tweet_text.str) if not name.startswith('_')]  # or type `twit_df.Tweet_text.str.` + TAB"
1818 | ]
1819 | },
1820 | {
1821 | "cell_type": "code",
1822 | "execution_count": null,
1823 | "metadata": {},
1824 | "outputs": [],
1825 | "source": [
1826 | "twit_df.time.dt.year.rename('year')"
1827 | ]
1828 | },
1829 | {
1830 | "cell_type": "code",
1831 | "execution_count": null,
1832 | "metadata": {},
1833 | "outputs": [],
1834 | "source": [
1835 | "pd.options.display.float_format"
1836 | ]
1837 | },
1838 | {
1839 | "cell_type": "code",
1840 | "execution_count": null,
1841 | "metadata": {
1842 | "scrolled": true
1843 | },
1844 | "outputs": [],
1845 | "source": [
1846 | "(twit_df\n",
1847 | " .groupby([twit_df.time.dt.year.rename('year'), twit_df.time.dt.month.rename('month')])\n",
1848 | " [['impressions', 'replies']]\n",
1849 | " .mean()\n",
1850 | " #.round(2)\n",
1851 | " .style\n",
1852 | " .format({'replies': '{:.3f}', 'impressions': '{:e}'})\n",
1853 | " \n",
1854 | ")"
1855 | ]
1856 | },
1857 | {
1858 | "cell_type": "code",
1859 | "execution_count": null,
1860 | "metadata": {
1861 | "scrolled": true
1862 | },
1863 | "outputs": [],
1864 | "source": [
1865 | "(twit_df\n",
1866 | " .groupby([twit_df.time.dt.year, twit_df.time.dt.month])\n",
1867 | " [['impressions', 'replies']]\n",
1868 | " #.mean()\n",
1869 | " .median()\n",
1870 | " .plot()\n",
1871 | ")"
1872 | ]
1873 | },
1874 | {
1875 | "cell_type": "code",
1876 | "execution_count": null,
1877 | "metadata": {
1878 | "scrolled": true
1879 | },
1880 | "outputs": [],
1881 | "source": [
1882 | "(twit_df\n",
1883 | " #.groupby([twit_df.time.dt.year, twit_df.time.dt.month])\n",
1884 | " .groupby(pd.Grouper(key='time', freq='2M'))\n",
1885 | " [['impressions', 'replies']]\n",
1886 | " #.mean()\n",
1887 | " .median()\n",
1888 | " .plot()\n",
1889 | ")"
1890 | ]
1891 | },
1892 | {
1893 | "cell_type": "code",
1894 | "execution_count": null,
1895 | "metadata": {
1896 | "scrolled": true
1897 | },
1898 | "outputs": [],
1899 | "source": [
1900 | "(twit_df\n",
1901 | " #.groupby([twit_df.time.dt.year, twit_df.time.dt.month])\n",
1902 | " .groupby(pd.Grouper(key='time', freq='2w'))\n",
1903 | " [['impressions', 'replies']]\n",
1904 | " .mean()\n",
1905 | " .plot()\n",
1906 | ")"
1907 | ]
1908 | },
1909 | {
1910 | "cell_type": "code",
1911 | "execution_count": null,
1912 | "metadata": {
1913 | "scrolled": false
1914 | },
1915 | "outputs": [],
1916 | "source": [
1917 | "(twit_df\n",
1918 | " #.groupby([twit_df.time.dt.year, twit_df.time.dt.month])\n",
1919 | " .groupby(pd.Grouper(key='time', freq='7d5h'))\n",
1920 | " [['impressions', 'replies']]\n",
1921 | " .mean()\n",
1922 | " #.plot()\n",
1923 | ")"
1924 | ]
1925 | },
1926 | {
1927 | "cell_type": "code",
1928 | "execution_count": null,
1929 | "metadata": {
1930 | "scrolled": false
1931 | },
1932 | "outputs": [],
1933 | "source": [
1934 | "(twit_df\n",
1935 | " #.groupby([twit_df.time.dt.year, twit_df.time.dt.month])\n",
1936 | " .groupby([pd.Grouper(key='time', freq='7d5h'), 'is_unicode'])\n",
1937 | " [['impressions', 'replies']]\n",
1938 | " .mean()\n",
1939 | " #.plot()\n",
1940 | ")"
1941 | ]
1942 | },
1943 | {
1944 | "cell_type": "code",
1945 | "execution_count": null,
1946 | "metadata": {
1947 | "scrolled": true
1948 | },
1949 | "outputs": [],
1950 | "source": [
1951 | "# multiple aggregates\n",
1952 | "def second_to_last(ser):\n",
1953 | "    \"\"\"Return the next-to-last value of ``ser``; fall back to 0 when it has fewer than two values.\"\"\"\n",
1954 | "    if len(ser) < 2:\n",
1955 | "        return 0\n",
1956 | "    return ser.iloc[-2]\n",
1957 | "\n",
1958 | "(twit_df\n",
1959 | " .groupby([pd.Grouper(key='time', freq='7d5h'), 'is_unicode'])\n",
1960 | " [['impressions', 'replies']]\n",
1961 | " .agg(['mean', 'median', second_to_last])\n",
1962 | ")"
1963 | ]
1964 | },
1965 | {
1966 | "cell_type": "code",
1967 | "execution_count": null,
1968 | "metadata": {
1969 | "scrolled": true
1970 | },
1971 | "outputs": [],
1972 | "source": [
1973 | "# multiple aggregates\n",
1974 | "\n",
1975 | "(twit_df\n",
1976 | " .groupby([pd.Grouper(key='time', freq='7d5h'), 'is_unicode'])\n",
1977 | " [['impressions', 'replies']]\n",
1978 | " .agg(['mean', 'median', second_to_last])\n",
1979 | " .plot()\n",
1980 | ")"
1981 | ]
1982 | },
1983 | {
1984 | "cell_type": "code",
1985 | "execution_count": null,
1986 | "metadata": {
1987 | "scrolled": true
1988 | },
1989 | "outputs": [],
1990 | "source": [
1991 | "# multiple aggregates\n",
1992 | "\n",
1993 | "(twit_df\n",
1994 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n",
1995 | " [['impressions', 'replies']]\n",
1996 | " .agg(['mean', 'median', second_to_last])\n",
1997 | " .unstack()\n",
1998 | ")"
1999 | ]
2000 | },
2001 | {
2002 | "cell_type": "code",
2003 | "execution_count": null,
2004 | "metadata": {
2005 | "scrolled": true
2006 | },
2007 | "outputs": [],
2008 | "source": [
2009 | "# multiple aggregates\n",
2010 | "\n",
2011 | "(twit_df\n",
2012 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n",
2013 | " [['impressions', 'replies']]\n",
2014 | " .agg(['mean', 'median', second_to_last])\n",
2015 | " .unstack()\n",
2016 | " .impressions\n",
2017 | ")"
2018 | ]
2019 | },
2020 | {
2021 | "cell_type": "code",
2022 | "execution_count": null,
2023 | "metadata": {
2024 | "scrolled": true
2025 | },
2026 | "outputs": [],
2027 | "source": [
2028 | "# multiple aggregates\n",
2029 | "(twit_df\n",
2030 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n",
2031 | " [['impressions', 'replies']]\n",
2032 | " .agg(['mean', 'median', second_to_last])\n",
2033 | " .unstack()\n",
2034 | " .impressions\n",
2035 | " ['mean'] # note have to use index syntax here\n",
2036 | ")"
2037 | ]
2038 | },
2039 | {
2040 | "cell_type": "code",
2041 | "execution_count": null,
2042 | "metadata": {
2043 | "scrolled": true
2044 | },
2045 | "outputs": [],
2046 | "source": [
2047 | "# multiple aggregates\n",
2048 | "(twit_df\n",
2049 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n",
2050 | " [['impressions', 'replies']]\n",
2051 | " .agg(['mean', 'median', second_to_last])\n",
2052 | " .unstack()\n",
2053 | " .impressions\n",
2054 | " .mean # attribute access returns the DataFrame.mean method, not the 'mean' column -- use ['mean'] instead\n",
2055 | ")"
2056 | ]
2057 | },
2058 | {
2059 | "cell_type": "code",
2060 | "execution_count": null,
2061 | "metadata": {
2062 | "scrolled": true
2063 | },
2064 | "outputs": [],
2065 | "source": [
2066 | "# multiple aggregates\n",
2067 | "(twit_df\n",
2068 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n",
2069 | " [['impressions', 'replies']]\n",
2070 | " .agg(['mean', 'median', second_to_last])\n",
2071 | " .unstack()\n",
2072 | " .impressions\n",
2073 | " ['mean']\n",
2074 | " .plot()\n",
2075 | ")"
2076 | ]
2077 | },
2078 | {
2079 | "cell_type": "code",
2080 | "execution_count": null,
2081 | "metadata": {
2082 | "scrolled": false
2083 | },
2084 | "outputs": [],
2085 | "source": [
2086 | "# multiple aggregates\n",
2087 | "# dealing with missing values\n",
2088 | "(twit_df\n",
2089 | " .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])\n",
2090 | " [['impressions', 'replies']]\n",
2091 | " .agg(['mean', 'median', second_to_last])\n",
2092 | " .unstack()\n",
2093 | " .impressions\n",
2094 | " ['mean']\n",
2095 | " #.fillna(0)\n",
2096 | " #.interpolate()\n",
2097 | " #.bfill()\n",
2098 | " #.dropna()\n",
2099 | " .loc['2021/07':'2021/08']\n",
2100 | " #.plot()\n",
2101 | ")"
2102 | ]
2103 | },
2104 | {
2105 | "cell_type": "code",
2106 | "execution_count": null,
2107 | "metadata": {
2108 | "scrolled": true
2109 | },
2110 | "outputs": [],
2111 | "source": [
2112 | "# multiple aggregates\n",
2113 | "(twit_df\n",
2114 | " .groupby([pd.Grouper(key='time', freq='3d'), 'is_unicode'])\n",
2115 | " [['impressions', 'replies']]\n",
2116 | " .agg(['mean', 'median', second_to_last])\n",
2117 | " .unstack()\n",
2118 | " .impressions\n",
2119 | " ['mean']\n",
2120 | " .interpolate()\n",
2121 | " .rolling(7)\n",
2122 | " .mean()\n",
2123 | " .plot()\n",
2124 | ")"
2125 | ]
2126 | },
2127 | {
2128 | "cell_type": "code",
2129 | "execution_count": null,
2130 | "metadata": {
2131 | "scrolled": true
2132 | },
2133 | "outputs": [],
2134 | "source": [
2135 | "# named aggregation\n",
2136 | "\n",
2137 | "(twit_df\n",
2138 | " .groupby([pd.Grouper(key='time', freq='M'), 'is_unicode'])\n",
2139 | " .agg(total_views=('impressions', 'sum'),\n",
2140 | " mean_views=('impressions', 'mean'),\n",
2141 | " profile_clicks=('user_profile_clicks', lambda ser: ser.sum()))\n",
2142 | ")"
2143 | ]
2144 | },
2145 | {
2146 | "cell_type": "code",
2147 | "execution_count": null,
2148 | "metadata": {
2149 | "scrolled": true
2150 | },
2151 | "outputs": [],
2152 | "source": [
2153 | "# named aggregation - fails with resample\n",
2154 | "\n",
2155 | "(twit_df\n",
2156 | " #.groupby([pd.Grouper(key='time', freq='M'), 'is_unicode'])\n",
2157 | " .set_index('time')\n",
2158 | " .resample('M')\n",
2159 | " .agg(total_views=('impressions', 'sum'),\n",
2160 | " mean_views=('impressions', 'mean'),\n",
2161 | " profile_clicks=('user_profile_clicks', lambda ser: ser.sum()))\n",
2162 | ")"
2163 | ]
2164 | },
2165 | {
2166 | "cell_type": "code",
2167 | "execution_count": null,
2168 | "metadata": {
2169 | "lines_to_next_cell": 0,
2170 | "scrolled": false
2171 | },
2172 | "outputs": [],
2173 | "source": [
2174 | "# named aggregation\n",
2175 | "\n",
2176 | "(twit_df\n",
2177 | " .groupby([pd.Grouper(key='time', freq='M'), 'is_unicode'])\n",
2178 | " .agg(total_views=('impressions', 'sum'),\n",
2179 | " mean_views=('impressions', 'mean'),\n",
2180 | " profile_clicks=('user_profile_clicks', lambda ser: ser.sum()))\n",
2181 | " .unstack()\n",
2182 | " .profile_clicks\n",
2183 | " .plot()\n",
2184 | ")"
2185 | ]
2186 | },
2187 | {
2188 | "cell_type": "markdown",
2189 | "metadata": {},
2190 | "source": [
2191 | "## Aggregation Exercise\n",
2192 | "* What were the total impressions for each year?\n",
2193 | "* What were the total impressions for each month?\n",
2194 | "* Plot the previous\n",
2195 | "* What were the total impressions for unicode and non-unicode tweets for each month?\n",
2196 | "* Plot the previous\n",
2197 | "* What were the total impressions for reply and non-reply tweets for each month?\n",
2198 | "* Plot the previous"
2199 | ]
2200 | },
2201 | {
2202 | "cell_type": "code",
2203 | "execution_count": null,
2204 | "metadata": {},
2205 | "outputs": [],
2206 | "source": []
2207 | },
2208 | {
2209 | "cell_type": "code",
2210 | "execution_count": null,
2211 | "metadata": {},
2212 | "outputs": [],
2213 | "source": []
2214 | },
2215 | {
2216 | "cell_type": "code",
2217 | "execution_count": null,
2218 | "metadata": {},
2219 | "outputs": [],
2220 | "source": []
2221 | },
2222 | {
2223 | "cell_type": "code",
2224 | "execution_count": null,
2225 | "metadata": {},
2226 | "outputs": [],
2227 | "source": []
2228 | },
2229 | {
2230 | "cell_type": "markdown",
2231 | "metadata": {},
2232 | "source": [
2233 | "## Summary\n",
2234 | "\n",
2235 | "* Correct types save space and enable convenient math, string, and date functionality\n",
2236 | "* Chaining operations will:\n",
2237 | "    * Make code readable\n",
2238 | "    * Remove bugs\n",
2239 | "    * Be easier to debug\n",
2240 | "* Don't mutate (there's no point). Embrace chaining.\n",
2241 | "* ``.apply`` is slow for math\n",
2242 | "* Aggregations are powerful. Play with them until they make sense\n",
2243 | "\n",
2244 | "Connect with me on LinkedIn or Twitter (@\\_\\_mharrison\\_\\_)"
2245 | ]
2246 | },
2247 | {
2248 | "cell_type": "code",
2249 | "execution_count": null,
2250 | "metadata": {},
2251 | "outputs": [],
2252 | "source": []
2253 | }
2254 | ],
2255 | "metadata": {
2256 | "jupytext": {
2257 | "encoding": "# -*- coding: utf-8 -*-",
2258 | "formats": "ipynb,py:light"
2259 | },
2260 | "kernelspec": {
2261 | "display_name": "Python 3",
2262 | "language": "python",
2263 | "name": "python3"
2264 | },
2265 | "language_info": {
2266 | "codemirror_mode": {
2267 | "name": "ipython",
2268 | "version": 3
2269 | },
2270 | "file_extension": ".py",
2271 | "mimetype": "text/x-python",
2272 | "name": "python",
2273 | "nbconvert_exporter": "python",
2274 | "pygments_lexer": "ipython3",
2275 | "version": "3.8.5"
2276 | }
2277 | },
2278 | "nbformat": 4,
2279 | "nbformat_minor": 4
2280 | }
2281 |
--------------------------------------------------------------------------------