├── .devcontainer └── devcontainer.json ├── GSS.csv ├── README.md ├── honest.fth ├── pandas-best-practices.ipynb └── requirements.txt /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "mcr.microsoft.com/devcontainers/universal:2", 3 | "hostRequirements": { 4 | "cpus": 2 5 | }, 6 | "waitFor": "onCreateCommand", 7 | "updateContentCommand": "python3 -m pip install -r requirements.txt", 8 | "postCreateCommand": "", 9 | "customizations": { 10 | "codespaces": { 11 | "openFiles": [] 12 | }, 13 | "vscode": { 14 | "extensions": [ 15 | "ms-toolsai.jupyter", 16 | "ms-python.python" 17 | ] 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pearson-pandas-best-practices 2 | 3 | This course introduces best practices for Pandas. 4 | 5 | ## Resources 6 | 7 | See the author's book, [Effective Pandas (digital)](https://store.metasnake.com/effective-pandas-book) [(physical)](https://amzn.to/43dt50h) 8 | 9 | ![Effective Pandas](https://d31ezp3r8jwmks.cloudfront.net/3ytw9atdhoe9ezz1i5hctlspkre4) 10 | 11 | -------------------------------------------------------------------------------- /honest.fth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mattharrison/pearson-pandas-best-practices/f67fb49f132784eb09152f3e169196ea26e0cbb4/honest.fth -------------------------------------------------------------------------------- /pandas-best-practices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "597d4814", 6 | "metadata": { 7 | "lines_to_next_cell": 0, 8 | "pycharm": { 9 | "name": "#%% md\n" 10 | } 11 | }, 12 | "source": [ 13 | "# Pandas Best Practices\n", 14 | "## 5 Tips for Better Pandas 
Code" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "fe77712e", 21 | "metadata": { 22 | "lines_to_next_cell": 2 23 | }, 24 | "outputs": [], 25 | "source": [] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "6188e883", 30 | "metadata": { 31 | "pycharm": { 32 | "name": "#%% md\n" 33 | } 34 | }, 35 | "source": [ 36 | "## About Matt Harrison @\\_\\_mharrison\\_\\_\n", 37 | "\n", 38 | "* Author of Effective Pandas, Machine Learning Pocket Reference, and Illustrated Guide to Python 3.\n", 39 | "* Advisor at Ponder (creators of Modin)\n", 40 | "* Corporate trainer at MetaSnake. Taught Pandas to 1000's of students.\n", 41 | "* Use coupon LIVE for 10% off Effective Pandas book or bundle ( https://store.metasnake.com )" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "8bf569d6", 48 | "metadata": { 49 | "lines_to_next_cell": 2, 50 | "pycharm": { 51 | "name": "#%%\n" 52 | } 53 | }, 54 | "outputs": [], 55 | "source": [] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "68b20887", 61 | "metadata": { 62 | "lines_to_next_cell": 2, 63 | "pycharm": { 64 | "name": "#%%\n" 65 | } 66 | }, 67 | "outputs": [], 68 | "source": [] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "b3fd6901", 74 | "metadata": { 75 | "lines_to_next_cell": 2, 76 | "pycharm": { 77 | "name": "#%%\n" 78 | } 79 | }, 80 | "outputs": [], 81 | "source": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "352e081c", 87 | "metadata": { 88 | "lines_to_next_cell": 2, 89 | "pycharm": { 90 | "name": "#%%\n" 91 | } 92 | }, 93 | "outputs": [], 94 | "source": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "61a77bda", 100 | "metadata": { 101 | "lines_to_next_cell": 2, 102 | "pycharm": { 103 | "name": "#%%\n" 104 | } 105 | }, 106 | "outputs": [], 107 | "source": [] 108 | }, 109 | { 110 | "cell_type": 
"code", 111 | "execution_count": null, 112 | "id": "c7a1b91f", 113 | "metadata": { 114 | "lines_to_next_cell": 2, 115 | "pycharm": { 116 | "name": "#%%\n" 117 | } 118 | }, 119 | "outputs": [], 120 | "source": [] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "5c67d9ed", 125 | "metadata": { 126 | "pycharm": { 127 | "name": "#%% md\n" 128 | } 129 | }, 130 | "source": [ 131 | "## Practice this on your data with your team!\n", 132 | "* Contact me matt@metasnake.com\n", 133 | "* Follow on Twitter @\\_\\_mharrison\\_\\_" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "b56b65e9", 140 | "metadata": { 141 | "lines_to_next_cell": 2, 142 | "pycharm": { 143 | "name": "#%%\n" 144 | } 145 | }, 146 | "outputs": [], 147 | "source": [] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "7c579886", 153 | "metadata": { 154 | "lines_to_next_cell": 2, 155 | "pycharm": { 156 | "name": "#%%\n" 157 | } 158 | }, 159 | "outputs": [], 160 | "source": [] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "id": "c38061e7", 165 | "metadata": { 166 | "pycharm": { 167 | "name": "#%% md\n" 168 | } 169 | }, 170 | "source": [ 171 | "## Outline\n", 172 | "\n", 173 | "* Load Data\n", 174 | "* Types\n", 175 | "* Chaining\n", 176 | "* Mutation\n", 177 | "* Apply\n", 178 | "* Aggregation" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "dc9d13b9", 184 | "metadata": { 185 | "pycharm": { 186 | "name": "#%% md\n" 187 | } 188 | }, 189 | "source": [ 190 | "## Imports" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "id": "ec04b162", 197 | "metadata": { 198 | "lines_to_next_cell": 2, 199 | "pycharm": { 200 | "name": "#%%\n" 201 | } 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "%matplotlib inline\n", 206 | "from IPython.display import display\n", 207 | "import numpy as np\n", 208 | "import pandas as pd\n", 209 | "import pyarrow\n", 210 | "\n", 
211 | "import io\n", 212 | "import zipfile" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "id": "29ef6997", 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "pd.__version__" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "ae401f97", 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "pyarrow.__version__" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "id": "1dea3558", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "id": "1d7ee1c0", 246 | "metadata": {}, 247 | "source": [ 248 | "## Data Preprocessing\n", 249 | "\n", 250 | "Don't run this code. I'm providing it here to show you where the data came from.\n", 251 | "(If you really want to run this download the ZIP file and update the path)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "id": "3a5723a0", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# https://gss.norc.org/get-the-data/spss\n", 262 | "# https://gss.norc.org/Documents/spss/gss_spss_with_codebook.zip\n", 263 | "# takes a few minutes on my computer to load\n", 264 | "path = '/mnt/c/Users/matt/Downloads/gss_spss_with_codebook.zip'\n", 265 | "with zipfile.ZipFile(path) as z:\n", 266 | " print(z.namelist())\n", 267 | " with open('gss.sav', mode='bw') as fout:\n", 268 | " fout.write(z.open('GSS7218_R3.sav').read())\n", 269 | " gss = pd.read_spss('gss.sav')" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "f8587141", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "!pip install pyreadstat" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "id": "f4b686fc", 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "%%time\n", 290 | "import pyreadstat\n", 
291 | "gss, meta = pyreadstat.read_sav('gss.sav')" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "id": "c73cd05b", 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "gss.shape" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "id": "1e2e1777", 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "gss.to_feather('gss.fth')" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "id": "46ec0b7c", 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "%%time\n", 322 | "raw = pd.read_feather('~/Dropbox/work/jupyter/gss.fth')" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "id": "f029dc45", 329 | "metadata": { 330 | "lines_to_next_cell": 0, 331 | "pycharm": { 332 | "name": "#%%\n" 333 | } 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "raw" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "id": "8cad5ba6", 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "# 6000 columns!\n", 348 | "raw.shape" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "id": "d08680b5", 355 | "metadata": { 356 | "lines_to_next_cell": 0 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n", 361 | " 'INCOME06','HONEST','TICKET']\n", 362 | "\n", 363 | "raw[cols].to_feather('honest.fth')" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "id": "506f2f1d", 370 | "metadata": { 371 | "lines_to_next_cell": 2 372 | }, 373 | "outputs": [], 374 | "source": [] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "id": "5ab74806", 379 | "metadata": {}, 380 | "source": [ 381 | "## Loading Data\n", 382 | "\n", 383 | "This is the data we will be using. Run this code!" 
384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "id": "07444860", 390 | "metadata": { 391 | "lines_to_next_cell": 2, 392 | "pycharm": { 393 | "name": "#%%\n" 394 | } 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "raw = pd.read_feather('honest.fth', dtype_backend='pyarrow')" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "id": "582255a1", 405 | "metadata": { 406 | "lines_to_next_cell": 2, 407 | "pycharm": { 408 | "name": "#%%\n" 409 | } 410 | }, 411 | "outputs": [], 412 | "source": [] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "id": "db1c15f3", 417 | "metadata": { 418 | "pycharm": { 419 | "name": "#%% md\n" 420 | } 421 | }, 422 | "source": [ 423 | "## My Cleanup\n", 424 | "See GSS_Codebook.pdf for explanation\n", 425 | "\n", 426 | "Columns:\n", 427 | "\n", 428 | "* YEAR\n", 429 | "* ID - RESPONDENT ID NUMBER\n", 430 | "* AGE - AGE OF RESPONDENT\n", 431 | "* HRS1 - NUMBER OF HOURS WORKED LAST WEEK\n", 432 | "* OCC - R'S CENSUS OCCUPATION CODE (1970) - Page 126 (VAR: OCC) see page 125 for notes APPENDIX F,G,H\n", 433 | " Appendix F - Page 3286\n", 434 | "* MAJOR1 - COLLEGE MAJOR 1\n", 435 | "* SEX - RESPONDENTS SEX\n", 436 | "* RACE - RACE OF RESPONDENT\n", 437 | "* BORN - WAS R BORN IN THIS COUNTRY\n", 438 | "* INCOME - TOTAL FAMILY INCOME 1970\n", 439 | "* INCOME06 - TOTAL FAMILY INCOME 2006\n", 440 | "* HONEST - HONEST\n", 441 | "* TICKET - EVER RECEIVED A TRAFFIC TICKET\n" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "65089c43", 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n", 452 | " 'INCOME06','HONEST','TICKET']\n", 453 | "\n", 454 | "raw[cols].isna().mean()*100" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "id": "67f1d8f4", 461 | "metadata": {}, 462 | "outputs": [], 463 
| "source": [ 464 | "(raw\n", 465 | " [cols]\n", 466 | " .isna()\n", 467 | " .mean()*100\n", 468 | ")" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "id": "df146d91", 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "MAJOR= '''RESPONSE PUNCH 1972-82 1982B 1983-87 1987B 1988-91 1993-98 2000-04 2006 2008 2010 2012 2014 2016 2018 ALL\n", 479 | "Accounting/bookkeeping 1 0 0 0 0 0 0 0 0 0 0 28 32 30 29 119\n", 480 | "Advertising 2 0 0 0 0 0 0 0 0 0 0 3 2 0 0 5\n", 481 | "Agriculture/horticulture 3 0 0 0 0 0 0 0 0 0 0 8 2 7 5 22\n", 482 | "Allied health 4 0 0 0 0 0 0 0 0 0 0 0 2 1 0 3\n", 483 | "Anthropology 5 0 0 0 0 0 0 0 0 0 0 3 5 1 1 10\n", 484 | "Architecture 6 0 0 0 0 0 0 0 0 0 0 2 3 5 3 13\n", 485 | "Art 7 0 0 0 0 0 0 0 0 0 0 6 7 11 10 34\n", 486 | "Biology 8 0 0 0 0 0 0 0 0 0 0 16 22 33 26 97\n", 487 | "Business administration 9 0 0 0 0 0 0 0 0 0 0 90 142 172 138 542\n", 488 | "Chemistry 11 0 0 0 0 0 0 0 0 0 0 5 8 10 4 27\n", 489 | "Communications/speech 12 0 0 0 0 0 0 0 0 0 0 20 18 26 18 82\n", 490 | "Comm. 
disorders 13 0 0 0 0 0 0 0 0 0 0 4 6 2 2 14\n", 491 | "Computer science 14 0 0 0 0 0 0 0 0 0 0 25 24 33 17 99\n", 492 | "Dentistry 15 0 0 0 0 0 0 0 0 0 0 2 4 3 5 14\n", 493 | "Education 16 0 0 0 0 0 0 0 0 0 0 73 91 97 79 340\n", 494 | "Economics 17 0 0 0 0 0 0 0 0 0 0 11 19 13 19 62\n", 495 | "Engineering 18 0 0 0 0 0 0 0 0 0 0 47 49 47 54 197\n", 496 | "English 19 0 0 0 0 0 0 0 0 0 0 23 26 27 24 100\n", 497 | "Finance 20 0 0 0 0 0 0 0 0 0 0 7 15 14 16 52\n", 498 | "Foreign language 21 0 0 0 0 0 0 0 0 0 0 4 8 6 5 23\n", 499 | "Forestry 22 0 0 0 0 0 0 0 0 0 0 1 0 3 0 4\n", 500 | "Geography 23 0 0 0 0 0 0 0 0 0 0 0 2 2 4 8\n", 501 | "Geology 24 0 0 0 0 0 0 0 0 0 0 1 3 4 2 10\n", 502 | "History 25 0 0 0 0 0 0 0 0 0 0 10 19 14 19 62\n", 503 | "Home economics 26 0 0 0 0 0 0 0 0 0 0 0 0 3 2 5\n", 504 | "Industry & techn 27 0 0 0 0 0 0 0 0 0 0 3 4 6 0 13\n", 505 | "Journalism 28 0 0 0 0 0 0 0 0 0 0 5 6 6 4 21\n", 506 | "Law 29 0 0 0 0 0 0 0 0 0 0 13 18 23 14 68\n", 507 | "Law enforcement 30 0 0 0 0 0 0 0 0 0 0 3 5 4 2 14\n", 508 | "Library science 31 0 0 0 0 0 0 0 0 0 0 4 5 2 3 14\n", 509 | "Marketing 32 0 0 0 0 0 0 0 0 0 0 11 15 13 12 51\n", 510 | "Mathematics 33 0 0 0 0 0 0 0 0 0 0 5 10 12 5 32\n", 511 | "Medicine 34 0 0 0 0 0 0 0 0 0 0 9 25 12 11 57\n", 512 | "Music 35 0 0 0 0 0 0 0 0 0 0 4 2 10 2 18\n", 513 | "Nursing 36 0 0 0 0 0 0 0 0 0 0 36 39 60 51 186\n", 514 | "Optometry 37 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 515 | "Pharmacy 38 0 0 0 0 0 0 0 0 0 0 2 5 4 4 15\n", 516 | "Philosophy 39 0 0 0 0 0 0 0 0 0 0 2 0 2 2 6\n", 517 | "Physical education 40 0 0 0 0 0 0 0 0 0 0 9 6 16 6 37\n", 518 | "Physics 41 0 0 0 0 0 0 0 0 0 0 3 6 7 4 20\n", 519 | "Psychology 42 0 0 0 0 0 0 0 0 0 0 32 32 34 29 127\n", 520 | "Political science/international relations 43 0 0 0 0 0 0 0 0 0 0 16 22 19 14 71\n", 521 | "Sociology 44 0 0 0 0 0 0 0 0 0 0 9 15 10 12 46\n", 522 | "Special education 45 0 0 0 0 0 0 0 0 0 0 5 3 5 2 15\n", 523 | "Theater arts 46 0 0 0 0 0 0 0 0 0 0 6 2 3 1 12\n", 524 | 
"Theology 47 0 0 0 0 0 0 0 0 0 0 6 6 13 8 33\n", 525 | "Veterinary medicine 48 0 0 0 0 0 0 0 0 0 0 1 5 3 4 13\n", 526 | "Liberal arts 49 0 0 0 0 0 0 0 0 0 0 8 16 12 10 46\n", 527 | "Other 50 0 0 0 0 0 0 0 0 0 0 8 10 21 27 66\n", 528 | "General sciences 51 0 0 0 0 0 0 0 0 0 0 10 13 15 14 52\n", 529 | "Social work 52 0 0 0 0 0 0 0 0 0 0 7 17 24 7 55\n", 530 | "General studies 53 0 0 0 0 0 0 0 0 0 0 2 5 7 7 21\n", 531 | "Other vocational 54 0 0 0 0 0 0 0 0 0 0 5 11 6 5 27\n", 532 | "Health 55 0 0 0 0 0 0 0 0 0 0 23 31 31 42 127\n", 533 | "Industrial Relations 56 0 0 0 0 0 0 0 0 0 0 1 0 0 3 4\n", 534 | "Child/Human/Family Development 57 0 0 0 0 0 0 0 0 0 0 11 3 7 7 28\n", 535 | "Food Science/Nutrition/Culinary Arts 58 0 0 0 0 0 0 0 0 0 0 3 6 9 9 27\n", 536 | "Environmental Science/Ecology 59 0 0 0 0 0 0 0 0 0 0 5 5 6 8 24\n", 537 | "Social Sciences 60 0 0 0 0 0 0 0 0 0 0 4 2 7 5 18\n", 538 | "Human Services/Human Resources 61 0 0 0 0 0 0 0 0 0 0 3 7 7 5 22\n", 539 | "Visual Arts/Graphic Design/Design and Drafting 62 0 0 0 0 0 0 0 0 0 0 3 8 9 10 30\n", 540 | "Fine Arts 63 0 0 0 0 0 0 0 0 0 0 4 5 5 6 20\n", 541 | "Humanities 64 0 0 0 0 0 0 0 0 0 0 0 2 0 1 3\n", 542 | "Ethnic studies 65 0 0 0 0 0 0 0 0 0 0 3 1 0 0 4\n", 543 | "Educational administration 66 0 0 0 0 0 0 0 0 0 0 3 4 8 9 24\n", 544 | "Television/Film 67 0 0 0 0 0 0 0 0 0 0 0 2 6 1 9\n", 545 | "Aviation/Aeronatics 68 0 0 0 0 0 0 0 0 0 0 2 1 1 3 7\n", 546 | "Statistics/Biostatistics 69 0 0 0 0 0 0 0 0 0 0 0 0 2 2 4\n", 547 | "Criminology/Criminal Justice 70 0 0 0 0 0 0 0 0 0 0 13 17 17 13 60\n", 548 | "Administrative Science/Public Administration 71 0 0 0 0 0 0 0 0 0 0 2 11 3 5 21\n", 549 | "Electronics 72 0 0 0 0 0 0 0 0 0 0 6 6 5 9 26\n", 550 | "Urban and Regional Planning 73 0 0 0 0 0 0 0 0 0 0 1 1 3 2 7\n", 551 | "Mechanics/Machine Trade 74 0 0 0 0 0 0 0 0 0 0 0 1 1 4 6\n", 552 | "Dance 75 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n", 553 | "Gerontology 76 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n", 554 | "Public Relations 77 0 
0 0 0 0 0 0 0 0 0 3 1 2 1 7\n", 555 | "Textiles/Cloth 78 0 0 0 0 0 0 0 0 0 0 3 4 0 0 7\n", 556 | "Parks and Recreation 79 0 0 0 0 0 0 0 0 0 0 1 2 1 0 4\n", 557 | "Information Technology 80 0 0 0 0 0 0 0 0 0 0 0 5 8 11 24\n", 558 | "Fashion 81 0 0 0 0 0 0 0 0 0 0 0 0 3 1 4\n", 559 | "Counseling 82 0 0 0 0 0 0 0 0 0 0 0 0 11 9 20\n", 560 | "Don't know/UNCODED 98 0 0 0 0 0 0 0 0 0 0 2 3 0 0 5\n", 561 | "No answer 99 0 0 0 0 0 0 0 0 0 0 0 1 5 3 9\n", 562 | "Not applicable 0 13626 354 7542 353 5907 10334 8394 4510 2023 2044 1263 1597 1795 1435 61177'''\n", 563 | "\n", 564 | "# copy paste slight tweak from page 186\n", 565 | "major_dict = {int(row.split()[-16]): ' '.join(row.split()[:-16]) for row in MAJOR.split('\\n')[1:]}\n", 566 | "major_dict" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "id": "b9d6c34d", 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "raw.MAJOR1.value_counts()" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "id": "74652b6d", 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "(raw\n", 587 | " [cols]\n", 588 | " .assign(\n", 589 | " MAJOR1=raw.MAJOR1.fillna(99).astype('int').replace(major_dict),\n", 590 | " SEX=raw.SEX#\n", 591 | " \n", 592 | " .astype(int)\n", 593 | " .replace({1:'Male', 2:'Female'}),\n", 594 | " RACE=raw.RACE.astype(int).replace({1:'White', 2:'Black', 3:'Other'}),\n", 595 | " OCC=raw.OCC.fillna(9999).astype(int),\n", 596 | " BORN=raw.BORN.fillna(4).astype(int).replace({1:'Yes', 2:'No', 3:'Don\\'t Know',\n", 597 | " 4:'No answer', 5:'Not applicable'}),\n", 598 | " INCOME=raw.INCOME.fillna(99).astype(int).replace({99:'No answer', **dict(enumerate(['Not applicable',\n", 599 | " 0,1000,3000,4000,5000,6000,\n", 600 | " 7000,8000,10000,15000,20000,25000,]))}),\n", 601 | " INCOME06=raw.INCOME06.fillna(26).astype(int).replace({26:'Refused', **dict(enumerate(['Not applicable',\n", 602 | " 0,1000,3000,4000,5000,6000,\n", 
603 | " 7000,8000,10000,12500,15000,\n", 604 | " 17500,20000,22500,25000,30_000,\n", 605 | " 35_000, 40_000, 50_000, 60_000,\n", 606 | " 75_000, 90_000, 110_000, 130_000,\n", 607 | " 150_000]))}),\n", 608 | " HONEST=raw.HONEST.fillna(9).astype(int).replace({1:'Most desirable', 2:'3 most desireable',\n", 609 | " 3:'Not mentioned', 4: '3 least desireable',\n", 610 | " 5: 'One least desireable',\n", 611 | " 9:'No answer'}),\n", 612 | " TICKET=raw.TICKET.fillna(9).astype(int).replace({1:'Yes', 2:'No', 3:'Refused', 9: 'No answer'}),\n", 613 | " )\n", 614 | " .astype({'YEAR':int, 'ID': 'uint16[pyarrow]'})\n", 615 | " .to_csv('GSS.csv')\n", 616 | ")" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "id": "a14afd45", 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "id": "ce8f0020", 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "id": "043a0085", 638 | "metadata": {}, 639 | "source": [ 640 | "## Types\n", 641 | "Getting the right types will enable analysis and correctness.\n" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "id": "5d6c6cd5", 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "%%time\n", 652 | "gss = pd.read_csv('GSS.csv', index_col=0, dtype_backend='pyarrow', engine='pyarrow')" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "id": "d26b8f6c", 659 | "metadata": { 660 | "pycharm": { 661 | "name": "#%%\n" 662 | } 663 | }, 664 | "outputs": [], 665 | "source": [ 666 | "gss.dtypes" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": null, 672 | "id": "19890585", 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "gss" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": null, 682 | "id": "852115fe", 683 
| "metadata": { 684 | "pycharm": { 685 | "name": "#%%\n" 686 | } 687 | }, 688 | "outputs": [], 689 | "source": [ 690 | "gss.memory_usage(deep=True)" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "id": "b5cfc13b", 697 | "metadata": { 698 | "pycharm": { 699 | "name": "#%%\n" 700 | } 701 | }, 702 | "outputs": [], 703 | "source": [ 704 | "# 36 M (pandas 1)\n", 705 | "# 8.6 M (Pandas 2)\n", 706 | "gss.memory_usage(deep=True).sum()" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": null, 712 | "id": "f1d1b51d", 713 | "metadata": { 714 | "lines_to_next_cell": 2, 715 | "pycharm": { 716 | "name": "#%%\n" 717 | } 718 | }, 719 | "outputs": [], 720 | "source": [] 721 | }, 722 | { 723 | "cell_type": "markdown", 724 | "id": "5fcab8c0", 725 | "metadata": { 726 | "pycharm": { 727 | "name": "#%% md\n" 728 | } 729 | }, 730 | "source": [ 731 | "## Ints" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": null, 737 | "id": "ad4eddc7", 738 | "metadata": { 739 | "pycharm": { 740 | "name": "#%%\n" 741 | } 742 | }, 743 | "outputs": [], 744 | "source": [ 745 | "gss.select_dtypes(int).describe()" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": null, 751 | "id": "ac323e5e", 752 | "metadata": { 753 | "pycharm": { 754 | "name": "#%%\n" 755 | } 756 | }, 757 | "outputs": [], 758 | "source": [ 759 | "# chaining\n", 760 | "(gss\n", 761 | " .select_dtypes(int)\n", 762 | " .describe()\n", 763 | ")" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": null, 769 | "id": "2351d051", 770 | "metadata": { 771 | "pycharm": { 772 | "name": "#%%\n" 773 | } 774 | }, 775 | "outputs": [], 776 | "source": [ 777 | "# can comb08 be an int8?\n", 778 | "# Do completion on int\n", 779 | "np.iinfo(np.int)" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "id": "323df8fb", 786 | "metadata": { 787 | "pycharm": { 788 | "name": "#%%\n" 
789 | } 790 | }, 791 | "outputs": [], 792 | "source": [ 793 | "np.iinfo(np.uint8)" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "id": "bb063be4", 800 | "metadata": { 801 | "pycharm": { 802 | "name": "#%%\n" 803 | } 804 | }, 805 | "outputs": [], 806 | "source": [ 807 | "np.iinfo(np.uint16)" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": null, 813 | "id": "d0fab927", 814 | "metadata": { 815 | "pycharm": { 816 | "name": "#%%\n" 817 | } 818 | }, 819 | "outputs": [], 820 | "source": [ 821 | "# chaining\n", 822 | "(gss\n", 823 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' })\n", 824 | " .select_dtypes(['uint16'])\n", 825 | " .describe()\n", 826 | ")" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": null, 832 | "id": "a1d0ed15", 833 | "metadata": { 834 | "lines_to_next_cell": 2, 835 | "pycharm": { 836 | "name": "#%%\n" 837 | } 838 | }, 839 | "outputs": [], 840 | "source": [ 841 | "# chaining\n", 842 | "# use 'integer' to see all int-like columns\n", 843 | "(gss\n", 844 | " .astype({#'YEAR': 'uint16[pyarrow]',\n", 845 | " 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n", 846 | " .select_dtypes(['integer']) # see https://numpy.org/doc/stable/reference/arrays.scalars.html\n", 847 | " .describe()\n", 848 | ")" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": null, 854 | "id": "c5d4c3e1", 855 | "metadata": { 856 | "lines_to_next_cell": 2, 857 | "pycharm": { 858 | "name": "#%%\n" 859 | } 860 | }, 861 | "outputs": [], 862 | "source": [ 863 | "# Inspect memory usage\n", 864 | "(gss\n", 865 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n", 866 | " .memory_usage(deep=True)\n", 867 | " .sum() # was 36M\n", 868 | ")" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": null, 874 | "id": "8ad6e733", 875 | "metadata": { 876 | 
"lines_to_next_cell": 2, 877 | "pycharm": { 878 | "name": "#%%\n" 879 | } 880 | }, 881 | "outputs": [], 882 | "source": [] 883 | }, 884 | { 885 | "cell_type": "markdown", 886 | "id": "f339194e", 887 | "metadata": {}, 888 | "source": [ 889 | "## Int Exercise\n", 890 | "* Try converting *YEAR* to `'int8'`. What do the values look like?\n", 891 | "* Try converting *YEAR* to `'int8[pyarrow]'`. What do the values look like?" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": null, 897 | "id": "908545d1", 898 | "metadata": {}, 899 | "outputs": [], 900 | "source": [] 901 | }, 902 | { 903 | "cell_type": "code", 904 | "execution_count": null, 905 | "id": "18a3bf52", 906 | "metadata": {}, 907 | "outputs": [], 908 | "source": [] 909 | }, 910 | { 911 | "cell_type": "markdown", 912 | "id": "b09f89c6", 913 | "metadata": {}, 914 | "source": [ 915 | "## Floats" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": null, 921 | "id": "e7fed87e", 922 | "metadata": { 923 | "pycharm": { 924 | "name": "#%%\n" 925 | } 926 | }, 927 | "outputs": [], 928 | "source": [ 929 | "(gss\n", 930 | ".select_dtypes('float'))" 931 | ] 932 | }, 933 | { 934 | "cell_type": "code", 935 | "execution_count": null, 936 | "id": "49265726", 937 | "metadata": { 938 | "pycharm": { 939 | "name": "#%%\n" 940 | } 941 | }, 942 | "outputs": [], 943 | "source": [ 944 | "# surprise! age and hours worked looks int-like\n", 945 | "gss.HRS1.describe()" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "id": "cd39df3c", 952 | "metadata": { 953 | "pycharm": { 954 | "name": "#%%\n" 955 | } 956 | }, 957 | "outputs": [], 958 | "source": [ 959 | "# opps! 
missing values\n", 960 | "gss.HRS1.value_counts(dropna=False)" 961 | ] 962 | }, 963 | { 964 | "cell_type": "code", 965 | "execution_count": null, 966 | "id": "31a67da2", 967 | "metadata": { 968 | "pycharm": { 969 | "name": "#%%\n" 970 | } 971 | }, 972 | "outputs": [], 973 | "source": [ 974 | "# where are they missing?\n", 975 | "(gss\n", 976 | " .query('HRS1.isna()')\n", 977 | ")" 978 | ] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": null, 983 | "id": "e697d070", 984 | "metadata": { 985 | "pycharm": { 986 | "name": "#%%\n" 987 | } 988 | }, 989 | "outputs": [], 990 | "source": [ 991 | "# where are they missing?\n", 992 | "(gss\n", 993 | " .query('AGE.isna()')\n", 994 | ")" 995 | ] 996 | }, 997 | { 998 | "cell_type": "code", 999 | "execution_count": null, 1000 | "id": "a9166e1a", 1001 | "metadata": { 1002 | "pycharm": { 1003 | "name": "#%%\n" 1004 | } 1005 | }, 1006 | "outputs": [], 1007 | "source": [ 1008 | "# where are they missing?\n", 1009 | "# It turns out that ID is not consistent across years\n", 1010 | "(gss\n", 1011 | " .query('ID == 229')\n", 1012 | ")" 1013 | ] 1014 | }, 1015 | { 1016 | "cell_type": "code", 1017 | "execution_count": null, 1018 | "id": "81a8c902", 1019 | "metadata": { 1020 | "lines_to_next_cell": 2, 1021 | "pycharm": { 1022 | "name": "#%%\n" 1023 | } 1024 | }, 1025 | "outputs": [], 1026 | "source": [ 1027 | "# Convert to integers\n", 1028 | "(gss\n", 1029 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1030 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n", 1031 | ")" 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": null, 1037 | "id": "d3388e2c", 1038 | "metadata": { 1039 | "lines_to_next_cell": 2, 1040 | "pycharm": { 1041 | "name": "#%%\n" 1042 | } 1043 | }, 1044 | "outputs": [], 1045 | "source": [ 1046 | "(gss\n", 1047 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1048 | " 'HRS1': 
'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n", 1049 | " .memory_usage(deep=True)\n", 1050 | " .sum() # was 36M \n", 1051 | ")" 1052 | ] 1053 | }, 1054 | { 1055 | "cell_type": "code", 1056 | "execution_count": null, 1057 | "id": "9bb70ac2", 1058 | "metadata": { 1059 | "lines_to_next_cell": 2, 1060 | "pycharm": { 1061 | "name": "#%%\n" 1062 | } 1063 | }, 1064 | "outputs": [], 1065 | "source": [] 1066 | }, 1067 | { 1068 | "cell_type": "markdown", 1069 | "id": "75bfd716", 1070 | "metadata": {}, 1071 | "source": [ 1072 | "## Float Exercise\n", 1073 | "\n", 1074 | "* What is the mean of the numeric columns?\n", 1075 | "* How many values are missing in the numeric columns?" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "code", 1080 | "execution_count": null, 1081 | "id": "e3e30c4e", 1082 | "metadata": { 1083 | "lines_to_next_cell": 2 1084 | }, 1085 | "outputs": [], 1086 | "source": [] 1087 | }, 1088 | { 1089 | "cell_type": "markdown", 1090 | "id": "a136fe09", 1091 | "metadata": {}, 1092 | "source": [ 1093 | "## Objects" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "code", 1098 | "execution_count": null, 1099 | "id": "4f8b0477", 1100 | "metadata": { 1101 | "pycharm": { 1102 | "name": "#%%\n" 1103 | } 1104 | }, 1105 | "outputs": [], 1106 | "source": [ 1107 | "# pandas 1.x\n", 1108 | "(gss\n", 1109 | " .select_dtypes(object)\n", 1110 | ")" 1111 | ] 1112 | }, 1113 | { 1114 | "cell_type": "code", 1115 | "execution_count": null, 1116 | "id": "f80da8d2", 1117 | "metadata": { 1118 | "pycharm": { 1119 | "name": "#%%\n" 1120 | } 1121 | }, 1122 | "outputs": [], 1123 | "source": [ 1124 | "# pandas 2\n", 1125 | "(gss\n", 1126 | " .select_dtypes('string') # str doesn't work\n", 1127 | ")" 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "code", 1132 | "execution_count": null, 1133 | "id": "7f762143", 1134 | "metadata": { 1135 | "pycharm": { 1136 | "name": "#%%\n" 1137 | } 1138 | }, 1139 | "outputs": [], 1140 | "source": [ 1141 | "# My goto method - .value_counts\n", 1142 | "# 
looks categorical\n", 1143 | "(gss.MAJOR1.value_counts(dropna=False))" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "code", 1148 | "execution_count": null, 1149 | "id": "55c21c7a", 1150 | "metadata": { 1151 | "lines_to_next_cell": 2, 1152 | "pycharm": { 1153 | "name": "#%%\n" 1154 | } 1155 | }, 1156 | "outputs": [], 1157 | "source": [ 1158 | "(gss\n", 1159 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1160 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1161 | " 'MAJOR1': 'category'})\n", 1162 | " .memory_usage(deep=True)\n", 1163 | " .sum() # was 36M \n", 1164 | ")" 1165 | ] 1166 | }, 1167 | { 1168 | "cell_type": "code", 1169 | "execution_count": null, 1170 | "id": "69969c1b", 1171 | "metadata": {}, 1172 | "outputs": [], 1173 | "source": [ 1174 | "(gss\n", 1175 | " .select_dtypes(object)\n", 1176 | " .columns\n", 1177 | ")" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "code", 1182 | "execution_count": null, 1183 | "id": "f5d51601", 1184 | "metadata": { 1185 | "lines_to_next_cell": 0, 1186 | "pycharm": { 1187 | "name": "#%%\n" 1188 | } 1189 | }, 1190 | "outputs": [], 1191 | "source": [ 1192 | "# wow!\n", 1193 | "(gss\n", 1194 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1195 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1196 | " 'MAJOR1': 'category',\n", 1197 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1198 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}}) \n", 1199 | " .memory_usage(deep=True)\n", 1200 | " .sum() # was 36M \n", 1201 | ")" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "code", 1206 | "execution_count": null, 1207 | "id": "17206364", 1208 | "metadata": {}, 1209 | "outputs": [], 1210 | "source": [] 1211 | }, 1212 | { 1213 | "cell_type": "code", 1214 | "execution_count": null, 1215 | "id": "50bf3fa6", 1216 | "metadata": { 1217 | "lines_to_next_cell": 2 1218 | }, 1219 | "outputs": [], 1220 | "source": [] 
1221 | }, 1222 | { 1223 | "cell_type": "markdown", 1224 | "id": "246041ae", 1225 | "metadata": {}, 1226 | "source": [ 1227 | "## Category Exercises\n", 1228 | "* There is a `.cat` attribute on the category columns. What can you do with this attribute? (Use `dir` or tab completion to inspect).\n", 1229 | "* Categories can be ordered. How do you order *INCOME*?\n", 1230 | "* Order the *HONEST* column." 1231 | ] 1232 | }, 1233 | { 1234 | "cell_type": "code", 1235 | "execution_count": null, 1236 | "id": "f543c52c", 1237 | "metadata": {}, 1238 | "outputs": [], 1239 | "source": [] 1240 | }, 1241 | { 1242 | "cell_type": "code", 1243 | "execution_count": null, 1244 | "id": "338e5ba3", 1245 | "metadata": {}, 1246 | "outputs": [], 1247 | "source": [] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": null, 1252 | "id": "3fb313d6", 1253 | "metadata": {}, 1254 | "outputs": [], 1255 | "source": [] 1256 | }, 1257 | { 1258 | "cell_type": "code", 1259 | "execution_count": null, 1260 | "id": "f1d75a84", 1261 | "metadata": {}, 1262 | "outputs": [], 1263 | "source": [] 1264 | }, 1265 | { 1266 | "cell_type": "code", 1267 | "execution_count": null, 1268 | "id": "5c87e18d", 1269 | "metadata": {}, 1270 | "outputs": [], 1271 | "source": [] 1272 | }, 1273 | { 1274 | "cell_type": "code", 1275 | "execution_count": null, 1276 | "id": "85aaccbb", 1277 | "metadata": {}, 1278 | "outputs": [], 1279 | "source": [] 1280 | }, 1281 | { 1282 | "cell_type": "code", 1283 | "execution_count": null, 1284 | "id": "8a321513", 1285 | "metadata": {}, 1286 | "outputs": [], 1287 | "source": [] 1288 | }, 1289 | { 1290 | "cell_type": "markdown", 1291 | "id": "8af7a3d4", 1292 | "metadata": {}, 1293 | "source": [ 1294 | "## Make a Function" 1295 | ] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "execution_count": null, 1300 | "id": "cb9a32b3", 1301 | "metadata": { 1302 | "lines_to_next_cell": 2, 1303 | "pycharm": { 1304 | "name": "#%%\n" 1305 | } 1306 | }, 1307 | "outputs": [], 1308 | 
"source": [ 1309 | "# a glorious function\n", 1310 | "# add ordered categories to this\n", 1311 | "def tweak_gss(gss):\n", 1312 | " return (gss\n", 1313 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1314 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1315 | " 'MAJOR1': 'category',\n", 1316 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1317 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1318 | " )\n", 1319 | "\n", 1320 | "tweak_gss(gss)" 1321 | ] 1322 | }, 1323 | { 1324 | "cell_type": "markdown", 1325 | "id": "1c615739", 1326 | "metadata": {}, 1327 | "source": [ 1328 | "## Function Exercise\n", 1329 | "* Rearrange your notebook. Put the imports, code to load raw data, and tweak function at the top of the notebook. Restart the kernel and validate that your code works." 1330 | ] 1331 | }, 1332 | { 1333 | "cell_type": "code", 1334 | "execution_count": null, 1335 | "id": "c61b9f0a", 1336 | "metadata": {}, 1337 | "outputs": [], 1338 | "source": [] 1339 | }, 1340 | { 1341 | "cell_type": "code", 1342 | "execution_count": null, 1343 | "id": "6589902c", 1344 | "metadata": {}, 1345 | "outputs": [], 1346 | "source": [] 1347 | }, 1348 | { 1349 | "cell_type": "markdown", 1350 | "id": "b350e12e", 1351 | "metadata": { 1352 | "lines_to_next_cell": 2 1353 | }, 1354 | "source": [ 1355 | "## Fix Column Names" 1356 | ] 1357 | }, 1358 | { 1359 | "cell_type": "code", 1360 | "execution_count": null, 1361 | "id": "99b39238", 1362 | "metadata": { 1363 | "lines_to_next_cell": 0, 1364 | "pycharm": { 1365 | "name": "#%%\n" 1366 | } 1367 | }, 1368 | "outputs": [], 1369 | "source": [ 1370 | "# a glorious function\n", 1371 | "def tweak_gss(gss):\n", 1372 | " return (gss\n", 1373 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1374 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1375 | " 'MAJOR1': 'category',\n", 1376 | " **{col: 'category' for col in ['SEX', 
'RACE', 'BORN', \n", 1377 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1378 | " .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n", 1379 | " 'HRS1': 'hours_worked', 'OCC': 'occupation', \n", 1380 | " 'MAJOR1': 'college_major', 'SEX':'sex', \n", 1381 | " 'RACE':'race', 'BORN':'born_in_US',\n", 1382 | " 'INCOME':'income_1970', 'INCOME06': 'income_2006',\n", 1383 | " 'HONEST':'honesty_rank',\n", 1384 | " 'TICKET':'traffic_ticket'})\n", 1385 | " )\n", 1386 | "\n", 1387 | "tweak_gss(gss)" 1388 | ] 1389 | }, 1390 | { 1391 | "cell_type": "code", 1392 | "execution_count": null, 1393 | "id": "84ecc0de", 1394 | "metadata": { 1395 | "lines_to_next_cell": 2, 1396 | "pycharm": { 1397 | "name": "#%%\n" 1398 | } 1399 | }, 1400 | "outputs": [], 1401 | "source": [] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": null, 1406 | "id": "bf14ec3f", 1407 | "metadata": { 1408 | "lines_to_next_cell": 2, 1409 | "pycharm": { 1410 | "name": "#%%\n" 1411 | } 1412 | }, 1413 | "outputs": [], 1414 | "source": [] 1415 | }, 1416 | { 1417 | "cell_type": "markdown", 1418 | "id": "003b96b9", 1419 | "metadata": { 1420 | "pycharm": { 1421 | "name": "#%% md\n" 1422 | } 1423 | }, 1424 | "source": [ 1425 | "## Chain\n", 1426 | "\n", 1427 | "Chaining is also called \"flow\" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.\n", 1428 | "\n", 1429 | "The chain should read like a recipe of ordered steps.\n", 1430 | "\n", 1431 | "(BTW, this is actually what we did above.)\n", 1432 | "\n", 1433 | "
\n", 1434 | " Hint: Leverage .pipe if you can't find a way to chain 😉🐼💪\n", 1435 | "
\n", 1436 | " \n", 1437 | "\n", 1438 | "\n" 1439 | ] 1440 | }, 1441 | { 1442 | "cell_type": "code", 1443 | "execution_count": null, 1444 | "id": "a74cd1a9", 1445 | "metadata": { 1446 | "lines_to_next_cell": 0, 1447 | "pycharm": { 1448 | "name": "#%%\n" 1449 | } 1450 | }, 1451 | "outputs": [], 1452 | "source": [ 1453 | "# a glorious function\n", 1454 | "def tweak_gss(gss):\n", 1455 | " return (gss\n", 1456 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1457 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1458 | " 'MAJOR1': 'category',\n", 1459 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1460 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1461 | " .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n", 1462 | " 'HRS1': 'hours_worked', 'OCC': 'occupation', \n", 1463 | " 'MAJOR1': 'college_major', 'SEX':'sex', \n", 1464 | " 'RACE':'race', 'BORN':'born_in_US',\n", 1465 | " 'INCOME':'income_1970', 'INCOME06': 'income_2006',\n", 1466 | " 'HONEST':'honesty_rank',\n", 1467 | " 'TICKET':'traffic_ticket'})\n", 1468 | " )\n", 1469 | "\n", 1470 | "tweak_gss(gss)" 1471 | ] 1472 | }, 1473 | { 1474 | "cell_type": "code", 1475 | "execution_count": null, 1476 | "id": "efc594da", 1477 | "metadata": { 1478 | "pycharm": { 1479 | "name": "#%%\n" 1480 | } 1481 | }, 1482 | "outputs": [], 1483 | "source": [ 1484 | "# compare chain to this mess\n", 1485 | "gss2 = gss.copy()\n", 1486 | "year = gss.YEAR\n", 1487 | "year_int = year.astype('uint16')\n", 1488 | "gss2['year'] = year_int\n", 1489 | "id = gss.ID\n", 1490 | "id_int = id.astype('uint16')\n", 1491 | "gss2['year_id'] = id_int\n", 1492 | "occ = gss.OCC\n", 1493 | "occ_int = occ.astype('uint16')\n", 1494 | "gss2['occupation'] = occ_int\n", 1495 | "\n", 1496 | "# more of this" 1497 | ] 1498 | }, 1499 | { 1500 | "cell_type": "code", 1501 | "execution_count": null, 1502 | "id": "32411eaf", 1503 | "metadata": { 1504 | "lines_to_next_cell": 0, 1505 
| "pycharm": { 1506 | "name": "#%%\n" 1507 | } 1508 | }, 1509 | "outputs": [], 1510 | "source": [ 1511 | "# easy to debug\n", 1512 | "# - assign to var (df3)\n", 1513 | "# - comment out\n", 1514 | "# - pipe to display\n", 1515 | "\n", 1516 | "\n", 1517 | "from IPython.display import display\n", 1518 | "\n", 1519 | "def get_var(df, var_name):\n", 1520 | " globals()[var_name] = df\n", 1521 | " return df\n", 1522 | "\n", 1523 | "def tweak_gss(gss):\n", 1524 | " return (gss\n", 1525 | " .pipe(get_var, 'df3') \n", 1526 | " .pipe(lambda df: print(df.shape) or df) \n", 1527 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1528 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1529 | " 'MAJOR1': 'category',\n", 1530 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1531 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1532 | " .pipe(lambda df: print(df.shape) or df) \n", 1533 | " .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n", 1534 | " 'HRS1': 'hours_worked', 'OCC': 'occupation', \n", 1535 | " 'MAJOR1': 'college_major', 'SEX':'sex', \n", 1536 | " 'RACE':'race', 'BORN':'born_in_US',\n", 1537 | " 'INCOME':'income_1970', 'INCOME06': 'income_2006',\n", 1538 | " 'HONEST':'honesty_rank',\n", 1539 | " 'TICKET':'traffic_ticket'})\n", 1540 | " .pipe(lambda df: print(df.shape) or df) \n", 1541 | " )\n", 1542 | "\n", 1543 | "tweak_gss(gss)" 1544 | ] 1545 | }, 1546 | { 1547 | "cell_type": "code", 1548 | "execution_count": null, 1549 | "id": "fdc2894e", 1550 | "metadata": { 1551 | "pycharm": { 1552 | "name": "#%%\n" 1553 | } 1554 | }, 1555 | "outputs": [], 1556 | "source": [ 1557 | "# inspect intermediate data frame\n", 1558 | "df3" 1559 | ] 1560 | }, 1561 | { 1562 | "cell_type": "markdown", 1563 | "id": "1842701c", 1564 | "metadata": { 1565 | "pycharm": { 1566 | "name": "#%%\n" 1567 | } 1568 | }, 1569 | "source": [ 1570 | "## Chain Exercise\n", 1571 | "* Write a function that accepts a
dataframe and an index value. It should print any rows that match the index and return the dataframe that was passed in.\n", 1572 | "* Use the function with pipe after each step of the chain. Show the rows for index 2 and 64,813.\n", 1573 | "\n", 1574 | "\n", 1575 | "\n", 1576 | "\n", 1577 | "\n", 1578 | "\n", 1579 | "\n", 1580 | "\n", 1581 | "## Don't Mutate\n", 1582 | "\n", 1583 | "> \"you are missing the point, inplace rarely actually does something inplace, you are thinking that you are saving memory but you are not.\"\n", 1584 | ">\n", 1585 | "> **jreback** - Pandas core dev\n", 1586 | "\n", 1587 | "\n", 1588 | "\n", 1589 | "https://github.com/pandas-dev/pandas/issues/16529#issuecomment-676518136\n", 1590 | "\n", 1591 | "* In general, no performance benefits\n", 1592 | "* Prohibits chaining\n", 1593 | "* ``SettingWithCopyWarning`` fun\n" 1594 | ] 1595 | }, 1596 | { 1597 | "cell_type": "code", 1598 | "execution_count": null, 1599 | "id": "9b1955ed", 1600 | "metadata": { 1601 | "lines_to_next_cell": 2, 1602 | "pycharm": { 1603 | "name": "#%%\n" 1604 | } 1605 | }, 1606 | "outputs": [], 1607 | "source": [ 1608 | "pd.read_csv??" 
1609 | ] 1610 | }, 1611 | { 1612 | "cell_type": "code", 1613 | "execution_count": null, 1614 | "id": "bce7abe3", 1615 | "metadata": { 1616 | "lines_to_next_cell": 2, 1617 | "pycharm": { 1618 | "name": "#%%\n" 1619 | } 1620 | }, 1621 | "outputs": [], 1622 | "source": [] 1623 | }, 1624 | { 1625 | "cell_type": "code", 1626 | "execution_count": null, 1627 | "id": "4e6a8e2f", 1628 | "metadata": { 1629 | "lines_to_next_cell": 2, 1630 | "pycharm": { 1631 | "name": "#%%\n" 1632 | } 1633 | }, 1634 | "outputs": [], 1635 | "source": [] 1636 | }, 1637 | { 1638 | "cell_type": "markdown", 1639 | "id": "2a263d38", 1640 | "metadata": { 1641 | "pycharm": { 1642 | "name": "#%% md\n" 1643 | } 1644 | }, 1645 | "source": [ 1646 | "## Don't Apply (if you can)" 1647 | ] 1648 | }, 1649 | { 1650 | "cell_type": "code", 1651 | "execution_count": null, 1652 | "id": "9e68b584", 1653 | "metadata": { 1654 | "lines_to_next_cell": 0, 1655 | "pycharm": { 1656 | "name": "#%%\n" 1657 | } 1658 | }, 1659 | "outputs": [], 1660 | "source": [ 1661 | "# a glorious function\n", 1662 | "def tweak_gss(gss):\n", 1663 | " return (gss\n", 1664 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1665 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1666 | " 'MAJOR1': 'category',\n", 1667 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1668 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1669 | " .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n", 1670 | " 'HRS1': 'hours_worked', 'OCC': 'occupation', \n", 1671 | " 'MAJOR1': 'college_major', 'SEX':'sex', \n", 1672 | " 'RACE':'race', 'BORN':'born_in_US',\n", 1673 | " 'INCOME':'income_1970', 'INCOME06': 'income_2006',\n", 1674 | " 'HONEST':'honesty_rank',\n", 1675 | " 'TICKET':'traffic_ticket'})\n", 1676 | " )\n", 1677 | "\n", 1678 | "gss2 = tweak_gss(gss)" 1679 | ] 1680 | }, 1681 | { 1682 | "cell_type": "code", 1683 | "execution_count": null, 1684 | "id": "1a82332f", 1685 | 
"metadata": { 1686 | "pycharm": { 1687 | "name": "#%%\n" 1688 | } 1689 | }, 1690 | "outputs": [], 1691 | "source": [ 1692 | "# convert age to months\n", 1693 | "def to_months(val):\n", 1694 | " return val * 12\n", 1695 | "\n", 1696 | "gss2.age.apply(to_months)" 1697 | ] 1698 | }, 1699 | { 1700 | "cell_type": "code", 1701 | "execution_count": null, 1702 | "id": "a221e972", 1703 | "metadata": { 1704 | "pycharm": { 1705 | "name": "#%%\n" 1706 | } 1707 | }, 1708 | "outputs": [], 1709 | "source": [ 1710 | "# this gives the same results\n", 1711 | "gss2.age * 12" 1712 | ] 1713 | }, 1714 | { 1715 | "cell_type": "code", 1716 | "execution_count": null, 1717 | "id": "9cb2b9d2", 1718 | "metadata": { 1719 | "pycharm": { 1720 | "name": "#%%\n" 1721 | } 1722 | }, 1723 | "outputs": [], 1724 | "source": [ 1725 | "%%timeit\n", 1726 | "gss2.age.apply(to_months)" 1727 | ] 1728 | }, 1729 | { 1730 | "cell_type": "code", 1731 | "execution_count": null, 1732 | "id": "51bcc862", 1733 | "metadata": { 1734 | "pycharm": { 1735 | "name": "#%%\n" 1736 | } 1737 | }, 1738 | "outputs": [], 1739 | "source": [ 1740 | "%%timeit\n", 1741 | "gss2.age * 12" 1742 | ] 1743 | }, 1744 | { 1745 | "cell_type": "code", 1746 | "execution_count": null, 1747 | "id": "72a01657", 1748 | "metadata": { 1749 | "pycharm": { 1750 | "name": "#%%\n" 1751 | } 1752 | }, 1753 | "outputs": [], 1754 | "source": [ 1755 | "# ~42x slower!\n", 1756 | "4_590 / 110" 1757 | ] 1758 | }, 1759 | { 1760 | "cell_type": "code", 1761 | "execution_count": null, 1762 | "id": "619094f7", 1763 | "metadata": {}, 1764 | "outputs": [], 1765 | "source": [ 1766 | "gss.MAJOR1.value_counts()[:20]" 1767 | ] 1768 | }, 1769 | { 1770 | "cell_type": "code", 1771 | "execution_count": null, 1772 | "id": "f4817aee", 1773 | "metadata": { 1774 | "pycharm": { 1775 | "name": "#%%\n" 1776 | } 1777 | }, 1778 | "outputs": [], 1779 | "source": [ 1780 | "def is_science(val):\n", 1781 | " return val in {'Engineering', 'Computer science', 'Biology'}" 1782 | ] 1783 | 
}, 1784 | { 1785 | "cell_type": "code", 1786 | "execution_count": null, 1787 | "id": "f00a069c", 1788 | "metadata": { 1789 | "pycharm": { 1790 | "name": "#%%\n" 1791 | } 1792 | }, 1793 | "outputs": [], 1794 | "source": [ 1795 | "%%timeit\n", 1796 | "# string\n", 1797 | "gss.MAJOR1.apply(is_science)" 1798 | ] 1799 | }, 1800 | { 1801 | "cell_type": "code", 1802 | "execution_count": null, 1803 | "id": "5e13ae10", 1804 | "metadata": { 1805 | "pycharm": { 1806 | "name": "#%%\n" 1807 | } 1808 | }, 1809 | "outputs": [], 1810 | "source": [ 1811 | "%%timeit\n", 1812 | "gss.MAJOR1.isin({'Engineering', 'Computer science', 'Biology'})" 1813 | ] 1814 | }, 1815 | { 1816 | "cell_type": "code", 1817 | "execution_count": null, 1818 | "id": "dc933ec1", 1819 | "metadata": { 1820 | "lines_to_next_cell": 0, 1821 | "pycharm": { 1822 | "name": "#%%\n" 1823 | } 1824 | }, 1825 | "outputs": [], 1826 | "source": [ 1827 | "%%timeit\n", 1828 | "# categorical\n", 1829 | "gss2.college_major.isin({'Engineering', 'Computer science', 'Biology'})" 1830 | ] 1831 | }, 1832 | { 1833 | "cell_type": "code", 1834 | "execution_count": null, 1835 | "id": "42a822c2", 1836 | "metadata": { 1837 | "lines_to_next_cell": 2 1838 | }, 1839 | "outputs": [], 1840 | "source": [] 1841 | }, 1842 | { 1843 | "cell_type": "markdown", 1844 | "id": "d56720b4", 1845 | "metadata": {}, 1846 | "source": [ 1847 | "## Apply Exercise\n", 1848 | "* Make a new column called *minutes_worked* derived from the *hours_worked* column.\n", 1849 | "* Make a new column called *income_ratio*.\n", 1850 | " * Convert the income columns to numbers (replace `'No answer'` and `'Refused'` with `np.nan`).\n", 1851 | " * Fill in the missing values with the median\n", 1852 | " * Divide the 2006 value by the 1970 value" 1853 | ] 1854 | }, 1855 | { 1856 | "cell_type": "code", 1857 | "execution_count": null, 1858 | "id": "3b818eb6", 1859 | "metadata": {}, 1860 | "outputs": [], 1861 | "source": [] 1862 | }, 1863 | { 1864 | "cell_type": "code", 1865 | 
"execution_count": null, 1866 | "id": "59ebffeb", 1867 | "metadata": {}, 1868 | "outputs": [], 1869 | "source": [] 1870 | }, 1871 | { 1872 | "cell_type": "code", 1873 | "execution_count": null, 1874 | "id": "9d2a4e10", 1875 | "metadata": {}, 1876 | "outputs": [], 1877 | "source": [] 1878 | }, 1879 | { 1880 | "cell_type": "code", 1881 | "execution_count": null, 1882 | "id": "132efb76", 1883 | "metadata": {}, 1884 | "outputs": [], 1885 | "source": [] 1886 | }, 1887 | { 1888 | "cell_type": "code", 1889 | "execution_count": null, 1890 | "id": "f3631607", 1891 | "metadata": {}, 1892 | "outputs": [], 1893 | "source": [] 1894 | }, 1895 | { 1896 | "cell_type": "code", 1897 | "execution_count": null, 1898 | "id": "b1f627b9", 1899 | "metadata": {}, 1900 | "outputs": [], 1901 | "source": [] 1902 | }, 1903 | { 1904 | "cell_type": "code", 1905 | "execution_count": null, 1906 | "id": "eee8ef40", 1907 | "metadata": {}, 1908 | "outputs": [], 1909 | "source": [] 1910 | }, 1911 | { 1912 | "cell_type": "code", 1913 | "execution_count": null, 1914 | "id": "c19d66ca", 1915 | "metadata": {}, 1916 | "outputs": [], 1917 | "source": [] 1918 | }, 1919 | { 1920 | "cell_type": "code", 1921 | "execution_count": null, 1922 | "id": "85c49f0f", 1923 | "metadata": {}, 1924 | "outputs": [], 1925 | "source": [] 1926 | }, 1927 | { 1928 | "cell_type": "code", 1929 | "execution_count": null, 1930 | "id": "537487a7", 1931 | "metadata": {}, 1932 | "outputs": [], 1933 | "source": [] 1934 | }, 1935 | { 1936 | "cell_type": "code", 1937 | "execution_count": null, 1938 | "id": "f62adb56", 1939 | "metadata": {}, 1940 | "outputs": [], 1941 | "source": [] 1942 | }, 1943 | { 1944 | "cell_type": "code", 1945 | "execution_count": null, 1946 | "id": "8c59b615", 1947 | "metadata": {}, 1948 | "outputs": [], 1949 | "source": [] 1950 | }, 1951 | { 1952 | "cell_type": "code", 1953 | "execution_count": null, 1954 | "id": "55ce9070", 1955 | "metadata": {}, 1956 | "outputs": [], 1957 | "source": [] 1958 | }, 1959 | { 1960 | 
"cell_type": "code", 1961 | "execution_count": null, 1962 | "id": "15e221af", 1963 | "metadata": {}, 1964 | "outputs": [], 1965 | "source": [] 1966 | }, 1967 | { 1968 | "cell_type": "code", 1969 | "execution_count": null, 1970 | "id": "7ed68b41", 1971 | "metadata": {}, 1972 | "outputs": [], 1973 | "source": [] 1974 | }, 1975 | { 1976 | "cell_type": "code", 1977 | "execution_count": null, 1978 | "id": "ae90e79b", 1979 | "metadata": {}, 1980 | "outputs": [], 1981 | "source": [] 1982 | }, 1983 | { 1984 | "cell_type": "code", 1985 | "execution_count": null, 1986 | "id": "96420545", 1987 | "metadata": {}, 1988 | "outputs": [], 1989 | "source": [] 1990 | }, 1991 | { 1992 | "cell_type": "code", 1993 | "execution_count": null, 1994 | "id": "ba8633ef", 1995 | "metadata": {}, 1996 | "outputs": [], 1997 | "source": [] 1998 | }, 1999 | { 2000 | "cell_type": "code", 2001 | "execution_count": null, 2002 | "id": "8993c970", 2003 | "metadata": {}, 2004 | "outputs": [], 2005 | "source": [] 2006 | }, 2007 | { 2008 | "cell_type": "code", 2009 | "execution_count": null, 2010 | "id": "0ca9ac23", 2011 | "metadata": {}, 2012 | "outputs": [], 2013 | "source": [] 2014 | }, 2015 | { 2016 | "cell_type": "code", 2017 | "execution_count": null, 2018 | "id": "139432bb", 2019 | "metadata": {}, 2020 | "outputs": [], 2021 | "source": [] 2022 | }, 2023 | { 2024 | "cell_type": "code", 2025 | "execution_count": null, 2026 | "id": "816e9c31", 2027 | "metadata": {}, 2028 | "outputs": [], 2029 | "source": [] 2030 | }, 2031 | { 2032 | "cell_type": "code", 2033 | "execution_count": null, 2034 | "id": "4f71cd5b", 2035 | "metadata": {}, 2036 | "outputs": [], 2037 | "source": [] 2038 | }, 2039 | { 2040 | "cell_type": "code", 2041 | "execution_count": null, 2042 | "id": "8f4975a6", 2043 | "metadata": {}, 2044 | "outputs": [], 2045 | "source": [] 2046 | }, 2047 | { 2048 | "cell_type": "code", 2049 | "execution_count": null, 2050 | "id": "44aa5162", 2051 | "metadata": {}, 2052 | "outputs": [], 2053 | "source": 
[] 2054 | }, 2055 | { 2056 | "cell_type": "code", 2057 | "execution_count": null, 2058 | "id": "e5702992", 2059 | "metadata": {}, 2060 | "outputs": [], 2061 | "source": [] 2062 | }, 2063 | { 2064 | "cell_type": "code", 2065 | "execution_count": null, 2066 | "id": "c0dead63", 2067 | "metadata": { 2068 | "lines_to_next_cell": 2 2069 | }, 2070 | "outputs": [], 2071 | "source": [] 2072 | }, 2073 | { 2074 | "cell_type": "markdown", 2075 | "id": "e0faa823", 2076 | "metadata": {}, 2077 | "source": [ 2078 | "## Master Aggregation\n", 2079 | "\n", 2080 | "Let's compare age by sex by year...🤔" 2081 | ] 2082 | }, 2083 | { 2084 | "cell_type": "code", 2085 | "execution_count": null, 2086 | "id": "d444c6b8", 2087 | "metadata": { 2088 | "pycharm": { 2089 | "name": "#%%\n" 2090 | } 2091 | }, 2092 | "outputs": [], 2093 | "source": [ 2094 | "(gss2\n", 2095 | " .groupby('year')\n", 2096 | " .mean()\n", 2097 | ")" 2098 | ] 2099 | }, 2100 | { 2101 | "cell_type": "code", 2102 | "execution_count": null, 2103 | "id": "85441b08", 2104 | "metadata": { 2105 | "pycharm": { 2106 | "name": "#%%\n" 2107 | } 2108 | }, 2109 | "outputs": [], 2110 | "source": [ 2111 | "(gss2\n", 2112 | " .groupby('year')\n", 2113 | " .mean(numeric_only=True)\n", 2114 | ")" 2115 | ] 2116 | }, 2117 | { 2118 | "cell_type": "code", 2119 | "execution_count": null, 2120 | "id": "eadbc6cd", 2121 | "metadata": { 2122 | "pycharm": { 2123 | "name": "#%%\n" 2124 | } 2125 | }, 2126 | "outputs": [], 2127 | "source": [ 2128 | "(gss2\n", 2129 | " .groupby('year')\n", 2130 | " [['age', 'hours_worked']]\n", 2131 | " .mean()\n", 2132 | ")" 2133 | ] 2134 | }, 2135 | { 2136 | "cell_type": "code", 2137 | "execution_count": null, 2138 | "id": "a6d008ae", 2139 | "metadata": { 2140 | "pycharm": { 2141 | "name": "#%%\n" 2142 | } 2143 | }, 2144 | "outputs": [], 2145 | "source": [ 2146 | "import matplotlib.pyplot as plt\n", 2147 | "import seaborn as sns\n", 2148 | "#plt.style.use('pandas1book') \n", 2149 | "sns.set_context('talk')\n", 2150 | 
"plt.plot(range(10))" 2151 | ] 2152 | }, 2153 | { 2154 | "cell_type": "code", 2155 | "execution_count": null, 2156 | "id": "ffc36b52", 2157 | "metadata": { 2158 | "pycharm": { 2159 | "name": "#%%\n" 2160 | } 2161 | }, 2162 | "outputs": [], 2163 | "source": [ 2164 | "(gss2\n", 2165 | " .groupby('year')\n", 2166 | " [['age', 'hours_worked']]\n", 2167 | " .median()\n", 2168 | " .plot()\n", 2169 | ")" 2170 | ] 2171 | }, 2172 | { 2173 | "cell_type": "code", 2174 | "execution_count": null, 2175 | "id": "5bdcd3e8", 2176 | "metadata": { 2177 | "pycharm": { 2178 | "name": "#%%\n" 2179 | } 2180 | }, 2181 | "outputs": [], 2182 | "source": [ 2183 | "(gss2\n", 2184 | " .groupby('year')\n", 2185 | " [['age', 'hours_worked']]\n", 2186 | " #.mean()\n", 2187 | " #.median()\n", 2188 | " #.std()\n", 2189 | " .max()\n", 2190 | " .plot()\n", 2191 | ")" 2192 | ] 2193 | }, 2194 | { 2195 | "cell_type": "code", 2196 | "execution_count": null, 2197 | "id": "54ebb97d", 2198 | "metadata": { 2199 | "lines_to_next_cell": 2, 2200 | "pycharm": { 2201 | "name": "#%%\n" 2202 | } 2203 | }, 2204 | "outputs": [], 2205 | "source": [ 2206 | "# add sex\n", 2207 | "(gss2\n", 2208 | " .groupby(['year', 'sex'])\n", 2209 | " [['age', 'hours_worked']]\n", 2210 | " .mean()\n", 2211 | " #.median()\n", 2212 | " #.std()\n", 2213 | " #.max()\n", 2214 | " #.plot()\n", 2215 | ")" 2216 | ] 2217 | }, 2218 | { 2219 | "cell_type": "code", 2220 | "execution_count": null, 2221 | "id": "266a53da", 2222 | "metadata": { 2223 | "lines_to_next_cell": 2, 2224 | "pycharm": { 2225 | "name": "#%%\n" 2226 | } 2227 | }, 2228 | "outputs": [], 2229 | "source": [ 2230 | "# add sex\n", 2231 | "(gss2\n", 2232 | " .groupby(['year', 'sex'])\n", 2233 | " [['age', 'hours_worked']]\n", 2234 | " .mean()\n", 2235 | " #.median()\n", 2236 | " #.std()\n", 2237 | " #.max()\n", 2238 | " .plot()\n", 2239 | ")" 2240 | ] 2241 | }, 2242 | { 2243 | "cell_type": "code", 2244 | "execution_count": null, 2245 | "id": "2e20f409", 2246 | "metadata": { 2247 | 
"lines_to_next_cell": 2, 2248 | "pycharm": { 2249 | "name": "#%%\n" 2250 | } 2251 | }, 2252 | "outputs": [], 2253 | "source": [ 2254 | "# unstack\n", 2255 | "(gss2\n", 2256 | " .groupby(['year', 'sex'])\n", 2257 | " [['age', 'hours_worked']]\n", 2258 | " .mean()\n", 2259 | " #.median()\n", 2260 | " #.std()\n", 2261 | " #.max()\n", 2262 | " .unstack() \n", 2263 | " .plot()\n", 2264 | ")" 2265 | ] 2266 | }, 2267 | { 2268 | "cell_type": "code", 2269 | "execution_count": null, 2270 | "id": "d5481e10", 2271 | "metadata": { 2272 | "lines_to_next_cell": 2, 2273 | "pycharm": { 2274 | "name": "#%%\n" 2275 | } 2276 | }, 2277 | "outputs": [], 2278 | "source": [ 2279 | "(gss2\n", 2280 | " .groupby(['year', 'sex'])\n", 2281 | " [['age', 'hours_worked']]\n", 2282 | " .mean()\n", 2283 | " .unstack()\n", 2284 | " .age\n", 2285 | ")" 2286 | ] 2287 | }, 2288 | { 2289 | "cell_type": "code", 2290 | "execution_count": null, 2291 | "id": "9e01d055", 2292 | "metadata": { 2293 | "lines_to_next_cell": 2, 2294 | "pycharm": { 2295 | "name": "#%%\n" 2296 | } 2297 | }, 2298 | "outputs": [], 2299 | "source": [ 2300 | "(gss2\n", 2301 | " .groupby(['year', 'sex'])\n", 2302 | " [['age', 'hours_worked']]\n", 2303 | " .mean()\n", 2304 | " .unstack()\n", 2305 | " .age\n", 2306 | " .plot()\n", 2307 | " .legend(bbox_to_anchor=(1,1))\n", 2308 | ")" 2309 | ] 2310 | }, 2311 | { 2312 | "cell_type": "code", 2313 | "execution_count": null, 2314 | "id": "d1528728", 2315 | "metadata": { 2316 | "pycharm": { 2317 | "name": "#%%\n" 2318 | } 2319 | }, 2320 | "outputs": [], 2321 | "source": [ 2322 | "# Let's try looking at hours worked\n", 2323 | "(gss2\n", 2324 | " .groupby(['year', 'sex'])\n", 2325 | " [['age', 'hours_worked']]\n", 2326 | " .mean()\n", 2327 | " .unstack()\n", 2328 | " .hours_worked\n", 2329 | " .plot()\n", 2330 | " .legend(bbox_to_anchor=(1,1))\n", 2331 | ")" 2332 | ] 2333 | }, 2334 | { 2335 | "cell_type": "code", 2336 | "execution_count": null, 2337 | "id": "a52537a5", 2338 | "metadata": { 2339 
| "lines_to_next_cell": 2, 2340 | "pycharm": { 2341 | "name": "#%%\n" 2342 | } 2343 | }, 2344 | "outputs": [], 2345 | "source": [ 2346 | "# Multiple aggregates\n", 2347 | "def second(group):\n", 2348 | " return group.iloc[1]\n", 2349 | "(gss2\n", 2350 | " .groupby(['year', 'sex'])\n", 2351 | " [['age', 'hours_worked']]\n", 2352 | " .agg(['min', 'max', 'mean', second])\n", 2353 | " \n", 2354 | ")" 2355 | ] 2356 | }, 2357 | { 2358 | "cell_type": "code", 2359 | "execution_count": null, 2360 | "id": "b780beb4", 2361 | "metadata": { 2362 | "lines_to_next_cell": 2, 2363 | "pycharm": { 2364 | "name": "#%%\n" 2365 | } 2366 | }, 2367 | "outputs": [], 2368 | "source": [] 2369 | }, 2370 | { 2371 | "cell_type": "markdown", 2372 | "id": "9aca44a7", 2373 | "metadata": {}, 2374 | "source": [ 2375 | "## Aggregation Exercise\n", 2376 | "* Which occupation has the highest median hours worked?\n", 2377 | "* Which occupation has the lowest age?\n", 2378 | "* What is the breakdown of respondents by race for each year?\n", 2379 | "* Convert the previous to a percentage.\n", 2380 | "* How many unique occupations are there for each year?\n", 2381 | "* What is the most popular college_major for each year?\n", 2382 | "* What is the second most popular college_major for each year?" 
2383 | ] 2384 | }, 2385 | { 2386 | "cell_type": "code", 2387 | "execution_count": null, 2388 | "id": "9e5477d4", 2389 | "metadata": {}, 2390 | "outputs": [], 2391 | "source": [] 2392 | }, 2393 | { 2394 | "cell_type": "code", 2395 | "execution_count": null, 2396 | "id": "bcc93724", 2397 | "metadata": {}, 2398 | "outputs": [], 2399 | "source": [] 2400 | }, 2401 | { 2402 | "cell_type": "code", 2403 | "execution_count": null, 2404 | "id": "3a7f368e", 2405 | "metadata": {}, 2406 | "outputs": [], 2407 | "source": [] 2408 | }, 2409 | { 2410 | "cell_type": "code", 2411 | "execution_count": null, 2412 | "id": "697919b8", 2413 | "metadata": {}, 2414 | "outputs": [], 2415 | "source": [] 2416 | }, 2417 | { 2418 | "cell_type": "code", 2419 | "execution_count": null, 2420 | "id": "3d93f8db", 2421 | "metadata": {}, 2422 | "outputs": [], 2423 | "source": [] 2424 | }, 2425 | { 2426 | "cell_type": "code", 2427 | "execution_count": null, 2428 | "id": "3907736f", 2429 | "metadata": {}, 2430 | "outputs": [], 2431 | "source": [] 2432 | }, 2433 | { 2434 | "cell_type": "code", 2435 | "execution_count": null, 2436 | "id": "18186540", 2437 | "metadata": {}, 2438 | "outputs": [], 2439 | "source": [] 2440 | }, 2441 | { 2442 | "cell_type": "code", 2443 | "execution_count": null, 2444 | "id": "f1089e32", 2445 | "metadata": {}, 2446 | "outputs": [], 2447 | "source": [] 2448 | }, 2449 | { 2450 | "cell_type": "code", 2451 | "execution_count": null, 2452 | "id": "116593ba", 2453 | "metadata": {}, 2454 | "outputs": [], 2455 | "source": [] 2456 | }, 2457 | { 2458 | "cell_type": "code", 2459 | "execution_count": null, 2460 | "id": "aaa6e44e", 2461 | "metadata": {}, 2462 | "outputs": [], 2463 | "source": [] 2464 | }, 2465 | { 2466 | "cell_type": "code", 2467 | "execution_count": null, 2468 | "id": "648b7e1e", 2469 | "metadata": {}, 2470 | "outputs": [], 2471 | "source": [] 2472 | }, 2473 | { 2474 | "cell_type": "code", 2475 | "execution_count": null, 2476 | "id": "7ee8d6cc", 2477 | "metadata": {}, 2478 
| "outputs": [], 2479 | "source": [] 2480 | }, 2481 | { 2482 | "cell_type": "code", 2483 | "execution_count": null, 2484 | "id": "4f252bd0", 2485 | "metadata": {}, 2486 | "outputs": [], 2487 | "source": [] 2488 | }, 2489 | { 2490 | "cell_type": "code", 2491 | "execution_count": null, 2492 | "id": "b6090240", 2493 | "metadata": {}, 2494 | "outputs": [], 2495 | "source": [] 2496 | }, 2497 | { 2498 | "cell_type": "code", 2499 | "execution_count": null, 2500 | "id": "a37bb2df", 2501 | "metadata": { 2502 | "lines_to_next_cell": 2 2503 | }, 2504 | "outputs": [], 2505 | "source": [] 2506 | }, 2507 | { 2508 | "cell_type": "markdown", 2509 | "id": "7cf8f182", 2510 | "metadata": {}, 2511 | "source": [ 2512 | "## Summary\n", 2513 | "\n", 2514 | "* Correct types save space and enable convenient math, string, and date functionality\n", 2515 | "* Chaining operations will:\n", 2516 | " * Make code readable\n", 2517 | " * Remove bugs\n", 2518 | " * Make debugging easier\n", 2519 | "* Don't mutate (there's no point). Embrace chaining.\n", 2520 | "* ``.apply`` is slow for math\n", 2521 | "* Aggregations are powerful. Play with them until they make sense\n", 2522 | "\n", 2523 | "Follow on Twitter ``@__mharrison__``\n", 2524 | "\n", 2525 | "Book giveaway!" 
2526 | ] 2527 | }, 2528 | { 2529 | "cell_type": "code", 2530 | "execution_count": null, 2531 | "id": "89444b02", 2532 | "metadata": { 2533 | "pycharm": { 2534 | "name": "#%%\n" 2535 | } 2536 | }, 2537 | "outputs": [], 2538 | "source": [ 2539 | "import random\n", 2540 | "random.randrange(1,13)" 2541 | ] 2542 | }, 2543 | { 2544 | "cell_type": "code", 2545 | "execution_count": null, 2546 | "id": "35931834", 2547 | "metadata": { 2548 | "lines_to_next_cell": 2, 2549 | "pycharm": { 2550 | "name": "#%%\n" 2551 | } 2552 | }, 2553 | "outputs": [], 2554 | "source": [] 2555 | }, 2556 | { 2557 | "cell_type": "code", 2558 | "execution_count": null, 2559 | "id": "53ab759b", 2560 | "metadata": { 2561 | "lines_to_next_cell": 2, 2562 | "pycharm": { 2563 | "name": "#%%\n" 2564 | } 2565 | }, 2566 | "outputs": [], 2567 | "source": [] 2568 | }, 2569 | { 2570 | "cell_type": "code", 2571 | "execution_count": null, 2572 | "id": "063efccd", 2573 | "metadata": { 2574 | "lines_to_next_cell": 2, 2575 | "pycharm": { 2576 | "name": "#%%\n" 2577 | } 2578 | }, 2579 | "outputs": [], 2580 | "source": [] 2581 | }, 2582 | { 2583 | "cell_type": "code", 2584 | "execution_count": null, 2585 | "id": "26386f48", 2586 | "metadata": { 2587 | "pycharm": { 2588 | "name": "#%%\n" 2589 | } 2590 | }, 2591 | "outputs": [], 2592 | "source": [] 2593 | } 2594 | ], 2595 | "metadata": { 2596 | "kernelspec": { 2597 | "display_name": "Python 3 (ipykernel)", 2598 | "language": "python", 2599 | "name": "python3" 2600 | } 2601 | }, 2602 | "nbformat": 4, 2603 | "nbformat_minor": 5 2604 | } 2605 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | notebook==7.0.3 2 | pandas==2.1.1 3 | pyarrow==13.0.0 4 | 5 | --------------------------------------------------------------------------------