├── honest.fth ├── pyproject.toml ├── .devcontainer └── devcontainer.json ├── README.md └── pandas-best-practices.ipynb /honest.fth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mattharrison/pearson-pandas-best-practices/main/honest.fth -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pearson-pandas-best-practices" 3 | version = "0.1.0" 4 | requires-python = ">=3.12" 5 | dependencies = [ 6 | "matplotlib>=3.10.5", 7 | "notebook>=7.4.5", 8 | "pandas>=2.3.1", 9 | "pyarrow>=21.0.0", 10 | "seaborn>=0.13.2", 11 | ] 12 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "mcr.microsoft.com/devcontainers/universal:2", 3 | "hostRequirements": { 4 | "cpus": 2 5 | }, 6 | "waitFor": "onCreateCommand", 7 | "updateContentCommand": "", 8 | "postCreateCommand": "python3 -m pip install uv; uv sync", 9 | "customizations": { 10 | "codespaces": { 11 | "openFiles": [] 12 | }, 13 | "vscode": { 14 | "extensions": [ 15 | "ms-toolsai.jupyter", 16 | "ms-python.python" 17 | ], 18 | "features": { 19 | "ghcr.io/va-h/devcontainers-features/uv:1": {} 20 | }, 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pearson-pandas-best-practices 2 | 3 | This course introduces best practices for Pandas. 
4 | 5 | ## Resources 6 | 7 | See the author's book, [Effective Pandas (digital)](https://store.metasnake.com/effective-pandas-book) [(physical)](https://amzn.to/43dt50h) 8 | 9 | ![Effective Pandas](https://d31ezp3r8jwmks.cloudfront.net/3ytw9atdhoe9ezz1i5hctlspkre4) 10 | 11 | ## Running 12 | 13 | You have a few options for running. 14 | The easiest option: 15 | 16 | ### Use Codespaces. 17 | 18 | - Click on the green "Code" button 19 | - Select the "Codespaces" Tab. 20 | - Hit "Create codespace on main" button 21 | - Wait a few minutes for the codespace to load 22 | 23 | ### Run Locally with UV 24 | 25 | - Install UV (https://docs.astral.sh/uv/getting-started/installation/) 26 | - Check out the project 27 | - Change into the project directory 28 | - Run `uv sync` 29 | - Start jupyter `uv run jupyter notebook` 30 | -------------------------------------------------------------------------------- /pandas-best-practices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "88981ab7", 6 | "metadata": {}, 7 | "source": [] 8 | }, 9 | { 10 | "cell_type": "markdown", 11 | "id": "597d4814", 12 | "metadata": { 13 | "lines_to_next_cell": 0, 14 | "pycharm": { 15 | "name": "#%% md\n" 16 | } 17 | }, 18 | "source": [ 19 | "# Pandas Best Practices\n", 20 | "## 5 Tips for Better Pandas Code" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "fe77712e", 27 | "metadata": { 28 | "lines_to_next_cell": 2 29 | }, 30 | "outputs": [], 31 | "source": [] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "6188e883", 36 | "metadata": { 37 | "pycharm": { 38 | "name": "#%% md\n" 39 | } 40 | }, 41 | "source": [ 42 | "## About Matt Harrison @\\_\\_mharrison\\_\\_\n", 43 | "\n", 44 | "* Author of Effective Pandas, Machine Learning Pocket Reference, and Illustrated Guide to Python 3.\n", 45 | "* Advisor at Ponder (creators of Modin)\n", 46 | "* Corporate 
trainer at MetaSnake. Taught Pandas to 1000's of students.\n", 47 | "* Use coupon LIVE for 10% off Effective Pandas book or bundle ( https://store.metasnake.com )" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "8bf569d6", 54 | "metadata": { 55 | "lines_to_next_cell": 2, 56 | "pycharm": { 57 | "name": "#%%\n" 58 | } 59 | }, 60 | "outputs": [], 61 | "source": [] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "68b20887", 67 | "metadata": { 68 | "lines_to_next_cell": 2, 69 | "pycharm": { 70 | "name": "#%%\n" 71 | } 72 | }, 73 | "outputs": [], 74 | "source": [] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "b3fd6901", 80 | "metadata": { 81 | "lines_to_next_cell": 2, 82 | "pycharm": { 83 | "name": "#%%\n" 84 | } 85 | }, 86 | "outputs": [], 87 | "source": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "352e081c", 93 | "metadata": { 94 | "lines_to_next_cell": 2, 95 | "pycharm": { 96 | "name": "#%%\n" 97 | } 98 | }, 99 | "outputs": [], 100 | "source": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "61a77bda", 106 | "metadata": { 107 | "lines_to_next_cell": 2, 108 | "pycharm": { 109 | "name": "#%%\n" 110 | } 111 | }, 112 | "outputs": [], 113 | "source": [] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "c7a1b91f", 119 | "metadata": { 120 | "lines_to_next_cell": 2, 121 | "pycharm": { 122 | "name": "#%%\n" 123 | } 124 | }, 125 | "outputs": [], 126 | "source": [] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "5c67d9ed", 131 | "metadata": { 132 | "pycharm": { 133 | "name": "#%% md\n" 134 | } 135 | }, 136 | "source": [ 137 | "## Practice this on your data with your team!\n", 138 | "* Contact me matt@metasnake.com\n", 139 | "* Follow on Twitter @\\_\\_mharrison\\_\\_" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | 
"execution_count": null, 145 | "id": "b56b65e9", 146 | "metadata": { 147 | "lines_to_next_cell": 2, 148 | "pycharm": { 149 | "name": "#%%\n" 150 | } 151 | }, 152 | "outputs": [], 153 | "source": [] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "id": "7c579886", 159 | "metadata": { 160 | "lines_to_next_cell": 2, 161 | "pycharm": { 162 | "name": "#%%\n" 163 | } 164 | }, 165 | "outputs": [], 166 | "source": [] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "id": "c38061e7", 171 | "metadata": { 172 | "pycharm": { 173 | "name": "#%% md\n" 174 | } 175 | }, 176 | "source": [ 177 | "## Outline\n", 178 | "\n", 179 | "* Load Data\n", 180 | "* Types\n", 181 | "* Chaining\n", 182 | "* Mutation\n", 183 | "* Apply\n", 184 | "* Aggregation" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "dc9d13b9", 190 | "metadata": { 191 | "pycharm": { 192 | "name": "#%% md\n" 193 | } 194 | }, 195 | "source": [ 196 | "## Imports" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 1, 202 | "id": "ec04b162", 203 | "metadata": { 204 | "lines_to_next_cell": 2, 205 | "pycharm": { 206 | "name": "#%%\n" 207 | } 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "%matplotlib inline\n", 212 | "from IPython.display import display\n", 213 | "import numpy as np\n", 214 | "import pandas as pd\n", 215 | "import pyarrow\n", 216 | "\n", 217 | "import io\n", 218 | "import zipfile" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "29ef6997", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "pd.__version__" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "ae401f97", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "pyarrow.__version__" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "id": "1dea3558", 245 | "metadata": {}, 246 | "outputs": [], 247 | 
"source": [] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "id": "1d7ee1c0", 252 | "metadata": {}, 253 | "source": [ 254 | "## Data Preprocessing\n", 255 | "\n", 256 | "Don't run this code. I'm providing it here to show you where the data came from.\n", 257 | "(If you really want to run this download the ZIP file and update the path)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "id": "3a5723a0", 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "# https://gss.norc.org/get-the-data/spss\n", 268 | "# https://gss.norc.org/Documents/spss/gss_spss_with_codebook.zip\n", 269 | "# takes a few minutes on my computer to load\n", 270 | "path = '/mnt/c/Users/matt/Downloads/gss_spss_with_codebook.zip'\n", 271 | "with zipfile.ZipFile(path) as z:\n", 272 | " print(z.namelist())\n", 273 | " with open('gss.sav', mode='bw') as fout:\n", 274 | " fout.write(z.open('GSS7218_R3.sav').read())\n", 275 | " gss = pd.read_spss('gss.sav')" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "id": "f8587141", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "!pip install pyreadstat" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "f4b686fc", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "%%time\n", 296 | "import pyreadstat\n", 297 | "gss, meta = pyreadstat.read_sav('gss.sav')" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "id": "c73cd05b", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "gss.shape" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "id": "1e2e1777", 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "gss.to_feather('gss.fth')" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "id": "46ec0b7c", 324 | "metadata": {}, 325 | "outputs": [], 
326 | "source": [ 327 | "%%time\n", 328 | "raw = pd.read_feather('~/Dropbox/work/jupyter/gss.fth')" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "id": "f029dc45", 335 | "metadata": { 336 | "lines_to_next_cell": 0, 337 | "pycharm": { 338 | "name": "#%%\n" 339 | } 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "raw" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "8cad5ba6", 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "# 6000 columns!\n", 354 | "raw.shape" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "id": "d08680b5", 361 | "metadata": { 362 | "lines_to_next_cell": 0 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n", 367 | " 'INCOME06','HONEST','TICKET']\n", 368 | "\n", 369 | "raw[cols].to_feather('honest.fth')" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "id": "506f2f1d", 376 | "metadata": { 377 | "lines_to_next_cell": 2 378 | }, 379 | "outputs": [], 380 | "source": [] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "id": "5ab74806", 385 | "metadata": {}, 386 | "source": [ 387 | "## Loading Data\n", 388 | "\n", 389 | "This is the data we will be using. Run this code!" 
390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "id": "07444860", 396 | "metadata": { 397 | "lines_to_next_cell": 2, 398 | "pycharm": { 399 | "name": "#%%\n" 400 | } 401 | }, 402 | "outputs": [], 403 | "source": [ 404 | "raw = pd.read_feather('honest.fth', dtype_backend='pyarrow')" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "582255a1", 411 | "metadata": { 412 | "lines_to_next_cell": 2, 413 | "pycharm": { 414 | "name": "#%%\n" 415 | } 416 | }, 417 | "outputs": [], 418 | "source": [] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "id": "db1c15f3", 423 | "metadata": { 424 | "pycharm": { 425 | "name": "#%% md\n" 426 | } 427 | }, 428 | "source": [ 429 | "## My Cleanup\n", 430 | "See GSS_Codebook.pdf for explanation\n", 431 | "\n", 432 | "Columns:\n", 433 | "\n", 434 | "* YEAR\n", 435 | "* ID - RESPONDENT ID NUMBER\n", 436 | "* AGE - AGE OF RESPONDENT\n", 437 | "* HRS1 - NUMBER OF HOURS WORKED LAST WEEK\n", 438 | "* OCC - R'S CENSUS OCCUPATION CODE (1970) - Page 126 (VAR: OCC) see page 125 for notes APPENDIX F,G,H\n", 439 | " Appendix F - Page 3286\n", 440 | "* MAJOR1 - COLLEGE MAJOR 1\n", 441 | "* SEX - RESPONDENTS SEX\n", 442 | "* RACE - RACE OF RESPONDENT\n", 443 | "* BORN - WAS R BORN IN THIS COUNTRY\n", 444 | "* INCOME - TOTAL FAMILY INCOME 1970\n", 445 | "* INCOME06 - TOTAL FAMILY INCOME 2006\n", 446 | "* HONEST - HONEST\n", 447 | "* TICKET - EVER RECEIVED A TRAFFIC TICKET\n" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "id": "65089c43", 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n", 458 | " 'INCOME06','HONEST','TICKET']\n", 459 | "\n", 460 | "raw[cols].isna().mean()*100" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "id": "67f1d8f4", 467 | "metadata": {}, 468 | "outputs": [], 469
| "source": [ 470 | "(raw\n", 471 | " [cols]\n", 472 | " .isna()\n", 473 | " .mean()*100\n", 474 | ")" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "id": "df146d91", 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "MAJOR= '''RESPONSE PUNCH 1972-82 1982B 1983-87 1987B 1988-91 1993-98 2000-04 2006 2008 2010 2012 2014 2016 2018 ALL\n", 485 | "Accounting/bookkeeping 1 0 0 0 0 0 0 0 0 0 0 28 32 30 29 119\n", 486 | "Advertising 2 0 0 0 0 0 0 0 0 0 0 3 2 0 0 5\n", 487 | "Agriculture/horticulture 3 0 0 0 0 0 0 0 0 0 0 8 2 7 5 22\n", 488 | "Allied health 4 0 0 0 0 0 0 0 0 0 0 0 2 1 0 3\n", 489 | "Anthropology 5 0 0 0 0 0 0 0 0 0 0 3 5 1 1 10\n", 490 | "Architecture 6 0 0 0 0 0 0 0 0 0 0 2 3 5 3 13\n", 491 | "Art 7 0 0 0 0 0 0 0 0 0 0 6 7 11 10 34\n", 492 | "Biology 8 0 0 0 0 0 0 0 0 0 0 16 22 33 26 97\n", 493 | "Business administration 9 0 0 0 0 0 0 0 0 0 0 90 142 172 138 542\n", 494 | "Chemistry 11 0 0 0 0 0 0 0 0 0 0 5 8 10 4 27\n", 495 | "Communications/speech 12 0 0 0 0 0 0 0 0 0 0 20 18 26 18 82\n", 496 | "Comm. 
disorders 13 0 0 0 0 0 0 0 0 0 0 4 6 2 2 14\n", 497 | "Computer science 14 0 0 0 0 0 0 0 0 0 0 25 24 33 17 99\n", 498 | "Dentistry 15 0 0 0 0 0 0 0 0 0 0 2 4 3 5 14\n", 499 | "Education 16 0 0 0 0 0 0 0 0 0 0 73 91 97 79 340\n", 500 | "Economics 17 0 0 0 0 0 0 0 0 0 0 11 19 13 19 62\n", 501 | "Engineering 18 0 0 0 0 0 0 0 0 0 0 47 49 47 54 197\n", 502 | "English 19 0 0 0 0 0 0 0 0 0 0 23 26 27 24 100\n", 503 | "Finance 20 0 0 0 0 0 0 0 0 0 0 7 15 14 16 52\n", 504 | "Foreign language 21 0 0 0 0 0 0 0 0 0 0 4 8 6 5 23\n", 505 | "Forestry 22 0 0 0 0 0 0 0 0 0 0 1 0 3 0 4\n", 506 | "Geography 23 0 0 0 0 0 0 0 0 0 0 0 2 2 4 8\n", 507 | "Geology 24 0 0 0 0 0 0 0 0 0 0 1 3 4 2 10\n", 508 | "History 25 0 0 0 0 0 0 0 0 0 0 10 19 14 19 62\n", 509 | "Home economics 26 0 0 0 0 0 0 0 0 0 0 0 0 3 2 5\n", 510 | "Industry & techn 27 0 0 0 0 0 0 0 0 0 0 3 4 6 0 13\n", 511 | "Journalism 28 0 0 0 0 0 0 0 0 0 0 5 6 6 4 21\n", 512 | "Law 29 0 0 0 0 0 0 0 0 0 0 13 18 23 14 68\n", 513 | "Law enforcement 30 0 0 0 0 0 0 0 0 0 0 3 5 4 2 14\n", 514 | "Library science 31 0 0 0 0 0 0 0 0 0 0 4 5 2 3 14\n", 515 | "Marketing 32 0 0 0 0 0 0 0 0 0 0 11 15 13 12 51\n", 516 | "Mathematics 33 0 0 0 0 0 0 0 0 0 0 5 10 12 5 32\n", 517 | "Medicine 34 0 0 0 0 0 0 0 0 0 0 9 25 12 11 57\n", 518 | "Music 35 0 0 0 0 0 0 0 0 0 0 4 2 10 2 18\n", 519 | "Nursing 36 0 0 0 0 0 0 0 0 0 0 36 39 60 51 186\n", 520 | "Optometry 37 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 521 | "Pharmacy 38 0 0 0 0 0 0 0 0 0 0 2 5 4 4 15\n", 522 | "Philosophy 39 0 0 0 0 0 0 0 0 0 0 2 0 2 2 6\n", 523 | "Physical education 40 0 0 0 0 0 0 0 0 0 0 9 6 16 6 37\n", 524 | "Physics 41 0 0 0 0 0 0 0 0 0 0 3 6 7 4 20\n", 525 | "Psychology 42 0 0 0 0 0 0 0 0 0 0 32 32 34 29 127\n", 526 | "Political science/international relations 43 0 0 0 0 0 0 0 0 0 0 16 22 19 14 71\n", 527 | "Sociology 44 0 0 0 0 0 0 0 0 0 0 9 15 10 12 46\n", 528 | "Special education 45 0 0 0 0 0 0 0 0 0 0 5 3 5 2 15\n", 529 | "Theater arts 46 0 0 0 0 0 0 0 0 0 0 6 2 3 1 12\n", 530 | 
"Theology 47 0 0 0 0 0 0 0 0 0 0 6 6 13 8 33\n", 531 | "Veterinary medicine 48 0 0 0 0 0 0 0 0 0 0 1 5 3 4 13\n", 532 | "Liberal arts 49 0 0 0 0 0 0 0 0 0 0 8 16 12 10 46\n", 533 | "Other 50 0 0 0 0 0 0 0 0 0 0 8 10 21 27 66\n", 534 | "General sciences 51 0 0 0 0 0 0 0 0 0 0 10 13 15 14 52\n", 535 | "Social work 52 0 0 0 0 0 0 0 0 0 0 7 17 24 7 55\n", 536 | "General studies 53 0 0 0 0 0 0 0 0 0 0 2 5 7 7 21\n", 537 | "Other vocational 54 0 0 0 0 0 0 0 0 0 0 5 11 6 5 27\n", 538 | "Health 55 0 0 0 0 0 0 0 0 0 0 23 31 31 42 127\n", 539 | "Industrial Relations 56 0 0 0 0 0 0 0 0 0 0 1 0 0 3 4\n", 540 | "Child/Human/Family Development 57 0 0 0 0 0 0 0 0 0 0 11 3 7 7 28\n", 541 | "Food Science/Nutrition/Culinary Arts 58 0 0 0 0 0 0 0 0 0 0 3 6 9 9 27\n", 542 | "Environmental Science/Ecology 59 0 0 0 0 0 0 0 0 0 0 5 5 6 8 24\n", 543 | "Social Sciences 60 0 0 0 0 0 0 0 0 0 0 4 2 7 5 18\n", 544 | "Human Services/Human Resources 61 0 0 0 0 0 0 0 0 0 0 3 7 7 5 22\n", 545 | "Visual Arts/Graphic Design/Design and Drafting 62 0 0 0 0 0 0 0 0 0 0 3 8 9 10 30\n", 546 | "Fine Arts 63 0 0 0 0 0 0 0 0 0 0 4 5 5 6 20\n", 547 | "Humanities 64 0 0 0 0 0 0 0 0 0 0 0 2 0 1 3\n", 548 | "Ethnic studies 65 0 0 0 0 0 0 0 0 0 0 3 1 0 0 4\n", 549 | "Educational administration 66 0 0 0 0 0 0 0 0 0 0 3 4 8 9 24\n", 550 | "Television/Film 67 0 0 0 0 0 0 0 0 0 0 0 2 6 1 9\n", 551 | "Aviation/Aeronatics 68 0 0 0 0 0 0 0 0 0 0 2 1 1 3 7\n", 552 | "Statistics/Biostatistics 69 0 0 0 0 0 0 0 0 0 0 0 0 2 2 4\n", 553 | "Criminology/Criminal Justice 70 0 0 0 0 0 0 0 0 0 0 13 17 17 13 60\n", 554 | "Administrative Science/Public Administration 71 0 0 0 0 0 0 0 0 0 0 2 11 3 5 21\n", 555 | "Electronics 72 0 0 0 0 0 0 0 0 0 0 6 6 5 9 26\n", 556 | "Urban and Regional Planning 73 0 0 0 0 0 0 0 0 0 0 1 1 3 2 7\n", 557 | "Mechanics/Machine Trade 74 0 0 0 0 0 0 0 0 0 0 0 1 1 4 6\n", 558 | "Dance 75 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n", 559 | "Gerontology 76 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n", 560 | "Public Relations 77 0 
0 0 0 0 0 0 0 0 0 3 1 2 1 7\n", 561 | "Textiles/Cloth 78 0 0 0 0 0 0 0 0 0 0 3 4 0 0 7\n", 562 | "Parks and Recreation 79 0 0 0 0 0 0 0 0 0 0 1 2 1 0 4\n", 563 | "Information Technology 80 0 0 0 0 0 0 0 0 0 0 0 5 8 11 24\n", 564 | "Fashion 81 0 0 0 0 0 0 0 0 0 0 0 0 3 1 4\n", 565 | "Counseling 82 0 0 0 0 0 0 0 0 0 0 0 0 11 9 20\n", 566 | "Don't know/UNCODED 98 0 0 0 0 0 0 0 0 0 0 2 3 0 0 5\n", 567 | "No answer 99 0 0 0 0 0 0 0 0 0 0 0 1 5 3 9\n", 568 | "Not applicable 0 13626 354 7542 353 5907 10334 8394 4510 2023 2044 1263 1597 1795 1435 61177'''\n", 569 | "\n", 570 | "# copy paste slight tweak from page 186\n", 571 | "major_dict = {int(row.split()[-16]): ' '.join(row.split()[:-16]) for row in MAJOR.split('\\n')[1:]}\n", 572 | "major_dict" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": null, 578 | "id": "b9d6c34d", 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "raw.MAJOR1.value_counts()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "id": "74652b6d", 589 | "metadata": {}, 590 | "outputs": [], 591 | "source": [ 592 | "(raw\n", 593 | " [cols]\n", 594 | " .assign(\n", 595 | " MAJOR1=raw.MAJOR1.fillna(99).astype('int').replace(major_dict),\n", 596 | " SEX=raw.SEX#\n", 597 | " \n", 598 | " .astype(int)\n", 599 | " .replace({1:'Male', 2:'Female'}),\n", 600 | " RACE=raw.RACE.astype(int).replace({1:'White', 2:'Black', 3:'Other'}),\n", 601 | " OCC=raw.OCC.fillna(9999).astype(int),\n", 602 | " BORN=raw.BORN.fillna(4).astype(int).replace({1:'Yes', 2:'No', 3:'Don\\'t Know',\n", 603 | " 4:'No answer', 5:'Not applicable'}),\n", 604 | " INCOME=raw.INCOME.fillna(99).astype(int).replace({99:'No answer', **dict(enumerate(['Not applicable',\n", 605 | " 0,1000,3000,4000,5000,6000,\n", 606 | " 7000,8000,10000,15000,20000,25000,]))}),\n", 607 | " INCOME06=raw.INCOME06.fillna(26).astype(int).replace({26:'Refused', **dict(enumerate(['Not applicable',\n", 608 | " 0,1000,3000,4000,5000,6000,\n", 
609 | " 7000,8000,10000,12500,15000,\n", 610 | " 17500,20000,22500,25000,30_000,\n", 611 | " 35_000, 40_000, 50_000, 60_000,\n", 612 | " 75_000, 90_000, 110_000, 130_000,\n", 613 | " 150_000]))}),\n", 614 | " HONEST=raw.HONEST.fillna(9).astype(int).replace({1:'Most desirable', 2:'3 most desireable',\n", 615 | " 3:'Not mentioned', 4: '3 least desireable',\n", 616 | " 5: 'One least desireable',\n", 617 | " 9:'No answer'}),\n", 618 | " TICKET=raw.TICKET.fillna(9).astype(int).replace({1:'Yes', 2:'No', 3:'Refused', 9: 'No answer'}),\n", 619 | " )\n", 620 | " .astype({'YEAR':int, 'ID': 'uint16[pyarrow]'})\n", 621 | " .to_csv('GSS.csv')\n", 622 | ")" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "id": "a14afd45", 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "id": "ce8f0020", 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "id": "043a0085", 644 | "metadata": {}, 645 | "source": [ 646 | "## Types\n", 647 | "Getting the right types will enable analysis and correctness.\n" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "id": "5d6c6cd5", 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "%%time\n", 658 | "gss = pd.read_csv('GSS.csv', index_col=0, dtype_backend='pyarrow', engine='pyarrow')" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": null, 664 | "id": "d26b8f6c", 665 | "metadata": { 666 | "pycharm": { 667 | "name": "#%%\n" 668 | } 669 | }, 670 | "outputs": [], 671 | "source": [ 672 | "gss.dtypes" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "id": "19890585", 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "gss" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "id": "852115fe", 689 
| "metadata": { 690 | "pycharm": { 691 | "name": "#%%\n" 692 | } 693 | }, 694 | "outputs": [], 695 | "source": [ 696 | "gss.memory_usage(deep=True)" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "id": "b5cfc13b", 703 | "metadata": { 704 | "pycharm": { 705 | "name": "#%%\n" 706 | } 707 | }, 708 | "outputs": [], 709 | "source": [ 710 | "# 36 M (pandas 1)\n", 711 | "# 8.6 M (Pandas 2)\n", 712 | "gss.memory_usage(deep=True).sum()" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": null, 718 | "id": "f1d1b51d", 719 | "metadata": { 720 | "lines_to_next_cell": 2, 721 | "pycharm": { 722 | "name": "#%%\n" 723 | } 724 | }, 725 | "outputs": [], 726 | "source": [] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "id": "5fcab8c0", 731 | "metadata": { 732 | "pycharm": { 733 | "name": "#%% md\n" 734 | } 735 | }, 736 | "source": [ 737 | "## Ints" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": null, 743 | "id": "ad4eddc7", 744 | "metadata": { 745 | "pycharm": { 746 | "name": "#%%\n" 747 | } 748 | }, 749 | "outputs": [], 750 | "source": [ 751 | "gss.select_dtypes(int).describe()" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "id": "ac323e5e", 758 | "metadata": { 759 | "pycharm": { 760 | "name": "#%%\n" 761 | } 762 | }, 763 | "outputs": [], 764 | "source": [ 765 | "# chaining\n", 766 | "(gss\n", 767 | " .select_dtypes(int)\n", 768 | " .describe()\n", 769 | ")" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "id": "2351d051", 776 | "metadata": { 777 | "pycharm": { 778 | "name": "#%%\n" 779 | } 780 | }, 781 | "outputs": [], 782 | "source": [ 783 | "# can YEAR be an int8?\n", 784 | "# Do tab completion on np.int to list the sized types (the bare np.int alias was removed in NumPy 1.24)\n", 785 | "np.iinfo(np.int8)" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": null, 791 | "id": "323df8fb", 792 | "metadata": { 793 | "pycharm": { 794 | "name": "#%%\n"
795 | } 796 | }, 797 | "outputs": [], 798 | "source": [ 799 | "np.iinfo(np.uint8)" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": null, 805 | "id": "bb063be4", 806 | "metadata": { 807 | "pycharm": { 808 | "name": "#%%\n" 809 | } 810 | }, 811 | "outputs": [], 812 | "source": [ 813 | "np.iinfo(np.uint16)" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": null, 819 | "id": "d0fab927", 820 | "metadata": { 821 | "pycharm": { 822 | "name": "#%%\n" 823 | } 824 | }, 825 | "outputs": [], 826 | "source": [ 827 | "# chaining\n", 828 | "(gss\n", 829 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' })\n", 830 | " .select_dtypes(['uint16'])\n", 831 | " .describe()\n", 832 | ")" 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": null, 838 | "id": "a1d0ed15", 839 | "metadata": { 840 | "lines_to_next_cell": 2, 841 | "pycharm": { 842 | "name": "#%%\n" 843 | } 844 | }, 845 | "outputs": [], 846 | "source": [ 847 | "# chaining\n", 848 | "# use 'integer' to see all int-like columns\n", 849 | "(gss\n", 850 | " .astype({#'YEAR': 'uint16[pyarrow]',\n", 851 | " 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n", 852 | " .select_dtypes(['integer']) # see https://numpy.org/doc/stable/reference/arrays.scalars.html\n", 853 | " .describe()\n", 854 | ")" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "id": "c5d4c3e1", 861 | "metadata": { 862 | "lines_to_next_cell": 2, 863 | "pycharm": { 864 | "name": "#%%\n" 865 | } 866 | }, 867 | "outputs": [], 868 | "source": [ 869 | "# Inspect memory usage\n", 870 | "(gss\n", 871 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n", 872 | " .memory_usage(deep=True)\n", 873 | " .sum() # was 36M\n", 874 | ")" 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": null, 880 | "id": "8ad6e733", 881 | "metadata": { 882 |
"lines_to_next_cell": 2, 883 | "pycharm": { 884 | "name": "#%%\n" 885 | } 886 | }, 887 | "outputs": [], 888 | "source": [] 889 | }, 890 | { 891 | "cell_type": "markdown", 892 | "id": "f339194e", 893 | "metadata": {}, 894 | "source": [ 895 | "## Int Exercise\n", 896 | "* Try converting *YEAR* to `'int8'`. What do the values look like?\n", 897 | "* Try converting *YEAR* to `'int8[pyarrow]'`. What do the values look like?" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": null, 903 | "id": "908545d1", 904 | "metadata": {}, 905 | "outputs": [], 906 | "source": [] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": null, 911 | "id": "18a3bf52", 912 | "metadata": {}, 913 | "outputs": [], 914 | "source": [] 915 | }, 916 | { 917 | "cell_type": "markdown", 918 | "id": "b09f89c6", 919 | "metadata": {}, 920 | "source": [ 921 | "## Floats" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": null, 927 | "id": "e7fed87e", 928 | "metadata": { 929 | "pycharm": { 930 | "name": "#%%\n" 931 | } 932 | }, 933 | "outputs": [], 934 | "source": [ 935 | "(gss\n", 936 | ".select_dtypes('float'))" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": null, 942 | "id": "49265726", 943 | "metadata": { 944 | "pycharm": { 945 | "name": "#%%\n" 946 | } 947 | }, 948 | "outputs": [], 949 | "source": [ 950 | "# surprise! age and hours worked looks int-like\n", 951 | "gss.HRS1.describe()" 952 | ] 953 | }, 954 | { 955 | "cell_type": "code", 956 | "execution_count": null, 957 | "id": "cd39df3c", 958 | "metadata": { 959 | "pycharm": { 960 | "name": "#%%\n" 961 | } 962 | }, 963 | "outputs": [], 964 | "source": [ 965 | "# opps! 
missing values\n", 966 | "gss.HRS1.value_counts(dropna=False)" 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": null, 972 | "id": "31a67da2", 973 | "metadata": { 974 | "pycharm": { 975 | "name": "#%%\n" 976 | } 977 | }, 978 | "outputs": [], 979 | "source": [ 980 | "# where are they missing?\n", 981 | "(gss\n", 982 | " .query('HRS1.isna()')\n", 983 | ")" 984 | ] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": null, 989 | "id": "e697d070", 990 | "metadata": { 991 | "pycharm": { 992 | "name": "#%%\n" 993 | } 994 | }, 995 | "outputs": [], 996 | "source": [ 997 | "# where are they missing?\n", 998 | "(gss\n", 999 | " .query('AGE.isna()')\n", 1000 | ")" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": null, 1006 | "id": "a9166e1a", 1007 | "metadata": { 1008 | "pycharm": { 1009 | "name": "#%%\n" 1010 | } 1011 | }, 1012 | "outputs": [], 1013 | "source": [ 1014 | "# where are they missing?\n", 1015 | "# It turns out that ID is not consistent across years\n", 1016 | "(gss\n", 1017 | " .query('ID == 229')\n", 1018 | ")" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": null, 1024 | "id": "81a8c902", 1025 | "metadata": { 1026 | "lines_to_next_cell": 2, 1027 | "pycharm": { 1028 | "name": "#%%\n" 1029 | } 1030 | }, 1031 | "outputs": [], 1032 | "source": [ 1033 | "# Convert to integers\n", 1034 | "(gss\n", 1035 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1036 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n", 1037 | ")" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": null, 1043 | "id": "d3388e2c", 1044 | "metadata": { 1045 | "lines_to_next_cell": 2, 1046 | "pycharm": { 1047 | "name": "#%%\n" 1048 | } 1049 | }, 1050 | "outputs": [], 1051 | "source": [ 1052 | "(gss\n", 1053 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1054 | " 
'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n", 1055 | " .memory_usage(deep=True)\n", 1056 | " .sum() # was 36M \n", 1057 | ")" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "code", 1062 | "execution_count": null, 1063 | "id": "9bb70ac2", 1064 | "metadata": { 1065 | "lines_to_next_cell": 2, 1066 | "pycharm": { 1067 | "name": "#%%\n" 1068 | } 1069 | }, 1070 | "outputs": [], 1071 | "source": [] 1072 | }, 1073 | { 1074 | "cell_type": "markdown", 1075 | "id": "75bfd716", 1076 | "metadata": {}, 1077 | "source": [ 1078 | "## Float Exercise\n", 1079 | "\n", 1080 | "* What is the mean of the numeric columns?\n", 1081 | "* How many values are missing in the numeric columns?" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "code", 1086 | "execution_count": null, 1087 | "id": "e3e30c4e", 1088 | "metadata": { 1089 | "lines_to_next_cell": 2 1090 | }, 1091 | "outputs": [], 1092 | "source": [] 1093 | }, 1094 | { 1095 | "cell_type": "markdown", 1096 | "id": "a136fe09", 1097 | "metadata": {}, 1098 | "source": [ 1099 | "## Objects" 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "execution_count": null, 1105 | "id": "4f8b0477", 1106 | "metadata": { 1107 | "pycharm": { 1108 | "name": "#%%\n" 1109 | } 1110 | }, 1111 | "outputs": [], 1112 | "source": [ 1113 | "# pandas 1.x\n", 1114 | "(gss\n", 1115 | " .select_dtypes(object)\n", 1116 | ")" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "id": "f80da8d2", 1123 | "metadata": { 1124 | "pycharm": { 1125 | "name": "#%%\n" 1126 | } 1127 | }, 1128 | "outputs": [], 1129 | "source": [ 1130 | "# pandas 2\n", 1131 | "(gss\n", 1132 | " .select_dtypes('string') # str doesn't work\n", 1133 | ")" 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "code", 1138 | "execution_count": null, 1139 | "id": "7f762143", 1140 | "metadata": { 1141 | "pycharm": { 1142 | "name": "#%%\n" 1143 | } 1144 | }, 1145 | "outputs": [], 1146 | "source": [ 1147 | "# My goto method - .value_counts\n", 1148 | 
"# looks categorical\n", 1149 | "(gss.MAJOR1.value_counts(dropna=False))" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "code", 1154 | "execution_count": null, 1155 | "id": "55c21c7a", 1156 | "metadata": { 1157 | "lines_to_next_cell": 2, 1158 | "pycharm": { 1159 | "name": "#%%\n" 1160 | } 1161 | }, 1162 | "outputs": [], 1163 | "source": [ 1164 | "(gss\n", 1165 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1166 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1167 | " 'MAJOR1': 'category'})\n", 1168 | " .memory_usage(deep=True)\n", 1169 | " .sum() # was 36M \n", 1170 | ")" 1171 | ] 1172 | }, 1173 | { 1174 | "cell_type": "code", 1175 | "execution_count": null, 1176 | "id": "69969c1b", 1177 | "metadata": {}, 1178 | "outputs": [], 1179 | "source": [ 1180 | "(gss\n", 1181 | " .select_dtypes(object)\n", 1182 | " .columns\n", 1183 | ")" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "code", 1188 | "execution_count": null, 1189 | "id": "f5d51601", 1190 | "metadata": { 1191 | "lines_to_next_cell": 0, 1192 | "pycharm": { 1193 | "name": "#%%\n" 1194 | } 1195 | }, 1196 | "outputs": [], 1197 | "source": [ 1198 | "# wow!\n", 1199 | "(gss\n", 1200 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1201 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1202 | " 'MAJOR1': 'category',\n", 1203 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1204 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}}) \n", 1205 | " .memory_usage(deep=True)\n", 1206 | " .sum() # was 36M \n", 1207 | ")" 1208 | ] 1209 | }, 1210 | { 1211 | "cell_type": "code", 1212 | "execution_count": null, 1213 | "id": "17206364", 1214 | "metadata": {}, 1215 | "outputs": [], 1216 | "source": [] 1217 | }, 1218 | { 1219 | "cell_type": "code", 1220 | "execution_count": null, 1221 | "id": "50bf3fa6", 1222 | "metadata": { 1223 | "lines_to_next_cell": 2 1224 | }, 1225 | "outputs": [], 1226 | "source": [] 
1227 | }, 1228 | { 1229 | "cell_type": "markdown", 1230 | "id": "246041ae", 1231 | "metadata": {}, 1232 | "source": [ 1233 | "## Category Exercises\n", 1234 | "* There is a `.cat` attribute on the category columns. What can you do with this attribute? (Use `dir` or tab completion to inspect).\n", 1235 | "* Categories can be ordered. How do you order *INCOME*?\n", 1236 | "* Order the *HONEST* column." 1237 | ] 1238 | }, 1239 | { 1240 | "cell_type": "code", 1241 | "execution_count": null, 1242 | "id": "f543c52c", 1243 | "metadata": {}, 1244 | "outputs": [], 1245 | "source": [] 1246 | }, 1247 | { 1248 | "cell_type": "code", 1249 | "execution_count": null, 1250 | "id": "338e5ba3", 1251 | "metadata": {}, 1252 | "outputs": [], 1253 | "source": [] 1254 | }, 1255 | { 1256 | "cell_type": "code", 1257 | "execution_count": null, 1258 | "id": "3fb313d6", 1259 | "metadata": {}, 1260 | "outputs": [], 1261 | "source": [] 1262 | }, 1263 | { 1264 | "cell_type": "code", 1265 | "execution_count": null, 1266 | "id": "f1d75a84", 1267 | "metadata": {}, 1268 | "outputs": [], 1269 | "source": [] 1270 | }, 1271 | { 1272 | "cell_type": "code", 1273 | "execution_count": null, 1274 | "id": "5c87e18d", 1275 | "metadata": {}, 1276 | "outputs": [], 1277 | "source": [] 1278 | }, 1279 | { 1280 | "cell_type": "code", 1281 | "execution_count": null, 1282 | "id": "85aaccbb", 1283 | "metadata": {}, 1284 | "outputs": [], 1285 | "source": [] 1286 | }, 1287 | { 1288 | "cell_type": "code", 1289 | "execution_count": null, 1290 | "id": "8a321513", 1291 | "metadata": {}, 1292 | "outputs": [], 1293 | "source": [] 1294 | }, 1295 | { 1296 | "cell_type": "markdown", 1297 | "id": "8af7a3d4", 1298 | "metadata": {}, 1299 | "source": [ 1300 | "## Make a Function" 1301 | ] 1302 | }, 1303 | { 1304 | "cell_type": "code", 1305 | "execution_count": null, 1306 | "id": "cb9a32b3", 1307 | "metadata": { 1308 | "lines_to_next_cell": 2, 1309 | "pycharm": { 1310 | "name": "#%%\n" 1311 | } 1312 | }, 1313 | "outputs": [], 1314 | 
"source": [ 1315 | "# a glorious function\n", 1316 | "# add ordered categories to this\n", 1317 | "def tweak_gss(gss):\n", 1318 | " return (gss\n", 1319 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1320 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1321 | " 'MAJOR1': 'category',\n", 1322 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1323 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1324 | " )\n", 1325 | "\n", 1326 | "tweak_gss(gss)" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "markdown", 1331 | "id": "1c615739", 1332 | "metadata": {}, 1333 | "source": [ 1334 | "## Function Exercise\n", 1335 | "* Rearrange your notebook. Put the imports, code to load raw data, and tweak function at the top of the notebook. Restart the kernel and validate that your code works." 1336 | ] 1337 | }, 1338 | { 1339 | "cell_type": "code", 1340 | "execution_count": null, 1341 | "id": "c61b9f0a", 1342 | "metadata": {}, 1343 | "outputs": [], 1344 | "source": [] 1345 | }, 1346 | { 1347 | "cell_type": "code", 1348 | "execution_count": null, 1349 | "id": "6589902c", 1350 | "metadata": {}, 1351 | "outputs": [], 1352 | "source": [] 1353 | }, 1354 | { 1355 | "cell_type": "markdown", 1356 | "id": "b350e12e", 1357 | "metadata": { 1358 | "lines_to_next_cell": 2 1359 | }, 1360 | "source": [ 1361 | "## Fix Column Names" 1362 | ] 1363 | }, 1364 | { 1365 | "cell_type": "code", 1366 | "execution_count": null, 1367 | "id": "99b39238", 1368 | "metadata": { 1369 | "lines_to_next_cell": 0, 1370 | "pycharm": { 1371 | "name": "#%%\n" 1372 | } 1373 | }, 1374 | "outputs": [], 1375 | "source": [ 1376 | "# a glorious function\n", 1377 | "def tweak_gss(gss):\n", 1378 | " return (gss\n", 1379 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1380 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1381 | " 'MAJOR1': 'category',\n", 1382 | " **{col: 'category' for col in ['SEX', 
'RACE', 'BORN', \n", 1383 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1384 | " .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n", 1385 | " 'HRS1': 'hours_worked', 'OCC': 'occupation', \n", 1386 | " 'MAJOR1': 'college_major', 'SEX':'sex', \n", 1387 | " 'RACE':'race', 'BORN':'born_in_US',\n", 1388 | " 'INCOME':'income_1970', 'INCOME06': 'income_2006',\n", 1389 | " 'HONEST':'honesty_rank',\n", 1390 | " 'TICKET':'traffic_ticket'})\n", 1391 | " )\n", 1392 | "\n", 1393 | "tweak_gss(gss)" 1394 | ] 1395 | }, 1396 | { 1397 | "cell_type": "code", 1398 | "execution_count": null, 1399 | "id": "84ecc0de", 1400 | "metadata": { 1401 | "lines_to_next_cell": 2, 1402 | "pycharm": { 1403 | "name": "#%%\n" 1404 | } 1405 | }, 1406 | "outputs": [], 1407 | "source": [] 1408 | }, 1409 | { 1410 | "cell_type": "code", 1411 | "execution_count": null, 1412 | "id": "bf14ec3f", 1413 | "metadata": { 1414 | "lines_to_next_cell": 2, 1415 | "pycharm": { 1416 | "name": "#%%\n" 1417 | } 1418 | }, 1419 | "outputs": [], 1420 | "source": [] 1421 | }, 1422 | { 1423 | "cell_type": "markdown", 1424 | "id": "003b96b9", 1425 | "metadata": { 1426 | "pycharm": { 1427 | "name": "#%% md\n" 1428 | } 1429 | }, 1430 | "source": [ 1431 | "## Chain\n", 1432 | "\n", 1433 | "Chaining is also called \"flow\" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.\n", 1434 | "\n", 1435 | "The chain should read like a recipe of ordered steps.\n", 1436 | "\n", 1437 | "(BTW, this is actually what we did above.)\n", 1438 | "\n", 1439 | "
\n", 1440 | " Hint: Leverage .pipe if you can't find a way to chain 😉🐼💪\n", 1441 | "
\n", 1442 | " \n", 1443 | "\n", 1444 | "\n" 1445 | ] 1446 | }, 1447 | { 1448 | "cell_type": "code", 1449 | "execution_count": null, 1450 | "id": "a74cd1a9", 1451 | "metadata": { 1452 | "lines_to_next_cell": 0, 1453 | "pycharm": { 1454 | "name": "#%%\n" 1455 | } 1456 | }, 1457 | "outputs": [], 1458 | "source": [ 1459 | "# a glorious function\n", 1460 | "def tweak_gss(gss):\n", 1461 | " return (gss\n", 1462 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1463 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1464 | " 'MAJOR1': 'category',\n", 1465 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1466 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1467 | " .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n", 1468 | " 'HRS1': 'hours_worked', 'OCC': 'occupation', \n", 1469 | " 'MAJOR1': 'college_major', 'SEX':'sex', \n", 1470 | " 'RACE':'race', 'BORN':'born_in_US',\n", 1471 | " 'INCOME':'income_1970', 'INCOME06': 'income_2006',\n", 1472 | " 'HONEST':'honesty_rank',\n", 1473 | " 'TICKET':'traffic_ticket'})\n", 1474 | " )\n", 1475 | "\n", 1476 | "tweak_gss(gss)" 1477 | ] 1478 | }, 1479 | { 1480 | "cell_type": "code", 1481 | "execution_count": null, 1482 | "id": "efc594da", 1483 | "metadata": { 1484 | "pycharm": { 1485 | "name": "#%%\n" 1486 | } 1487 | }, 1488 | "outputs": [], 1489 | "source": [ 1490 | "# compare chain to this mess\n", 1491 | "gss2 = gss.copy()\n", 1492 | "year = gss.YEAR\n", 1493 | "year_int = year.astype('uint16')\n", 1494 | "gss2['year'] = year_int\n", 1495 | "id = gss.ID\n", 1496 | "id_int = id.astype('uint16')\n", 1497 | "gss2['year_id'] = id_int\n", 1498 | "occ = gss.OCC\n", 1499 | "occ_int = occ.astype('uint16')\n", 1500 | "gss2['occupation'] = occ_int\n", 1501 | "\n", 1502 | "# more of this" 1503 | ] 1504 | }, 1505 | { 1506 | "cell_type": "code", 1507 | "execution_count": null, 1508 | "id": "32411eaf", 1509 | "metadata": { 1510 | "lines_to_next_cell": 0, 1511 
| "pycharm": { 1512 | "name": "#%%\n" 1513 | } 1514 | }, 1515 | "outputs": [], 1516 | "source": [ 1517 | "# easy to debug\n", 1518 | "# - assign to var (df3)\n", 1519 | "# - comment out\n", 1520 | "# - pipe to display\n", 1521 | "\n", 1522 | "\n", 1523 | "from IPython.display import display\n", 1524 | "\n", 1525 | "def get_var(df, var_name):\n", 1526 | " globals()[var_name] = df\n", 1527 | " return df\n", 1528 | "\n", 1529 | "def tweak_gss(gss):\n", 1530 | " return (gss\n", 1531 | " .pipe(get_var, 'df3') \n", 1532 | " .pipe(lambda df: print(df.shape) or df) \n", 1533 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1534 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1535 | " 'MAJOR1': 'category',\n", 1536 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1537 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1538 | " .pipe(lambda df: print(df.shape) or df) \n", 1539 | " .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n", 1540 | " 'HRS1': 'hours_worked', 'OCC': 'occupation', \n", 1541 | " 'MAJOR1': 'college_major', 'SEX':'sex', \n", 1542 | " 'RACE':'race', 'BORN':'born_in_US',\n", 1543 | " 'INCOME':'income_1970', 'INCOME06': 'income_2006',\n", 1544 | " 'HONEST':'honesty_rank',\n", 1545 | " 'TICKET':'traffic_ticket'})\n", 1546 | " .pipe(lambda df: print(df.shape) or df) \n", 1547 | " )\n", 1548 | "\n", 1549 | "tweak_gss(gss)" 1550 | ] 1551 | }, 1552 | { 1553 | "cell_type": "code", 1554 | "execution_count": null, 1555 | "id": "fdc2894e", 1556 | "metadata": { 1557 | "pycharm": { 1558 | "name": "#%%\n" 1559 | } 1560 | }, 1561 | "outputs": [], 1562 | "source": [ 1563 | "# inspect intermediate data frame\n", 1564 | "df3" 1565 | ] 1566 | }, 1567 | { 1568 | "cell_type": "markdown", 1569 | "id": "1842701c", 1570 | "metadata": { 1571 | "pycharm": { 1572 | "name": "#%%\n" 1573 | } 1574 | }, 1575 | "source": [ 1576 | "## Chain Exercise\n", 1577 | "* Write a function that accepts a 
dataframe and an index value. It should print any rows that match the index and return the dataframe that was passed in.\n", 1578 | "* Use the function with pipe after each step of the chain. Show the rows for index 2 and 64,813.\n", 1579 | "\n", 1580 | "\n", 1581 | "\n", 1582 | "\n", 1583 | "\n", 1584 | "\n", 1585 | "\n", 1586 | "\n", 1587 | "## Don't Mutate\n", 1588 | "\n", 1589 | "> \"you are missing the point, inplace rarely actually does something inplace, you are thinking that you are saving memory but you are not.\"\n", 1590 | ">\n", 1591 | "> **jreback** - Pandas core dev\n", 1592 | "\n", 1593 | "\n", 1594 | "\n", 1595 | "https://github.com/pandas-dev/pandas/issues/16529#issuecomment-676518136\n", 1596 | "\n", 1597 | "* In general, no performance benefits\n", 1598 | "* Prohibits chaining\n", 1599 | "* ``SettingWithCopyWarning`` fun\n" 1600 | ] 1601 | }, 1602 | { 1603 | "cell_type": "code", 1604 | "execution_count": null, 1605 | "id": "9b1955ed", 1606 | "metadata": { 1607 | "lines_to_next_cell": 2, 1608 | "pycharm": { 1609 | "name": "#%%\n" 1610 | } 1611 | }, 1612 | "outputs": [], 1613 | "source": [ 1614 | "pd.read_csv??" 
1615 | ] 1616 | }, 1617 | { 1618 | "cell_type": "code", 1619 | "execution_count": null, 1620 | "id": "bce7abe3", 1621 | "metadata": { 1622 | "lines_to_next_cell": 2, 1623 | "pycharm": { 1624 | "name": "#%%\n" 1625 | } 1626 | }, 1627 | "outputs": [], 1628 | "source": [] 1629 | }, 1630 | { 1631 | "cell_type": "code", 1632 | "execution_count": null, 1633 | "id": "4e6a8e2f", 1634 | "metadata": { 1635 | "lines_to_next_cell": 2, 1636 | "pycharm": { 1637 | "name": "#%%\n" 1638 | } 1639 | }, 1640 | "outputs": [], 1641 | "source": [] 1642 | }, 1643 | { 1644 | "cell_type": "markdown", 1645 | "id": "2a263d38", 1646 | "metadata": { 1647 | "pycharm": { 1648 | "name": "#%% md\n" 1649 | } 1650 | }, 1651 | "source": [ 1652 | "## Don't Apply (if you can)" 1653 | ] 1654 | }, 1655 | { 1656 | "cell_type": "code", 1657 | "execution_count": null, 1658 | "id": "9e68b584", 1659 | "metadata": { 1660 | "lines_to_next_cell": 0, 1661 | "pycharm": { 1662 | "name": "#%%\n" 1663 | } 1664 | }, 1665 | "outputs": [], 1666 | "source": [ 1667 | "# a glorious function\n", 1668 | "def tweak_gss(gss):\n", 1669 | " return (gss\n", 1670 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1671 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1672 | " 'MAJOR1': 'category',\n", 1673 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1674 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1675 | " .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n", 1676 | " 'HRS1': 'hours_worked', 'OCC': 'occupation', \n", 1677 | " 'MAJOR1': 'college_major', 'SEX':'sex', \n", 1678 | " 'RACE':'race', 'BORN':'born_in_US',\n", 1679 | " 'INCOME':'income_1970', 'INCOME06': 'income_2006',\n", 1680 | " 'HONEST':'honesty_rank',\n", 1681 | " 'TICKET':'traffic_ticket'})\n", 1682 | " )\n", 1683 | "\n", 1684 | "gss2 = tweak_gss(gss)" 1685 | ] 1686 | }, 1687 | { 1688 | "cell_type": "code", 1689 | "execution_count": null, 1690 | "id": "1a82332f", 1691 | 
"metadata": { 1692 | "pycharm": { 1693 | "name": "#%%\n" 1694 | } 1695 | }, 1696 | "outputs": [], 1697 | "source": [ 1698 | "# convert age to months\n", 1699 | "def to_months(val):\n", 1700 | " return val * 12\n", 1701 | "\n", 1702 | "gss2.age.apply(to_months)" 1703 | ] 1704 | }, 1705 | { 1706 | "cell_type": "code", 1707 | "execution_count": null, 1708 | "id": "a221e972", 1709 | "metadata": { 1710 | "pycharm": { 1711 | "name": "#%%\n" 1712 | } 1713 | }, 1714 | "outputs": [], 1715 | "source": [ 1716 | "# this gives the same results\n", 1717 | "gss2.age * 12" 1718 | ] 1719 | }, 1720 | { 1721 | "cell_type": "code", 1722 | "execution_count": null, 1723 | "id": "9cb2b9d2", 1724 | "metadata": { 1725 | "pycharm": { 1726 | "name": "#%%\n" 1727 | } 1728 | }, 1729 | "outputs": [], 1730 | "source": [ 1731 | "%%timeit\n", 1732 | "gss2.age.apply(to_months)" 1733 | ] 1734 | }, 1735 | { 1736 | "cell_type": "code", 1737 | "execution_count": null, 1738 | "id": "51bcc862", 1739 | "metadata": { 1740 | "pycharm": { 1741 | "name": "#%%\n" 1742 | } 1743 | }, 1744 | "outputs": [], 1745 | "source": [ 1746 | "%%timeit\n", 1747 | "gss2.age * 12" 1748 | ] 1749 | }, 1750 | { 1751 | "cell_type": "code", 1752 | "execution_count": null, 1753 | "id": "72a01657", 1754 | "metadata": { 1755 | "pycharm": { 1756 | "name": "#%%\n" 1757 | } 1758 | }, 1759 | "outputs": [], 1760 | "source": [ 1761 | "# ~42x slower!\n", 1762 | "4_590 / 110" 1763 | ] 1764 | }, 1765 | { 1766 | "cell_type": "code", 1767 | "execution_count": null, 1768 | "id": "619094f7", 1769 | "metadata": {}, 1770 | "outputs": [], 1771 | "source": [ 1772 | "gss.MAJOR1.value_counts()[:20]" 1773 | ] 1774 | }, 1775 | { 1776 | "cell_type": "code", 1777 | "execution_count": null, 1778 | "id": "f4817aee", 1779 | "metadata": { 1780 | "pycharm": { 1781 | "name": "#%%\n" 1782 | } 1783 | }, 1784 | "outputs": [], 1785 | "source": [ 1786 | "def is_science(val):\n", 1787 | " return val in {'Engineering', 'Computer science', 'Biology'}" 1788 | ] 1789 | 
}, 1790 | { 1791 | "cell_type": "code", 1792 | "execution_count": null, 1793 | "id": "f00a069c", 1794 | "metadata": { 1795 | "pycharm": { 1796 | "name": "#%%\n" 1797 | } 1798 | }, 1799 | "outputs": [], 1800 | "source": [ 1801 | "%%timeit\n", 1802 | "# string\n", 1803 | "gss.MAJOR1.apply(is_science)" 1804 | ] 1805 | }, 1806 | { 1807 | "cell_type": "code", 1808 | "execution_count": null, 1809 | "id": "5e13ae10", 1810 | "metadata": { 1811 | "pycharm": { 1812 | "name": "#%%\n" 1813 | } 1814 | }, 1815 | "outputs": [], 1816 | "source": [ 1817 | "%%timeit\n", 1818 | "gss.MAJOR1.isin({'Engineering', 'Computer science', 'Biology'})" 1819 | ] 1820 | }, 1821 | { 1822 | "cell_type": "code", 1823 | "execution_count": null, 1824 | "id": "dc933ec1", 1825 | "metadata": { 1826 | "lines_to_next_cell": 0, 1827 | "pycharm": { 1828 | "name": "#%%\n" 1829 | } 1830 | }, 1831 | "outputs": [], 1832 | "source": [ 1833 | "%%timeit\n", 1834 | "# categorical\n", 1835 | "gss2.college_major.isin({'Engineering', 'Computer science', 'Biology'})" 1836 | ] 1837 | }, 1838 | { 1839 | "cell_type": "code", 1840 | "execution_count": null, 1841 | "id": "42a822c2", 1842 | "metadata": { 1843 | "lines_to_next_cell": 2 1844 | }, 1845 | "outputs": [], 1846 | "source": [] 1847 | }, 1848 | { 1849 | "cell_type": "markdown", 1850 | "id": "d56720b4", 1851 | "metadata": {}, 1852 | "source": [ 1853 | "## Apply Exercise\n", 1854 | "* Make a new column called *minutes_worked* derived from the *hours_worked* column.\n", 1855 | "* Make a new column called *income_ratio*.\n", 1856 | " * Convert the income columns to numbers (replace `'No answer'` and `'Refused'` with `np.nan`).\n", 1857 | " * Fill in the missing values with the median\n", 1858 | " * Divide the 2006 value by the 1970 value" 1859 | ] 1860 | }, 1861 | { 1862 | "cell_type": "code", 1863 | "execution_count": null, 1864 | "id": "3b818eb6", 1865 | "metadata": {}, 1866 | "outputs": [], 1867 | "source": [] 1868 | }, 1869 | { 1870 | "cell_type": "code", 1871 | 
"execution_count": null, 1872 | "id": "59ebffeb", 1873 | "metadata": {}, 1874 | "outputs": [], 1875 | "source": [] 1876 | }, 1877 | { 1878 | "cell_type": "code", 1879 | "execution_count": null, 1880 | "id": "9d2a4e10", 1881 | "metadata": {}, 1882 | "outputs": [], 1883 | "source": [] 1884 | }, 1885 | { 1886 | "cell_type": "code", 1887 | "execution_count": null, 1888 | "id": "132efb76", 1889 | "metadata": {}, 1890 | "outputs": [], 1891 | "source": [] 1892 | }, 1893 | { 1894 | "cell_type": "code", 1895 | "execution_count": null, 1896 | "id": "f3631607", 1897 | "metadata": {}, 1898 | "outputs": [], 1899 | "source": [] 1900 | }, 1901 | { 1902 | "cell_type": "code", 1903 | "execution_count": null, 1904 | "id": "b1f627b9", 1905 | "metadata": {}, 1906 | "outputs": [], 1907 | "source": [] 1908 | }, 1909 | { 1910 | "cell_type": "code", 1911 | "execution_count": null, 1912 | "id": "eee8ef40", 1913 | "metadata": {}, 1914 | "outputs": [], 1915 | "source": [] 1916 | }, 1917 | { 1918 | "cell_type": "code", 1919 | "execution_count": null, 1920 | "id": "c19d66ca", 1921 | "metadata": {}, 1922 | "outputs": [], 1923 | "source": [] 1924 | }, 1925 | { 1926 | "cell_type": "code", 1927 | "execution_count": null, 1928 | "id": "85c49f0f", 1929 | "metadata": {}, 1930 | "outputs": [], 1931 | "source": [] 1932 | }, 1933 | { 1934 | "cell_type": "code", 1935 | "execution_count": null, 1936 | "id": "537487a7", 1937 | "metadata": {}, 1938 | "outputs": [], 1939 | "source": [] 1940 | }, 1941 | { 1942 | "cell_type": "code", 1943 | "execution_count": null, 1944 | "id": "f62adb56", 1945 | "metadata": {}, 1946 | "outputs": [], 1947 | "source": [] 1948 | }, 1949 | { 1950 | "cell_type": "code", 1951 | "execution_count": null, 1952 | "id": "8c59b615", 1953 | "metadata": {}, 1954 | "outputs": [], 1955 | "source": [] 1956 | }, 1957 | { 1958 | "cell_type": "code", 1959 | "execution_count": null, 1960 | "id": "55ce9070", 1961 | "metadata": {}, 1962 | "outputs": [], 1963 | "source": [] 1964 | }, 1965 | { 1966 | 
"cell_type": "code", 1967 | "execution_count": null, 1968 | "id": "15e221af", 1969 | "metadata": {}, 1970 | "outputs": [], 1971 | "source": [] 1972 | }, 1973 | { 1974 | "cell_type": "code", 1975 | "execution_count": null, 1976 | "id": "7ed68b41", 1977 | "metadata": {}, 1978 | "outputs": [], 1979 | "source": [] 1980 | }, 1981 | { 1982 | "cell_type": "code", 1983 | "execution_count": null, 1984 | "id": "ae90e79b", 1985 | "metadata": {}, 1986 | "outputs": [], 1987 | "source": [] 1988 | }, 1989 | { 1990 | "cell_type": "code", 1991 | "execution_count": null, 1992 | "id": "96420545", 1993 | "metadata": {}, 1994 | "outputs": [], 1995 | "source": [] 1996 | }, 1997 | { 1998 | "cell_type": "code", 1999 | "execution_count": null, 2000 | "id": "ba8633ef", 2001 | "metadata": {}, 2002 | "outputs": [], 2003 | "source": [] 2004 | }, 2005 | { 2006 | "cell_type": "code", 2007 | "execution_count": null, 2008 | "id": "8993c970", 2009 | "metadata": {}, 2010 | "outputs": [], 2011 | "source": [] 2012 | }, 2013 | { 2014 | "cell_type": "code", 2015 | "execution_count": null, 2016 | "id": "0ca9ac23", 2017 | "metadata": {}, 2018 | "outputs": [], 2019 | "source": [] 2020 | }, 2021 | { 2022 | "cell_type": "code", 2023 | "execution_count": null, 2024 | "id": "139432bb", 2025 | "metadata": {}, 2026 | "outputs": [], 2027 | "source": [] 2028 | }, 2029 | { 2030 | "cell_type": "code", 2031 | "execution_count": null, 2032 | "id": "816e9c31", 2033 | "metadata": {}, 2034 | "outputs": [], 2035 | "source": [] 2036 | }, 2037 | { 2038 | "cell_type": "code", 2039 | "execution_count": null, 2040 | "id": "4f71cd5b", 2041 | "metadata": {}, 2042 | "outputs": [], 2043 | "source": [] 2044 | }, 2045 | { 2046 | "cell_type": "code", 2047 | "execution_count": null, 2048 | "id": "8f4975a6", 2049 | "metadata": {}, 2050 | "outputs": [], 2051 | "source": [] 2052 | }, 2053 | { 2054 | "cell_type": "code", 2055 | "execution_count": null, 2056 | "id": "44aa5162", 2057 | "metadata": {}, 2058 | "outputs": [], 2059 | "source": 
[] 2060 | }, 2061 | { 2062 | "cell_type": "code", 2063 | "execution_count": null, 2064 | "id": "e5702992", 2065 | "metadata": {}, 2066 | "outputs": [], 2067 | "source": [] 2068 | }, 2069 | { 2070 | "cell_type": "code", 2071 | "execution_count": null, 2072 | "id": "c0dead63", 2073 | "metadata": { 2074 | "lines_to_next_cell": 2 2075 | }, 2076 | "outputs": [], 2077 | "source": [] 2078 | }, 2079 | { 2080 | "cell_type": "markdown", 2081 | "id": "e0faa823", 2082 | "metadata": {}, 2083 | "source": [ 2084 | "## Master Aggregation\n", 2085 | "\n", 2086 | "Let's compare age by sex by year...🤔" 2087 | ] 2088 | }, 2089 | { 2090 | "cell_type": "code", 2091 | "execution_count": null, 2092 | "id": "d444c6b8", 2093 | "metadata": { 2094 | "pycharm": { 2095 | "name": "#%%\n" 2096 | } 2097 | }, 2098 | "outputs": [], 2099 | "source": [ 2100 | "(gss2\n", 2101 | " .groupby('year')\n", 2102 | " .mean()\n", 2103 | ")" 2104 | ] 2105 | }, 2106 | { 2107 | "cell_type": "code", 2108 | "execution_count": null, 2109 | "id": "85441b08", 2110 | "metadata": { 2111 | "pycharm": { 2112 | "name": "#%%\n" 2113 | } 2114 | }, 2115 | "outputs": [], 2116 | "source": [ 2117 | "(gss2\n", 2118 | " .groupby('year')\n", 2119 | " .mean(numeric_only=True)\n", 2120 | ")" 2121 | ] 2122 | }, 2123 | { 2124 | "cell_type": "code", 2125 | "execution_count": null, 2126 | "id": "eadbc6cd", 2127 | "metadata": { 2128 | "pycharm": { 2129 | "name": "#%%\n" 2130 | } 2131 | }, 2132 | "outputs": [], 2133 | "source": [ 2134 | "(gss2\n", 2135 | " .groupby('year')\n", 2136 | " [['age', 'hours_worked']]\n", 2137 | " .mean()\n", 2138 | ")" 2139 | ] 2140 | }, 2141 | { 2142 | "cell_type": "code", 2143 | "execution_count": null, 2144 | "id": "a6d008ae", 2145 | "metadata": { 2146 | "pycharm": { 2147 | "name": "#%%\n" 2148 | } 2149 | }, 2150 | "outputs": [], 2151 | "source": [ 2152 | "import matplotlib.pyplot as plt\n", 2153 | "import seaborn as sns\n", 2154 | "\n", 2155 | "sns.set_context('talk')\n", 2156 | "plt.plot(range(10))" 2157 | ] 
2158 | }, 2159 | { 2160 | "cell_type": "code", 2161 | "execution_count": null, 2162 | "id": "ffc36b52", 2163 | "metadata": { 2164 | "pycharm": { 2165 | "name": "#%%\n" 2166 | } 2167 | }, 2168 | "outputs": [], 2169 | "source": [ 2170 | "(gss2\n", 2171 | " .groupby('year')\n", 2172 | " [['age', 'hours_worked']]\n", 2173 | " .median()\n", 2174 | " .plot()\n", 2175 | ")" 2176 | ] 2177 | }, 2178 | { 2179 | "cell_type": "code", 2180 | "execution_count": null, 2181 | "id": "5bdcd3e8", 2182 | "metadata": { 2183 | "pycharm": { 2184 | "name": "#%%\n" 2185 | } 2186 | }, 2187 | "outputs": [], 2188 | "source": [ 2189 | "(gss2\n", 2190 | " .groupby('year')\n", 2191 | " [['age', 'hours_worked']]\n", 2192 | " #.mean()\n", 2193 | " #.median()\n", 2194 | " #.std()\n", 2195 | " .max()\n", 2196 | " .plot()\n", 2197 | ")" 2198 | ] 2199 | }, 2200 | { 2201 | "cell_type": "code", 2202 | "execution_count": null, 2203 | "id": "54ebb97d", 2204 | "metadata": { 2205 | "lines_to_next_cell": 2, 2206 | "pycharm": { 2207 | "name": "#%%\n" 2208 | } 2209 | }, 2210 | "outputs": [], 2211 | "source": [ 2212 | "# add sex\n", 2213 | "(gss2\n", 2214 | " .groupby(['year', 'sex'])\n", 2215 | " [['age', 'hours_worked']]\n", 2216 | " .mean()\n", 2217 | " #.median()\n", 2218 | " #.std()\n", 2219 | " #.max()\n", 2220 | " #.plot()\n", 2221 | ")" 2222 | ] 2223 | }, 2224 | { 2225 | "cell_type": "code", 2226 | "execution_count": null, 2227 | "id": "266a53da", 2228 | "metadata": { 2229 | "lines_to_next_cell": 2, 2230 | "pycharm": { 2231 | "name": "#%%\n" 2232 | } 2233 | }, 2234 | "outputs": [], 2235 | "source": [ 2236 | "# add sex\n", 2237 | "(gss2\n", 2238 | " .groupby(['year', 'sex'])\n", 2239 | " [['age', 'hours_worked']]\n", 2240 | " .mean()\n", 2241 | " #.median()\n", 2242 | " #.std()\n", 2243 | " #.max()\n", 2244 | " .plot()\n", 2245 | ")" 2246 | ] 2247 | }, 2248 | { 2249 | "cell_type": "code", 2250 | "execution_count": null, 2251 | "id": "2e20f409", 2252 | "metadata": { 2253 | "lines_to_next_cell": 2, 2254 | 
"pycharm": { 2255 | "name": "#%%\n" 2256 | } 2257 | }, 2258 | "outputs": [], 2259 | "source": [ 2260 | "# unstack\n", 2261 | "(gss2\n", 2262 | " .groupby(['year', 'sex'])\n", 2263 | " [['age', 'hours_worked']]\n", 2264 | " .mean()\n", 2265 | " #.median()\n", 2266 | " #.std()\n", 2267 | " #.max()\n", 2268 | " .unstack() \n", 2269 | " .plot()\n", 2270 | ")" 2271 | ] 2272 | }, 2273 | { 2274 | "cell_type": "code", 2275 | "execution_count": null, 2276 | "id": "d5481e10", 2277 | "metadata": { 2278 | "lines_to_next_cell": 2, 2279 | "pycharm": { 2280 | "name": "#%%\n" 2281 | } 2282 | }, 2283 | "outputs": [], 2284 | "source": [ 2285 | "(gss2\n", 2286 | " .groupby(['year', 'sex'])\n", 2287 | " [['age', 'hours_worked']]\n", 2288 | " .mean()\n", 2289 | " .unstack()\n", 2290 | " .age\n", 2291 | ")" 2292 | ] 2293 | }, 2294 | { 2295 | "cell_type": "code", 2296 | "execution_count": null, 2297 | "id": "9e01d055", 2298 | "metadata": { 2299 | "lines_to_next_cell": 2, 2300 | "pycharm": { 2301 | "name": "#%%\n" 2302 | } 2303 | }, 2304 | "outputs": [], 2305 | "source": [ 2306 | "(gss2\n", 2307 | " .groupby(['year', 'sex'])\n", 2308 | " [['age', 'hours_worked']]\n", 2309 | " .mean()\n", 2310 | " .unstack()\n", 2311 | " .age\n", 2312 | " .plot()\n", 2313 | " .legend(bbox_to_anchor=(1,1))\n", 2314 | ")" 2315 | ] 2316 | }, 2317 | { 2318 | "cell_type": "code", 2319 | "execution_count": null, 2320 | "id": "d1528728", 2321 | "metadata": { 2322 | "pycharm": { 2323 | "name": "#%%\n" 2324 | } 2325 | }, 2326 | "outputs": [], 2327 | "source": [ 2328 | "# Let's try looking at hours worked\n", 2329 | "(gss2\n", 2330 | " .groupby(['year', 'sex'])\n", 2331 | " [['age', 'hours_worked']]\n", 2332 | " .mean()\n", 2333 | " .unstack()\n", 2334 | " .hours_worked\n", 2335 | " .plot()\n", 2336 | " .legend(bbox_to_anchor=(1,1))\n", 2337 | ")" 2338 | ] 2339 | }, 2340 | { 2341 | "cell_type": "code", 2342 | "execution_count": null, 2343 | "id": "a52537a5", 2344 | "metadata": { 2345 | "lines_to_next_cell": 2, 2346 
| "pycharm": { 2347 | "name": "#%%\n" 2348 | } 2349 | }, 2350 | "outputs": [], 2351 | "source": [ 2352 | "# Multiple aggregates\n", 2353 | "def second(group):\n", 2354 | " return group.iloc[1]\n", 2355 | "(gss2\n", 2356 | " .groupby(['year', 'sex'])\n", 2357 | " [['age', 'hours_worked']]\n", 2358 | " .agg(['min', 'max', 'mean', second])\n", 2359 | " \n", 2360 | ")" 2361 | ] 2362 | }, 2363 | { 2364 | "cell_type": "code", 2365 | "execution_count": null, 2366 | "id": "b780beb4", 2367 | "metadata": { 2368 | "lines_to_next_cell": 2, 2369 | "pycharm": { 2370 | "name": "#%%\n" 2371 | } 2372 | }, 2373 | "outputs": [], 2374 | "source": [] 2375 | }, 2376 | { 2377 | "cell_type": "markdown", 2378 | "id": "9aca44a7", 2379 | "metadata": {}, 2380 | "source": [ 2381 | "## Aggregation Exercise\n", 2382 | "* Which occupation has the highest median hours worked?\n", 2383 | "* Which occupation has the lowest age?\n", 2384 | "* What is the breakdown of respondents by race for each year?\n", 2385 | "* Convert the previous to a percentage.\n", 2386 | "* How many unique occupations are there for each year?\n", 2387 | "* What is the most popular college_major for each year?\n", 2388 | "* What is the second most popular college_major for each year?" 
2389 | ] 2390 | }, 2391 | { 2392 | "cell_type": "code", 2393 | "execution_count": null, 2394 | "id": "9e5477d4", 2395 | "metadata": {}, 2396 | "outputs": [], 2397 | "source": [] 2398 | }, 2399 | { 2400 | "cell_type": "code", 2401 | "execution_count": null, 2402 | "id": "bcc93724", 2403 | "metadata": {}, 2404 | "outputs": [], 2405 | "source": [] 2406 | }, 2407 | { 2408 | "cell_type": "code", 2409 | "execution_count": null, 2410 | "id": "3a7f368e", 2411 | "metadata": {}, 2412 | "outputs": [], 2413 | "source": [] 2414 | }, 2415 | { 2416 | "cell_type": "code", 2417 | "execution_count": null, 2418 | "id": "697919b8", 2419 | "metadata": {}, 2420 | "outputs": [], 2421 | "source": [] 2422 | }, 2423 | { 2424 | "cell_type": "code", 2425 | "execution_count": null, 2426 | "id": "3d93f8db", 2427 | "metadata": {}, 2428 | "outputs": [], 2429 | "source": [] 2430 | }, 2431 | { 2432 | "cell_type": "code", 2433 | "execution_count": null, 2434 | "id": "3907736f", 2435 | "metadata": {}, 2436 | "outputs": [], 2437 | "source": [] 2438 | }, 2439 | { 2440 | "cell_type": "code", 2441 | "execution_count": null, 2442 | "id": "18186540", 2443 | "metadata": {}, 2444 | "outputs": [], 2445 | "source": [] 2446 | }, 2447 | { 2448 | "cell_type": "code", 2449 | "execution_count": null, 2450 | "id": "f1089e32", 2451 | "metadata": {}, 2452 | "outputs": [], 2453 | "source": [] 2454 | }, 2455 | { 2456 | "cell_type": "code", 2457 | "execution_count": null, 2458 | "id": "116593ba", 2459 | "metadata": {}, 2460 | "outputs": [], 2461 | "source": [] 2462 | }, 2463 | { 2464 | "cell_type": "code", 2465 | "execution_count": null, 2466 | "id": "aaa6e44e", 2467 | "metadata": {}, 2468 | "outputs": [], 2469 | "source": [] 2470 | }, 2471 | { 2472 | "cell_type": "code", 2473 | "execution_count": null, 2474 | "id": "648b7e1e", 2475 | "metadata": {}, 2476 | "outputs": [], 2477 | "source": [] 2478 | }, 2479 | { 2480 | "cell_type": "code", 2481 | "execution_count": null, 2482 | "id": "7ee8d6cc", 2483 | "metadata": {}, 2484 
| "outputs": [], 2485 | "source": [] 2486 | }, 2487 | { 2488 | "cell_type": "code", 2489 | "execution_count": null, 2490 | "id": "4f252bd0", 2491 | "metadata": {}, 2492 | "outputs": [], 2493 | "source": [] 2494 | }, 2495 | { 2496 | "cell_type": "code", 2497 | "execution_count": null, 2498 | "id": "b6090240", 2499 | "metadata": {}, 2500 | "outputs": [], 2501 | "source": [] 2502 | }, 2503 | { 2504 | "cell_type": "code", 2505 | "execution_count": null, 2506 | "id": "a37bb2df", 2507 | "metadata": { 2508 | "lines_to_next_cell": 2 2509 | }, 2510 | "outputs": [], 2511 | "source": [] 2512 | }, 2513 | { 2514 | "cell_type": "markdown", 2515 | "id": "7cf8f182", 2516 | "metadata": {}, 2517 | "source": [ 2518 | "## Summary\n", 2519 | "\n", 2520 | "* Correct types save space and enable convenient math, string, and date functionality\n", 2521 | "* Chaining operations will:\n", 2522 | " * Make code readable\n", 2523 | " * Remove bugs\n", 2524 | " * Make debugging easier\n", 2525 | "* Don't mutate (there's no point). Embrace chaining.\n", 2526 | "* ``.apply`` is slow for math\n", 2527 | "* Aggregations are powerful. 
Play with them until they make sense\n", 2528 | "\n", 2529 | "Follow on LinkedIn/X|Twitter/Bsky ``@__mharrison__``\n", 2530 | "\n" 2531 | ] 2532 | }, 2533 | { 2534 | "cell_type": "code", 2535 | "execution_count": null, 2536 | "id": "35931834", 2537 | "metadata": { 2538 | "lines_to_next_cell": 2, 2539 | "pycharm": { 2540 | "name": "#%%\n" 2541 | } 2542 | }, 2543 | "outputs": [], 2544 | "source": [] 2545 | }, 2546 | { 2547 | "cell_type": "code", 2548 | "execution_count": null, 2549 | "id": "53ab759b", 2550 | "metadata": { 2551 | "lines_to_next_cell": 2, 2552 | "pycharm": { 2553 | "name": "#%%\n" 2554 | } 2555 | }, 2556 | "outputs": [], 2557 | "source": [] 2558 | }, 2559 | { 2560 | "cell_type": "code", 2561 | "execution_count": null, 2562 | "id": "063efccd", 2563 | "metadata": { 2564 | "lines_to_next_cell": 2, 2565 | "pycharm": { 2566 | "name": "#%%\n" 2567 | } 2568 | }, 2569 | "outputs": [], 2570 | "source": [] 2571 | }, 2572 | { 2573 | "cell_type": "code", 2574 | "execution_count": null, 2575 | "id": "26386f48", 2576 | "metadata": { 2577 | "pycharm": { 2578 | "name": "#%%\n" 2579 | } 2580 | }, 2581 | "outputs": [], 2582 | "source": [] 2583 | } 2584 | ], 2585 | "metadata": { 2586 | "kernelspec": { 2587 | "display_name": "pearson-pandas-best-practices", 2588 | "language": "python", 2589 | "name": "python3" 2590 | }, 2591 | "language_info": { 2592 | "codemirror_mode": { 2593 | "name": "ipython", 2594 | "version": 3 2595 | }, 2596 | "file_extension": ".py", 2597 | "mimetype": "text/x-python", 2598 | "name": "python", 2599 | "nbconvert_exporter": "python", 2600 | "pygments_lexer": "ipython3", 2601 | "version": "3.12.1" 2602 | } 2603 | }, 2604 | "nbformat": 4, 2605 | "nbformat_minor": 5 2606 | } 2607 | --------------------------------------------------------------------------------