├── .devcontainer
    └── devcontainer.json
├── GSS.csv
├── README.md
├── honest.fth
├── pandas-best-practices.ipynb
└── requirements.txt


/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "image": "mcr.microsoft.com/devcontainers/universal:2",
 3 |     "hostRequirements": {
 4 |         "cpus": 2
 5 |     },
 6 |     "waitFor": "onCreateCommand",
 7 |     "updateContentCommand": "python3 -m pip install -r requirements.txt",
 8 |     "postCreateCommand": "",
 9 |     "customizations": {
10 |         "codespaces": {
11 |             "openFiles": []
12 |         },
13 |         "vscode": {
14 |             "extensions": [
15 |                 "ms-toolsai.jupyter",
16 |                 "ms-python.python"
17 |             ]
18 |         }
19 |     }
20 | }
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pearson-pandas-best-practices
 2 | 
 3 | This course introduces best practices for Pandas.
 4 | 
 5 | ## Resources
 6 | 
 7 | See the author's book, [Effective Pandas (digital)](https://store.metasnake.com/effective-pandas-book) [(physical)](https://amzn.to/43dt50h)
 8 | 
 9 | ![Effective Pandas](https://d31ezp3r8jwmks.cloudfront.net/3ytw9atdhoe9ezz1i5hctlspkre4)
10 | 
11 | 


--------------------------------------------------------------------------------
/honest.fth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattharrison/pearson-pandas-best-practices/f67fb49f132784eb09152f3e169196ea26e0cbb4/honest.fth


--------------------------------------------------------------------------------
/pandas-best-practices.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "id": "597d4814",
   6 |    "metadata": {
   7 |     "lines_to_next_cell": 0,
   8 |     "pycharm": {
   9 |      "name": "#%% md\n"
  10 |     }
  11 |    },
  12 |    "source": [
  13 |     "# Pandas Best Practices\n",
  14 |     "## 5 Tips for Better Pandas Code"
  15 |    ]
  16 |   },
  17 |   {
  18 |    "cell_type": "code",
  19 |    "execution_count": null,
  20 |    "id": "fe77712e",
  21 |    "metadata": {
  22 |     "lines_to_next_cell": 2
  23 |    },
  24 |    "outputs": [],
  25 |    "source": []
  26 |   },
  27 |   {
  28 |    "cell_type": "markdown",
  29 |    "id": "6188e883",
  30 |    "metadata": {
  31 |     "pycharm": {
  32 |      "name": "#%% md\n"
  33 |     }
  34 |    },
  35 |    "source": [
  36 |     "## About Matt  Harrison @\\_\\_mharrison\\_\\_\n",
  37 |     "\n",
  38 |     "* Author of Effective Pandas, Machine Learning Pocket Reference, and Illustrated Guide to Python 3.\n",
  39 |     "* Advisor at Ponder (creators of Modin)\n",
  40 |     "* Corporate trainer at MetaSnake. Taught Pandas to 1000s of students.\n",
  41 |     "* Use coupon LIVE for 10% off Effective Pandas book or bundle ( https://store.metasnake.com )"
  42 |    ]
  43 |   },
  44 |   {
  45 |    "cell_type": "code",
  46 |    "execution_count": null,
  47 |    "id": "8bf569d6",
  48 |    "metadata": {
  49 |     "lines_to_next_cell": 2,
  50 |     "pycharm": {
  51 |      "name": "#%%\n"
  52 |     }
  53 |    },
  54 |    "outputs": [],
  55 |    "source": []
  56 |   },
  57 |   {
  58 |    "cell_type": "code",
  59 |    "execution_count": null,
  60 |    "id": "68b20887",
  61 |    "metadata": {
  62 |     "lines_to_next_cell": 2,
  63 |     "pycharm": {
  64 |      "name": "#%%\n"
  65 |     }
  66 |    },
  67 |    "outputs": [],
  68 |    "source": []
  69 |   },
  70 |   {
  71 |    "cell_type": "code",
  72 |    "execution_count": null,
  73 |    "id": "b3fd6901",
  74 |    "metadata": {
  75 |     "lines_to_next_cell": 2,
  76 |     "pycharm": {
  77 |      "name": "#%%\n"
  78 |     }
  79 |    },
  80 |    "outputs": [],
  81 |    "source": []
  82 |   },
  83 |   {
  84 |    "cell_type": "code",
  85 |    "execution_count": null,
  86 |    "id": "352e081c",
  87 |    "metadata": {
  88 |     "lines_to_next_cell": 2,
  89 |     "pycharm": {
  90 |      "name": "#%%\n"
  91 |     }
  92 |    },
  93 |    "outputs": [],
  94 |    "source": []
  95 |   },
  96 |   {
  97 |    "cell_type": "code",
  98 |    "execution_count": null,
  99 |    "id": "61a77bda",
 100 |    "metadata": {
 101 |     "lines_to_next_cell": 2,
 102 |     "pycharm": {
 103 |      "name": "#%%\n"
 104 |     }
 105 |    },
 106 |    "outputs": [],
 107 |    "source": []
 108 |   },
 109 |   {
 110 |    "cell_type": "code",
 111 |    "execution_count": null,
 112 |    "id": "c7a1b91f",
 113 |    "metadata": {
 114 |     "lines_to_next_cell": 2,
 115 |     "pycharm": {
 116 |      "name": "#%%\n"
 117 |     }
 118 |    },
 119 |    "outputs": [],
 120 |    "source": []
 121 |   },
 122 |   {
 123 |    "cell_type": "markdown",
 124 |    "id": "5c67d9ed",
 125 |    "metadata": {
 126 |     "pycharm": {
 127 |      "name": "#%% md\n"
 128 |     }
 129 |    },
 130 |    "source": [
 131 |     "## Practice this on your data with your team!\n",
 132 |     "* Contact me matt@metasnake.com\n",
 133 |     "* Follow on Twitter @\\_\\_mharrison\\_\\_"
 134 |    ]
 135 |   },
 136 |   {
 137 |    "cell_type": "code",
 138 |    "execution_count": null,
 139 |    "id": "b56b65e9",
 140 |    "metadata": {
 141 |     "lines_to_next_cell": 2,
 142 |     "pycharm": {
 143 |      "name": "#%%\n"
 144 |     }
 145 |    },
 146 |    "outputs": [],
 147 |    "source": []
 148 |   },
 149 |   {
 150 |    "cell_type": "code",
 151 |    "execution_count": null,
 152 |    "id": "7c579886",
 153 |    "metadata": {
 154 |     "lines_to_next_cell": 2,
 155 |     "pycharm": {
 156 |      "name": "#%%\n"
 157 |     }
 158 |    },
 159 |    "outputs": [],
 160 |    "source": []
 161 |   },
 162 |   {
 163 |    "cell_type": "markdown",
 164 |    "id": "c38061e7",
 165 |    "metadata": {
 166 |     "pycharm": {
 167 |      "name": "#%% md\n"
 168 |     }
 169 |    },
 170 |    "source": [
 171 |     "## Outline\n",
 172 |     "\n",
 173 |     "* Load Data\n",
 174 |     "* Types\n",
 175 |     "* Chaining\n",
 176 |     "* Mutation\n",
 177 |     "* Apply\n",
 178 |     "* Aggregation"
 179 |    ]
 180 |   },
 181 |   {
 182 |    "cell_type": "markdown",
 183 |    "id": "dc9d13b9",
 184 |    "metadata": {
 185 |     "pycharm": {
 186 |      "name": "#%% md\n"
 187 |     }
 188 |    },
 189 |    "source": [
 190 |     "## Imports"
 191 |    ]
 192 |   },
 193 |   {
 194 |    "cell_type": "code",
 195 |    "execution_count": null,
 196 |    "id": "ec04b162",
 197 |    "metadata": {
 198 |     "lines_to_next_cell": 2,
 199 |     "pycharm": {
 200 |      "name": "#%%\n"
 201 |     }
 202 |    },
 203 |    "outputs": [],
 204 |    "source": [
 205 |     "%matplotlib inline\n",
 206 |     "from IPython.display import display\n",
 207 |     "import numpy as np\n",
 208 |     "import pandas as pd\n",
 209 |     "import pyarrow\n",
 210 |     "\n",
 211 |     "import io\n",
 212 |     "import zipfile"
 213 |    ]
 214 |   },
 215 |   {
 216 |    "cell_type": "code",
 217 |    "execution_count": null,
 218 |    "id": "29ef6997",
 219 |    "metadata": {},
 220 |    "outputs": [],
 221 |    "source": [
 222 |     "pd.__version__"
 223 |    ]
 224 |   },
 225 |   {
 226 |    "cell_type": "code",
 227 |    "execution_count": null,
 228 |    "id": "ae401f97",
 229 |    "metadata": {},
 230 |    "outputs": [],
 231 |    "source": [
 232 |     "pyarrow.__version__"
 233 |    ]
 234 |   },
 235 |   {
 236 |    "cell_type": "code",
 237 |    "execution_count": null,
 238 |    "id": "1dea3558",
 239 |    "metadata": {},
 240 |    "outputs": [],
 241 |    "source": []
 242 |   },
 243 |   {
 244 |    "cell_type": "markdown",
 245 |    "id": "1d7ee1c0",
 246 |    "metadata": {},
 247 |    "source": [
 248 |     "## Data Preprocessing\n",
 249 |     "\n",
 250 |     "Don't run this code. I'm providing it here to show you where the data came from.\n",
 251 |     "(If you really want to run this, download the ZIP file and update the path.)"
 252 |    ]
 253 |   },
 254 |   {
 255 |    "cell_type": "code",
 256 |    "execution_count": null,
 257 |    "id": "3a5723a0",
 258 |    "metadata": {},
 259 |    "outputs": [],
 260 |    "source": [
 261 |     "# https://gss.norc.org/get-the-data/spss\n",
 262 |     "# https://gss.norc.org/Documents/spss/gss_spss_with_codebook.zip\n",
 263 |     "# takes a few minutes on my computer to load\n",
 264 |     "path = '/mnt/c/Users/matt/Downloads/gss_spss_with_codebook.zip'\n",
 265 |     "with zipfile.ZipFile(path) as z:\n",
 266 |     "    print(z.namelist())\n",
 267 |     "    with open('gss.sav', mode='bw') as fout:\n",
 268 |     "        fout.write(z.read('GSS7218_R3.sav'))  # ZipFile.read opens and closes the member for us\n",
 269 |     "gss = pd.read_spss('gss.sav')  # parse only after the archive and output file are closed"
 270 |    ]
 271 |   },
 272 |   {
 273 |    "cell_type": "code",
 274 |    "execution_count": null,
 275 |    "id": "f8587141",
 276 |    "metadata": {},
 277 |    "outputs": [],
 278 |    "source": [
 279 |     "%pip install pyreadstat"
 280 |    ]
 281 |   },
 282 |   {
 283 |    "cell_type": "code",
 284 |    "execution_count": null,
 285 |    "id": "f4b686fc",
 286 |    "metadata": {},
 287 |    "outputs": [],
 288 |    "source": [
 289 |     "%%time\n",
 290 |     "import pyreadstat\n",
 291 |     "gss, meta = pyreadstat.read_sav('gss.sav')"
 292 |    ]
 293 |   },
 294 |   {
 295 |    "cell_type": "code",
 296 |    "execution_count": null,
 297 |    "id": "c73cd05b",
 298 |    "metadata": {},
 299 |    "outputs": [],
 300 |    "source": [
 301 |     "gss.shape"
 302 |    ]
 303 |   },
 304 |   {
 305 |    "cell_type": "code",
 306 |    "execution_count": null,
 307 |    "id": "1e2e1777",
 308 |    "metadata": {},
 309 |    "outputs": [],
 310 |    "source": [
 311 |     "gss.to_feather('gss.fth')"
 312 |    ]
 313 |   },
 314 |   {
 315 |    "cell_type": "code",
 316 |    "execution_count": null,
 317 |    "id": "46ec0b7c",
 318 |    "metadata": {},
 319 |    "outputs": [],
 320 |    "source": [
 321 |     "%%time\n",
 322 |     "raw = pd.read_feather('~/Dropbox/work/jupyter/gss.fth')"
 323 |    ]
 324 |   },
 325 |   {
 326 |    "cell_type": "code",
 327 |    "execution_count": null,
 328 |    "id": "f029dc45",
 329 |    "metadata": {
 330 |     "lines_to_next_cell": 0,
 331 |     "pycharm": {
 332 |      "name": "#%%\n"
 333 |     }
 334 |    },
 335 |    "outputs": [],
 336 |    "source": [
 337 |     "raw"
 338 |    ]
 339 |   },
 340 |   {
 341 |    "cell_type": "code",
 342 |    "execution_count": null,
 343 |    "id": "8cad5ba6",
 344 |    "metadata": {},
 345 |    "outputs": [],
 346 |    "source": [
 347 |     "# 6000 columns!\n",
 348 |     "raw.shape"
 349 |    ]
 350 |   },
 351 |   {
 352 |    "cell_type": "code",
 353 |    "execution_count": null,
 354 |    "id": "d08680b5",
 355 |    "metadata": {
 356 |     "lines_to_next_cell": 0
 357 |    },
 358 |    "outputs": [],
 359 |    "source": [
 360 |     "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n",
 361 |     "        'INCOME06','HONEST','TICKET']\n",
 362 |     "\n",
 363 |     "raw[cols].to_feather('honest.fth')"
 364 |    ]
 365 |   },
 366 |   {
 367 |    "cell_type": "code",
 368 |    "execution_count": null,
 369 |    "id": "506f2f1d",
 370 |    "metadata": {
 371 |     "lines_to_next_cell": 2
 372 |    },
 373 |    "outputs": [],
 374 |    "source": []
 375 |   },
 376 |   {
 377 |    "cell_type": "markdown",
 378 |    "id": "5ab74806",
 379 |    "metadata": {},
 380 |    "source": [
 381 |     "## Loading Data\n",
 382 |     "\n",
 383 |     "This is the data we will be using. Run this code!"
 384 |    ]
 385 |   },
 386 |   {
 387 |    "cell_type": "code",
 388 |    "execution_count": null,
 389 |    "id": "07444860",
 390 |    "metadata": {
 391 |     "lines_to_next_cell": 2,
 392 |     "pycharm": {
 393 |      "name": "#%%\n"
 394 |     }
 395 |    },
 396 |    "outputs": [],
 397 |    "source": [
 398 |     "raw = pd.read_feather('honest.fth', dtype_backend='pyarrow')"
 399 |    ]
 400 |   },
 401 |   {
 402 |    "cell_type": "code",
 403 |    "execution_count": null,
 404 |    "id": "582255a1",
 405 |    "metadata": {
 406 |     "lines_to_next_cell": 2,
 407 |     "pycharm": {
 408 |      "name": "#%%\n"
 409 |     }
 410 |    },
 411 |    "outputs": [],
 412 |    "source": []
 413 |   },
 414 |   {
 415 |    "cell_type": "markdown",
 416 |    "id": "db1c15f3",
 417 |    "metadata": {
 418 |     "pycharm": {
 419 |      "name": "#%% md\n"
 420 |     }
 421 |    },
 422 |    "source": [
 423 |     "## My Cleanup\n",
 424 |     "See GSS_Codebook.pdf for explanation\n",
 425 |     "\n",
 426 |     "Columns:\n",
 427 |     "\n",
 428 |     "* YEAR\n",
 429 |     "* ID - RESPONDENT ID NUMBER\n",
 430 |     "* AGE - AGE OF RESPONDENT\n",
 431 |     "* HRS1 - NUMBER OF HOURS WORKED LAST WEEK\n",
 432 |     "* OCC - R'S CENSUS OCCUPATION CODE (1970) - Page 126 (VAR: OCC) see page 125 for notes APPENDIX F,G,H\n",
 433 |     "   Appendix F - Page 3286\n",
 434 |     "* MAJOR1 - COLLEGE MAJOR 1\n",
 435 |     "* SEX - RESPONDENTS SEX\n",
 436 |     "* RACE - RACE OF RESPONDENT\n",
 437 |     "* BORN -  WAS R BORN IN THIS COUNTRY\n",
 438 |     "* INCOME - TOTAL FAMILY INCOME 1970\n",
 439 |     "* INCOME06 - TOTAL FAMILY INCOME 2006\n",
 440 |     "* HONEST - HONEST\n",
 441 |     "* TICKET - EVER RECEIVED A TRAFFIC TICKET\n"
 442 |    ]
 443 |   },
 444 |   {
 445 |    "cell_type": "code",
 446 |    "execution_count": null,
 447 |    "id": "65089c43",
 448 |    "metadata": {},
 449 |    "outputs": [],
 450 |    "source": [
 451 |     "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n",
 452 |     "        'INCOME06','HONEST','TICKET']\n",
 453 |     "\n",
 454 |     "raw[cols].isna().mean()*100"
 455 |    ]
 456 |   },
 457 |   {
 458 |    "cell_type": "code",
 459 |    "execution_count": null,
 460 |    "id": "67f1d8f4",
 461 |    "metadata": {},
 462 |    "outputs": [],
 463 |    "source": [
 464 |     "(raw\n",
 465 |     " [cols]\n",
 466 |     " .isna()\n",
 467 |     " .mean()*100\n",
 468 |     ")"
 469 |    ]
 470 |   },
 471 |   {
 472 |    "cell_type": "code",
 473 |    "execution_count": null,
 474 |    "id": "df146d91",
 475 |    "metadata": {},
 476 |    "outputs": [],
 477 |    "source": [
 478 |     "MAJOR= '''RESPONSE PUNCH 1972-82 1982B 1983-87 1987B 1988-91 1993-98 2000-04 2006 2008 2010 2012 2014 2016 2018 ALL\n",
 479 |     "Accounting/bookkeeping 1 0 0 0 0 0 0 0 0 0 0 28 32 30 29 119\n",
 480 |     "Advertising 2 0 0 0 0 0 0 0 0 0 0 3 2 0 0 5\n",
 481 |     "Agriculture/horticulture 3 0 0 0 0 0 0 0 0 0 0 8 2 7 5 22\n",
 482 |     "Allied health 4 0 0 0 0 0 0 0 0 0 0 0 2 1 0 3\n",
 483 |     "Anthropology 5 0 0 0 0 0 0 0 0 0 0 3 5 1 1 10\n",
 484 |     "Architecture 6 0 0 0 0 0 0 0 0 0 0 2 3 5 3 13\n",
 485 |     "Art 7 0 0 0 0 0 0 0 0 0 0 6 7 11 10 34\n",
 486 |     "Biology 8 0 0 0 0 0 0 0 0 0 0 16 22 33 26 97\n",
 487 |     "Business administration 9 0 0 0 0 0 0 0 0 0 0 90 142 172 138 542\n",
 488 |     "Chemistry 11 0 0 0 0 0 0 0 0 0 0 5 8 10 4 27\n",
 489 |     "Communications/speech 12 0 0 0 0 0 0 0 0 0 0 20 18 26 18 82\n",
 490 |     "Comm. disorders 13 0 0 0 0 0 0 0 0 0 0 4 6 2 2 14\n",
 491 |     "Computer science 14 0 0 0 0 0 0 0 0 0 0 25 24 33 17 99\n",
 492 |     "Dentistry 15 0 0 0 0 0 0 0 0 0 0 2 4 3 5 14\n",
 493 |     "Education 16 0 0 0 0 0 0 0 0 0 0 73 91 97 79 340\n",
 494 |     "Economics 17 0 0 0 0 0 0 0 0 0 0 11 19 13 19 62\n",
 495 |     "Engineering 18 0 0 0 0 0 0 0 0 0 0 47 49 47 54 197\n",
 496 |     "English 19 0 0 0 0 0 0 0 0 0 0 23 26 27 24 100\n",
 497 |     "Finance 20 0 0 0 0 0 0 0 0 0 0 7 15 14 16 52\n",
 498 |     "Foreign language 21 0 0 0 0 0 0 0 0 0 0 4 8 6 5 23\n",
 499 |     "Forestry 22 0 0 0 0 0 0 0 0 0 0 1 0 3 0 4\n",
 500 |     "Geography 23 0 0 0 0 0 0 0 0 0 0 0 2 2 4 8\n",
 501 |     "Geology 24 0 0 0 0 0 0 0 0 0 0 1 3 4 2 10\n",
 502 |     "History 25 0 0 0 0 0 0 0 0 0 0 10 19 14 19 62\n",
 503 |     "Home economics 26 0 0 0 0 0 0 0 0 0 0 0 0 3 2 5\n",
 504 |     "Industry & techn 27 0 0 0 0 0 0 0 0 0 0 3 4 6 0 13\n",
 505 |     "Journalism 28 0 0 0 0 0 0 0 0 0 0 5 6 6 4 21\n",
 506 |     "Law 29 0 0 0 0 0 0 0 0 0 0 13 18 23 14 68\n",
 507 |     "Law enforcement 30 0 0 0 0 0 0 0 0 0 0 3 5 4 2 14\n",
 508 |     "Library science 31 0 0 0 0 0 0 0 0 0 0 4 5 2 3 14\n",
 509 |     "Marketing 32 0 0 0 0 0 0 0 0 0 0 11 15 13 12 51\n",
 510 |     "Mathematics 33 0 0 0 0 0 0 0 0 0 0 5 10 12 5 32\n",
 511 |     "Medicine 34 0 0 0 0 0 0 0 0 0 0 9 25 12 11 57\n",
 512 |     "Music 35 0 0 0 0 0 0 0 0 0 0 4 2 10 2 18\n",
 513 |     "Nursing 36 0 0 0 0 0 0 0 0 0 0 36 39 60 51 186\n",
 514 |     "Optometry 37 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
 515 |     "Pharmacy 38 0 0 0 0 0 0 0 0 0 0 2 5 4 4 15\n",
 516 |     "Philosophy 39 0 0 0 0 0 0 0 0 0 0 2 0 2 2 6\n",
 517 |     "Physical education 40 0 0 0 0 0 0 0 0 0 0 9 6 16 6 37\n",
 518 |     "Physics 41 0 0 0 0 0 0 0 0 0 0 3 6 7 4 20\n",
 519 |     "Psychology 42 0 0 0 0 0 0 0 0 0 0 32 32 34 29 127\n",
 520 |     "Political science/international relations 43 0 0 0 0 0 0 0 0 0 0 16 22 19 14 71\n",
 521 |     "Sociology 44 0 0 0 0 0 0 0 0 0 0 9 15 10 12 46\n",
 522 |     "Special education 45 0 0 0 0 0 0 0 0 0 0 5 3 5 2 15\n",
 523 |     "Theater arts 46 0 0 0 0 0 0 0 0 0 0 6 2 3 1 12\n",
 524 |     "Theology 47 0 0 0 0 0 0 0 0 0 0 6 6 13 8 33\n",
 525 |     "Veterinary medicine 48 0 0 0 0 0 0 0 0 0 0 1 5 3 4 13\n",
 526 |     "Liberal arts 49 0 0 0 0 0 0 0 0 0 0 8 16 12 10 46\n",
 527 |     "Other 50 0 0 0 0 0 0 0 0 0 0 8 10 21 27 66\n",
 528 |     "General sciences 51 0 0 0 0 0 0 0 0 0 0 10 13 15 14 52\n",
 529 |     "Social work 52 0 0 0 0 0 0 0 0 0 0 7 17 24 7 55\n",
 530 |     "General studies 53 0 0 0 0 0 0 0 0 0 0 2 5 7 7 21\n",
 531 |     "Other vocational 54 0 0 0 0 0 0 0 0 0 0 5 11 6 5 27\n",
 532 |     "Health 55 0 0 0 0 0 0 0 0 0 0 23 31 31 42 127\n",
 533 |     "Industrial Relations 56 0 0 0 0 0 0 0 0 0 0 1 0 0 3 4\n",
 534 |     "Child/Human/Family Development 57 0 0 0 0 0 0 0 0 0 0 11 3 7 7 28\n",
 535 |     "Food Science/Nutrition/Culinary Arts 58 0 0 0 0 0 0 0 0 0 0 3 6 9 9 27\n",
 536 |     "Environmental Science/Ecology 59 0 0 0 0 0 0 0 0 0 0 5 5 6 8 24\n",
 537 |     "Social Sciences 60 0 0 0 0 0 0 0 0 0 0 4 2 7 5 18\n",
 538 |     "Human Services/Human Resources 61 0 0 0 0 0 0 0 0 0 0 3 7 7 5 22\n",
 539 |     "Visual Arts/Graphic Design/Design and Drafting 62 0 0 0 0 0 0 0 0 0 0 3 8 9 10 30\n",
 540 |     "Fine Arts 63 0 0 0 0 0 0 0 0 0 0 4 5 5 6 20\n",
 541 |     "Humanities 64 0 0 0 0 0 0 0 0 0 0 0 2 0 1 3\n",
 542 |     "Ethnic studies 65 0 0 0 0 0 0 0 0 0 0 3 1 0 0 4\n",
 543 |     "Educational administration 66 0 0 0 0 0 0 0 0 0 0 3 4 8 9 24\n",
 544 |     "Television/Film 67 0 0 0 0 0 0 0 0 0 0 0 2 6 1 9\n",
 545 |     "Aviation/Aeronatics 68 0 0 0 0 0 0 0 0 0 0 2 1 1 3 7\n",
 546 |     "Statistics/Biostatistics 69 0 0 0 0 0 0 0 0 0 0 0 0 2 2 4\n",
 547 |     "Criminology/Criminal Justice 70 0 0 0 0 0 0 0 0 0 0 13 17 17 13 60\n",
 548 |     "Administrative Science/Public Administration 71 0 0 0 0 0 0 0 0 0 0 2 11 3 5 21\n",
 549 |     "Electronics 72 0 0 0 0 0 0 0 0 0 0 6 6 5 9 26\n",
 550 |     "Urban and Regional Planning 73 0 0 0 0 0 0 0 0 0 0 1 1 3 2 7\n",
 551 |     "Mechanics/Machine Trade 74 0 0 0 0 0 0 0 0 0 0 0 1 1 4 6\n",
 552 |     "Dance 75 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n",
 553 |     "Gerontology 76 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n",
 554 |     "Public Relations 77 0 0 0 0 0 0 0 0 0 0 3 1 2 1 7\n",
 555 |     "Textiles/Cloth 78 0 0 0 0 0 0 0 0 0 0 3 4 0 0 7\n",
 556 |     "Parks and Recreation 79 0 0 0 0 0 0 0 0 0 0 1 2 1 0 4\n",
 557 |     "Information Technology 80 0 0 0 0 0 0 0 0 0 0 0 5 8 11 24\n",
 558 |     "Fashion 81 0 0 0 0 0 0 0 0 0 0 0 0 3 1 4\n",
 559 |     "Counseling 82 0 0 0 0 0 0 0 0 0 0 0 0 11 9 20\n",
 560 |     "Don't know/UNCODED 98 0 0 0 0 0 0 0 0 0 0 2 3 0 0 5\n",
 561 |     "No answer 99 0 0 0 0 0 0 0 0 0 0 0 1 5 3 9\n",
 562 |     "Not applicable 0 13626 354 7542 353 5907 10334 8394 4510 2023 2044 1263 1597 1795 1435 61177'''\n",
 563 |     "\n",
 564 |     "# copy paste slight tweak from page 186\n",
 565 |     "major_dict = {int(row.split()[-16]): ' '.join(row.split()[:-16])  for row in MAJOR.split('\\n')[1:]}\n",
 566 |     "major_dict"
 567 |    ]
 568 |   },
 569 |   {
 570 |    "cell_type": "code",
 571 |    "execution_count": null,
 572 |    "id": "b9d6c34d",
 573 |    "metadata": {},
 574 |    "outputs": [],
 575 |    "source": [
 576 |     "raw.MAJOR1.value_counts()"
 577 |    ]
 578 |   },
 579 |   {
 580 |    "cell_type": "code",
 581 |    "execution_count": null,
 582 |    "id": "74652b6d",
 583 |    "metadata": {},
 584 |    "outputs": [],
 585 |    "source": [
 586 |     "(raw\n",
 587 |     " [cols]\n",
 588 |     " .assign(\n",
 589 |     "     MAJOR1=raw.MAJOR1.fillna(99).astype('int').replace(major_dict),\n",
 590 |     "     SEX=raw.SEX#\n",
 591 |     "           \n",
 592 |     "           .astype(int)\n",
 593 |     "           .replace({1:'Male', 2:'Female'}),\n",
 594 |     "     RACE=raw.RACE.astype(int).replace({1:'White', 2:'Black', 3:'Other'}),\n",
 595 |     "     OCC=raw.OCC.fillna(9999).astype(int),\n",
 596 |     "     BORN=raw.BORN.fillna(4).astype(int).replace({1:'Yes', 2:'No', 3:'Don\\'t Know',\n",
 597 |     "                                                    4:'No answer', 5:'Not applicable'}),\n",
 598 |     "     INCOME=raw.INCOME.fillna(99).astype(int).replace({99:'No answer', **dict(enumerate(['Not applicable',\n",
 599 |     "                                                                                  0,1000,3000,4000,5000,6000,\n",
 600 |     "                                                                                  7000,8000,10000,15000,20000,25000,]))}),\n",
 601 |     "     INCOME06=raw.INCOME06.fillna(26).astype(int).replace({26:'Refused', **dict(enumerate(['Not applicable',\n",
 602 |     "                                                                                  0,1000,3000,4000,5000,6000,\n",
 603 |     "                                                                                  7000,8000,10000,12500,15000,\n",
 604 |     "                                                                                  17500,20000,22500,25000,30_000,\n",
 605 |     "                                                                                  35_000, 40_000, 50_000, 60_000,\n",
 606 |     "                                                                                 75_000, 90_000, 110_000, 130_000,\n",
 607 |     "                                                                                 150_000]))}),\n",
 608 |     "     HONEST=raw.HONEST.fillna(9).astype(int).replace({1:'Most desirable', 2:'3 most desireable',\n",
 609 |     "                                                                   3:'Not mentioned', 4:  '3 least desireable',\n",
 610 |     "                                                                   5: 'One least desireable',\n",
 611 |     "                                                                    9:'No answer'}),\n",
 612 |     "     TICKET=raw.TICKET.fillna(9).astype(int).replace({1:'Yes', 2:'No', 3:'Refused', 9: 'No answer'}),\n",
 613 |     "     )\n",
 614 |     " .astype({'YEAR':int, 'ID': 'uint16[pyarrow]'})\n",
 615 |     " .to_csv('GSS.csv')\n",
 616 |     ")"
 617 |    ]
 618 |   },
 619 |   {
 620 |    "cell_type": "code",
 621 |    "execution_count": null,
 622 |    "id": "a14afd45",
 623 |    "metadata": {},
 624 |    "outputs": [],
 625 |    "source": []
 626 |   },
 627 |   {
 628 |    "cell_type": "code",
 629 |    "execution_count": null,
 630 |    "id": "ce8f0020",
 631 |    "metadata": {},
 632 |    "outputs": [],
 633 |    "source": []
 634 |   },
 635 |   {
 636 |    "cell_type": "markdown",
 637 |    "id": "043a0085",
 638 |    "metadata": {},
 639 |    "source": [
 640 |     "## Types\n",
 641 |     "Getting the right types will enable analysis and correctness.\n"
 642 |    ]
 643 |   },
 644 |   {
 645 |    "cell_type": "code",
 646 |    "execution_count": null,
 647 |    "id": "5d6c6cd5",
 648 |    "metadata": {},
 649 |    "outputs": [],
 650 |    "source": [
 651 |     "%%time\n",
 652 |     "gss = pd.read_csv('GSS.csv', index_col=0, dtype_backend='pyarrow', engine='pyarrow')"
 653 |    ]
 654 |   },
 655 |   {
 656 |    "cell_type": "code",
 657 |    "execution_count": null,
 658 |    "id": "d26b8f6c",
 659 |    "metadata": {
 660 |     "pycharm": {
 661 |      "name": "#%%\n"
 662 |     }
 663 |    },
 664 |    "outputs": [],
 665 |    "source": [
 666 |     "gss.dtypes"
 667 |    ]
 668 |   },
 669 |   {
 670 |    "cell_type": "code",
 671 |    "execution_count": null,
 672 |    "id": "19890585",
 673 |    "metadata": {},
 674 |    "outputs": [],
 675 |    "source": [
 676 |     "gss"
 677 |    ]
 678 |   },
 679 |   {
 680 |    "cell_type": "code",
 681 |    "execution_count": null,
 682 |    "id": "852115fe",
 683 |    "metadata": {
 684 |     "pycharm": {
 685 |      "name": "#%%\n"
 686 |     }
 687 |    },
 688 |    "outputs": [],
 689 |    "source": [
 690 |     "gss.memory_usage(deep=True)"
 691 |    ]
 692 |   },
 693 |   {
 694 |    "cell_type": "code",
 695 |    "execution_count": null,
 696 |    "id": "b5cfc13b",
 697 |    "metadata": {
 698 |     "pycharm": {
 699 |      "name": "#%%\n"
 700 |     }
 701 |    },
 702 |    "outputs": [],
 703 |    "source": [
 704 |     "# 36 M (pandas 1)\n",
 705 |     "# 8.6 M (Pandas 2)\n",
 706 |     "gss.memory_usage(deep=True).sum()"
 707 |    ]
 708 |   },
 709 |   {
 710 |    "cell_type": "code",
 711 |    "execution_count": null,
 712 |    "id": "f1d1b51d",
 713 |    "metadata": {
 714 |     "lines_to_next_cell": 2,
 715 |     "pycharm": {
 716 |      "name": "#%%\n"
 717 |     }
 718 |    },
 719 |    "outputs": [],
 720 |    "source": []
 721 |   },
 722 |   {
 723 |    "cell_type": "markdown",
 724 |    "id": "5fcab8c0",
 725 |    "metadata": {
 726 |     "pycharm": {
 727 |      "name": "#%% md\n"
 728 |     }
 729 |    },
 730 |    "source": [
 731 |     "## Ints"
 732 |    ]
 733 |   },
 734 |   {
 735 |    "cell_type": "code",
 736 |    "execution_count": null,
 737 |    "id": "ad4eddc7",
 738 |    "metadata": {
 739 |     "pycharm": {
 740 |      "name": "#%%\n"
 741 |     }
 742 |    },
 743 |    "outputs": [],
 744 |    "source": [
 745 |     "gss.select_dtypes(int).describe()"
 746 |    ]
 747 |   },
 748 |   {
 749 |    "cell_type": "code",
 750 |    "execution_count": null,
 751 |    "id": "ac323e5e",
 752 |    "metadata": {
 753 |     "pycharm": {
 754 |      "name": "#%%\n"
 755 |     }
 756 |    },
 757 |    "outputs": [],
 758 |    "source": [
 759 |     "# chaining\n",
 760 |     "(gss\n",
 761 |     " .select_dtypes(int)\n",
 762 |     " .describe()\n",
 763 |     ")"
 764 |    ]
 765 |   },
 766 |   {
 767 |    "cell_type": "code",
 768 |    "execution_count": null,
 769 |    "id": "2351d051",
 770 |    "metadata": {
 771 |     "pycharm": {
 772 |      "name": "#%%\n"
 773 |     }
 774 |    },
 775 |    "outputs": [],
 776 |    "source": [
 777 |     "# can YEAR be an int8?\n",
 778 |     "# Do completion on int (np.int itself was removed in NumPy 1.24; pick a sized type)\n",
 779 |     "np.iinfo(np.int64)"
 780 |    ]
 781 |   },
 782 |   {
 783 |    "cell_type": "code",
 784 |    "execution_count": null,
 785 |    "id": "323df8fb",
 786 |    "metadata": {
 787 |     "pycharm": {
 788 |      "name": "#%%\n"
 789 |     }
 790 |    },
 791 |    "outputs": [],
 792 |    "source": [
 793 |     "np.iinfo(np.uint8)"
 794 |    ]
 795 |   },
 796 |   {
 797 |    "cell_type": "code",
 798 |    "execution_count": null,
 799 |    "id": "bb063be4",
 800 |    "metadata": {
 801 |     "pycharm": {
 802 |      "name": "#%%\n"
 803 |     }
 804 |    },
 805 |    "outputs": [],
 806 |    "source": [
 807 |     "np.iinfo(np.uint16)"
 808 |    ]
 809 |   },
 810 |   {
 811 |    "cell_type": "code",
 812 |    "execution_count": null,
 813 |    "id": "d0fab927",
 814 |    "metadata": {
 815 |     "pycharm": {
 816 |      "name": "#%%\n"
 817 |     }
 818 |    },
 819 |    "outputs": [],
 820 |    "source": [
 821 |     "# chaining\n",
 822 |     "(gss\n",
 823 |     " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' })\n",
 824 |     " .select_dtypes(['uint16'])\n",
 825 |     " .describe()\n",
 826 |     ")"
 827 |    ]
 828 |   },
 829 |   {
 830 |    "cell_type": "code",
 831 |    "execution_count": null,
 832 |    "id": "a1d0ed15",
 833 |    "metadata": {
 834 |     "lines_to_next_cell": 2,
 835 |     "pycharm": {
 836 |      "name": "#%%\n"
 837 |     }
 838 |    },
 839 |    "outputs": [],
 840 |    "source": [
 841 |     "# chaining\n",
 842 |     "# use 'integer' to see all int-like columns\n",
 843 |     "(gss\n",
 844 |     " .astype({#'YEAR': 'uint16[pyarrow]',\n",
 845 |     "          'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n",
 846 |     " .select_dtypes(['integer'])  # see https://numpy.org/doc/stable/reference/arrays.scalars.html\n",
 847 |     " .describe()\n",
 848 |     ")"
 849 |    ]
 850 |   },
 851 |   {
 852 |    "cell_type": "code",
 853 |    "execution_count": null,
 854 |    "id": "c5d4c3e1",
 855 |    "metadata": {
 856 |     "lines_to_next_cell": 2,
 857 |     "pycharm": {
 858 |      "name": "#%%\n"
 859 |     }
 860 |    },
 861 |    "outputs": [],
 862 |    "source": [
 863 |     "# Inspect memory usage\n",
 864 |     "(gss\n",
 865 |     " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n",
 866 |     " .memory_usage(deep=True)\n",
 867 |     " .sum()  # was 36M\n",
 868 |     ")"
 869 |    ]
 870 |   },
 871 |   {
 872 |    "cell_type": "code",
 873 |    "execution_count": null,
 874 |    "id": "8ad6e733",
 875 |    "metadata": {
 876 |     "lines_to_next_cell": 2,
 877 |     "pycharm": {
 878 |      "name": "#%%\n"
 879 |     }
 880 |    },
 881 |    "outputs": [],
 882 |    "source": []
 883 |   },
 884 |   {
 885 |    "cell_type": "markdown",
 886 |    "id": "f339194e",
 887 |    "metadata": {},
 888 |    "source": [
 889 |     "## Int Exercise\n",
 890 |     "* Try converting *YEAR* to `'int8'`. What do the values look like?\n",
 891 |     "* Try converting *YEAR* to `'int8[pyarrow]'`. What do the values look like?"
 892 |    ]
 893 |   },
 894 |   {
 895 |    "cell_type": "code",
 896 |    "execution_count": null,
 897 |    "id": "908545d1",
 898 |    "metadata": {},
 899 |    "outputs": [],
 900 |    "source": []
 901 |   },
 902 |   {
 903 |    "cell_type": "code",
 904 |    "execution_count": null,
 905 |    "id": "18a3bf52",
 906 |    "metadata": {},
 907 |    "outputs": [],
 908 |    "source": []
 909 |   },
 910 |   {
 911 |    "cell_type": "markdown",
 912 |    "id": "b09f89c6",
 913 |    "metadata": {},
 914 |    "source": [
 915 |     "## Floats"
 916 |    ]
 917 |   },
 918 |   {
 919 |    "cell_type": "code",
 920 |    "execution_count": null,
 921 |    "id": "e7fed87e",
 922 |    "metadata": {
 923 |     "pycharm": {
 924 |      "name": "#%%\n"
 925 |     }
 926 |    },
 927 |    "outputs": [],
 928 |    "source": [
 929 |     "(gss\n",
 930 |     ".select_dtypes('float'))"
 931 |    ]
 932 |   },
 933 |   {
 934 |    "cell_type": "code",
 935 |    "execution_count": null,
 936 |    "id": "49265726",
 937 |    "metadata": {
 938 |     "pycharm": {
 939 |      "name": "#%%\n"
 940 |     }
 941 |    },
 942 |    "outputs": [],
 943 |    "source": [
 944 |     "# surprise! age and hours worked look int-like\n",
 945 |     "gss.HRS1.describe()"
 946 |    ]
 947 |   },
 948 |   {
 949 |    "cell_type": "code",
 950 |    "execution_count": null,
 951 |    "id": "cd39df3c",
 952 |    "metadata": {
 953 |     "pycharm": {
 954 |      "name": "#%%\n"
 955 |     }
 956 |    },
 957 |    "outputs": [],
 958 |    "source": [
 959 |     "# oops! missing values\n",
 960 |     "gss.HRS1.value_counts(dropna=False)"
 961 |    ]
 962 |   },
 963 |   {
 964 |    "cell_type": "code",
 965 |    "execution_count": null,
 966 |    "id": "31a67da2",
 967 |    "metadata": {
 968 |     "pycharm": {
 969 |      "name": "#%%\n"
 970 |     }
 971 |    },
 972 |    "outputs": [],
 973 |    "source": [
 974 |     "# where are they missing?\n",
 975 |     "(gss\n",
 976 |     "  .query('HRS1.isna()')\n",
 977 |     ")"
 978 |    ]
 979 |   },
 980 |   {
 981 |    "cell_type": "code",
 982 |    "execution_count": null,
 983 |    "id": "e697d070",
 984 |    "metadata": {
 985 |     "pycharm": {
 986 |      "name": "#%%\n"
 987 |     }
 988 |    },
 989 |    "outputs": [],
 990 |    "source": [
 991 |     "# where are they missing?\n",
 992 |     "(gss\n",
 993 |     "  .query('AGE.isna()')\n",
 994 |     ")"
 995 |    ]
 996 |   },
 997 |   {
 998 |    "cell_type": "code",
 999 |    "execution_count": null,
1000 |    "id": "a9166e1a",
1001 |    "metadata": {
1002 |     "pycharm": {
1003 |      "name": "#%%\n"
1004 |     }
1005 |    },
1006 |    "outputs": [],
1007 |    "source": [
1008 |     "# where are they missing?\n",
1009 |     "# It turns out that ID is not consistent across years\n",
1010 |     "(gss\n",
1011 |     "  .query('ID == 229')\n",
1012 |     ")"
1013 |    ]
1014 |   },
1015 |   {
1016 |    "cell_type": "code",
1017 |    "execution_count": null,
1018 |    "id": "81a8c902",
1019 |    "metadata": {
1020 |     "lines_to_next_cell": 2,
1021 |     "pycharm": {
1022 |      "name": "#%%\n"
1023 |     }
1024 |    },
1025 |    "outputs": [],
1026 |    "source": [
1027 |     "# Convert to integers\n",
1028 |     "(gss\n",
1029 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1030 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n",
1031 |     ")"
1032 |    ]
1033 |   },
1034 |   {
1035 |    "cell_type": "code",
1036 |    "execution_count": null,
1037 |    "id": "d3388e2c",
1038 |    "metadata": {
1039 |     "lines_to_next_cell": 2,
1040 |     "pycharm": {
1041 |      "name": "#%%\n"
1042 |     }
1043 |    },
1044 |    "outputs": [],
1045 |    "source": [
1046 |     "(gss\n",
1047 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1048 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n",
1049 |     " .memory_usage(deep=True)\n",
1050 |     " .sum()  # was 36M  \n",
1051 |     ")"
1052 |    ]
1053 |   },
1054 |   {
1055 |    "cell_type": "code",
1056 |    "execution_count": null,
1057 |    "id": "9bb70ac2",
1058 |    "metadata": {
1059 |     "lines_to_next_cell": 2,
1060 |     "pycharm": {
1061 |      "name": "#%%\n"
1062 |     }
1063 |    },
1064 |    "outputs": [],
1065 |    "source": []
1066 |   },
1067 |   {
1068 |    "cell_type": "markdown",
1069 |    "id": "75bfd716",
1070 |    "metadata": {},
1071 |    "source": [
1072 |     "## Float Exercise\n",
1073 |     "\n",
1074 |     "* What is the mean of the numeric columns?\n",
1075 |     "* How many values are missing in the numeric columns?"
1076 |    ]
1077 |   },
1078 |   {
1079 |    "cell_type": "code",
1080 |    "execution_count": null,
1081 |    "id": "e3e30c4e",
1082 |    "metadata": {
1083 |     "lines_to_next_cell": 2
1084 |    },
1085 |    "outputs": [],
1086 |    "source": []
1087 |   },
1088 |   {
1089 |    "cell_type": "markdown",
1090 |    "id": "a136fe09",
1091 |    "metadata": {},
1092 |    "source": [
1093 |     "## Objects"
1094 |    ]
1095 |   },
1096 |   {
1097 |    "cell_type": "code",
1098 |    "execution_count": null,
1099 |    "id": "4f8b0477",
1100 |    "metadata": {
1101 |     "pycharm": {
1102 |      "name": "#%%\n"
1103 |     }
1104 |    },
1105 |    "outputs": [],
1106 |    "source": [
1107 |     "# pandas 1.x\n",
1108 |     "(gss\n",
1109 |     " .select_dtypes(object)\n",
1110 |     ")"
1111 |    ]
1112 |   },
1113 |   {
1114 |    "cell_type": "code",
1115 |    "execution_count": null,
1116 |    "id": "f80da8d2",
1117 |    "metadata": {
1118 |     "pycharm": {
1119 |      "name": "#%%\n"
1120 |     }
1121 |    },
1122 |    "outputs": [],
1123 |    "source": [
1124 |     "# pandas 2\n",
1125 |     "(gss\n",
1126 |     " .select_dtypes('string') # str doesn't work\n",
1127 |     ")"
1128 |    ]
1129 |   },
1130 |   {
1131 |    "cell_type": "code",
1132 |    "execution_count": null,
1133 |    "id": "7f762143",
1134 |    "metadata": {
1135 |     "pycharm": {
1136 |      "name": "#%%\n"
1137 |     }
1138 |    },
1139 |    "outputs": [],
1140 |    "source": [
1141 |     "# My go-to method - .value_counts\n",
1142 |     "# looks categorical\n",
1143 |     "(gss.MAJOR1.value_counts(dropna=False))"
1144 |    ]
1145 |   },
1146 |   {
1147 |    "cell_type": "code",
1148 |    "execution_count": null,
1149 |    "id": "55c21c7a",
1150 |    "metadata": {
1151 |     "lines_to_next_cell": 2,
1152 |     "pycharm": {
1153 |      "name": "#%%\n"
1154 |     }
1155 |    },
1156 |    "outputs": [],
1157 |    "source": [
1158 |     "(gss\n",
1159 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1160 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1161 |     "         'MAJOR1': 'category'})\n",
1162 |     " .memory_usage(deep=True)\n",
1163 |     " .sum()  # was 36M  \n",
1164 |     ")"
1165 |    ]
1166 |   },
1167 |   {
1168 |    "cell_type": "code",
1169 |    "execution_count": null,
1170 |    "id": "69969c1b",
1171 |    "metadata": {},
1172 |    "outputs": [],
1173 |    "source": [
1174 |     "(gss\n",
1175 |     " .select_dtypes(object)\n",
1176 |     " .columns\n",
1177 |     ")"
1178 |    ]
1179 |   },
1180 |   {
1181 |    "cell_type": "code",
1182 |    "execution_count": null,
1183 |    "id": "f5d51601",
1184 |    "metadata": {
1185 |     "lines_to_next_cell": 0,
1186 |     "pycharm": {
1187 |      "name": "#%%\n"
1188 |     }
1189 |    },
1190 |    "outputs": [],
1191 |    "source": [
1192 |     "# wow!\n",
1193 |     "(gss\n",
1194 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1195 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1196 |     "         'MAJOR1': 'category',\n",
1197 |     "          **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1198 |     "                'INCOME', 'INCOME06', 'HONEST','TICKET']}})           \n",
1199 |     " .memory_usage(deep=True)\n",
1200 |     " .sum()  # was 36M  \n",
1201 |     ")"
1202 |    ]
1203 |   },
1204 |   {
1205 |    "cell_type": "code",
1206 |    "execution_count": null,
1207 |    "id": "17206364",
1208 |    "metadata": {},
1209 |    "outputs": [],
1210 |    "source": []
1211 |   },
1212 |   {
1213 |    "cell_type": "code",
1214 |    "execution_count": null,
1215 |    "id": "50bf3fa6",
1216 |    "metadata": {
1217 |     "lines_to_next_cell": 2
1218 |    },
1219 |    "outputs": [],
1220 |    "source": []
1221 |   },
1222 |   {
1223 |    "cell_type": "markdown",
1224 |    "id": "246041ae",
1225 |    "metadata": {},
1226 |    "source": [
1227 |     "## Category Exercises\n",
1228 |     "* There is a `.cat` attribute on the category columns. What can you do with this attribute? (Use `dir` or tab completion to inspect).\n",
1229 |     "* Categories can be ordered. How do you order *INCOME*?\n",
1230 |     "* Order the *HONEST* column."
1231 |    ]
1232 |   },
1233 |   {
1234 |    "cell_type": "code",
1235 |    "execution_count": null,
1236 |    "id": "f543c52c",
1237 |    "metadata": {},
1238 |    "outputs": [],
1239 |    "source": []
1240 |   },
1241 |   {
1242 |    "cell_type": "code",
1243 |    "execution_count": null,
1244 |    "id": "338e5ba3",
1245 |    "metadata": {},
1246 |    "outputs": [],
1247 |    "source": []
1248 |   },
1249 |   {
1250 |    "cell_type": "code",
1251 |    "execution_count": null,
1252 |    "id": "3fb313d6",
1253 |    "metadata": {},
1254 |    "outputs": [],
1255 |    "source": []
1256 |   },
1257 |   {
1258 |    "cell_type": "code",
1259 |    "execution_count": null,
1260 |    "id": "f1d75a84",
1261 |    "metadata": {},
1262 |    "outputs": [],
1263 |    "source": []
1264 |   },
1265 |   {
1266 |    "cell_type": "code",
1267 |    "execution_count": null,
1268 |    "id": "5c87e18d",
1269 |    "metadata": {},
1270 |    "outputs": [],
1271 |    "source": []
1272 |   },
1273 |   {
1274 |    "cell_type": "code",
1275 |    "execution_count": null,
1276 |    "id": "85aaccbb",
1277 |    "metadata": {},
1278 |    "outputs": [],
1279 |    "source": []
1280 |   },
1281 |   {
1282 |    "cell_type": "code",
1283 |    "execution_count": null,
1284 |    "id": "8a321513",
1285 |    "metadata": {},
1286 |    "outputs": [],
1287 |    "source": []
1288 |   },
1289 |   {
1290 |    "cell_type": "markdown",
1291 |    "id": "8af7a3d4",
1292 |    "metadata": {},
1293 |    "source": [
1294 |     "## Make a Function"
1295 |    ]
1296 |   },
1297 |   {
1298 |    "cell_type": "code",
1299 |    "execution_count": null,
1300 |    "id": "cb9a32b3",
1301 |    "metadata": {
1302 |     "lines_to_next_cell": 2,
1303 |     "pycharm": {
1304 |      "name": "#%%\n"
1305 |     }
1306 |    },
1307 |    "outputs": [],
1308 |    "source": [
1309 |     "# a glorious function\n",
1310 |     "# add ordered categories to this\n",
1311 |     "def tweak_gss(gss):\n",
1312 |     "    return (gss\n",
1313 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1314 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1315 |     "             'MAJOR1': 'category',\n",
1316 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1317 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1318 |     "               )\n",
1319 |     "\n",
1320 |     "tweak_gss(gss)"
1321 |    ]
1322 |   },
1323 |   {
1324 |    "cell_type": "markdown",
1325 |    "id": "1c615739",
1326 |    "metadata": {},
1327 |    "source": [
1328 |     "## Function Exercise\n",
1329 |     "* Rearrange your notebook. Put the imports, code to load raw data, and tweak function at the top of the notebook. Restart the kernel and validate that your code works."
1330 |    ]
1331 |   },
1332 |   {
1333 |    "cell_type": "code",
1334 |    "execution_count": null,
1335 |    "id": "c61b9f0a",
1336 |    "metadata": {},
1337 |    "outputs": [],
1338 |    "source": []
1339 |   },
1340 |   {
1341 |    "cell_type": "code",
1342 |    "execution_count": null,
1343 |    "id": "6589902c",
1344 |    "metadata": {},
1345 |    "outputs": [],
1346 |    "source": []
1347 |   },
1348 |   {
1349 |    "cell_type": "markdown",
1350 |    "id": "b350e12e",
1351 |    "metadata": {
1352 |     "lines_to_next_cell": 2
1353 |    },
1354 |    "source": [
1355 |     "## Fix Column Names"
1356 |    ]
1357 |   },
1358 |   {
1359 |    "cell_type": "code",
1360 |    "execution_count": null,
1361 |    "id": "99b39238",
1362 |    "metadata": {
1363 |     "lines_to_next_cell": 0,
1364 |     "pycharm": {
1365 |      "name": "#%%\n"
1366 |     }
1367 |    },
1368 |    "outputs": [],
1369 |    "source": [
1370 |     "# a glorious function\n",
1371 |     "def tweak_gss(gss):\n",
1372 |     "    return (gss\n",
1373 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1374 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1375 |     "             'MAJOR1': 'category',\n",
1376 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1377 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1378 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1379 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1380 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1381 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1382 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1383 |     "          'HONEST':'honesty_rank',\n",
1384 |     "          'TICKET':'traffic_ticket'})\n",
1385 |     "    )\n",
1386 |     "\n",
1387 |     "tweak_gss(gss)"
1388 |    ]
1389 |   },
1390 |   {
1391 |    "cell_type": "code",
1392 |    "execution_count": null,
1393 |    "id": "84ecc0de",
1394 |    "metadata": {
1395 |     "lines_to_next_cell": 2,
1396 |     "pycharm": {
1397 |      "name": "#%%\n"
1398 |     }
1399 |    },
1400 |    "outputs": [],
1401 |    "source": []
1402 |   },
1403 |   {
1404 |    "cell_type": "code",
1405 |    "execution_count": null,
1406 |    "id": "bf14ec3f",
1407 |    "metadata": {
1408 |     "lines_to_next_cell": 2,
1409 |     "pycharm": {
1410 |      "name": "#%%\n"
1411 |     }
1412 |    },
1413 |    "outputs": [],
1414 |    "source": []
1415 |   },
1416 |   {
1417 |    "cell_type": "markdown",
1418 |    "id": "003b96b9",
1419 |    "metadata": {
1420 |     "pycharm": {
1421 |      "name": "#%% md\n"
1422 |     }
1423 |    },
1424 |    "source": [
1425 |     "## Chain\n",
1426 |     "\n",
1427 |     "Chaining is also called \"flow\" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.\n",
1428 |     "\n",
1429 |     "The chain should read like a recipe of ordered steps.\n",
1430 |     "\n",
1431 |     "(BTW, this is actually what we did above.)\n",
1432 |     "\n",
1433 |     "<div class='alert alert-warning'>\n",
1434 |     "    Hint: Leverage <tt>.pipe</tt> if you can't find a way to chain 😉🐼💪\n",
1435 |     "</div>\n",
1436 |     "    \n",
1437 |     "\n",
1438 |     "\n"
1439 |    ]
1440 |   },
1441 |   {
1442 |    "cell_type": "code",
1443 |    "execution_count": null,
1444 |    "id": "a74cd1a9",
1445 |    "metadata": {
1446 |     "lines_to_next_cell": 0,
1447 |     "pycharm": {
1448 |      "name": "#%%\n"
1449 |     }
1450 |    },
1451 |    "outputs": [],
1452 |    "source": [
1453 |     "# a glorious function\n",
1454 |     "def tweak_gss(gss):\n",
1455 |     "    return (gss\n",
1456 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1457 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1458 |     "             'MAJOR1': 'category',\n",
1459 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1460 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1461 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1462 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1463 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1464 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1465 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1466 |     "          'HONEST':'honesty_rank',\n",
1467 |     "          'TICKET':'traffic_ticket'})\n",
1468 |     "    )\n",
1469 |     "\n",
1470 |     "tweak_gss(gss)"
1471 |    ]
1472 |   },
1473 |   {
1474 |    "cell_type": "code",
1475 |    "execution_count": null,
1476 |    "id": "efc594da",
1477 |    "metadata": {
1478 |     "pycharm": {
1479 |      "name": "#%%\n"
1480 |     }
1481 |    },
1482 |    "outputs": [],
1483 |    "source": [
1484 |     "# compare chain to this mess\n",
1485 |     "gss2 = gss.copy()\n",
1486 |     "year = gss.YEAR\n",
1487 |     "year_int = year.astype('uint16')\n",
1488 |     "gss2['year'] = year_int\n",
1489 |     "id = gss.ID\n",
1490 |     "id_int = id.astype('uint16')\n",
1491 |     "gss2['year_id'] = id_int\n",
1492 |     "occ = gss.OCC\n",
1493 |     "occ_int = occ.astype('uint16')\n",
1494 |     "gss2['occupation'] = occ_int\n",
1495 |     "\n",
1496 |     "# more of this"
1497 |    ]
1498 |   },
1499 |   {
1500 |    "cell_type": "code",
1501 |    "execution_count": null,
1502 |    "id": "32411eaf",
1503 |    "metadata": {
1504 |     "lines_to_next_cell": 0,
1505 |     "pycharm": {
1506 |      "name": "#%%\n"
1507 |     }
1508 |    },
1509 |    "outputs": [],
1510 |    "source": [
1511 |     "# easy to debug\n",
1512 |     "#  - assign to var (df3)\n",
1513 |     "#  - comment out\n",
1514 |     "#  - pipe to display\n",
1515 |     "\n",
1516 |     "\n",
1517 |     "from IPython.display import display\n",
1518 |     "\n",
1519 |     "def get_var(df, var_name):\n",
1520 |     "    globals()[var_name] = df\n",
1521 |     "    return df\n",
1522 |     "\n",
1523 |     "def tweak_gss(gss):\n",
1524 |     "    return (gss\n",
1525 |     "      .pipe(get_var, 'df3')   \n",
1526 |     "     .pipe(lambda df: print(df.shape) or df)                \n",
1527 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1528 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1529 |     "             'MAJOR1': 'category',\n",
1530 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1531 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1532 |     "     .pipe(lambda df: print(df.shape) or df)                            \n",
1533 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1534 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1535 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1536 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1537 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1538 |     "          'HONEST':'honesty_rank',\n",
1539 |     "          'TICKET':'traffic_ticket'})\n",
1540 |     "     .pipe(lambda df: print(df.shape) or df)                            \n",
1541 |     "    )\n",
1542 |     "\n",
1543 |     "tweak_gss(gss)"
1544 |    ]
1545 |   },
1546 |   {
1547 |    "cell_type": "code",
1548 |    "execution_count": null,
1549 |    "id": "fdc2894e",
1550 |    "metadata": {
1551 |     "pycharm": {
1552 |      "name": "#%%\n"
1553 |     }
1554 |    },
1555 |    "outputs": [],
1556 |    "source": [
1557 |     "# inspect intermediate data frame\n",
1558 |     "df3"
1559 |    ]
1560 |   },
1561 |   {
1562 |    "cell_type": "markdown",
1563 |    "id": "1842701c",
1564 |    "metadata": {
1565 |     "pycharm": {
1566 |      "name": "#%%\n"
1567 |     }
1568 |    },
1569 |    "source": [
1570 |     "## Chain Exercise\n",
1571 |     "* Write a function that accepts a dataframe and an index value. It should print any rows that match the index and return the dataframe that was passed in.\n",
1572 |     "* Use the function with pipe after each step of the chain. Show the rows for index 2 and 64,813.\n",
1573 |     "\n",
1574 |     "\n",
1575 |     "\n",
1576 |     "\n",
1577 |     "\n",
1578 |     "\n",
1579 |     "\n",
1580 |     "\n",
1581 |     "## Don't Mutate\n",
1582 |     "\n",
1583 |     "> \"you are missing the point, inplace rarely actually does something inplace, you are thinking that you are saving memory but you are not.\"\n",
1584 |     ">\n",
1585 |     "> **jreback** - Pandas core dev\n",
1586 |     "\n",
1587 |     "\n",
1588 |     "\n",
1589 |     "https://github.com/pandas-dev/pandas/issues/16529#issuecomment-676518136\n",
1590 |     "\n",
1591 |     "* In general, no performance benefits\n",
1592 |     "* Prohibits chaining\n",
1593 |     "* ``SettingWithCopyWarning`` fun\n"
1594 |    ]
1595 |   },
1596 |   {
1597 |    "cell_type": "code",
1598 |    "execution_count": null,
1599 |    "id": "9b1955ed",
1600 |    "metadata": {
1601 |     "lines_to_next_cell": 2,
1602 |     "pycharm": {
1603 |      "name": "#%%\n"
1604 |     }
1605 |    },
1606 |    "outputs": [],
1607 |    "source": [
1608 |     "pd.read_csv??"
1609 |    ]
1610 |   },
1611 |   {
1612 |    "cell_type": "code",
1613 |    "execution_count": null,
1614 |    "id": "bce7abe3",
1615 |    "metadata": {
1616 |     "lines_to_next_cell": 2,
1617 |     "pycharm": {
1618 |      "name": "#%%\n"
1619 |     }
1620 |    },
1621 |    "outputs": [],
1622 |    "source": []
1623 |   },
1624 |   {
1625 |    "cell_type": "code",
1626 |    "execution_count": null,
1627 |    "id": "4e6a8e2f",
1628 |    "metadata": {
1629 |     "lines_to_next_cell": 2,
1630 |     "pycharm": {
1631 |      "name": "#%%\n"
1632 |     }
1633 |    },
1634 |    "outputs": [],
1635 |    "source": []
1636 |   },
1637 |   {
1638 |    "cell_type": "markdown",
1639 |    "id": "2a263d38",
1640 |    "metadata": {
1641 |     "pycharm": {
1642 |      "name": "#%% md\n"
1643 |     }
1644 |    },
1645 |    "source": [
1646 |     "## Don't Apply (if you can)"
1647 |    ]
1648 |   },
1649 |   {
1650 |    "cell_type": "code",
1651 |    "execution_count": null,
1652 |    "id": "9e68b584",
1653 |    "metadata": {
1654 |     "lines_to_next_cell": 0,
1655 |     "pycharm": {
1656 |      "name": "#%%\n"
1657 |     }
1658 |    },
1659 |    "outputs": [],
1660 |    "source": [
1661 |     "# a glorious function\n",
1662 |     "def tweak_gss(gss):\n",
1663 |     "    return (gss\n",
1664 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1665 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1666 |     "             'MAJOR1': 'category',\n",
1667 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1668 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1669 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1670 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1671 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1672 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1673 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1674 |     "          'HONEST':'honesty_rank',\n",
1675 |     "          'TICKET':'traffic_ticket'})\n",
1676 |     "    )\n",
1677 |     "\n",
1678 |     "gss2 = tweak_gss(gss)"
1679 |    ]
1680 |   },
1681 |   {
1682 |    "cell_type": "code",
1683 |    "execution_count": null,
1684 |    "id": "1a82332f",
1685 |    "metadata": {
1686 |     "pycharm": {
1687 |      "name": "#%%\n"
1688 |     }
1689 |    },
1690 |    "outputs": [],
1691 |    "source": [
1692 |     "# convert age to months\n",
1693 |     "def to_months(val):\n",
1694 |     "    return val * 12\n",
1695 |     "\n",
1696 |     "gss2.age.apply(to_months)"
1697 |    ]
1698 |   },
1699 |   {
1700 |    "cell_type": "code",
1701 |    "execution_count": null,
1702 |    "id": "a221e972",
1703 |    "metadata": {
1704 |     "pycharm": {
1705 |      "name": "#%%\n"
1706 |     }
1707 |    },
1708 |    "outputs": [],
1709 |    "source": [
1710 |     "# this gives the same results\n",
1711 |     "gss2.age * 12"
1712 |    ]
1713 |   },
1714 |   {
1715 |    "cell_type": "code",
1716 |    "execution_count": null,
1717 |    "id": "9cb2b9d2",
1718 |    "metadata": {
1719 |     "pycharm": {
1720 |      "name": "#%%\n"
1721 |     }
1722 |    },
1723 |    "outputs": [],
1724 |    "source": [
1725 |     "%%timeit\n",
1726 |     "gss2.age.apply(to_months)"
1727 |    ]
1728 |   },
1729 |   {
1730 |    "cell_type": "code",
1731 |    "execution_count": null,
1732 |    "id": "51bcc862",
1733 |    "metadata": {
1734 |     "pycharm": {
1735 |      "name": "#%%\n"
1736 |     }
1737 |    },
1738 |    "outputs": [],
1739 |    "source": [
1740 |     "%%timeit\n",
1741 |     "gss2.age * 12"
1742 |    ]
1743 |   },
1744 |   {
1745 |    "cell_type": "code",
1746 |    "execution_count": null,
1747 |    "id": "72a01657",
1748 |    "metadata": {
1749 |     "pycharm": {
1750 |      "name": "#%%\n"
1751 |     }
1752 |    },
1753 |    "outputs": [],
1754 |    "source": [
1755 |     "# ~42x slower!\n",
1756 |     "4_590 / 110"
1757 |    ]
1758 |   },
1759 |   {
1760 |    "cell_type": "code",
1761 |    "execution_count": null,
1762 |    "id": "619094f7",
1763 |    "metadata": {},
1764 |    "outputs": [],
1765 |    "source": [
1766 |     "gss.MAJOR1.value_counts()[:20]"
1767 |    ]
1768 |   },
1769 |   {
1770 |    "cell_type": "code",
1771 |    "execution_count": null,
1772 |    "id": "f4817aee",
1773 |    "metadata": {
1774 |     "pycharm": {
1775 |      "name": "#%%\n"
1776 |     }
1777 |    },
1778 |    "outputs": [],
1779 |    "source": [
1780 |     "def is_science(val):\n",
1781 |     "    return val in {'Engineering', 'Computer science', 'Biology'}"
1782 |    ]
1783 |   },
1784 |   {
1785 |    "cell_type": "code",
1786 |    "execution_count": null,
1787 |    "id": "f00a069c",
1788 |    "metadata": {
1789 |     "pycharm": {
1790 |      "name": "#%%\n"
1791 |     }
1792 |    },
1793 |    "outputs": [],
1794 |    "source": [
1795 |     "%%timeit\n",
1796 |     "# string\n",
1797 |     "gss.MAJOR1.apply(is_science)"
1798 |    ]
1799 |   },
1800 |   {
1801 |    "cell_type": "code",
1802 |    "execution_count": null,
1803 |    "id": "5e13ae10",
1804 |    "metadata": {
1805 |     "pycharm": {
1806 |      "name": "#%%\n"
1807 |     }
1808 |    },
1809 |    "outputs": [],
1810 |    "source": [
1811 |     "%%timeit\n",
1812 |     "gss.MAJOR1.isin({'Engineering', 'Computer science', 'Biology'})"
1813 |    ]
1814 |   },
1815 |   {
1816 |    "cell_type": "code",
1817 |    "execution_count": null,
1818 |    "id": "dc933ec1",
1819 |    "metadata": {
1820 |     "lines_to_next_cell": 0,
1821 |     "pycharm": {
1822 |      "name": "#%%\n"
1823 |     }
1824 |    },
1825 |    "outputs": [],
1826 |    "source": [
1827 |     "%%timeit\n",
1828 |     "# categorical\n",
1829 |     "gss2.college_major.isin({'Engineering', 'Computer science', 'Biology'})"
1830 |    ]
1831 |   },
1832 |   {
1833 |    "cell_type": "code",
1834 |    "execution_count": null,
1835 |    "id": "42a822c2",
1836 |    "metadata": {
1837 |     "lines_to_next_cell": 2
1838 |    },
1839 |    "outputs": [],
1840 |    "source": []
1841 |   },
1842 |   {
1843 |    "cell_type": "markdown",
1844 |    "id": "d56720b4",
1845 |    "metadata": {},
1846 |    "source": [
1847 |     "## Apply Exercise\n",
1848 |     "* Make a new column called *minutes_worked* derived from the *hours_worked* column.\n",
1849 |     "* Make a new column called *income_ratio*.\n",
1850 |     "  * Convert the income columns to numbers (replace `'No answer'` and `'Refused'` with `np.nan`).\n",
1851 |     "  * Fill in the missing values with the median\n",
1852 |     "  * Divide the 2006 value by 1970 value"
1853 |    ]
1854 |   },
1855 |   {
1856 |    "cell_type": "code",
1857 |    "execution_count": null,
1858 |    "id": "3b818eb6",
1859 |    "metadata": {},
1860 |    "outputs": [],
1861 |    "source": []
1862 |   },
1863 |   {
1864 |    "cell_type": "code",
1865 |    "execution_count": null,
1866 |    "id": "59ebffeb",
1867 |    "metadata": {},
1868 |    "outputs": [],
1869 |    "source": []
1870 |   },
1871 |   {
1872 |    "cell_type": "code",
1873 |    "execution_count": null,
1874 |    "id": "9d2a4e10",
1875 |    "metadata": {},
1876 |    "outputs": [],
1877 |    "source": []
1878 |   },
1879 |   {
1880 |    "cell_type": "code",
1881 |    "execution_count": null,
1882 |    "id": "132efb76",
1883 |    "metadata": {},
1884 |    "outputs": [],
1885 |    "source": []
1886 |   },
1887 |   {
1888 |    "cell_type": "code",
1889 |    "execution_count": null,
1890 |    "id": "f3631607",
1891 |    "metadata": {},
1892 |    "outputs": [],
1893 |    "source": []
1894 |   },
1895 |   {
1896 |    "cell_type": "code",
1897 |    "execution_count": null,
1898 |    "id": "b1f627b9",
1899 |    "metadata": {},
1900 |    "outputs": [],
1901 |    "source": []
1902 |   },
1903 |   {
1904 |    "cell_type": "code",
1905 |    "execution_count": null,
1906 |    "id": "eee8ef40",
1907 |    "metadata": {},
1908 |    "outputs": [],
1909 |    "source": []
1910 |   },
1911 |   {
1912 |    "cell_type": "code",
1913 |    "execution_count": null,
1914 |    "id": "c19d66ca",
1915 |    "metadata": {},
1916 |    "outputs": [],
1917 |    "source": []
1918 |   },
1919 |   {
1920 |    "cell_type": "code",
1921 |    "execution_count": null,
1922 |    "id": "85c49f0f",
1923 |    "metadata": {},
1924 |    "outputs": [],
1925 |    "source": []
1926 |   },
1927 |   {
1928 |    "cell_type": "code",
1929 |    "execution_count": null,
1930 |    "id": "537487a7",
1931 |    "metadata": {},
1932 |    "outputs": [],
1933 |    "source": []
1934 |   },
1935 |   {
1936 |    "cell_type": "code",
1937 |    "execution_count": null,
1938 |    "id": "f62adb56",
1939 |    "metadata": {},
1940 |    "outputs": [],
1941 |    "source": []
1942 |   },
1943 |   {
1944 |    "cell_type": "code",
1945 |    "execution_count": null,
1946 |    "id": "8c59b615",
1947 |    "metadata": {},
1948 |    "outputs": [],
1949 |    "source": []
1950 |   },
1951 |   {
1952 |    "cell_type": "code",
1953 |    "execution_count": null,
1954 |    "id": "55ce9070",
1955 |    "metadata": {},
1956 |    "outputs": [],
1957 |    "source": []
1958 |   },
1959 |   {
1960 |    "cell_type": "code",
1961 |    "execution_count": null,
1962 |    "id": "15e221af",
1963 |    "metadata": {},
1964 |    "outputs": [],
1965 |    "source": []
1966 |   },
1967 |   {
1968 |    "cell_type": "code",
1969 |    "execution_count": null,
1970 |    "id": "7ed68b41",
1971 |    "metadata": {},
1972 |    "outputs": [],
1973 |    "source": []
1974 |   },
1975 |   {
1976 |    "cell_type": "code",
1977 |    "execution_count": null,
1978 |    "id": "ae90e79b",
1979 |    "metadata": {},
1980 |    "outputs": [],
1981 |    "source": []
1982 |   },
1983 |   {
1984 |    "cell_type": "code",
1985 |    "execution_count": null,
1986 |    "id": "96420545",
1987 |    "metadata": {},
1988 |    "outputs": [],
1989 |    "source": []
1990 |   },
1991 |   {
1992 |    "cell_type": "code",
1993 |    "execution_count": null,
1994 |    "id": "ba8633ef",
1995 |    "metadata": {},
1996 |    "outputs": [],
1997 |    "source": []
1998 |   },
1999 |   {
2000 |    "cell_type": "code",
2001 |    "execution_count": null,
2002 |    "id": "8993c970",
2003 |    "metadata": {},
2004 |    "outputs": [],
2005 |    "source": []
2006 |   },
2007 |   {
2008 |    "cell_type": "code",
2009 |    "execution_count": null,
2010 |    "id": "0ca9ac23",
2011 |    "metadata": {},
2012 |    "outputs": [],
2013 |    "source": []
2014 |   },
2015 |   {
2016 |    "cell_type": "code",
2017 |    "execution_count": null,
2018 |    "id": "139432bb",
2019 |    "metadata": {},
2020 |    "outputs": [],
2021 |    "source": []
2022 |   },
2023 |   {
2024 |    "cell_type": "code",
2025 |    "execution_count": null,
2026 |    "id": "816e9c31",
2027 |    "metadata": {},
2028 |    "outputs": [],
2029 |    "source": []
2030 |   },
2031 |   {
2032 |    "cell_type": "code",
2033 |    "execution_count": null,
2034 |    "id": "4f71cd5b",
2035 |    "metadata": {},
2036 |    "outputs": [],
2037 |    "source": []
2038 |   },
2039 |   {
2040 |    "cell_type": "code",
2041 |    "execution_count": null,
2042 |    "id": "8f4975a6",
2043 |    "metadata": {},
2044 |    "outputs": [],
2045 |    "source": []
2046 |   },
2047 |   {
2048 |    "cell_type": "code",
2049 |    "execution_count": null,
2050 |    "id": "44aa5162",
2051 |    "metadata": {},
2052 |    "outputs": [],
2053 |    "source": []
2054 |   },
2055 |   {
2056 |    "cell_type": "code",
2057 |    "execution_count": null,
2058 |    "id": "e5702992",
2059 |    "metadata": {},
2060 |    "outputs": [],
2061 |    "source": []
2062 |   },
2063 |   {
2064 |    "cell_type": "code",
2065 |    "execution_count": null,
2066 |    "id": "c0dead63",
2067 |    "metadata": {
2068 |     "lines_to_next_cell": 2
2069 |    },
2070 |    "outputs": [],
2071 |    "source": []
2072 |   },
2073 |   {
2074 |    "cell_type": "markdown",
2075 |    "id": "e0faa823",
2076 |    "metadata": {},
2077 |    "source": [
2078 |     "## Master Aggregation\n",
2079 |     "\n",
2080 |     "Let's compare age by sex by year...🤔"
2081 |    ]
2082 |   },
2083 |   {
2084 |    "cell_type": "code",
2085 |    "execution_count": null,
2086 |    "id": "d444c6b8",
2087 |    "metadata": {
2088 |     "pycharm": {
2089 |      "name": "#%%\n"
2090 |     }
2091 |    },
2092 |    "outputs": [],
2093 |    "source": [
2094 |     "(gss2\n",
2095 |     "   .groupby('year')\n",
2096 |     "   .mean()\n",
2097 |     ")"
2098 |    ]
2099 |   },
2100 |   {
2101 |    "cell_type": "code",
2102 |    "execution_count": null,
2103 |    "id": "85441b08",
2104 |    "metadata": {
2105 |     "pycharm": {
2106 |      "name": "#%%\n"
2107 |     }
2108 |    },
2109 |    "outputs": [],
2110 |    "source": [
2111 |     "(gss2\n",
2112 |     "   .groupby('year')\n",
2113 |     "   .mean(numeric_only=True)\n",
2114 |     ")"
2115 |    ]
2116 |   },
2117 |   {
2118 |    "cell_type": "code",
2119 |    "execution_count": null,
2120 |    "id": "eadbc6cd",
2121 |    "metadata": {
2122 |     "pycharm": {
2123 |      "name": "#%%\n"
2124 |     }
2125 |    },
2126 |    "outputs": [],
2127 |    "source": [
2128 |     "(gss2\n",
2129 |     "   .groupby('year')\n",
2130 |     "   [['age', 'hours_worked']]\n",
2131 |     "   .mean()\n",
2132 |     ")"
2133 |    ]
2134 |   },
2135 |   {
2136 |    "cell_type": "code",
2137 |    "execution_count": null,
2138 |    "id": "a6d008ae",
2139 |    "metadata": {
2140 |     "pycharm": {
2141 |      "name": "#%%\n"
2142 |     }
2143 |    },
2144 |    "outputs": [],
2145 |    "source": [
2146 |     "import matplotlib.pyplot as plt\n",
2147 |     "import seaborn as sns\n",
2148 |     "#plt.style.use('pandas1book') \n",
2149 |     "sns.set_context('talk')\n",
2150 |     "plt.plot(range(10))"
2151 |    ]
2152 |   },
2153 |   {
2154 |    "cell_type": "code",
2155 |    "execution_count": null,
2156 |    "id": "ffc36b52",
2157 |    "metadata": {
2158 |     "pycharm": {
2159 |      "name": "#%%\n"
2160 |     }
2161 |    },
2162 |    "outputs": [],
2163 |    "source": [
2164 |     "(gss2\n",
2165 |     "   .groupby('year')\n",
2166 |     "   [['age', 'hours_worked']]\n",
2167 |     "   .median()\n",
2168 |     "   .plot()\n",
2169 |     ")"
2170 |    ]
2171 |   },
2172 |   {
2173 |    "cell_type": "code",
2174 |    "execution_count": null,
2175 |    "id": "5bdcd3e8",
2176 |    "metadata": {
2177 |     "pycharm": {
2178 |      "name": "#%%\n"
2179 |     }
2180 |    },
2181 |    "outputs": [],
2182 |    "source": [
2183 |     "(gss2\n",
2184 |     "   .groupby('year')\n",
2185 |     "   [['age', 'hours_worked']]\n",
2186 |     "   #.mean()\n",
2187 |     "   #.median()\n",
2188 |     "   #.std()\n",
2189 |     "   .max()\n",
2190 |     "   .plot()\n",
2191 |     ")"
2192 |    ]
2193 |   },
2194 |   {
2195 |    "cell_type": "code",
2196 |    "execution_count": null,
2197 |    "id": "54ebb97d",
2198 |    "metadata": {
2199 |     "lines_to_next_cell": 2,
2200 |     "pycharm": {
2201 |      "name": "#%%\n"
2202 |     }
2203 |    },
2204 |    "outputs": [],
2205 |    "source": [
2206 |     "# add sex\n",
2207 |     "(gss2\n",
2208 |     "   .groupby(['year', 'sex'])\n",
2209 |     "   [['age', 'hours_worked']]\n",
2210 |     "   .mean()\n",
2211 |     "   #.median()\n",
2212 |     "   #.std()\n",
2213 |     "   #.max()\n",
2214 |     "   #.plot()\n",
2215 |     ")"
2216 |    ]
2217 |   },
2218 |   {
2219 |    "cell_type": "code",
2220 |    "execution_count": null,
2221 |    "id": "266a53da",
2222 |    "metadata": {
2223 |     "lines_to_next_cell": 2,
2224 |     "pycharm": {
2225 |      "name": "#%%\n"
2226 |     }
2227 |    },
2228 |    "outputs": [],
2229 |    "source": [
2230 |     "# add sex\n",
2231 |     "(gss2\n",
2232 |     "   .groupby(['year', 'sex'])\n",
2233 |     "   [['age', 'hours_worked']]\n",
2234 |     "   .mean()\n",
2235 |     "   #.median()\n",
2236 |     "   #.std()\n",
2237 |     "   #.max()\n",
2238 |     "   .plot()\n",
2239 |     ")"
2240 |    ]
2241 |   },
2242 |   {
2243 |    "cell_type": "code",
2244 |    "execution_count": null,
2245 |    "id": "2e20f409",
2246 |    "metadata": {
2247 |     "lines_to_next_cell": 2,
2248 |     "pycharm": {
2249 |      "name": "#%%\n"
2250 |     }
2251 |    },
2252 |    "outputs": [],
2253 |    "source": [
2254 |     "# unstack\n",
2255 |     "(gss2\n",
2256 |     "   .groupby(['year', 'sex'])\n",
2257 |     "   [['age', 'hours_worked']]\n",
2258 |     "   .mean()\n",
2259 |     "   #.median()\n",
2260 |     "   #.std()\n",
2261 |     "   #.max()\n",
2262 |     "   .unstack() \n",
2263 |     "   .plot()\n",
2264 |     ")"
2265 |    ]
2266 |   },
2267 |   {
2268 |    "cell_type": "code",
2269 |    "execution_count": null,
2270 |    "id": "d5481e10",
2271 |    "metadata": {
2272 |     "lines_to_next_cell": 2,
2273 |     "pycharm": {
2274 |      "name": "#%%\n"
2275 |     }
2276 |    },
2277 |    "outputs": [],
2278 |    "source": [
2279 |     "(gss2\n",
2280 |     "   .groupby(['year', 'sex'])\n",
2281 |     "   [['age', 'hours_worked']]\n",
2282 |     "   .mean()\n",
2283 |     "   .unstack()\n",
2284 |     "   .age\n",
2285 |     ")"
2286 |    ]
2287 |   },
2288 |   {
2289 |    "cell_type": "code",
2290 |    "execution_count": null,
2291 |    "id": "9e01d055",
2292 |    "metadata": {
2293 |     "lines_to_next_cell": 2,
2294 |     "pycharm": {
2295 |      "name": "#%%\n"
2296 |     }
2297 |    },
2298 |    "outputs": [],
2299 |    "source": [
2300 |     "(gss2\n",
2301 |     "   .groupby(['year', 'sex'])\n",
2302 |     "   [['age', 'hours_worked']]\n",
2303 |     "   .mean()\n",
2304 |     "   .unstack()\n",
2305 |     "   .age\n",
2306 |     "   .plot()\n",
2307 |     "   .legend(bbox_to_anchor=(1,1))\n",
2308 |     ")"
2309 |    ]
2310 |   },
2311 |   {
2312 |    "cell_type": "code",
2313 |    "execution_count": null,
2314 |    "id": "d1528728",
2315 |    "metadata": {
2316 |     "pycharm": {
2317 |      "name": "#%%\n"
2318 |     }
2319 |    },
2320 |    "outputs": [],
2321 |    "source": [
2322 |     "# Let's try looking at hours worked\n",
2323 |     "(gss2\n",
2324 |     "   .groupby(['year', 'sex'])\n",
2325 |     "   [['age', 'hours_worked']]\n",
2326 |     "   .mean()\n",
2327 |     "   .unstack()\n",
2328 |     "   .hours_worked\n",
2329 |     "   .plot()\n",
2330 |     "   .legend(bbox_to_anchor=(1,1))\n",
2331 |     ")"
2332 |    ]
2333 |   },
2334 |   {
2335 |    "cell_type": "code",
2336 |    "execution_count": null,
2337 |    "id": "a52537a5",
2338 |    "metadata": {
2339 |     "lines_to_next_cell": 2,
2340 |     "pycharm": {
2341 |      "name": "#%%\n"
2342 |     }
2343 |    },
2344 |    "outputs": [],
2345 |    "source": [
2346 |     "# Multiple aggregates\n",
2347 |     "def second(group):\n",
2348 |     "    return group.iloc[1]\n",
2349 |     "(gss2\n",
2350 |     "   .groupby(['year', 'sex'])\n",
2351 |     "   [['age', 'hours_worked']]\n",
2352 |     "  .agg(['min', 'max', 'mean', second])\n",
2353 |     "   \n",
2354 |     ")"
2355 |    ]
2356 |   },
2357 |   {
2358 |    "cell_type": "code",
2359 |    "execution_count": null,
2360 |    "id": "b780beb4",
2361 |    "metadata": {
2362 |     "lines_to_next_cell": 2,
2363 |     "pycharm": {
2364 |      "name": "#%%\n"
2365 |     }
2366 |    },
2367 |    "outputs": [],
2368 |    "source": []
2369 |   },
2370 |   {
2371 |    "cell_type": "markdown",
2372 |    "id": "9aca44a7",
2373 |    "metadata": {},
2374 |    "source": [
2375 |     "## Aggregation Exercise\n",
2376 |     "* Which occupation has the highest median hours worked?\n",
2377 |     "* Which occupation has the lowest age?\n",
2378 |     "* What is the breakdown of respondents by race for each year?\n",
2379 |     "* Convert the previous breakdown to percentages.\n",
2380 |     "* How many unique occupations are there for each year?\n",
2381 |     "* What is the most popular college_major for each year?\n",
2382 |     "* What is the second most popular college_major for each year?"
2383 |    ]
2384 |   },
2385 |   {
2386 |    "cell_type": "code",
2387 |    "execution_count": null,
2388 |    "id": "9e5477d4",
2389 |    "metadata": {},
2390 |    "outputs": [],
2391 |    "source": []
2392 |   },
2393 |   {
2394 |    "cell_type": "code",
2395 |    "execution_count": null,
2396 |    "id": "bcc93724",
2397 |    "metadata": {},
2398 |    "outputs": [],
2399 |    "source": []
2400 |   },
2401 |   {
2402 |    "cell_type": "code",
2403 |    "execution_count": null,
2404 |    "id": "3a7f368e",
2405 |    "metadata": {},
2406 |    "outputs": [],
2407 |    "source": []
2408 |   },
2409 |   {
2410 |    "cell_type": "code",
2411 |    "execution_count": null,
2412 |    "id": "697919b8",
2413 |    "metadata": {},
2414 |    "outputs": [],
2415 |    "source": []
2416 |   },
2417 |   {
2418 |    "cell_type": "code",
2419 |    "execution_count": null,
2420 |    "id": "3d93f8db",
2421 |    "metadata": {},
2422 |    "outputs": [],
2423 |    "source": []
2424 |   },
2425 |   {
2426 |    "cell_type": "code",
2427 |    "execution_count": null,
2428 |    "id": "3907736f",
2429 |    "metadata": {},
2430 |    "outputs": [],
2431 |    "source": []
2432 |   },
2433 |   {
2434 |    "cell_type": "code",
2435 |    "execution_count": null,
2436 |    "id": "18186540",
2437 |    "metadata": {},
2438 |    "outputs": [],
2439 |    "source": []
2440 |   },
2441 |   {
2442 |    "cell_type": "code",
2443 |    "execution_count": null,
2444 |    "id": "f1089e32",
2445 |    "metadata": {},
2446 |    "outputs": [],
2447 |    "source": []
2448 |   },
2449 |   {
2450 |    "cell_type": "code",
2451 |    "execution_count": null,
2452 |    "id": "116593ba",
2453 |    "metadata": {},
2454 |    "outputs": [],
2455 |    "source": []
2456 |   },
2457 |   {
2458 |    "cell_type": "code",
2459 |    "execution_count": null,
2460 |    "id": "aaa6e44e",
2461 |    "metadata": {},
2462 |    "outputs": [],
2463 |    "source": []
2464 |   },
2465 |   {
2466 |    "cell_type": "code",
2467 |    "execution_count": null,
2468 |    "id": "648b7e1e",
2469 |    "metadata": {},
2470 |    "outputs": [],
2471 |    "source": []
2472 |   },
2473 |   {
2474 |    "cell_type": "code",
2475 |    "execution_count": null,
2476 |    "id": "7ee8d6cc",
2477 |    "metadata": {},
2478 |    "outputs": [],
2479 |    "source": []
2480 |   },
2481 |   {
2482 |    "cell_type": "code",
2483 |    "execution_count": null,
2484 |    "id": "4f252bd0",
2485 |    "metadata": {},
2486 |    "outputs": [],
2487 |    "source": []
2488 |   },
2489 |   {
2490 |    "cell_type": "code",
2491 |    "execution_count": null,
2492 |    "id": "b6090240",
2493 |    "metadata": {},
2494 |    "outputs": [],
2495 |    "source": []
2496 |   },
2497 |   {
2498 |    "cell_type": "code",
2499 |    "execution_count": null,
2500 |    "id": "a37bb2df",
2501 |    "metadata": {
2502 |     "lines_to_next_cell": 2
2503 |    },
2504 |    "outputs": [],
2505 |    "source": []
2506 |   },
2507 |   {
2508 |    "cell_type": "markdown",
2509 |    "id": "7cf8f182",
2510 |    "metadata": {},
2511 |    "source": [
2512 |     "## Summary\n",
2513 |     "\n",
2514 |     "* Correct types save space and enable convenient math, string, and date functionality\n",
2515 |     "* Chaining operations will:\n",
2516 |     "   * Make code readable\n",
2517 |     "   * Remove bugs\n",
2518 |     "   * Make code easier to debug\n",
2519 |     "* Don't mutate (there's no point). Embrace chaining.\n",
2520 |     "* ``.apply`` is slow for math\n",
2521 |     "* Aggregations are powerful. Play with them until they make sense\n",
2522 |     "\n",
2523 |     "Follow on Twitter ``@__mharrison__``\n",
2524 |     "\n",
2525 |     "Book giveaway!"
2526 |    ]
2527 |   },
2528 |   {
2529 |    "cell_type": "code",
2530 |    "execution_count": null,
2531 |    "id": "89444b02",
2532 |    "metadata": {
2533 |     "pycharm": {
2534 |      "name": "#%%\n"
2535 |     }
2536 |    },
2537 |    "outputs": [],
2538 |    "source": [
2539 |     "import random\n",
2540 |     "random.randrange(1,13)"
2541 |    ]
2542 |   },
2543 |   {
2544 |    "cell_type": "code",
2545 |    "execution_count": null,
2546 |    "id": "35931834",
2547 |    "metadata": {
2548 |     "lines_to_next_cell": 2,
2549 |     "pycharm": {
2550 |      "name": "#%%\n"
2551 |     }
2552 |    },
2553 |    "outputs": [],
2554 |    "source": []
2555 |   },
2556 |   {
2557 |    "cell_type": "code",
2558 |    "execution_count": null,
2559 |    "id": "53ab759b",
2560 |    "metadata": {
2561 |     "lines_to_next_cell": 2,
2562 |     "pycharm": {
2563 |      "name": "#%%\n"
2564 |     }
2565 |    },
2566 |    "outputs": [],
2567 |    "source": []
2568 |   },
2569 |   {
2570 |    "cell_type": "code",
2571 |    "execution_count": null,
2572 |    "id": "063efccd",
2573 |    "metadata": {
2574 |     "lines_to_next_cell": 2,
2575 |     "pycharm": {
2576 |      "name": "#%%\n"
2577 |     }
2578 |    },
2579 |    "outputs": [],
2580 |    "source": []
2581 |   },
2582 |   {
2583 |    "cell_type": "code",
2584 |    "execution_count": null,
2585 |    "id": "26386f48",
2586 |    "metadata": {
2587 |     "pycharm": {
2588 |      "name": "#%%\n"
2589 |     }
2590 |    },
2591 |    "outputs": [],
2592 |    "source": []
2593 |   }
2594 |  ],
2595 |  "metadata": {
2596 |   "kernelspec": {
2597 |    "display_name": "Python 3 (ipykernel)",
2598 |    "language": "python",
2599 |    "name": "python3"
2600 |   }
2601 |  },
2602 |  "nbformat": 4,
2603 |  "nbformat_minor": 5
2604 | }
2605 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | notebook==7.0.3
2 | pandas==2.1.1
3 | pyarrow==13.0.0
4 | 
5 | 


--------------------------------------------------------------------------------