├── honest.fth
├── pyproject.toml
├── .devcontainer
    └── devcontainer.json
├── README.md
└── pandas-best-practices.ipynb


/honest.fth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattharrison/pearson-pandas-best-practices/main/honest.fth


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "pearson-pandas-best-practices"
 3 | version = "0.1.0"
 4 | requires-python = ">=3.12"
 5 | dependencies = [
 6 |     "matplotlib>=3.10.5",
 7 |     "notebook>=7.4.5",
 8 |     "pandas>=2.3.1",
 9 |     "pyarrow>=21.0.0",
10 |     "seaborn>=0.13.2",
11 | ]
12 | 


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "image": "mcr.microsoft.com/devcontainers/universal:2",
 3 |     "hostRequirements": {
 4 |         "cpus": 2
 5 |     },
 6 |     "features": {
 7 |         "ghcr.io/va-h/devcontainers-features/uv:1": {}
 8 |     },
 9 |     "waitFor": "onCreateCommand",
10 |     "updateContentCommand": "",
11 |     "postCreateCommand": "python3 -m pip install uv; uv sync",
12 |     "customizations": {
13 |         "codespaces": {
14 |             "openFiles": []
15 |         },
16 |         "vscode": {
17 |             "extensions": [
18 |                 "ms-toolsai.jupyter",
19 |                 "ms-python.python"
20 |             ]
21 |         }
22 |     }
23 | }
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pearson-pandas-best-practices
 2 | 
 3 | This course introduces best practices for Pandas.
 4 | 
 5 | ## Resources
 6 | 
 7 | See the author's book, [Effective Pandas (digital)](https://store.metasnake.com/effective-pandas-book) [(physical)](https://amzn.to/43dt50h)
 8 | 
 9 | ![Effective Pandas](https://d31ezp3r8jwmks.cloudfront.net/3ytw9atdhoe9ezz1i5hctlspkre4)
10 | 
11 | ## Running
12 | 
13 | You have a few options for running.
14 | The easiest option:
15 | 
16 | ### Use Codespaces
17 | 
18 | - Click on the green "Code" button 
19 | - Select the "Codespaces" Tab. 
20 | - Hit "Create codespace on main" button
21 | - Wait a few minutes for the codespace to load
22 | 
23 | ### Run Locally with UV
24 | 
25 | - Install UV (https://docs.astral.sh/uv/getting-started/installation/)
26 | - Check out the project
27 | - Change into the project directory
28 | - Run `uv sync`
29 | - Start Jupyter: `uv run jupyter notebook`
30 | 


--------------------------------------------------------------------------------
/pandas-best-practices.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "id": "88981ab7",
   6 |    "metadata": {},
   7 |    "source": []
   8 |   },
   9 |   {
  10 |    "cell_type": "markdown",
  11 |    "id": "597d4814",
  12 |    "metadata": {
  13 |     "lines_to_next_cell": 0,
  14 |     "pycharm": {
  15 |      "name": "#%% md\n"
  16 |     }
  17 |    },
  18 |    "source": [
  19 |     "# Pandas Best Practices\n",
  20 |     "## 5 Tips for Better Pandas Code"
  21 |    ]
  22 |   },
  23 |   {
  24 |    "cell_type": "code",
  25 |    "execution_count": null,
  26 |    "id": "fe77712e",
  27 |    "metadata": {
  28 |     "lines_to_next_cell": 2
  29 |    },
  30 |    "outputs": [],
  31 |    "source": []
  32 |   },
  33 |   {
  34 |    "cell_type": "markdown",
  35 |    "id": "6188e883",
  36 |    "metadata": {
  37 |     "pycharm": {
  38 |      "name": "#%% md\n"
  39 |     }
  40 |    },
  41 |    "source": [
  42 |     "## About Matt  Harrison @\\_\\_mharrison\\_\\_\n",
  43 |     "\n",
  44 |     "* Author of Effective Pandas, Machine Learning Pocket Reference, and Illustrated Guide to Python 3.\n",
  45 |     "* Advisor at Ponder (creators of Modin)\n",
  46 |     "* Corporate trainer at MetaSnake. Has taught Pandas to thousands of students.\n",
  47 |     "* Use coupon LIVE for 10% off Effective Pandas book or bundle ( https://store.metasnake.com )"
  48 |    ]
  49 |   },
  50 |   {
  51 |    "cell_type": "code",
  52 |    "execution_count": null,
  53 |    "id": "8bf569d6",
  54 |    "metadata": {
  55 |     "lines_to_next_cell": 2,
  56 |     "pycharm": {
  57 |      "name": "#%%\n"
  58 |     }
  59 |    },
  60 |    "outputs": [],
  61 |    "source": []
  62 |   },
  63 |   {
  64 |    "cell_type": "code",
  65 |    "execution_count": null,
  66 |    "id": "68b20887",
  67 |    "metadata": {
  68 |     "lines_to_next_cell": 2,
  69 |     "pycharm": {
  70 |      "name": "#%%\n"
  71 |     }
  72 |    },
  73 |    "outputs": [],
  74 |    "source": []
  75 |   },
  76 |   {
  77 |    "cell_type": "code",
  78 |    "execution_count": null,
  79 |    "id": "b3fd6901",
  80 |    "metadata": {
  81 |     "lines_to_next_cell": 2,
  82 |     "pycharm": {
  83 |      "name": "#%%\n"
  84 |     }
  85 |    },
  86 |    "outputs": [],
  87 |    "source": []
  88 |   },
  89 |   {
  90 |    "cell_type": "code",
  91 |    "execution_count": null,
  92 |    "id": "352e081c",
  93 |    "metadata": {
  94 |     "lines_to_next_cell": 2,
  95 |     "pycharm": {
  96 |      "name": "#%%\n"
  97 |     }
  98 |    },
  99 |    "outputs": [],
 100 |    "source": []
 101 |   },
 102 |   {
 103 |    "cell_type": "code",
 104 |    "execution_count": null,
 105 |    "id": "61a77bda",
 106 |    "metadata": {
 107 |     "lines_to_next_cell": 2,
 108 |     "pycharm": {
 109 |      "name": "#%%\n"
 110 |     }
 111 |    },
 112 |    "outputs": [],
 113 |    "source": []
 114 |   },
 115 |   {
 116 |    "cell_type": "code",
 117 |    "execution_count": null,
 118 |    "id": "c7a1b91f",
 119 |    "metadata": {
 120 |     "lines_to_next_cell": 2,
 121 |     "pycharm": {
 122 |      "name": "#%%\n"
 123 |     }
 124 |    },
 125 |    "outputs": [],
 126 |    "source": []
 127 |   },
 128 |   {
 129 |    "cell_type": "markdown",
 130 |    "id": "5c67d9ed",
 131 |    "metadata": {
 132 |     "pycharm": {
 133 |      "name": "#%% md\n"
 134 |     }
 135 |    },
 136 |    "source": [
 137 |     "## Practice this on your data with your team!\n",
 138 |     "* Contact me matt@metasnake.com\n",
 139 |     "* Follow on Twitter @\\_\\_mharrison\\_\\_"
 140 |    ]
 141 |   },
 142 |   {
 143 |    "cell_type": "code",
 144 |    "execution_count": null,
 145 |    "id": "b56b65e9",
 146 |    "metadata": {
 147 |     "lines_to_next_cell": 2,
 148 |     "pycharm": {
 149 |      "name": "#%%\n"
 150 |     }
 151 |    },
 152 |    "outputs": [],
 153 |    "source": []
 154 |   },
 155 |   {
 156 |    "cell_type": "code",
 157 |    "execution_count": null,
 158 |    "id": "7c579886",
 159 |    "metadata": {
 160 |     "lines_to_next_cell": 2,
 161 |     "pycharm": {
 162 |      "name": "#%%\n"
 163 |     }
 164 |    },
 165 |    "outputs": [],
 166 |    "source": []
 167 |   },
 168 |   {
 169 |    "cell_type": "markdown",
 170 |    "id": "c38061e7",
 171 |    "metadata": {
 172 |     "pycharm": {
 173 |      "name": "#%% md\n"
 174 |     }
 175 |    },
 176 |    "source": [
 177 |     "## Outline\n",
 178 |     "\n",
 179 |     "* Load Data\n",
 180 |     "* Types\n",
 181 |     "* Chaining\n",
 182 |     "* Mutation\n",
 183 |     "* Apply\n",
 184 |     "* Aggregation"
 185 |    ]
 186 |   },
 187 |   {
 188 |    "cell_type": "markdown",
 189 |    "id": "dc9d13b9",
 190 |    "metadata": {
 191 |     "pycharm": {
 192 |      "name": "#%% md\n"
 193 |     }
 194 |    },
 195 |    "source": [
 196 |     "## Imports"
 197 |    ]
 198 |   },
 199 |   {
 200 |    "cell_type": "code",
 201 |    "execution_count": 1,
 202 |    "id": "ec04b162",
 203 |    "metadata": {
 204 |     "lines_to_next_cell": 2,
 205 |     "pycharm": {
 206 |      "name": "#%%\n"
 207 |     }
 208 |    },
 209 |    "outputs": [],
 210 |    "source": [
 211 |     "%matplotlib inline\n",
 212 |     "from IPython.display import display\n",
 213 |     "import numpy as np\n",
 214 |     "import pandas as pd\n",
 215 |     "import pyarrow\n",
 216 |     "\n",
 217 |     "import io\n",
 218 |     "import zipfile"
 219 |    ]
 220 |   },
 221 |   {
 222 |    "cell_type": "code",
 223 |    "execution_count": null,
 224 |    "id": "29ef6997",
 225 |    "metadata": {},
 226 |    "outputs": [],
 227 |    "source": [
 228 |     "pd.__version__"
 229 |    ]
 230 |   },
 231 |   {
 232 |    "cell_type": "code",
 233 |    "execution_count": null,
 234 |    "id": "ae401f97",
 235 |    "metadata": {},
 236 |    "outputs": [],
 237 |    "source": [
 238 |     "pyarrow.__version__"
 239 |    ]
 240 |   },
 241 |   {
 242 |    "cell_type": "code",
 243 |    "execution_count": null,
 244 |    "id": "1dea3558",
 245 |    "metadata": {},
 246 |    "outputs": [],
 247 |    "source": []
 248 |   },
 249 |   {
 250 |    "cell_type": "markdown",
 251 |    "id": "1d7ee1c0",
 252 |    "metadata": {},
 253 |    "source": [
 254 |     "## Data Preprocessing\n",
 255 |     "\n",
 256 |     "Don't run this code. I'm providing it here to show you where the data came from.\n",
 257 |     "(If you really want to run it, download the ZIP file and update the path.)"
 258 |    ]
 259 |   },
 260 |   {
 261 |    "cell_type": "code",
 262 |    "execution_count": null,
 263 |    "id": "3a5723a0",
 264 |    "metadata": {},
 265 |    "outputs": [],
 266 |    "source": [
 267 |     "# https://gss.norc.org/get-the-data/spss\n",
 268 |     "# https://gss.norc.org/Documents/spss/gss_spss_with_codebook.zip\n",
 269 |     "# takes a few minutes on my computer to load\n",
 270 |     "path = '/mnt/c/Users/matt/Downloads/gss_spss_with_codebook.zip'\n",
 271 |     "with zipfile.ZipFile(path) as z:\n",
 272 |     "    print(z.namelist())\n",
 273 |     "    with open('gss.sav', mode='bw') as fout:\n",
 274 |     "        fout.write(z.open('GSS7218_R3.sav').read())\n",
 275 |     "    gss = pd.read_spss('gss.sav')"
 276 |    ]
 277 |   },
 278 |   {
 279 |    "cell_type": "code",
 280 |    "execution_count": null,
 281 |    "id": "f8587141",
 282 |    "metadata": {},
 283 |    "outputs": [],
 284 |    "source": [
 285 |     "!pip install pyreadstat"
 286 |    ]
 287 |   },
 288 |   {
 289 |    "cell_type": "code",
 290 |    "execution_count": null,
 291 |    "id": "f4b686fc",
 292 |    "metadata": {},
 293 |    "outputs": [],
 294 |    "source": [
 295 |     "%%time\n",
 296 |     "import pyreadstat\n",
 297 |     "gss, meta = pyreadstat.read_sav('gss.sav')"
 298 |    ]
 299 |   },
 300 |   {
 301 |    "cell_type": "code",
 302 |    "execution_count": null,
 303 |    "id": "c73cd05b",
 304 |    "metadata": {},
 305 |    "outputs": [],
 306 |    "source": [
 307 |     "gss.shape"
 308 |    ]
 309 |   },
 310 |   {
 311 |    "cell_type": "code",
 312 |    "execution_count": null,
 313 |    "id": "1e2e1777",
 314 |    "metadata": {},
 315 |    "outputs": [],
 316 |    "source": [
 317 |     "gss.to_feather('gss.fth')"
 318 |    ]
 319 |   },
 320 |   {
 321 |    "cell_type": "code",
 322 |    "execution_count": null,
 323 |    "id": "46ec0b7c",
 324 |    "metadata": {},
 325 |    "outputs": [],
 326 |    "source": [
 327 |     "%%time\n",
 328 |     "raw = pd.read_feather('~/Dropbox/work/jupyter/gss.fth')"
 329 |    ]
 330 |   },
 331 |   {
 332 |    "cell_type": "code",
 333 |    "execution_count": null,
 334 |    "id": "f029dc45",
 335 |    "metadata": {
 336 |     "lines_to_next_cell": 0,
 337 |     "pycharm": {
 338 |      "name": "#%%\n"
 339 |     }
 340 |    },
 341 |    "outputs": [],
 342 |    "source": [
 343 |     "raw"
 344 |    ]
 345 |   },
 346 |   {
 347 |    "cell_type": "code",
 348 |    "execution_count": null,
 349 |    "id": "8cad5ba6",
 350 |    "metadata": {},
 351 |    "outputs": [],
 352 |    "source": [
 353 |     "# 6000 columns!\n",
 354 |     "raw.shape"
 355 |    ]
 356 |   },
 357 |   {
 358 |    "cell_type": "code",
 359 |    "execution_count": null,
 360 |    "id": "d08680b5",
 361 |    "metadata": {
 362 |     "lines_to_next_cell": 0
 363 |    },
 364 |    "outputs": [],
 365 |    "source": [
 366 |     "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n",
 367 |     "        'INCOME06','HONEST','TICKET']\n",
 368 |     "\n",
 369 |     "raw[cols].to_feather('honest.fth')"
 370 |    ]
 371 |   },
 372 |   {
 373 |    "cell_type": "code",
 374 |    "execution_count": null,
 375 |    "id": "506f2f1d",
 376 |    "metadata": {
 377 |     "lines_to_next_cell": 2
 378 |    },
 379 |    "outputs": [],
 380 |    "source": []
 381 |   },
 382 |   {
 383 |    "cell_type": "markdown",
 384 |    "id": "5ab74806",
 385 |    "metadata": {},
 386 |    "source": [
 387 |     "## Loading Data\n",
 388 |     "\n",
 389 |     "This is the data we will be using. Run this code!"
 390 |    ]
 391 |   },
 392 |   {
 393 |    "cell_type": "code",
 394 |    "execution_count": null,
 395 |    "id": "07444860",
 396 |    "metadata": {
 397 |     "lines_to_next_cell": 2,
 398 |     "pycharm": {
 399 |      "name": "#%%\n"
 400 |     }
 401 |    },
 402 |    "outputs": [],
 403 |    "source": [
 404 |     "raw = pd.read_feather('honest.fth', dtype_backend='pyarrow')"
 405 |    ]
 406 |   },
 407 |   {
 408 |    "cell_type": "code",
 409 |    "execution_count": null,
 410 |    "id": "582255a1",
 411 |    "metadata": {
 412 |     "lines_to_next_cell": 2,
 413 |     "pycharm": {
 414 |      "name": "#%%\n"
 415 |     }
 416 |    },
 417 |    "outputs": [],
 418 |    "source": []
 419 |   },
 420 |   {
 421 |    "cell_type": "markdown",
 422 |    "id": "db1c15f3",
 423 |    "metadata": {
 424 |     "pycharm": {
 425 |      "name": "#%% md\n"
 426 |     }
 427 |    },
 428 |    "source": [
 429 |     "## My Cleanup\n",
 430 |     "See GSS_Codebook.pdf for explanation\n",
 431 |     "\n",
 432 |     "Columns:\n",
 433 |     "\n",
 434 |     "* YEAR\n",
 435 |     "* ID - RESPONDENT ID NUMBER\n",
 436 |     "* AGE - AGE OF RESPONDENT\n",
 437 |     "* HRS1 - NUMBER OF HOURS WORKED LAST WEEK\n",
 438 |     "* OCC - R'S CENSUS OCCUPATION CODE (1970) - Page 126 (VAR: OCC) see page 125 for notes APPENDIX F,G,H\n",
 439 |     "   Appendix F - Page 3286\n",
 440 |     "* MAJOR1 - COLLEGE MAJOR 1\n",
 441 |     "* SEX - RESPONDENTS SEX\n",
 442 |     "* RACE - RACE OF RESPONDENT\n",
 443 |     "* BORN -  WAS R BORN IN THIS COUNTRY\n",
 444 |     "* INCOME - TOTAL FAMILY INCOME 1970\n",
 445 |     "* INCOME06 - TOTAL FAMILY INCOME 2006\n",
 446 |     "* HONEST - HONEST\n",
 447 |     "* TICKET - EVER RECEIVED A TRAFFIC TICKET\n"
 448 |    ]
 449 |   },
 450 |   {
 451 |    "cell_type": "code",
 452 |    "execution_count": null,
 453 |    "id": "65089c43",
 454 |    "metadata": {},
 455 |    "outputs": [],
 456 |    "source": [
 457 |     "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n",
 458 |     "        'INCOME06','HONEST','TICKET']\n",
 459 |     "\n",
 460 |     "raw[cols].isna().mean()*100"
 461 |    ]
 462 |   },
 463 |   {
 464 |    "cell_type": "code",
 465 |    "execution_count": null,
 466 |    "id": "67f1d8f4",
 467 |    "metadata": {},
 468 |    "outputs": [],
 469 |    "source": [
 470 |     "(raw\n",
 471 |     " [cols]\n",
 472 |     " .isna()\n",
 473 |     " .mean()*100\n",
 474 |     ")"
 475 |    ]
 476 |   },
 477 |   {
 478 |    "cell_type": "code",
 479 |    "execution_count": null,
 480 |    "id": "df146d91",
 481 |    "metadata": {},
 482 |    "outputs": [],
 483 |    "source": [
 484 |     "MAJOR= '''RESPONSE PUNCH 1972-82 1982B 1983-87 1987B 1988-91 1993-98 2000-04 2006 2008 2010 2012 2014 2016 2018 ALL\n",
 485 |     "Accounting/bookkeeping 1 0 0 0 0 0 0 0 0 0 0 28 32 30 29 119\n",
 486 |     "Advertising 2 0 0 0 0 0 0 0 0 0 0 3 2 0 0 5\n",
 487 |     "Agriculture/horticulture 3 0 0 0 0 0 0 0 0 0 0 8 2 7 5 22\n",
 488 |     "Allied health 4 0 0 0 0 0 0 0 0 0 0 0 2 1 0 3\n",
 489 |     "Anthropology 5 0 0 0 0 0 0 0 0 0 0 3 5 1 1 10\n",
 490 |     "Architecture 6 0 0 0 0 0 0 0 0 0 0 2 3 5 3 13\n",
 491 |     "Art 7 0 0 0 0 0 0 0 0 0 0 6 7 11 10 34\n",
 492 |     "Biology 8 0 0 0 0 0 0 0 0 0 0 16 22 33 26 97\n",
 493 |     "Business administration 9 0 0 0 0 0 0 0 0 0 0 90 142 172 138 542\n",
 494 |     "Chemistry 11 0 0 0 0 0 0 0 0 0 0 5 8 10 4 27\n",
 495 |     "Communications/speech 12 0 0 0 0 0 0 0 0 0 0 20 18 26 18 82\n",
 496 |     "Comm. disorders 13 0 0 0 0 0 0 0 0 0 0 4 6 2 2 14\n",
 497 |     "Computer science 14 0 0 0 0 0 0 0 0 0 0 25 24 33 17 99\n",
 498 |     "Dentistry 15 0 0 0 0 0 0 0 0 0 0 2 4 3 5 14\n",
 499 |     "Education 16 0 0 0 0 0 0 0 0 0 0 73 91 97 79 340\n",
 500 |     "Economics 17 0 0 0 0 0 0 0 0 0 0 11 19 13 19 62\n",
 501 |     "Engineering 18 0 0 0 0 0 0 0 0 0 0 47 49 47 54 197\n",
 502 |     "English 19 0 0 0 0 0 0 0 0 0 0 23 26 27 24 100\n",
 503 |     "Finance 20 0 0 0 0 0 0 0 0 0 0 7 15 14 16 52\n",
 504 |     "Foreign language 21 0 0 0 0 0 0 0 0 0 0 4 8 6 5 23\n",
 505 |     "Forestry 22 0 0 0 0 0 0 0 0 0 0 1 0 3 0 4\n",
 506 |     "Geography 23 0 0 0 0 0 0 0 0 0 0 0 2 2 4 8\n",
 507 |     "Geology 24 0 0 0 0 0 0 0 0 0 0 1 3 4 2 10\n",
 508 |     "History 25 0 0 0 0 0 0 0 0 0 0 10 19 14 19 62\n",
 509 |     "Home economics 26 0 0 0 0 0 0 0 0 0 0 0 0 3 2 5\n",
 510 |     "Industry & techn 27 0 0 0 0 0 0 0 0 0 0 3 4 6 0 13\n",
 511 |     "Journalism 28 0 0 0 0 0 0 0 0 0 0 5 6 6 4 21\n",
 512 |     "Law 29 0 0 0 0 0 0 0 0 0 0 13 18 23 14 68\n",
 513 |     "Law enforcement 30 0 0 0 0 0 0 0 0 0 0 3 5 4 2 14\n",
 514 |     "Library science 31 0 0 0 0 0 0 0 0 0 0 4 5 2 3 14\n",
 515 |     "Marketing 32 0 0 0 0 0 0 0 0 0 0 11 15 13 12 51\n",
 516 |     "Mathematics 33 0 0 0 0 0 0 0 0 0 0 5 10 12 5 32\n",
 517 |     "Medicine 34 0 0 0 0 0 0 0 0 0 0 9 25 12 11 57\n",
 518 |     "Music 35 0 0 0 0 0 0 0 0 0 0 4 2 10 2 18\n",
 519 |     "Nursing 36 0 0 0 0 0 0 0 0 0 0 36 39 60 51 186\n",
 520 |     "Optometry 37 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
 521 |     "Pharmacy 38 0 0 0 0 0 0 0 0 0 0 2 5 4 4 15\n",
 522 |     "Philosophy 39 0 0 0 0 0 0 0 0 0 0 2 0 2 2 6\n",
 523 |     "Physical education 40 0 0 0 0 0 0 0 0 0 0 9 6 16 6 37\n",
 524 |     "Physics 41 0 0 0 0 0 0 0 0 0 0 3 6 7 4 20\n",
 525 |     "Psychology 42 0 0 0 0 0 0 0 0 0 0 32 32 34 29 127\n",
 526 |     "Political science/international relations 43 0 0 0 0 0 0 0 0 0 0 16 22 19 14 71\n",
 527 |     "Sociology 44 0 0 0 0 0 0 0 0 0 0 9 15 10 12 46\n",
 528 |     "Special education 45 0 0 0 0 0 0 0 0 0 0 5 3 5 2 15\n",
 529 |     "Theater arts 46 0 0 0 0 0 0 0 0 0 0 6 2 3 1 12\n",
 530 |     "Theology 47 0 0 0 0 0 0 0 0 0 0 6 6 13 8 33\n",
 531 |     "Veterinary medicine 48 0 0 0 0 0 0 0 0 0 0 1 5 3 4 13\n",
 532 |     "Liberal arts 49 0 0 0 0 0 0 0 0 0 0 8 16 12 10 46\n",
 533 |     "Other 50 0 0 0 0 0 0 0 0 0 0 8 10 21 27 66\n",
 534 |     "General sciences 51 0 0 0 0 0 0 0 0 0 0 10 13 15 14 52\n",
 535 |     "Social work 52 0 0 0 0 0 0 0 0 0 0 7 17 24 7 55\n",
 536 |     "General studies 53 0 0 0 0 0 0 0 0 0 0 2 5 7 7 21\n",
 537 |     "Other vocational 54 0 0 0 0 0 0 0 0 0 0 5 11 6 5 27\n",
 538 |     "Health 55 0 0 0 0 0 0 0 0 0 0 23 31 31 42 127\n",
 539 |     "Industrial Relations 56 0 0 0 0 0 0 0 0 0 0 1 0 0 3 4\n",
 540 |     "Child/Human/Family Development 57 0 0 0 0 0 0 0 0 0 0 11 3 7 7 28\n",
 541 |     "Food Science/Nutrition/Culinary Arts 58 0 0 0 0 0 0 0 0 0 0 3 6 9 9 27\n",
 542 |     "Environmental Science/Ecology 59 0 0 0 0 0 0 0 0 0 0 5 5 6 8 24\n",
 543 |     "Social Sciences 60 0 0 0 0 0 0 0 0 0 0 4 2 7 5 18\n",
 544 |     "Human Services/Human Resources 61 0 0 0 0 0 0 0 0 0 0 3 7 7 5 22\n",
 545 |     "Visual Arts/Graphic Design/Design and Drafting 62 0 0 0 0 0 0 0 0 0 0 3 8 9 10 30\n",
 546 |     "Fine Arts 63 0 0 0 0 0 0 0 0 0 0 4 5 5 6 20\n",
 547 |     "Humanities 64 0 0 0 0 0 0 0 0 0 0 0 2 0 1 3\n",
 548 |     "Ethnic studies 65 0 0 0 0 0 0 0 0 0 0 3 1 0 0 4\n",
 549 |     "Educational administration 66 0 0 0 0 0 0 0 0 0 0 3 4 8 9 24\n",
 550 |     "Television/Film 67 0 0 0 0 0 0 0 0 0 0 0 2 6 1 9\n",
 551 |     "Aviation/Aeronatics 68 0 0 0 0 0 0 0 0 0 0 2 1 1 3 7\n",
 552 |     "Statistics/Biostatistics 69 0 0 0 0 0 0 0 0 0 0 0 0 2 2 4\n",
 553 |     "Criminology/Criminal Justice 70 0 0 0 0 0 0 0 0 0 0 13 17 17 13 60\n",
 554 |     "Administrative Science/Public Administration 71 0 0 0 0 0 0 0 0 0 0 2 11 3 5 21\n",
 555 |     "Electronics 72 0 0 0 0 0 0 0 0 0 0 6 6 5 9 26\n",
 556 |     "Urban and Regional Planning 73 0 0 0 0 0 0 0 0 0 0 1 1 3 2 7\n",
 557 |     "Mechanics/Machine Trade 74 0 0 0 0 0 0 0 0 0 0 0 1 1 4 6\n",
 558 |     "Dance 75 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n",
 559 |     "Gerontology 76 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n",
 560 |     "Public Relations 77 0 0 0 0 0 0 0 0 0 0 3 1 2 1 7\n",
 561 |     "Textiles/Cloth 78 0 0 0 0 0 0 0 0 0 0 3 4 0 0 7\n",
 562 |     "Parks and Recreation 79 0 0 0 0 0 0 0 0 0 0 1 2 1 0 4\n",
 563 |     "Information Technology 80 0 0 0 0 0 0 0 0 0 0 0 5 8 11 24\n",
 564 |     "Fashion 81 0 0 0 0 0 0 0 0 0 0 0 0 3 1 4\n",
 565 |     "Counseling 82 0 0 0 0 0 0 0 0 0 0 0 0 11 9 20\n",
 566 |     "Don't know/UNCODED 98 0 0 0 0 0 0 0 0 0 0 2 3 0 0 5\n",
 567 |     "No answer 99 0 0 0 0 0 0 0 0 0 0 0 1 5 3 9\n",
 568 |     "Not applicable 0 13626 354 7542 353 5907 10334 8394 4510 2023 2044 1263 1597 1795 1435 61177'''\n",
 569 |     "\n",
 570 |     "# copy paste slight tweak from page 186\n",
 571 |     "major_dict = {int(row.split()[-16]): ' '.join(row.split()[:-16])  for row in MAJOR.split('\\n')[1:]}\n",
 572 |     "major_dict"
 573 |    ]
 574 |   },
 575 |   {
 576 |    "cell_type": "code",
 577 |    "execution_count": null,
 578 |    "id": "b9d6c34d",
 579 |    "metadata": {},
 580 |    "outputs": [],
 581 |    "source": [
 582 |     "raw.MAJOR1.value_counts()"
 583 |    ]
 584 |   },
 585 |   {
 586 |    "cell_type": "code",
 587 |    "execution_count": null,
 588 |    "id": "74652b6d",
 589 |    "metadata": {},
 590 |    "outputs": [],
 591 |    "source": [
 592 |     "(raw\n",
 593 |     " [cols]\n",
 594 |     " .assign(\n",
 595 |     "     MAJOR1=raw.MAJOR1.fillna(99).astype('int').replace(major_dict),\n",
 596 |     "     SEX=raw.SEX#\n",
 597 |     "           \n",
 598 |     "           .astype(int)\n",
 599 |     "           .replace({1:'Male', 2:'Female'}),\n",
 600 |     "     RACE=raw.RACE.astype(int).replace({1:'White', 2:'Black', 3:'Other'}),\n",
 601 |     "     OCC=raw.OCC.fillna(9999).astype(int),\n",
 602 |     "     BORN=raw.BORN.fillna(4).astype(int).replace({1:'Yes', 2:'No', 3:'Don\\'t Know',\n",
 603 |     "                                                    4:'No answer', 5:'Not applicable'}),\n",
 604 |     "     INCOME=raw.INCOME.fillna(99).astype(int).replace({99:'No answer', **dict(enumerate(['Not applicable',\n",
 605 |     "                                                                                  0,1000,3000,4000,5000,6000,\n",
 606 |     "                                                                                  7000,8000,10000,15000,20000,25000,]))}),\n",
 607 |     "     INCOME06=raw.INCOME06.fillna(26).astype(int).replace({26:'Refused', **dict(enumerate(['Not applicable',\n",
 608 |     "                                                                                  0,1000,3000,4000,5000,6000,\n",
 609 |     "                                                                                  7000,8000,10000,12500,15000,\n",
 610 |     "                                                                                  17500,20000,22500,25000,30_000,\n",
 611 |     "                                                                                  35_000, 40_000, 50_000, 60_000,\n",
 612 |     "                                                                                 75_000, 90_000, 110_000, 130_000,\n",
 613 |     "                                                                                 150_000]))}),\n",
 614 |     "     HONEST=raw.HONEST.fillna(9).astype(int).replace({1:'Most desirable', 2:'3 most desireable',\n",
 615 |     "                                                                   3:'Not mentioned', 4:  '3 least desireable',\n",
 616 |     "                                                                   5: 'One least desireable',\n",
 617 |     "                                                                    9:'No answer'}),\n",
 618 |     "     TICKET=raw.TICKET.fillna(9).astype(int).replace({1:'Yes', 2:'No', 3:'Refused', 9: 'No answer'}),\n",
 619 |     "     )\n",
 620 |     " .astype({'YEAR':int, 'ID': 'uint16[pyarrow]'})\n",
 621 |     " .to_csv('GSS.csv')\n",
 622 |     ")"
 623 |    ]
 624 |   },
 625 |   {
 626 |    "cell_type": "code",
 627 |    "execution_count": null,
 628 |    "id": "a14afd45",
 629 |    "metadata": {},
 630 |    "outputs": [],
 631 |    "source": []
 632 |   },
 633 |   {
 634 |    "cell_type": "code",
 635 |    "execution_count": null,
 636 |    "id": "ce8f0020",
 637 |    "metadata": {},
 638 |    "outputs": [],
 639 |    "source": []
 640 |   },
 641 |   {
 642 |    "cell_type": "markdown",
 643 |    "id": "043a0085",
 644 |    "metadata": {},
 645 |    "source": [
 646 |     "## Types\n",
 647 |     "Getting the right types will enable analysis and correctness.\n"
 648 |    ]
 649 |   },
 650 |   {
 651 |    "cell_type": "code",
 652 |    "execution_count": null,
 653 |    "id": "5d6c6cd5",
 654 |    "metadata": {},
 655 |    "outputs": [],
 656 |    "source": [
 657 |     "%%time\n",
 658 |     "gss = pd.read_csv('GSS.csv', index_col=0, dtype_backend='pyarrow', engine='pyarrow')"
 659 |    ]
 660 |   },
 661 |   {
 662 |    "cell_type": "code",
 663 |    "execution_count": null,
 664 |    "id": "d26b8f6c",
 665 |    "metadata": {
 666 |     "pycharm": {
 667 |      "name": "#%%\n"
 668 |     }
 669 |    },
 670 |    "outputs": [],
 671 |    "source": [
 672 |     "gss.dtypes"
 673 |    ]
 674 |   },
 675 |   {
 676 |    "cell_type": "code",
 677 |    "execution_count": null,
 678 |    "id": "19890585",
 679 |    "metadata": {},
 680 |    "outputs": [],
 681 |    "source": [
 682 |     "gss"
 683 |    ]
 684 |   },
 685 |   {
 686 |    "cell_type": "code",
 687 |    "execution_count": null,
 688 |    "id": "852115fe",
 689 |    "metadata": {
 690 |     "pycharm": {
 691 |      "name": "#%%\n"
 692 |     }
 693 |    },
 694 |    "outputs": [],
 695 |    "source": [
 696 |     "gss.memory_usage(deep=True)"
 697 |    ]
 698 |   },
 699 |   {
 700 |    "cell_type": "code",
 701 |    "execution_count": null,
 702 |    "id": "b5cfc13b",
 703 |    "metadata": {
 704 |     "pycharm": {
 705 |      "name": "#%%\n"
 706 |     }
 707 |    },
 708 |    "outputs": [],
 709 |    "source": [
 710 |     "# 36 M (pandas 1)\n",
 711 |     "# 8.6 M (Pandas 2)\n",
 712 |     "gss.memory_usage(deep=True).sum()"
 713 |    ]
 714 |   },
 715 |   {
 716 |    "cell_type": "code",
 717 |    "execution_count": null,
 718 |    "id": "f1d1b51d",
 719 |    "metadata": {
 720 |     "lines_to_next_cell": 2,
 721 |     "pycharm": {
 722 |      "name": "#%%\n"
 723 |     }
 724 |    },
 725 |    "outputs": [],
 726 |    "source": []
 727 |   },
 728 |   {
 729 |    "cell_type": "markdown",
 730 |    "id": "5fcab8c0",
 731 |    "metadata": {
 732 |     "pycharm": {
 733 |      "name": "#%% md\n"
 734 |     }
 735 |    },
 736 |    "source": [
 737 |     "## Ints"
 738 |    ]
 739 |   },
 740 |   {
 741 |    "cell_type": "code",
 742 |    "execution_count": null,
 743 |    "id": "ad4eddc7",
 744 |    "metadata": {
 745 |     "pycharm": {
 746 |      "name": "#%%\n"
 747 |     }
 748 |    },
 749 |    "outputs": [],
 750 |    "source": [
 751 |     "gss.select_dtypes(int).describe()"
 752 |    ]
 753 |   },
 754 |   {
 755 |    "cell_type": "code",
 756 |    "execution_count": null,
 757 |    "id": "ac323e5e",
 758 |    "metadata": {
 759 |     "pycharm": {
 760 |      "name": "#%%\n"
 761 |     }
 762 |    },
 763 |    "outputs": [],
 764 |    "source": [
 765 |     "# chaining\n",
 766 |     "(gss\n",
 767 |     " .select_dtypes(int)\n",
 768 |     " .describe()\n",
 769 |     ")"
 770 |    ]
 771 |   },
 772 |   {
 773 |    "cell_type": "code",
 774 |    "execution_count": null,
 775 |    "id": "2351d051",
 776 |    "metadata": {
 777 |     "pycharm": {
 778 |      "name": "#%%\n"
 779 |     }
 780 |    },
 781 |    "outputs": [],
 782 |    "source": [
 783 |     "# can YEAR be an int8?\n",
 784 |     "# Do completion on np.int to see the sized types (bare np.int was removed in NumPy 1.24)\n",
 785 |     "np.iinfo(np.int8)"
 786 |    ]
 787 |   },
 788 |   {
 789 |    "cell_type": "code",
 790 |    "execution_count": null,
 791 |    "id": "323df8fb",
 792 |    "metadata": {
 793 |     "pycharm": {
 794 |      "name": "#%%\n"
 795 |     }
 796 |    },
 797 |    "outputs": [],
 798 |    "source": [
 799 |     "np.iinfo(np.uint8)"
 800 |    ]
 801 |   },
 802 |   {
 803 |    "cell_type": "code",
 804 |    "execution_count": null,
 805 |    "id": "bb063be4",
 806 |    "metadata": {
 807 |     "pycharm": {
 808 |      "name": "#%%\n"
 809 |     }
 810 |    },
 811 |    "outputs": [],
 812 |    "source": [
 813 |     "np.iinfo(np.uint16)"
 814 |    ]
 815 |   },
 816 |   {
 817 |    "cell_type": "code",
 818 |    "execution_count": null,
 819 |    "id": "d0fab927",
 820 |    "metadata": {
 821 |     "pycharm": {
 822 |      "name": "#%%\n"
 823 |     }
 824 |    },
 825 |    "outputs": [],
 826 |    "source": [
 827 |     "# chaining\n",
 828 |     "(gss\n",
 829 |     " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' })\n",
 830 |     " .select_dtypes(['uint16'])\n",
 831 |     " .describe()\n",
 832 |     ")"
 833 |    ]
 834 |   },
 835 |   {
 836 |    "cell_type": "code",
 837 |    "execution_count": null,
 838 |    "id": "a1d0ed15",
 839 |    "metadata": {
 840 |     "lines_to_next_cell": 2,
 841 |     "pycharm": {
 842 |      "name": "#%%\n"
 843 |     }
 844 |    },
 845 |    "outputs": [],
 846 |    "source": [
 847 |     "# chaining\n",
 848 |     "# use 'integer' to see all int-like columns\n",
 849 |     "(gss\n",
 850 |     " .astype({#'YEAR': 'uint16[pyarrow]',\n",
 851 |     "          'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n",
 852 |     " .select_dtypes(['integer'])  # see https://numpy.org/doc/stable/reference/arrays.scalars.html\n",
 853 |     " .describe()\n",
 854 |     ")"
 855 |    ]
 856 |   },
 857 |   {
 858 |    "cell_type": "code",
 859 |    "execution_count": null,
 860 |    "id": "c5d4c3e1",
 861 |    "metadata": {
 862 |     "lines_to_next_cell": 2,
 863 |     "pycharm": {
 864 |      "name": "#%%\n"
 865 |     }
 866 |    },
 867 |    "outputs": [],
 868 |    "source": [
 869 |     "# Inspect memory usage\n",
 870 |     "(gss\n",
 871 |     " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n",
 872 |     " .memory_usage(deep=True)\n",
 873 |     " .sum()  # was 36M\n",
 874 |     ")"
 875 |    ]
 876 |   },
 877 |   {
 878 |    "cell_type": "code",
 879 |    "execution_count": null,
 880 |    "id": "8ad6e733",
 881 |    "metadata": {
 882 |     "lines_to_next_cell": 2,
 883 |     "pycharm": {
 884 |      "name": "#%%\n"
 885 |     }
 886 |    },
 887 |    "outputs": [],
 888 |    "source": []
 889 |   },
 890 |   {
 891 |    "cell_type": "markdown",
 892 |    "id": "f339194e",
 893 |    "metadata": {},
 894 |    "source": [
 895 |     "## Int Exercise\n",
 896 |     "* Try converting *YEAR* to `'int8'`. What do the values look like?\n",
 897 |     "* Try converting *YEAR* to `'int8[pyarrow]'`. What do the values look like?"
 898 |    ]
 899 |   },
 900 |   {
 901 |    "cell_type": "code",
 902 |    "execution_count": null,
 903 |    "id": "908545d1",
 904 |    "metadata": {},
 905 |    "outputs": [],
 906 |    "source": []
 907 |   },
 908 |   {
 909 |    "cell_type": "code",
 910 |    "execution_count": null,
 911 |    "id": "18a3bf52",
 912 |    "metadata": {},
 913 |    "outputs": [],
 914 |    "source": []
 915 |   },
 916 |   {
 917 |    "cell_type": "markdown",
 918 |    "id": "b09f89c6",
 919 |    "metadata": {},
 920 |    "source": [
 921 |     "## Floats"
 922 |    ]
 923 |   },
 924 |   {
 925 |    "cell_type": "code",
 926 |    "execution_count": null,
 927 |    "id": "e7fed87e",
 928 |    "metadata": {
 929 |     "pycharm": {
 930 |      "name": "#%%\n"
 931 |     }
 932 |    },
 933 |    "outputs": [],
 934 |    "source": [
 935 |     "(gss\n",
 936 |     ".select_dtypes('float'))"
 937 |    ]
 938 |   },
 939 |   {
 940 |    "cell_type": "code",
 941 |    "execution_count": null,
 942 |    "id": "49265726",
 943 |    "metadata": {
 944 |     "pycharm": {
 945 |      "name": "#%%\n"
 946 |     }
 947 |    },
 948 |    "outputs": [],
 949 |    "source": [
 950 |     "# surprise! age and hours worked look int-like\n",
 951 |     "gss.HRS1.describe()"
 952 |    ]
 953 |   },
 954 |   {
 955 |    "cell_type": "code",
 956 |    "execution_count": null,
 957 |    "id": "cd39df3c",
 958 |    "metadata": {
 959 |     "pycharm": {
 960 |      "name": "#%%\n"
 961 |     }
 962 |    },
 963 |    "outputs": [],
 964 |    "source": [
 965 |     "# oops! missing values\n",
 966 |     "gss.HRS1.value_counts(dropna=False)"
 967 |    ]
 968 |   },
 969 |   {
 970 |    "cell_type": "code",
 971 |    "execution_count": null,
 972 |    "id": "31a67da2",
 973 |    "metadata": {
 974 |     "pycharm": {
 975 |      "name": "#%%\n"
 976 |     }
 977 |    },
 978 |    "outputs": [],
 979 |    "source": [
 980 |     "# where are they missing?\n",
 981 |     "(gss\n",
 982 |     "  .query('HRS1.isna()')\n",
 983 |     ")"
 984 |    ]
 985 |   },
 986 |   {
 987 |    "cell_type": "code",
 988 |    "execution_count": null,
 989 |    "id": "e697d070",
 990 |    "metadata": {
 991 |     "pycharm": {
 992 |      "name": "#%%\n"
 993 |     }
 994 |    },
 995 |    "outputs": [],
 996 |    "source": [
 997 |     "# where are they missing?\n",
 998 |     "(gss\n",
 999 |     "  .query('AGE.isna()')\n",
1000 |     ")"
1001 |    ]
1002 |   },
1003 |   {
1004 |    "cell_type": "code",
1005 |    "execution_count": null,
1006 |    "id": "a9166e1a",
1007 |    "metadata": {
1008 |     "pycharm": {
1009 |      "name": "#%%\n"
1010 |     }
1011 |    },
1012 |    "outputs": [],
1013 |    "source": [
1014 |     "# where are they missing?\n",
1015 |     "# It turns out that ID is not consistent across years\n",
1016 |     "(gss\n",
1017 |     "  .query('ID == 229')\n",
1018 |     ")"
1019 |    ]
1020 |   },
1021 |   {
1022 |    "cell_type": "code",
1023 |    "execution_count": null,
1024 |    "id": "81a8c902",
1025 |    "metadata": {
1026 |     "lines_to_next_cell": 2,
1027 |     "pycharm": {
1028 |      "name": "#%%\n"
1029 |     }
1030 |    },
1031 |    "outputs": [],
1032 |    "source": [
1033 |     "# Convert to integers\n",
1034 |     "(gss\n",
1035 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1036 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n",
1037 |     ")"
1038 |    ]
1039 |   },
1040 |   {
1041 |    "cell_type": "code",
1042 |    "execution_count": null,
1043 |    "id": "d3388e2c",
1044 |    "metadata": {
1045 |     "lines_to_next_cell": 2,
1046 |     "pycharm": {
1047 |      "name": "#%%\n"
1048 |     }
1049 |    },
1050 |    "outputs": [],
1051 |    "source": [
1052 |     "(gss\n",
1053 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1054 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n",
1055 |     " .memory_usage(deep=True)\n",
1056 |     " .sum()  # was 36M  \n",
1057 |     ")"
1058 |    ]
1059 |   },
1060 |   {
1061 |    "cell_type": "code",
1062 |    "execution_count": null,
1063 |    "id": "9bb70ac2",
1064 |    "metadata": {
1065 |     "lines_to_next_cell": 2,
1066 |     "pycharm": {
1067 |      "name": "#%%\n"
1068 |     }
1069 |    },
1070 |    "outputs": [],
1071 |    "source": []
1072 |   },
1073 |   {
1074 |    "cell_type": "markdown",
1075 |    "id": "75bfd716",
1076 |    "metadata": {},
1077 |    "source": [
1078 |     "## Float Exercise\n",
1079 |     "\n",
1080 |     "* What is the mean of the numeric columns?\n",
1081 |     "* How many values are missing in the numeric columns?"
1082 |    ]
1083 |   },
1084 |   {
1085 |    "cell_type": "code",
1086 |    "execution_count": null,
1087 |    "id": "e3e30c4e",
1088 |    "metadata": {
1089 |     "lines_to_next_cell": 2
1090 |    },
1091 |    "outputs": [],
1092 |    "source": []
1093 |   },
1094 |   {
1095 |    "cell_type": "markdown",
1096 |    "id": "a136fe09",
1097 |    "metadata": {},
1098 |    "source": [
1099 |     "## Objects"
1100 |    ]
1101 |   },
1102 |   {
1103 |    "cell_type": "code",
1104 |    "execution_count": null,
1105 |    "id": "4f8b0477",
1106 |    "metadata": {
1107 |     "pycharm": {
1108 |      "name": "#%%\n"
1109 |     }
1110 |    },
1111 |    "outputs": [],
1112 |    "source": [
1113 |     "# pandas 1.x\n",
1114 |     "(gss\n",
1115 |     " .select_dtypes(object)\n",
1116 |     ")"
1117 |    ]
1118 |   },
1119 |   {
1120 |    "cell_type": "code",
1121 |    "execution_count": null,
1122 |    "id": "f80da8d2",
1123 |    "metadata": {
1124 |     "pycharm": {
1125 |      "name": "#%%\n"
1126 |     }
1127 |    },
1128 |    "outputs": [],
1129 |    "source": [
1130 |     "# pandas 2\n",
1131 |     "(gss\n",
1132 |     " .select_dtypes('string') # str doesn't work\n",
1133 |     ")"
1134 |    ]
1135 |   },
1136 |   {
1137 |    "cell_type": "code",
1138 |    "execution_count": null,
1139 |    "id": "7f762143",
1140 |    "metadata": {
1141 |     "pycharm": {
1142 |      "name": "#%%\n"
1143 |     }
1144 |    },
1145 |    "outputs": [],
1146 |    "source": [
1147 |     "# My go-to method - .value_counts\n",
1148 |     "# looks categorical\n",
1149 |     "(gss.MAJOR1.value_counts(dropna=False))"
1150 |    ]
1151 |   },
1152 |   {
1153 |    "cell_type": "code",
1154 |    "execution_count": null,
1155 |    "id": "55c21c7a",
1156 |    "metadata": {
1157 |     "lines_to_next_cell": 2,
1158 |     "pycharm": {
1159 |      "name": "#%%\n"
1160 |     }
1161 |    },
1162 |    "outputs": [],
1163 |    "source": [
1164 |     "(gss\n",
1165 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1166 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1167 |     "         'MAJOR1': 'category'})\n",
1168 |     " .memory_usage(deep=True)\n",
1169 |     " .sum()  # was 36M  \n",
1170 |     ")"
1171 |    ]
1172 |   },
1173 |   {
1174 |    "cell_type": "code",
1175 |    "execution_count": null,
1176 |    "id": "69969c1b",
1177 |    "metadata": {},
1178 |    "outputs": [],
1179 |    "source": [
1180 |     "(gss\n",
1181 |     " .select_dtypes(object)\n",
1182 |     " .columns\n",
1183 |     ")"
1184 |    ]
1185 |   },
1186 |   {
1187 |    "cell_type": "code",
1188 |    "execution_count": null,
1189 |    "id": "f5d51601",
1190 |    "metadata": {
1191 |     "lines_to_next_cell": 0,
1192 |     "pycharm": {
1193 |      "name": "#%%\n"
1194 |     }
1195 |    },
1196 |    "outputs": [],
1197 |    "source": [
1198 |     "# wow!\n",
1199 |     "(gss\n",
1200 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1201 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1202 |     "         'MAJOR1': 'category',\n",
1203 |     "          **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1204 |     "                'INCOME', 'INCOME06', 'HONEST','TICKET']}})           \n",
1205 |     " .memory_usage(deep=True)\n",
1206 |     " .sum()  # was 36M  \n",
1207 |     ")"
1208 |    ]
1209 |   },
1210 |   {
1211 |    "cell_type": "code",
1212 |    "execution_count": null,
1213 |    "id": "17206364",
1214 |    "metadata": {},
1215 |    "outputs": [],
1216 |    "source": []
1217 |   },
1218 |   {
1219 |    "cell_type": "code",
1220 |    "execution_count": null,
1221 |    "id": "50bf3fa6",
1222 |    "metadata": {
1223 |     "lines_to_next_cell": 2
1224 |    },
1225 |    "outputs": [],
1226 |    "source": []
1227 |   },
1228 |   {
1229 |    "cell_type": "markdown",
1230 |    "id": "246041ae",
1231 |    "metadata": {},
1232 |    "source": [
1233 |     "## Category Exercises\n",
1234 |     "* There is a `.cat` attribute on the category columns. What can you do with this attribute? (Use `dir` or tab completion to inspect).\n",
1235 |     "* Categories can be ordered. How do you order *INCOME*?\n",
1236 |     "* Order the *HONEST* column."
1237 |    ]
1238 |   },
1239 |   {
1240 |    "cell_type": "code",
1241 |    "execution_count": null,
1242 |    "id": "f543c52c",
1243 |    "metadata": {},
1244 |    "outputs": [],
1245 |    "source": []
1246 |   },
1247 |   {
1248 |    "cell_type": "code",
1249 |    "execution_count": null,
1250 |    "id": "338e5ba3",
1251 |    "metadata": {},
1252 |    "outputs": [],
1253 |    "source": []
1254 |   },
1255 |   {
1256 |    "cell_type": "code",
1257 |    "execution_count": null,
1258 |    "id": "3fb313d6",
1259 |    "metadata": {},
1260 |    "outputs": [],
1261 |    "source": []
1262 |   },
1263 |   {
1264 |    "cell_type": "code",
1265 |    "execution_count": null,
1266 |    "id": "f1d75a84",
1267 |    "metadata": {},
1268 |    "outputs": [],
1269 |    "source": []
1270 |   },
1271 |   {
1272 |    "cell_type": "code",
1273 |    "execution_count": null,
1274 |    "id": "5c87e18d",
1275 |    "metadata": {},
1276 |    "outputs": [],
1277 |    "source": []
1278 |   },
1279 |   {
1280 |    "cell_type": "code",
1281 |    "execution_count": null,
1282 |    "id": "85aaccbb",
1283 |    "metadata": {},
1284 |    "outputs": [],
1285 |    "source": []
1286 |   },
1287 |   {
1288 |    "cell_type": "code",
1289 |    "execution_count": null,
1290 |    "id": "8a321513",
1291 |    "metadata": {},
1292 |    "outputs": [],
1293 |    "source": []
1294 |   },
1295 |   {
1296 |    "cell_type": "markdown",
1297 |    "id": "8af7a3d4",
1298 |    "metadata": {},
1299 |    "source": [
1300 |     "## Make a Function"
1301 |    ]
1302 |   },
1303 |   {
1304 |    "cell_type": "code",
1305 |    "execution_count": null,
1306 |    "id": "cb9a32b3",
1307 |    "metadata": {
1308 |     "lines_to_next_cell": 2,
1309 |     "pycharm": {
1310 |      "name": "#%%\n"
1311 |     }
1312 |    },
1313 |    "outputs": [],
1314 |    "source": [
1315 |     "# a glorious function\n",
1316 |     "# add ordered categories to this\n",
1317 |     "def tweak_gss(gss):\n",
1318 |     "    return (gss\n",
1319 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1320 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1321 |     "             'MAJOR1': 'category',\n",
1322 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1323 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1324 |     "               )\n",
1325 |     "\n",
1326 |     "tweak_gss(gss)"
1327 |    ]
1328 |   },
1329 |   {
1330 |    "cell_type": "markdown",
1331 |    "id": "1c615739",
1332 |    "metadata": {},
1333 |    "source": [
1334 |     "## Function Exercise\n",
1335 |     "* Rearrange your notebook. Put the imports, code to load raw data, and tweak function at the top of the notebook. Restart the kernel and validate that your code works."
1336 |    ]
1337 |   },
1338 |   {
1339 |    "cell_type": "code",
1340 |    "execution_count": null,
1341 |    "id": "c61b9f0a",
1342 |    "metadata": {},
1343 |    "outputs": [],
1344 |    "source": []
1345 |   },
1346 |   {
1347 |    "cell_type": "code",
1348 |    "execution_count": null,
1349 |    "id": "6589902c",
1350 |    "metadata": {},
1351 |    "outputs": [],
1352 |    "source": []
1353 |   },
1354 |   {
1355 |    "cell_type": "markdown",
1356 |    "id": "b350e12e",
1357 |    "metadata": {
1358 |     "lines_to_next_cell": 2
1359 |    },
1360 |    "source": [
1361 |     "## Fix Column Names"
1362 |    ]
1363 |   },
1364 |   {
1365 |    "cell_type": "code",
1366 |    "execution_count": null,
1367 |    "id": "99b39238",
1368 |    "metadata": {
1369 |     "lines_to_next_cell": 0,
1370 |     "pycharm": {
1371 |      "name": "#%%\n"
1372 |     }
1373 |    },
1374 |    "outputs": [],
1375 |    "source": [
1376 |     "# a glorious function\n",
1377 |     "def tweak_gss(gss):\n",
1378 |     "    return (gss\n",
1379 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1380 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1381 |     "             'MAJOR1': 'category',\n",
1382 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1383 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1384 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1385 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1386 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1387 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1388 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1389 |     "          'HONEST':'honesty_rank',\n",
1390 |     "          'TICKET':'traffic_ticket'})\n",
1391 |     "    )\n",
1392 |     "\n",
1393 |     "tweak_gss(gss)"
1394 |    ]
1395 |   },
1396 |   {
1397 |    "cell_type": "code",
1398 |    "execution_count": null,
1399 |    "id": "84ecc0de",
1400 |    "metadata": {
1401 |     "lines_to_next_cell": 2,
1402 |     "pycharm": {
1403 |      "name": "#%%\n"
1404 |     }
1405 |    },
1406 |    "outputs": [],
1407 |    "source": []
1408 |   },
1409 |   {
1410 |    "cell_type": "code",
1411 |    "execution_count": null,
1412 |    "id": "bf14ec3f",
1413 |    "metadata": {
1414 |     "lines_to_next_cell": 2,
1415 |     "pycharm": {
1416 |      "name": "#%%\n"
1417 |     }
1418 |    },
1419 |    "outputs": [],
1420 |    "source": []
1421 |   },
1422 |   {
1423 |    "cell_type": "markdown",
1424 |    "id": "003b96b9",
1425 |    "metadata": {
1426 |     "pycharm": {
1427 |      "name": "#%% md\n"
1428 |     }
1429 |    },
1430 |    "source": [
1431 |     "## Chain\n",
1432 |     "\n",
1433 |     "Chaining is also called \"flow\" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.\n",
1434 |     "\n",
1435 |     "The chain should read like a recipe of ordered steps.\n",
1436 |     "\n",
1437 |     "(BTW, this is actually what we did above.)\n",
1438 |     "\n",
1439 |     "<div class='alert alert-warning'>\n",
1440 |     "    Hint: Leverage <tt>.pipe</tt> if you can't find a way to chain 😉🐼💪\n",
1441 |     "</div>\n",
1442 |     "    \n",
1443 |     "\n",
1444 |     "\n"
1445 |    ]
1446 |   },
1447 |   {
1448 |    "cell_type": "code",
1449 |    "execution_count": null,
1450 |    "id": "a74cd1a9",
1451 |    "metadata": {
1452 |     "lines_to_next_cell": 0,
1453 |     "pycharm": {
1454 |      "name": "#%%\n"
1455 |     }
1456 |    },
1457 |    "outputs": [],
1458 |    "source": [
1459 |     "# a glorious function\n",
1460 |     "def tweak_gss(gss):\n",
1461 |     "    return (gss\n",
1462 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1463 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1464 |     "             'MAJOR1': 'category',\n",
1465 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1466 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1467 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1468 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1469 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1470 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1471 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1472 |     "          'HONEST':'honesty_rank',\n",
1473 |     "          'TICKET':'traffic_ticket'})\n",
1474 |     "    )\n",
1475 |     "\n",
1476 |     "tweak_gss(gss)"
1477 |    ]
1478 |   },
1479 |   {
1480 |    "cell_type": "code",
1481 |    "execution_count": null,
1482 |    "id": "efc594da",
1483 |    "metadata": {
1484 |     "pycharm": {
1485 |      "name": "#%%\n"
1486 |     }
1487 |    },
1488 |    "outputs": [],
1489 |    "source": [
1490 |     "# compare chain to this mess\n",
1491 |     "gss2 = gss.copy()\n",
1492 |     "year = gss.YEAR\n",
1493 |     "year_int = year.astype('uint16')\n",
1494 |     "gss2['year'] = year_int\n",
1495 |     "id = gss.ID\n",
1496 |     "id_int = id.astype('uint16')\n",
1497 |     "gss2['year_id'] = id_int\n",
1498 |     "occ = gss.OCC\n",
1499 |     "occ_int = occ.astype('uint16')\n",
1500 |     "gss2['occupation'] = occ_int\n",
1501 |     "\n",
1502 |     "# more of this"
1503 |    ]
1504 |   },
1505 |   {
1506 |    "cell_type": "code",
1507 |    "execution_count": null,
1508 |    "id": "32411eaf",
1509 |    "metadata": {
1510 |     "lines_to_next_cell": 0,
1511 |     "pycharm": {
1512 |      "name": "#%%\n"
1513 |     }
1514 |    },
1515 |    "outputs": [],
1516 |    "source": [
1517 |     "# easy to debug\n",
1518 |     "#  - assign to var (df3)\n",
1519 |     "#  - comment out\n",
1520 |     "#  - pipe to display\n",
1521 |     "\n",
1522 |     "\n",
1523 |     "from IPython.display import display\n",
1524 |     "\n",
1525 |     "def get_var(df, var_name):\n",
1526 |     "    globals()[var_name] = df\n",
1527 |     "    return df\n",
1528 |     "\n",
1529 |     "def tweak_gss(gss):\n",
1530 |     "    return (gss\n",
1531 |     "      .pipe(get_var, 'df3')   \n",
1532 |     "     .pipe(lambda df: print(df.shape) or df)                \n",
1533 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1534 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1535 |     "             'MAJOR1': 'category',\n",
1536 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1537 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1538 |     "     .pipe(lambda df: print(df.shape) or df)                            \n",
1539 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1540 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1541 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1542 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1543 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1544 |     "          'HONEST':'honesty_rank',\n",
1545 |     "          'TICKET':'traffic_ticket'})\n",
1546 |     "     .pipe(lambda df: print(df.shape) or df)                            \n",
1547 |     "    )\n",
1548 |     "\n",
1549 |     "tweak_gss(gss)"
1550 |    ]
1551 |   },
1552 |   {
1553 |    "cell_type": "code",
1554 |    "execution_count": null,
1555 |    "id": "fdc2894e",
1556 |    "metadata": {
1557 |     "pycharm": {
1558 |      "name": "#%%\n"
1559 |     }
1560 |    },
1561 |    "outputs": [],
1562 |    "source": [
1563 |     "# inspect intermediate data frame\n",
1564 |     "df3"
1565 |    ]
1566 |   },
1567 |   {
1568 |    "cell_type": "markdown",
1569 |    "id": "1842701c",
1570 |    "metadata": {
1571 |     "pycharm": {
1572 |      "name": "#%%\n"
1573 |     }
1574 |    },
1575 |    "source": [
1576 |     "## Chain Exercise\n",
1577 |     "* Write a function that accepts a dataframe and an index value. It should print any rows that match the index and return the dataframe that was passed in.\n",
1578 |     "* Use the function with pipe after each step of the chain. Show the rows for index 2 and 64,813.\n",
1579 |     "\n",
1580 |     "\n",
1581 |     "\n",
1582 |     "\n",
1583 |     "\n",
1584 |     "\n",
1585 |     "\n",
1586 |     "\n",
1587 |     "## Don't Mutate\n",
1588 |     "\n",
1589 |     "> \"you are missing the point, inplace rarely actually does something inplace, you are thinking that you are saving memory but you are not.\"\n",
1590 |     ">\n",
1591 |     "> **jreback** - Pandas core dev\n",
1592 |     "\n",
1593 |     "\n",
1594 |     "\n",
1595 |     "https://github.com/pandas-dev/pandas/issues/16529#issuecomment-676518136\n",
1596 |     "\n",
1597 |     "* In general, no performance benefits\n",
1598 |     "* Prohibits chaining\n",
1599 |     "* ``SettingWithCopyWarning`` fun\n"
1600 |    ]
1601 |   },
1602 |   {
1603 |    "cell_type": "code",
1604 |    "execution_count": null,
1605 |    "id": "9b1955ed",
1606 |    "metadata": {
1607 |     "lines_to_next_cell": 2,
1608 |     "pycharm": {
1609 |      "name": "#%%\n"
1610 |     }
1611 |    },
1612 |    "outputs": [],
1613 |    "source": [
1614 |     "pd.read_csv??"
1615 |    ]
1616 |   },
1617 |   {
1618 |    "cell_type": "code",
1619 |    "execution_count": null,
1620 |    "id": "bce7abe3",
1621 |    "metadata": {
1622 |     "lines_to_next_cell": 2,
1623 |     "pycharm": {
1624 |      "name": "#%%\n"
1625 |     }
1626 |    },
1627 |    "outputs": [],
1628 |    "source": []
1629 |   },
1630 |   {
1631 |    "cell_type": "code",
1632 |    "execution_count": null,
1633 |    "id": "4e6a8e2f",
1634 |    "metadata": {
1635 |     "lines_to_next_cell": 2,
1636 |     "pycharm": {
1637 |      "name": "#%%\n"
1638 |     }
1639 |    },
1640 |    "outputs": [],
1641 |    "source": []
1642 |   },
1643 |   {
1644 |    "cell_type": "markdown",
1645 |    "id": "2a263d38",
1646 |    "metadata": {
1647 |     "pycharm": {
1648 |      "name": "#%% md\n"
1649 |     }
1650 |    },
1651 |    "source": [
1652 |     "## Don't Apply (if you can)"
1653 |    ]
1654 |   },
1655 |   {
1656 |    "cell_type": "code",
1657 |    "execution_count": null,
1658 |    "id": "9e68b584",
1659 |    "metadata": {
1660 |     "lines_to_next_cell": 0,
1661 |     "pycharm": {
1662 |      "name": "#%%\n"
1663 |     }
1664 |    },
1665 |    "outputs": [],
1666 |    "source": [
1667 |     "# a glorious function\n",
1668 |     "def tweak_gss(gss):\n",
1669 |     "    return (gss\n",
1670 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1671 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1672 |     "             'MAJOR1': 'category',\n",
1673 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1674 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1675 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1676 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1677 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1678 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1679 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1680 |     "          'HONEST':'honesty_rank',\n",
1681 |     "          'TICKET':'traffic_ticket'})\n",
1682 |     "    )\n",
1683 |     "\n",
1684 |     "gss2 = tweak_gss(gss)"
1685 |    ]
1686 |   },
1687 |   {
1688 |    "cell_type": "code",
1689 |    "execution_count": null,
1690 |    "id": "1a82332f",
1691 |    "metadata": {
1692 |     "pycharm": {
1693 |      "name": "#%%\n"
1694 |     }
1695 |    },
1696 |    "outputs": [],
1697 |    "source": [
1698 |     "# convert age to months\n",
1699 |     "def to_months(val):\n",
1700 |     "    return val * 12\n",
1701 |     "\n",
1702 |     "gss2.age.apply(to_months)"
1703 |    ]
1704 |   },
1705 |   {
1706 |    "cell_type": "code",
1707 |    "execution_count": null,
1708 |    "id": "a221e972",
1709 |    "metadata": {
1710 |     "pycharm": {
1711 |      "name": "#%%\n"
1712 |     }
1713 |    },
1714 |    "outputs": [],
1715 |    "source": [
1716 |     "# this gives the same results\n",
1717 |     "gss2.age * 12"
1718 |    ]
1719 |   },
1720 |   {
1721 |    "cell_type": "code",
1722 |    "execution_count": null,
1723 |    "id": "9cb2b9d2",
1724 |    "metadata": {
1725 |     "pycharm": {
1726 |      "name": "#%%\n"
1727 |     }
1728 |    },
1729 |    "outputs": [],
1730 |    "source": [
1731 |     "%%timeit\n",
1732 |     "gss2.age.apply(to_months)"
1733 |    ]
1734 |   },
1735 |   {
1736 |    "cell_type": "code",
1737 |    "execution_count": null,
1738 |    "id": "51bcc862",
1739 |    "metadata": {
1740 |     "pycharm": {
1741 |      "name": "#%%\n"
1742 |     }
1743 |    },
1744 |    "outputs": [],
1745 |    "source": [
1746 |     "%%timeit\n",
1747 |     "gss2.age * 12"
1748 |    ]
1749 |   },
1750 |   {
1751 |    "cell_type": "code",
1752 |    "execution_count": null,
1753 |    "id": "72a01657",
1754 |    "metadata": {
1755 |     "pycharm": {
1756 |      "name": "#%%\n"
1757 |     }
1758 |    },
1759 |    "outputs": [],
1760 |    "source": [
1761 |     "# ~42x slower!\n",
1762 |     "4_590 / 110"
1763 |    ]
1764 |   },
1765 |   {
1766 |    "cell_type": "code",
1767 |    "execution_count": null,
1768 |    "id": "619094f7",
1769 |    "metadata": {},
1770 |    "outputs": [],
1771 |    "source": [
1772 |     "gss.MAJOR1.value_counts()[:20]"
1773 |    ]
1774 |   },
1775 |   {
1776 |    "cell_type": "code",
1777 |    "execution_count": null,
1778 |    "id": "f4817aee",
1779 |    "metadata": {
1780 |     "pycharm": {
1781 |      "name": "#%%\n"
1782 |     }
1783 |    },
1784 |    "outputs": [],
1785 |    "source": [
1786 |     "def is_science(val):\n",
1787 |     "    return val in {'Engineering', 'Computer science', 'Biology'}"
1788 |    ]
1789 |   },
1790 |   {
1791 |    "cell_type": "code",
1792 |    "execution_count": null,
1793 |    "id": "f00a069c",
1794 |    "metadata": {
1795 |     "pycharm": {
1796 |      "name": "#%%\n"
1797 |     }
1798 |    },
1799 |    "outputs": [],
1800 |    "source": [
1801 |     "%%timeit\n",
1802 |     "# string\n",
1803 |     "gss.MAJOR1.apply(is_science)"
1804 |    ]
1805 |   },
1806 |   {
1807 |    "cell_type": "code",
1808 |    "execution_count": null,
1809 |    "id": "5e13ae10",
1810 |    "metadata": {
1811 |     "pycharm": {
1812 |      "name": "#%%\n"
1813 |     }
1814 |    },
1815 |    "outputs": [],
1816 |    "source": [
1817 |     "%%timeit\n",
1818 |     "gss.MAJOR1.isin({'Engineering', 'Computer science', 'Biology'})"
1819 |    ]
1820 |   },
1821 |   {
1822 |    "cell_type": "code",
1823 |    "execution_count": null,
1824 |    "id": "dc933ec1",
1825 |    "metadata": {
1826 |     "lines_to_next_cell": 0,
1827 |     "pycharm": {
1828 |      "name": "#%%\n"
1829 |     }
1830 |    },
1831 |    "outputs": [],
1832 |    "source": [
1833 |     "%%timeit\n",
1834 |     "# categorical\n",
1835 |     "gss2.college_major.isin({'Engineering', 'Computer science', 'Biology'})"
1836 |    ]
1837 |   },
1838 |   {
1839 |    "cell_type": "code",
1840 |    "execution_count": null,
1841 |    "id": "42a822c2",
1842 |    "metadata": {
1843 |     "lines_to_next_cell": 2
1844 |    },
1845 |    "outputs": [],
1846 |    "source": []
1847 |   },
1848 |   {
1849 |    "cell_type": "markdown",
1850 |    "id": "d56720b4",
1851 |    "metadata": {},
1852 |    "source": [
1853 |     "## Apply Exercise\n",
1854 |     "* Make a new column called *minutes_worked* derived from the *hours_worked* column.\n",
1855 |     "* Make a new column called *income_ratio*.\n",
1856 |     "  * Convert the income columns to numbers (replace `'No answer'` and `'Refused'` with `np.nan`).\n",
1857 |     "  * Fill in the missing values with the median\n",
1858 |     "  * Divide the 2006 value by 1970 value"
1859 |    ]
1860 |   },
1861 |   {
1862 |    "cell_type": "code",
1863 |    "execution_count": null,
1864 |    "id": "3b818eb6",
1865 |    "metadata": {},
1866 |    "outputs": [],
1867 |    "source": []
1868 |   },
1869 |   {
1870 |    "cell_type": "code",
1871 |    "execution_count": null,
1872 |    "id": "59ebffeb",
1873 |    "metadata": {},
1874 |    "outputs": [],
1875 |    "source": []
1876 |   },
1877 |   {
1878 |    "cell_type": "code",
1879 |    "execution_count": null,
1880 |    "id": "9d2a4e10",
1881 |    "metadata": {},
1882 |    "outputs": [],
1883 |    "source": []
1884 |   },
1885 |   {
1886 |    "cell_type": "code",
1887 |    "execution_count": null,
1888 |    "id": "132efb76",
1889 |    "metadata": {},
1890 |    "outputs": [],
1891 |    "source": []
1892 |   },
1893 |   {
1894 |    "cell_type": "code",
1895 |    "execution_count": null,
1896 |    "id": "f3631607",
1897 |    "metadata": {},
1898 |    "outputs": [],
1899 |    "source": []
1900 |   },
1901 |   {
1902 |    "cell_type": "code",
1903 |    "execution_count": null,
1904 |    "id": "b1f627b9",
1905 |    "metadata": {},
1906 |    "outputs": [],
1907 |    "source": []
1908 |   },
1909 |   {
1910 |    "cell_type": "code",
1911 |    "execution_count": null,
1912 |    "id": "eee8ef40",
1913 |    "metadata": {},
1914 |    "outputs": [],
1915 |    "source": []
1916 |   },
1917 |   {
1918 |    "cell_type": "code",
1919 |    "execution_count": null,
1920 |    "id": "c19d66ca",
1921 |    "metadata": {},
1922 |    "outputs": [],
1923 |    "source": []
1924 |   },
1925 |   {
1926 |    "cell_type": "code",
1927 |    "execution_count": null,
1928 |    "id": "85c49f0f",
1929 |    "metadata": {},
1930 |    "outputs": [],
1931 |    "source": []
1932 |   },
1933 |   {
1934 |    "cell_type": "code",
1935 |    "execution_count": null,
1936 |    "id": "537487a7",
1937 |    "metadata": {},
1938 |    "outputs": [],
1939 |    "source": []
1940 |   },
1941 |   {
1942 |    "cell_type": "code",
1943 |    "execution_count": null,
1944 |    "id": "f62adb56",
1945 |    "metadata": {},
1946 |    "outputs": [],
1947 |    "source": []
1948 |   },
1949 |   {
1950 |    "cell_type": "code",
1951 |    "execution_count": null,
1952 |    "id": "8c59b615",
1953 |    "metadata": {},
1954 |    "outputs": [],
1955 |    "source": []
1956 |   },
1957 |   {
1958 |    "cell_type": "code",
1959 |    "execution_count": null,
1960 |    "id": "55ce9070",
1961 |    "metadata": {},
1962 |    "outputs": [],
1963 |    "source": []
1964 |   },
1965 |   {
1966 |    "cell_type": "code",
1967 |    "execution_count": null,
1968 |    "id": "15e221af",
1969 |    "metadata": {},
1970 |    "outputs": [],
1971 |    "source": []
1972 |   },
1973 |   {
1974 |    "cell_type": "code",
1975 |    "execution_count": null,
1976 |    "id": "7ed68b41",
1977 |    "metadata": {},
1978 |    "outputs": [],
1979 |    "source": []
1980 |   },
1981 |   {
1982 |    "cell_type": "code",
1983 |    "execution_count": null,
1984 |    "id": "ae90e79b",
1985 |    "metadata": {},
1986 |    "outputs": [],
1987 |    "source": []
1988 |   },
1989 |   {
1990 |    "cell_type": "code",
1991 |    "execution_count": null,
1992 |    "id": "96420545",
1993 |    "metadata": {},
1994 |    "outputs": [],
1995 |    "source": []
1996 |   },
1997 |   {
1998 |    "cell_type": "code",
1999 |    "execution_count": null,
2000 |    "id": "ba8633ef",
2001 |    "metadata": {},
2002 |    "outputs": [],
2003 |    "source": []
2004 |   },
2005 |   {
2006 |    "cell_type": "code",
2007 |    "execution_count": null,
2008 |    "id": "8993c970",
2009 |    "metadata": {},
2010 |    "outputs": [],
2011 |    "source": []
2012 |   },
2013 |   {
2014 |    "cell_type": "code",
2015 |    "execution_count": null,
2016 |    "id": "0ca9ac23",
2017 |    "metadata": {},
2018 |    "outputs": [],
2019 |    "source": []
2020 |   },
2021 |   {
2022 |    "cell_type": "code",
2023 |    "execution_count": null,
2024 |    "id": "139432bb",
2025 |    "metadata": {},
2026 |    "outputs": [],
2027 |    "source": []
2028 |   },
2029 |   {
2030 |    "cell_type": "code",
2031 |    "execution_count": null,
2032 |    "id": "816e9c31",
2033 |    "metadata": {},
2034 |    "outputs": [],
2035 |    "source": []
2036 |   },
2037 |   {
2038 |    "cell_type": "code",
2039 |    "execution_count": null,
2040 |    "id": "4f71cd5b",
2041 |    "metadata": {},
2042 |    "outputs": [],
2043 |    "source": []
2044 |   },
2045 |   {
2046 |    "cell_type": "code",
2047 |    "execution_count": null,
2048 |    "id": "8f4975a6",
2049 |    "metadata": {},
2050 |    "outputs": [],
2051 |    "source": []
2052 |   },
2053 |   {
2054 |    "cell_type": "code",
2055 |    "execution_count": null,
2056 |    "id": "44aa5162",
2057 |    "metadata": {},
2058 |    "outputs": [],
2059 |    "source": []
2060 |   },
2061 |   {
2062 |    "cell_type": "code",
2063 |    "execution_count": null,
2064 |    "id": "e5702992",
2065 |    "metadata": {},
2066 |    "outputs": [],
2067 |    "source": []
2068 |   },
2069 |   {
2070 |    "cell_type": "code",
2071 |    "execution_count": null,
2072 |    "id": "c0dead63",
2073 |    "metadata": {
2074 |     "lines_to_next_cell": 2
2075 |    },
2076 |    "outputs": [],
2077 |    "source": []
2078 |   },
2079 |   {
2080 |    "cell_type": "markdown",
2081 |    "id": "e0faa823",
2082 |    "metadata": {},
2083 |    "source": [
2084 |     "## Master Aggregation\n",
2085 |     "\n",
2086 |     "Let's compare age by sex by year...🤔"
2087 |    ]
2088 |   },
2089 |   {
2090 |    "cell_type": "code",
2091 |    "execution_count": null,
2092 |    "id": "d444c6b8",
2093 |    "metadata": {
2094 |     "pycharm": {
2095 |      "name": "#%%\n"
2096 |     }
2097 |    },
2098 |    "outputs": [],
2099 |    "source": [
2100 |     "(gss2\n",
2101 |     "   .groupby('year')\n",
2102 |     "   .mean()\n",
2103 |     ")"
2104 |    ]
2105 |   },
2106 |   {
2107 |    "cell_type": "code",
2108 |    "execution_count": null,
2109 |    "id": "85441b08",
2110 |    "metadata": {
2111 |     "pycharm": {
2112 |      "name": "#%%\n"
2113 |     }
2114 |    },
2115 |    "outputs": [],
2116 |    "source": [
2117 |     "(gss2\n",
2118 |     "   .groupby('year')\n",
2119 |     "   .mean(numeric_only=True)\n",
2120 |     ")"
2121 |    ]
2122 |   },
2123 |   {
2124 |    "cell_type": "code",
2125 |    "execution_count": null,
2126 |    "id": "eadbc6cd",
2127 |    "metadata": {
2128 |     "pycharm": {
2129 |      "name": "#%%\n"
2130 |     }
2131 |    },
2132 |    "outputs": [],
2133 |    "source": [
2134 |     "(gss2\n",
2135 |     "   .groupby('year')\n",
2136 |     "   [['age', 'hours_worked']]\n",
2137 |     "   .mean()\n",
2138 |     ")"
2139 |    ]
2140 |   },
2141 |   {
2142 |    "cell_type": "code",
2143 |    "execution_count": null,
2144 |    "id": "a6d008ae",
2145 |    "metadata": {
2146 |     "pycharm": {
2147 |      "name": "#%%\n"
2148 |     }
2149 |    },
2150 |    "outputs": [],
2151 |    "source": [
2152 |     "import matplotlib.pyplot as plt\n",
2153 |     "import seaborn as sns\n",
2154 |     "\n",
2155 |     "sns.set_context('talk')\n",
2156 |     "plt.plot(range(10))"
2157 |    ]
2158 |   },
2159 |   {
2160 |    "cell_type": "code",
2161 |    "execution_count": null,
2162 |    "id": "ffc36b52",
2163 |    "metadata": {
2164 |     "pycharm": {
2165 |      "name": "#%%\n"
2166 |     }
2167 |    },
2168 |    "outputs": [],
2169 |    "source": [
2170 |     "(gss2\n",
2171 |     "   .groupby('year')\n",
2172 |     "   [['age', 'hours_worked']]\n",
2173 |     "   .median()\n",
2174 |     "   .plot()\n",
2175 |     ")"
2176 |    ]
2177 |   },
2178 |   {
2179 |    "cell_type": "code",
2180 |    "execution_count": null,
2181 |    "id": "5bdcd3e8",
2182 |    "metadata": {
2183 |     "pycharm": {
2184 |      "name": "#%%\n"
2185 |     }
2186 |    },
2187 |    "outputs": [],
2188 |    "source": [
2189 |     "(gss2\n",
2190 |     "   .groupby('year')\n",
2191 |     "   [['age', 'hours_worked']]\n",
2192 |     "   #.mean()\n",
2193 |     "   #.median()\n",
2194 |     "   #.std()\n",
2195 |     "   .max()\n",
2196 |     "   .plot()\n",
2197 |     ")"
2198 |    ]
2199 |   },
2200 |   {
2201 |    "cell_type": "code",
2202 |    "execution_count": null,
2203 |    "id": "54ebb97d",
2204 |    "metadata": {
2205 |     "lines_to_next_cell": 2,
2206 |     "pycharm": {
2207 |      "name": "#%%\n"
2208 |     }
2209 |    },
2210 |    "outputs": [],
2211 |    "source": [
2212 |     "# add sex\n",
2213 |     "(gss2\n",
2214 |     "   .groupby(['year', 'sex'])\n",
2215 |     "   [['age', 'hours_worked']]\n",
2216 |     "   .mean()\n",
2217 |     "   #.median()\n",
2218 |     "   #.std()\n",
2219 |     "   #.max()\n",
2220 |     "   #.plot()\n",
2221 |     ")"
2222 |    ]
2223 |   },
2224 |   {
2225 |    "cell_type": "code",
2226 |    "execution_count": null,
2227 |    "id": "266a53da",
2228 |    "metadata": {
2229 |     "lines_to_next_cell": 2,
2230 |     "pycharm": {
2231 |      "name": "#%%\n"
2232 |     }
2233 |    },
2234 |    "outputs": [],
2235 |    "source": [
2236 |     "# add sex\n",
2237 |     "(gss2\n",
2238 |     "   .groupby(['year', 'sex'])\n",
2239 |     "   [['age', 'hours_worked']]\n",
2240 |     "   .mean()\n",
2241 |     "   #.median()\n",
2242 |     "   #.std()\n",
2243 |     "   #.max()\n",
2244 |     "   .plot()\n",
2245 |     ")"
2246 |    ]
2247 |   },
2248 |   {
2249 |    "cell_type": "code",
2250 |    "execution_count": null,
2251 |    "id": "2e20f409",
2252 |    "metadata": {
2253 |     "lines_to_next_cell": 2,
2254 |     "pycharm": {
2255 |      "name": "#%%\n"
2256 |     }
2257 |    },
2258 |    "outputs": [],
2259 |    "source": [
2260 |     "# unstack\n",
2261 |     "(gss2\n",
2262 |     "   .groupby(['year', 'sex'])\n",
2263 |     "   [['age', 'hours_worked']]\n",
2264 |     "   .mean()\n",
2265 |     "   #.median()\n",
2266 |     "   #.std()\n",
2267 |     "   #.max()\n",
2268 |     "   .unstack() \n",
2269 |     "   .plot()\n",
2270 |     ")"
2271 |    ]
2272 |   },
2273 |   {
2274 |    "cell_type": "code",
2275 |    "execution_count": null,
2276 |    "id": "d5481e10",
2277 |    "metadata": {
2278 |     "lines_to_next_cell": 2,
2279 |     "pycharm": {
2280 |      "name": "#%%\n"
2281 |     }
2282 |    },
2283 |    "outputs": [],
2284 |    "source": [
2285 |     "(gss2\n",
2286 |     "   .groupby(['year', 'sex'])\n",
2287 |     "   [['age', 'hours_worked']]\n",
2288 |     "   .mean()\n",
2289 |     "   .unstack()\n",
2290 |     "   .age\n",
2291 |     ")"
2292 |    ]
2293 |   },
2294 |   {
2295 |    "cell_type": "code",
2296 |    "execution_count": null,
2297 |    "id": "9e01d055",
2298 |    "metadata": {
2299 |     "lines_to_next_cell": 2,
2300 |     "pycharm": {
2301 |      "name": "#%%\n"
2302 |     }
2303 |    },
2304 |    "outputs": [],
2305 |    "source": [
2306 |     "(gss2\n",
2307 |     "   .groupby(['year', 'sex'])\n",
2308 |     "   [['age', 'hours_worked']]\n",
2309 |     "   .mean()\n",
2310 |     "   .unstack()\n",
2311 |     "   .age\n",
2312 |     "   .plot()\n",
2313 |     "   .legend(bbox_to_anchor=(1,1))\n",
2314 |     ")"
2315 |    ]
2316 |   },
2317 |   {
2318 |    "cell_type": "code",
2319 |    "execution_count": null,
2320 |    "id": "d1528728",
2321 |    "metadata": {
2322 |     "pycharm": {
2323 |      "name": "#%%\n"
2324 |     }
2325 |    },
2326 |    "outputs": [],
2327 |    "source": [
2328 |     "# Let's try looking at hours worked\n",
2329 |     "(gss2\n",
2330 |     "   .groupby(['year', 'sex'])\n",
2331 |     "   [['age', 'hours_worked']]\n",
2332 |     "   .mean()\n",
2333 |     "   .unstack()\n",
2334 |     "   .hours_worked\n",
2335 |     "   .plot()\n",
2336 |     "   .legend(bbox_to_anchor=(1,1))\n",
2337 |     ")"
2338 |    ]
2339 |   },
2340 |   {
2341 |    "cell_type": "code",
2342 |    "execution_count": null,
2343 |    "id": "a52537a5",
2344 |    "metadata": {
2345 |     "lines_to_next_cell": 2,
2346 |     "pycharm": {
2347 |      "name": "#%%\n"
2348 |     }
2349 |    },
2350 |    "outputs": [],
2351 |    "source": [
2352 |     "# Multiple aggregates\n",
2353 |     "def second(group):\n",
2354 |     "    return group.iloc[1]\n",
2355 |     "(gss2\n",
2356 |     "   .groupby(['year', 'sex'])\n",
2357 |     "   [['age', 'hours_worked']]\n",
2358 |     "  .agg(['min', 'max', 'mean', second])\n",
2359 |     "   \n",
2360 |     ")"
2361 |    ]
2362 |   },
2363 |   {
2364 |    "cell_type": "code",
2365 |    "execution_count": null,
2366 |    "id": "b780beb4",
2367 |    "metadata": {
2368 |     "lines_to_next_cell": 2,
2369 |     "pycharm": {
2370 |      "name": "#%%\n"
2371 |     }
2372 |    },
2373 |    "outputs": [],
2374 |    "source": []
2375 |   },
2376 |   {
2377 |    "cell_type": "markdown",
2378 |    "id": "9aca44a7",
2379 |    "metadata": {},
2380 |    "source": [
2381 |     "## Aggregation Exercise\n",
2382 |     "* Which occupation has the highest median hours worked?\n",
2383 |     "* Which occupation has the lowest age?\n",
2384 |     "* What is the breakdown of respondents by race for each year?\n",
2385 |     "* Convert the previous breakdown to percentages.\n",
2386 |     "* How many unique occupations are there for each year?\n",
2387 |     "* What is the most popular college_major for each year?\n",
2388 |     "* What is the second most popular college_major for each year?"
2389 |    ]
2390 |   },
2391 |   {
2392 |    "cell_type": "code",
2393 |    "execution_count": null,
2394 |    "id": "9e5477d4",
2395 |    "metadata": {},
2396 |    "outputs": [],
2397 |    "source": []
2398 |   },
2399 |   {
2400 |    "cell_type": "code",
2401 |    "execution_count": null,
2402 |    "id": "bcc93724",
2403 |    "metadata": {},
2404 |    "outputs": [],
2405 |    "source": []
2406 |   },
2407 |   {
2408 |    "cell_type": "code",
2409 |    "execution_count": null,
2410 |    "id": "3a7f368e",
2411 |    "metadata": {},
2412 |    "outputs": [],
2413 |    "source": []
2414 |   },
2415 |   {
2416 |    "cell_type": "code",
2417 |    "execution_count": null,
2418 |    "id": "697919b8",
2419 |    "metadata": {},
2420 |    "outputs": [],
2421 |    "source": []
2422 |   },
2423 |   {
2424 |    "cell_type": "code",
2425 |    "execution_count": null,
2426 |    "id": "3d93f8db",
2427 |    "metadata": {},
2428 |    "outputs": [],
2429 |    "source": []
2430 |   },
2431 |   {
2432 |    "cell_type": "code",
2433 |    "execution_count": null,
2434 |    "id": "3907736f",
2435 |    "metadata": {},
2436 |    "outputs": [],
2437 |    "source": []
2438 |   },
2439 |   {
2440 |    "cell_type": "code",
2441 |    "execution_count": null,
2442 |    "id": "18186540",
2443 |    "metadata": {},
2444 |    "outputs": [],
2445 |    "source": []
2446 |   },
2447 |   {
2448 |    "cell_type": "code",
2449 |    "execution_count": null,
2450 |    "id": "f1089e32",
2451 |    "metadata": {},
2452 |    "outputs": [],
2453 |    "source": []
2454 |   },
2455 |   {
2456 |    "cell_type": "code",
2457 |    "execution_count": null,
2458 |    "id": "116593ba",
2459 |    "metadata": {},
2460 |    "outputs": [],
2461 |    "source": []
2462 |   },
2463 |   {
2464 |    "cell_type": "code",
2465 |    "execution_count": null,
2466 |    "id": "aaa6e44e",
2467 |    "metadata": {},
2468 |    "outputs": [],
2469 |    "source": []
2470 |   },
2471 |   {
2472 |    "cell_type": "code",
2473 |    "execution_count": null,
2474 |    "id": "648b7e1e",
2475 |    "metadata": {},
2476 |    "outputs": [],
2477 |    "source": []
2478 |   },
2479 |   {
2480 |    "cell_type": "code",
2481 |    "execution_count": null,
2482 |    "id": "7ee8d6cc",
2483 |    "metadata": {},
2484 |    "outputs": [],
2485 |    "source": []
2486 |   },
2487 |   {
2488 |    "cell_type": "code",
2489 |    "execution_count": null,
2490 |    "id": "4f252bd0",
2491 |    "metadata": {},
2492 |    "outputs": [],
2493 |    "source": []
2494 |   },
2495 |   {
2496 |    "cell_type": "code",
2497 |    "execution_count": null,
2498 |    "id": "b6090240",
2499 |    "metadata": {},
2500 |    "outputs": [],
2501 |    "source": []
2502 |   },
2503 |   {
2504 |    "cell_type": "code",
2505 |    "execution_count": null,
2506 |    "id": "a37bb2df",
2507 |    "metadata": {
2508 |     "lines_to_next_cell": 2
2509 |    },
2510 |    "outputs": [],
2511 |    "source": []
2512 |   },
2513 |   {
2514 |    "cell_type": "markdown",
2515 |    "id": "7cf8f182",
2516 |    "metadata": {},
2517 |    "source": [
2518 |     "## Summary\n",
2519 |     "\n",
2520 |     "* Correct types save space and enable convenient math, string, and date functionality\n",
2521 |     "* Chaining operations will:\n",
2522 |     "   * Make code readable\n",
2523 |     "   * Remove bugs\n",
2524 |     "   * Make debugging easier\n",
2525 |     "* Don't mutate (there's no point). Embrace chaining.\n",
2526 |     "* ``.apply`` is slow for math\n",
2527 |     "* Aggregations are powerful. Play with them until they make sense\n",
2528 |     "\n",
2529 |     "Follow me on LinkedIn, X/Twitter, or Bluesky: ``@__mharrison__``\n",
2530 |     "\n"
2531 |    ]
2532 |   },
2533 |   {
2534 |    "cell_type": "code",
2535 |    "execution_count": null,
2536 |    "id": "35931834",
2537 |    "metadata": {
2538 |     "lines_to_next_cell": 2,
2539 |     "pycharm": {
2540 |      "name": "#%%\n"
2541 |     }
2542 |    },
2543 |    "outputs": [],
2544 |    "source": []
2545 |   },
2546 |   {
2547 |    "cell_type": "code",
2548 |    "execution_count": null,
2549 |    "id": "53ab759b",
2550 |    "metadata": {
2551 |     "lines_to_next_cell": 2,
2552 |     "pycharm": {
2553 |      "name": "#%%\n"
2554 |     }
2555 |    },
2556 |    "outputs": [],
2557 |    "source": []
2558 |   },
2559 |   {
2560 |    "cell_type": "code",
2561 |    "execution_count": null,
2562 |    "id": "063efccd",
2563 |    "metadata": {
2564 |     "lines_to_next_cell": 2,
2565 |     "pycharm": {
2566 |      "name": "#%%\n"
2567 |     }
2568 |    },
2569 |    "outputs": [],
2570 |    "source": []
2571 |   },
2572 |   {
2573 |    "cell_type": "code",
2574 |    "execution_count": null,
2575 |    "id": "26386f48",
2576 |    "metadata": {
2577 |     "pycharm": {
2578 |      "name": "#%%\n"
2579 |     }
2580 |    },
2581 |    "outputs": [],
2582 |    "source": []
2583 |   }
2584 |  ],
2585 |  "metadata": {
2586 |   "kernelspec": {
2587 |    "display_name": "pearson-pandas-best-practices",
2588 |    "language": "python",
2589 |    "name": "python3"
2590 |   },
2591 |   "language_info": {
2592 |    "codemirror_mode": {
2593 |     "name": "ipython",
2594 |     "version": 3
2595 |    },
2596 |    "file_extension": ".py",
2597 |    "mimetype": "text/x-python",
2598 |    "name": "python",
2599 |    "nbconvert_exporter": "python",
2600 |    "pygments_lexer": "ipython3",
2601 |    "version": "3.12.1"
2602 |   }
2603 |  },
2604 |  "nbformat": 4,
2605 |  "nbformat_minor": 5
2606 | }
2607 | 


--------------------------------------------------------------------------------