├── .devcontainer
    └── devcontainer.json
├── GSS.csv
├── README.md
├── honest.fth
├── idiomatic-pandas.ipynb
├── preso.ipynb
└── requirements.txt


/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "image": "mcr.microsoft.com/devcontainers/universal:2",
 3 |   "waitFor": "onCreateCommand",
 4 |   "updateContentCommand": "python3 -m pip install -r requirements.txt",
 5 |   "postCreateCommand": "",
 6 |   "customizations": {
 7 |     "codespaces": {
 8 |       "openFiles": []
 9 |     },
10 |     "vscode": {
11 |       "extensions": [
12 |         "ms-toolsai.jupyter",
13 |         "ms-python.python"
14 |       ]
15 |     }
16 |   }
17 | }
18 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 2023-scipy-pandas
 2 | 
 3 | This repository contains a Jupyter notebook for the Idiomatic Pandas tutorial. The notebook covers various topics and interactive exercises designed to reinforce your learning. You can run the notebook in a local virtual environment or directly in GitHub Codespaces. 
 4 | 
 5 | ## Structure of the Repository
 6 | 
 7 | - `idiomatic-pandas.ipynb` - The tutorial notebook
 8 | - `honest.fth` - The data for the tutorial
 9 | - `requirements.txt` - This file lists the Python dependencies required to run the notebook.
10 | 
11 | ## Getting Started 
12 | 
13 | Here's how you can set up and run this project:
14 | 
15 | ### Option 1: Running in Local Virtual Environment 
16 | 
17 | 1. **Clone the Repository** 
18 | 
19 |     First, clone this repository to your local machine using the following command:
20 | 
21 |     ```bash
22 |     git clone https://github.com/mattharrison/2023-scipy-pandas.git
23 |     ```
24 | 
25 | 2. **Create and Activate Virtual Environment** 
26 | 
27 |     It is always a good practice to create a virtual environment for your Python projects. Here's how you can do it:
28 | 
29 |     For Windows:
30 | 
31 |     ```bash
32 |     python -m venv tutorial_env
33 |     tutorial_env\Scripts\activate
34 |     ```
35 | 
36 |     For macOS/Linux:
37 | 
38 |     ```bash
39 |     python3 -m venv tutorial_env
40 |     source tutorial_env/bin/activate
41 |     ```
42 | 
43 | 3. **Install Dependencies** 
44 | 
45 |     Once your virtual environment is activated, you can install the necessary dependencies using pip. Navigate to the directory containing the `requirements.txt` file and run:
46 | 
47 |     ```bash
48 |     pip install -r requirements.txt
49 |     ```
50 | 
51 | 4. **Launch Jupyter Notebook** 
52 | 
53 |     After you have your environment set up and dependencies installed, you can start Jupyter notebook by running:
54 | 
55 |     ```bash
56 |     jupyter notebook
57 |     ```
58 | 
59 |     Then, in your web browser, navigate to the location of the notebook file and click to open it.
60 | 
61 | ### Option 2: Running in GitHub Codespaces 
62 | 
63 | GitHub Codespaces is a service that allows you to develop in the cloud instead of locally. Here's how you can use it for this project:
64 | 
65 | 1. **Open the Repository in Codespaces** 
66 | 
67 |     Navigate to this repository in GitHub. Click the `Code` button in the repository header and then select `Open with Codespaces`. 
68 | 
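69 | 2. **Wait for the codespace to start**
70 | 
71 |     This may take a while, because the container is created and the dependencies in `requirements.txt` are installed automatically.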
72 | 
73 | 3. **Open the notebook**
74 | 
75 | 4. **Click on "Select Kernel" -> Python Environments... -> Python 3.10**
76 |     
77 |     You should be good to go.
78 | 
79 | ## Visit MetaSnake for Help
80 | 
81 | We hope you enjoy this tutorial and find it helpful. If you want to apply this process to your own data, visit www.metasnake.com for Python and data training for your team. Buy *Effective Pandas* to level up your pandas skills.
82 | 


--------------------------------------------------------------------------------
/honest.fth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattharrison/2023-scipy-pandas/90eb298401110d6dd688bc67c045167ed8e9d73f/honest.fth


--------------------------------------------------------------------------------
/idiomatic-pandas.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "id": "81cc5b34",
   6 |    "metadata": {
   7 |     "lines_to_next_cell": 0,
   8 |     "pycharm": {
   9 |      "name": "#%% md\n"
  10 |     }
  11 |    },
  12 |    "source": [
  13 |     "# Idiomatic Pandas\n",
  14 |     "## 5 Tips for Better Pandas Code\n",
  15 |     "\n",
  16 |     "https://github.com/mattharrison/2023-scipy-pandas"
  17 |    ]
  18 |   },
  19 |   {
  20 |    "cell_type": "code",
  21 |    "execution_count": null,
  22 |    "id": "d1532d10",
  23 |    "metadata": {
  24 |     "lines_to_next_cell": 2
  25 |    },
  26 |    "outputs": [],
  27 |    "source": []
  28 |   },
  29 |   {
  30 |    "cell_type": "markdown",
  31 |    "id": "3b359fb0",
  32 |    "metadata": {
  33 |     "pycharm": {
  34 |      "name": "#%% md\n"
  35 |     }
  36 |    },
  37 |    "source": [
  38 |     "## About Matt Harrison @\\_\\_mharrison\\_\\_\n",
  39 |     "\n",
  40 |     "* Author of Effective Pandas, Machine Learning Pocket Reference, and Illustrated Guide to Python 3.\n",
  41 |     "* Advisor at Ponder (creators of Modin)\n",
  42 |     "* Corporate trainer at MetaSnake. Has taught pandas to thousands of students."
  43 |    ]
  44 |   },
  45 |   {
  46 |    "cell_type": "code",
  47 |    "execution_count": null,
  48 |    "id": "fd45b865",
  49 |    "metadata": {
  50 |     "lines_to_next_cell": 2,
  51 |     "pycharm": {
  52 |      "name": "#%%\n"
  53 |     }
  54 |    },
  55 |    "outputs": [],
  56 |    "source": []
  57 |   },
  58 |   {
  59 |    "cell_type": "code",
  60 |    "execution_count": null,
  61 |    "id": "ee074cf6",
  62 |    "metadata": {
  63 |     "lines_to_next_cell": 2,
  64 |     "pycharm": {
  65 |      "name": "#%%\n"
  66 |     }
  67 |    },
  68 |    "outputs": [],
  69 |    "source": []
  70 |   },
  71 |   {
  72 |    "cell_type": "code",
  73 |    "execution_count": null,
  74 |    "id": "56dea0d6",
  75 |    "metadata": {
  76 |     "lines_to_next_cell": 2,
  77 |     "pycharm": {
  78 |      "name": "#%%\n"
  79 |     }
  80 |    },
  81 |    "outputs": [],
  82 |    "source": []
  83 |   },
  84 |   {
  85 |    "cell_type": "code",
  86 |    "execution_count": null,
  87 |    "id": "74599ee6",
  88 |    "metadata": {
  89 |     "lines_to_next_cell": 2,
  90 |     "pycharm": {
  91 |      "name": "#%%\n"
  92 |     }
  93 |    },
  94 |    "outputs": [],
  95 |    "source": []
  96 |   },
  97 |   {
  98 |    "cell_type": "code",
  99 |    "execution_count": null,
 100 |    "id": "ddaf4356",
 101 |    "metadata": {
 102 |     "lines_to_next_cell": 2,
 103 |     "pycharm": {
 104 |      "name": "#%%\n"
 105 |     }
 106 |    },
 107 |    "outputs": [],
 108 |    "source": []
 109 |   },
 110 |   {
 111 |    "cell_type": "code",
 112 |    "execution_count": null,
 113 |    "id": "691fab46",
 114 |    "metadata": {
 115 |     "lines_to_next_cell": 2,
 116 |     "pycharm": {
 117 |      "name": "#%%\n"
 118 |     }
 119 |    },
 120 |    "outputs": [],
 121 |    "source": []
 122 |   },
 123 |   {
 124 |    "cell_type": "markdown",
 125 |    "id": "ac1d390a",
 126 |    "metadata": {
 127 |     "pycharm": {
 128 |      "name": "#%% md\n"
 129 |     }
 130 |    },
 131 |    "source": [
 132 |     "## Practice this on your data with your team!\n",
 133 |     "* Contact me matt@metasnake.com\n",
 134 |     "* Follow on Twitter @\\_\\_mharrison\\_\\_"
 135 |    ]
 136 |   },
 137 |   {
 138 |    "cell_type": "code",
 139 |    "execution_count": null,
 140 |    "id": "0d9e4268",
 141 |    "metadata": {
 142 |     "lines_to_next_cell": 2,
 143 |     "pycharm": {
 144 |      "name": "#%%\n"
 145 |     }
 146 |    },
 147 |    "outputs": [],
 148 |    "source": []
 149 |   },
 150 |   {
 151 |    "cell_type": "code",
 152 |    "execution_count": null,
 153 |    "id": "e9f646d4",
 154 |    "metadata": {
 155 |     "lines_to_next_cell": 2,
 156 |     "pycharm": {
 157 |      "name": "#%%\n"
 158 |     }
 159 |    },
 160 |    "outputs": [],
 161 |    "source": []
 162 |   },
 163 |   {
 164 |    "cell_type": "markdown",
 165 |    "id": "ec83fe17",
 166 |    "metadata": {
 167 |     "lines_to_next_cell": 0,
 168 |     "pycharm": {
 169 |      "name": "#%% md\n"
 170 |     }
 171 |    },
 172 |    "source": [
 173 |     "## Outline\n",
 174 |     "\n",
 175 |     "* Load Data\n",
 176 |     "* Types\n",
 177 |     "* Chaining\n",
 178 |     "* Mutation\n",
 179 |     "* Apply\n",
 180 |     "* Aggregation"
 181 |    ]
 182 |   },
 183 |   {
 184 |    "cell_type": "code",
 185 |    "execution_count": null,
 186 |    "id": "88919775",
 187 |    "metadata": {
 188 |     "lines_to_next_cell": 2
 189 |    },
 190 |    "outputs": [],
 191 |    "source": []
 192 |   },
 193 |   {
 194 |    "cell_type": "markdown",
 195 |    "id": "b6b8d4fe",
 196 |    "metadata": {},
 197 |    "source": [
 198 |     "## Imports"
 199 |    ]
 200 |   },
 201 |   {
 202 |    "cell_type": "code",
 203 |    "execution_count": null,
 204 |    "id": "677e4ecd",
 205 |    "metadata": {
 206 |     "lines_to_next_cell": 2,
 207 |     "pycharm": {
 208 |      "name": "#%%\n"
 209 |     }
 210 |    },
 211 |    "outputs": [],
 212 |    "source": [
 213 |     "%matplotlib inline\n",
 214 |     "from IPython.display import display\n",
 215 |     "import numpy as np\n",
 216 |     "import pandas as pd\n",
 217 |     "import pyarrow\n",
 218 |     "\n",
 219 |     "import io\n",
 220 |     "import zipfile\n",
 221 |     "#import modin.pandas as pd"
 222 |    ]
 223 |   },
 224 |   {
 225 |    "cell_type": "code",
 226 |    "execution_count": null,
 227 |    "id": "108ee461",
 228 |    "metadata": {},
 229 |    "outputs": [],
 230 |    "source": [
 231 |     "pd.__version__"
 232 |    ]
 233 |   },
 234 |   {
 235 |    "cell_type": "code",
 236 |    "execution_count": null,
 237 |    "id": "e01647d1",
 238 |    "metadata": {},
 239 |    "outputs": [],
 240 |    "source": [
 241 |     "pyarrow.__version__"
 242 |    ]
 243 |   },
 244 |   {
 245 |    "cell_type": "code",
 246 |    "execution_count": null,
 247 |    "id": "565ba8c3",
 248 |    "metadata": {},
 249 |    "outputs": [],
 250 |    "source": []
 251 |   },
 252 |   {
 253 |    "cell_type": "code",
 254 |    "execution_count": null,
 255 |    "id": "6dfaa61a",
 256 |    "metadata": {},
 257 |    "outputs": [],
 258 |    "source": []
 259 |   },
 260 |   {
 261 |    "cell_type": "markdown",
 262 |    "id": "4084693e",
 263 |    "metadata": {
 264 |     "pycharm": {
 265 |      "name": "#%% md\n"
 266 |     }
 267 |    },
 268 |    "source": [
 269 |     "## Data\n",
 270 |     "\n",
 271 |     "You don't need to run this section; it shows how I created the data."
 272 |    ]
 273 |   },
 274 |   {
 275 |    "cell_type": "code",
 276 |    "execution_count": null,
 277 |    "id": "766a6963",
 278 |    "metadata": {},
 279 |    "outputs": [],
 280 |    "source": [
 281 |     "# https://gss.norc.org/get-the-data/stata\n",
 282 |     "# takes a few minutes on my computer to load\n",
 283 |     "path = '~/Downloads/gss_spss_with_codebook.zip'\n",
 284 |     "with zipfile.ZipFile(path) as z:\n",
 285 |     "    print(z.namelist())\n",
 286 |     "    with open('gss.sav', mode='bw') as fout:\n",
 287 |     "        fout.write(z.open('GSS7218_R3.sav').read())\n",
 288 |     "    gss = pd.read_spss('gss.sav')"
 289 |    ]
 290 |   },
 291 |   {
 292 |    "cell_type": "code",
 293 |    "execution_count": null,
 294 |    "id": "0bff9887",
 295 |    "metadata": {},
 296 |    "outputs": [],
 297 |    "source": [
 298 |     "!pip install pyreadstat"
 299 |    ]
 300 |   },
 301 |   {
 302 |    "cell_type": "code",
 303 |    "execution_count": null,
 304 |    "id": "b113d17e",
 305 |    "metadata": {},
 306 |    "outputs": [],
 307 |    "source": [
 308 |     "%%time\n",
 309 |     "import pyreadstat\n",
 310 |     "gss, meta = pyreadstat.read_sav('gss.sav')"
 311 |    ]
 312 |   },
 313 |   {
 314 |    "cell_type": "code",
 315 |    "execution_count": null,
 316 |    "id": "356a1732",
 317 |    "metadata": {},
 318 |    "outputs": [],
 319 |    "source": [
 320 |     "gss.shape"
 321 |    ]
 322 |   },
 323 |   {
 324 |    "cell_type": "code",
 325 |    "execution_count": null,
 326 |    "id": "41a1eb09",
 327 |    "metadata": {},
 328 |    "outputs": [],
 329 |    "source": [
 330 |     "gss.to_feather('gss.fth')"
 331 |    ]
 332 |   },
 333 |   {
 334 |    "cell_type": "code",
 335 |    "execution_count": null,
 336 |    "id": "fe3f9902",
 337 |    "metadata": {},
 338 |    "outputs": [],
 339 |    "source": [
 340 |     "%%time\n",
 341 |     "raw = pd.read_feather('~/Dropbox/work/jupyter/gss.fth')"
 342 |    ]
 343 |   },
 344 |   {
 345 |    "cell_type": "code",
 346 |    "execution_count": null,
 347 |    "id": "17a1c4e6",
 348 |    "metadata": {
 349 |     "lines_to_next_cell": 0,
 350 |     "pycharm": {
 351 |      "name": "#%%\n"
 352 |     }
 353 |    },
 354 |    "outputs": [],
 355 |    "source": [
 356 |     "raw"
 357 |    ]
 358 |   },
 359 |   {
 360 |    "cell_type": "code",
 361 |    "execution_count": null,
 362 |    "id": "c02fcbf9",
 363 |    "metadata": {},
 364 |    "outputs": [],
 365 |    "source": [
 366 |     "# 6000 columns!\n",
 367 |     "raw.shape"
 368 |    ]
 369 |   },
 370 |   {
 371 |    "cell_type": "code",
 372 |    "execution_count": null,
 373 |    "id": "e0bcc8ff",
 374 |    "metadata": {
 375 |     "lines_to_next_cell": 0
 376 |    },
 377 |    "outputs": [],
 378 |    "source": [
 379 |     "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n",
 380 |     "        'INCOME06','HONEST','TICKET']\n",
 381 |     "\n",
 382 |     "raw[cols].to_feather('honest.fth')"
 383 |    ]
 384 |   },
 385 |   {
 386 |    "cell_type": "code",
 387 |    "execution_count": null,
 388 |    "id": "591958d3",
 389 |    "metadata": {
 390 |     "lines_to_next_cell": 2
 391 |    },
 392 |    "outputs": [],
 393 |    "source": []
 394 |   },
 395 |   {
 396 |    "cell_type": "markdown",
 397 |    "id": "7dedee7b",
 398 |    "metadata": {},
 399 |    "source": [
 400 |     "## Loading Data"
 401 |    ]
 402 |   },
 403 |   {
 404 |    "cell_type": "code",
 405 |    "execution_count": null,
 406 |    "id": "20e8aa70",
 407 |    "metadata": {
 408 |     "lines_to_next_cell": 2,
 409 |     "pycharm": {
 410 |      "name": "#%%\n"
 411 |     }
 412 |    },
 413 |    "outputs": [],
 414 |    "source": [
 415 |     "raw = pd.read_feather('honest.fth', dtype_backend='pyarrow')"
 416 |    ]
 417 |   },
 418 |   {
 419 |    "cell_type": "code",
 420 |    "execution_count": null,
 421 |    "id": "5f95238b",
 422 |    "metadata": {
 423 |     "lines_to_next_cell": 2,
 424 |     "pycharm": {
 425 |      "name": "#%%\n"
 426 |     }
 427 |    },
 428 |    "outputs": [],
 429 |    "source": []
 430 |   },
 431 |   {
 432 |    "cell_type": "markdown",
 433 |    "id": "8da48799",
 434 |    "metadata": {
 435 |     "pycharm": {
 436 |      "name": "#%% md\n"
 437 |     }
 438 |    },
 439 |    "source": [
 440 |     "## My Cleanup\n",
 441 |     "See GSS_Codebook.pdf for explanation\n",
 442 |     "\n",
 443 |     "Columns:\n",
 444 |     "\n",
 445 |     "* YEAR\n",
 446 |     "* ID - RESPONDENT ID NUMBER\n",
 447 |     "* AGE - AGE OF RESPONDENT\n",
 448 |     "* HRS1 - NUMBER OF HOURS WORKED LAST WEEK\n",
 449 |     "* OCC - R'S CENSUS OCCUPATION CODE (1970) - Page 126 (VAR: OCC) see page 125 for notes APPENDIX F,G,H\n",
 450 |     "   Appendix F - Page 3286\n",
 451 |     "* MAJOR1 - COLLEGE MAJOR 1\n",
 452 |     "* SEX - RESPONDENTS SEX\n",
 453 |     "* RACE - RACE OF RESPONDENT\n",
 454 |     "* BORN -  WAS R BORN IN THIS COUNTRY\n",
 455 |     "* INCOME - TOTAL FAMILY INCOME 1970\n",
 456 |     "* INCOME06 - TOTAL FAMILY INCOME 2006\n",
 457 |     "* HONEST - HONEST\n",
 458 |     "* TICKET - EVER RECEIVED A TRAFFIC TICKET\n"
 459 |    ]
 460 |   },
 461 |   {
 462 |    "cell_type": "code",
 463 |    "execution_count": null,
 464 |    "id": "a288f332",
 465 |    "metadata": {},
 466 |    "outputs": [],
 467 |    "source": [
 468 |     "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n",
 469 |     "        'INCOME06','HONEST','TICKET']\n",
 470 |     "\n",
 471 |     "raw[cols].isna().mean()*100"
 472 |    ]
 473 |   },
 474 |   {
 475 |    "cell_type": "code",
 476 |    "execution_count": null,
 477 |    "id": "b6de3c2f",
 478 |    "metadata": {},
 479 |    "outputs": [],
 480 |    "source": [
 481 |     "(raw\n",
 482 |     " [cols]\n",
 483 |     " .isna()\n",
 484 |     " .mean()*100\n",
 485 |     ")"
 486 |    ]
 487 |   },
 488 |   {
 489 |    "cell_type": "code",
 490 |    "execution_count": null,
 491 |    "id": "a284ba6e",
 492 |    "metadata": {},
 493 |    "outputs": [],
 494 |    "source": [
 495 |     "MAJOR= '''RESPONSE PUNCH 1972-82 1982B 1983-87 1987B 1988-91 1993-98 2000-04 2006 2008 2010 2012 2014 2016 2018 ALL\n",
 496 |     "Accounting/bookkeeping 1 0 0 0 0 0 0 0 0 0 0 28 32 30 29 119\n",
 497 |     "Advertising 2 0 0 0 0 0 0 0 0 0 0 3 2 0 0 5\n",
 498 |     "Agriculture/horticulture 3 0 0 0 0 0 0 0 0 0 0 8 2 7 5 22\n",
 499 |     "Allied health 4 0 0 0 0 0 0 0 0 0 0 0 2 1 0 3\n",
 500 |     "Anthropology 5 0 0 0 0 0 0 0 0 0 0 3 5 1 1 10\n",
 501 |     "Architecture 6 0 0 0 0 0 0 0 0 0 0 2 3 5 3 13\n",
 502 |     "Art 7 0 0 0 0 0 0 0 0 0 0 6 7 11 10 34\n",
 503 |     "Biology 8 0 0 0 0 0 0 0 0 0 0 16 22 33 26 97\n",
 504 |     "Business administration 9 0 0 0 0 0 0 0 0 0 0 90 142 172 138 542\n",
 505 |     "Chemistry 11 0 0 0 0 0 0 0 0 0 0 5 8 10 4 27\n",
 506 |     "Communications/speech 12 0 0 0 0 0 0 0 0 0 0 20 18 26 18 82\n",
 507 |     "Comm. disorders 13 0 0 0 0 0 0 0 0 0 0 4 6 2 2 14\n",
 508 |     "Computer science 14 0 0 0 0 0 0 0 0 0 0 25 24 33 17 99\n",
 509 |     "Dentistry 15 0 0 0 0 0 0 0 0 0 0 2 4 3 5 14\n",
 510 |     "Education 16 0 0 0 0 0 0 0 0 0 0 73 91 97 79 340\n",
 511 |     "Economics 17 0 0 0 0 0 0 0 0 0 0 11 19 13 19 62\n",
 512 |     "Engineering 18 0 0 0 0 0 0 0 0 0 0 47 49 47 54 197\n",
 513 |     "English 19 0 0 0 0 0 0 0 0 0 0 23 26 27 24 100\n",
 514 |     "Finance 20 0 0 0 0 0 0 0 0 0 0 7 15 14 16 52\n",
 515 |     "Foreign language 21 0 0 0 0 0 0 0 0 0 0 4 8 6 5 23\n",
 516 |     "Forestry 22 0 0 0 0 0 0 0 0 0 0 1 0 3 0 4\n",
 517 |     "Geography 23 0 0 0 0 0 0 0 0 0 0 0 2 2 4 8\n",
 518 |     "Geology 24 0 0 0 0 0 0 0 0 0 0 1 3 4 2 10\n",
 519 |     "History 25 0 0 0 0 0 0 0 0 0 0 10 19 14 19 62\n",
 520 |     "Home economics 26 0 0 0 0 0 0 0 0 0 0 0 0 3 2 5\n",
 521 |     "Industry & techn 27 0 0 0 0 0 0 0 0 0 0 3 4 6 0 13\n",
 522 |     "Journalism 28 0 0 0 0 0 0 0 0 0 0 5 6 6 4 21\n",
 523 |     "Law 29 0 0 0 0 0 0 0 0 0 0 13 18 23 14 68\n",
 524 |     "Law enforcement 30 0 0 0 0 0 0 0 0 0 0 3 5 4 2 14\n",
 525 |     "Library science 31 0 0 0 0 0 0 0 0 0 0 4 5 2 3 14\n",
 526 |     "Marketing 32 0 0 0 0 0 0 0 0 0 0 11 15 13 12 51\n",
 527 |     "Mathematics 33 0 0 0 0 0 0 0 0 0 0 5 10 12 5 32\n",
 528 |     "Medicine 34 0 0 0 0 0 0 0 0 0 0 9 25 12 11 57\n",
 529 |     "Music 35 0 0 0 0 0 0 0 0 0 0 4 2 10 2 18\n",
 530 |     "Nursing 36 0 0 0 0 0 0 0 0 0 0 36 39 60 51 186\n",
 531 |     "Optometry 37 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
 532 |     "Pharmacy 38 0 0 0 0 0 0 0 0 0 0 2 5 4 4 15\n",
 533 |     "Philosophy 39 0 0 0 0 0 0 0 0 0 0 2 0 2 2 6\n",
 534 |     "Physical education 40 0 0 0 0 0 0 0 0 0 0 9 6 16 6 37\n",
 535 |     "Physics 41 0 0 0 0 0 0 0 0 0 0 3 6 7 4 20\n",
 536 |     "Psychology 42 0 0 0 0 0 0 0 0 0 0 32 32 34 29 127\n",
 537 |     "Political science/international relations 43 0 0 0 0 0 0 0 0 0 0 16 22 19 14 71\n",
 538 |     "Sociology 44 0 0 0 0 0 0 0 0 0 0 9 15 10 12 46\n",
 539 |     "Special education 45 0 0 0 0 0 0 0 0 0 0 5 3 5 2 15\n",
 540 |     "Theater arts 46 0 0 0 0 0 0 0 0 0 0 6 2 3 1 12\n",
 541 |     "Theology 47 0 0 0 0 0 0 0 0 0 0 6 6 13 8 33\n",
 542 |     "Veterinary medicine 48 0 0 0 0 0 0 0 0 0 0 1 5 3 4 13\n",
 543 |     "Liberal arts 49 0 0 0 0 0 0 0 0 0 0 8 16 12 10 46\n",
 544 |     "Other 50 0 0 0 0 0 0 0 0 0 0 8 10 21 27 66\n",
 545 |     "General sciences 51 0 0 0 0 0 0 0 0 0 0 10 13 15 14 52\n",
 546 |     "Social work 52 0 0 0 0 0 0 0 0 0 0 7 17 24 7 55\n",
 547 |     "General studies 53 0 0 0 0 0 0 0 0 0 0 2 5 7 7 21\n",
 548 |     "Other vocational 54 0 0 0 0 0 0 0 0 0 0 5 11 6 5 27\n",
 549 |     "Health 55 0 0 0 0 0 0 0 0 0 0 23 31 31 42 127\n",
 550 |     "Industrial Relations 56 0 0 0 0 0 0 0 0 0 0 1 0 0 3 4\n",
 551 |     "Child/Human/Family Development 57 0 0 0 0 0 0 0 0 0 0 11 3 7 7 28\n",
 552 |     "Food Science/Nutrition/Culinary Arts 58 0 0 0 0 0 0 0 0 0 0 3 6 9 9 27\n",
 553 |     "Environmental Science/Ecology 59 0 0 0 0 0 0 0 0 0 0 5 5 6 8 24\n",
 554 |     "Social Sciences 60 0 0 0 0 0 0 0 0 0 0 4 2 7 5 18\n",
 555 |     "Human Services/Human Resources 61 0 0 0 0 0 0 0 0 0 0 3 7 7 5 22\n",
 556 |     "Visual Arts/Graphic Design/Design and Drafting 62 0 0 0 0 0 0 0 0 0 0 3 8 9 10 30\n",
 557 |     "Fine Arts 63 0 0 0 0 0 0 0 0 0 0 4 5 5 6 20\n",
 558 |     "Humanities 64 0 0 0 0 0 0 0 0 0 0 0 2 0 1 3\n",
 559 |     "Ethnic studies 65 0 0 0 0 0 0 0 0 0 0 3 1 0 0 4\n",
 560 |     "Educational administration 66 0 0 0 0 0 0 0 0 0 0 3 4 8 9 24\n",
 561 |     "Television/Film 67 0 0 0 0 0 0 0 0 0 0 0 2 6 1 9\n",
 562 |     "Aviation/Aeronatics 68 0 0 0 0 0 0 0 0 0 0 2 1 1 3 7\n",
 563 |     "Statistics/Biostatistics 69 0 0 0 0 0 0 0 0 0 0 0 0 2 2 4\n",
 564 |     "Criminology/Criminal Justice 70 0 0 0 0 0 0 0 0 0 0 13 17 17 13 60\n",
 565 |     "Administrative Science/Public Administration 71 0 0 0 0 0 0 0 0 0 0 2 11 3 5 21\n",
 566 |     "Electronics 72 0 0 0 0 0 0 0 0 0 0 6 6 5 9 26\n",
 567 |     "Urban and Regional Planning 73 0 0 0 0 0 0 0 0 0 0 1 1 3 2 7\n",
 568 |     "Mechanics/Machine Trade 74 0 0 0 0 0 0 0 0 0 0 0 1 1 4 6\n",
 569 |     "Dance 75 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n",
 570 |     "Gerontology 76 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n",
 571 |     "Public Relations 77 0 0 0 0 0 0 0 0 0 0 3 1 2 1 7\n",
 572 |     "Textiles/Cloth 78 0 0 0 0 0 0 0 0 0 0 3 4 0 0 7\n",
 573 |     "Parks and Recreation 79 0 0 0 0 0 0 0 0 0 0 1 2 1 0 4\n",
 574 |     "Information Technology 80 0 0 0 0 0 0 0 0 0 0 0 5 8 11 24\n",
 575 |     "Fashion 81 0 0 0 0 0 0 0 0 0 0 0 0 3 1 4\n",
 576 |     "Counseling 82 0 0 0 0 0 0 0 0 0 0 0 0 11 9 20\n",
 577 |     "Don't know/UNCODED 98 0 0 0 0 0 0 0 0 0 0 2 3 0 0 5\n",
 578 |     "No answer 99 0 0 0 0 0 0 0 0 0 0 0 1 5 3 9\n",
 579 |     "Not applicable 0 13626 354 7542 353 5907 10334 8394 4510 2023 2044 1263 1597 1795 1435 61177'''\n",
 580 |     "\n",
 581 |     "# copy-pasted, with a slight tweak, from page 186\n",
 582 |     "major_dict = {int(row.split()[-16]): ' '.join(row.split()[:-16])  for row in MAJOR.split('\\n')[1:]}\n",
 583 |     "major_dict"
 584 |    ]
 585 |   },
 586 |   {
 587 |    "cell_type": "code",
 588 |    "execution_count": null,
 589 |    "id": "fd16476a",
 590 |    "metadata": {},
 591 |    "outputs": [],
 592 |    "source": [
 593 |     "raw.MAJOR1.value_counts()"
 594 |    ]
 595 |   },
 596 |   {
 597 |    "cell_type": "code",
 598 |    "execution_count": null,
 599 |    "id": "382102ad",
 600 |    "metadata": {},
 601 |    "outputs": [],
 602 |    "source": [
 603 |     "(raw\n",
 604 |     " [cols]\n",
 605 |     " .assign(\n",
 606 |     "     MAJOR1=raw.MAJOR1.fillna(99).astype('int').replace(major_dict),\n",
 607 |     "     SEX=raw.SEX#\n",
 608 |     "           \n",
 609 |     "           .astype(int)\n",
 610 |     "           .replace({1:'Male', 2:'Female'}),\n",
 611 |     "     RACE=raw.RACE.astype(int).replace({1:'White', 2:'Black', 3:'Other'}),\n",
 612 |     "     OCC=raw.OCC.fillna(9999).astype(int),\n",
 613 |     "     BORN=raw.BORN.fillna(4).astype(int).replace({1:'Yes', 2:'No', 3:'Don\\'t Know',\n",
 614 |     "                                                    4:'No answer', 5:'Not applicable'}),\n",
 615 |     "     INCOME=raw.INCOME.fillna(99).astype(int).replace({99:'No answer', **dict(enumerate(['Not applicable',\n",
 616 |     "                                                                                  0,1000,3000,4000,5000,6000,\n",
 617 |     "                                                                                  7000,8000,10000,15000,20000,25000,]))}),\n",
 618 |     "     INCOME06=raw.INCOME06.fillna(26).astype(int).replace({26:'Refused', **dict(enumerate(['Not applicable',\n",
 619 |     "                                                                                  0,1000,3000,4000,5000,6000,\n",
 620 |     "                                                                                  7000,8000,10000,12500,15000,\n",
 621 |     "                                                                                  17500,20000,22500,25000,30_000,\n",
 622 |     "                                                                                  35_000, 40_000, 50_000, 60_000,\n",
 623 |     "                                                                                 75_000, 90_000, 110_000, 130_000,\n",
 624 |     "                                                                                 150_000]))}),\n",
 625 |     "     HONEST=raw.HONEST.fillna(9).astype(int).replace({1:'Most desirable', 2:'3 most desireable',\n",
 626 |     "                                                                   3:'Not mentioned', 4:  '3 least desireable',\n",
 627 |     "                                                                   5: 'One least desireable',\n",
 628 |     "                                                                    9:'No answer'}),\n",
 629 |     "     TICKET=raw.TICKET.fillna(9).astype(int).replace({1:'Yes', 2:'No', 3:'Refused', 9: 'No answer'}),\n",
 630 |     "     )\n",
 631 |     " .astype({'YEAR':int, 'ID': 'uint16[pyarrow]'})\n",
 632 |     " .to_csv('GSS.csv')\n",
 633 |     ")"
 634 |    ]
 635 |   },
 636 |   {
 637 |    "cell_type": "code",
 638 |    "execution_count": null,
 639 |    "id": "8366b83d",
 640 |    "metadata": {},
 641 |    "outputs": [],
 642 |    "source": []
 643 |   },
 644 |   {
 645 |    "cell_type": "code",
 646 |    "execution_count": null,
 647 |    "id": "be251648",
 648 |    "metadata": {},
 649 |    "outputs": [],
 650 |    "source": []
 651 |   },
 652 |   {
 653 |    "cell_type": "markdown",
 654 |    "id": "394409a6",
 655 |    "metadata": {},
 656 |    "source": [
 657 |     "## Types\n",
 658 |     "Getting the right types will enable analysis and correctness.\n"
 659 |    ]
 660 |   },
 661 |   {
 662 |    "cell_type": "code",
 663 |    "execution_count": null,
 664 |    "id": "bc415e79",
 665 |    "metadata": {},
 666 |    "outputs": [],
 667 |    "source": [
 668 |     "%%time\n",
 669 |     "gss = pd.read_csv('GSS.csv', index_col=0, dtype_backend='pyarrow', engine='pyarrow')"
 670 |    ]
 671 |   },
 672 |   {
 673 |    "cell_type": "code",
 674 |    "execution_count": null,
 675 |    "id": "b6391a1b",
 676 |    "metadata": {
 677 |     "pycharm": {
 678 |      "name": "#%%\n"
 679 |     }
 680 |    },
 681 |    "outputs": [],
 682 |    "source": [
 683 |     "gss.dtypes"
 684 |    ]
 685 |   },
 686 |   {
 687 |    "cell_type": "code",
 688 |    "execution_count": null,
 689 |    "id": "fe39dcbf",
 690 |    "metadata": {},
 691 |    "outputs": [],
 692 |    "source": [
 693 |     "gss"
 694 |    ]
 695 |   },
 696 |   {
 697 |    "cell_type": "code",
 698 |    "execution_count": null,
 699 |    "id": "883a91c7",
 700 |    "metadata": {
 701 |     "pycharm": {
 702 |      "name": "#%%\n"
 703 |     }
 704 |    },
 705 |    "outputs": [],
 706 |    "source": [
 707 |     "gss.memory_usage(deep=True)"
 708 |    ]
 709 |   },
 710 |   {
 711 |    "cell_type": "code",
 712 |    "execution_count": null,
 713 |    "id": "6f564675",
 714 |    "metadata": {
 715 |     "pycharm": {
 716 |      "name": "#%%\n"
 717 |     }
 718 |    },
 719 |    "outputs": [],
 720 |    "source": [
 721 |     "# 36 M (pandas 1)\n",
 722 |     "# 8.6 M (Pandas 2)\n",
 723 |     "gss.memory_usage(deep=True).sum()"
 724 |    ]
 725 |   },
 726 |   {
 727 |    "cell_type": "code",
 728 |    "execution_count": null,
 729 |    "id": "f0a102d3",
 730 |    "metadata": {
 731 |     "lines_to_next_cell": 2,
 732 |     "pycharm": {
 733 |      "name": "#%%\n"
 734 |     }
 735 |    },
 736 |    "outputs": [],
 737 |    "source": []
 738 |   },
 739 |   {
 740 |    "cell_type": "markdown",
 741 |    "id": "ea3e9a77",
 742 |    "metadata": {
 743 |     "pycharm": {
 744 |      "name": "#%% md\n"
 745 |     }
 746 |    },
 747 |    "source": [
 748 |     "## Ints"
 749 |    ]
 750 |   },
 751 |   {
 752 |    "cell_type": "code",
 753 |    "execution_count": null,
 754 |    "id": "c6a0a74f",
 755 |    "metadata": {
 756 |     "pycharm": {
 757 |      "name": "#%%\n"
 758 |     }
 759 |    },
 760 |    "outputs": [],
 761 |    "source": [
 762 |     "gss.select_dtypes(int).describe()"
 763 |    ]
 764 |   },
 765 |   {
 766 |    "cell_type": "code",
 767 |    "execution_count": null,
 768 |    "id": "04eb0242",
 769 |    "metadata": {
 770 |     "pycharm": {
 771 |      "name": "#%%\n"
 772 |     }
 773 |    },
 774 |    "outputs": [],
 775 |    "source": [
 776 |     "# chaining\n",
 777 |     "(gss\n",
 778 |     " .select_dtypes(int)\n",
 779 |     " .describe()\n",
 780 |     ")"
 781 |    ]
 782 |   },
 783 |   {
 784 |    "cell_type": "code",
 785 |    "execution_count": null,
 786 |    "id": "70ce85a3",
 787 |    "metadata": {
 788 |     "pycharm": {
 789 |      "name": "#%%\n"
 790 |     }
 791 |    },
 792 |    "outputs": [],
 793 |    "source": [
 794 |     "# can the integer columns be an int8?\n",
 795 |     "# Do completion on int\n",
 796 |     "np.iinfo(np.int)"
 797 |    ]
 798 |   },
 799 |   {
 800 |    "cell_type": "code",
 801 |    "execution_count": null,
 802 |    "id": "459e3d43",
 803 |    "metadata": {
 804 |     "pycharm": {
 805 |      "name": "#%%\n"
 806 |     }
 807 |    },
 808 |    "outputs": [],
 809 |    "source": [
 810 |     "np.iinfo(np.uint8)"
 811 |    ]
 812 |   },
 813 |   {
 814 |    "cell_type": "code",
 815 |    "execution_count": null,
 816 |    "id": "5fbb6683",
 817 |    "metadata": {
 818 |     "pycharm": {
 819 |      "name": "#%%\n"
 820 |     }
 821 |    },
 822 |    "outputs": [],
 823 |    "source": [
 824 |     "np.iinfo(np.uint16)"
 825 |    ]
 826 |   },
 827 |   {
 828 |    "cell_type": "code",
 829 |    "execution_count": null,
 830 |    "id": "15a94872",
 831 |    "metadata": {
 832 |     "pycharm": {
 833 |      "name": "#%%\n"
 834 |     }
 835 |    },
 836 |    "outputs": [],
 837 |    "source": [
 838 |     "# chaining\n",
 839 |     "(gss\n",
 840 |     " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' })\n",
 841 |     " .select_dtypes(['uint16'])\n",
 842 |     " .describe()\n",
 843 |     ")"
 844 |    ]
 845 |   },
 846 |   {
 847 |    "cell_type": "code",
 848 |    "execution_count": null,
 849 |    "id": "24a7fea9",
 850 |    "metadata": {
 851 |     "lines_to_next_cell": 2,
 852 |     "pycharm": {
 853 |      "name": "#%%\n"
 854 |     }
 855 |    },
 856 |    "outputs": [],
 857 |    "source": [
 858 |     "# chaining\n",
 859 |     "# use 'integer' to see all int-like columns\n",
 860 |     "(gss\n",
 861 |     " .astype({#'YEAR': 'uint16[pyarrow]',\n",
 862 |     "          'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n",
 863 |     " .select_dtypes(['integer'])  # see https://numpy.org/doc/stable/reference/arrays.scalars.html\n",
 864 |     " .describe()\n",
 865 |     ")"
 866 |    ]
 867 |   },
 868 |   {
 869 |    "cell_type": "code",
 870 |    "execution_count": null,
 871 |    "id": "fb27bf58",
 872 |    "metadata": {
 873 |     "lines_to_next_cell": 2,
 874 |     "pycharm": {
 875 |      "name": "#%%\n"
 876 |     }
 877 |    },
 878 |    "outputs": [],
 879 |    "source": [
 880 |     "# Inspect memory usage\n",
 881 |     "(gss\n",
 882 |     " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n",
 883 |     " .memory_usage(deep=True)\n",
 884 |     " .sum()  # was 36M\n",
 885 |     ")"
 886 |    ]
 887 |   },
 888 |   {
 889 |    "cell_type": "code",
 890 |    "execution_count": null,
 891 |    "id": "b8a61fa7",
 892 |    "metadata": {
 893 |     "lines_to_next_cell": 2,
 894 |     "pycharm": {
 895 |      "name": "#%%\n"
 896 |     }
 897 |    },
 898 |    "outputs": [],
 899 |    "source": []
 900 |   },
 901 |   {
 902 |    "cell_type": "markdown",
 903 |    "id": "fdd212a2",
 904 |    "metadata": {},
 905 |    "source": [
 906 |     "## Int Exercise\n",
 907 |     "* Try converting *YEAR* to `'int8'`. What do the values look like?\n",
 908 |     "* Try converting *YEAR* to `'int8[pyarrow]'`. What do the values look like?"
 909 |    ]
 910 |   },
 911 |   {
 912 |    "cell_type": "code",
 913 |    "execution_count": null,
 914 |    "id": "370c2e84",
 915 |    "metadata": {},
 916 |    "outputs": [],
 917 |    "source": []
 918 |   },
 919 |   {
 920 |    "cell_type": "code",
 921 |    "execution_count": null,
 922 |    "id": "09cdc54a",
 923 |    "metadata": {},
 924 |    "outputs": [],
 925 |    "source": []
 926 |   },
 927 |   {
 928 |    "cell_type": "markdown",
 929 |    "id": "5df9482a",
 930 |    "metadata": {},
 931 |    "source": [
 932 |     "## Floats"
 933 |    ]
 934 |   },
 935 |   {
 936 |    "cell_type": "code",
 937 |    "execution_count": null,
 938 |    "id": "fab11648",
 939 |    "metadata": {
 940 |     "pycharm": {
 941 |      "name": "#%%\n"
 942 |     }
 943 |    },
 944 |    "outputs": [],
 945 |    "source": [
 946 |     "(gss\n",
 947 |     ".select_dtypes('float'))"
 948 |    ]
 949 |   },
 950 |   {
 951 |    "cell_type": "code",
 952 |    "execution_count": null,
 953 |    "id": "5c8490a5",
 954 |    "metadata": {
 955 |     "pycharm": {
 956 |      "name": "#%%\n"
 957 |     }
 958 |    },
 959 |    "outputs": [],
 960 |    "source": [
 961 |     "# surprise! age and hours worked look int-like\n",
 962 |     "gss.HRS1.describe()"
 963 |    ]
 964 |   },
 965 |   {
 966 |    "cell_type": "code",
 967 |    "execution_count": null,
 968 |    "id": "6f357afd",
 969 |    "metadata": {
 970 |     "pycharm": {
 971 |      "name": "#%%\n"
 972 |     }
 973 |    },
 974 |    "outputs": [],
 975 |    "source": [
 976 |     "# oops! missing values\n",
 977 |     "gss.HRS1.value_counts(dropna=False)"
 978 |    ]
 979 |   },
 980 |   {
 981 |    "cell_type": "code",
 982 |    "execution_count": null,
 983 |    "id": "ce14eddb",
 984 |    "metadata": {
 985 |     "pycharm": {
 986 |      "name": "#%%\n"
 987 |     }
 988 |    },
 989 |    "outputs": [],
 990 |    "source": [
 991 |     "# where are they missing?\n",
 992 |     "(gss\n",
 993 |     "  .query('HRS1.isna()')\n",
 994 |     ")"
 995 |    ]
 996 |   },
 997 |   {
 998 |    "cell_type": "code",
 999 |    "execution_count": null,
1000 |    "id": "dbe482fb",
1001 |    "metadata": {
1002 |     "pycharm": {
1003 |      "name": "#%%\n"
1004 |     }
1005 |    },
1006 |    "outputs": [],
1007 |    "source": [
1008 |     "# where are they missing?\n",
1009 |     "(gss\n",
1010 |     "  .query('AGE.isna()')\n",
1011 |     ")"
1012 |    ]
1013 |   },
1014 |   {
1015 |    "cell_type": "code",
1016 |    "execution_count": null,
1017 |    "id": "36817f9c",
1018 |    "metadata": {
1019 |     "pycharm": {
1020 |      "name": "#%%\n"
1021 |     }
1022 |    },
1023 |    "outputs": [],
1024 |    "source": [
1025 |     "# where are they missing?\n",
1026 |     "# It turns out that ID is not consistent across years\n",
1027 |     "(gss\n",
1028 |     "  .query('ID == 229')\n",
1029 |     ")"
1030 |    ]
1031 |   },
1032 |   {
1033 |    "cell_type": "code",
1034 |    "execution_count": null,
1035 |    "id": "75b3c785",
1036 |    "metadata": {
1037 |     "lines_to_next_cell": 2,
1038 |     "pycharm": {
1039 |      "name": "#%%\n"
1040 |     }
1041 |    },
1042 |    "outputs": [],
1043 |    "source": [
1044 |     "# Convert to integers\n",
1045 |     "(gss\n",
1046 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1047 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n",
1048 |     ")"
1049 |    ]
1050 |   },
1051 |   {
1052 |    "cell_type": "code",
1053 |    "execution_count": null,
1054 |    "id": "8e5bc829",
1055 |    "metadata": {
1056 |     "lines_to_next_cell": 2,
1057 |     "pycharm": {
1058 |      "name": "#%%\n"
1059 |     }
1060 |    },
1061 |    "outputs": [],
1062 |    "source": [
1063 |     "(gss\n",
1064 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1065 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n",
1066 |     " .memory_usage(deep=True)\n",
1067 |     " .sum()  # was 36M  \n",
1068 |     ")"
1069 |    ]
1070 |   },
1071 |   {
1072 |    "cell_type": "code",
1073 |    "execution_count": null,
1074 |    "id": "7642b746",
1075 |    "metadata": {
1076 |     "lines_to_next_cell": 2,
1077 |     "pycharm": {
1078 |      "name": "#%%\n"
1079 |     }
1080 |    },
1081 |    "outputs": [],
1082 |    "source": []
1083 |   },
1084 |   {
1085 |    "cell_type": "markdown",
1086 |    "id": "5ca5890d",
1087 |    "metadata": {},
1088 |    "source": [
1089 |     "## Float Exercise\n",
1090 |     "\n",
1091 |     "* What is the mean of the numeric columns?\n",
1092 |     "* How many values are missing in the numeric columns?"
1093 |    ]
1094 |   },
1095 |   {
1096 |    "cell_type": "code",
1097 |    "execution_count": null,
1098 |    "id": "fab90ba5",
1099 |    "metadata": {
1100 |     "lines_to_next_cell": 2
1101 |    },
1102 |    "outputs": [],
1103 |    "source": []
1104 |   },
1105 |   {
1106 |    "cell_type": "markdown",
1107 |    "id": "a083aa86",
1108 |    "metadata": {},
1109 |    "source": [
1110 |     "## Objects"
1111 |    ]
1112 |   },
1113 |   {
1114 |    "cell_type": "code",
1115 |    "execution_count": null,
1116 |    "id": "490050a8",
1117 |    "metadata": {
1118 |     "pycharm": {
1119 |      "name": "#%%\n"
1120 |     }
1121 |    },
1122 |    "outputs": [],
1123 |    "source": [
1124 |     "# pandas 1.x\n",
1125 |     "(gss\n",
1126 |     " .select_dtypes(object)\n",
1127 |     ")"
1128 |    ]
1129 |   },
1130 |   {
1131 |    "cell_type": "code",
1132 |    "execution_count": null,
1133 |    "id": "0363a462",
1134 |    "metadata": {
1135 |     "pycharm": {
1136 |      "name": "#%%\n"
1137 |     }
1138 |    },
1139 |    "outputs": [],
1140 |    "source": [
1141 |     "# pandas 2\n",
1142 |     "(gss\n",
1143 |     " .select_dtypes('string') # str doesn't work\n",
1144 |     ")"
1145 |    ]
1146 |   },
1147 |   {
1148 |    "cell_type": "code",
1149 |    "execution_count": null,
1150 |    "id": "a5f79d7f",
1151 |    "metadata": {
1152 |     "pycharm": {
1153 |      "name": "#%%\n"
1154 |     }
1155 |    },
1156 |    "outputs": [],
1157 |    "source": [
1158 |     "# My go-to method - .value_counts\n",
1159 |     "# looks categorical\n",
1160 |     "(gss.MAJOR1.value_counts(dropna=False))"
1161 |    ]
1162 |   },
1163 |   {
1164 |    "cell_type": "code",
1165 |    "execution_count": null,
1166 |    "id": "72004253",
1167 |    "metadata": {
1168 |     "lines_to_next_cell": 2,
1169 |     "pycharm": {
1170 |      "name": "#%%\n"
1171 |     }
1172 |    },
1173 |    "outputs": [],
1174 |    "source": [
1175 |     "(gss\n",
1176 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1177 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1178 |     "         'MAJOR1': 'category'})\n",
1179 |     " .memory_usage(deep=True)\n",
1180 |     " .sum()  # was 36M  \n",
1181 |     ")"
1182 |    ]
1183 |   },
1184 |   {
1185 |    "cell_type": "code",
1186 |    "execution_count": null,
1187 |    "id": "1b38e847",
1188 |    "metadata": {},
1189 |    "outputs": [],
1190 |    "source": [
1191 |     "(gss\n",
1192 |     " .select_dtypes(object)\n",
1193 |     " .columns\n",
1194 |     ")"
1195 |    ]
1196 |   },
1197 |   {
1198 |    "cell_type": "code",
1199 |    "execution_count": null,
1200 |    "id": "c4529135",
1201 |    "metadata": {
1202 |     "lines_to_next_cell": 0,
1203 |     "pycharm": {
1204 |      "name": "#%%\n"
1205 |     }
1206 |    },
1207 |    "outputs": [],
1208 |    "source": [
1209 |     "# wow!\n",
1210 |     "(gss\n",
1211 |     "  .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1212 |     "         'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1213 |     "         'MAJOR1': 'category',\n",
1214 |     "          **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1215 |     "                'INCOME', 'INCOME06', 'HONEST','TICKET']}})           \n",
1216 |     " .memory_usage(deep=True)\n",
1217 |     " .sum()  # was 36M  \n",
1218 |     ")"
1219 |    ]
1220 |   },
1221 |   {
1222 |    "cell_type": "code",
1223 |    "execution_count": null,
1224 |    "id": "6df39625",
1225 |    "metadata": {},
1226 |    "outputs": [],
1227 |    "source": []
1228 |   },
1229 |   {
1230 |    "cell_type": "code",
1231 |    "execution_count": null,
1232 |    "id": "2bd53a07",
1233 |    "metadata": {
1234 |     "lines_to_next_cell": 2
1235 |    },
1236 |    "outputs": [],
1237 |    "source": []
1238 |   },
1239 |   {
1240 |    "cell_type": "markdown",
1241 |    "id": "5c96b300",
1242 |    "metadata": {},
1243 |    "source": [
1244 |     "## String and Category Exercises\n",
1245 |     "* There is a `.cat` attribute on the category columns. What can you do with this attribute? (Use `dir` or tab completion to inspect).\n",
1246 |     "* Categories can be ordered. How do you order *INCOME*?\n",
1247 |     "* There is a `.str` attribute on the string and category columns. What can you do with this attribute? (Use `dir` or tab completion to inspect).\n",
1248 |     "* Uppercase the values in the *TICKET* column."
1249 |    ]
1250 |   },
1251 |   {
1252 |    "cell_type": "code",
1253 |    "execution_count": null,
1254 |    "id": "91a86c01",
1255 |    "metadata": {},
1256 |    "outputs": [],
1257 |    "source": []
1258 |   },
1259 |   {
1260 |    "cell_type": "code",
1261 |    "execution_count": null,
1262 |    "id": "aeab919f",
1263 |    "metadata": {},
1264 |    "outputs": [],
1265 |    "source": [
1266 |     " "
1267 |    ]
1268 |   },
1269 |   {
1270 |    "cell_type": "code",
1271 |    "execution_count": null,
1272 |    "id": "2af1bf07",
1273 |    "metadata": {},
1274 |    "outputs": [],
1275 |    "source": []
1276 |   },
1277 |   {
1278 |    "cell_type": "code",
1279 |    "execution_count": null,
1280 |    "id": "0d0e9df0",
1281 |    "metadata": {},
1282 |    "outputs": [],
1283 |    "source": []
1284 |   },
1285 |   {
1286 |    "cell_type": "code",
1287 |    "execution_count": null,
1288 |    "id": "41672375",
1289 |    "metadata": {},
1290 |    "outputs": [],
1291 |    "source": []
1292 |   },
1293 |   {
1294 |    "cell_type": "code",
1295 |    "execution_count": null,
1296 |    "id": "d51ca7d3",
1297 |    "metadata": {},
1298 |    "outputs": [],
1299 |    "source": []
1300 |   },
1301 |   {
1302 |    "cell_type": "code",
1303 |    "execution_count": null,
1304 |    "id": "42dcf06d",
1305 |    "metadata": {},
1306 |    "outputs": [],
1307 |    "source": []
1308 |   },
1309 |   {
1310 |    "cell_type": "code",
1311 |    "execution_count": null,
1312 |    "id": "d13689ff",
1313 |    "metadata": {},
1314 |    "outputs": [],
1315 |    "source": []
1316 |   },
1317 |   {
1318 |    "cell_type": "code",
1319 |    "execution_count": null,
1320 |    "id": "28320f9c",
1321 |    "metadata": {},
1322 |    "outputs": [],
1323 |    "source": []
1324 |   },
1325 |   {
1326 |    "cell_type": "code",
1327 |    "execution_count": null,
1328 |    "id": "7e74d51d",
1329 |    "metadata": {},
1330 |    "outputs": [],
1331 |    "source": []
1332 |   },
1333 |   {
1334 |    "cell_type": "code",
1335 |    "execution_count": null,
1336 |    "id": "5cc2eedf",
1337 |    "metadata": {},
1338 |    "outputs": [],
1339 |    "source": []
1340 |   },
1341 |   {
1342 |    "cell_type": "code",
1343 |    "execution_count": null,
1344 |    "id": "8e0408ab",
1345 |    "metadata": {},
1346 |    "outputs": [],
1347 |    "source": []
1348 |   },
1349 |   {
1350 |    "cell_type": "markdown",
1351 |    "id": "f0d47559",
1352 |    "metadata": {},
1353 |    "source": [
1354 |     "## Make a Function"
1355 |    ]
1356 |   },
1357 |   {
1358 |    "cell_type": "code",
1359 |    "execution_count": null,
1360 |    "id": "48b7fd47",
1361 |    "metadata": {
1362 |     "lines_to_next_cell": 2,
1363 |     "pycharm": {
1364 |      "name": "#%%\n"
1365 |     }
1366 |    },
1367 |    "outputs": [],
1368 |    "source": [
1369 |     "# a glorious function\n",
1370 |     "# add ordered categories to this\n",
1371 |     "def tweak_gss(gss):\n",
1372 |     "    return (gss\n",
1373 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1374 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1375 |     "             'MAJOR1': 'category',\n",
1376 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1377 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1378 |     "               )\n",
1379 |     "\n",
1380 |     "tweak_gss(gss)"
1381 |    ]
1382 |   },
1383 |   {
1384 |    "cell_type": "markdown",
1385 |    "id": "3a3cc33c",
1386 |    "metadata": {},
1387 |    "source": [
1388 |     "## Function Exercise\n",
1389 |     "* Rearrange your notebook. Put the imports, code to load raw data, and tweak function at the top of the notebook. Restart the kernel and validate that your code works."
1390 |    ]
1391 |   },
1392 |   {
1393 |    "cell_type": "code",
1394 |    "execution_count": null,
1395 |    "id": "5fabd502",
1396 |    "metadata": {},
1397 |    "outputs": [],
1398 |    "source": []
1399 |   },
1400 |   {
1401 |    "cell_type": "code",
1402 |    "execution_count": null,
1403 |    "id": "01e1d15d",
1404 |    "metadata": {},
1405 |    "outputs": [],
1406 |    "source": []
1407 |   },
1408 |   {
1409 |    "cell_type": "markdown",
1410 |    "id": "94863cf2",
1411 |    "metadata": {
1412 |     "lines_to_next_cell": 2
1413 |    },
1414 |    "source": [
1415 |     "## Fix Column Names"
1416 |    ]
1417 |   },
1418 |   {
1419 |    "cell_type": "code",
1420 |    "execution_count": null,
1421 |    "id": "489290dd",
1422 |    "metadata": {
1423 |     "lines_to_next_cell": 0,
1424 |     "pycharm": {
1425 |      "name": "#%%\n"
1426 |     }
1427 |    },
1428 |    "outputs": [],
1429 |    "source": [
1430 |     "# a glorious function\n",
1431 |     "def tweak_gss(gss):\n",
1432 |     "    return (gss\n",
1433 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1434 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1435 |     "             'MAJOR1': 'category',\n",
1436 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1437 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1438 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1439 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1440 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1441 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1442 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1443 |     "          'HONEST':'honesty_rank',\n",
1444 |     "          'TICKET':'traffic_ticket'})\n",
1445 |     "    )\n",
1446 |     "\n",
1447 |     "tweak_gss(gss)"
1448 |    ]
1449 |   },
1450 |   {
1451 |    "cell_type": "code",
1452 |    "execution_count": null,
1453 |    "id": "0f953444",
1454 |    "metadata": {
1455 |     "lines_to_next_cell": 2,
1456 |     "pycharm": {
1457 |      "name": "#%%\n"
1458 |     }
1459 |    },
1460 |    "outputs": [],
1461 |    "source": []
1462 |   },
1463 |   {
1464 |    "cell_type": "code",
1465 |    "execution_count": null,
1466 |    "id": "70e7086c",
1467 |    "metadata": {
1468 |     "lines_to_next_cell": 2,
1469 |     "pycharm": {
1470 |      "name": "#%%\n"
1471 |     }
1472 |    },
1473 |    "outputs": [],
1474 |    "source": []
1475 |   },
1476 |   {
1477 |    "cell_type": "markdown",
1478 |    "id": "dd6ea0f5",
1479 |    "metadata": {
1480 |     "pycharm": {
1481 |      "name": "#%% md\n"
1482 |     }
1483 |    },
1484 |    "source": [
1485 |     "## Chain\n",
1486 |     "\n",
1487 |     "Chaining is also called \"flow\" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.\n",
1488 |     "\n",
1489 |     "The chain should read like a recipe of ordered steps.\n",
1490 |     "\n",
1491 |     "(BTW, this is actually what we did above.)\n",
1492 |     "\n",
1493 |     "<div class='alert alert-warning'>\n",
1494 |     "    Hint: Leverage <tt>.pipe</tt> if you can't find a way to chain 😉🐼💪\n",
1495 |     "</div>\n",
1496 |     "    \n",
1497 |     "\n",
1498 |     "\n"
1499 |    ]
1500 |   },
1501 |   {
1502 |    "cell_type": "code",
1503 |    "execution_count": null,
1504 |    "id": "00e42106",
1505 |    "metadata": {
1506 |     "lines_to_next_cell": 0,
1507 |     "pycharm": {
1508 |      "name": "#%%\n"
1509 |     }
1510 |    },
1511 |    "outputs": [],
1512 |    "source": [
1513 |     "# a glorious function\n",
1514 |     "def tweak_gss(gss):\n",
1515 |     "    return (gss\n",
1516 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1517 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1518 |     "             'MAJOR1': 'category',\n",
1519 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1520 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1521 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1522 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1523 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1524 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1525 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1526 |     "          'HONEST':'honesty_rank',\n",
1527 |     "          'TICKET':'traffic_ticket'})\n",
1528 |     "    )\n",
1529 |     "\n",
1530 |     "tweak_gss(gss)"
1531 |    ]
1532 |   },
1533 |   {
1534 |    "cell_type": "code",
1535 |    "execution_count": null,
1536 |    "id": "9575a725",
1537 |    "metadata": {
1538 |     "pycharm": {
1539 |      "name": "#%%\n"
1540 |     }
1541 |    },
1542 |    "outputs": [],
1543 |    "source": [
1544 |     "# compare chain to this mess\n",
1545 |     "gss2 = gss.copy()\n",
1546 |     "year = gss.YEAR\n",
1547 |     "year_int = year.astype('uint16')\n",
1548 |     "gss2['year'] = year_int\n",
1549 |     "id = gss.ID\n",
1550 |     "id_int = id.astype('uint16')\n",
1551 |     "gss2['year_id'] = id_int\n",
1552 |     "occ = gss.OCC\n",
1553 |     "occ_int = occ.astype('uint16')\n",
1554 |     "gss2['occupation'] = occ_int\n",
1555 |     "\n",
1556 |     "# more of this"
1557 |    ]
1558 |   },
1559 |   {
1560 |    "cell_type": "code",
1561 |    "execution_count": null,
1562 |    "id": "adf6c5d3",
1563 |    "metadata": {
1564 |     "lines_to_next_cell": 0,
1565 |     "pycharm": {
1566 |      "name": "#%%\n"
1567 |     }
1568 |    },
1569 |    "outputs": [],
1570 |    "source": [
1571 |     "# easy to debug\n",
1572 |     "#  - assign to var (df3)\n",
1573 |     "#  - comment out\n",
1574 |     "#  - pipe to display\n",
1575 |     "\n",
1576 |     "\n",
1577 |     "from IPython.display import display\n",
1578 |     "\n",
1579 |     "def get_var(df, var_name):\n",
1580 |     "    globals()[var_name] = df\n",
1581 |     "    return df\n",
1582 |     "\n",
1583 |     "def tweak_gss(gss):\n",
1584 |     "    return (gss\n",
1585 |     "      .pipe(get_var, 'df3')   \n",
1586 |     "     .pipe(lambda df: print(df.shape) or df)                \n",
1587 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1588 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1589 |     "             'MAJOR1': 'category',\n",
1590 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1591 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1592 |     "     .pipe(lambda df: print(df.shape) or df)                            \n",
1593 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1594 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1595 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1596 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1597 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1598 |     "          'HONEST':'honesty_rank',\n",
1599 |     "          'TICKET':'traffic_ticket'})\n",
1600 |     "     .pipe(lambda df: print(df.shape) or df)                            \n",
1601 |     "    )\n",
1602 |     "\n",
1603 |     "tweak_gss(gss)"
1604 |    ]
1605 |   },
1606 |   {
1607 |    "cell_type": "code",
1608 |    "execution_count": null,
1609 |    "id": "eecbef14",
1610 |    "metadata": {
1611 |     "pycharm": {
1612 |      "name": "#%%\n"
1613 |     }
1614 |    },
1615 |    "outputs": [],
1616 |    "source": [
1617 |     "# inspect intermediate data frame\n",
1618 |     "df3"
1619 |    ]
1620 |   },
1621 |   {
1622 |    "cell_type": "markdown",
1623 |    "id": "22b9adc5",
1624 |    "metadata": {
1625 |     "pycharm": {
1626 |      "name": "#%%\n"
1627 |     }
1628 |    },
1629 |    "source": [
1630 |     "## Chain Exercise\n",
1631 |     "* Write a function that accepts a dataframe and an index value. It should print any rows that match the index and return the dataframe that was passed in.\n",
1632 |     "* Use the function with pipe after each step of the chain. Show the rows for index 2 and 64,813.\n",
1633 |     "\n",
1634 |     "\n",
1635 |     "\n",
1636 |     "\n",
1637 |     "\n",
1638 |     "\n",
1639 |     "\n",
1640 |     "\n",
1641 |     "## Don't Mutate\n",
1642 |     "\n",
1643 |     "> \"you are missing the point, inplace rarely actually does something inplace, you are thinking that you are saving memory but you are not.\"\n",
1644 |     ">\n",
1645 |     "> **jreback** - Pandas core dev\n",
1646 |     "\n",
1647 |     "\n",
1648 |     "\n",
1649 |     "https://github.com/pandas-dev/pandas/issues/16529#issuecomment-676518136\n",
1650 |     "\n",
1651 |     "* In general, no performance benefits\n",
1652 |     "* Prohibits chaining\n",
1653 |     "* ``SettingWithCopyWarning`` fun\n"
1654 |    ]
1655 |   },
1656 |   {
1657 |    "cell_type": "code",
1658 |    "execution_count": null,
1659 |    "id": "3ddb47c9",
1660 |    "metadata": {
1661 |     "lines_to_next_cell": 2,
1662 |     "pycharm": {
1663 |      "name": "#%%\n"
1664 |     }
1665 |    },
1666 |    "outputs": [],
1667 |    "source": []
1668 |   },
1669 |   {
1670 |    "cell_type": "code",
1671 |    "execution_count": null,
1672 |    "id": "46dd644b",
1673 |    "metadata": {
1674 |     "lines_to_next_cell": 2,
1675 |     "pycharm": {
1676 |      "name": "#%%\n"
1677 |     }
1678 |    },
1679 |    "outputs": [],
1680 |    "source": []
1681 |   },
1682 |   {
1683 |    "cell_type": "markdown",
1684 |    "id": "fbb24710",
1685 |    "metadata": {
1686 |     "pycharm": {
1687 |      "name": "#%% md\n"
1688 |     }
1689 |    },
1690 |    "source": [
1691 |     "## Don't Apply (if you can)"
1692 |    ]
1693 |   },
1694 |   {
1695 |    "cell_type": "code",
1696 |    "execution_count": null,
1697 |    "id": "b62e0ab8",
1698 |    "metadata": {
1699 |     "lines_to_next_cell": 0,
1700 |     "pycharm": {
1701 |      "name": "#%%\n"
1702 |     }
1703 |    },
1704 |    "outputs": [],
1705 |    "source": [
1706 |     "# a glorious function\n",
1707 |     "def tweak_gss(gss):\n",
1708 |     "    return (gss\n",
1709 |     "      .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n",
1710 |     "             'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n",
1711 |     "             'MAJOR1': 'category',\n",
1712 |     "              **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n",
1713 |     "                    'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n",
1714 |     "     .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n",
1715 |     "          'HRS1': 'hours_worked', 'OCC': 'occupation', \n",
1716 |     "          'MAJOR1': 'college_major', 'SEX':'sex', \n",
1717 |     "          'RACE':'race', 'BORN':'born_in_US',\n",
1718 |     "          'INCOME':'income_1970', 'INCOME06': 'income_2006',\n",
1719 |     "          'HONEST':'honesty_rank',\n",
1720 |     "          'TICKET':'traffic_ticket'})\n",
1721 |     "    )\n",
1722 |     "\n",
1723 |     "gss2 = tweak_gss(gss)"
1724 |    ]
1725 |   },
1726 |   {
1727 |    "cell_type": "code",
1728 |    "execution_count": null,
1729 |    "id": "2729d99a",
1730 |    "metadata": {
1731 |     "pycharm": {
1732 |      "name": "#%%\n"
1733 |     }
1734 |    },
1735 |    "outputs": [],
1736 |    "source": [
1737 |     "# convert age to months\n",
1738 |     "def to_months(val):\n",
1739 |     "    return val * 12\n",
1740 |     "\n",
1741 |     "gss2.age.apply(to_months)"
1742 |    ]
1743 |   },
1744 |   {
1745 |    "cell_type": "code",
1746 |    "execution_count": null,
1747 |    "id": "d7215eb9",
1748 |    "metadata": {
1749 |     "pycharm": {
1750 |      "name": "#%%\n"
1751 |     }
1752 |    },
1753 |    "outputs": [],
1754 |    "source": [
1755 |     "# this gives the same results\n",
1756 |     "gss2.age * 12"
1757 |    ]
1758 |   },
1759 |   {
1760 |    "cell_type": "code",
1761 |    "execution_count": null,
1762 |    "id": "cc267044",
1763 |    "metadata": {
1764 |     "pycharm": {
1765 |      "name": "#%%\n"
1766 |     }
1767 |    },
1768 |    "outputs": [],
1769 |    "source": [
1770 |     "%%timeit\n",
1771 |     "gss2.age.apply(to_months)"
1772 |    ]
1773 |   },
1774 |   {
1775 |    "cell_type": "code",
1776 |    "execution_count": null,
1777 |    "id": "832f1b22",
1778 |    "metadata": {
1779 |     "pycharm": {
1780 |      "name": "#%%\n"
1781 |     }
1782 |    },
1783 |    "outputs": [],
1784 |    "source": [
1785 |     "%%timeit\n",
1786 |     "gss2.age * 12"
1787 |    ]
1788 |   },
1789 |   {
1790 |    "cell_type": "code",
1791 |    "execution_count": null,
1792 |    "id": "98db1eff",
1793 |    "metadata": {
1794 |     "pycharm": {
1795 |      "name": "#%%\n"
1796 |     }
1797 |    },
1798 |    "outputs": [],
1799 |    "source": [
1800 |     "# ~42x slower!\n",
1801 |     "4_590 / 110"
1802 |    ]
1803 |   },
1804 |   {
1805 |    "cell_type": "code",
1806 |    "execution_count": null,
1807 |    "id": "42dcedc4",
1808 |    "metadata": {},
1809 |    "outputs": [],
1810 |    "source": [
1811 |     "gss.MAJOR1.value_counts()[:20]"
1812 |    ]
1813 |   },
1814 |   {
1815 |    "cell_type": "code",
1816 |    "execution_count": null,
1817 |    "id": "2d844116",
1818 |    "metadata": {
1819 |     "pycharm": {
1820 |      "name": "#%%\n"
1821 |     }
1822 |    },
1823 |    "outputs": [],
1824 |    "source": [
1825 |     "def is_science(val):\n",
1826 |     "    return val in {'Engineering', 'Computer science', 'Biology'}"
1827 |    ]
1828 |   },
1829 |   {
1830 |    "cell_type": "code",
1831 |    "execution_count": null,
1832 |    "id": "9ca1aeed",
1833 |    "metadata": {
1834 |     "pycharm": {
1835 |      "name": "#%%\n"
1836 |     }
1837 |    },
1838 |    "outputs": [],
1839 |    "source": [
1840 |     "%%timeit\n",
1841 |     "# string\n",
1842 |     "gss.MAJOR1.apply(is_science)"
1843 |    ]
1844 |   },
1845 |   {
1846 |    "cell_type": "code",
1847 |    "execution_count": null,
1848 |    "id": "3fc9c8c8",
1849 |    "metadata": {
1850 |     "pycharm": {
1851 |      "name": "#%%\n"
1852 |     }
1853 |    },
1854 |    "outputs": [],
1855 |    "source": [
1856 |     "%%timeit\n",
1857 |     "gss.MAJOR1.isin({'Engineering', 'Computer science', 'Biology'})"
1858 |    ]
1859 |   },
1860 |   {
1861 |    "cell_type": "code",
1862 |    "execution_count": null,
1863 |    "id": "a3b50066",
1864 |    "metadata": {
1865 |     "lines_to_next_cell": 0,
1866 |     "pycharm": {
1867 |      "name": "#%%\n"
1868 |     }
1869 |    },
1870 |    "outputs": [],
1871 |    "source": [
1872 |     "%%timeit\n",
1873 |     "# categorical\n",
1874 |     "gss2.college_major.isin({'Engineering', 'Computer science', 'Biology'})"
1875 |    ]
1876 |   },
1877 |   {
1878 |    "cell_type": "code",
1879 |    "execution_count": null,
1880 |    "id": "5a77d117",
1881 |    "metadata": {
1882 |     "lines_to_next_cell": 2
1883 |    },
1884 |    "outputs": [],
1885 |    "source": []
1886 |   },
1887 |   {
1888 |    "cell_type": "markdown",
1889 |    "id": "0d1cf16c",
1890 |    "metadata": {},
1891 |    "source": [
1892 |     "## Apply Exercise\n",
1893 |     "* Make a new column called *minutes_worked* derived from the *hours_worked* column.\n",
1894 |     "* Make a new column called *income_ratio*.\n",
1895 |     "  * Convert the income columns to numbers (replace `'No answer'` and `'Refused'` with `np.nan`).\n",
1896 |     "  * Fill in the missing values with the median\n",
1897 |     "  * Divide the 2006 value by 1970 value"
1898 |    ]
1899 |   },
1900 |   {
1901 |    "cell_type": "code",
1902 |    "execution_count": null,
1903 |    "id": "d5d31937",
1904 |    "metadata": {},
1905 |    "outputs": [],
1906 |    "source": []
1907 |   },
1908 |   {
1909 |    "cell_type": "code",
1910 |    "execution_count": null,
1911 |    "id": "3130309f",
1912 |    "metadata": {},
1913 |    "outputs": [],
1914 |    "source": []
1915 |   },
1916 |   {
1917 |    "cell_type": "code",
1918 |    "execution_count": null,
1919 |    "id": "0b1479f4",
1920 |    "metadata": {},
1921 |    "outputs": [],
1922 |    "source": []
1923 |   },
1924 |   {
1925 |    "cell_type": "code",
1926 |    "execution_count": null,
1927 |    "id": "83b7af00",
1928 |    "metadata": {},
1929 |    "outputs": [],
1930 |    "source": []
1931 |   },
1932 |   {
1933 |    "cell_type": "code",
1934 |    "execution_count": null,
1935 |    "id": "bb139d47",
1936 |    "metadata": {},
1937 |    "outputs": [],
1938 |    "source": []
1939 |   },
1940 |   {
1941 |    "cell_type": "code",
1942 |    "execution_count": null,
1943 |    "id": "11c08537",
1944 |    "metadata": {},
1945 |    "outputs": [],
1946 |    "source": []
1947 |   },
1948 |   {
1949 |    "cell_type": "code",
1950 |    "execution_count": null,
1951 |    "id": "fdbe4d9d",
1952 |    "metadata": {},
1953 |    "outputs": [],
1954 |    "source": []
1955 |   },
1956 |   {
1957 |    "cell_type": "code",
1958 |    "execution_count": null,
1959 |    "id": "2a7f0a1b",
1960 |    "metadata": {},
1961 |    "outputs": [],
1962 |    "source": []
1963 |   },
1964 |   {
1965 |    "cell_type": "code",
1966 |    "execution_count": null,
1967 |    "id": "150dcad2",
1968 |    "metadata": {},
1969 |    "outputs": [],
1970 |    "source": []
1971 |   },
1972 |   {
1973 |    "cell_type": "code",
1974 |    "execution_count": null,
1975 |    "id": "9b7584f0",
1976 |    "metadata": {},
1977 |    "outputs": [],
1978 |    "source": []
1979 |   },
1980 |   {
1981 |    "cell_type": "code",
1982 |    "execution_count": null,
1983 |    "id": "ef44a54e",
1984 |    "metadata": {},
1985 |    "outputs": [],
1986 |    "source": []
1987 |   },
1988 |   {
1989 |    "cell_type": "code",
1990 |    "execution_count": null,
1991 |    "id": "733dd2a5",
1992 |    "metadata": {},
1993 |    "outputs": [],
1994 |    "source": []
1995 |   },
1996 |   {
1997 |    "cell_type": "code",
1998 |    "execution_count": null,
1999 |    "id": "d3ffd870",
2000 |    "metadata": {},
2001 |    "outputs": [],
2002 |    "source": []
2003 |   },
2004 |   {
2005 |    "cell_type": "code",
2006 |    "execution_count": null,
2007 |    "id": "68d8b634",
2008 |    "metadata": {},
2009 |    "outputs": [],
2010 |    "source": []
2011 |   },
2012 |   {
2013 |    "cell_type": "code",
2014 |    "execution_count": null,
2015 |    "id": "cae071cd",
2016 |    "metadata": {},
2017 |    "outputs": [],
2018 |    "source": []
2019 |   },
2020 |   {
2021 |    "cell_type": "code",
2022 |    "execution_count": null,
2023 |    "id": "f8291cf2",
2024 |    "metadata": {},
2025 |    "outputs": [],
2026 |    "source": []
2027 |   },
2028 |   {
2029 |    "cell_type": "code",
2030 |    "execution_count": null,
2031 |    "id": "95246f3c",
2032 |    "metadata": {},
2033 |    "outputs": [],
2034 |    "source": []
2035 |   },
2036 |   {
2037 |    "cell_type": "code",
2038 |    "execution_count": null,
2039 |    "id": "b69650d6",
2040 |    "metadata": {},
2041 |    "outputs": [],
2042 |    "source": []
2043 |   },
2044 |   {
2045 |    "cell_type": "code",
2046 |    "execution_count": null,
2047 |    "id": "9eef0415",
2048 |    "metadata": {},
2049 |    "outputs": [],
2050 |    "source": []
2051 |   },
2052 |   {
2053 |    "cell_type": "code",
2054 |    "execution_count": null,
2055 |    "id": "5b6b9cea",
2056 |    "metadata": {},
2057 |    "outputs": [],
2058 |    "source": []
2059 |   },
2060 |   {
2061 |    "cell_type": "code",
2062 |    "execution_count": null,
2063 |    "id": "614f9720",
2064 |    "metadata": {},
2065 |    "outputs": [],
2066 |    "source": []
2067 |   },
2068 |   {
2069 |    "cell_type": "code",
2070 |    "execution_count": null,
2071 |    "id": "36ad6d8a",
2072 |    "metadata": {},
2073 |    "outputs": [],
2074 |    "source": []
2075 |   },
2076 |   {
2077 |    "cell_type": "code",
2078 |    "execution_count": null,
2079 |    "id": "51c1ea57",
2080 |    "metadata": {},
2081 |    "outputs": [],
2082 |    "source": []
2083 |   },
2084 |   {
2085 |    "cell_type": "code",
2086 |    "execution_count": null,
2087 |    "id": "82f92026",
2088 |    "metadata": {},
2089 |    "outputs": [],
2090 |    "source": []
2091 |   },
2092 |   {
2093 |    "cell_type": "code",
2094 |    "execution_count": null,
2095 |    "id": "ddb7c9f1",
2096 |    "metadata": {},
2097 |    "outputs": [],
2098 |    "source": []
2099 |   },
2100 |   {
2101 |    "cell_type": "code",
2102 |    "execution_count": null,
2103 |    "id": "b36c49df",
2104 |    "metadata": {},
2105 |    "outputs": [],
2106 |    "source": []
2107 |   },
2108 |   {
2109 |    "cell_type": "code",
2110 |    "execution_count": null,
2111 |    "id": "97c684e1",
2112 |    "metadata": {
2113 |     "lines_to_next_cell": 2
2114 |    },
2115 |    "outputs": [],
2116 |    "source": []
2117 |   },
2118 |   {
2119 |    "cell_type": "markdown",
2120 |    "id": "3267d042",
2121 |    "metadata": {},
2122 |    "source": [
2123 |     "## Master Aggregation\n",
2124 |     "\n",
2125 |     "Let's compare age by sex by year...🤔"
2126 |    ]
2127 |   },
2128 |   {
2129 |    "cell_type": "code",
2130 |    "execution_count": null,
2131 |    "id": "9c1354bc",
2132 |    "metadata": {
2133 |     "pycharm": {
2134 |      "name": "#%%\n"
2135 |     }
2136 |    },
2137 |    "outputs": [],
2138 |    "source": [
2139 |     "(gss2\n",
2140 |     "   .groupby('year')\n",
2141 |     "   .mean()\n",
2142 |     ")"
2143 |    ]
2144 |   },
2145 |   {
2146 |    "cell_type": "code",
2147 |    "execution_count": null,
2148 |    "id": "142093b3",
2149 |    "metadata": {
2150 |     "pycharm": {
2151 |      "name": "#%%\n"
2152 |     }
2153 |    },
2154 |    "outputs": [],
2155 |    "source": [
2156 |     "(gss2\n",
2157 |     "   .groupby('year')\n",
2158 |     "   .mean(numeric_only=True)\n",
2159 |     ")"
2160 |    ]
2161 |   },
2162 |   {
2163 |    "cell_type": "code",
2164 |    "execution_count": null,
2165 |    "id": "360706dd",
2166 |    "metadata": {
2167 |     "pycharm": {
2168 |      "name": "#%%\n"
2169 |     }
2170 |    },
2171 |    "outputs": [],
2172 |    "source": [
2173 |     "(gss2\n",
2174 |     "   .groupby('year')\n",
2175 |     "   [['age', 'hours_worked']]\n",
2176 |     "   .mean()\n",
2177 |     ")"
2178 |    ]
2179 |   },
2180 |   {
2181 |    "cell_type": "code",
2182 |    "execution_count": null,
2183 |    "id": "3c554355",
2184 |    "metadata": {
2185 |     "pycharm": {
2186 |      "name": "#%%\n"
2187 |     }
2188 |    },
2189 |    "outputs": [],
2190 |    "source": [
2191 |     "import matplotlib.pyplot as plt\n",
2192 |     "import seaborn as sns\n",
2193 |     "#plt.style.use('pandas1book') \n",
2194 |     "sns.set_context('talk')\n",
2195 |     "plt.plot(range(10))"
2196 |    ]
2197 |   },
2198 |   {
2199 |    "cell_type": "code",
2200 |    "execution_count": null,
2201 |    "id": "c052c593",
2202 |    "metadata": {
2203 |     "pycharm": {
2204 |      "name": "#%%\n"
2205 |     }
2206 |    },
2207 |    "outputs": [],
2208 |    "source": [
2209 |     "(gss2\n",
2210 |     "   .groupby('year')\n",
2211 |     "   [['age', 'hours_worked']]\n",
2212 |     "   .median()\n",
2213 |     "   .plot()\n",
2214 |     ")"
2215 |    ]
2216 |   },
2217 |   {
2218 |    "cell_type": "code",
2219 |    "execution_count": null,
2220 |    "id": "50fabf0e",
2221 |    "metadata": {
2222 |     "pycharm": {
2223 |      "name": "#%%\n"
2224 |     }
2225 |    },
2226 |    "outputs": [],
2227 |    "source": [
2228 |     "(gss2\n",
2229 |     "   .groupby('year')\n",
2230 |     "   [['age', 'hours_worked']]\n",
2231 |     "   #.mean()\n",
2232 |     "   #.median()\n",
2233 |     "   #.std()\n",
2234 |     "   .max()\n",
2235 |     "   .plot()\n",
2236 |     ")"
2237 |    ]
2238 |   },
2239 |   {
2240 |    "cell_type": "code",
2241 |    "execution_count": null,
2242 |    "id": "21434b7c",
2243 |    "metadata": {
2244 |     "lines_to_next_cell": 2,
2245 |     "pycharm": {
2246 |      "name": "#%%\n"
2247 |     }
2248 |    },
2249 |    "outputs": [],
2250 |    "source": [
2251 |     "# add sex\n",
2252 |     "(gss2\n",
2253 |     "   .groupby(['year', 'sex'])\n",
2254 |     "   [['age', 'hours_worked']]\n",
2255 |     "   .mean()\n",
2256 |     "   #.median()\n",
2257 |     "   #.std()\n",
2258 |     "   #.max()\n",
2259 |     "   #.plot()\n",
2260 |     ")"
2261 |    ]
2262 |   },
2263 |   {
2264 |    "cell_type": "code",
2265 |    "execution_count": null,
2266 |    "id": "6a3c6ff9",
2267 |    "metadata": {
2268 |     "lines_to_next_cell": 2,
2269 |     "pycharm": {
2270 |      "name": "#%%\n"
2271 |     }
2272 |    },
2273 |    "outputs": [],
2274 |    "source": [
2275 |     "# add sex\n",
2276 |     "(gss2\n",
2277 |     "   .groupby(['year', 'sex'])\n",
2278 |     "   [['age', 'hours_worked']]\n",
2279 |     "   .mean()\n",
2280 |     "   #.median()\n",
2281 |     "   #.std()\n",
2282 |     "   #.max()\n",
2283 |     "   .plot()\n",
2284 |     ")"
2285 |    ]
2286 |   },
2287 |   {
2288 |    "cell_type": "code",
2289 |    "execution_count": null,
2290 |    "id": "e9340d84",
2291 |    "metadata": {
2292 |     "lines_to_next_cell": 2,
2293 |     "pycharm": {
2294 |      "name": "#%%\n"
2295 |     }
2296 |    },
2297 |    "outputs": [],
2298 |    "source": [
2299 |     "# unstack\n",
2300 |     "(gss2\n",
2301 |     "   .groupby(['year', 'sex'])\n",
2302 |     "   [['age', 'hours_worked']]\n",
2303 |     "   .mean()\n",
2304 |     "   #.median()\n",
2305 |     "   #.std()\n",
2306 |     "   #.max()\n",
2307 |     "   .unstack() \n",
2308 |     "   .plot()\n",
2309 |     ")"
2310 |    ]
2311 |   },
2312 |   {
2313 |    "cell_type": "code",
2314 |    "execution_count": null,
2315 |    "id": "1b079aef",
2316 |    "metadata": {
2317 |     "lines_to_next_cell": 2,
2318 |     "pycharm": {
2319 |      "name": "#%%\n"
2320 |     }
2321 |    },
2322 |    "outputs": [],
2323 |    "source": [
2324 |     "(gss2\n",
2325 |     "   .groupby(['year', 'sex'])\n",
2326 |     "   [['age', 'hours_worked']]\n",
2327 |     "   .mean()\n",
2328 |     "   .unstack()\n",
2329 |     "   .age\n",
2330 |     ")"
2331 |    ]
2332 |   },
2333 |   {
2334 |    "cell_type": "code",
2335 |    "execution_count": null,
2336 |    "id": "1be1ec7d",
2337 |    "metadata": {
2338 |     "lines_to_next_cell": 2,
2339 |     "pycharm": {
2340 |      "name": "#%%\n"
2341 |     }
2342 |    },
2343 |    "outputs": [],
2344 |    "source": [
2345 |     "(gss2\n",
2346 |     "   .groupby(['year', 'sex'])\n",
2347 |     "   [['age', 'hours_worked']]\n",
2348 |     "   .mean()\n",
2349 |     "   .unstack()\n",
2350 |     "   .age\n",
2351 |     "   .plot()\n",
2352 |     "   .legend(bbox_to_anchor=(1,1))\n",
2353 |     ")"
2354 |    ]
2355 |   },
2356 |   {
2357 |    "cell_type": "code",
2358 |    "execution_count": null,
2359 |    "id": "3f6c2888",
2360 |    "metadata": {
2361 |     "pycharm": {
2362 |      "name": "#%%\n"
2363 |     }
2364 |    },
2365 |    "outputs": [],
2366 |    "source": [
2367 |     "# Let's try looking at hours worked\n",
2368 |     "(gss2\n",
2369 |     "   .groupby(['year', 'sex'])\n",
2370 |     "   [['age', 'hours_worked']]\n",
2371 |     "   .mean()\n",
2372 |     "   .unstack()\n",
2373 |     "   .hours_worked\n",
2374 |     "   .plot()\n",
2375 |     "   .legend(bbox_to_anchor=(1,1))\n",
2376 |     ")"
2377 |    ]
2378 |   },
2379 |   {
2380 |    "cell_type": "code",
2381 |    "execution_count": null,
2382 |    "id": "79ae0f43",
2383 |    "metadata": {
2384 |     "lines_to_next_cell": 2,
2385 |     "pycharm": {
2386 |      "name": "#%%\n"
2387 |     }
2388 |    },
2389 |    "outputs": [],
2390 |    "source": [
2391 |     "# Multiple aggregates\n",
2392 |     "def second(group):\n",
2393 |     "    return group.iloc[1]\n",
2394 |     "(gss2\n",
2395 |     "   .groupby(['year', 'sex'])\n",
2396 |     "   [['age', 'hours_worked']]\n",
2397 |     "  .agg(['min', 'max', 'mean', second])\n",
2398 |     "   \n",
2399 |     ")"
2400 |    ]
2401 |   },
2402 |   {
2403 |    "cell_type": "code",
2404 |    "execution_count": null,
2405 |    "id": "b35de234",
2406 |    "metadata": {
2407 |     "lines_to_next_cell": 2,
2408 |     "pycharm": {
2409 |      "name": "#%%\n"
2410 |     }
2411 |    },
2412 |    "outputs": [],
2413 |    "source": []
2414 |   },
2415 |   {
2416 |    "cell_type": "markdown",
2417 |    "id": "12d0ae8e",
2418 |    "metadata": {},
2419 |    "source": [
2420 |     "## Aggregation Exercise\n",
2421 |     "* Which occupation has the highest median hours worked?\n",
2422 |     "* Which occupation has the lowest age?\n",
2423 |     "* What is the breakdown of respondents by race for each year?\n",
2424 |     "* Convert the previous result to percentages.\n",
2425 |     "* How many unique occupations are there for each year?\n",
2426 |     "* What is the most popular college_major for each year?\n",
2427 |     "* What is the second most popular college_major for each year?"
2428 |    ]
2429 |   },
2430 |   {
2431 |    "cell_type": "code",
2432 |    "execution_count": null,
2433 |    "id": "b786c770",
2434 |    "metadata": {},
2435 |    "outputs": [],
2436 |    "source": []
2437 |   },
2438 |   {
2439 |    "cell_type": "code",
2440 |    "execution_count": null,
2441 |    "id": "9163d6da",
2442 |    "metadata": {},
2443 |    "outputs": [],
2444 |    "source": []
2445 |   },
2446 |   {
2447 |    "cell_type": "code",
2448 |    "execution_count": null,
2449 |    "id": "cd21d9ff",
2450 |    "metadata": {},
2451 |    "outputs": [],
2452 |    "source": []
2453 |   },
2454 |   {
2455 |    "cell_type": "code",
2456 |    "execution_count": null,
2457 |    "id": "92d8ced6",
2458 |    "metadata": {},
2459 |    "outputs": [],
2460 |    "source": []
2461 |   },
2462 |   {
2463 |    "cell_type": "code",
2464 |    "execution_count": null,
2465 |    "id": "d72c54ff",
2466 |    "metadata": {},
2467 |    "outputs": [],
2468 |    "source": []
2469 |   },
2470 |   {
2471 |    "cell_type": "code",
2472 |    "execution_count": null,
2473 |    "id": "f34a0907",
2474 |    "metadata": {},
2475 |    "outputs": [],
2476 |    "source": []
2477 |   },
2478 |   {
2479 |    "cell_type": "code",
2480 |    "execution_count": null,
2481 |    "id": "cd35850d",
2482 |    "metadata": {},
2483 |    "outputs": [],
2484 |    "source": []
2485 |   },
2486 |   {
2487 |    "cell_type": "code",
2488 |    "execution_count": null,
2489 |    "id": "d448387c",
2490 |    "metadata": {},
2491 |    "outputs": [],
2492 |    "source": []
2493 |   },
2494 |   {
2495 |    "cell_type": "code",
2496 |    "execution_count": null,
2497 |    "id": "cc33c024",
2498 |    "metadata": {},
2499 |    "outputs": [],
2500 |    "source": []
2501 |   },
2502 |   {
2503 |    "cell_type": "code",
2504 |    "execution_count": null,
2505 |    "id": "94e6c4f8",
2506 |    "metadata": {},
2507 |    "outputs": [],
2508 |    "source": []
2509 |   },
2510 |   {
2511 |    "cell_type": "code",
2512 |    "execution_count": null,
2513 |    "id": "6336616d",
2514 |    "metadata": {},
2515 |    "outputs": [],
2516 |    "source": []
2517 |   },
2518 |   {
2519 |    "cell_type": "code",
2520 |    "execution_count": null,
2521 |    "id": "e893c1db",
2522 |    "metadata": {},
2523 |    "outputs": [],
2524 |    "source": []
2525 |   },
2526 |   {
2527 |    "cell_type": "code",
2528 |    "execution_count": null,
2529 |    "id": "2bf69066",
2530 |    "metadata": {},
2531 |    "outputs": [],
2532 |    "source": []
2533 |   },
2534 |   {
2535 |    "cell_type": "code",
2536 |    "execution_count": null,
2537 |    "id": "18825c08",
2538 |    "metadata": {},
2539 |    "outputs": [],
2540 |    "source": []
2541 |   },
2542 |   {
2543 |    "cell_type": "code",
2544 |    "execution_count": null,
2545 |    "id": "0fb2ac56",
2546 |    "metadata": {},
2547 |    "outputs": [],
2548 |    "source": []
2549 |   },
2550 |   {
2551 |    "cell_type": "code",
2552 |    "execution_count": null,
2553 |    "id": "6978e294",
2554 |    "metadata": {},
2555 |    "outputs": [],
2556 |    "source": []
2557 |   },
2558 |   {
2559 |    "cell_type": "code",
2560 |    "execution_count": null,
2561 |    "id": "84e51e22",
2562 |    "metadata": {},
2563 |    "outputs": [],
2564 |    "source": []
2565 |   },
2566 |   {
2567 |    "cell_type": "code",
2568 |    "execution_count": null,
2569 |    "id": "0bf3baf4",
2570 |    "metadata": {},
2571 |    "outputs": [],
2572 |    "source": []
2573 |   },
2574 |   {
2575 |    "cell_type": "code",
2576 |    "execution_count": null,
2577 |    "id": "cb13bfdb",
2578 |    "metadata": {},
2579 |    "outputs": [],
2580 |    "source": []
2581 |   },
2582 |   {
2583 |    "cell_type": "code",
2584 |    "execution_count": null,
2585 |    "id": "03397cd7",
2586 |    "metadata": {},
2587 |    "outputs": [],
2588 |    "source": []
2589 |   },
2590 |   {
2591 |    "cell_type": "markdown",
2592 |    "id": "fdb17ae7",
2593 |    "metadata": {},
2594 |    "source": [
2595 |     "## Summary\n",
2596 |     "\n",
2597 |     "* Correct types save space and enable convenient math, string, and date functionality\n",
2598 |     "* Chaining operations will:\n",
2599 |     "   * Make code readable\n",
2600 |     "   * Remove bugs\n",
2601 |     "   * Make debugging easier\n",
2602 |     "* Don't mutate (there's no point). Embrace chaining.\n",
2603 |     "* ``.apply`` is slow for math\n",
2604 |     "* Aggregations are powerful. Play with them until they make sense\n",
2605 |     "\n",
2606 |     "Follow on Twitter ``@__mharrison__``\n",
2607 |     "\n",
2608 |     "Book giveaway!"
2609 |    ]
2610 |   },
2611 |   {
2612 |    "cell_type": "code",
2613 |    "execution_count": null,
2614 |    "id": "c193b4d2",
2615 |    "metadata": {
2616 |     "pycharm": {
2617 |      "name": "#%%\n"
2618 |     }
2619 |    },
2620 |    "outputs": [],
2621 |    "source": [
2622 |     "import random\n",
2623 |     "random.randrange(1,13)"
2624 |    ]
2625 |   },
2626 |   {
2627 |    "cell_type": "code",
2628 |    "execution_count": null,
2629 |    "id": "5810cbdc",
2630 |    "metadata": {
2631 |     "lines_to_next_cell": 2,
2632 |     "pycharm": {
2633 |      "name": "#%%\n"
2634 |     }
2635 |    },
2636 |    "outputs": [],
2637 |    "source": []
2638 |   },
2639 |   {
2640 |    "cell_type": "code",
2641 |    "execution_count": null,
2642 |    "id": "b0ba79ab",
2643 |    "metadata": {
2644 |     "lines_to_next_cell": 2,
2645 |     "pycharm": {
2646 |      "name": "#%%\n"
2647 |     }
2648 |    },
2649 |    "outputs": [],
2650 |    "source": []
2651 |   },
2652 |   {
2653 |    "cell_type": "code",
2654 |    "execution_count": null,
2655 |    "id": "ca3c1dd9",
2656 |    "metadata": {
2657 |     "lines_to_next_cell": 2,
2658 |     "pycharm": {
2659 |      "name": "#%%\n"
2660 |     }
2661 |    },
2662 |    "outputs": [],
2663 |    "source": []
2664 |   },
2665 |   {
2666 |    "cell_type": "code",
2667 |    "execution_count": null,
2668 |    "id": "1d479d51",
2669 |    "metadata": {
2670 |     "pycharm": {
2671 |      "name": "#%%\n"
2672 |     }
2673 |    },
2674 |    "outputs": [],
2675 |    "source": []
2676 |   }
2677 |  ],
2678 |  "metadata": {
2679 |   "kernelspec": {
2680 |    "display_name": "Python 3 (ipykernel)",
2681 |    "language": "python",
2682 |    "name": "python3"
2683 |   }
2684 |  },
2685 |  "nbformat": 4,
2686 |  "nbformat_minor": 5
2687 | }
2688 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==2.0.2
2 | pyarrow==12.0.0
3 | 
4 | 


--------------------------------------------------------------------------------