├── honest.fth ├── pyproject.toml ├── .devcontainer └── devcontainer.json ├── README.md └── pandas-best-practices.ipynb /honest.fth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mattharrison/pearson-pandas-best-practices/main/honest.fth -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pearson-pandas-best-practices" 3 | version = "0.1.0" 4 | requires-python = ">=3.12" 5 | dependencies = [ 6 | "matplotlib>=3.10.5", 7 | "notebook>=7.4.5", 8 | "pandas>=2.3.1", 9 | "pyarrow>=21.0.0", 10 | "seaborn>=0.13.2", 11 | ] 12 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "mcr.microsoft.com/devcontainers/universal:2", 3 | "hostRequirements": { 4 | "cpus": 2 5 | }, 6 | "waitFor": "onCreateCommand", 7 | "updateContentCommand": "", 8 | "postCreateCommand": "python3 -m pip install uv; uv sync", 9 | "features": { 10 | "ghcr.io/va-h/devcontainers-features/uv:1": {} 11 | }, 12 | "customizations": { 13 | "codespaces": { 14 | "openFiles": [] 15 | }, 16 | "vscode": { 17 | "extensions": [ 18 | "ms-toolsai.jupyter", 19 | "ms-python.python" 20 | ] 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pearson-pandas-best-practices 2 | 3 | This course introduces best practices for Pandas. 4 | 5 | ## Resources 6 | 7 | See the author's book, [Effective Pandas (digital)](https://store.metasnake.com/effective-pandas-book) [(physical)](https://amzn.to/43dt50h) 8 | 9 |  10 | 11 | ## Running 12 | 13 | You have a few options for running. 
14 | The easiest option: 15 | 16 | ### Use Codespaces. 17 | 18 | - Click on the green "Code" button 19 | - Select the "Codespaces" Tab. 20 | - Hit "Create codespace on main" button 21 | - Wait a few minutes for the codespace to load 22 | 23 | ### Run Locally with UV 24 | 25 | - Install UV (https://docs.astral.sh/uv/getting-started/installation/) 26 | - Check out the project 27 | - Change into the project directory 28 | - Run `uv sync` 29 | - Start jupyter `uv run jupyter notebook` 30 | -------------------------------------------------------------------------------- /pandas-best-practices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "88981ab7", 6 | "metadata": {}, 7 | "source": [] 8 | }, 9 | { 10 | "cell_type": "markdown", 11 | "id": "597d4814", 12 | "metadata": { 13 | "lines_to_next_cell": 0, 14 | "pycharm": { 15 | "name": "#%% md\n" 16 | } 17 | }, 18 | "source": [ 19 | "# Pandas Best Practices\n", 20 | "## 5 Tips for Better Pandas Code" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "fe77712e", 27 | "metadata": { 28 | "lines_to_next_cell": 2 29 | }, 30 | "outputs": [], 31 | "source": [] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "6188e883", 36 | "metadata": { 37 | "pycharm": { 38 | "name": "#%% md\n" 39 | } 40 | }, 41 | "source": [ 42 | "## About Matt Harrison @\\_\\_mharrison\\_\\_\n", 43 | "\n", 44 | "* Author of Effective Pandas, Machine Learning Pocket Reference, and Illustrated Guide to Python 3.\n", 45 | "* Advisor at Ponder (creators of Modin)\n", 46 | "* Corporate trainer at MetaSnake. 
Taught Pandas to 1000's of students.\n", 47 | "* Use coupon LIVE for 10% off Effective Pandas book or bundle ( https://store.metasnake.com )" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "8bf569d6", 54 | "metadata": { 55 | "lines_to_next_cell": 2, 56 | "pycharm": { 57 | "name": "#%%\n" 58 | } 59 | }, 60 | "outputs": [], 61 | "source": [] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "68b20887", 67 | "metadata": { 68 | "lines_to_next_cell": 2, 69 | "pycharm": { 70 | "name": "#%%\n" 71 | } 72 | }, 73 | "outputs": [], 74 | "source": [] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "b3fd6901", 80 | "metadata": { 81 | "lines_to_next_cell": 2, 82 | "pycharm": { 83 | "name": "#%%\n" 84 | } 85 | }, 86 | "outputs": [], 87 | "source": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "352e081c", 93 | "metadata": { 94 | "lines_to_next_cell": 2, 95 | "pycharm": { 96 | "name": "#%%\n" 97 | } 98 | }, 99 | "outputs": [], 100 | "source": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "61a77bda", 106 | "metadata": { 107 | "lines_to_next_cell": 2, 108 | "pycharm": { 109 | "name": "#%%\n" 110 | } 111 | }, 112 | "outputs": [], 113 | "source": [] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "c7a1b91f", 119 | "metadata": { 120 | "lines_to_next_cell": 2, 121 | "pycharm": { 122 | "name": "#%%\n" 123 | } 124 | }, 125 | "outputs": [], 126 | "source": [] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "5c67d9ed", 131 | "metadata": { 132 | "pycharm": { 133 | "name": "#%% md\n" 134 | } 135 | }, 136 | "source": [ 137 | "## Practice this on your data with your team!\n", 138 | "* Contact me matt@metasnake.com\n", 139 | "* Follow on Twitter @\\_\\_mharrison\\_\\_" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | 
"id": "b56b65e9", 146 | "metadata": { 147 | "lines_to_next_cell": 2, 148 | "pycharm": { 149 | "name": "#%%\n" 150 | } 151 | }, 152 | "outputs": [], 153 | "source": [] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "id": "7c579886", 159 | "metadata": { 160 | "lines_to_next_cell": 2, 161 | "pycharm": { 162 | "name": "#%%\n" 163 | } 164 | }, 165 | "outputs": [], 166 | "source": [] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "id": "c38061e7", 171 | "metadata": { 172 | "pycharm": { 173 | "name": "#%% md\n" 174 | } 175 | }, 176 | "source": [ 177 | "## Outline\n", 178 | "\n", 179 | "* Load Data\n", 180 | "* Types\n", 181 | "* Chaining\n", 182 | "* Mutation\n", 183 | "* Apply\n", 184 | "* Aggregation" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "dc9d13b9", 190 | "metadata": { 191 | "pycharm": { 192 | "name": "#%% md\n" 193 | } 194 | }, 195 | "source": [ 196 | "## Imports" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 1, 202 | "id": "ec04b162", 203 | "metadata": { 204 | "lines_to_next_cell": 2, 205 | "pycharm": { 206 | "name": "#%%\n" 207 | } 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "%matplotlib inline\n", 212 | "from IPython.display import display\n", 213 | "import numpy as np\n", 214 | "import pandas as pd\n", 215 | "import pyarrow\n", 216 | "\n", 217 | "import io\n", 218 | "import zipfile" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "29ef6997", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "pd.__version__" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "ae401f97", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "pyarrow.__version__" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "id": "1dea3558", 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [] 248 | }, 249 | { 250 | 
"cell_type": "markdown", 251 | "id": "1d7ee1c0", 252 | "metadata": {}, 253 | "source": [ 254 | "## Data Preprocessing\n", 255 | "\n", 256 | "Don't run this code. I'm providing it here to show you where the data came from.\n", 257 | "(If you really want to run this download the ZIP file and update the path)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "id": "3a5723a0", 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "# https://gss.norc.org/get-the-data/spss\n", 268 | "# https://gss.norc.org/Documents/spss/gss_spss_with_codebook.zip\n", 269 | "# takes a few minutes on my computer to load\n", 270 | "path = '/mnt/c/Users/matt/Downloads/gss_spss_with_codebook.zip'\n", 271 | "with zipfile.ZipFile(path) as z:\n", 272 | " print(z.namelist())\n", 273 | " with open('gss.sav', mode='bw') as fout:\n", 274 | " fout.write(z.open('GSS7218_R3.sav').read())\n", 275 | " gss = pd.read_spss('gss.sav')" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "id": "f8587141", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "!pip install pyreadstat" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "f4b686fc", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "%%time\n", 296 | "import pyreadstat\n", 297 | "gss, meta = pyreadstat.read_sav('gss.sav')" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "id": "c73cd05b", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "gss.shape" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "id": "1e2e1777", 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "gss.to_feather('gss.fth')" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "id": "46ec0b7c", 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "%%time\n", 
328 | "raw = pd.read_feather('~/Dropbox/work/jupyter/gss.fth')" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "id": "f029dc45", 335 | "metadata": { 336 | "lines_to_next_cell": 0, 337 | "pycharm": { 338 | "name": "#%%\n" 339 | } 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "raw" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "8cad5ba6", 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "# 6000 columns!\n", 354 | "raw.shape" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "id": "d08680b5", 361 | "metadata": { 362 | "lines_to_next_cell": 0 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n", 367 | " 'INCOME06','HONEST','TICKET']\n", 368 | "\n", 369 | "raw[cols].to_feather('honest.fth')" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "id": "506f2f1d", 376 | "metadata": { 377 | "lines_to_next_cell": 2 378 | }, 379 | "outputs": [], 380 | "source": [] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "id": "5ab74806", 385 | "metadata": {}, 386 | "source": [ 387 | "## Loading Data\n", 388 | "\n", 389 | "This is the data we will be using. Run this code!" 
390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "id": "07444860", 396 | "metadata": { 397 | "lines_to_next_cell": 2, 398 | "pycharm": { 399 | "name": "#%%\n" 400 | } 401 | }, 402 | "outputs": [], 403 | "source": [ 404 | "raw = pd.read_feather('honest.fth', dtype_backend='pyarrow')" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "582255a1", 411 | "metadata": { 412 | "lines_to_next_cell": 2, 413 | "pycharm": { 414 | "name": "#%%\n" 415 | } 416 | }, 417 | "outputs": [], 418 | "source": [] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "id": "db1c15f3", 423 | "metadata": { 424 | "pycharm": { 425 | "name": "#%% md\n" 426 | } 427 | }, 428 | "source": [ 429 | "## My Cleanup\n", 430 | "See GSS_Codebook.pdf for explanation\n", 431 | "\n", 432 | "Columns:\n", 433 | "\n", 434 | "* YEAR\n", 435 | "* ID - RESPONDENT ID NUMBER\n", 436 | "* AGE - AGE OF RESPONDENT\n", 437 | "* HRS1 - NUMBER OF HOURS WORKED LAST WEEK\n", 438 | "* OCC - R'S CENSUS OCCUPATION CODE (1970) - Page 126 (VAR: OCC) see page 125 for notes APPENDIX F,G,H\n", 439 | " Appendix F - Page 3286\n", 440 | "* MAJOR1 - COLLEGE MAJOR 1\n", 441 | "* SEX - RESPONDENTS SEX\n", 442 | "* RACE - RACE OF RESPONDENT\n", 443 | "* BORN - WAS R BORN IN THIS COUNTRY\n", 444 | "* INCOME - TOTAL FAMILY INCOME 1970\n", 445 | "* INCOME06 - TOTAL FAMILY INCOME 2006\n", 446 | "* HONEST - HONEST\n", 447 | "* TICKET - EVER RECEIVED A TRAFFIC TICKET\n" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "id": "65089c43", 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n", 458 | " 'INCOME06','HONEST','TICKET']\n", 459 | "\n", 460 | "raw[cols].isna().mean()*100" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "id": "67f1d8f4", 467 | "metadata": {}, 468 | "outputs": [], 469 
| "source": [ 470 | "(raw\n", 471 | " [cols]\n", 472 | " .isna()\n", 473 | " .mean()*100\n", 474 | ")" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "id": "df146d91", 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "MAJOR= '''RESPONSE PUNCH 1972-82 1982B 1983-87 1987B 1988-91 1993-98 2000-04 2006 2008 2010 2012 2014 2016 2018 ALL\n", 485 | "Accounting/bookkeeping 1 0 0 0 0 0 0 0 0 0 0 28 32 30 29 119\n", 486 | "Advertising 2 0 0 0 0 0 0 0 0 0 0 3 2 0 0 5\n", 487 | "Agriculture/horticulture 3 0 0 0 0 0 0 0 0 0 0 8 2 7 5 22\n", 488 | "Allied health 4 0 0 0 0 0 0 0 0 0 0 0 2 1 0 3\n", 489 | "Anthropology 5 0 0 0 0 0 0 0 0 0 0 3 5 1 1 10\n", 490 | "Architecture 6 0 0 0 0 0 0 0 0 0 0 2 3 5 3 13\n", 491 | "Art 7 0 0 0 0 0 0 0 0 0 0 6 7 11 10 34\n", 492 | "Biology 8 0 0 0 0 0 0 0 0 0 0 16 22 33 26 97\n", 493 | "Business administration 9 0 0 0 0 0 0 0 0 0 0 90 142 172 138 542\n", 494 | "Chemistry 11 0 0 0 0 0 0 0 0 0 0 5 8 10 4 27\n", 495 | "Communications/speech 12 0 0 0 0 0 0 0 0 0 0 20 18 26 18 82\n", 496 | "Comm. 
disorders 13 0 0 0 0 0 0 0 0 0 0 4 6 2 2 14\n", 497 | "Computer science 14 0 0 0 0 0 0 0 0 0 0 25 24 33 17 99\n", 498 | "Dentistry 15 0 0 0 0 0 0 0 0 0 0 2 4 3 5 14\n", 499 | "Education 16 0 0 0 0 0 0 0 0 0 0 73 91 97 79 340\n", 500 | "Economics 17 0 0 0 0 0 0 0 0 0 0 11 19 13 19 62\n", 501 | "Engineering 18 0 0 0 0 0 0 0 0 0 0 47 49 47 54 197\n", 502 | "English 19 0 0 0 0 0 0 0 0 0 0 23 26 27 24 100\n", 503 | "Finance 20 0 0 0 0 0 0 0 0 0 0 7 15 14 16 52\n", 504 | "Foreign language 21 0 0 0 0 0 0 0 0 0 0 4 8 6 5 23\n", 505 | "Forestry 22 0 0 0 0 0 0 0 0 0 0 1 0 3 0 4\n", 506 | "Geography 23 0 0 0 0 0 0 0 0 0 0 0 2 2 4 8\n", 507 | "Geology 24 0 0 0 0 0 0 0 0 0 0 1 3 4 2 10\n", 508 | "History 25 0 0 0 0 0 0 0 0 0 0 10 19 14 19 62\n", 509 | "Home economics 26 0 0 0 0 0 0 0 0 0 0 0 0 3 2 5\n", 510 | "Industry & techn 27 0 0 0 0 0 0 0 0 0 0 3 4 6 0 13\n", 511 | "Journalism 28 0 0 0 0 0 0 0 0 0 0 5 6 6 4 21\n", 512 | "Law 29 0 0 0 0 0 0 0 0 0 0 13 18 23 14 68\n", 513 | "Law enforcement 30 0 0 0 0 0 0 0 0 0 0 3 5 4 2 14\n", 514 | "Library science 31 0 0 0 0 0 0 0 0 0 0 4 5 2 3 14\n", 515 | "Marketing 32 0 0 0 0 0 0 0 0 0 0 11 15 13 12 51\n", 516 | "Mathematics 33 0 0 0 0 0 0 0 0 0 0 5 10 12 5 32\n", 517 | "Medicine 34 0 0 0 0 0 0 0 0 0 0 9 25 12 11 57\n", 518 | "Music 35 0 0 0 0 0 0 0 0 0 0 4 2 10 2 18\n", 519 | "Nursing 36 0 0 0 0 0 0 0 0 0 0 36 39 60 51 186\n", 520 | "Optometry 37 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 521 | "Pharmacy 38 0 0 0 0 0 0 0 0 0 0 2 5 4 4 15\n", 522 | "Philosophy 39 0 0 0 0 0 0 0 0 0 0 2 0 2 2 6\n", 523 | "Physical education 40 0 0 0 0 0 0 0 0 0 0 9 6 16 6 37\n", 524 | "Physics 41 0 0 0 0 0 0 0 0 0 0 3 6 7 4 20\n", 525 | "Psychology 42 0 0 0 0 0 0 0 0 0 0 32 32 34 29 127\n", 526 | "Political science/international relations 43 0 0 0 0 0 0 0 0 0 0 16 22 19 14 71\n", 527 | "Sociology 44 0 0 0 0 0 0 0 0 0 0 9 15 10 12 46\n", 528 | "Special education 45 0 0 0 0 0 0 0 0 0 0 5 3 5 2 15\n", 529 | "Theater arts 46 0 0 0 0 0 0 0 0 0 0 6 2 3 1 12\n", 530 | 
"Theology 47 0 0 0 0 0 0 0 0 0 0 6 6 13 8 33\n", 531 | "Veterinary medicine 48 0 0 0 0 0 0 0 0 0 0 1 5 3 4 13\n", 532 | "Liberal arts 49 0 0 0 0 0 0 0 0 0 0 8 16 12 10 46\n", 533 | "Other 50 0 0 0 0 0 0 0 0 0 0 8 10 21 27 66\n", 534 | "General sciences 51 0 0 0 0 0 0 0 0 0 0 10 13 15 14 52\n", 535 | "Social work 52 0 0 0 0 0 0 0 0 0 0 7 17 24 7 55\n", 536 | "General studies 53 0 0 0 0 0 0 0 0 0 0 2 5 7 7 21\n", 537 | "Other vocational 54 0 0 0 0 0 0 0 0 0 0 5 11 6 5 27\n", 538 | "Health 55 0 0 0 0 0 0 0 0 0 0 23 31 31 42 127\n", 539 | "Industrial Relations 56 0 0 0 0 0 0 0 0 0 0 1 0 0 3 4\n", 540 | "Child/Human/Family Development 57 0 0 0 0 0 0 0 0 0 0 11 3 7 7 28\n", 541 | "Food Science/Nutrition/Culinary Arts 58 0 0 0 0 0 0 0 0 0 0 3 6 9 9 27\n", 542 | "Environmental Science/Ecology 59 0 0 0 0 0 0 0 0 0 0 5 5 6 8 24\n", 543 | "Social Sciences 60 0 0 0 0 0 0 0 0 0 0 4 2 7 5 18\n", 544 | "Human Services/Human Resources 61 0 0 0 0 0 0 0 0 0 0 3 7 7 5 22\n", 545 | "Visual Arts/Graphic Design/Design and Drafting 62 0 0 0 0 0 0 0 0 0 0 3 8 9 10 30\n", 546 | "Fine Arts 63 0 0 0 0 0 0 0 0 0 0 4 5 5 6 20\n", 547 | "Humanities 64 0 0 0 0 0 0 0 0 0 0 0 2 0 1 3\n", 548 | "Ethnic studies 65 0 0 0 0 0 0 0 0 0 0 3 1 0 0 4\n", 549 | "Educational administration 66 0 0 0 0 0 0 0 0 0 0 3 4 8 9 24\n", 550 | "Television/Film 67 0 0 0 0 0 0 0 0 0 0 0 2 6 1 9\n", 551 | "Aviation/Aeronatics 68 0 0 0 0 0 0 0 0 0 0 2 1 1 3 7\n", 552 | "Statistics/Biostatistics 69 0 0 0 0 0 0 0 0 0 0 0 0 2 2 4\n", 553 | "Criminology/Criminal Justice 70 0 0 0 0 0 0 0 0 0 0 13 17 17 13 60\n", 554 | "Administrative Science/Public Administration 71 0 0 0 0 0 0 0 0 0 0 2 11 3 5 21\n", 555 | "Electronics 72 0 0 0 0 0 0 0 0 0 0 6 6 5 9 26\n", 556 | "Urban and Regional Planning 73 0 0 0 0 0 0 0 0 0 0 1 1 3 2 7\n", 557 | "Mechanics/Machine Trade 74 0 0 0 0 0 0 0 0 0 0 0 1 1 4 6\n", 558 | "Dance 75 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n", 559 | "Gerontology 76 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n", 560 | "Public Relations 77 0 
0 0 0 0 0 0 0 0 0 3 1 2 1 7\n", 561 | "Textiles/Cloth 78 0 0 0 0 0 0 0 0 0 0 3 4 0 0 7\n", 562 | "Parks and Recreation 79 0 0 0 0 0 0 0 0 0 0 1 2 1 0 4\n", 563 | "Information Technology 80 0 0 0 0 0 0 0 0 0 0 0 5 8 11 24\n", 564 | "Fashion 81 0 0 0 0 0 0 0 0 0 0 0 0 3 1 4\n", 565 | "Counseling 82 0 0 0 0 0 0 0 0 0 0 0 0 11 9 20\n", 566 | "Don't know/UNCODED 98 0 0 0 0 0 0 0 0 0 0 2 3 0 0 5\n", 567 | "No answer 99 0 0 0 0 0 0 0 0 0 0 0 1 5 3 9\n", 568 | "Not applicable 0 13626 354 7542 353 5907 10334 8394 4510 2023 2044 1263 1597 1795 1435 61177'''\n", 569 | "\n", 570 | "# copy paste slight tweak from page 186\n", 571 | "major_dict = {int(row.split()[-16]): ' '.join(row.split()[:-16]) for row in MAJOR.split('\\n')[1:]}\n", 572 | "major_dict" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": null, 578 | "id": "b9d6c34d", 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "raw.MAJOR1.value_counts()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "id": "74652b6d", 589 | "metadata": {}, 590 | "outputs": [], 591 | "source": [ 592 | "(raw\n", 593 | " [cols]\n", 594 | " .assign(\n", 595 | " MAJOR1=raw.MAJOR1.fillna(99).astype('int').replace(major_dict),\n", 596 | " SEX=raw.SEX#\n", 597 | " \n", 598 | " .astype(int)\n", 599 | " .replace({1:'Male', 2:'Female'}),\n", 600 | " RACE=raw.RACE.astype(int).replace({1:'White', 2:'Black', 3:'Other'}),\n", 601 | " OCC=raw.OCC.fillna(9999).astype(int),\n", 602 | " BORN=raw.BORN.fillna(4).astype(int).replace({1:'Yes', 2:'No', 3:'Don\\'t Know',\n", 603 | " 4:'No answer', 5:'Not applicable'}),\n", 604 | " INCOME=raw.INCOME.fillna(99).astype(int).replace({99:'No answer', **dict(enumerate(['Not applicable',\n", 605 | " 0,1000,3000,4000,5000,6000,\n", 606 | " 7000,8000,10000,15000,20000,25000,]))}),\n", 607 | " INCOME06=raw.INCOME06.fillna(26).astype(int).replace({26:'Refused', **dict(enumerate(['Not applicable',\n", 608 | " 0,1000,3000,4000,5000,6000,\n", 
609 | " 7000,8000,10000,12500,15000,\n", 610 | " 17500,20000,22500,25000,30_000,\n", 611 | " 35_000, 40_000, 50_000, 60_000,\n", 612 | " 75_000, 90_000, 110_000, 130_000,\n", 613 | " 150_000]))}),\n", 614 | " HONEST=raw.HONEST.fillna(9).astype(int).replace({1:'Most desirable', 2:'3 most desireable',\n", 615 | " 3:'Not mentioned', 4: '3 least desireable',\n", 616 | " 5: 'One least desireable',\n", 617 | " 9:'No answer'}),\n", 618 | " TICKET=raw.TICKET.fillna(9).astype(int).replace({1:'Yes', 2:'No', 3:'Refused', 9: 'No answer'}),\n", 619 | " )\n", 620 | " .astype({'YEAR':int, 'ID': 'uint16[pyarrow]'})\n", 621 | " .to_csv('GSS.csv')\n", 622 | ")" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "id": "a14afd45", 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "id": "ce8f0020", 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "id": "043a0085", 644 | "metadata": {}, 645 | "source": [ 646 | "## Types\n", 647 | "Getting the right types will enable analysis and correctness.\n" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "id": "5d6c6cd5", 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "%%time\n", 658 | "gss = pd.read_csv('GSS.csv', index_col=0, dtype_backend='pyarrow', engine='pyarrow')" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": null, 664 | "id": "d26b8f6c", 665 | "metadata": { 666 | "pycharm": { 667 | "name": "#%%\n" 668 | } 669 | }, 670 | "outputs": [], 671 | "source": [ 672 | "gss.dtypes" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "id": "19890585", 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "gss" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "id": "852115fe", 689 
| "metadata": { 690 | "pycharm": { 691 | "name": "#%%\n" 692 | } 693 | }, 694 | "outputs": [], 695 | "source": [ 696 | "gss.memory_usage(deep=True)" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "id": "b5cfc13b", 703 | "metadata": { 704 | "pycharm": { 705 | "name": "#%%\n" 706 | } 707 | }, 708 | "outputs": [], 709 | "source": [ 710 | "# 36 M (pandas 1)\n", 711 | "# 8.6 M (Pandas 2)\n", 712 | "gss.memory_usage(deep=True).sum()" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": null, 718 | "id": "f1d1b51d", 719 | "metadata": { 720 | "lines_to_next_cell": 2, 721 | "pycharm": { 722 | "name": "#%%\n" 723 | } 724 | }, 725 | "outputs": [], 726 | "source": [] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "id": "5fcab8c0", 731 | "metadata": { 732 | "pycharm": { 733 | "name": "#%% md\n" 734 | } 735 | }, 736 | "source": [ 737 | "## Ints" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": null, 743 | "id": "ad4eddc7", 744 | "metadata": { 745 | "pycharm": { 746 | "name": "#%%\n" 747 | } 748 | }, 749 | "outputs": [], 750 | "source": [ 751 | "gss.select_dtypes(int).describe()" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "id": "ac323e5e", 758 | "metadata": { 759 | "pycharm": { 760 | "name": "#%%\n" 761 | } 762 | }, 763 | "outputs": [], 764 | "source": [ 765 | "# chaining\n", 766 | "(gss\n", 767 | " .select_dtypes(int)\n", 768 | " .describe()\n", 769 | ")" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "id": "2351d051", 776 | "metadata": { 777 | "pycharm": { 778 | "name": "#%%\n" 779 | } 780 | }, 781 | "outputs": [], 782 | "source": [ 783 | "# can YEAR be an int8?\n", 784 | "# Do completion on int\n", 785 | "np.iinfo(np.int8)" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": null, 791 | "id": "323df8fb", 792 | "metadata": { 793 | "pycharm": { 794 | "name": "#%%\n" 
795 | } 796 | }, 797 | "outputs": [], 798 | "source": [ 799 | "np.iinfo(np.uint8)" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": null, 805 | "id": "bb063be4", 806 | "metadata": { 807 | "pycharm": { 808 | "name": "#%%\n" 809 | } 810 | }, 811 | "outputs": [], 812 | "source": [ 813 | "np.iinfo(np.uint16)" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": null, 819 | "id": "d0fab927", 820 | "metadata": { 821 | "pycharm": { 822 | "name": "#%%\n" 823 | } 824 | }, 825 | "outputs": [], 826 | "source": [ 827 | "# chaining\n", 828 | "(gss\n", 829 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' })\n", 830 | " .select_dtypes(['uint16'])\n", 831 | " .describe()\n", 832 | ")" 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": null, 838 | "id": "a1d0ed15", 839 | "metadata": { 840 | "lines_to_next_cell": 2, 841 | "pycharm": { 842 | "name": "#%%\n" 843 | } 844 | }, 845 | "outputs": [], 846 | "source": [ 847 | "# chaining\n", 848 | "# use 'integer' to see all int-like columns\n", 849 | "(gss\n", 850 | " .astype({#'YEAR': 'uint16[pyarrow]',\n", 851 | " 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n", 852 | " .select_dtypes(['integer']) # see https://numpy.org/doc/stable/reference/arrays.scalars.html\n", 853 | " .describe()\n", 854 | ")" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "id": "c5d4c3e1", 861 | "metadata": { 862 | "lines_to_next_cell": 2, 863 | "pycharm": { 864 | "name": "#%%\n" 865 | } 866 | }, 867 | "outputs": [], 868 | "source": [ 869 | "# Inspect memory usage\n", 870 | "(gss\n", 871 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n", 872 | " .memory_usage(deep=True)\n", 873 | " .sum() # was 36M\n", 874 | ")" 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": null, 880 | "id": "8ad6e733", 881 | "metadata": { 882 | 
"lines_to_next_cell": 2, 883 | "pycharm": { 884 | "name": "#%%\n" 885 | } 886 | }, 887 | "outputs": [], 888 | "source": [] 889 | }, 890 | { 891 | "cell_type": "markdown", 892 | "id": "f339194e", 893 | "metadata": {}, 894 | "source": [ 895 | "## Int Exercise\n", 896 | "* Try converting *YEAR* to `'int8'`. What do the values look like?\n", 897 | "* Try converting *YEAR* to `'int8[pyarrow]'`. What do the values look like?" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": null, 903 | "id": "908545d1", 904 | "metadata": {}, 905 | "outputs": [], 906 | "source": [] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": null, 911 | "id": "18a3bf52", 912 | "metadata": {}, 913 | "outputs": [], 914 | "source": [] 915 | }, 916 | { 917 | "cell_type": "markdown", 918 | "id": "b09f89c6", 919 | "metadata": {}, 920 | "source": [ 921 | "## Floats" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": null, 927 | "id": "e7fed87e", 928 | "metadata": { 929 | "pycharm": { 930 | "name": "#%%\n" 931 | } 932 | }, 933 | "outputs": [], 934 | "source": [ 935 | "(gss\n", 936 | ".select_dtypes('float'))" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": null, 942 | "id": "49265726", 943 | "metadata": { 944 | "pycharm": { 945 | "name": "#%%\n" 946 | } 947 | }, 948 | "outputs": [], 949 | "source": [ 950 | "# surprise! age and hours worked look int-like\n", 951 | "gss.HRS1.describe()" 952 | ] 953 | }, 954 | { 955 | "cell_type": "code", 956 | "execution_count": null, 957 | "id": "cd39df3c", 958 | "metadata": { 959 | "pycharm": { 960 | "name": "#%%\n" 961 | } 962 | }, 963 | "outputs": [], 964 | "source": [ 965 | "# oops! 
missing values\n", 966 | "gss.HRS1.value_counts(dropna=False)" 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": null, 972 | "id": "31a67da2", 973 | "metadata": { 974 | "pycharm": { 975 | "name": "#%%\n" 976 | } 977 | }, 978 | "outputs": [], 979 | "source": [ 980 | "# where are they missing?\n", 981 | "(gss\n", 982 | " .query('HRS1.isna()')\n", 983 | ")" 984 | ] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": null, 989 | "id": "e697d070", 990 | "metadata": { 991 | "pycharm": { 992 | "name": "#%%\n" 993 | } 994 | }, 995 | "outputs": [], 996 | "source": [ 997 | "# where are they missing?\n", 998 | "(gss\n", 999 | " .query('AGE.isna()')\n", 1000 | ")" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": null, 1006 | "id": "a9166e1a", 1007 | "metadata": { 1008 | "pycharm": { 1009 | "name": "#%%\n" 1010 | } 1011 | }, 1012 | "outputs": [], 1013 | "source": [ 1014 | "# where are they missing?\n", 1015 | "# It turns out that ID is not consistent across years\n", 1016 | "(gss\n", 1017 | " .query('ID == 229')\n", 1018 | ")" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": null, 1024 | "id": "81a8c902", 1025 | "metadata": { 1026 | "lines_to_next_cell": 2, 1027 | "pycharm": { 1028 | "name": "#%%\n" 1029 | } 1030 | }, 1031 | "outputs": [], 1032 | "source": [ 1033 | "# Convert to integers\n", 1034 | "(gss\n", 1035 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1036 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n", 1037 | ")" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": null, 1043 | "id": "d3388e2c", 1044 | "metadata": { 1045 | "lines_to_next_cell": 2, 1046 | "pycharm": { 1047 | "name": "#%%\n" 1048 | } 1049 | }, 1050 | "outputs": [], 1051 | "source": [ 1052 | "(gss\n", 1053 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1054 | " 
'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n", 1055 | " .memory_usage(deep=True)\n", 1056 | " .sum() # was 36M \n", 1057 | ")" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "code", 1062 | "execution_count": null, 1063 | "id": "9bb70ac2", 1064 | "metadata": { 1065 | "lines_to_next_cell": 2, 1066 | "pycharm": { 1067 | "name": "#%%\n" 1068 | } 1069 | }, 1070 | "outputs": [], 1071 | "source": [] 1072 | }, 1073 | { 1074 | "cell_type": "markdown", 1075 | "id": "75bfd716", 1076 | "metadata": {}, 1077 | "source": [ 1078 | "## Float Exercise\n", 1079 | "\n", 1080 | "* What is the mean of the numeric columns?\n", 1081 | "* How many values are missing in the numeric columns?" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "code", 1086 | "execution_count": null, 1087 | "id": "e3e30c4e", 1088 | "metadata": { 1089 | "lines_to_next_cell": 2 1090 | }, 1091 | "outputs": [], 1092 | "source": [] 1093 | }, 1094 | { 1095 | "cell_type": "markdown", 1096 | "id": "a136fe09", 1097 | "metadata": {}, 1098 | "source": [ 1099 | "## Objects" 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "execution_count": null, 1105 | "id": "4f8b0477", 1106 | "metadata": { 1107 | "pycharm": { 1108 | "name": "#%%\n" 1109 | } 1110 | }, 1111 | "outputs": [], 1112 | "source": [ 1113 | "# pandas 1.x\n", 1114 | "(gss\n", 1115 | " .select_dtypes(object)\n", 1116 | ")" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "id": "f80da8d2", 1123 | "metadata": { 1124 | "pycharm": { 1125 | "name": "#%%\n" 1126 | } 1127 | }, 1128 | "outputs": [], 1129 | "source": [ 1130 | "# pandas 2\n", 1131 | "(gss\n", 1132 | " .select_dtypes('string') # str doesn't work\n", 1133 | ")" 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "code", 1138 | "execution_count": null, 1139 | "id": "7f762143", 1140 | "metadata": { 1141 | "pycharm": { 1142 | "name": "#%%\n" 1143 | } 1144 | }, 1145 | "outputs": [], 1146 | "source": [ 1147 | "# My goto method - .value_counts\n", 1148 | 
"# looks categorical\n", 1149 | "(gss.MAJOR1.value_counts(dropna=False))" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "code", 1154 | "execution_count": null, 1155 | "id": "55c21c7a", 1156 | "metadata": { 1157 | "lines_to_next_cell": 2, 1158 | "pycharm": { 1159 | "name": "#%%\n" 1160 | } 1161 | }, 1162 | "outputs": [], 1163 | "source": [ 1164 | "(gss\n", 1165 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1166 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1167 | " 'MAJOR1': 'category'})\n", 1168 | " .memory_usage(deep=True)\n", 1169 | " .sum() # was 36M \n", 1170 | ")" 1171 | ] 1172 | }, 1173 | { 1174 | "cell_type": "code", 1175 | "execution_count": null, 1176 | "id": "69969c1b", 1177 | "metadata": {}, 1178 | "outputs": [], 1179 | "source": [ 1180 | "(gss\n", 1181 | " .select_dtypes(object)\n", 1182 | " .columns\n", 1183 | ")" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "code", 1188 | "execution_count": null, 1189 | "id": "f5d51601", 1190 | "metadata": { 1191 | "lines_to_next_cell": 0, 1192 | "pycharm": { 1193 | "name": "#%%\n" 1194 | } 1195 | }, 1196 | "outputs": [], 1197 | "source": [ 1198 | "# wow!\n", 1199 | "(gss\n", 1200 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1201 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1202 | " 'MAJOR1': 'category',\n", 1203 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1204 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}}) \n", 1205 | " .memory_usage(deep=True)\n", 1206 | " .sum() # was 36M \n", 1207 | ")" 1208 | ] 1209 | }, 1210 | { 1211 | "cell_type": "code", 1212 | "execution_count": null, 1213 | "id": "17206364", 1214 | "metadata": {}, 1215 | "outputs": [], 1216 | "source": [] 1217 | }, 1218 | { 1219 | "cell_type": "code", 1220 | "execution_count": null, 1221 | "id": "50bf3fa6", 1222 | "metadata": { 1223 | "lines_to_next_cell": 2 1224 | }, 1225 | "outputs": [], 1226 | "source": [] 
1227 | }, 1228 | { 1229 | "cell_type": "markdown", 1230 | "id": "246041ae", 1231 | "metadata": {}, 1232 | "source": [ 1233 | "## Category Exercises\n", 1234 | "* There is a `.cat` attribute on the category columns. What can you do with this attribute? (Use `dir` or tab completion to inspect).\n", 1235 | "* Categories can be ordered. How do you order *INCOME*?\n", 1236 | "* Order the *HONEST* column." 1237 | ] 1238 | }, 1239 | { 1240 | "cell_type": "code", 1241 | "execution_count": null, 1242 | "id": "f543c52c", 1243 | "metadata": {}, 1244 | "outputs": [], 1245 | "source": [] 1246 | }, 1247 | { 1248 | "cell_type": "code", 1249 | "execution_count": null, 1250 | "id": "338e5ba3", 1251 | "metadata": {}, 1252 | "outputs": [], 1253 | "source": [] 1254 | }, 1255 | { 1256 | "cell_type": "code", 1257 | "execution_count": null, 1258 | "id": "3fb313d6", 1259 | "metadata": {}, 1260 | "outputs": [], 1261 | "source": [] 1262 | }, 1263 | { 1264 | "cell_type": "code", 1265 | "execution_count": null, 1266 | "id": "f1d75a84", 1267 | "metadata": {}, 1268 | "outputs": [], 1269 | "source": [] 1270 | }, 1271 | { 1272 | "cell_type": "code", 1273 | "execution_count": null, 1274 | "id": "5c87e18d", 1275 | "metadata": {}, 1276 | "outputs": [], 1277 | "source": [] 1278 | }, 1279 | { 1280 | "cell_type": "code", 1281 | "execution_count": null, 1282 | "id": "85aaccbb", 1283 | "metadata": {}, 1284 | "outputs": [], 1285 | "source": [] 1286 | }, 1287 | { 1288 | "cell_type": "code", 1289 | "execution_count": null, 1290 | "id": "8a321513", 1291 | "metadata": {}, 1292 | "outputs": [], 1293 | "source": [] 1294 | }, 1295 | { 1296 | "cell_type": "markdown", 1297 | "id": "8af7a3d4", 1298 | "metadata": {}, 1299 | "source": [ 1300 | "## Make a Function" 1301 | ] 1302 | }, 1303 | { 1304 | "cell_type": "code", 1305 | "execution_count": null, 1306 | "id": "cb9a32b3", 1307 | "metadata": { 1308 | "lines_to_next_cell": 2, 1309 | "pycharm": { 1310 | "name": "#%%\n" 1311 | } 1312 | }, 1313 | "outputs": [], 1314 | 
"source": [ 1315 | "# a glorious function\n", 1316 | "# add ordered categories to this\n", 1317 | "def tweak_gss(gss):\n", 1318 | " return (gss\n", 1319 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1320 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1321 | " 'MAJOR1': 'category',\n", 1322 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1323 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1324 | " )\n", 1325 | "\n", 1326 | "tweak_gss(gss)" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "markdown", 1331 | "id": "1c615739", 1332 | "metadata": {}, 1333 | "source": [ 1334 | "## Function Exercise\n", 1335 | "* Rearrange your notebook. Put the imports, code to load raw data, and tweak function at the top of the notebook. Restart the kernel and validate that your code works." 1336 | ] 1337 | }, 1338 | { 1339 | "cell_type": "code", 1340 | "execution_count": null, 1341 | "id": "c61b9f0a", 1342 | "metadata": {}, 1343 | "outputs": [], 1344 | "source": [] 1345 | }, 1346 | { 1347 | "cell_type": "code", 1348 | "execution_count": null, 1349 | "id": "6589902c", 1350 | "metadata": {}, 1351 | "outputs": [], 1352 | "source": [] 1353 | }, 1354 | { 1355 | "cell_type": "markdown", 1356 | "id": "b350e12e", 1357 | "metadata": { 1358 | "lines_to_next_cell": 2 1359 | }, 1360 | "source": [ 1361 | "## Fix Column Names" 1362 | ] 1363 | }, 1364 | { 1365 | "cell_type": "code", 1366 | "execution_count": null, 1367 | "id": "99b39238", 1368 | "metadata": { 1369 | "lines_to_next_cell": 0, 1370 | "pycharm": { 1371 | "name": "#%%\n" 1372 | } 1373 | }, 1374 | "outputs": [], 1375 | "source": [ 1376 | "# a glorious function\n", 1377 | "def tweak_gss(gss):\n", 1378 | " return (gss\n", 1379 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1380 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1381 | " 'MAJOR1': 'category',\n", 1382 | " **{col: 'category' for col in ['SEX', 
'RACE', 'BORN', \n", 1383 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1384 | " .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n", 1385 | " 'HRS1': 'hours_worked', 'OCC': 'occupation', \n", 1386 | " 'MAJOR1': 'college_major', 'SEX':'sex', \n", 1387 | " 'RACE':'race', 'BORN':'born_in_US',\n", 1388 | " 'INCOME':'income_1970', 'INCOME06': 'income_2006',\n", 1389 | " 'HONEST':'honesty_rank',\n", 1390 | " 'TICKET':'traffic_ticket'})\n", 1391 | " )\n", 1392 | "\n", 1393 | "tweak_gss(gss)" 1394 | ] 1395 | }, 1396 | { 1397 | "cell_type": "code", 1398 | "execution_count": null, 1399 | "id": "84ecc0de", 1400 | "metadata": { 1401 | "lines_to_next_cell": 2, 1402 | "pycharm": { 1403 | "name": "#%%\n" 1404 | } 1405 | }, 1406 | "outputs": [], 1407 | "source": [] 1408 | }, 1409 | { 1410 | "cell_type": "code", 1411 | "execution_count": null, 1412 | "id": "bf14ec3f", 1413 | "metadata": { 1414 | "lines_to_next_cell": 2, 1415 | "pycharm": { 1416 | "name": "#%%\n" 1417 | } 1418 | }, 1419 | "outputs": [], 1420 | "source": [] 1421 | }, 1422 | { 1423 | "cell_type": "markdown", 1424 | "id": "003b96b9", 1425 | "metadata": { 1426 | "pycharm": { 1427 | "name": "#%% md\n" 1428 | } 1429 | }, 1430 | "source": [ 1431 | "## Chain\n", 1432 | "\n", 1433 | "Chaining is also called \"flow\" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.\n", 1434 | "\n", 1435 | "The chain should read like a recipe of ordered steps.\n", 1436 | "\n", 1437 | "(BTW, this is actually what we did above.)\n", 1438 | "\n", 1439 | "