├── .devcontainer └── devcontainer.json ├── GSS.csv ├── README.md ├── honest.fth ├── pandas-best-practices.ipynb └── requirements.txt /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "mcr.microsoft.com/devcontainers/universal:2", 3 | "hostRequirements": { 4 | "cpus": 2 5 | }, 6 | "waitFor": "onCreateCommand", 7 | "updateContentCommand": "python3 -m pip install -r requirements.txt", 8 | "postCreateCommand": "", 9 | "customizations": { 10 | "codespaces": { 11 | "openFiles": [] 12 | }, 13 | "vscode": { 14 | "extensions": [ 15 | "ms-toolsai.jupyter", 16 | "ms-python.python" 17 | ] 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pearson-pandas-best-practices 2 | 3 | This course introduces best practices for Pandas. 4 | 5 | ## Resources 6 | 7 | See the author's book, [Effective Pandas (digital)](https://store.metasnake.com/effective-pandas-book) [(physical)](https://amzn.to/43dt50h) 8 | 9 |  10 | 11 | -------------------------------------------------------------------------------- /honest.fth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mattharrison/pearson-pandas-best-practices/f67fb49f132784eb09152f3e169196ea26e0cbb4/honest.fth -------------------------------------------------------------------------------- /pandas-best-practices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "597d4814", 6 | "metadata": { 7 | "lines_to_next_cell": 0, 8 | "pycharm": { 9 | "name": "#%% md\n" 10 | } 11 | }, 12 | "source": [ 13 | "# Pandas Best Practices\n", 14 | "## 5 Tips for Better Pandas Code" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 
| "id": "fe77712e", 21 | "metadata": { 22 | "lines_to_next_cell": 2 23 | }, 24 | "outputs": [], 25 | "source": [] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "6188e883", 30 | "metadata": { 31 | "pycharm": { 32 | "name": "#%% md\n" 33 | } 34 | }, 35 | "source": [ 36 | "## About Matt Harrison @\\_\\_mharrison\\_\\_\n", 37 | "\n", 38 | "* Author of Effective Pandas, Machine Learning Pocket Reference, and Illustrated Guide to Python 3.\n", 39 | "* Advisor at Ponder (creators of Modin)\n", 40 | "* Corporate trainer at MetaSnake. Taught Pandas to 1000's of students.\n", 41 | "* Use coupon LIVE for 10% off Effective Pandas book or bundle ( https://store.metasnake.com )" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "8bf569d6", 48 | "metadata": { 49 | "lines_to_next_cell": 2, 50 | "pycharm": { 51 | "name": "#%%\n" 52 | } 53 | }, 54 | "outputs": [], 55 | "source": [] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "68b20887", 61 | "metadata": { 62 | "lines_to_next_cell": 2, 63 | "pycharm": { 64 | "name": "#%%\n" 65 | } 66 | }, 67 | "outputs": [], 68 | "source": [] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "b3fd6901", 74 | "metadata": { 75 | "lines_to_next_cell": 2, 76 | "pycharm": { 77 | "name": "#%%\n" 78 | } 79 | }, 80 | "outputs": [], 81 | "source": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "352e081c", 87 | "metadata": { 88 | "lines_to_next_cell": 2, 89 | "pycharm": { 90 | "name": "#%%\n" 91 | } 92 | }, 93 | "outputs": [], 94 | "source": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "61a77bda", 100 | "metadata": { 101 | "lines_to_next_cell": 2, 102 | "pycharm": { 103 | "name": "#%%\n" 104 | } 105 | }, 106 | "outputs": [], 107 | "source": [] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "c7a1b91f", 113 | "metadata": { 114 | 
"lines_to_next_cell": 2, 115 | "pycharm": { 116 | "name": "#%%\n" 117 | } 118 | }, 119 | "outputs": [], 120 | "source": [] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "5c67d9ed", 125 | "metadata": { 126 | "pycharm": { 127 | "name": "#%% md\n" 128 | } 129 | }, 130 | "source": [ 131 | "## Practice this on your data with your team!\n", 132 | "* Contact me matt@metasnake.com\n", 133 | "* Follow on Twitter @\\_\\_mharrison\\_\\_" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "b56b65e9", 140 | "metadata": { 141 | "lines_to_next_cell": 2, 142 | "pycharm": { 143 | "name": "#%%\n" 144 | } 145 | }, 146 | "outputs": [], 147 | "source": [] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "7c579886", 153 | "metadata": { 154 | "lines_to_next_cell": 2, 155 | "pycharm": { 156 | "name": "#%%\n" 157 | } 158 | }, 159 | "outputs": [], 160 | "source": [] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "id": "c38061e7", 165 | "metadata": { 166 | "pycharm": { 167 | "name": "#%% md\n" 168 | } 169 | }, 170 | "source": [ 171 | "## Outline\n", 172 | "\n", 173 | "* Load Data\n", 174 | "* Types\n", 175 | "* Chaining\n", 176 | "* Mutation\n", 177 | "* Apply\n", 178 | "* Aggregation" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "dc9d13b9", 184 | "metadata": { 185 | "pycharm": { 186 | "name": "#%% md\n" 187 | } 188 | }, 189 | "source": [ 190 | "## Imports" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "id": "ec04b162", 197 | "metadata": { 198 | "lines_to_next_cell": 2, 199 | "pycharm": { 200 | "name": "#%%\n" 201 | } 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "%matplotlib inline\n", 206 | "from IPython.display import display\n", 207 | "import numpy as np\n", 208 | "import pandas as pd\n", 209 | "import pyarrow\n", 210 | "\n", 211 | "import io\n", 212 | "import zipfile" 213 | ] 214 | }, 215 | { 216 | "cell_type": 
"code", 217 | "execution_count": null, 218 | "id": "29ef6997", 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "pd.__version__" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "ae401f97", 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "pyarrow.__version__" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "id": "1dea3558", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "id": "1d7ee1c0", 246 | "metadata": {}, 247 | "source": [ 248 | "## Data Preprocessing\n", 249 | "\n", 250 | "Don't run this code. I'm providing it here to show you where the data came from.\n", 251 | "(If you really want to run this, download the ZIP file and update the path.)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "id": "3a5723a0", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# https://gss.norc.org/get-the-data/spss\n", 262 | "# https://gss.norc.org/Documents/spss/gss_spss_with_codebook.zip\n", 263 | "# takes a few minutes on my computer to load\n", 264 | "path = '/mnt/c/Users/matt/Downloads/gss_spss_with_codebook.zip'\n", 265 | "with zipfile.ZipFile(path) as z:\n", 266 | " print(z.namelist())\n", 267 | " with open('gss.sav', mode='bw') as fout:\n", 268 | " fout.write(z.open('GSS7218_R3.sav').read())\n", 269 | " gss = pd.read_spss('gss.sav')" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "f8587141", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "!pip install pyreadstat" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "id": "f4b686fc", 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "%%time\n", 290 | "import pyreadstat\n", 291 | "gss, meta = pyreadstat.read_sav('gss.sav')" 292 | ] 293 | }, 294 | { 295 | 
"cell_type": "code", 296 | "execution_count": null, 297 | "id": "c73cd05b", 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "gss.shape" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "id": "1e2e1777", 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "gss.to_feather('gss.fth')" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "id": "46ec0b7c", 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "%%time\n", 322 | "raw = pd.read_feather('~/Dropbox/work/jupyter/gss.fth')" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "id": "f029dc45", 329 | "metadata": { 330 | "lines_to_next_cell": 0, 331 | "pycharm": { 332 | "name": "#%%\n" 333 | } 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "raw" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "id": "8cad5ba6", 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "# 6000 columns!\n", 348 | "raw.shape" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "id": "d08680b5", 355 | "metadata": { 356 | "lines_to_next_cell": 0 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n", 361 | " 'INCOME06','HONEST','TICKET']\n", 362 | "\n", 363 | "raw[cols].to_feather('honest.fth')" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "id": "506f2f1d", 370 | "metadata": { 371 | "lines_to_next_cell": 2 372 | }, 373 | "outputs": [], 374 | "source": [] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "id": "5ab74806", 379 | "metadata": {}, 380 | "source": [ 381 | "## Loading Data\n", 382 | "\n", 383 | "This is the data we will be using. Run this code!" 
384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "id": "07444860", 390 | "metadata": { 391 | "lines_to_next_cell": 2, 392 | "pycharm": { 393 | "name": "#%%\n" 394 | } 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "raw = pd.read_feather('honest.fth', dtype_backend='pyarrow')" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "id": "582255a1", 405 | "metadata": { 406 | "lines_to_next_cell": 2, 407 | "pycharm": { 408 | "name": "#%%\n" 409 | } 410 | }, 411 | "outputs": [], 412 | "source": [] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "id": "db1c15f3", 417 | "metadata": { 418 | "pycharm": { 419 | "name": "#%% md\n" 420 | } 421 | }, 422 | "source": [ 423 | "## My Cleanup\n", 424 | "See GSS_Codebook.pdf for explanation\n", 425 | "\n", 426 | "Columns:\n", 427 | "\n", 428 | "* YEAR\n", 429 | "* ID - RESPONDENT ID NUMBER\n", 430 | "* AGE - AGE OF RESPONDENT\n", 431 | "* HRS1 - NUMBER OF HOURS WORKED LAST WEEK\n", 432 | "* OCC - R'S CENSUS OCCUPATION CODE (1970) - Page 126 (VAR: OCC) see page 125 for notes APPENDIX F,G,H\n", 433 | " Appendix F - Page 3286\n", 434 | "* MAJOR1 - COLLEGE MAJOR 1\n", 435 | "* SEX - RESPONDENTS SEX\n", 436 | "* RACE - RACE OF RESPONDENT\n", 437 | "* BORN - WAS R BORN IN THIS COUNTRY\n", 438 | "* INCOME - TOTAL FAMILY INCOME 1970\n", 439 | "* INCOME06 - TOTAL FAMILY INCOME 2006\n", 440 | "* HONEST - HONEST\n", 441 | "* TICKET - EVER RECEIVED A TRAFFIC TICKET\n" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "65089c43", 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "cols = ['YEAR','ID','AGE', 'HRS1','OCC','MAJOR1','SEX','RACE','BORN','INCOME',\n", 452 | " 'INCOME06','HONEST','TICKET']\n", 453 | "\n", 454 | "raw[cols].isna().mean()*100" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "id": "67f1d8f4", 461 | "metadata": {}, 462 | "outputs": [], 463 
| "source": [ 464 | "(raw\n", 465 | " [cols]\n", 466 | " .isna()\n", 467 | " .mean()*100\n", 468 | ")" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "id": "df146d91", 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "MAJOR= '''RESPONSE PUNCH 1972-82 1982B 1983-87 1987B 1988-91 1993-98 2000-04 2006 2008 2010 2012 2014 2016 2018 ALL\n", 479 | "Accounting/bookkeeping 1 0 0 0 0 0 0 0 0 0 0 28 32 30 29 119\n", 480 | "Advertising 2 0 0 0 0 0 0 0 0 0 0 3 2 0 0 5\n", 481 | "Agriculture/horticulture 3 0 0 0 0 0 0 0 0 0 0 8 2 7 5 22\n", 482 | "Allied health 4 0 0 0 0 0 0 0 0 0 0 0 2 1 0 3\n", 483 | "Anthropology 5 0 0 0 0 0 0 0 0 0 0 3 5 1 1 10\n", 484 | "Architecture 6 0 0 0 0 0 0 0 0 0 0 2 3 5 3 13\n", 485 | "Art 7 0 0 0 0 0 0 0 0 0 0 6 7 11 10 34\n", 486 | "Biology 8 0 0 0 0 0 0 0 0 0 0 16 22 33 26 97\n", 487 | "Business administration 9 0 0 0 0 0 0 0 0 0 0 90 142 172 138 542\n", 488 | "Chemistry 11 0 0 0 0 0 0 0 0 0 0 5 8 10 4 27\n", 489 | "Communications/speech 12 0 0 0 0 0 0 0 0 0 0 20 18 26 18 82\n", 490 | "Comm. 
disorders 13 0 0 0 0 0 0 0 0 0 0 4 6 2 2 14\n", 491 | "Computer science 14 0 0 0 0 0 0 0 0 0 0 25 24 33 17 99\n", 492 | "Dentistry 15 0 0 0 0 0 0 0 0 0 0 2 4 3 5 14\n", 493 | "Education 16 0 0 0 0 0 0 0 0 0 0 73 91 97 79 340\n", 494 | "Economics 17 0 0 0 0 0 0 0 0 0 0 11 19 13 19 62\n", 495 | "Engineering 18 0 0 0 0 0 0 0 0 0 0 47 49 47 54 197\n", 496 | "English 19 0 0 0 0 0 0 0 0 0 0 23 26 27 24 100\n", 497 | "Finance 20 0 0 0 0 0 0 0 0 0 0 7 15 14 16 52\n", 498 | "Foreign language 21 0 0 0 0 0 0 0 0 0 0 4 8 6 5 23\n", 499 | "Forestry 22 0 0 0 0 0 0 0 0 0 0 1 0 3 0 4\n", 500 | "Geography 23 0 0 0 0 0 0 0 0 0 0 0 2 2 4 8\n", 501 | "Geology 24 0 0 0 0 0 0 0 0 0 0 1 3 4 2 10\n", 502 | "History 25 0 0 0 0 0 0 0 0 0 0 10 19 14 19 62\n", 503 | "Home economics 26 0 0 0 0 0 0 0 0 0 0 0 0 3 2 5\n", 504 | "Industry & techn 27 0 0 0 0 0 0 0 0 0 0 3 4 6 0 13\n", 505 | "Journalism 28 0 0 0 0 0 0 0 0 0 0 5 6 6 4 21\n", 506 | "Law 29 0 0 0 0 0 0 0 0 0 0 13 18 23 14 68\n", 507 | "Law enforcement 30 0 0 0 0 0 0 0 0 0 0 3 5 4 2 14\n", 508 | "Library science 31 0 0 0 0 0 0 0 0 0 0 4 5 2 3 14\n", 509 | "Marketing 32 0 0 0 0 0 0 0 0 0 0 11 15 13 12 51\n", 510 | "Mathematics 33 0 0 0 0 0 0 0 0 0 0 5 10 12 5 32\n", 511 | "Medicine 34 0 0 0 0 0 0 0 0 0 0 9 25 12 11 57\n", 512 | "Music 35 0 0 0 0 0 0 0 0 0 0 4 2 10 2 18\n", 513 | "Nursing 36 0 0 0 0 0 0 0 0 0 0 36 39 60 51 186\n", 514 | "Optometry 37 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 515 | "Pharmacy 38 0 0 0 0 0 0 0 0 0 0 2 5 4 4 15\n", 516 | "Philosophy 39 0 0 0 0 0 0 0 0 0 0 2 0 2 2 6\n", 517 | "Physical education 40 0 0 0 0 0 0 0 0 0 0 9 6 16 6 37\n", 518 | "Physics 41 0 0 0 0 0 0 0 0 0 0 3 6 7 4 20\n", 519 | "Psychology 42 0 0 0 0 0 0 0 0 0 0 32 32 34 29 127\n", 520 | "Political science/international relations 43 0 0 0 0 0 0 0 0 0 0 16 22 19 14 71\n", 521 | "Sociology 44 0 0 0 0 0 0 0 0 0 0 9 15 10 12 46\n", 522 | "Special education 45 0 0 0 0 0 0 0 0 0 0 5 3 5 2 15\n", 523 | "Theater arts 46 0 0 0 0 0 0 0 0 0 0 6 2 3 1 12\n", 524 | 
"Theology 47 0 0 0 0 0 0 0 0 0 0 6 6 13 8 33\n", 525 | "Veterinary medicine 48 0 0 0 0 0 0 0 0 0 0 1 5 3 4 13\n", 526 | "Liberal arts 49 0 0 0 0 0 0 0 0 0 0 8 16 12 10 46\n", 527 | "Other 50 0 0 0 0 0 0 0 0 0 0 8 10 21 27 66\n", 528 | "General sciences 51 0 0 0 0 0 0 0 0 0 0 10 13 15 14 52\n", 529 | "Social work 52 0 0 0 0 0 0 0 0 0 0 7 17 24 7 55\n", 530 | "General studies 53 0 0 0 0 0 0 0 0 0 0 2 5 7 7 21\n", 531 | "Other vocational 54 0 0 0 0 0 0 0 0 0 0 5 11 6 5 27\n", 532 | "Health 55 0 0 0 0 0 0 0 0 0 0 23 31 31 42 127\n", 533 | "Industrial Relations 56 0 0 0 0 0 0 0 0 0 0 1 0 0 3 4\n", 534 | "Child/Human/Family Development 57 0 0 0 0 0 0 0 0 0 0 11 3 7 7 28\n", 535 | "Food Science/Nutrition/Culinary Arts 58 0 0 0 0 0 0 0 0 0 0 3 6 9 9 27\n", 536 | "Environmental Science/Ecology 59 0 0 0 0 0 0 0 0 0 0 5 5 6 8 24\n", 537 | "Social Sciences 60 0 0 0 0 0 0 0 0 0 0 4 2 7 5 18\n", 538 | "Human Services/Human Resources 61 0 0 0 0 0 0 0 0 0 0 3 7 7 5 22\n", 539 | "Visual Arts/Graphic Design/Design and Drafting 62 0 0 0 0 0 0 0 0 0 0 3 8 9 10 30\n", 540 | "Fine Arts 63 0 0 0 0 0 0 0 0 0 0 4 5 5 6 20\n", 541 | "Humanities 64 0 0 0 0 0 0 0 0 0 0 0 2 0 1 3\n", 542 | "Ethnic studies 65 0 0 0 0 0 0 0 0 0 0 3 1 0 0 4\n", 543 | "Educational administration 66 0 0 0 0 0 0 0 0 0 0 3 4 8 9 24\n", 544 | "Television/Film 67 0 0 0 0 0 0 0 0 0 0 0 2 6 1 9\n", 545 | "Aviation/Aeronatics 68 0 0 0 0 0 0 0 0 0 0 2 1 1 3 7\n", 546 | "Statistics/Biostatistics 69 0 0 0 0 0 0 0 0 0 0 0 0 2 2 4\n", 547 | "Criminology/Criminal Justice 70 0 0 0 0 0 0 0 0 0 0 13 17 17 13 60\n", 548 | "Administrative Science/Public Administration 71 0 0 0 0 0 0 0 0 0 0 2 11 3 5 21\n", 549 | "Electronics 72 0 0 0 0 0 0 0 0 0 0 6 6 5 9 26\n", 550 | "Urban and Regional Planning 73 0 0 0 0 0 0 0 0 0 0 1 1 3 2 7\n", 551 | "Mechanics/Machine Trade 74 0 0 0 0 0 0 0 0 0 0 0 1 1 4 6\n", 552 | "Dance 75 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n", 553 | "Gerontology 76 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3\n", 554 | "Public Relations 77 0 
0 0 0 0 0 0 0 0 0 3 1 2 1 7\n", 555 | "Textiles/Cloth 78 0 0 0 0 0 0 0 0 0 0 3 4 0 0 7\n", 556 | "Parks and Recreation 79 0 0 0 0 0 0 0 0 0 0 1 2 1 0 4\n", 557 | "Information Technology 80 0 0 0 0 0 0 0 0 0 0 0 5 8 11 24\n", 558 | "Fashion 81 0 0 0 0 0 0 0 0 0 0 0 0 3 1 4\n", 559 | "Counseling 82 0 0 0 0 0 0 0 0 0 0 0 0 11 9 20\n", 560 | "Don't know/UNCODED 98 0 0 0 0 0 0 0 0 0 0 2 3 0 0 5\n", 561 | "No answer 99 0 0 0 0 0 0 0 0 0 0 0 1 5 3 9\n", 562 | "Not applicable 0 13626 354 7542 353 5907 10334 8394 4510 2023 2044 1263 1597 1795 1435 61177'''\n", 563 | "\n", 564 | "# copy paste slight tweak from page 186\n", 565 | "major_dict = {int(row.split()[-16]): ' '.join(row.split()[:-16]) for row in MAJOR.split('\\n')[1:]}\n", 566 | "major_dict" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "id": "b9d6c34d", 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "raw.MAJOR1.value_counts()" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "id": "74652b6d", 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "(raw\n", 587 | " [cols]\n", 588 | " .assign(\n", 589 | " MAJOR1=raw.MAJOR1.fillna(99).astype('int').replace(major_dict),\n", 590 | " SEX=raw.SEX#\n", 591 | " \n", 592 | " .astype(int)\n", 593 | " .replace({1:'Male', 2:'Female'}),\n", 594 | " RACE=raw.RACE.astype(int).replace({1:'White', 2:'Black', 3:'Other'}),\n", 595 | " OCC=raw.OCC.fillna(9999).astype(int),\n", 596 | " BORN=raw.BORN.fillna(4).astype(int).replace({1:'Yes', 2:'No', 3:'Don\\'t Know',\n", 597 | " 4:'No answer', 5:'Not applicable'}),\n", 598 | " INCOME=raw.INCOME.fillna(99).astype(int).replace({99:'No answer', **dict(enumerate(['Not applicable',\n", 599 | " 0,1000,3000,4000,5000,6000,\n", 600 | " 7000,8000,10000,15000,20000,25000,]))}),\n", 601 | " INCOME06=raw.INCOME06.fillna(26).astype(int).replace({26:'Refused', **dict(enumerate(['Not applicable',\n", 602 | " 0,1000,3000,4000,5000,6000,\n", 
603 | " 7000,8000,10000,12500,15000,\n", 604 | " 17500,20000,22500,25000,30_000,\n", 605 | " 35_000, 40_000, 50_000, 60_000,\n", 606 | " 75_000, 90_000, 110_000, 130_000,\n", 607 | " 150_000]))}),\n", 608 | " HONEST=raw.HONEST.fillna(9).astype(int).replace({1:'Most desirable', 2:'3 most desireable',\n", 609 | " 3:'Not mentioned', 4: '3 least desireable',\n", 610 | " 5: 'One least desireable',\n", 611 | " 9:'No answer'}),\n", 612 | " TICKET=raw.TICKET.fillna(9).astype(int).replace({1:'Yes', 2:'No', 3:'Refused', 9: 'No answer'}),\n", 613 | " )\n", 614 | " .astype({'YEAR':int, 'ID': 'uint16[pyarrow]'})\n", 615 | " .to_csv('GSS.csv')\n", 616 | ")" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "id": "a14afd45", 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "id": "ce8f0020", 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "id": "043a0085", 638 | "metadata": {}, 639 | "source": [ 640 | "## Types\n", 641 | "Getting the right types will enable analysis and correctness.\n" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "id": "5d6c6cd5", 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "%%time\n", 652 | "gss = pd.read_csv('GSS.csv', index_col=0, dtype_backend='pyarrow', engine='pyarrow')" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "id": "d26b8f6c", 659 | "metadata": { 660 | "pycharm": { 661 | "name": "#%%\n" 662 | } 663 | }, 664 | "outputs": [], 665 | "source": [ 666 | "gss.dtypes" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": null, 672 | "id": "19890585", 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "gss" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": null, 682 | "id": "852115fe", 683 
| "metadata": { 684 | "pycharm": { 685 | "name": "#%%\n" 686 | } 687 | }, 688 | "outputs": [], 689 | "source": [ 690 | "gss.memory_usage(deep=True)" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "id": "b5cfc13b", 697 | "metadata": { 698 | "pycharm": { 699 | "name": "#%%\n" 700 | } 701 | }, 702 | "outputs": [], 703 | "source": [ 704 | "# 36 M (pandas 1)\n", 705 | "# 8.6 M (Pandas 2)\n", 706 | "gss.memory_usage(deep=True).sum()" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": null, 712 | "id": "f1d1b51d", 713 | "metadata": { 714 | "lines_to_next_cell": 2, 715 | "pycharm": { 716 | "name": "#%%\n" 717 | } 718 | }, 719 | "outputs": [], 720 | "source": [] 721 | }, 722 | { 723 | "cell_type": "markdown", 724 | "id": "5fcab8c0", 725 | "metadata": { 726 | "pycharm": { 727 | "name": "#%% md\n" 728 | } 729 | }, 730 | "source": [ 731 | "## Ints" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": null, 737 | "id": "ad4eddc7", 738 | "metadata": { 739 | "pycharm": { 740 | "name": "#%%\n" 741 | } 742 | }, 743 | "outputs": [], 744 | "source": [ 745 | "gss.select_dtypes(int).describe()" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": null, 751 | "id": "ac323e5e", 752 | "metadata": { 753 | "pycharm": { 754 | "name": "#%%\n" 755 | } 756 | }, 757 | "outputs": [], 758 | "source": [ 759 | "# chaining\n", 760 | "(gss\n", 761 | " .select_dtypes(int)\n", 762 | " .describe()\n", 763 | ")" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": null, 769 | "id": "2351d051", 770 | "metadata": { 771 | "pycharm": { 772 | "name": "#%%\n" 773 | } 774 | }, 775 | "outputs": [], 776 | "source": [ 777 | "# can YEAR be an int8?\n", 778 | "# Do completion on int\n", 779 | "np.iinfo(np.int8)" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "id": "323df8fb", 786 | "metadata": { 787 | "pycharm": { 788 | "name": "#%%\n" 
789 | } 790 | }, 791 | "outputs": [], 792 | "source": [ 793 | "np.iinfo(np.uint8)" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "id": "bb063be4", 800 | "metadata": { 801 | "pycharm": { 802 | "name": "#%%\n" 803 | } 804 | }, 805 | "outputs": [], 806 | "source": [ 807 | "np.iinfo(np.uint16)" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": null, 813 | "id": "d0fab927", 814 | "metadata": { 815 | "pycharm": { 816 | "name": "#%%\n" 817 | } 818 | }, 819 | "outputs": [], 820 | "source": [ 821 | "# chaining\n", 822 | "(gss\n", 823 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' })\n", 824 | " .select_dtypes(['uint16'])\n", 825 | " .describe()\n", 826 | ")" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": null, 832 | "id": "a1d0ed15", 833 | "metadata": { 834 | "lines_to_next_cell": 2, 835 | "pycharm": { 836 | "name": "#%%\n" 837 | } 838 | }, 839 | "outputs": [], 840 | "source": [ 841 | "# chaining\n", 842 | "# use 'integer' to see all int-like columns\n", 843 | "(gss\n", 844 | " .astype({#'YEAR': 'uint16[pyarrow]',\n", 845 | " 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n", 846 | " .select_dtypes(['integer']) # see https://numpy.org/doc/stable/reference/arrays.scalars.html\n", 847 | " .describe()\n", 848 | ")" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": null, 854 | "id": "c5d4c3e1", 855 | "metadata": { 856 | "lines_to_next_cell": 2, 857 | "pycharm": { 858 | "name": "#%%\n" 859 | } 860 | }, 861 | "outputs": [], 862 | "source": [ 863 | "# Inspect memory usage\n", 864 | "(gss\n", 865 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]' }) \n", 866 | " .memory_usage(deep=True)\n", 867 | " .sum() # was 36M\n", 868 | ")" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": null, 874 | "id": "8ad6e733", 875 | "metadata": { 876 | 
"lines_to_next_cell": 2, 877 | "pycharm": { 878 | "name": "#%%\n" 879 | } 880 | }, 881 | "outputs": [], 882 | "source": [] 883 | }, 884 | { 885 | "cell_type": "markdown", 886 | "id": "f339194e", 887 | "metadata": {}, 888 | "source": [ 889 | "## Int Exercise\n", 890 | "* Try converting *YEAR* to `'int8'`. What do the values look like?\n", 891 | "* Try converting *YEAR* to `'int8[pyarrow]'`. What do the values look like?" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": null, 897 | "id": "908545d1", 898 | "metadata": {}, 899 | "outputs": [], 900 | "source": [] 901 | }, 902 | { 903 | "cell_type": "code", 904 | "execution_count": null, 905 | "id": "18a3bf52", 906 | "metadata": {}, 907 | "outputs": [], 908 | "source": [] 909 | }, 910 | { 911 | "cell_type": "markdown", 912 | "id": "b09f89c6", 913 | "metadata": {}, 914 | "source": [ 915 | "## Floats" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": null, 921 | "id": "e7fed87e", 922 | "metadata": { 923 | "pycharm": { 924 | "name": "#%%\n" 925 | } 926 | }, 927 | "outputs": [], 928 | "source": [ 929 | "(gss\n", 930 | ".select_dtypes('float'))" 931 | ] 932 | }, 933 | { 934 | "cell_type": "code", 935 | "execution_count": null, 936 | "id": "49265726", 937 | "metadata": { 938 | "pycharm": { 939 | "name": "#%%\n" 940 | } 941 | }, 942 | "outputs": [], 943 | "source": [ 944 | "# surprise! age and hours worked look int-like\n", 945 | "gss.HRS1.describe()" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "id": "cd39df3c", 952 | "metadata": { 953 | "pycharm": { 954 | "name": "#%%\n" 955 | } 956 | }, 957 | "outputs": [], 958 | "source": [ 959 | "# oops! 
missing values\n", 960 | "gss.HRS1.value_counts(dropna=False)" 961 | ] 962 | }, 963 | { 964 | "cell_type": "code", 965 | "execution_count": null, 966 | "id": "31a67da2", 967 | "metadata": { 968 | "pycharm": { 969 | "name": "#%%\n" 970 | } 971 | }, 972 | "outputs": [], 973 | "source": [ 974 | "# where are they missing?\n", 975 | "(gss\n", 976 | " .query('HRS1.isna()')\n", 977 | ")" 978 | ] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": null, 983 | "id": "e697d070", 984 | "metadata": { 985 | "pycharm": { 986 | "name": "#%%\n" 987 | } 988 | }, 989 | "outputs": [], 990 | "source": [ 991 | "# where are they missing?\n", 992 | "(gss\n", 993 | " .query('AGE.isna()')\n", 994 | ")" 995 | ] 996 | }, 997 | { 998 | "cell_type": "code", 999 | "execution_count": null, 1000 | "id": "a9166e1a", 1001 | "metadata": { 1002 | "pycharm": { 1003 | "name": "#%%\n" 1004 | } 1005 | }, 1006 | "outputs": [], 1007 | "source": [ 1008 | "# where are they missing?\n", 1009 | "# It turns out that ID is not consistent across years\n", 1010 | "(gss\n", 1011 | " .query('ID == 229')\n", 1012 | ")" 1013 | ] 1014 | }, 1015 | { 1016 | "cell_type": "code", 1017 | "execution_count": null, 1018 | "id": "81a8c902", 1019 | "metadata": { 1020 | "lines_to_next_cell": 2, 1021 | "pycharm": { 1022 | "name": "#%%\n" 1023 | } 1024 | }, 1025 | "outputs": [], 1026 | "source": [ 1027 | "# Convert to integers\n", 1028 | "(gss\n", 1029 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1030 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n", 1031 | ")" 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": null, 1037 | "id": "d3388e2c", 1038 | "metadata": { 1039 | "lines_to_next_cell": 2, 1040 | "pycharm": { 1041 | "name": "#%%\n" 1042 | } 1043 | }, 1044 | "outputs": [], 1045 | "source": [ 1046 | "(gss\n", 1047 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1048 | " 'HRS1': 
'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]'})\n", 1049 | " .memory_usage(deep=True)\n", 1050 | " .sum() # was 36M \n", 1051 | ")" 1052 | ] 1053 | }, 1054 | { 1055 | "cell_type": "code", 1056 | "execution_count": null, 1057 | "id": "9bb70ac2", 1058 | "metadata": { 1059 | "lines_to_next_cell": 2, 1060 | "pycharm": { 1061 | "name": "#%%\n" 1062 | } 1063 | }, 1064 | "outputs": [], 1065 | "source": [] 1066 | }, 1067 | { 1068 | "cell_type": "markdown", 1069 | "id": "75bfd716", 1070 | "metadata": {}, 1071 | "source": [ 1072 | "## Float Exercise\n", 1073 | "\n", 1074 | "* What is the mean of the numeric columns?\n", 1075 | "* How many values are missing in the numeric columns?" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "code", 1080 | "execution_count": null, 1081 | "id": "e3e30c4e", 1082 | "metadata": { 1083 | "lines_to_next_cell": 2 1084 | }, 1085 | "outputs": [], 1086 | "source": [] 1087 | }, 1088 | { 1089 | "cell_type": "markdown", 1090 | "id": "a136fe09", 1091 | "metadata": {}, 1092 | "source": [ 1093 | "## Objects" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "code", 1098 | "execution_count": null, 1099 | "id": "4f8b0477", 1100 | "metadata": { 1101 | "pycharm": { 1102 | "name": "#%%\n" 1103 | } 1104 | }, 1105 | "outputs": [], 1106 | "source": [ 1107 | "# pandas 1.x\n", 1108 | "(gss\n", 1109 | " .select_dtypes(object)\n", 1110 | ")" 1111 | ] 1112 | }, 1113 | { 1114 | "cell_type": "code", 1115 | "execution_count": null, 1116 | "id": "f80da8d2", 1117 | "metadata": { 1118 | "pycharm": { 1119 | "name": "#%%\n" 1120 | } 1121 | }, 1122 | "outputs": [], 1123 | "source": [ 1124 | "# pandas 2\n", 1125 | "(gss\n", 1126 | " .select_dtypes('string') # str doesn't work\n", 1127 | ")" 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "code", 1132 | "execution_count": null, 1133 | "id": "7f762143", 1134 | "metadata": { 1135 | "pycharm": { 1136 | "name": "#%%\n" 1137 | } 1138 | }, 1139 | "outputs": [], 1140 | "source": [ 1141 | "# My goto method - .value_counts\n", 1142 | "# 
looks categorical\n", 1143 | "(gss.MAJOR1.value_counts(dropna=False))" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "code", 1148 | "execution_count": null, 1149 | "id": "55c21c7a", 1150 | "metadata": { 1151 | "lines_to_next_cell": 2, 1152 | "pycharm": { 1153 | "name": "#%%\n" 1154 | } 1155 | }, 1156 | "outputs": [], 1157 | "source": [ 1158 | "(gss\n", 1159 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1160 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1161 | " 'MAJOR1': 'category'})\n", 1162 | " .memory_usage(deep=True)\n", 1163 | " .sum() # was 36M \n", 1164 | ")" 1165 | ] 1166 | }, 1167 | { 1168 | "cell_type": "code", 1169 | "execution_count": null, 1170 | "id": "69969c1b", 1171 | "metadata": {}, 1172 | "outputs": [], 1173 | "source": [ 1174 | "(gss\n", 1175 | " .select_dtypes(object)\n", 1176 | " .columns\n", 1177 | ")" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "code", 1182 | "execution_count": null, 1183 | "id": "f5d51601", 1184 | "metadata": { 1185 | "lines_to_next_cell": 0, 1186 | "pycharm": { 1187 | "name": "#%%\n" 1188 | } 1189 | }, 1190 | "outputs": [], 1191 | "source": [ 1192 | "# wow!\n", 1193 | "(gss\n", 1194 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1195 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1196 | " 'MAJOR1': 'category',\n", 1197 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1198 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}}) \n", 1199 | " .memory_usage(deep=True)\n", 1200 | " .sum() # was 36M \n", 1201 | ")" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "code", 1206 | "execution_count": null, 1207 | "id": "17206364", 1208 | "metadata": {}, 1209 | "outputs": [], 1210 | "source": [] 1211 | }, 1212 | { 1213 | "cell_type": "code", 1214 | "execution_count": null, 1215 | "id": "50bf3fa6", 1216 | "metadata": { 1217 | "lines_to_next_cell": 2 1218 | }, 1219 | "outputs": [], 1220 | "source": [] 
1221 | }, 1222 | { 1223 | "cell_type": "markdown", 1224 | "id": "246041ae", 1225 | "metadata": {}, 1226 | "source": [ 1227 | "## Category Exercises\n", 1228 | "* There is a `.cat` attribute on the category columns. What can you do with this attribute? (Use `dir` or tab completion to inspect).\n", 1229 | "* Categories can be ordered. How do you order *INCOME*?\n", 1230 | "* Order the *HONEST* column." 1231 | ] 1232 | }, 1233 | { 1234 | "cell_type": "code", 1235 | "execution_count": null, 1236 | "id": "f543c52c", 1237 | "metadata": {}, 1238 | "outputs": [], 1239 | "source": [] 1240 | }, 1241 | { 1242 | "cell_type": "code", 1243 | "execution_count": null, 1244 | "id": "338e5ba3", 1245 | "metadata": {}, 1246 | "outputs": [], 1247 | "source": [] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": null, 1252 | "id": "3fb313d6", 1253 | "metadata": {}, 1254 | "outputs": [], 1255 | "source": [] 1256 | }, 1257 | { 1258 | "cell_type": "code", 1259 | "execution_count": null, 1260 | "id": "f1d75a84", 1261 | "metadata": {}, 1262 | "outputs": [], 1263 | "source": [] 1264 | }, 1265 | { 1266 | "cell_type": "code", 1267 | "execution_count": null, 1268 | "id": "5c87e18d", 1269 | "metadata": {}, 1270 | "outputs": [], 1271 | "source": [] 1272 | }, 1273 | { 1274 | "cell_type": "code", 1275 | "execution_count": null, 1276 | "id": "85aaccbb", 1277 | "metadata": {}, 1278 | "outputs": [], 1279 | "source": [] 1280 | }, 1281 | { 1282 | "cell_type": "code", 1283 | "execution_count": null, 1284 | "id": "8a321513", 1285 | "metadata": {}, 1286 | "outputs": [], 1287 | "source": [] 1288 | }, 1289 | { 1290 | "cell_type": "markdown", 1291 | "id": "8af7a3d4", 1292 | "metadata": {}, 1293 | "source": [ 1294 | "## Make a Function" 1295 | ] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "execution_count": null, 1300 | "id": "cb9a32b3", 1301 | "metadata": { 1302 | "lines_to_next_cell": 2, 1303 | "pycharm": { 1304 | "name": "#%%\n" 1305 | } 1306 | }, 1307 | "outputs": [], 1308 | 
"source": [ 1309 | "# a glorious function\n", 1310 | "# add ordered categories to this\n", 1311 | "def tweak_gss(gss):\n", 1312 | " return (gss\n", 1313 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1314 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1315 | " 'MAJOR1': 'category',\n", 1316 | " **{col: 'category' for col in ['SEX', 'RACE', 'BORN', \n", 1317 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1318 | " )\n", 1319 | "\n", 1320 | "tweak_gss(gss)" 1321 | ] 1322 | }, 1323 | { 1324 | "cell_type": "markdown", 1325 | "id": "1c615739", 1326 | "metadata": {}, 1327 | "source": [ 1328 | "## Function Exercise\n", 1329 | "* Rearrange your notebook. Put the imports, code to load raw data, and tweak function at the top of the notebook. Restart the kernel and validate that your code works." 1330 | ] 1331 | }, 1332 | { 1333 | "cell_type": "code", 1334 | "execution_count": null, 1335 | "id": "c61b9f0a", 1336 | "metadata": {}, 1337 | "outputs": [], 1338 | "source": [] 1339 | }, 1340 | { 1341 | "cell_type": "code", 1342 | "execution_count": null, 1343 | "id": "6589902c", 1344 | "metadata": {}, 1345 | "outputs": [], 1346 | "source": [] 1347 | }, 1348 | { 1349 | "cell_type": "markdown", 1350 | "id": "b350e12e", 1351 | "metadata": { 1352 | "lines_to_next_cell": 2 1353 | }, 1354 | "source": [ 1355 | "## Fix Column Names" 1356 | ] 1357 | }, 1358 | { 1359 | "cell_type": "code", 1360 | "execution_count": null, 1361 | "id": "99b39238", 1362 | "metadata": { 1363 | "lines_to_next_cell": 0, 1364 | "pycharm": { 1365 | "name": "#%%\n" 1366 | } 1367 | }, 1368 | "outputs": [], 1369 | "source": [ 1370 | "# a glorious function\n", 1371 | "def tweak_gss(gss):\n", 1372 | " return (gss\n", 1373 | " .astype({'YEAR': 'uint16[pyarrow]', 'ID': 'uint16[pyarrow]', 'OCC': 'uint16[pyarrow]',\n", 1374 | " 'HRS1': 'uint8[pyarrow]', 'AGE': 'uint8[pyarrow]',\n", 1375 | " 'MAJOR1': 'category',\n", 1376 | " **{col: 'category' for col in ['SEX', 
'RACE', 'BORN', \n", 1377 | " 'INCOME', 'INCOME06', 'HONEST','TICKET']}})\n", 1378 | " .rename(columns={'YEAR': 'year', 'ID': 'year_id', 'AGE':'age', \n", 1379 | " 'HRS1': 'hours_worked', 'OCC': 'occupation', \n", 1380 | " 'MAJOR1': 'college_major', 'SEX':'sex', \n", 1381 | " 'RACE':'race', 'BORN':'born_in_US',\n", 1382 | " 'INCOME':'income_1970', 'INCOME06': 'income_2006',\n", 1383 | " 'HONEST':'honesty_rank',\n", 1384 | " 'TICKET':'traffic_ticket'})\n", 1385 | " )\n", 1386 | "\n", 1387 | "tweak_gss(gss)" 1388 | ] 1389 | }, 1390 | { 1391 | "cell_type": "code", 1392 | "execution_count": null, 1393 | "id": "84ecc0de", 1394 | "metadata": { 1395 | "lines_to_next_cell": 2, 1396 | "pycharm": { 1397 | "name": "#%%\n" 1398 | } 1399 | }, 1400 | "outputs": [], 1401 | "source": [] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": null, 1406 | "id": "bf14ec3f", 1407 | "metadata": { 1408 | "lines_to_next_cell": 2, 1409 | "pycharm": { 1410 | "name": "#%%\n" 1411 | } 1412 | }, 1413 | "outputs": [], 1414 | "source": [] 1415 | }, 1416 | { 1417 | "cell_type": "markdown", 1418 | "id": "003b96b9", 1419 | "metadata": { 1420 | "pycharm": { 1421 | "name": "#%% md\n" 1422 | } 1423 | }, 1424 | "source": [ 1425 | "## Chain\n", 1426 | "\n", 1427 | "Chaining is also called \"flow\" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.\n", 1428 | "\n", 1429 | "The chain should read like a recipe of ordered steps.\n", 1430 | "\n", 1431 | "(BTW, this is actually what we did above.)\n", 1432 | "\n", 1433 | "