├── .gitignore ├── 01-notes ├── 01-pandas_intro.ipynb ├── 02-assemble.ipynb ├── 03-missing.ipynb ├── 04-tidy.ipynb ├── 05-data_types.ipynb ├── 06-apply.ipynb ├── 07-groupby.ipynb ├── 08a-statsmodels_linear.ipynb ├── 08b-statsmodels_logistic.ipynb └── 09a-sklearn_linear.ipynb ├── 02-lesson ├── 01-intro.ipynb ├── 02-assmeble.ipynb ├── 03-missing.ipynb ├── 04-tidy.ipynb ├── 05-data_types.ipynb ├── 06-apply.ipynb └── 07-groupby.ipynb ├── LICENSE ├── README.md ├── data ├── billboard.csv ├── concat_1.csv ├── concat_2.csv ├── concat_3.csv ├── doctors.csv ├── doctors_unicode.csv ├── ebola_country_timeseries.csv ├── gapminder.tsv ├── pew.csv ├── preg.csv ├── preg2.csv ├── survey_person.csv ├── survey_site.csv ├── survey_survey.csv ├── survey_visited.csv ├── tb.csv └── weather.csv ├── output └── .gitkeep └── test_installation.py /.gitignore: -------------------------------------------------------------------------------- 1 | output/* 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | -------------------------------------------------------------------------------- /01-notes/05-data_types.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import seaborn as sns" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "tips = sns.load_dataset('tips')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "\n", 36 | "RangeIndex: 244 entries, 0 to 243\n", 37 | "Data columns (total 7 columns):\n", 38 | "total_bill 
244 non-null float64\n", 39 | "tip 244 non-null float64\n", 40 | "sex 244 non-null category\n", 41 | "smoker 244 non-null category\n", 42 | "day 244 non-null category\n", 43 | "time 244 non-null category\n", 44 | "size 244 non-null int64\n", 45 | "dtypes: category(4), float64(2), int64(1)\n", 46 | "memory usage: 7.2 KB\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "tips.info()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "total_bill float64\n", 63 | "tip float64\n", 64 | "sex category\n", 65 | "smoker category\n", 66 | "day category\n", 67 | "time category\n", 68 | "size int64\n", 69 | "dtype: object" 70 | ] 71 | }, 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "tips.dtypes" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# convert to a string (from category)\n", 90 | "tips['sex_str'] = tips['sex'].astype(str)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/html": [ 101 | "
\n", 102 | "\n", 115 | "\n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | "
total_billtipsexsmokerdaytimesizesex_str
016.991.01FemaleNoSunDinner2Female
110.341.66MaleNoSunDinner3Male
221.013.50MaleNoSunDinner3Male
323.683.31MaleNoSunDinner2Male
424.593.61FemaleNoSunDinner4Female
\n", 187 | "
" 188 | ], 189 | "text/plain": [ 190 | " total_bill tip sex smoker day time size sex_str\n", 191 | "0 16.99 1.01 Female No Sun Dinner 2 Female\n", 192 | "1 10.34 1.66 Male No Sun Dinner 3 Male\n", 193 | "2 21.01 3.50 Male No Sun Dinner 3 Male\n", 194 | "3 23.68 3.31 Male No Sun Dinner 2 Male\n", 195 | "4 24.59 3.61 Female No Sun Dinner 4 Female" 196 | ] 197 | }, 198 | "execution_count": 6, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "tips.head()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 7, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/html": [ 215 | "
\n", 216 | "\n", 229 | "\n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | "
total_billtipsexsmokerdaytimesizesex_strtotal_bill_str
016.991.01FemaleNoSunDinner2Female16.99
110.341.66MaleNoSunDinner3Male10.34
221.013.50MaleNoSunDinner3Male21.01
323.683.31MaleNoSunDinner2Male23.68
424.593.61FemaleNoSunDinner4Female24.59
\n", 307 | "
" 308 | ], 309 | "text/plain": [ 310 | " total_bill tip sex smoker day time size sex_str total_bill_str\n", 311 | "0 16.99 1.01 Female No Sun Dinner 2 Female 16.99\n", 312 | "1 10.34 1.66 Male No Sun Dinner 3 Male 10.34\n", 313 | "2 21.01 3.50 Male No Sun Dinner 3 Male 21.01\n", 314 | "3 23.68 3.31 Male No Sun Dinner 2 Male 23.68\n", 315 | "4 24.59 3.61 Female No Sun Dinner 4 Female 24.59" 316 | ] 317 | }, 318 | "execution_count": 7, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "# convert float to str\n", 325 | "tips['total_bill_str'] = tips['total_bill'].astype(str)\n", 326 | "tips.head()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 8, 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/plain": [ 337 | "total_bill float64\n", 338 | "tip float64\n", 339 | "sex category\n", 340 | "smoker category\n", 341 | "day category\n", 342 | "time category\n", 343 | "size int64\n", 344 | "sex_str object\n", 345 | "total_bill_str object\n", 346 | "dtype: object" 347 | ] 348 | }, 349 | "execution_count": 8, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "tips.dtypes" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "## to numeric" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 9, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "name": "stderr", 372 | "output_type": "stream", 373 | "text": [ 374 | "/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py:517: SettingWithCopyWarning: \n", 375 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 376 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 377 | "\n", 378 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 379 | " self.obj[item] = s\n" 380 | ] 
381 | }, 382 | { 383 | "data": { 384 | "text/html": [ 385 | "
\n", 386 | "\n", 399 | "\n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | "
total_billtipsexsmokerdaytimesizesex_strtotal_bill_str
016.991.01FemaleNoSunDinner2Female16.99
1missing1.66MaleNoSunDinner3Male10.34
221.013.50MaleNoSunDinner3Male21.01
3missing3.31MaleNoSunDinner2Male23.68
424.593.61FemaleNoSunDinner4Female24.59
5missing4.71MaleNoSunDinner4Male25.29
68.772.00MaleNoSunDinner2Male8.77
7missing3.12MaleNoSunDinner4Male26.88
815.041.96MaleNoSunDinner2Male15.04
914.783.23MaleNoSunDinner2Male14.78
\n", 537 | "
" 538 | ], 539 | "text/plain": [ 540 | " total_bill tip sex smoker day time size sex_str total_bill_str\n", 541 | "0 16.99 1.01 Female No Sun Dinner 2 Female 16.99\n", 542 | "1 missing 1.66 Male No Sun Dinner 3 Male 10.34\n", 543 | "2 21.01 3.50 Male No Sun Dinner 3 Male 21.01\n", 544 | "3 missing 3.31 Male No Sun Dinner 2 Male 23.68\n", 545 | "4 24.59 3.61 Female No Sun Dinner 4 Female 24.59\n", 546 | "5 missing 4.71 Male No Sun Dinner 4 Male 25.29\n", 547 | "6 8.77 2.00 Male No Sun Dinner 2 Male 8.77\n", 548 | "7 missing 3.12 Male No Sun Dinner 4 Male 26.88\n", 549 | "8 15.04 1.96 Male No Sun Dinner 2 Male 15.04\n", 550 | "9 14.78 3.23 Male No Sun Dinner 2 Male 14.78" 551 | ] 552 | }, 553 | "execution_count": 9, 554 | "metadata": {}, 555 | "output_type": "execute_result" 556 | } 557 | ], 558 | "source": [ 559 | "tips_sub_miss = tips.head(10)\n", 560 | "tips_sub_miss.loc[[1, 3, 5, 7], 'total_bill'] = 'missing'\n", 561 | "tips_sub_miss.head(10)" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 10, 567 | "metadata": { 568 | "scrolled": true 569 | }, 570 | "outputs": [ 571 | { 572 | "ename": "ValueError", 573 | "evalue": "could not convert string to float: 'missing'", 574 | "output_type": "error", 575 | "traceback": [ 576 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 577 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 578 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# will give value error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtips_sub_miss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total_bill'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 579 | 
"\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnew_arg_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_arg_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 91\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 92\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_deprecate_kwarg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 580 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors, **kwargs)\u001b[0m\n\u001b[1;32m 3297\u001b[0m \u001b[0;31m# else, only a single dtype is given\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3298\u001b[0m new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,\n\u001b[0;32m-> 3299\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 3300\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3301\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 581 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in 
\u001b[0;36mastype\u001b[0;34m(self, dtype, **kwargs)\u001b[0m\n\u001b[1;32m 3222\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3223\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3224\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'astype'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3225\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3226\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mconvert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 582 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)\u001b[0m\n\u001b[1;32m 3089\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3090\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'mgr'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3091\u001b[0;31m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
3092\u001b[0m \u001b[0mresult_blocks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_extend_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mapplied\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult_blocks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3093\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 583 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors, values, **kwargs)\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'raise'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 470\u001b[0m return self._astype(dtype, copy=copy, errors=errors, values=values,\n\u001b[0;32m--> 471\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 472\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 473\u001b[0m def _astype(self, dtype, copy=False, errors='raise', values=None,\n", 584 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m_astype\u001b[0;34m(self, dtype, copy, errors, values, klass, mgr, **kwargs)\u001b[0m\n\u001b[1;32m 519\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0;31m# _astype_nansafe works fine with 1-d only\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mastype_nansafe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 522\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 585 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/dtypes/cast.py\u001b[0m in \u001b[0;36mastype_nansafe\u001b[0;34m(arr, dtype, copy)\u001b[0m\n\u001b[1;32m 634\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 636\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 637\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 638\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 586 | "\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'missing'" 587 | ] 588 | } 589 | ], 590 | "source": [ 591 | "# will give value error\n", 592 | "tips_sub_miss['total_bill'].astype(float)" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 11, 598 | "metadata": { 599 | "scrolled": true 600 | }, 601 | "outputs": [ 602 | { 603 | "ename": "ValueError", 604 | 
"evalue": "Unable to parse string \"missing\" at position 1", 605 | "output_type": "error", 606 | "traceback": [ 607 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 608 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 609 | "\u001b[0;32mpandas/_libs/src/inference.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.maybe_convert_numeric (pandas/_libs/lib.c:55951)\u001b[0;34m()\u001b[0m\n", 610 | "\u001b[0;31mValueError\u001b[0m: Unable to parse string \"missing\"", 611 | "\nDuring handling of the above exception, another exception occurred:\n", 612 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 613 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# still causes error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtips_sub_miss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total_bill'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 614 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/tools/numeric.py\u001b[0m in \u001b[0;36mto_numeric\u001b[0;34m(arg, errors, downcast)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0mcoerce_numeric\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'ignore'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'raise'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 125\u001b[0m values = lib.maybe_convert_numeric(values, set(),\n\u001b[0;32m--> 126\u001b[0;31m coerce_numeric=coerce_numeric)\n\u001b[0m\u001b[1;32m 127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 128\u001b[0m 
\u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 615 | "\u001b[0;32mpandas/_libs/src/inference.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.maybe_convert_numeric (pandas/_libs/lib.c:56433)\u001b[0;34m()\u001b[0m\n", 616 | "\u001b[0;31mValueError\u001b[0m: Unable to parse string \"missing\" at position 1" 617 | ] 618 | } 619 | ], 620 | "source": [ 621 | "# still causes error\n", 622 | "pd.to_numeric(tips_sub_miss['total_bill'])" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 12, 628 | "metadata": {}, 629 | "outputs": [ 630 | { 631 | "data": { 632 | "text/plain": [ 633 | "0 16.99\n", 634 | "1 missing\n", 635 | "2 21.01\n", 636 | "3 missing\n", 637 | "4 24.59\n", 638 | "5 missing\n", 639 | "6 8.77\n", 640 | "7 missing\n", 641 | "8 15.04\n", 642 | "9 14.78\n", 643 | "Name: total_bill, dtype: object" 644 | ] 645 | }, 646 | "execution_count": 12, 647 | "metadata": {}, 648 | "output_type": "execute_result" 649 | } 650 | ], 651 | "source": [ 652 | "pd.to_numeric(tips_sub_miss['total_bill'], errors='ignore')" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 13, 658 | "metadata": {}, 659 | "outputs": [ 660 | { 661 | "ename": "ValueError", 662 | "evalue": "Unable to parse string \"missing\" at position 1", 663 | "output_type": "error", 664 | "traceback": [ 665 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 666 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 667 | "\u001b[0;32mpandas/_libs/src/inference.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.maybe_convert_numeric (pandas/_libs/lib.c:55951)\u001b[0;34m()\u001b[0m\n", 668 | "\u001b[0;31mValueError\u001b[0m: Unable to parse string \"missing\"", 669 | "\nDuring handling of the above exception, another exception occurred:\n", 670 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 671 | "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# raise is the default\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtips_sub_miss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total_bill'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'raise'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 672 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/tools/numeric.py\u001b[0m in \u001b[0;36mto_numeric\u001b[0;34m(arg, errors, downcast)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0mcoerce_numeric\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'ignore'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'raise'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 125\u001b[0m values = lib.maybe_convert_numeric(values, set(),\n\u001b[0;32m--> 126\u001b[0;31m coerce_numeric=coerce_numeric)\n\u001b[0m\u001b[1;32m 127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 128\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 673 | "\u001b[0;32mpandas/_libs/src/inference.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.maybe_convert_numeric (pandas/_libs/lib.c:56433)\u001b[0;34m()\u001b[0m\n", 674 | "\u001b[0;31mValueError\u001b[0m: Unable to parse string \"missing\" at position 1" 675 | ] 676 | } 677 | ], 678 | "source": [ 679 | "# raise is the default\n", 680 | "pd.to_numeric(tips_sub_miss['total_bill'], errors='raise')" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 14, 686 | "metadata": {}, 687 | "outputs": [ 688 | { 
689 | "data": { 690 | "text/plain": [ 691 | "0 16.99\n", 692 | "1 NaN\n", 693 | "2 21.01\n", 694 | "3 NaN\n", 695 | "4 24.59\n", 696 | "5 NaN\n", 697 | "6 8.77\n", 698 | "7 NaN\n", 699 | "8 15.04\n", 700 | "9 14.78\n", 701 | "Name: total_bill, dtype: float64" 702 | ] 703 | }, 704 | "execution_count": 14, 705 | "metadata": {}, 706 | "output_type": "execute_result" 707 | } 708 | ], 709 | "source": [ 710 | "# coerce to NaN\n", 711 | "pd.to_numeric(tips_sub_miss['total_bill'], errors='coerce')" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "metadata": { 718 | "collapsed": true 719 | }, 720 | "outputs": [], 721 | "source": [] 722 | } 723 | ], 724 | "metadata": { 725 | "kernelspec": { 726 | "display_name": "Python 3", 727 | "language": "python", 728 | "name": "python3" 729 | }, 730 | "language_info": { 731 | "codemirror_mode": { 732 | "name": "ipython", 733 | "version": 3 734 | }, 735 | "file_extension": ".py", 736 | "mimetype": "text/x-python", 737 | "name": "python", 738 | "nbconvert_exporter": "python", 739 | "pygments_lexer": "ipython3", 740 | "version": "3.6.1" 741 | } 742 | }, 743 | "nbformat": 4, 744 | "nbformat_minor": 2 745 | } 746 | -------------------------------------------------------------------------------- /01-notes/07-groupby.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "gapminder = pd.read_csv('../data/gapminder.tsv', sep='\\t')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/html": [ 33 | "
\n", 34 | "\n", 47 | "\n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | "
countrycontinentyearlifeExppopgdpPercap
0AfghanistanAsia195228.8018425333779.445314
1AfghanistanAsia195730.3329240934820.853030
2AfghanistanAsia196231.99710267083853.100710
3AfghanistanAsia196734.02011537966836.197138
4AfghanistanAsia197236.08813079460739.981106
\n", 107 | "
" 108 | ], 109 | "text/plain": [ 110 | " country continent year lifeExp pop gdpPercap\n", 111 | "0 Afghanistan Asia 1952 28.801 8425333 779.445314\n", 112 | "1 Afghanistan Asia 1957 30.332 9240934 820.853030\n", 113 | "2 Afghanistan Asia 1962 31.997 10267083 853.100710\n", 114 | "3 Afghanistan Asia 1967 34.020 11537966 836.197138\n", 115 | "4 Afghanistan Asia 1972 36.088 13079460 739.981106" 116 | ] 117 | }, 118 | "execution_count": 3, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "gapminder.head()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 4, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "year\n", 136 | "1952 49.057620\n", 137 | "1957 51.507401\n", 138 | "1962 53.609249\n", 139 | "1967 55.678290\n", 140 | "1972 57.647386\n", 141 | "1977 59.570157\n", 142 | "1982 61.533197\n", 143 | "1987 63.212613\n", 144 | "1992 64.160338\n", 145 | "1997 65.014676\n", 146 | "2002 65.694923\n", 147 | "2007 67.007423\n", 148 | "Name: lifeExp, dtype: float64" 149 | ] 150 | }, 151 | "execution_count": 4, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "# what we did before\n", 158 | "gapminder.groupby('year')['lifeExp'].mean()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 5, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/html": [ 169 | "
\n", 170 | "\n", 183 | "\n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | "
countrycontinentyearlifeExppopgdpPercap
0AfghanistanAsia195228.8018425333779.445314
12AlbaniaEurope195255.23012826971601.056136
24AlgeriaAfrica195243.07792795252449.008185
36AngolaAfrica195230.01542320953520.610273
48ArgentinaAmericas195262.485178769565911.315053
\n", 243 | "
" 244 | ], 245 | "text/plain": [ 246 | " country continent year lifeExp pop gdpPercap\n", 247 | "0 Afghanistan Asia 1952 28.801 8425333 779.445314\n", 248 | "12 Albania Europe 1952 55.230 1282697 1601.056136\n", 249 | "24 Algeria Africa 1952 43.077 9279525 2449.008185\n", 250 | "36 Angola Africa 1952 30.015 4232095 3520.610273\n", 251 | "48 Argentina Americas 1952 62.485 17876956 5911.315053" 252 | ] 253 | }, 254 | "execution_count": 5, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "# breaking the groupby down\n", 261 | "y1952 = gapminder.loc[gapminder['year'] == 1952, :]\n", 262 | "y1952.head()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 6, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "49.05761971830987" 274 | ] 275 | }, 276 | "execution_count": 6, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "y1952['lifeExp'].mean()" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "methods you can use\n", 290 | "\n", 291 | "- count\n", 292 | "- size\n", 293 | "- mean\n", 294 | "- std\n", 295 | "- min\n", 296 | "- quantile(q=0.25)\n", 297 | "- max\n", 298 | "- sum\n", 299 | "- var\n", 300 | "- sem\n", 301 | "- describe\n", 302 | "- first\n", 303 | "- last\n", 304 | "- nth" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 7, 310 | "metadata": {}, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "text/html": [ 315 | "
\n", 316 | "\n", 329 | "\n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | "
countmeanstdmin25%50%75%max
continent
Africa624.048.8653309.15021023.59942.3725047.792054.4115076.442
Americas300.064.6587379.34508837.57958.4100067.048071.6995080.653
Asia396.060.06490311.86453228.80151.4262561.791569.5052582.603
Europe360.071.9036865.43317843.58569.5700072.241075.4505081.757
Oceania24.074.3262083.79561169.12071.2050073.665077.5525081.235
\n", 412 | "
" 413 | ], 414 | "text/plain": [ 415 | " count mean std min 25% 50% 75% \\\n", 416 | "continent \n", 417 | "Africa 624.0 48.865330 9.150210 23.599 42.37250 47.7920 54.41150 \n", 418 | "Americas 300.0 64.658737 9.345088 37.579 58.41000 67.0480 71.69950 \n", 419 | "Asia 396.0 60.064903 11.864532 28.801 51.42625 61.7915 69.50525 \n", 420 | "Europe 360.0 71.903686 5.433178 43.585 69.57000 72.2410 75.45050 \n", 421 | "Oceania 24.0 74.326208 3.795611 69.120 71.20500 73.6650 77.55250 \n", 422 | "\n", 423 | " max \n", 424 | "continent \n", 425 | "Africa 76.442 \n", 426 | "Americas 80.653 \n", 427 | "Asia 82.603 \n", 428 | "Europe 81.757 \n", 429 | "Oceania 81.235 " 430 | ] 431 | }, 432 | "execution_count": 7, 433 | "metadata": {}, 434 | "output_type": "execute_result" 435 | } 436 | ], 437 | "source": [ 438 | "gapminder.groupby('continent')['lifeExp'].describe()" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 8, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "data": { 448 | "text/plain": [ 449 | "continent\n", 450 | "Africa 48.865330\n", 451 | "Americas 64.658737\n", 452 | "Asia 60.064903\n", 453 | "Europe 71.903686\n", 454 | "Oceania 74.326208\n", 455 | "Name: lifeExp, dtype: float64" 456 | ] 457 | }, 458 | "execution_count": 8, 459 | "metadata": {}, 460 | "output_type": "execute_result" 461 | } 462 | ], 463 | "source": [ 464 | "## use agg to call functions from other libraries\n", 465 | "## or even functions you write yourself\n", 466 | "import numpy as np\n", 467 | "\n", 468 | "# these 2 do the same thing\n", 469 | "gapminder.groupby('continent')['lifeExp'].aggregate(np.mean)\n", 470 | "gapminder.groupby('continent')['lifeExp'].agg(np.mean)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 9, 476 | "metadata": { 477 | "collapsed": true 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "def my_mean(values):\n", 482 | " n = len(values)\n", 483 | " s = np.sum(values)\n", 484 | " return s / n" 485 | ] 486 
| }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 10, 490 | "metadata": {}, 491 | "outputs": [ 492 | { 493 | "data": { 494 | "text/plain": [ 495 | "continent\n", 496 | "Africa 48.865330\n", 497 | "Americas 64.658737\n", 498 | "Asia 60.064903\n", 499 | "Europe 71.903686\n", 500 | "Oceania 74.326208\n", 501 | "Name: lifeExp, dtype: float64" 502 | ] 503 | }, 504 | "execution_count": 10, 505 | "metadata": {}, 506 | "output_type": "execute_result" 507 | } 508 | ], 509 | "source": [ 510 | "gapminder.groupby('continent')['lifeExp'].agg(my_mean)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 11, 516 | "metadata": {}, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/html": [ 521 | "
\n", 522 | "\n", 535 | "\n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | "
count_nonzeromeanstd
year
1952142.049.05762012.225956
1957142.051.50740112.231286
1962142.053.60924912.097245
1967142.055.67829011.718858
1972142.057.64738611.381953
1977142.059.57015711.227229
1982142.061.53319710.770618
1987142.063.21261310.556285
1992142.064.16033811.227380
1997142.065.01467611.559439
2002142.065.69492312.279823
2007142.067.00742312.073021
\n", 625 | "
" 626 | ], 627 | "text/plain": [ 628 | " count_nonzero mean std\n", 629 | "year \n", 630 | "1952 142.0 49.057620 12.225956\n", 631 | "1957 142.0 51.507401 12.231286\n", 632 | "1962 142.0 53.609249 12.097245\n", 633 | "1967 142.0 55.678290 11.718858\n", 634 | "1972 142.0 57.647386 11.381953\n", 635 | "1977 142.0 59.570157 11.227229\n", 636 | "1982 142.0 61.533197 10.770618\n", 637 | "1987 142.0 63.212613 10.556285\n", 638 | "1992 142.0 64.160338 11.227380\n", 639 | "1997 142.0 65.014676 11.559439\n", 640 | "2002 142.0 65.694923 12.279823\n", 641 | "2007 142.0 67.007423 12.073021" 642 | ] 643 | }, 644 | "execution_count": 11, 645 | "metadata": {}, 646 | "output_type": "execute_result" 647 | } 648 | ], 649 | "source": [ 650 | "# multiple functions\n", 651 | "gapminder.groupby('year')['lifeExp'].agg([\n", 652 | " np.count_nonzero,\n", 653 | " np.mean,\n", 654 | " np.std\n", 655 | "])" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": 12, 661 | "metadata": {}, 662 | "outputs": [ 663 | { 664 | "name": "stderr", 665 | "output_type": "stream", 666 | "text": [ 667 | "/home/dchen/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n", 668 | "is deprecated and will be removed in a future version\n", 669 | " after removing the cwd from sys.path.\n" 670 | ] 671 | }, 672 | { 673 | "data": { 674 | "text/html": [ 675 | "
\n", 676 | "\n", 689 | "\n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | "
yearncountmeanstd
01952142.049.05762012.225956
11957142.051.50740112.231286
21962142.053.60924912.097245
31967142.055.67829011.718858
41972142.057.64738611.381953
51977142.059.57015711.227229
61982142.061.53319710.770618
71987142.063.21261310.556285
81992142.064.16033811.227380
91997142.065.01467611.559439
102002142.065.69492312.279823
112007142.067.00742312.073021
\n", 786 | "
" 787 | ], 788 | "text/plain": [ 789 | " year ncount mean std\n", 790 | "0 1952 142.0 49.057620 12.225956\n", 791 | "1 1957 142.0 51.507401 12.231286\n", 792 | "2 1962 142.0 53.609249 12.097245\n", 793 | "3 1967 142.0 55.678290 11.718858\n", 794 | "4 1972 142.0 57.647386 11.381953\n", 795 | "5 1977 142.0 59.570157 11.227229\n", 796 | "6 1982 142.0 61.533197 10.770618\n", 797 | "7 1987 142.0 63.212613 10.556285\n", 798 | "8 1992 142.0 64.160338 11.227380\n", 799 | "9 1997 142.0 65.014676 11.559439\n", 800 | "10 2002 142.0 65.694923 12.279823\n", 801 | "11 2007 142.0 67.007423 12.073021" 802 | ] 803 | }, 804 | "execution_count": 12, 805 | "metadata": {}, 806 | "output_type": "execute_result" 807 | } 808 | ], 809 | "source": [ 810 | "gapminder.groupby('year')['lifeExp'].agg({\n", 811 | " 'ncount': np.count_nonzero,\n", 812 | " 'mean': np.mean,\n", 813 | " 'std': np.std\n", 814 | "}).reset_index()" 815 | ] 816 | }, 817 | { 818 | "cell_type": "markdown", 819 | "metadata": {}, 820 | "source": [ 821 | "http://pandas.pydata.org/pandas-docs/version/0.20/whatsnew.html#deprecate-groupby-agg-with-a-dictionary-when-renaming\n" 822 | ] 823 | }, 824 | { 825 | "cell_type": "code", 826 | "execution_count": 13, 827 | "metadata": {}, 828 | "outputs": [ 829 | { 830 | "data": { 831 | "text/html": [ 832 | "
\n", 833 | "\n", 846 | "\n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | "
yearcountavgstd_dev
01952142.049.05762012.225956
11957142.051.50740112.231286
21962142.053.60924912.097245
31967142.055.67829011.718858
41972142.057.64738611.381953
51977142.059.57015711.227229
61982142.061.53319710.770618
71987142.063.21261310.556285
81992142.064.16033811.227380
91997142.065.01467611.559439
102002142.065.69492312.279823
112007142.067.00742312.073021
\n", 943 | "
" 944 | ], 945 | "text/plain": [ 946 | " year count avg std_dev\n", 947 | "0 1952 142.0 49.057620 12.225956\n", 948 | "1 1957 142.0 51.507401 12.231286\n", 949 | "2 1962 142.0 53.609249 12.097245\n", 950 | "3 1967 142.0 55.678290 11.718858\n", 951 | "4 1972 142.0 57.647386 11.381953\n", 952 | "5 1977 142.0 59.570157 11.227229\n", 953 | "6 1982 142.0 61.533197 10.770618\n", 954 | "7 1987 142.0 63.212613 10.556285\n", 955 | "8 1992 142.0 64.160338 11.227380\n", 956 | "9 1997 142.0 65.014676 11.559439\n", 957 | "10 2002 142.0 65.694923 12.279823\n", 958 | "11 2007 142.0 67.007423 12.073021" 959 | ] 960 | }, 961 | "execution_count": 13, 962 | "metadata": {}, 963 | "output_type": "execute_result" 964 | } 965 | ], 966 | "source": [ 967 | "gapminder.groupby('year')['lifeExp'].\\\n", 968 | " agg([np.count_nonzero, np.mean, np.std]).\\\n", 969 | " rename(columns={'count_nonzero': 'count',\n", 970 | " 'mean': 'avg',\n", 971 | " 'std': 'std_dev'}).\\\n", 972 | " reset_index()" 973 | ] 974 | }, 975 | { 976 | "cell_type": "markdown", 977 | "metadata": {}, 978 | "source": [ 979 | "other things to look into for groupby\n", 980 | "\n", 981 | "- transform (returns same number of rows)\n", 982 | "- filter (returns a subset)" 983 | ] 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": null, 988 | "metadata": { 989 | "collapsed": true 990 | }, 991 | "outputs": [], 992 | "source": [] 993 | } 994 | ], 995 | "metadata": { 996 | "kernelspec": { 997 | "display_name": "Python 3", 998 | "language": "python", 999 | "name": "python3" 1000 | }, 1001 | "language_info": { 1002 | "codemirror_mode": { 1003 | "name": "ipython", 1004 | "version": 3 1005 | }, 1006 | "file_extension": ".py", 1007 | "mimetype": "text/x-python", 1008 | "name": "python", 1009 | "nbconvert_exporter": "python", 1010 | "pygments_lexer": "ipython3", 1011 | "version": "3.6.1" 1012 | } 1013 | }, 1014 | "nbformat": 4, 1015 | "nbformat_minor": 2 1016 | } 1017 | 
-------------------------------------------------------------------------------- /01-notes/08a-statsmodels_linear.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/dchen/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", 13 | " from pandas.core import datetools\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import seaborn as sns\n", 20 | "import statsmodels.api as sm\n", 21 | "import statsmodels.formula.api as smf" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "additional reference: https://www.datarobot.com/blog/multiple-regression-using-statsmodels/" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/html": [ 39 | "
\n", 40 | "\n", 53 | "\n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | "
total_billtipsexsmokerdaytimesize
016.991.01FemaleNoSunDinner2
110.341.66MaleNoSunDinner3
221.013.50MaleNoSunDinner3
323.683.31MaleNoSunDinner2
424.593.61FemaleNoSunDinner4
\n", 119 | "
" 120 | ], 121 | "text/plain": [ 122 | " total_bill tip sex smoker day time size\n", 123 | "0 16.99 1.01 Female No Sun Dinner 2\n", 124 | "1 10.34 1.66 Male No Sun Dinner 3\n", 125 | "2 21.01 3.50 Male No Sun Dinner 3\n", 126 | "3 23.68 3.31 Male No Sun Dinner 2\n", 127 | "4 24.59 3.61 Female No Sun Dinner 4" 128 | ] 129 | }, 130 | "execution_count": 2, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "tips = sns.load_dataset('tips')\n", 137 | "tips.head()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 3, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "model = sm.OLS(endog=tips['tip'], exog=tips['total_bill'])" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 4, 152 | "metadata": { 153 | "collapsed": true 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "results = model.fit()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/html": [ 168 | "\n", 169 | "\n", 170 | "\n", 171 | " \n", 172 | "\n", 173 | "\n", 174 | " \n", 175 | "\n", 176 | "\n", 177 | " \n", 178 | "\n", 179 | "\n", 180 | " \n", 181 | "\n", 182 | "\n", 183 | " \n", 184 | "\n", 185 | "\n", 186 | " \n", 187 | "\n", 188 | "\n", 189 | " \n", 190 | "\n", 191 | "\n", 192 | " \n", 193 | "\n", 194 | "\n", 195 | " \n", 196 | "\n", 197 | "
OLS Regression Results
Dep. Variable: tip R-squared: 0.892
Model: OLS Adj. R-squared: 0.891
Method: Least Squares F-statistic: 2004.
Date: Tue, 11 Jul 2017 Prob (F-statistic): 2.26e-119
Time: 01:37:47 Log-Likelihood: -366.22
No. Observations: 244 AIC: 734.4
Df Residuals: 243 BIC: 737.9
Df Model: 1
Covariance Type: nonrobust
\n", 198 | "\n", 199 | "\n", 200 | " \n", 201 | "\n", 202 | "\n", 203 | " \n", 204 | "\n", 205 | "
coef std err t P>|t| [0.025 0.975]
total_bill 0.1437 0.003 44.771 0.000 0.137 0.150
\n", 206 | "\n", 207 | "\n", 208 | " \n", 209 | "\n", 210 | "\n", 211 | " \n", 212 | "\n", 213 | "\n", 214 | " \n", 215 | "\n", 216 | "\n", 217 | " \n", 218 | "\n", 219 | "
Omnibus: 21.126 Durbin-Watson: 2.136
Prob(Omnibus): 0.000 Jarque-Bera (JB): 47.017
Skew: -0.398 Prob(JB): 6.17e-11
Kurtosis: 4.998 Cond. No. 1.00
" 220 | ], 221 | "text/plain": [ 222 | "\n", 223 | "\"\"\"\n", 224 | " OLS Regression Results \n", 225 | "==============================================================================\n", 226 | "Dep. Variable: tip R-squared: 0.892\n", 227 | "Model: OLS Adj. R-squared: 0.891\n", 228 | "Method: Least Squares F-statistic: 2004.\n", 229 | "Date: Tue, 11 Jul 2017 Prob (F-statistic): 2.26e-119\n", 230 | "Time: 01:37:47 Log-Likelihood: -366.22\n", 231 | "No. Observations: 244 AIC: 734.4\n", 232 | "Df Residuals: 243 BIC: 737.9\n", 233 | "Df Model: 1 \n", 234 | "Covariance Type: nonrobust \n", 235 | "==============================================================================\n", 236 | " coef std err t P>|t| [0.025 0.975]\n", 237 | "------------------------------------------------------------------------------\n", 238 | "total_bill 0.1437 0.003 44.771 0.000 0.137 0.150\n", 239 | "==============================================================================\n", 240 | "Omnibus: 21.126 Durbin-Watson: 2.136\n", 241 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 47.017\n", 242 | "Skew: -0.398 Prob(JB): 6.17e-11\n", 243 | "Kurtosis: 4.998 Cond. No. 
1.00\n", 244 | "==============================================================================\n", 245 | "\n", 246 | "Warnings:\n", 247 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 248 | "\"\"\"" 249 | ] 250 | }, 251 | "execution_count": 5, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "results.summary()" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 6, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "data": { 267 | "text/plain": [ 268 | "total_bill 0.143732\n", 269 | "dtype: float64" 270 | ] 271 | }, 272 | "execution_count": 6, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "# just get the coefficients\n", 279 | "results.params" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 7, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/html": [ 290 | "\n", 291 | "\n", 292 | "\n", 293 | " \n", 294 | "\n", 295 | "\n", 296 | " \n", 297 | "\n", 298 | "\n", 299 | " \n", 300 | "\n", 301 | "\n", 302 | " \n", 303 | "\n", 304 | "\n", 305 | " \n", 306 | "\n", 307 | "\n", 308 | " \n", 309 | "\n", 310 | "\n", 311 | " \n", 312 | "\n", 313 | "\n", 314 | " \n", 315 | "\n", 316 | "\n", 317 | " \n", 318 | "\n", 319 | "
OLS Regression Results
Dep. Variable: tip R-squared: 0.902
Model: OLS Adj. R-squared: 0.901
Method: Least Squares F-statistic: 1117.
Date: Tue, 11 Jul 2017 Prob (F-statistic): 6.16e-123
Time: 01:37:47 Log-Likelihood: -353.88
No. Observations: 244 AIC: 711.8
Df Residuals: 242 BIC: 718.8
Df Model: 2
Covariance Type: nonrobust
\n", 320 | "\n", 321 | "\n", 322 | " \n", 323 | "\n", 324 | "\n", 325 | " \n", 326 | "\n", 327 | "\n", 328 | " \n", 329 | "\n", 330 | "
coef std err t P>|t| [0.025 0.975]
total_bill 0.1007 0.009 11.174 0.000 0.083 0.118
size 0.3621 0.071 5.074 0.000 0.222 0.503
\n", 331 | "\n", 332 | "\n", 333 | " \n", 334 | "\n", 335 | "\n", 336 | " \n", 337 | "\n", 338 | "\n", 339 | " \n", 340 | "\n", 341 | "\n", 342 | " \n", 343 | "\n", 344 | "
Omnibus: 12.830 Durbin-Watson: 2.059
Prob(Omnibus): 0.002 Jarque-Bera (JB): 27.284
Skew: 0.179 Prob(JB): 1.19e-06
Kurtosis: 4.599 Cond. No. 23.7
" 345 | ], 346 | "text/plain": [ 347 | "\n", 348 | "\"\"\"\n", 349 | " OLS Regression Results \n", 350 | "==============================================================================\n", 351 | "Dep. Variable: tip R-squared: 0.902\n", 352 | "Model: OLS Adj. R-squared: 0.901\n", 353 | "Method: Least Squares F-statistic: 1117.\n", 354 | "Date: Tue, 11 Jul 2017 Prob (F-statistic): 6.16e-123\n", 355 | "Time: 01:37:47 Log-Likelihood: -353.88\n", 356 | "No. Observations: 244 AIC: 711.8\n", 357 | "Df Residuals: 242 BIC: 718.8\n", 358 | "Df Model: 2 \n", 359 | "Covariance Type: nonrobust \n", 360 | "==============================================================================\n", 361 | " coef std err t P>|t| [0.025 0.975]\n", 362 | "------------------------------------------------------------------------------\n", 363 | "total_bill 0.1007 0.009 11.174 0.000 0.083 0.118\n", 364 | "size 0.3621 0.071 5.074 0.000 0.222 0.503\n", 365 | "==============================================================================\n", 366 | "Omnibus: 12.830 Durbin-Watson: 2.059\n", 367 | "Prob(Omnibus): 0.002 Jarque-Bera (JB): 27.284\n", 368 | "Skew: 0.179 Prob(JB): 1.19e-06\n", 369 | "Kurtosis: 4.599 Cond. No. 
23.7\n", 370 | "==============================================================================\n", 371 | "\n", 372 | "Warnings:\n", 373 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 374 | "\"\"\"" 375 | ] 376 | }, 377 | "execution_count": 7, 378 | "metadata": {}, 379 | "output_type": "execute_result" 380 | } 381 | ], 382 | "source": [ 383 | "# multiple variable regression\n", 384 | "model = sm.OLS(endog=tips['tip'], exog=tips[['total_bill', 'size']])\n", 385 | "results = model.fit()\n", 386 | "results.summary()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 8, 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "\n", 399 | "RangeIndex: 244 entries, 0 to 243\n", 400 | "Data columns (total 7 columns):\n", 401 | "total_bill 244 non-null float64\n", 402 | "tip 244 non-null float64\n", 403 | "sex 244 non-null category\n", 404 | "smoker 244 non-null category\n", 405 | "day 244 non-null category\n", 406 | "time 244 non-null category\n", 407 | "size 244 non-null int64\n", 408 | "dtypes: category(4), float64(2), int64(1)\n", 409 | "memory usage: 7.2 KB\n" 410 | ] 411 | } 412 | ], 413 | "source": [ 414 | "tips.info()" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 9, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "data": { 424 | "text/html": [ 425 | "\n", 426 | "\n", 427 | "\n", 428 | " \n", 429 | "\n", 430 | "\n", 431 | " \n", 432 | "\n", 433 | "\n", 434 | " \n", 435 | "\n", 436 | "\n", 437 | " \n", 438 | "\n", 439 | "\n", 440 | " \n", 441 | "\n", 442 | "\n", 443 | " \n", 444 | "\n", 445 | "\n", 446 | " \n", 447 | "\n", 448 | "\n", 449 | " \n", 450 | "\n", 451 | "\n", 452 | " \n", 453 | "\n", 454 | "
OLS Regression Results
Dep. Variable: tip R-squared: 0.469
Model: OLS Adj. R-squared: 0.460
Method: Least Squares F-statistic: 52.72
Date: Tue, 11 Jul 2017 Prob (F-statistic): 8.47e-32
Time: 01:37:47 Log-Likelihood: -347.78
No. Observations: 244 AIC: 705.6
Df Residuals: 239 BIC: 723.0
Df Model: 4
Covariance Type: nonrobust
\n", 455 | "\n", 456 | "\n", 457 | " \n", 458 | "\n", 459 | "\n", 460 | " \n", 461 | "\n", 462 | "\n", 463 | " \n", 464 | "\n", 465 | "\n", 466 | " \n", 467 | "\n", 468 | "\n", 469 | " \n", 470 | "\n", 471 | "\n", 472 | " \n", 473 | "\n", 474 | "
coef std err t P>|t| [0.025 0.975]
Intercept 0.6115 0.219 2.793 0.006 0.180 1.043
sex[T.Female] 0.0273 0.137 0.198 0.843 -0.243 0.298
smoker[T.No] 0.0837 0.138 0.605 0.546 -0.189 0.356
total_bill 0.0941 0.009 9.996 0.000 0.076 0.113
size 0.1803 0.088 2.049 0.042 0.007 0.354
\n", 475 | "\n", 476 | "\n", 477 | " \n", 478 | "\n", 479 | "\n", 480 | " \n", 481 | "\n", 482 | "\n", 483 | " \n", 484 | "\n", 485 | "\n", 486 | " \n", 487 | "\n", 488 | "
Omnibus: 26.891 Durbin-Watson: 2.099
Prob(Omnibus): 0.000 Jarque-Bera (JB): 50.438
Skew: 0.589 Prob(JB): 1.12e-11
Kurtosis: 4.891 Cond. No. 78.5
" 489 | ], 490 | "text/plain": [ 491 | "\n", 492 | "\"\"\"\n", 493 | " OLS Regression Results \n", 494 | "==============================================================================\n", 495 | "Dep. Variable: tip R-squared: 0.469\n", 496 | "Model: OLS Adj. R-squared: 0.460\n", 497 | "Method: Least Squares F-statistic: 52.72\n", 498 | "Date: Tue, 11 Jul 2017 Prob (F-statistic): 8.47e-32\n", 499 | "Time: 01:37:47 Log-Likelihood: -347.78\n", 500 | "No. Observations: 244 AIC: 705.6\n", 501 | "Df Residuals: 239 BIC: 723.0\n", 502 | "Df Model: 4 \n", 503 | "Covariance Type: nonrobust \n", 504 | "=================================================================================\n", 505 | " coef std err t P>|t| [0.025 0.975]\n", 506 | "---------------------------------------------------------------------------------\n", 507 | "Intercept 0.6115 0.219 2.793 0.006 0.180 1.043\n", 508 | "sex[T.Female] 0.0273 0.137 0.198 0.843 -0.243 0.298\n", 509 | "smoker[T.No] 0.0837 0.138 0.605 0.546 -0.189 0.356\n", 510 | "total_bill 0.0941 0.009 9.996 0.000 0.076 0.113\n", 511 | "size 0.1803 0.088 2.049 0.042 0.007 0.354\n", 512 | "==============================================================================\n", 513 | "Omnibus: 26.891 Durbin-Watson: 2.099\n", 514 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 50.438\n", 515 | "Skew: 0.589 Prob(JB): 1.12e-11\n", 516 | "Kurtosis: 4.891 Cond. No. 
78.5\n", 517 | "==============================================================================\n", 518 | "\n", 519 | "Warnings:\n", 520 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 521 | "\"\"\"" 522 | ] 523 | }, 524 | "execution_count": 9, 525 | "metadata": {}, 526 | "output_type": "execute_result" 527 | } 528 | ], 529 | "source": [ 530 | "model = smf.ols(formula='tip ~ total_bill + sex + smoker + size',\n", 531 | " data=tips)\n", 532 | "results = model.fit()\n", 533 | "results.summary()" 534 | ] 535 | } 536 | ], 537 | "metadata": { 538 | "kernelspec": { 539 | "display_name": "Python 3", 540 | "language": "python", 541 | "name": "python3" 542 | }, 543 | "language_info": { 544 | "codemirror_mode": { 545 | "name": "ipython", 546 | "version": 3 547 | }, 548 | "file_extension": ".py", 549 | "mimetype": "text/x-python", 550 | "name": "python", 551 | "nbconvert_exporter": "python", 552 | "pygments_lexer": "ipython3", 553 | "version": "3.6.1" 554 | } 555 | }, 556 | "nbformat": 4, 557 | "nbformat_minor": 2 558 | } 559 | -------------------------------------------------------------------------------- /01-notes/08b-statsmodels_logistic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/dchen/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. 
Please use the pandas.tseries module instead.\n", 13 | " from pandas.core import datetools\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import seaborn as sns\n", 20 | "import statsmodels.api as sm\n", 21 | "import statsmodels.formula.api as smf\n", 22 | "import numpy as np" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "another reference: http://blog.yhat.com/posts/logistic-regression-and-python.html" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/html": [ 40 | "
\n", 41 | "\n", 54 | "\n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | "
survivedpclasssexagesibspparchfareembarkedclasswhoadult_maledeckembark_townalivealone
003male22.0107.2500SThirdmanTrueNaNSouthamptonnoFalse
111female38.01071.2833CFirstwomanFalseCCherbourgyesFalse
213female26.0007.9250SThirdwomanFalseNaNSouthamptonyesTrue
311female35.01053.1000SFirstwomanFalseCSouthamptonyesFalse
403male35.0008.0500SThirdmanTrueNaNSouthamptonnoTrue
\n", 168 | "
" 169 | ], 170 | "text/plain": [ 171 | " survived pclass sex age sibsp parch fare embarked class \\\n", 172 | "0 0 3 male 22.0 1 0 7.2500 S Third \n", 173 | "1 1 1 female 38.0 1 0 71.2833 C First \n", 174 | "2 1 3 female 26.0 0 0 7.9250 S Third \n", 175 | "3 1 1 female 35.0 1 0 53.1000 S First \n", 176 | "4 0 3 male 35.0 0 0 8.0500 S Third \n", 177 | "\n", 178 | " who adult_male deck embark_town alive alone \n", 179 | "0 man True NaN Southampton no False \n", 180 | "1 woman False C Cherbourg yes False \n", 181 | "2 woman False NaN Southampton yes True \n", 182 | "3 woman False C Southampton yes False \n", 183 | "4 man True NaN Southampton no True " 184 | ] 185 | }, 186 | "execution_count": 2, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "titanic = sns.load_dataset('titanic')\n", 193 | "titanic.head()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 3, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "\n", 206 | "RangeIndex: 891 entries, 0 to 890\n", 207 | "Data columns (total 15 columns):\n", 208 | "survived 891 non-null int64\n", 209 | "pclass 891 non-null int64\n", 210 | "sex 891 non-null object\n", 211 | "age 714 non-null float64\n", 212 | "sibsp 891 non-null int64\n", 213 | "parch 891 non-null int64\n", 214 | "fare 891 non-null float64\n", 215 | "embarked 889 non-null object\n", 216 | "class 891 non-null category\n", 217 | "who 891 non-null object\n", 218 | "adult_male 891 non-null bool\n", 219 | "deck 203 non-null category\n", 220 | "embark_town 889 non-null object\n", 221 | "alive 891 non-null object\n", 222 | "alone 891 non-null bool\n", 223 | "dtypes: bool(2), category(2), float64(2), int64(4), object(5)\n", 224 | "memory usage: 80.6+ KB\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "titanic.info()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 4, 235 | 
"metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "Optimization terminated successfully.\n", 242 | " Current function value: 0.406942\n", 243 | " Iterations 7\n" 244 | ] 245 | }, 246 | { 247 | "data": { 248 | "text/html": [ 249 | "\n", 250 | "\n", 251 | "\n", 252 | " \n", 253 | "\n", 254 | "\n", 255 | " \n", 256 | "\n", 257 | "\n", 258 | " \n", 259 | "\n", 260 | "\n", 261 | " \n", 262 | "\n", 263 | "\n", 264 | " \n", 265 | "\n", 266 | "\n", 267 | " \n", 268 | "\n", 269 | "\n", 270 | " \n", 271 | "\n", 272 | "
Logit Regression Results
Dep. Variable: survived No. Observations: 184
Model: Logit Df Residuals: 174
Method: MLE Df Model: 9
Date: Tue, 11 Jul 2017 Pseudo R-squ.: 0.3513
Time: 01:58:17 Log-Likelihood: -74.877
converged: True LL-Null: -115.43
LLR p-value: 9.707e-14
\n", 273 | "\n", 274 | "\n", 275 | " \n", 276 | "\n", 277 | "\n", 278 | " \n", 279 | "\n", 280 | "\n", 281 | " \n", 282 | "\n", 283 | "\n", 284 | " \n", 285 | "\n", 286 | "\n", 287 | " \n", 288 | "\n", 289 | "\n", 290 | " \n", 291 | "\n", 292 | "\n", 293 | " \n", 294 | "\n", 295 | "\n", 296 | " \n", 297 | "\n", 298 | "\n", 299 | " \n", 300 | "\n", 301 | "\n", 302 | " \n", 303 | "\n", 304 | "\n", 305 | " \n", 306 | "\n", 307 | "
coef std err z P>|z| [0.025 0.975]
Intercept 5.5057 1.163 4.736 0.000 3.227 7.784
sex[T.male] -3.4464 0.590 -5.836 0.000 -4.604 -2.289
deck[T.B] -0.7764 0.832 -0.934 0.351 -2.406 0.854
deck[T.C] -1.7126 0.808 -2.121 0.034 -3.295 -0.130
deck[T.D] -0.6292 0.837 -0.752 0.452 -2.270 1.012
deck[T.E] -0.1741 0.816 -0.213 0.831 -1.774 1.426
deck[T.F] -1.5135 1.087 -1.392 0.164 -3.644 0.617
deck[T.G] -4.9029 1.457 -3.364 0.001 -7.759 -2.046
age -0.0430 0.015 -2.946 0.003 -0.072 -0.014
fare 0.0023 0.003 0.789 0.430 -0.003 0.008
" 308 | ], 309 | "text/plain": [ 310 | "\n", 311 | "\"\"\"\n", 312 | " Logit Regression Results \n", 313 | "==============================================================================\n", 314 | "Dep. Variable: survived No. Observations: 184\n", 315 | "Model: Logit Df Residuals: 174\n", 316 | "Method: MLE Df Model: 9\n", 317 | "Date: Tue, 11 Jul 2017 Pseudo R-squ.: 0.3513\n", 318 | "Time: 01:58:17 Log-Likelihood: -74.877\n", 319 | "converged: True LL-Null: -115.43\n", 320 | " LLR p-value: 9.707e-14\n", 321 | "===============================================================================\n", 322 | " coef std err z P>|z| [0.025 0.975]\n", 323 | "-------------------------------------------------------------------------------\n", 324 | "Intercept 5.5057 1.163 4.736 0.000 3.227 7.784\n", 325 | "sex[T.male] -3.4464 0.590 -5.836 0.000 -4.604 -2.289\n", 326 | "deck[T.B] -0.7764 0.832 -0.934 0.351 -2.406 0.854\n", 327 | "deck[T.C] -1.7126 0.808 -2.121 0.034 -3.295 -0.130\n", 328 | "deck[T.D] -0.6292 0.837 -0.752 0.452 -2.270 1.012\n", 329 | "deck[T.E] -0.1741 0.816 -0.213 0.831 -1.774 1.426\n", 330 | "deck[T.F] -1.5135 1.087 -1.392 0.164 -3.644 0.617\n", 331 | "deck[T.G] -4.9029 1.457 -3.364 0.001 -7.759 -2.046\n", 332 | "age -0.0430 0.015 -2.946 0.003 -0.072 -0.014\n", 333 | "fare 0.0023 0.003 0.789 0.430 -0.003 0.008\n", 334 | "===============================================================================\n", 335 | "\"\"\"" 336 | ] 337 | }, 338 | "execution_count": 4, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "model = smf.logit('survived ~ sex + age + fare + deck',\n", 345 | " data = titanic)\n", 346 | "results = model.fit()\n", 347 | "results.summary()" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 5, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "Intercept 246.087067\n", 359 | "sex[T.male] 0.031860\n", 360 | "deck[T.B] 0.460038\n", 
361 | "deck[T.C] 0.180390\n", 362 | "deck[T.D] 0.532997\n", 363 | "deck[T.E] 0.840224\n", 364 | "deck[T.F] 0.220138\n", 365 | "deck[T.G] 0.007425\n", 366 | "age 0.957889\n", 367 | "fare 1.002292\n", 368 | "dtype: float64" 369 | ] 370 | }, 371 | "execution_count": 5, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "# interpret results\n", 378 | "np.exp(results.params)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 6, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "data": { 388 | "text/html": [ 389 | "
\n", 390 | "\n", 403 | "\n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | "
01
Intercept25.2051842402.634534
sex[T.male]0.0100140.101363
deck[T.B]0.0901322.348053
deck[T.C]0.0370550.878167
deck[T.D]0.1032792.750650
deck[T.E]0.1695944.162737
deck[T.F]0.0261521.853020
deck[T.G]0.0004270.129203
age0.9308590.985705
fare0.9966091.008008
\n", 464 | "
" 465 | ], 466 | "text/plain": [ 467 | " 0 1\n", 468 | "Intercept 25.205184 2402.634534\n", 469 | "sex[T.male] 0.010014 0.101363\n", 470 | "deck[T.B] 0.090132 2.348053\n", 471 | "deck[T.C] 0.037055 0.878167\n", 472 | "deck[T.D] 0.103279 2.750650\n", 473 | "deck[T.E] 0.169594 4.162737\n", 474 | "deck[T.F] 0.026152 1.853020\n", 475 | "deck[T.G] 0.000427 0.129203\n", 476 | "age 0.930859 0.985705\n", 477 | "fare 0.996609 1.008008" 478 | ] 479 | }, 480 | "execution_count": 6, 481 | "metadata": {}, 482 | "output_type": "execute_result" 483 | } 484 | ], 485 | "source": [ 486 | "np.exp(results.conf_int())" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": { 493 | "collapsed": true 494 | }, 495 | "outputs": [], 496 | "source": [] 497 | } 498 | ], 499 | "metadata": { 500 | "kernelspec": { 501 | "display_name": "Python 3", 502 | "language": "python", 503 | "name": "python3" 504 | }, 505 | "language_info": { 506 | "codemirror_mode": { 507 | "name": "ipython", 508 | "version": 3 509 | }, 510 | "file_extension": ".py", 511 | "mimetype": "text/x-python", 512 | "name": "python", 513 | "nbconvert_exporter": "python", 514 | "pygments_lexer": "ipython3", 515 | "version": "3.6.1" 516 | } 517 | }, 518 | "nbformat": 4, 519 | "nbformat_minor": 2 520 | } 521 | -------------------------------------------------------------------------------- /01-notes/09a-sklearn_linear.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import seaborn as sns\n", 11 | "from sklearn import linear_model" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/html": [ 22 | "
\n", 23 | "\n", 36 | "\n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | "
total_billtipsexsmokerdaytimesize
016.991.01FemaleNoSunDinner2
110.341.66MaleNoSunDinner3
221.013.50MaleNoSunDinner3
323.683.31MaleNoSunDinner2
424.593.61FemaleNoSunDinner4
\n", 102 | "
" 103 | ], 104 | "text/plain": [ 105 | " total_bill tip sex smoker day time size\n", 106 | "0 16.99 1.01 Female No Sun Dinner 2\n", 107 | "1 10.34 1.66 Male No Sun Dinner 3\n", 108 | "2 21.01 3.50 Male No Sun Dinner 3\n", 109 | "3 23.68 3.31 Male No Sun Dinner 2\n", 110 | "4 24.59 3.61 Female No Sun Dinner 4" 111 | ] 112 | }, 113 | "execution_count": 2, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "tips = sns.load_dataset('tips')\n", 120 | "tips.head()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 3, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "\n", 133 | "RangeIndex: 244 entries, 0 to 243\n", 134 | "Data columns (total 7 columns):\n", 135 | "total_bill 244 non-null float64\n", 136 | "tip 244 non-null float64\n", 137 | "sex 244 non-null category\n", 138 | "smoker 244 non-null category\n", 139 | "day 244 non-null category\n", 140 | "time 244 non-null category\n", 141 | "size 244 non-null int64\n", 142 | "dtypes: category(4), float64(2), int64(1)\n", 143 | "memory usage: 7.2 KB\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "tips.info()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 4, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/html": [ 159 | "
\n", 160 | "\n", 173 | "\n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | "
total_billtipsexsmokerdaytimesizesex_dummy
016.991.01FemaleNoSunDinner21
110.341.66MaleNoSunDinner30
221.013.50MaleNoSunDinner30
323.683.31MaleNoSunDinner20
424.593.61FemaleNoSunDinner41
\n", 245 | "
" 246 | ], 247 | "text/plain": [ 248 | " total_bill tip sex smoker day time size sex_dummy\n", 249 | "0 16.99 1.01 Female No Sun Dinner 2 1\n", 250 | "1 10.34 1.66 Male No Sun Dinner 3 0\n", 251 | "2 21.01 3.50 Male No Sun Dinner 3 0\n", 252 | "3 23.68 3.31 Male No Sun Dinner 2 0\n", 253 | "4 24.59 3.61 Female No Sun Dinner 4 1" 254 | ] 255 | }, 256 | "execution_count": 4, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "tips['sex_dummy'] = pd.get_dummies(tips['sex'],\n", 263 | " drop_first=True)\n", 264 | "tips.head()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 5, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "lr = linear_model.LinearRegression()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 6, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "X = tips[['total_bill', 'sex_dummy', 'size']]\n", 287 | "Y = tips['tip']" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 7, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "predicted = lr.fit(X, Y)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 8, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "array([ 0.09292034, 0.02641868, 0.19258767])" 308 | ] 309 | }, 310 | "execution_count": 8, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "predicted.coef_" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 9, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "# probably a better way to do this\n", 328 | "results = pd.DataFrame(\n", 329 | " predicted.coef_,\n", 330 | " index = ['total_bill', 'sex_dummy', 'size']\n", 331 | ")" 332 | ] 333 | }, 334 | { 335 | "cell_type": 
"code", 336 | "execution_count": 10, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/html": [ 342 | "
\n", 343 | "\n", 356 | "\n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | "
0
total_bill0.092920
sex_dummy0.026419
size0.192588
\n", 378 | "
" 379 | ], 380 | "text/plain": [ 381 | " 0\n", 382 | "total_bill 0.092920\n", 383 | "sex_dummy 0.026419\n", 384 | "size 0.192588" 385 | ] 386 | }, 387 | "execution_count": 10, 388 | "metadata": {}, 389 | "output_type": "execute_result" 390 | } 391 | ], 392 | "source": [ 393 | "results" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": { 400 | "collapsed": true 401 | }, 402 | "outputs": [], 403 | "source": [] 404 | } 405 | ], 406 | "metadata": { 407 | "kernelspec": { 408 | "display_name": "Python 3", 409 | "language": "python", 410 | "name": "python3" 411 | }, 412 | "language_info": { 413 | "codemirror_mode": { 414 | "name": "ipython", 415 | "version": 3 416 | }, 417 | "file_extension": ".py", 418 | "mimetype": "text/x-python", 419 | "name": "python", 420 | "nbconvert_exporter": "python", 421 | "pygments_lexer": "ipython3", 422 | "version": "3.6.1" 423 | } 424 | }, 425 | "nbformat": 4, 426 | "nbformat_minor": 2 427 | } 428 | -------------------------------------------------------------------------------- /02-lesson/05-data_types.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import seaborn as sns" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "tips = sns.load_dataset('tips')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | "
total_billtipsexsmokerdaytimesize
016.991.01FemaleNoSunDinner2
110.341.66MaleNoSunDinner3
221.013.50MaleNoSunDinner3
323.683.31MaleNoSunDinner2
424.593.61FemaleNoSunDinner4
\n", 124 | "
" 125 | ], 126 | "text/plain": [ 127 | " total_bill tip sex smoker day time size\n", 128 | "0 16.99 1.01 Female No Sun Dinner 2\n", 129 | "1 10.34 1.66 Male No Sun Dinner 3\n", 130 | "2 21.01 3.50 Male No Sun Dinner 3\n", 131 | "3 23.68 3.31 Male No Sun Dinner 2\n", 132 | "4 24.59 3.61 Female No Sun Dinner 4" 133 | ] 134 | }, 135 | "execution_count": 4, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "tips.head()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 5, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "\n", 154 | "RangeIndex: 244 entries, 0 to 243\n", 155 | "Data columns (total 7 columns):\n", 156 | "total_bill 244 non-null float64\n", 157 | "tip 244 non-null float64\n", 158 | "sex 244 non-null category\n", 159 | "smoker 244 non-null category\n", 160 | "day 244 non-null category\n", 161 | "time 244 non-null category\n", 162 | "size 244 non-null int64\n", 163 | "dtypes: category(4), float64(2), int64(1)\n", 164 | "memory usage: 7.2 KB\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "tips.info()" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "pd.Categorical()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 7, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "tips['total_bill_str'] = tips['total_bill'].astype(str)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 9, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "\n", 204 | "RangeIndex: 244 entries, 0 to 243\n", 205 | "Data columns (total 8 columns):\n", 206 | "total_bill 244 non-null float64\n", 207 | "tip 244 non-null float64\n", 
208 | "sex 244 non-null category\n", 209 | "smoker 244 non-null category\n", 210 | "day 244 non-null category\n", 211 | "time 244 non-null category\n", 212 | "size 244 non-null int64\n", 213 | "total_bill_str 244 non-null object\n", 214 | "dtypes: category(4), float64(2), int64(1), object(1)\n", 215 | "memory usage: 9.1+ KB\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "tips.info()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 12, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "name": "stderr", 230 | "output_type": "stream", 231 | "text": [ 232 | "/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py:517: SettingWithCopyWarning: \n", 233 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 234 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 235 | "\n", 236 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 237 | " self.obj[item] = s\n" 238 | ] 239 | }, 240 | { 241 | "data": { 242 | "text/html": [ 243 | "
\n", 244 | "\n", 257 | "\n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | "
total_billtipsexsmokerdaytimesizetotal_bill_str
016.991.01FemaleNoSunDinner216.99
1missing1.66MaleNoSunDinner310.34
221.013.50MaleNoSunDinner321.01
3missing3.31MaleNoSunDinner223.68
424.593.61FemaleNoSunDinner424.59
5missing4.71MaleNoSunDinner425.29
68.772.00MaleNoSunDinner28.77
7missing3.12MaleNoSunDinner426.88
815.041.96MaleNoSunDinner215.04
914.783.23MaleNoSunDinner214.78
\n", 384 | "
" 385 | ], 386 | "text/plain": [ 387 | " total_bill tip sex smoker day time size total_bill_str\n", 388 | "0 16.99 1.01 Female No Sun Dinner 2 16.99\n", 389 | "1 missing 1.66 Male No Sun Dinner 3 10.34\n", 390 | "2 21.01 3.50 Male No Sun Dinner 3 21.01\n", 391 | "3 missing 3.31 Male No Sun Dinner 2 23.68\n", 392 | "4 24.59 3.61 Female No Sun Dinner 4 24.59\n", 393 | "5 missing 4.71 Male No Sun Dinner 4 25.29\n", 394 | "6 8.77 2.00 Male No Sun Dinner 2 8.77\n", 395 | "7 missing 3.12 Male No Sun Dinner 4 26.88\n", 396 | "8 15.04 1.96 Male No Sun Dinner 2 15.04\n", 397 | "9 14.78 3.23 Male No Sun Dinner 2 14.78" 398 | ] 399 | }, 400 | "execution_count": 12, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "tips_sub_miss = tips.head(10)\n", 407 | "tips_sub_miss.loc[[1, 3, 5, 7], 'total_bill'] = 'missing'\n", 408 | "tips_sub_miss" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 13, 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "total_bill object\n", 420 | "tip float64\n", 421 | "sex category\n", 422 | "smoker category\n", 423 | "day category\n", 424 | "time category\n", 425 | "size int64\n", 426 | "total_bill_str object\n", 427 | "dtype: object" 428 | ] 429 | }, 430 | "execution_count": 13, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "tips_sub_miss.dtypes" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 14, 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "ename": "ValueError", 446 | "evalue": "could not convert string to float: 'missing'", 447 | "output_type": "error", 448 | "traceback": [ 449 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 450 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 451 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 
1\u001b[0;31m \u001b[0mtips_sub_miss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total_bill'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 452 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnew_arg_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_arg_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 91\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 92\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_deprecate_kwarg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 453 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors, **kwargs)\u001b[0m\n\u001b[1;32m 3297\u001b[0m \u001b[0;31m# else, only a single dtype is given\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3298\u001b[0m new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,\n\u001b[0;32m-> 3299\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 3300\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3301\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 454 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, **kwargs)\u001b[0m\n\u001b[1;32m 3222\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3223\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3224\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'astype'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3225\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3226\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mconvert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 455 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)\u001b[0m\n\u001b[1;32m 3089\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3090\u001b[0m 
\u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'mgr'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3091\u001b[0;31m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3092\u001b[0m \u001b[0mresult_blocks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_extend_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mapplied\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult_blocks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3093\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 456 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors, values, **kwargs)\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'raise'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 470\u001b[0m return self._astype(dtype, copy=copy, errors=errors, values=values,\n\u001b[0;32m--> 471\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 472\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 473\u001b[0m def _astype(self, dtype, copy=False, errors='raise', values=None,\n", 457 | 
"\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m_astype\u001b[0;34m(self, dtype, copy, errors, values, klass, mgr, **kwargs)\u001b[0m\n\u001b[1;32m 519\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0;31m# _astype_nansafe works fine with 1-d only\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mastype_nansafe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 522\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 458 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/dtypes/cast.py\u001b[0m in \u001b[0;36mastype_nansafe\u001b[0;34m(arr, dtype, copy)\u001b[0m\n\u001b[1;32m 634\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 636\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 637\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 638\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 459 | "\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'missing'" 460 | ] 461 | } 462 | ], 463 | "source": [ 464 | "tips_sub_miss['total_bill'].astype(float)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 15, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "ename": "ValueError", 474 | "evalue": "Unable to parse string \"missing\" at position 1", 475 | "output_type": "error", 476 | "traceback": [ 477 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 478 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 479 | "\u001b[0;32mpandas/_libs/src/inference.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.maybe_convert_numeric (pandas/_libs/lib.c:55951)\u001b[0;34m()\u001b[0m\n", 480 | "\u001b[0;31mValueError\u001b[0m: Unable to parse string \"missing\"", 481 | "\nDuring handling of the above exception, another exception occurred:\n", 482 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 483 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtips_sub_miss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total_bill'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 484 | "\u001b[0;32m/home/dchen/anaconda3/lib/python3.6/site-packages/pandas/core/tools/numeric.py\u001b[0m in \u001b[0;36mto_numeric\u001b[0;34m(arg, errors, downcast)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0mcoerce_numeric\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0;34m(\u001b[0m\u001b[0;34m'ignore'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'raise'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 125\u001b[0m values = lib.maybe_convert_numeric(values, set(),\n\u001b[0;32m--> 126\u001b[0;31m coerce_numeric=coerce_numeric)\n\u001b[0m\u001b[1;32m 127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 128\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 485 | "\u001b[0;32mpandas/_libs/src/inference.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.maybe_convert_numeric (pandas/_libs/lib.c:56433)\u001b[0;34m()\u001b[0m\n", 486 | "\u001b[0;31mValueError\u001b[0m: Unable to parse string \"missing\" at position 1" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "pd.to_numeric(tips_sub_miss['total_bill'])" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 16, 497 | "metadata": {}, 498 | "outputs": [ 499 | { 500 | "data": { 501 | "text/plain": [ 502 | "0 16.99\n", 503 | "1 missing\n", 504 | "2 21.01\n", 505 | "3 missing\n", 506 | "4 24.59\n", 507 | "5 missing\n", 508 | "6 8.77\n", 509 | "7 missing\n", 510 | "8 15.04\n", 511 | "9 14.78\n", 512 | "Name: total_bill, dtype: object" 513 | ] 514 | }, 515 | "execution_count": 16, 516 | "metadata": {}, 517 | "output_type": "execute_result" 518 | } 519 | ], 520 | "source": [ 521 | "pd.to_numeric(tips_sub_miss['total_bill'], errors='ignore')" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 19, 527 | "metadata": {}, 528 | "outputs": [ 529 | { 530 | "data": { 531 | "text/plain": [ 532 | "0 16.99\n", 533 | "1 NaN\n", 534 | "2 21.01\n", 535 | "3 NaN\n", 536 | "4 24.59\n", 537 | "5 NaN\n", 538 | "6 8.77\n", 539 | "7 NaN\n", 540 | "8 15.04\n", 541 | "9 14.78\n", 542 | "Name: total_bill, dtype: float64" 543 | ] 544 | }, 545 | "execution_count": 19, 546 | "metadata": {}, 547 | "output_type": 
"execute_result" 548 | } 549 | ], 550 | "source": [ 551 | "pd.to_numeric(tips_sub_miss['total_bill'], errors='coerce')" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": { 558 | "collapsed": true 559 | }, 560 | "outputs": [], 561 | "source": [] 562 | } 563 | ], 564 | "metadata": { 565 | "kernelspec": { 566 | "display_name": "Python 3", 567 | "language": "python", 568 | "name": "python3" 569 | }, 570 | "language_info": { 571 | "codemirror_mode": { 572 | "name": "ipython", 573 | "version": 3 574 | }, 575 | "file_extension": ".py", 576 | "mimetype": "text/x-python", 577 | "name": "python", 578 | "nbconvert_exporter": "python", 579 | "pygments_lexer": "ipython3", 580 | "version": "3.6.1" 581 | } 582 | }, 583 | "nbformat": 4, 584 | "nbformat_minor": 2 585 | } 586 | -------------------------------------------------------------------------------- /02-lesson/07-groupby.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 3, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "gapminder = pd.read_csv('../data/gapminder.tsv', delimiter='\\t')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 4, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | "
countrycontinentyearlifeExppopgdpPercap
0AfghanistanAsia195228.8018425333779.445314
1AfghanistanAsia195730.3329240934820.853030
2AfghanistanAsia196231.99710267083853.100710
3AfghanistanAsia196734.02011537966836.197138
4AfghanistanAsia197236.08813079460739.981106
\n", 105 | "
" 106 | ], 107 | "text/plain": [ 108 | " country continent year lifeExp pop gdpPercap\n", 109 | "0 Afghanistan Asia 1952 28.801 8425333 779.445314\n", 110 | "1 Afghanistan Asia 1957 30.332 9240934 820.853030\n", 111 | "2 Afghanistan Asia 1962 31.997 10267083 853.100710\n", 112 | "3 Afghanistan Asia 1967 34.020 11537966 836.197138\n", 113 | "4 Afghanistan Asia 1972 36.088 13079460 739.981106" 114 | ] 115 | }, 116 | "execution_count": 4, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "gapminder.head()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 5, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "year\n", 134 | "1952 49.057620\n", 135 | "1957 51.507401\n", 136 | "1962 53.609249\n", 137 | "1967 55.678290\n", 138 | "1972 57.647386\n", 139 | "1977 59.570157\n", 140 | "1982 61.533197\n", 141 | "1987 63.212613\n", 142 | "1992 64.160338\n", 143 | "1997 65.014676\n", 144 | "2002 65.694923\n", 145 | "2007 67.007423\n", 146 | "Name: lifeExp, dtype: float64" 147 | ] 148 | }, 149 | "execution_count": 5, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "gapminder.groupby('year')['lifeExp'].mean()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 17, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "y1952 = gapminder.loc[gapminder['year'] == 1952, :]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 11, 170 | "metadata": { 171 | "collapsed": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "l = [1, 2, 3, 4, 5]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 14, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "[1, 2, 3, 4, 5]" 187 | ] 188 | }, 189 | "execution_count": 14, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | 
], 194 | "source": [ 195 | "l[:]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 9, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "49.05761971830987" 207 | ] 208 | }, 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "y1952['lifeExp'].mean()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/html": [ 226 | "
\n", 227 | "\n", 240 | "\n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " 
\n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | "
countmeanstdmin25%50%75%max
year
1952142.049.05762012.22595628.80139.0590045.135559.7650072.670
1957142.051.50740112.23128630.33241.2475048.360563.0367573.470
1962142.053.60924912.09724531.99743.4685050.881065.2345073.680
1967142.055.67829011.71885834.02046.0337553.825067.4195074.160
1972142.057.64738611.38195335.40048.5002556.530069.2475074.720
1977142.059.57015711.22722931.22050.4755059.672070.3825076.110
1982142.061.53319710.77061838.44552.9400062.441570.9212577.110
1987142.063.21261310.55628539.90654.9407565.834071.8772578.670
1992142.064.16033811.22738023.59956.1217567.703072.5825079.360
1997142.065.01467611.55943936.08755.6337569.394074.1697580.690
2002142.065.69492312.27982339.19355.5222570.825575.4592582.000
2007142.067.00742312.07302139.61357.1602571.935576.4132582.603
\n", 400 | "
" 401 | ], 402 | "text/plain": [ 403 | " count mean std min 25% 50% 75% max\n", 404 | "year \n", 405 | "1952 142.0 49.057620 12.225956 28.801 39.05900 45.1355 59.76500 72.670\n", 406 | "1957 142.0 51.507401 12.231286 30.332 41.24750 48.3605 63.03675 73.470\n", 407 | "1962 142.0 53.609249 12.097245 31.997 43.46850 50.8810 65.23450 73.680\n", 408 | "1967 142.0 55.678290 11.718858 34.020 46.03375 53.8250 67.41950 74.160\n", 409 | "1972 142.0 57.647386 11.381953 35.400 48.50025 56.5300 69.24750 74.720\n", 410 | "1977 142.0 59.570157 11.227229 31.220 50.47550 59.6720 70.38250 76.110\n", 411 | "1982 142.0 61.533197 10.770618 38.445 52.94000 62.4415 70.92125 77.110\n", 412 | "1987 142.0 63.212613 10.556285 39.906 54.94075 65.8340 71.87725 78.670\n", 413 | "1992 142.0 64.160338 11.227380 23.599 56.12175 67.7030 72.58250 79.360\n", 414 | "1997 142.0 65.014676 11.559439 36.087 55.63375 69.3940 74.16975 80.690\n", 415 | "2002 142.0 65.694923 12.279823 39.193 55.52225 70.8255 75.45925 82.000\n", 416 | "2007 142.0 67.007423 12.073021 39.613 57.16025 71.9355 76.41325 82.603" 417 | ] 418 | }, 419 | "execution_count": 10, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "gapminder.groupby('year')['lifeExp'].describe()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 18, 431 | "metadata": { 432 | "collapsed": true 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "import numpy as np" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 19, 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "data": { 446 | "text/plain": [ 447 | "continent\n", 448 | "Africa 48.865330\n", 449 | "Americas 64.658737\n", 450 | "Asia 60.064903\n", 451 | "Europe 71.903686\n", 452 | "Oceania 74.326208\n", 453 | "Name: lifeExp, dtype: float64" 454 | ] 455 | }, 456 | "execution_count": 19, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | 
"gapminder.groupby('continent')['lifeExp'].agg(np.mean)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 20, 468 | "metadata": {}, 469 | "outputs": [ 470 | { 471 | "data": { 472 | "text/plain": [ 473 | "continent\n", 474 | "Africa 9.150210\n", 475 | "Americas 9.345088\n", 476 | "Asia 11.864532\n", 477 | "Europe 5.433178\n", 478 | "Oceania 3.795611\n", 479 | "Name: lifeExp, dtype: float64" 480 | ] 481 | }, 482 | "execution_count": 20, 483 | "metadata": {}, 484 | "output_type": "execute_result" 485 | } 486 | ], 487 | "source": [ 488 | "gapminder.groupby('continent')['lifeExp'].aggregate(np.std)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 22, 494 | "metadata": { 495 | "collapsed": true 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "def my_mean(values):\n", 500 | " n = len(values)\n", 501 | " s = np.sum(values)\n", 502 | " return s / n" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 23, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/plain": [ 513 | "continent\n", 514 | "Africa 48.865330\n", 515 | "Americas 64.658737\n", 516 | "Asia 60.064903\n", 517 | "Europe 71.903686\n", 518 | "Oceania 74.326208\n", 519 | "Name: lifeExp, dtype: float64" 520 | ] 521 | }, 522 | "execution_count": 23, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "gapminder.groupby('continent')['lifeExp'].aggregate(my_mean)" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 25, 534 | "metadata": {}, 535 | "outputs": [ 536 | { 537 | "data": { 538 | "text/html": [ 539 | "
\n", 540 | "\n", 553 | "\n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | "
count_nonzeromeanstd
continent
Africa624.048.8653309.150210
Americas300.064.6587379.345088
Asia396.060.06490311.864532
Europe360.071.9036865.433178
Oceania24.074.3262083.795611
\n", 601 | "
" 602 | ], 603 | "text/plain": [ 604 | " count_nonzero mean std\n", 605 | "continent \n", 606 | "Africa 624.0 48.865330 9.150210\n", 607 | "Americas 300.0 64.658737 9.345088\n", 608 | "Asia 396.0 60.064903 11.864532\n", 609 | "Europe 360.0 71.903686 5.433178\n", 610 | "Oceania 24.0 74.326208 3.795611" 611 | ] 612 | }, 613 | "execution_count": 25, 614 | "metadata": {}, 615 | "output_type": "execute_result" 616 | } 617 | ], 618 | "source": [ 619 | "gapminder.groupby('continent')['lifeExp'].aggregate([\n", 620 | " np.count_nonzero,\n", 621 | " np.mean,\n", 622 | " np.std\n", 623 | "])" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 26, 629 | "metadata": {}, 630 | "outputs": [ 631 | { 632 | "name": "stderr", 633 | "output_type": "stream", 634 | "text": [ 635 | "/home/dchen/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n", 636 | "is deprecated and will be removed in a future version\n", 637 | " after removing the cwd from sys.path.\n" 638 | ] 639 | }, 640 | { 641 | "data": { 642 | "text/html": [ 643 | "
\n", 644 | "\n", 657 | "\n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | "
ncountmeanstd
continent
Africa624.048.8653309.150210
Americas300.064.6587379.345088
Asia396.060.06490311.864532
Europe360.071.9036865.433178
Oceania24.074.3262083.795611
\n", 705 | "
" 706 | ], 707 | "text/plain": [ 708 | " ncount mean std\n", 709 | "continent \n", 710 | "Africa 624.0 48.865330 9.150210\n", 711 | "Americas 300.0 64.658737 9.345088\n", 712 | "Asia 396.0 60.064903 11.864532\n", 713 | "Europe 360.0 71.903686 5.433178\n", 714 | "Oceania 24.0 74.326208 3.795611" 715 | ] 716 | }, 717 | "execution_count": 26, 718 | "metadata": {}, 719 | "output_type": "execute_result" 720 | } 721 | ], 722 | "source": [ 723 | "gapminder.groupby('continent')['lifeExp'].aggregate({\n", 724 | " 'ncount': np.count_nonzero,\n", 725 | " 'mean': np.mean,\n", 726 | " 'std': np.std\n", 727 | "})" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 30, 733 | "metadata": {}, 734 | "outputs": [ 735 | { 736 | "data": { 737 | "text/html": [ 738 | "
\n", 739 | "\n", 752 | "\n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | "
continentcountavgstd_dev
0Africa624.048.8653309.150210
1Americas300.064.6587379.345088
2Asia396.060.06490311.864532
3Europe360.071.9036865.433178
4Oceania24.074.3262083.795611
\n", 800 | "
" 801 | ], 802 | "text/plain": [ 803 | " continent count avg std_dev\n", 804 | "0 Africa 624.0 48.865330 9.150210\n", 805 | "1 Americas 300.0 64.658737 9.345088\n", 806 | "2 Asia 396.0 60.064903 11.864532\n", 807 | "3 Europe 360.0 71.903686 5.433178\n", 808 | "4 Oceania 24.0 74.326208 3.795611" 809 | ] 810 | }, 811 | "execution_count": 30, 812 | "metadata": {}, 813 | "output_type": "execute_result" 814 | } 815 | ], 816 | "source": [ 817 | "gapminder.groupby('continent')['lifeExp'].aggregate([\n", 818 | " np.count_nonzero,\n", 819 | " np.mean,\n", 820 | " np.std]).\\\n", 821 | " rename(columns = {'count_nonzero': 'count',\n", 822 | " 'mean': 'avg',\n", 823 | " 'std': 'std_dev'}).\\\n", 824 | " reset_index()" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": null, 830 | "metadata": { 831 | "collapsed": true 832 | }, 833 | "outputs": [], 834 | "source": [ 835 | "|" 836 | ] 837 | } 838 | ], 839 | "metadata": { 840 | "kernelspec": { 841 | "display_name": "Python 3", 842 | "language": "python", 843 | "name": "python3" 844 | }, 845 | "language_info": { 846 | "codemirror_mode": { 847 | "name": "ipython", 848 | "version": 3 849 | }, 850 | "file_extension": ".py", 851 | "mimetype": "text/x-python", 852 | "name": "python", 853 | "nbconvert_exporter": "python", 854 | "pygments_lexer": "ipython3", 855 | "version": "3.6.1" 856 | } 857 | }, 858 | "nbformat": 4, 859 | "nbformat_minor": 2 860 | } 861 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ``MIT License 2 | 3 | Copyright (c) 2017 Daniel Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | 
copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.`` 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SciPy 2017 Pandas Tutorial 2 | 3 | Thanks for attending the [tutorial](https://youtu.be/oGzU688xCUs). 4 | If you would be so kind to help me be better, please fill out the [feedback](https://docs.google.com/forms/u/0/d/e/1FAIpQLSf_hJ4akzTKIOEirxQAOE98ByD2zauXAW-ArNXX8ClNrNj9nQ/viewform?usp=sf_link) form 5 | 6 | # Installation 7 | 8 | 1. Install anaconda (use the Python 3 version): https://www.continuum.io/downloads 9 | 2. See the Software-Carpentry Installations for `bash`, `git`, `python`, and `text editor`: https://swcarpentry.github.io/workshop-template/#setup 10 | 11 | # Testing your installation 12 | 13 | 1. Run the `test_installation.py` script (or copy/paste the import statements into a python interpreter) 14 | 15 | ## How to run the Jupyter Notebook 16 | 17 | #### Windows/Mac 18 | 19 | There will be an [Anaconda Navigator](https://docs.continuum.io/anaconda/navigator/) application that installs to your system. 20 | You can launch the Jupyter notebook from there to run your python code.
21 | 22 | #### Linux 23 | 24 | Anaconda's Python installation should be your system's default python. 25 | Make sure you open a new terminal window for this to take effect. 26 | You can launch the notebook by typing `jupyter notebook` 27 | 28 | ## Creating a Notebook 29 | 30 | Once you have the Jupyter notebook launched, there's a button towards the top right called `new`. 31 | Click this and select `Python 3`. 32 | 33 | # Get Data 34 | 35 | 1. Download or clone this repository. 36 | - Press the green button towards the top right 37 | - click download zip 38 | - extract 39 | - celebrate 40 | -------------------------------------------------------------------------------- /data/concat_1.csv: -------------------------------------------------------------------------------- 1 | A,B,C,D 2 | a0,b0,c0,d0 3 | a1,b1,c1,d1 4 | a2,b2,c2,d2 5 | a3,b3,c3,d3 6 | -------------------------------------------------------------------------------- /data/concat_2.csv: -------------------------------------------------------------------------------- 1 | A,B,C,D 2 | a4,b4,c4,d4 3 | a5,b5,c5,d5 4 | a6,b6,c6,d6 5 | a7,b7,c7,d7 6 | -------------------------------------------------------------------------------- /data/concat_3.csv: -------------------------------------------------------------------------------- 1 | A,B,C,D 2 | a8,b8,c8,d8 3 | a9,b9,c9,d9 4 | a10,b10,c10,d10 5 | a11,b11,c11,d11 6 | -------------------------------------------------------------------------------- /data/doctors.csv: -------------------------------------------------------------------------------- 1 | William Hartnell (1963-66) Patrick Troughton (1966-69) Jon Pertwee (1970 74) Tom Baker (1974-81) Peter Davison (1982-84) Colin Baker (1984-86) Sylvester McCoy (1987-89) Paul McGann (1996) Christopher Eccleston (2005) David Tennant (2005-10) Matt Smith (2010-13) Peter Capaldi (2014-2017) -------------------------------------------------------------------------------- /data/doctors_unicode.csv:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/chendaniely/scipy-2017-tutorial-pandas/9d9e081c361cf959f40e21228bb930db7b60068e/data/doctors_unicode.csv -------------------------------------------------------------------------------- /data/ebola_country_timeseries.csv: -------------------------------------------------------------------------------- 1 | Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali 2 | 1/5/2015,289,2776,,10030,,,,,,1786,,2977,,,,, 3 | 1/4/2015,288,2775,,9780,,,,,,1781,,2943,,,,, 4 | 1/3/2015,287,2769,8166,9722,,,,,,1767,3496,2915,,,,, 5 | 1/2/2015,286,,8157,,,,,,,,3496,,,,,, 6 | 12/31/2014,284,2730,8115,9633,,,,,,1739,3471,2827,,,,, 7 | 12/28/2014,281,2706,8018,9446,,,,,,1708,3423,2758,,,,, 8 | 12/27/2014,280,2695,,9409,,,,,,1697,,2732,,,,, 9 | 12/24/2014,277,2630,7977,9203,,,,,,,3413,2655,,,,, 10 | 12/21/2014,273,2597,,9004,,,,,,1607,,2582,,,,, 11 | 12/20/2014,272,2571,7862,8939,,,,,,1586,3384,2556,,,,, 12 | 12/18/2014,271,,7830,,,,,,,,3376,,,,,, 13 | 12/14/2014,267,2416,,8356,,,,,,1525,,2085,,,,, 14 | 12/9/2014,262,,7797,,,,,,,,3290,,,,,, 15 | 12/7/2014,260,2292,,7897,20,1,4,1,7,1428,,1768,8,0,1,0,6 16 | 12/3/2014,256,,7719,,,,,,,,3177,,,,,, 17 | 11/30/2014,253,2164,,7312,20,1,4,1,7,1327,,1583,8,0,1,0,6 18 | 11/28/2014,251,,7635,,,,,,,,3145,,,,,, 19 | 11/23/2014,246,2134,,6599,20,1,4,1,7,1260,,1398,8,0,1,0,6 20 | 11/22/2014,245,,7168,,,,,,,,3016,,,,,, 21 | 11/18/2014,241,2047,7082,6190,20,1,4,1,6,1214,2963,1267,8,0,1,0,6 22 | 11/16/2014,239,1971,,6073,20,1,4,1,5,1192,,1250,8,0,1,0,5 23 | 11/15/2014,238,,7069,,,,,,,,2964,,,,,, 24 | 11/11/2014,234,1919,,5586,20,1,4,1,4,1166,,1187,8,0,1,0,3 25 | 11/10/2014,233,,6878,,,,,,,,2812,,,,,, 26 | 
11/9/2014,232,1878,,5368,20,1,4,1,1,1142,,1169,8,0,1,0,1 27 | 11/8/2014,231,,6822,,,,,,,,2836,,,,,, 28 | 11/4/2014,227,,6619,4862,20,1,4,1,1,,2766,1130,8,0,1,0,1 29 | 11/3/2014,226,1760,,,,,,,,1054,,,,,,, 30 | 11/2/2014,225,1731,,4759,20,1,4,1,1,1041,,1070,8,0,1,0,1 31 | 10/31/2014,222,,6525,,,,,,,,2697,,,,,, 32 | 10/29/2014,220,1667,,5338,20,1,4,1,1,1018,,1510,8,0,1,0,1 33 | 10/27/2014,218,1906,,5235,20,1,4,1,1,997,,1500,8,0,1,0,1 34 | 10/25/2014,216,,6535,,,,,,,,2413,,,,,, 35 | 10/22/2014,214,,,3896,,,4,1,1,,,1281,,,1,0,1 36 | 10/21/2014,213,1553,,,,,,,,926,,,,,,, 37 | 10/19/2014,211,1540,,3706,20,1,3,1,,904,,1259,8,0,1,0, 38 | 10/18/2014,210,,4665,,,,,,,,2705,,,,,, 39 | 10/14/2014,206,1519,,3410,20,1,3,1,,862,,1200,8,0,0,1, 40 | 10/13/2014,205,,4262,,,,,,,,2484,,,,,, 41 | 10/12/2014,204,1472,,3252,20,1,2,1,,843,,1183,8,0,1,1, 42 | 10/11/2014,203,,4249,,,,,,,,2458,,,,,, 43 | 10/8/2014,200,,,2950,20,1,1,1,,,,930,8,0,1,1, 44 | 10/7/2014,199,1350,4076,,,,,,,778,2316,,,,,, 45 | 10/5/2014,197,1298,,2789,20,1,1,,,768,,879,8,0,0,, 46 | 10/4/2014,196,,3924,,,,,,,,2210,,,,,, 47 | 10/1/2014,193,1199,3834,2437,20,1,1,,,739,2069,623,8,0,0,, 48 | 9/28/2014,190,1157,3696,2304,20,1,,,,710,1998,622,8,0,,, 49 | 9/23/2014,185,1074,3458,2021,20,1,,,,648,1830,605,8,0,,, 50 | 9/21/2014,183,1022,3280,1940,20,1,,,,635,1677,597,8,0,,, 51 | 9/20/2014,182,,,1813,,,,,,,,593,,,,, 52 | 9/19/2014,181,1008,,,,,,,,632,,,,,,, 53 | 9/17/2014,179,,3022,,,,,,,,1578,,,,,, 54 | 9/14/2014,176,942,2710,1673,,,,,,601,1459,562,,,,, 55 | 9/13/2014,175,936,,1620,21,1,,,,595,1296,562,8,0,,, 56 | 9/10/2014,172,899,,1478,21,1,,,,568,,536,8,,,, 57 | 9/9/2014,171,,2407,,,,,,,,,,,,,, 58 | 9/7/2014,169,861,2081,1424,21,3,,,,557,1137,524,8,0,,, 59 | 9/5/2014,167,812,1871,1261,22,1,,,,517,1089,491,8,,,, 60 | 8/31/2014,162,771,1698,1216,21,1,,,,494,871,476,7,,,, 61 | 8/26/2014,157,648,1378,1026,17,,,,,430,694,422,6,,,, 62 | 8/20/2014,151,607,1082,910,16,,,,,406,624,392,5,,,, 63 | 
8/18/2014,149,579,972,907,15,,,,,396,576,374,4,,,, 64 | 8/16/2014,147,543,834,848,15,,,,,394,466,365,4,,,, 65 | 8/13/2014,144,519,786,810,12,,,,,380,413,348,4,,,, 66 | 8/11/2014,142,510,670,783,12,,,,,377,355,334,3,,,, 67 | 8/9/2014,140,506,599,730,13,,,,,373,323,315,2,,,, 68 | 8/6/2014,137,495,554,717,13,,,,,367,294,298,2,,,, 69 | 8/4/2014,135,495,516,691,9,,,,,363,282,286,1,,,, 70 | 8/1/2014,132,485,468,646,4,,,,,358,255,273,1,,,, 71 | 7/30/2014,129,472,391,574,3,,,,,346,227,252,1,,,, 72 | 7/27/2014,126,460,329,533,1,,,,,339,156,233,1,,,, 73 | 7/23/2014,123,427,249,525,0,,,,,319,129,224,0,,,, 74 | 7/20/2014,120,415,224,454,,,,,,314,127,219,,,,, 75 | 7/17/2014,117,410,196,442,,,,,,310,116,206,,,,, 76 | 7/14/2014,114,411,174,397,,,,,,310,106,197,,,,, 77 | 7/12/2014,112,406,172,386,,,,,,304,105,194,,,,, 78 | 7/8/2014,108,409,142,337,,,,,,309,88,142,,,,, 79 | 7/6/2014,106,408,131,305,,,,,,307,84,127,,,,, 80 | 7/2/2014,102,412,115,252,,,,,,305,75,101,,,,, 81 | 6/30/2014,100,413,107,239,,,,,,303,65,99,,,,, 82 | 6/22/2014,92,,51,,,,,,,,34,,,,,, 83 | 6/20/2014,90,390,,158,,,,,,270,,34,,,,, 84 | 6/19/2014,89,,41,,,,,,,,25,,,,,, 85 | 6/18/2014,88,390,,136,,,,,,267,,28,,,,, 86 | 6/17/2014,87,,,97,,,,,,,,49,,,,, 87 | 6/16/2014,86,398,33,,,,,,,264,24,,,,,, 88 | 6/10/2014,80,351,13,89,,,,,,226,24,7,,,,, 89 | 6/5/2014,75,,13,81,,,,,,,,6,,,,, 90 | 6/3/2014,73,344,13,,,,,,,215,12,6,,,,, 91 | 6/1/2014,71,328,13,79,,,,,,208,12,6,,,,, 92 | 5/28/2014,67,291,13,50,,,,,,193,12,6,,,,, 93 | 5/27/2014,66,281,12,16,,,,,,186,11,5,,,,, 94 | 5/23/2014,62,258,12,0,,,,,,174,11,0,,,,, 95 | 5/12/2014,51,248,12,0,,,,,,171,11,0,,,,, 96 | 5/10/2014,49,233,12,0,,,,,,157,11,0,,,,, 97 | 5/7/2014,46,236,13,0,,,,,,158,11,0,,,,, 98 | 5/5/2014,44,235,13,0,,,,,,157,11,0,,,,, 99 | 5/3/2014,42,231,13,0,,,,,,155,11,0,,,,, 100 | 5/1/2014,40,226,13,0,,,,,,149,11,0,,,,, 101 | 4/26/2014,35,224,,0,,,,,,143,,0,,,,, 102 | 4/24/2014,33,,35,0,,,,,,,,0,,,,, 103 | 4/23/2014,32,218,,0,,,,,,141,,0,,,,, 104 | 
4/22/2014,31,,,0,,,,,,,,0,,,,, 105 | 4/21/2014,30,,34,,,,,,,,11,,,,,, 106 | 4/20/2014,29,208,,,,,,,,136,6,,,,,, 107 | 4/17/2014,26,203,27,,,,,,,129,,,,,,, 108 | 4/16/2014,25,197,27,,,,,,,122,13,,,,,, 109 | 4/15/2014,24,,,12,,,,,,,,,,,,, 110 | 4/14/2014,23,168,,,,,,,,108,,,,,,, 111 | 4/11/2014,20,159,26,2,,,,,,106,13,2,,,,, 112 | 4/9/2014,18,158,25,2,,,,,,101,12,2,,,,, 113 | 4/7/2014,16,151,21,2,,,,,,95,10,2,,,,, 114 | 4/4/2014,13,143,18,2,,,,,,86,7,2,,,,, 115 | 4/1/2014,10,127,8,2,,,,,,83,5,2,,,,, 116 | 3/31/2014,9,122,8,2,,,,,,80,4,2,,,,, 117 | 3/29/2014,7,112,7,,,,,,,70,2,,,,,, 118 | 3/28/2014,6,112,3,2,,,,,,70,3,2,,,,, 119 | 3/27/2014,5,103,8,6,,,,,,66,6,5,,,,, 120 | 3/26/2014,4,86,,,,,,,,62,,,,,,, 121 | 3/25/2014,3,86,,,,,,,,60,,,,,,, 122 | 3/24/2014,2,86,,,,,,,,59,,,,,,, 123 | 3/22/2014,0,49,,,,,,,,29,,,,,,, -------------------------------------------------------------------------------- /data/pew.csv: -------------------------------------------------------------------------------- 1 | "religion","<$10k","$10-20k","$20-30k","$30-40k","$40-50k","$50-75k","$75-100k","$100-150k",">150k","Don't know/refused" 2 | "Agnostic",27,34,60,81,76,137,122,109,84,96 3 | "Atheist",12,27,37,52,35,70,73,59,74,76 4 | "Buddhist",27,21,30,34,33,58,62,39,53,54 5 | "Catholic",418,617,732,670,638,1116,949,792,633,1489 6 | "Don’t know/refused",15,14,15,11,10,35,21,17,18,116 7 | "Evangelical Prot",575,869,1064,982,881,1486,949,723,414,1529 8 | "Hindu",1,9,7,9,11,34,47,48,54,37 9 | "Historically Black Prot",228,244,236,238,197,223,131,81,78,339 10 | "Jehovah's Witness",20,27,24,24,21,30,15,11,6,37 11 | "Jewish",19,19,25,25,30,95,69,87,151,162 12 | "Mainline Prot",289,495,619,655,651,1107,939,753,634,1328 13 | "Mormon",29,40,48,51,56,112,85,49,42,69 14 | "Muslim",6,7,9,10,9,23,16,8,6,22 15 | "Orthodox",13,17,23,32,32,47,38,42,46,73 16 | "Other Christian",9,7,11,13,13,14,18,14,12,18 17 | "Other Faiths",20,33,40,46,49,63,46,40,41,71 18 | "Other World Religions",5,2,3,4,2,7,3,4,4,8 19 | 
"Unaffiliated",217,299,374,365,341,528,407,321,258,597 20 | -------------------------------------------------------------------------------- /data/preg.csv: -------------------------------------------------------------------------------- 1 | "name","treatmenta","treatmentb" 2 | "John Smith",NA,18 3 | "Jane Doe",4,1 4 | "Mary Johnson",6,7 5 | -------------------------------------------------------------------------------- /data/preg2.csv: -------------------------------------------------------------------------------- 1 | "treatment","John Smith","Jane Doe","Mary Johnson" 2 | "a",NA,4,6 3 | "b",18,1,7 4 | -------------------------------------------------------------------------------- /data/survey_person.csv: -------------------------------------------------------------------------------- 1 | ident,personal,family 2 | dyer,William,Dyer 3 | pb,Frank,Pabodie 4 | lake,Anderson,Lake 5 | roe,Valentina,Roerich 6 | danforth,Frank,Danforth 7 | -------------------------------------------------------------------------------- /data/survey_site.csv: -------------------------------------------------------------------------------- 1 | name,lat,long 2 | DR-1,-49.85,-128.57 3 | DR-3,-47.15,-126.72 4 | MSK-4,-48.87,-123.4 5 | -------------------------------------------------------------------------------- /data/survey_survey.csv: -------------------------------------------------------------------------------- 1 | taken,person,quant,reading 2 | 619,dyer,rad,9.82 3 | 619,dyer,sal,0.13 4 | 622,dyer,rad,7.8 5 | 622,dyer,sal,0.09 6 | 734,pb,rad,8.41 7 | 734,lake,sal,0.05 8 | 734,pb,temp,-21.5 9 | 735,pb,rad,7.22 10 | 735,,sal,0.06 11 | 735,,temp,-26.0 12 | 751,pb,rad,4.35 13 | 751,pb,temp,-18.5 14 | 751,lake,sal,0.1 15 | 752,lake,rad,2.19 16 | 752,lake,sal,0.09 17 | 752,lake,temp,-16.0 18 | 752,roe,sal,41.6 19 | 837,lake,rad,1.46 20 | 837,lake,sal,0.21 21 | 837,roe,sal,22.5 22 | 844,roe,rad,11.25 23 | -------------------------------------------------------------------------------- 
/data/survey_visited.csv: -------------------------------------------------------------------------------- 1 | ident,site,dated 2 | 619,DR-1,1927-02-08 3 | 622,DR-1,1927-02-10 4 | 734,DR-3,1939-01-07 5 | 735,DR-3,1930-01-12 6 | 751,DR-3,1930-02-26 7 | 752,DR-3, 8 | 837,MSK-4,1932-01-14 9 | 844,DR-1,1932-03-22 10 | -------------------------------------------------------------------------------- /data/weather.csv: -------------------------------------------------------------------------------- 1 | "id","year","month","element","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" 2 | "MX17004",2010,1,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,27.8,NA 3 | "MX17004",2010,1,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,14.5,NA 4 | "MX17004",2010,2,"tmax",NA,27.3,24.1,NA,NA,NA,NA,NA,NA,NA,29.7,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,29.9,NA,NA,NA,NA,NA,NA,NA,NA 5 | "MX17004",2010,2,"tmin",NA,14.4,14.4,NA,NA,NA,NA,NA,NA,NA,13.4,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,10.7,NA,NA,NA,NA,NA,NA,NA,NA 6 | "MX17004",2010,3,"tmax",NA,NA,NA,NA,32.1,NA,NA,NA,NA,34.5,NA,NA,NA,NA,NA,31.1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 7 | "MX17004",2010,3,"tmin",NA,NA,NA,NA,14.2,NA,NA,NA,NA,16.8,NA,NA,NA,NA,NA,17.6,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 8 | "MX17004",2010,4,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,36.3,NA,NA,NA,NA 9 | "MX17004",2010,4,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,16.7,NA,NA,NA,NA 10 | "MX17004",2010,5,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,33.2,NA,NA,NA,NA 11 | "MX17004",2010,5,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,18.2,NA,NA,NA,NA 12 | 
"MX17004",2010,6,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,28,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,30.1,NA,NA 13 | "MX17004",2010,6,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,17.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,18,NA,NA 14 | "MX17004",2010,7,"tmax",NA,NA,28.6,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,29.9,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 15 | "MX17004",2010,7,"tmin",NA,NA,17.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,16.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 16 | "MX17004",2010,8,"tmax",NA,NA,NA,NA,29.6,NA,NA,29,NA,NA,NA,NA,29.8,NA,NA,NA,NA,NA,NA,NA,NA,NA,26.4,NA,29.7,NA,NA,NA,28,NA,25.4 17 | "MX17004",2010,8,"tmin",NA,NA,NA,NA,15.8,NA,NA,17.3,NA,NA,NA,NA,16.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,15,NA,15.6,NA,NA,NA,15.3,NA,15.4 18 | "MX17004",2010,10,"tmax",NA,NA,NA,NA,27,NA,28.1,NA,NA,NA,NA,NA,NA,29.5,28.7,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,31.2,NA,NA,NA 19 | "MX17004",2010,10,"tmin",NA,NA,NA,NA,14,NA,12.9,NA,NA,NA,NA,NA,NA,13,10.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,15,NA,NA,NA 20 | "MX17004",2010,11,"tmax",NA,31.3,NA,27.2,26.3,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,28.1,27.7,NA,NA,NA,NA 21 | "MX17004",2010,11,"tmin",NA,16.3,NA,12,7.9,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,12.1,14.2,NA,NA,NA,NA 22 | "MX17004",2010,12,"tmax",29.9,NA,NA,NA,NA,27.8,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 23 | "MX17004",2010,12,"tmin",13.8,NA,NA,NA,NA,10.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 24 | -------------------------------------------------------------------------------- /output/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chendaniely/scipy-2017-tutorial-pandas/9d9e081c361cf959f40e21228bb930db7b60068e/output/.gitkeep -------------------------------------------------------------------------------- /test_installation.py: 
-------------------------------------------------------------------------------- 1 | """ import all the necessary libraries required""" 2 | import pandas as pd 3 | import seaborn as sns 4 | import sklearn as sk 5 | --------------------------------------------------------------------------------