├── .gitignore ├── README.md ├── Tidy Data.ipynb └── data ├── 2014-baby-names-illinois.csv ├── 2015-baby-names-illinois.csv ├── billboard.csv ├── pew-raw.csv ├── tb-raw.csv ├── weather-raw.csv └── weather.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | data/*.tex 3 | data/*.r -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tidy Data in Python 2 | 3 | **Author** 4 | Jean-Nicholas Hould 5 | 6 | **Description** 7 | This notebook demonstrates some manipulations to transform messy datasets into the tidy format using Python pandas. 8 | 9 | **Additional Information** 10 | For additional details, please read my [blog post](http://www.jeannicholashould.com/tidy-data-in-python.html), which covers this notebook in detail. 11 | -------------------------------------------------------------------------------- /Tidy Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tidy Data in Python\n", 8 | "by [Jean-Nicholas Hould](http://www.jeannicholashould.com/)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 6, 14 | "metadata": { 15 | "collapsed": false 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "import pandas as pd\n", 20 | "import datetime\n", 21 | "from os import listdir\n", 22 | "from os.path import isfile, join\n", 23 | "import glob\n", 24 | "import re" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Column headers are values, not variable names" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "### Pew Research Center Dataset" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 7, 44 | 
"metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | "
religion<$10k$10-20k$20-30k$30-40k$40-50k$50-75k
0Agnostic2734608176137
1Atheist122737523570
2Buddhist272130343358
3Catholic4186177326706381116
4Dont know/refused151415111035
5Evangelical Prot57586910649828811486
6Hindu19791134
7Historically Black Prot228244236238197223
8Jehovahs Witness202724242130
9Jewish191925253095
\n", 168 | "
" 169 | ], 170 | "text/plain": [ 171 | " religion <$10k $10-20k $20-30k $30-40k $40-50k \\\n", 172 | "0 Agnostic 27 34 60 81 76 \n", 173 | "1 Atheist 12 27 37 52 35 \n", 174 | "2 Buddhist 27 21 30 34 33 \n", 175 | "3 Catholic 418 617 732 670 638 \n", 176 | "4 Dont know/refused 15 14 15 11 10 \n", 177 | "5 Evangelical Prot 575 869 1064 982 881 \n", 178 | "6 Hindu 1 9 7 9 11 \n", 179 | "7 Historically Black Prot 228 244 236 238 197 \n", 180 | "8 Jehovahs Witness 20 27 24 24 21 \n", 181 | "9 Jewish 19 19 25 25 30 \n", 182 | "\n", 183 | " $50-75k \n", 184 | "0 137 \n", 185 | "1 70 \n", 186 | "2 58 \n", 187 | "3 1116 \n", 188 | "4 35 \n", 189 | "5 1486 \n", 190 | "6 34 \n", 191 | "7 223 \n", 192 | "8 30 \n", 193 | "9 95 " 194 | ] 195 | }, 196 | "execution_count": 7, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "df = pd.read_csv(\"./data/pew-raw.csv\")\n", 203 | "df" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 8, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/html": [ 216 | "
\n", 217 | "\n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
religionincomefreq
0Agnostic<$10k27
30Agnostic$30-40k81
40Agnostic$40-50k76
50Agnostic$50-75k137
10Agnostic$10-20k34
20Agnostic$20-30k60
41Atheist$40-50k35
21Atheist$20-30k37
11Atheist$10-20k27
31Atheist$30-40k52
\n", 289 | "
" 290 | ], 291 | "text/plain": [ 292 | " religion income freq\n", 293 | "0 Agnostic <$10k 27\n", 294 | "30 Agnostic $30-40k 81\n", 295 | "40 Agnostic $40-50k 76\n", 296 | "50 Agnostic $50-75k 137\n", 297 | "10 Agnostic $10-20k 34\n", 298 | "20 Agnostic $20-30k 60\n", 299 | "41 Atheist $40-50k 35\n", 300 | "21 Atheist $20-30k 37\n", 301 | "11 Atheist $10-20k 27\n", 302 | "31 Atheist $30-40k 52" 303 | ] 304 | }, 305 | "execution_count": 8, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "formatted_df = pd.melt(df,[\"religion\"], var_name=\"income\", value_name=\"freq\")\n", 312 | "formatted_df = formatted_df.sort_values(by=[\"religion\"])\n", 313 | "formatted_df.head(10)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "### Billboard Top 100 Dataset" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 69, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/html": [ 333 | "
\n", 334 | "\n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | 
" \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | "
yearartist.invertedtracktimegenredate.entereddate.peakedx1st.weekx2nd.weekx3rd.week...x67th.weekx68th.weekx69th.weekx70th.weekx71st.weekx72nd.weekx73rd.weekx74th.weekx75th.weekx76th.week
02000Destiny's ChildIndependent Women Part I3:38Rock2000-09-232000-11-187863.049.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
12000SantanaMaria, Maria4:18Rock2000-02-122000-04-08158.06.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22000Savage GardenI Knew I Loved You4:07Rock1999-10-232000-01-297148.043.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
32000MadonnaMusic3:45Rock2000-08-122000-09-164123.018.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
42000Aguilera, ChristinaCome On Over Baby (All I Want Is You)3:38Rock2000-08-052000-10-145747.045.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
52000JanetDoesn't Really Matter4:17Rock2000-06-172000-08-265952.043.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
62000Destiny's ChildSay My Name4:31Rock1999-12-252000-03-188383.044.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
72000Iglesias, EnriqueBe With You3:36Latin2000-04-012000-06-246345.034.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
82000SisqoIncomplete3:52Rock2000-06-242000-08-127766.061.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
92000LonestarAmazed4:25Country1999-06-052000-03-048154.044.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", 604 | "

10 rows × 83 columns

\n", 605 | "
" 606 | ], 607 | "text/plain": [ 608 | " year artist.inverted track time \\\n", 609 | "0 2000 Destiny's Child Independent Women Part I 3:38 \n", 610 | "1 2000 Santana Maria, Maria 4:18 \n", 611 | "2 2000 Savage Garden I Knew I Loved You 4:07 \n", 612 | "3 2000 Madonna Music 3:45 \n", 613 | "4 2000 Aguilera, Christina Come On Over Baby (All I Want Is You) 3:38 \n", 614 | "5 2000 Janet Doesn't Really Matter 4:17 \n", 615 | "6 2000 Destiny's Child Say My Name 4:31 \n", 616 | "7 2000 Iglesias, Enrique Be With You 3:36 \n", 617 | "8 2000 Sisqo Incomplete 3:52 \n", 618 | "9 2000 Lonestar Amazed 4:25 \n", 619 | "\n", 620 | " genre date.entered date.peaked x1st.week x2nd.week x3rd.week \\\n", 621 | "0 Rock 2000-09-23 2000-11-18 78 63.0 49.0 \n", 622 | "1 Rock 2000-02-12 2000-04-08 15 8.0 6.0 \n", 623 | "2 Rock 1999-10-23 2000-01-29 71 48.0 43.0 \n", 624 | "3 Rock 2000-08-12 2000-09-16 41 23.0 18.0 \n", 625 | "4 Rock 2000-08-05 2000-10-14 57 47.0 45.0 \n", 626 | "5 Rock 2000-06-17 2000-08-26 59 52.0 43.0 \n", 627 | "6 Rock 1999-12-25 2000-03-18 83 83.0 44.0 \n", 628 | "7 Latin 2000-04-01 2000-06-24 63 45.0 34.0 \n", 629 | "8 Rock 2000-06-24 2000-08-12 77 66.0 61.0 \n", 630 | "9 Country 1999-06-05 2000-03-04 81 54.0 44.0 \n", 631 | "\n", 632 | " ... x67th.week x68th.week x69th.week x70th.week x71st.week \\\n", 633 | "0 ... NaN NaN NaN NaN NaN \n", 634 | "1 ... NaN NaN NaN NaN NaN \n", 635 | "2 ... NaN NaN NaN NaN NaN \n", 636 | "3 ... NaN NaN NaN NaN NaN \n", 637 | "4 ... NaN NaN NaN NaN NaN \n", 638 | "5 ... NaN NaN NaN NaN NaN \n", 639 | "6 ... NaN NaN NaN NaN NaN \n", 640 | "7 ... NaN NaN NaN NaN NaN \n", 641 | "8 ... NaN NaN NaN NaN NaN \n", 642 | "9 ... 
NaN NaN NaN NaN NaN \n", 643 | "\n", 644 | " x72nd.week x73rd.week x74th.week x75th.week x76th.week \n", 645 | "0 NaN NaN NaN NaN NaN \n", 646 | "1 NaN NaN NaN NaN NaN \n", 647 | "2 NaN NaN NaN NaN NaN \n", 648 | "3 NaN NaN NaN NaN NaN \n", 649 | "4 NaN NaN NaN NaN NaN \n", 650 | "5 NaN NaN NaN NaN NaN \n", 651 | "6 NaN NaN NaN NaN NaN \n", 652 | "7 NaN NaN NaN NaN NaN \n", 653 | "8 NaN NaN NaN NaN NaN \n", 654 | "9 NaN NaN NaN NaN NaN \n", 655 | "\n", 656 | "[10 rows x 83 columns]" 657 | ] 658 | }, 659 | "execution_count": 69, 660 | "metadata": {}, 661 | "output_type": "execute_result" 662 | } 663 | ], 664 | "source": [ 665 | "df = pd.read_csv(\"./data/billboard.csv\", encoding=\"mac_latin2\")\n", 666 | "df.head(10)" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 70, 672 | "metadata": { 673 | "collapsed": false 674 | }, 675 | "outputs": [ 676 | { 677 | "data": { 678 | "text/html": [ 679 | "
\n", 680 | "\n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | "
yearartist.invertedtracktimegenreweekrankdate
24620002 PacBaby Don't Cry (Keep Ya Head Up II)4:22Rap1872000-02-26
56320002 PacBaby Don't Cry (Keep Ya Head Up II)4:22Rap2822000-03-04
88020002 PacBaby Don't Cry (Keep Ya Head Up II)4:22Rap3722000-03-11
119720002 PacBaby Don't Cry (Keep Ya Head Up II)4:22Rap4772000-03-18
151420002 PacBaby Don't Cry (Keep Ya Head Up II)4:22Rap5872000-03-25
183120002 PacBaby Don't Cry (Keep Ya Head Up II)4:22Rap6942000-04-01
214820002 PacBaby Don't Cry (Keep Ya Head Up II)4:22Rap7992000-04-08
28720002Ge+herThe Hardest Part Of Breaking Up (Is Getting Ba...3:15R&B1912000-09-02
60420002Ge+herThe Hardest Part Of Breaking Up (Is Getting Ba...3:15R&B2872000-09-09
92120002Ge+herThe Hardest Part Of Breaking Up (Is Getting Ba...3:15R&B3922000-09-16
\n", 807 | "
" 808 | ], 809 | "text/plain": [ 810 | " year artist.inverted track \\\n", 811 | "246 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n", 812 | "563 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n", 813 | "880 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n", 814 | "1197 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n", 815 | "1514 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n", 816 | "1831 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n", 817 | "2148 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n", 818 | "287 2000 2Ge+her The Hardest Part Of Breaking Up (Is Getting Ba... \n", 819 | "604 2000 2Ge+her The Hardest Part Of Breaking Up (Is Getting Ba... \n", 820 | "921 2000 2Ge+her The Hardest Part Of Breaking Up (Is Getting Ba... \n", 821 | "\n", 822 | " time genre week rank date \n", 823 | "246 4:22 Rap 1 87 2000-02-26 \n", 824 | "563 4:22 Rap 2 82 2000-03-04 \n", 825 | "880 4:22 Rap 3 72 2000-03-11 \n", 826 | "1197 4:22 Rap 4 77 2000-03-18 \n", 827 | "1514 4:22 Rap 5 87 2000-03-25 \n", 828 | "1831 4:22 Rap 6 94 2000-04-01 \n", 829 | "2148 4:22 Rap 7 99 2000-04-08 \n", 830 | "287 3:15 R&B 1 91 2000-09-02 \n", 831 | "604 3:15 R&B 2 87 2000-09-09 \n", 832 | "921 3:15 R&B 3 92 2000-09-16 " 833 | ] 834 | }, 835 | "execution_count": 70, 836 | "metadata": {}, 837 | "output_type": "execute_result" 838 | } 839 | ], 840 | "source": [ 841 | "# Melting\n", 842 | "id_vars = [\"year\",\"artist.inverted\",\"track\",\"time\",\"genre\",\"date.entered\",\"date.peaked\"]\n", 843 | "df = pd.melt(frame=df,id_vars=id_vars, var_name=\"week\", value_name=\"rank\")\n", 844 | "\n", 845 | "# Formatting \n", 846 | "df[\"week\"] = df['week'].str.extract('(\\d+)', expand=False).astype(int)\n", 847 | "df[\"rank\"] = df[\"rank\"].astype(int)\n", 848 | "\n", 849 | "# Cleaning out unnecessary rows\n", 850 | "df = df.dropna()\n", 851 | "\n", 852 | "# Create \"date\" columns\n", 853 | "df['date'] = pd.to_datetime(df['date.entered']) + pd.to_timedelta(df['week'], unit='w') - 
pd.DateOffset(weeks=1)\n", 854 | "\n", 855 | "df = df[[\"year\", \"artist.inverted\", \"track\", \"time\", \"genre\", \"week\", \"rank\", \"date\"]]\n", 856 | "df = df.sort_values(ascending=True, by=[\"year\",\"artist.inverted\",\"track\",\"week\",\"rank\"])\n", 857 | "\n", 858 | "# Assigning the tidy dataset to a variable for future usage\n", 859 | "billboard = df\n", 860 | "\n", 861 | "df.head(10)" 862 | ] 863 | }, 864 | { 865 | "cell_type": "markdown", 866 | "metadata": {}, 867 | "source": [ 868 | "## Multiple types in one table" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": 11, 874 | "metadata": { 875 | "collapsed": false 876 | }, 877 | "outputs": [ 878 | { 879 | "data": { 880 | "text/html": [ 881 | "
\n", 882 | "\n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | "
yearartist.invertedtracktimegenresong_id
020002 PacBaby Don't Cry (Keep Ya Head Up II)4:22Rap0
120002Ge+herThe Hardest Part Of Breaking Up (Is Getting Ba...3:15R&B1
220003 Doors DownKryptonite3:53Rock2
320003 Doors DownLoser4:24Rock3
42000504 BoyzWobble Wobble3:35Rap4
5200098°Give Me Just One Night (Una Noche)3:24Rock5
62000A*TeensDancing Queen3:44Pop6
72000AaliyahI Don't Wanna4:15Rock7
82000AaliyahTry Again4:03Rock8
92000Adams, YolandaOpen My Heart5:30Gospel9
\n", 987 | "
" 988 | ], 989 | "text/plain": [ 990 | " year artist.inverted track \\\n", 991 | "0 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n", 992 | "1 2000 2Ge+her The Hardest Part Of Breaking Up (Is Getting Ba... \n", 993 | "2 2000 3 Doors Down Kryptonite \n", 994 | "3 2000 3 Doors Down Loser \n", 995 | "4 2000 504 Boyz Wobble Wobble \n", 996 | "5 2000 98° Give Me Just One Night (Una Noche) \n", 997 | "6 2000 A*Teens Dancing Queen \n", 998 | "7 2000 Aaliyah I Don't Wanna \n", 999 | "8 2000 Aaliyah Try Again \n", 1000 | "9 2000 Adams, Yolanda Open My Heart \n", 1001 | "\n", 1002 | " time genre song_id \n", 1003 | "0 4:22 Rap 0 \n", 1004 | "1 3:15 R&B 1 \n", 1005 | "2 3:53 Rock 2 \n", 1006 | "3 4:24 Rock 3 \n", 1007 | "4 3:35 Rap 4 \n", 1008 | "5 3:24 Rock 5 \n", 1009 | "6 3:44 Pop 6 \n", 1010 | "7 4:15 Rock 7 \n", 1011 | "8 4:03 Rock 8 \n", 1012 | "9 5:30 Gospel 9 " 1013 | ] 1014 | }, 1015 | "execution_count": 11, 1016 | "metadata": {}, 1017 | "output_type": "execute_result" 1018 | } 1019 | ], 1020 | "source": [ 1021 | "songs_cols = [\"year\", \"artist.inverted\", \"track\", \"time\", \"genre\"]\n", 1022 | "songs = billboard[songs_cols].drop_duplicates()\n", 1023 | "songs = songs.reset_index(drop=True)\n", 1024 | "songs[\"song_id\"] = songs.index\n", 1025 | "songs.head(10)" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": 12, 1031 | "metadata": { 1032 | "collapsed": false 1033 | }, 1034 | "outputs": [ 1035 | { 1036 | "data": { 1037 | "text/html": [ 1038 | "
\n", 1039 | "\n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | "
song_iddaterank
002000-02-2687
102000-03-0482
202000-03-1172
302000-03-1877
402000-03-2587
502000-04-0194
602000-04-0899
712000-09-0291
812000-09-0987
912000-09-1692
\n", 1111 | "
" 1112 | ], 1113 | "text/plain": [ 1114 | " song_id date rank\n", 1115 | "0 0 2000-02-26 87\n", 1116 | "1 0 2000-03-04 82\n", 1117 | "2 0 2000-03-11 72\n", 1118 | "3 0 2000-03-18 77\n", 1119 | "4 0 2000-03-25 87\n", 1120 | "5 0 2000-04-01 94\n", 1121 | "6 0 2000-04-08 99\n", 1122 | "7 1 2000-09-02 91\n", 1123 | "8 1 2000-09-09 87\n", 1124 | "9 1 2000-09-16 92" 1125 | ] 1126 | }, 1127 | "execution_count": 12, 1128 | "metadata": {}, 1129 | "output_type": "execute_result" 1130 | } 1131 | ], 1132 | "source": [ 1133 | "ranks = pd.merge(billboard, songs, on=[\"year\",\"artist.inverted\", \"track\", \"time\", \"genre\"])\n", 1134 | "ranks = ranks[[\"song_id\", \"date\",\"rank\"]]\n", 1135 | "ranks.head(10)" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "markdown", 1140 | "metadata": { 1141 | "collapsed": true 1142 | }, 1143 | "source": [ 1144 | "## Multiple variables stored in one column" 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "markdown", 1149 | "metadata": {}, 1150 | "source": [ 1151 | "### Tuberculosis Example" 1152 | ] 1153 | }, 1154 | { 1155 | "cell_type": "markdown", 1156 | "metadata": {}, 1157 | "source": [ 1158 | "A few notes on the raw data set:\n", 1159 | "\n", 1160 | "- The columns starting with \"m\" or \"f\" contain multiple variables: \n", 1161 | " - Sex (\"m\" or \"f\")\n", 1162 | " - Age Group (\"0-14\", \"15-24\", \"25-34\", \"35-44\", \"45-54\", \"55-64\", \"65\", \"unknown\")\n", 1163 | "- Mixture of 0s and missing values (\"NaN\"). This is due to the data collection process, and the distinction is important for this dataset." 1164 | ] 1165 | }, 1166 | { 1167 | "cell_type": "code", 1168 | "execution_count": 49, 1169 | "metadata": { 1170 | "collapsed": false 1171 | }, 1172 | "outputs": [ 1173 | { 1174 | "data": { 1175 | "text/html": [ 1176 | "
\n", 1177 | "\n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | 
" \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | "
countryyearm014m1524m2534m3544m4554m5564m65muf014
0AD20000.00.01.00.0000.0NaNNaN
1AE20002.04.04.06.051210.0NaN3.0
2AF200052.0228.0183.0149.01299480.0NaN93.0
3AG20000.00.00.00.0001.0NaN1.0
4AL20002.019.021.014.0241916.0NaN3.0
5AM20002.0152.0130.0131.0632621.0NaN1.0
6AN20000.00.01.02.0000.0NaN0.0
7AO2000186.0999.01003.0912.0482312194.0NaN247.0
8AR200097.0278.0594.0402.0419368330.0NaN121.0
9AS2000NaNNaNNaNNaN11NaNNaNNaN
\n", 1337 | "
" 1338 | ], 1339 | "text/plain": [ 1340 | " country year m014 m1524 m2534 m3544 m4554 m5564 m65 mu f014\n", 1341 | "0 AD 2000 0.0 0.0 1.0 0.0 0 0 0.0 NaN NaN\n", 1342 | "1 AE 2000 2.0 4.0 4.0 6.0 5 12 10.0 NaN 3.0\n", 1343 | "2 AF 2000 52.0 228.0 183.0 149.0 129 94 80.0 NaN 93.0\n", 1344 | "3 AG 2000 0.0 0.0 0.0 0.0 0 0 1.0 NaN 1.0\n", 1345 | "4 AL 2000 2.0 19.0 21.0 14.0 24 19 16.0 NaN 3.0\n", 1346 | "5 AM 2000 2.0 152.0 130.0 131.0 63 26 21.0 NaN 1.0\n", 1347 | "6 AN 2000 0.0 0.0 1.0 2.0 0 0 0.0 NaN 0.0\n", 1348 | "7 AO 2000 186.0 999.0 1003.0 912.0 482 312 194.0 NaN 247.0\n", 1349 | "8 AR 2000 97.0 278.0 594.0 402.0 419 368 330.0 NaN 121.0\n", 1350 | "9 AS 2000 NaN NaN NaN NaN 1 1 NaN NaN NaN" 1351 | ] 1352 | }, 1353 | "execution_count": 49, 1354 | "metadata": {}, 1355 | "output_type": "execute_result" 1356 | } 1357 | ], 1358 | "source": [ 1359 | "df = pd.read_csv(\"./data/tb-raw.csv\")\n", 1360 | "df" 1361 | ] 1362 | }, 1363 | { 1364 | "cell_type": "code", 1365 | "execution_count": 50, 1366 | "metadata": { 1367 | "collapsed": false 1368 | }, 1369 | "outputs": [ 1370 | { 1371 | "data": { 1372 | "text/html": [ 1373 | "
\n", 1374 | "\n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | "
countryyearcasessexage
0AD20000.0m0-14
10AD20000.0m15-24
20AD20001.0m25-34
30AD20000.0m35-44
40AD20000.0m45-54
50AD20000.0m55-64
81AE20003.0f0-14
1AE20002.0m0-14
11AE20004.0m15-24
21AE20004.0m25-34
\n", 1468 | "
" 1469 | ], 1470 | "text/plain": [ 1471 | " country year cases sex age\n", 1472 | "0 AD 2000 0.0 m 0-14\n", 1473 | "10 AD 2000 0.0 m 15-24\n", 1474 | "20 AD 2000 1.0 m 25-34\n", 1475 | "30 AD 2000 0.0 m 35-44\n", 1476 | "40 AD 2000 0.0 m 45-54\n", 1477 | "50 AD 2000 0.0 m 55-64\n", 1478 | "81 AE 2000 3.0 f 0-14\n", 1479 | "1 AE 2000 2.0 m 0-14\n", 1480 | "11 AE 2000 4.0 m 15-24\n", 1481 | "21 AE 2000 4.0 m 25-34" 1482 | ] 1483 | }, 1484 | "execution_count": 50, 1485 | "metadata": {}, 1486 | "output_type": "execute_result" 1487 | } 1488 | ], 1489 | "source": [ 1490 | "df = pd.melt(df, id_vars=[\"country\",\"year\"], value_name=\"cases\", var_name=\"sex_and_age\")\n", 1491 | "\n", 1492 | "# Extract Sex, Age lower bound and Age upper bound group\n", 1493 | "tmp_df = df[\"sex_and_age\"].str.extract(\"(\\D)(\\d+)(\\d{2})\", expand=False) \n", 1494 | "\n", 1495 | "# Name columns\n", 1496 | "tmp_df.columns = [\"sex\", \"age_lower\", \"age_upper\"]\n", 1497 | "\n", 1498 | "# Create `age`column based on `age_lower` and `age_upper`\n", 1499 | "tmp_df[\"age\"] = tmp_df[\"age_lower\"] + \"-\" + tmp_df[\"age_upper\"]\n", 1500 | "\n", 1501 | "# Merge \n", 1502 | "df = pd.concat([df, tmp_df], axis=1)\n", 1503 | "\n", 1504 | "# Drop unnecessary columns and rows\n", 1505 | "df = df.drop(['sex_and_age',\"age_lower\",\"age_upper\"], axis=1)\n", 1506 | "df = df.dropna()\n", 1507 | "df = df.sort_values(ascending=True,by=[\"country\", \"year\", \"sex\", \"age\"])\n", 1508 | "df.head(10)" 1509 | ] 1510 | }, 1511 | { 1512 | "cell_type": "markdown", 1513 | "metadata": {}, 1514 | "source": [ 1515 | "## Variables are stored in both rows and columns" 1516 | ] 1517 | }, 1518 | { 1519 | "cell_type": "markdown", 1520 | "metadata": {}, 1521 | "source": [ 1522 | "### Global Historical Climatology Network Dataset" 1523 | ] 1524 | }, 1525 | { 1526 | "cell_type": "code", 1527 | "execution_count": 24, 1528 | "metadata": { 1529 | "collapsed": false 1530 | }, 1531 | "outputs": [], 1532 | "source": [ 
1533 | "df = pd.read_csv(\"./data/weather-raw.csv\")" 1534 | ] 1535 | }, 1536 | { 1537 | "cell_type": "code", 1538 | "execution_count": 25, 1539 | "metadata": { 1540 | "collapsed": false 1541 | }, 1542 | "outputs": [ 1543 | { 1544 | "data": { 1545 | "text/html": [ 1546 | "
\n", 1547 | "\n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | "
idyearmonthelementday_rawvalue
0MX1700420101tmaxd1NaN
1MX1700420101tmind1NaN
2MX1700420102tmaxd1NaN
3MX1700420102tmind1NaN
4MX1700420103tmaxd1NaN
5MX1700420103tmind1NaN
6MX1700420104tmaxd1NaN
7MX1700420104tmind1NaN
8MX1700420105tmaxd1NaN
9MX1700420105tmind1NaN
\n", 1652 | "
" 1653 | ], 1654 | "text/plain": [ 1655 | " id year month element day_raw value\n", 1656 | "0 MX17004 2010 1 tmax d1 NaN\n", 1657 | "1 MX17004 2010 1 tmin d1 NaN\n", 1658 | "2 MX17004 2010 2 tmax d1 NaN\n", 1659 | "3 MX17004 2010 2 tmin d1 NaN\n", 1660 | "4 MX17004 2010 3 tmax d1 NaN\n", 1661 | "5 MX17004 2010 3 tmin d1 NaN\n", 1662 | "6 MX17004 2010 4 tmax d1 NaN\n", 1663 | "7 MX17004 2010 4 tmin d1 NaN\n", 1664 | "8 MX17004 2010 5 tmax d1 NaN\n", 1665 | "9 MX17004 2010 5 tmin d1 NaN" 1666 | ] 1667 | }, 1668 | "execution_count": 25, 1669 | "metadata": {}, 1670 | "output_type": "execute_result" 1671 | } 1672 | ], 1673 | "source": [ 1674 | "df = pd.melt(df, id_vars=[\"id\", \"year\",\"month\",\"element\"], var_name=\"day_raw\")\n", 1675 | "df.head(10)" 1676 | ] 1677 | }, 1678 | { 1679 | "cell_type": "code", 1680 | "execution_count": 26, 1681 | "metadata": { 1682 | "collapsed": false 1683 | }, 1684 | "outputs": [ 1685 | { 1686 | "data": { 1687 | "text/html": [ 1688 | "
\n", 1689 | "\n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | "
elementiddatetmaxtmin
0MX170042010-02-0227.314.4
1MX170042010-02-0324.114.4
2MX170042010-03-0532.114.2
\n", 1723 | "
" 1724 | ], 1725 | "text/plain": [ 1726 | "element id date tmax tmin\n", 1727 | "0 MX17004 2010-02-02 27.3 14.4\n", 1728 | "1 MX17004 2010-02-03 24.1 14.4\n", 1729 | "2 MX17004 2010-03-05 32.1 14.2" 1730 | ] 1731 | }, 1732 | "execution_count": 26, 1733 | "metadata": {}, 1734 | "output_type": "execute_result" 1735 | } 1736 | ], 1737 | "source": [ 1738 | "# Extracting day\n", 1739 | "df[\"day\"] = df[\"day_raw\"].str.extract(\"d(\\d+)\", expand=False) \n", 1740 | "df[\"id\"] = \"MX17004\"\n", 1741 | "\n", 1742 | "# To numeric values\n", 1743 | "df[[\"year\",\"month\",\"day\"]] = df[[\"year\",\"month\",\"day\"]].apply(lambda x: pd.to_numeric(x, errors='ignore'))\n", 1744 | "\n", 1745 | "# Creating a date from the different columns\n", 1746 | "def create_date_from_year_month_day(row):\n", 1747 | " return datetime.datetime(year=row[\"year\"], month=int(row[\"month\"]), day=row[\"day\"])\n", 1748 | "\n", 1749 | "df[\"date\"] = df.apply(lambda row: create_date_from_year_month_day(row), axis=1)\n", 1750 | "df = df.drop(['year',\"month\",\"day\", \"day_raw\"], axis=1)\n", 1751 | "df = df.dropna()\n", 1752 | "\n", 1753 | "# Unmelting column \"element\"\n", 1754 | "df = df.pivot_table(index=[\"id\",\"date\"], columns=\"element\", values=\"value\")\n", 1755 | "df.reset_index(drop=False, inplace=True)\n", 1756 | "df" 1757 | ] 1758 | }, 1759 | { 1760 | "cell_type": "markdown", 1761 | "metadata": { 1762 | "collapsed": true 1763 | }, 1764 | "source": [ 1765 | "## One type in multiple tables" 1766 | ] 1767 | }, 1768 | { 1769 | "cell_type": "markdown", 1770 | "metadata": {}, 1771 | "source": [ 1772 | "### Baby Names in Illinois" 1773 | ] 1774 | }, 1775 | { 1776 | "cell_type": "code", 1777 | "execution_count": 5, 1778 | "metadata": { 1779 | "collapsed": false 1780 | }, 1781 | "outputs": [ 1782 | { 1783 | "data": { 1784 | "text/html": [ 1785 | "
\n", 1786 | "\n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | " \n", 1814 | " \n", 1815 | " \n", 1816 | " \n", 1817 | " \n", 1818 | " \n", 1819 | " \n", 1820 | " \n", 1821 | " \n", 1822 | " \n", 1823 | " \n", 1824 | " \n", 1825 | " \n", 1826 | " \n", 1827 | " \n", 1828 | " \n", 1829 | " \n", 1830 | " \n", 1831 | " \n", 1832 | " \n", 1833 | " \n", 1834 | " \n", 1835 | " \n", 1836 | " \n", 1837 | " \n", 1838 | " \n", 1839 | "
ranknamefrequencysexyear
01Noah837Male2014
12Alexander747Male2014
23William687Male2014
34Michael680Male2014
45Liam670Male2014
\n", 1840 | "
" 1841 | ], 1842 | "text/plain": [ 1843 | " rank name frequency sex year\n", 1844 | "0 1 Noah 837 Male 2014\n", 1845 | "1 2 Alexander 747 Male 2014\n", 1846 | "2 3 William 687 Male 2014\n", 1847 | "3 4 Michael 680 Male 2014\n", 1848 | "4 5 Liam 670 Male 2014" 1849 | ] 1850 | }, 1851 | "execution_count": 5, 1852 | "metadata": {}, 1853 | "output_type": "execute_result" 1854 | } 1855 | ], 1856 | "source": [ 1857 | "def extract_year(string):\n", 1858 | " match = re.match(\".+(\\d{4})\", string) \n", 1859 | " if match != None: return match.group(1)\n", 1860 | " \n", 1861 | "path = './data'\n", 1862 | "allFiles = glob.glob(path + \"/201*-baby-names-illinois.csv\")\n", 1863 | "frame = pd.DataFrame()\n", 1864 | "df_list= []\n", 1865 | "for file_ in allFiles:\n", 1866 | " df = pd.read_csv(file_,index_col=None, header=0)\n", 1867 | " df.columns = map(str.lower, df.columns)\n", 1868 | " df[\"year\"] = extract_year(file_)\n", 1869 | " df_list.append(df)\n", 1870 | " \n", 1871 | "df = pd.concat(df_list)\n", 1872 | "df.head(5)" 1873 | ] 1874 | } 1875 | ], 1876 | "metadata": { 1877 | "kernelspec": { 1878 | "display_name": "Python 2", 1879 | "language": "python", 1880 | "name": "python2" 1881 | }, 1882 | "language_info": { 1883 | "codemirror_mode": { 1884 | "name": "ipython", 1885 | "version": 2 1886 | }, 1887 | "file_extension": ".py", 1888 | "mimetype": "text/x-python", 1889 | "name": "python", 1890 | "nbconvert_exporter": "python", 1891 | "pygments_lexer": "ipython2", 1892 | "version": "2.7.11" 1893 | } 1894 | }, 1895 | "nbformat": 4, 1896 | "nbformat_minor": 1 1897 | } 1898 | -------------------------------------------------------------------------------- /data/2014-baby-names-illinois.csv: -------------------------------------------------------------------------------- 1 | rank,name,frequency,sex 2 | 1,Noah,837,Male 3 | 2,Alexander,747,Male 4 | 3,William,687,Male 5 | 4,Michael,680,Male 6 | 5,Liam,670,Male 7 | 6,Jacob,654,Male 8 | 7,Benjamin,649,Male 9 | 8,Mason,604,Male 10 | 
9,Daniel,593,Male 11 | 10,Logan,593,Male 12 | 11,Ethan,579,Male 13 | 12,Anthony,564,Male 14 | 13,Aiden,535,Male 15 | 14,Jayden,530,Male 16 | 15,Joseph,498,Male 17 | 16,James,486,Male 18 | 17,Lucas,481,Male 19 | 18,Henry,477,Male 20 | 19,Jackson,469,Male 21 | 20,David,467,Male 22 | 21,Nathan,446,Male 23 | 22,Elijah,445,Male 24 | 23,Matthew,436,Male 25 | 24,Andrew,433,Male 26 | 25,John,432,Male 27 | 26,Isaac,419,Male 28 | 27,Dylan,417,Male 29 | 28,Jack,411,Male 30 | 29,Joshua,411,Male 31 | 30,Owen,411,Male 32 | 31,Julian,407,Male 33 | 32,Gabriel,399,Male 34 | 33,Ryan,399,Male 35 | 34,Oliver,396,Male 36 | 35,Carter,393,Male 37 | 36,Sebastian,380,Male 38 | 37,Charles,377,Male 39 | 38,Luke,371,Male 40 | 39,Jonathan,364,Male 41 | 40,Samuel,353,Male 42 | 41,Christopher,340,Male 43 | 42,Evan,326,Male 44 | 43,Connor,314,Male 45 | 44,Caleb,313,Male 46 | 45,Christian,308,Male 47 | 46,Thomas,306,Male 48 | 47,Nicholas,300,Male 49 | 48,Wyatt,300,Male 50 | 49,Hunter,298,Male 51 | 50,Adrian,288,Male 52 | 51,Angel,285,Male 53 | 52,Cameron,278,Male 54 | 53,Aaron,274,Male 55 | 54,Landon,266,Male 56 | 55,Nolan,266,Male 57 | 56,Jordan,263,Male 58 | 57,Kevin,261,Male 59 | 58,Gavin,258,Male 60 | 59,Adam,249,Male 61 | 60,Brandon,247,Male 62 | 61,Eli,237,Male 63 | 62,Parker,235,Male 64 | 63,Isaiah,234,Male 65 | 64,Jaxon,232,Male 66 | 65,Levi,232,Male 67 | 66,Tyler,227,Male 68 | 67,Dominic,224,Male 69 | 68,Josiah,224,Male 70 | 69,Jeremiah,222,Male 71 | 70,Austin,220,Male 72 | 71,Robert,210,Male 73 | 72,Cooper,204,Male 74 | 73,Leonardo,203,Male 75 | 74,Ian,202,Male 76 | 75,Blake,200,Male 77 | 76,Brayden,200,Male 78 | 77,Camden,200,Male 79 | 78,Zachary,198,Male 80 | 79,Damian,197,Male 81 | 80,Jace,196,Male 82 | 81,Vincent,196,Male 83 | 82,Ayden,192,Male 84 | 83,Leo,191,Male 85 | 84,Chase,189,Male 86 | 85,Colton,188,Male 87 | 86,Grayson,186,Male 88 | 87,Lincoln,186,Male 89 | 88,Mateo,185,Male 90 | 89,Jose,182,Male 91 | 90,Maxwell,181,Male 92 | 91,Giovanni,178,Male 93 | 92,Jason,177,Male 94 | 
93,Kayden,175,Male 95 | 94,Nathaniel,175,Male 96 | 95,Miles,174,Male 97 | 96,Patrick,173,Male 98 | 97,Max,170,Male 99 | 98,Brody,168,Male 100 | 99,Jaxson,168,Male 101 | 100,George,166,Male 102 | 101,Theodore,166,Male -------------------------------------------------------------------------------- /data/2015-baby-names-illinois.csv: -------------------------------------------------------------------------------- 1 | rank,name,frequency,sex 2 | 1,Noah,863,Male 3 | 2,Liam,709,Male 4 | 3,Alexander,703,Male 5 | 4,Jacob,650,Male 6 | 5,William,618,Male 7 | 6,Michael,617,Male 8 | 7,Benjamin,616,Male 9 | 8,Daniel,601,Male 10 | 9,Mason,594,Male 11 | 10,James,576,Male 12 | 11,Logan,568,Male 13 | 12,Ethan,560,Male 14 | 13,Aiden,547,Male 15 | 14,Anthony,524,Male 16 | 15,Henry,514,Male 17 | 16,Oliver,502,Male 18 | 17,Jayden,480,Male 19 | 18,Lucas,471,Male 20 | 19,Matthew,449,Male 21 | 20,Jackson,447,Male 22 | 21,Owen,446,Male 23 | 22,Sebastian,433,Male 24 | 23,Carter,429,Male 25 | 24,Joseph,427,Male 26 | 25,Isaac,421,Male 27 | 26,Elijah,413,Male 28 | 27,John,412,Male 29 | 28,Dylan,403,Male 30 | 29,David,402,Male 31 | 30,Julian,397,Male 32 | 31,Jack,384,Male 33 | 32,Nathan,382,Male 34 | 33,Samuel,379,Male 35 | 34,Andrew,377,Male 36 | 35,Gabriel,375,Male 37 | 36,Joshua,363,Male 38 | 37,Christopher,349,Male 39 | 38,Ryan,335,Male 40 | 39,Caleb,331,Male 41 | 40,Jonathan,328,Male 42 | 41,Charles,320,Male 43 | 42,Luke,308,Male 44 | 43,Wyatt,301,Male 45 | 44,Christian,299,Male 46 | 45,Thomas,299,Male 47 | 46,Dominic,286,Male 48 | 47,Cameron,280,Male 49 | 48,Adrian,279,Male 50 | 49,Nolan,279,Male 51 | 50,Angel,278,Male 52 | 51,Nicholas,274,Male 53 | 52,Connor,269,Male 54 | 53,Levi,268,Male 55 | 54,Hunter,267,Male 56 | 55,Landon,265,Male 57 | 56,Mateo,262,Male 58 | 57,Aaron,260,Male 59 | 58,Grayson,258,Male 60 | 59,Adam,256,Male 61 | 60,Isaiah,254,Male 62 | 61,Jordan,250,Male 63 | 62,Evan,249,Male 64 | 63,Leonardo,246,Male 65 | 64,Leo,244,Male 66 | 65,Jaxon,241,Male 67 | 66,Gavin,236,Male 
68 | 67,Josiah,235,Male 69 | 68,Eli,223,Male 70 | 69,Theodore,220,Male 71 | 70,Lincoln,218,Male 72 | 71,Brandon,217,Male 73 | 72,Tyler,215,Male 74 | 73,Brayden,208,Male 75 | 74,Austin,205,Male 76 | 75,Robert,205,Male 77 | 76,Emmett,204,Male 78 | 77,Parker,204,Male 79 | 78,Jeremiah,200,Male 80 | 79,Kevin,200,Male 81 | 80,Colton,199,Male 82 | 81,Ian,199,Male 83 | 82,Vincent,198,Male 84 | 83,Zachary,197,Male 85 | 84,Chase,193,Male 86 | 85,Ayden,187,Male 87 | 86,Cooper,186,Male 88 | 87,Easton,183,Male 89 | 88,Declan,182,Male 90 | 89,Jaxson,180,Male 91 | 90,Xavier,175,Male 92 | 91,Jace,172,Male 93 | 92,Damian,171,Male 94 | 93,Jose,171,Male 95 | 94,Kayden,171,Male 96 | 95,Patrick,169,Male 97 | 96,Giovanni,168,Male 98 | 97,Hudson,167,Male 99 | 98,Camden,165,Male 100 | 99,Max,164,Male 101 | 100,Maxwell,155,Male -------------------------------------------------------------------------------- /data/billboard.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nickhould/tidy-data-python/2ee13434796f7ef2a0674870c651b1ca1e1b9597/data/billboard.csv -------------------------------------------------------------------------------- /data/pew-raw.csv: -------------------------------------------------------------------------------- 1 | religion, <$10k, $10-20k,$20-30k,$30-40k, $40-50k,$50-75k 2 | Agnostic,27,34,60,81,76,137 3 | Atheist,12,27,37,52,35,70 4 | Buddhist,27,21,30,34,33,58 5 | Catholic,418,617,732,670,638,1116 6 | Dont know/refused,15,14,15,11,10,35 7 | Evangelical Prot ,575,869,1064,982,881,1486 8 | Hindu ,1,9,7,9,11,34 9 | Historically Black Prot ,228,244,236,238,197,223 10 | Jehovahs Witness ,20,27,24,24,21,30 11 | Jewish ,19,19,25,25,30,95 -------------------------------------------------------------------------------- /data/tb-raw.csv: -------------------------------------------------------------------------------- 1 | country,year,m014,m1524,m2534,m3544,m4554,m5564,m65,mu,f014 2 | 
AD,2000,0,0,1,0,0,0,0,, 3 | AE,2000,2,4,4,6,5,12,10,,3 4 | AF,2000,52,228,183,149,129,94,80,,93 5 | AG,2000,0,0,0,0,0,0,1,,1 6 | AL,2000,2,19,21,14,24,19,16,,3 7 | AM,2000,2,152,130,131,63,26,21,,1 8 | AN,2000,0,0,1,2,0,0,0,,0 9 | AO,2000,186,999,1003,912,482,312,194,,247 10 | AR,2000,97,278,594,402,419,368,330,,121 11 | AS,2000,,,,,1,1,,, -------------------------------------------------------------------------------- /data/weather-raw.csv: -------------------------------------------------------------------------------- 1 | id,year,month,element,d1,d2,d3,d4,d5,d6,d7,d8 2 | MX17004,2010,1,tmax,,,,,,,, 3 | MX17004,2010,1,tmin,,,,,,,, 4 | MX17004 ,2010,2,tmax,,27.3,24.1,,,,, 5 | MX17004,2010,2,tmin,,14.4,14.4,,,,, 6 | MX17004,2010,3,tmax,,,,,32.1,,, 7 | MX17004,2010,3,tmin,,,,,14.2,,, 8 | MX17004,2010,4,tmax,,,,,,,, 9 | MX17004,2010,4,tmin,,,,,,,, 10 | MX17004,2010,5,tmax,,,,,,,, 11 | MX17004,2010,5,tmin,,,,,,,, --------------------------------------------------------------------------------