├── .DS_Store └── CRS_Q_0041203_SSC_WSQ_Data_Analytics_and_Visualization_with_Python_Facilitator_Guide.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tertiarycourses/Replace-PythonDataTraining/4ac3b4c3f86f5c46e991a1f038c54d681445bbe5/.DS_Store -------------------------------------------------------------------------------- /CRS_Q_0041203_SSC_WSQ_Data_Analytics_and_Visualization_with_Python_Facilitator_Guide.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.6.5" 21 | }, 22 | "colab": { 23 | "name": "CRS-Q-0041203-SSC - WSQ - Data Analytics and Visualization with Python Facilitator Guide.ipynb", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "DuWuy79LtA4U", 32 | "colab_type": "text" 33 | }, 34 | "source": [ 35 | "# WSQ - Data Analytics and Visualization with Python Facilitator Guide" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "id": "7ezBm7D7gEZc", 42 | "colab_type": "text" 43 | }, 44 | "source": [ 45 | "# Topic 1 Data Preparation\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "zhIjWR40gVcA", 52 | "colab_type": "text" 53 | }, 54 | "source": [ 55 | "## Install Python Data Analysis Packages" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "hzBEFiJ6gEZd", 62 | "colab_type": "code", 63 | "colab": {} 64 | }, 65 | "source": [ 66 | "# 
These packages have been pre-installed on Google Colab\n", 67 | "!pip install numpy\n", 68 | "!pip install matplotlib\n", 69 | "!pip install seaborn\n", 70 | "!pip install pandas\n", 71 | "!pip install scipy\n", 72 | "!pip install sklearn" 73 | ], 74 | "execution_count": null, 75 | "outputs": [] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "id": "H5rpHv_FjI70", 81 | "colab_type": "code", 82 | "colab": {} 83 | }, 84 | "source": [ 85 | "import numpy as np\n", 86 | "import matplotlib.pyplot as plt\n", 87 | "import seaborn as sb\n", 88 | "import pandas as pd\n", 89 | "import sklearn" 90 | ], 91 | "execution_count": null, 92 | "outputs": [] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "id": "7V8iF4swgZzg", 98 | "colab_type": "text" 99 | }, 100 | "source": [ 101 | "## Series" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "metadata": { 107 | "id": "MCUfMrKAgbVl", 108 | "colab_type": "code", 109 | "colab": {} 110 | }, 111 | "source": [ 112 | "data = np.array([10,20,30,40])\n", 113 | "s = pd.Series(data,index=['2011','2012','2013','2014'])\n", 114 | "s" 115 | ], 116 | "execution_count": null, 117 | "outputs": [] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "metadata": { 122 | "id": "PzEtOBhx7yDa", 123 | "colab_type": "code", 124 | "colab": {} 125 | }, 126 | "source": [ 127 | "data = {'2011':40,'2012':30,'2013':20,'2014':10}\n", 128 | "s = pd.Series(data)\n", 129 | "s" 130 | ], 131 | "execution_count": null, 132 | "outputs": [] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "metadata": { 137 | "id": "-ujT03Mrip11", 138 | "colab_type": "code", 139 | "colab": {} 140 | }, 141 | "source": [ 142 | "s['2012']" 143 | ], 144 | "execution_count": null, 145 | "outputs": [] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "id": "6VI4s6KugEa1", 151 | "colab_type": "text" 152 | }, 153 | "source": [ 154 | "## DataFrame" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | 
"id": "3FAv-kkIgEa1", 161 | "colab_type": "code", 162 | "colab": {} 163 | }, 164 | "source": [ 165 | "data = {\n", 166 | " 'Name' : [\"Ally\",\"Belinda\",\"Jane\",\"Steve\"],\n", 167 | " 'Height' : [160,165,155,180],\n", 168 | " 'Gender' : ['F','F','F','M']\n", 169 | "}" 170 | ], 171 | "execution_count": null, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "metadata": { 177 | "id": "jCGH8LoCgEa3", 178 | "colab_type": "code", 179 | "colab": {} 180 | }, 181 | "source": [ 182 | "df = pd.DataFrame(data)\n", 183 | "df" 184 | ], 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "id": "tbM-3JwHgEa5", 192 | "colab_type": "text" 193 | }, 194 | "source": [ 195 | "## Import and Export Data" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "metadata": { 201 | "id": "d6n0myPugEa5", 202 | "colab_type": "code", 203 | "colab": {} 204 | }, 205 | "source": [ 206 | "# Import CSV file\n", 207 | "mtcars = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/mtcars.csv')\n", 208 | "mtcars" 209 | ], 210 | "execution_count": null, 211 | "outputs": [] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "metadata": { 216 | "id": "EIEw3MzzrCtk", 217 | "colab_type": "code", 218 | "colab": {} 219 | }, 220 | "source": [ 221 | "mtcars_sample = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/mtcars.csv',\n", 222 | " index_col='car_names',\n", 223 | " usecols=['car_names','mpg','hp','cyl','am'])\n", 224 | "mtcars_sample" 225 | ], 226 | "execution_count": null, 227 | "outputs": [] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "metadata": { 232 | "id": "tHpotVaX9p0y", 233 | "colab_type": "code", 234 | "colab": {} 235 | }, 236 | "source": [ 237 | "mtcars_sample.to_csv('cars_sample.csv')" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "zZZ3obNlH9fa", 
246 | "colab_type": "code", 247 | "colab": {} 248 | }, 249 | "source": [ 250 | "mtcars_sample.to_excel('cars_sample.xlsx', sheet_name='cars', index=False)" 251 | ], 252 | "execution_count": null, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "metadata": { 258 | "id": "F9ty3iKjIFGX", 259 | "colab_type": "code", 260 | "colab": {} 261 | }, 262 | "source": [ 263 | "mtcars_sample2 = pd.read_excel('cars_sample.xlsx', sheet_name='cars')" 264 | ], 265 | "execution_count": null, 266 | "outputs": [] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": { 271 | "id": "krqD-p7dgEa8", 272 | "colab_type": "text" 273 | }, 274 | "source": [ 275 | "## Dataframe Attributes" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "metadata": { 281 | "id": "L2qOLrdELSa0", 282 | "colab_type": "code", 283 | "colab": {} 284 | }, 285 | "source": [ 286 | "mtcars.info()" 287 | ], 288 | "execution_count": null, 289 | "outputs": [] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "metadata": { 294 | "id": "zdcvpThcqwmn", 295 | "colab_type": "code", 296 | "colab": {} 297 | }, 298 | "source": [ 299 | "mtcars_sample.shape" 300 | ], 301 | "execution_count": null, 302 | "outputs": [] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "metadata": { 307 | "id": "bMPIfw_Pq5eX", 308 | "colab_type": "code", 309 | "colab": {} 310 | }, 311 | "source": [ 312 | "mtcars_sample.columns" 313 | ], 314 | "execution_count": null, 315 | "outputs": [] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "metadata": { 320 | "id": "IN7zf4lIsXwP", 321 | "colab_type": "code", 322 | "colab": {} 323 | }, 324 | "source": [ 325 | "mtcars_sample.index" 326 | ], 327 | "execution_count": null, 328 | "outputs": [] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "metadata": { 333 | "id": "IvcD2AFjgEa8", 334 | "colab_type": "code", 335 | "colab": {} 336 | }, 337 | "source": [ 338 | "mtcars_sample['mpg'].values" 339 | ], 340 | "execution_count": null, 341 | "outputs": [] 342 | }, 343 | { 344 
| "cell_type": "markdown", 345 | "metadata": { 346 | "id": "YSLwqC1QgEa-", 347 | "colab_type": "text" 348 | }, 349 | "source": [ 350 | "## Activity: Import and Export Data" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "metadata": { 356 | "id": "aps7LQg04tvr", 357 | "colab_type": "code", 358 | "colab": {} 359 | }, 360 | "source": [ 361 | "health_expenditure = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/government-health-expenditure.csv',)\n", 362 | "health_expenditure" 363 | ], 364 | "execution_count": null, 365 | "outputs": [] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "metadata": { 370 | "id": "DrftyDLSgEa-", 371 | "colab_type": "code", 372 | "colab": {} 373 | }, 374 | "source": [ 375 | "health_expenditure_sample = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/government-health-expenditure.csv',\n", 376 | " index_col='financial_year',\n", 377 | " usecols=['financial_year','operating_expenditure','development_expenditure','government_health_expenditure'])\n", 378 | "health_expenditure_sample" 379 | ], 380 | "execution_count": null, 381 | "outputs": [] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "metadata": { 386 | "id": "UOl59bTDK4d3", 387 | "colab_type": "code", 388 | "colab": {} 389 | }, 390 | "source": [ 391 | "health_expenditure_sample.to_csv('Heath_Expenditure.csv')" 392 | ], 393 | "execution_count": null, 394 | "outputs": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "metadata": { 399 | "id": "7A2djumzgEbB", 400 | "colab_type": "code", 401 | "colab": {} 402 | }, 403 | "source": [ 404 | "" 405 | ], 406 | "execution_count": null, 407 | "outputs": [] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": { 412 | "id": "PyQthkxrgEbE", 413 | "colab_type": "text" 414 | }, 415 | "source": [ 416 | "## Selecting Column" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "metadata": { 422 | "id": "WZdePh6lMb-G", 423 | "colab_type": "code", 424 | 
"colab": {} 425 | }, 426 | "source": [ 427 | "mtcars_sample['mpg']" 428 | ], 429 | "execution_count": null, 430 | "outputs": [] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "metadata": { 435 | "id": "sa0IRnuGgEbF", 436 | "colab_type": "code", 437 | "colab": {} 438 | }, 439 | "source": [ 440 | "mtcars_sample[['mpg','cyl']]" 441 | ], 442 | "execution_count": null, 443 | "outputs": [] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "metadata": { 448 | "id": "EGiaSo1cgEbG", 449 | "colab_type": "code", 450 | "colab": {} 451 | }, 452 | "source": [ 453 | "mtcars_sample.mpg" 454 | ], 455 | "execution_count": null, 456 | "outputs": [] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "metadata": { 461 | "id": "EjTIs0jOgEbJ", 462 | "colab_type": "code", 463 | "colab": {} 464 | }, 465 | "source": [ 466 | "mtcars_sample[['mpg','cyl']]" 467 | ], 468 | "execution_count": null, 469 | "outputs": [] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": { 474 | "id": "7E1s525XgEbL", 475 | "colab_type": "text" 476 | }, 477 | "source": [ 478 | "## Selecting Row" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "metadata": { 484 | "id": "BouqI6IUgEbN", 485 | "colab_type": "code", 486 | "colab": {} 487 | }, 488 | "source": [ 489 | "mtcars_sample.loc['Fiat 128']" 490 | ], 491 | "execution_count": null, 492 | "outputs": [] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "metadata": { 497 | "id": "RW8axahyD2K3", 498 | "colab_type": "code", 499 | "colab": {} 500 | }, 501 | "source": [ 502 | "mtcars_sample.loc[['Fiat 128','Lotus Europa']]" 503 | ], 504 | "execution_count": null, 505 | "outputs": [] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "metadata": { 510 | "id": "thAsYbKDgEbL", 511 | "colab_type": "code", 512 | "colab": {} 513 | }, 514 | "source": [ 515 | "mtcars_sample.iloc[3]" 516 | ], 517 | "execution_count": null, 518 | "outputs": [] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "metadata": { 523 | "id": "dydlYm0iEEbc", 524 | "colab_type": 
"code", 525 | "colab": {} 526 | }, 527 | "source": [ 528 | "mtcars_sample.iloc[[3,5]]" 529 | ], 530 | "execution_count": null, 531 | "outputs": [] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": { 536 | "id": "2STPI6M1F21F", 537 | "colab_type": "text" 538 | }, 539 | "source": [ 540 | "## Slicing Data" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "metadata": { 546 | "id": "_MfVAimAE7sU", 547 | "colab_type": "code", 548 | "colab": {} 549 | }, 550 | "source": [ 551 | "mtcars_sample.iloc[3:6]" 552 | ], 553 | "execution_count": null, 554 | "outputs": [] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "metadata": { 559 | "id": "_6Qg4LcfFDpp", 560 | "colab_type": "code", 561 | "colab": {} 562 | }, 563 | "source": [ 564 | "mtcars_sample.iloc[:3]" 565 | ], 566 | "execution_count": null, 567 | "outputs": [] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "metadata": { 572 | "id": "twPmOSNSZ6ah", 573 | "colab_type": "code", 574 | "colab": {} 575 | }, 576 | "source": [ 577 | "mtcars_sample.iloc[:,:3]" 578 | ], 579 | "execution_count": null, 580 | "outputs": [] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": { 585 | "id": "ktUTspoOgEbR", 586 | "colab_type": "text" 587 | }, 588 | "source": [ 589 | "## Activity: Selecting and Slicing Data" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "metadata": { 595 | "id": "TaiPid8ogEbR", 596 | "colab_type": "code", 597 | "colab": {} 598 | }, 599 | "source": [ 600 | "health_expenditure_sample = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/government-health-expenditure.csv',\n", 601 | " index_col='financial_year',\n", 602 | " usecols=['financial_year','operating_expenditure','development_expenditure','government_health_expenditure'])\n", 603 | "health_expenditure_sample" 604 | ], 605 | "execution_count": null, 606 | "outputs": [] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "metadata": { 611 | "id": "eL5jssLACoKe", 612 | "colab_type": 
"code", 613 | "colab": {} 614 | }, 615 | "source": [ 616 | "health_expenditure_sample.info()" 617 | ], 618 | "execution_count": null, 619 | "outputs": [] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "metadata": { 624 | "id": "uKzLvpGYOiS2", 625 | "colab_type": "code", 626 | "colab": {} 627 | }, 628 | "source": [ 629 | "health_expenditure_sample[['operating_expenditure','development_expenditure']]" 630 | ], 631 | "execution_count": null, 632 | "outputs": [] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "metadata": { 637 | "id": "WGfo4H5GCS_J", 638 | "colab_type": "code", 639 | "colab": {} 640 | }, 641 | "source": [ 642 | "health_expenditure_sample[['operating_expenditure','development_expenditure']].loc[[2016,2017]]" 643 | ], 644 | "execution_count": null, 645 | "outputs": [] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "metadata": { 650 | "id": "N4NJjk3vC-z7", 651 | "colab_type": "code", 652 | "colab": {} 653 | }, 654 | "source": [ 655 | "health_expenditure_sample[['operating_expenditure','development_expenditure']].iloc[3:8]" 656 | ], 657 | "execution_count": null, 658 | "outputs": [] 659 | }, 660 | { 661 | "cell_type": "markdown", 662 | "metadata": { 663 | "id": "QPtyo6FwgEbf", 664 | "colab_type": "text" 665 | }, 666 | "source": [ 667 | "## Filtering Data" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "metadata": { 673 | "id": "Q-VD2kuPgEbg", 674 | "colab_type": "code", 675 | "colab": {} 676 | }, 677 | "source": [ 678 | "mtcars_sample[mtcars_sample['cyl']>4]" 679 | ], 680 | "execution_count": null, 681 | "outputs": [] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "metadata": { 686 | "id": "v3eUxvoDahFJ", 687 | "colab_type": "code", 688 | "colab": {} 689 | }, 690 | "source": [ 691 | "mtcars_sample[(mtcars_sample[\"mpg\"] > 20) | (mtcars_sample[\"cyl\"] < 6)]" 692 | ], 693 | "execution_count": null, 694 | "outputs": [] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "metadata": { 699 | "id": "rX5fxXOia5Ye", 700 | "colab_type": 
"code", 701 | "colab": {} 702 | }, 703 | "source": [ 704 | "mtcars_sample[mtcars_sample[\"am\"] == 1]" 705 | ], 706 | "execution_count": null, 707 | "outputs": [] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "metadata": { 712 | "id": "UV95VleWbG37", 713 | "colab_type": "code", 714 | "colab": {} 715 | }, 716 | "source": [ 717 | "mtcars_sample.loc[[\"Mazda RX4\", \"Fiat 128\"], :]" 718 | ], 719 | "execution_count": null, 720 | "outputs": [] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "metadata": { 725 | "id": "zadX14g2gEbh", 726 | "colab_type": "code", 727 | "colab": {} 728 | }, 729 | "source": [ 730 | "mtcars_sample[mtcars_sample['cyl'].isin([6,])]" 731 | ], 732 | "execution_count": null, 733 | "outputs": [] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "metadata": { 738 | "id": "8f8Ms-FxgEbh", 739 | "colab_type": "text" 740 | }, 741 | "source": [ 742 | "## Activity: Filtering Data" 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "metadata": { 748 | "id": "VHoW5H6lgEbi", 749 | "colab_type": "code", 750 | "colab": {} 751 | }, 752 | "source": [ 753 | "health_expenditure_sample[health_expenditure_sample.operating_expenditure>5000]" 754 | ], 755 | "execution_count": null, 756 | "outputs": [] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "metadata": { 761 | "id": "byhZzaRFgEbj", 762 | "colab_type": "code", 763 | "colab": {} 764 | }, 765 | "source": [ 766 | "health_expenditure_sample[(health_expenditure_sample.operating_expenditure>5000) & (health_expenditure_sample.operating_expenditure<8000) ]" 767 | ], 768 | "execution_count": null, 769 | "outputs": [] 770 | }, 771 | { 772 | "cell_type": "markdown", 773 | "metadata": { 774 | "id": "9ZdSa2HugEbk", 775 | "colab_type": "text" 776 | }, 777 | "source": [ 778 | "## Data Cleaning" 779 | ] 780 | }, 781 | { 782 | "cell_type": "markdown", 783 | "metadata": { 784 | "id": "FJZ3Z8_OgEbk", 785 | "colab_type": "text" 786 | }, 787 | "source": [ 788 | "### Missing Data" 789 | ] 790 | }, 791 | { 792 | 
"cell_type": "code", 793 | "metadata": { 794 | "id": "Y2Dh8Y4rgEbk", 795 | "colab_type": "code", 796 | "colab": {} 797 | }, 798 | "source": [ 799 | "df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',\n", 800 | "'h'],columns=['one', 'two', 'three'])\n", 801 | "\n", 802 | "df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])\n", 803 | "df" 804 | ], 805 | "execution_count": null, 806 | "outputs": [] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "metadata": { 811 | "id": "mMTZlOAfgEbl", 812 | "colab_type": "code", 813 | "colab": {} 814 | }, 815 | "source": [ 816 | "df['one'].isnull()" 817 | ], 818 | "execution_count": null, 819 | "outputs": [] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "metadata": { 824 | "id": "r5XLl3rtgEbm", 825 | "colab_type": "code", 826 | "colab": {} 827 | }, 828 | "source": [ 829 | " df.dropna()" 830 | ], 831 | "execution_count": null, 832 | "outputs": [] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "metadata": { 837 | "id": "jFbbUARXVYZX", 838 | "colab_type": "code", 839 | "colab": {} 840 | }, 841 | "source": [ 842 | " df.fillna(0)\n" 843 | ], 844 | "execution_count": null, 845 | "outputs": [] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "metadata": { 850 | "id": "AqoJ4N9UYDAE", 851 | "colab_type": "code", 852 | "colab": {} 853 | }, 854 | "source": [ 855 | "df.fillna(method='pad')" 856 | ], 857 | "execution_count": null, 858 | "outputs": [] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "metadata": { 863 | "id": "_W74ecfugEbp", 864 | "colab_type": "code", 865 | "colab": {} 866 | }, 867 | "source": [ 868 | "df.fillna(method='backfill')" 869 | ], 870 | "execution_count": null, 871 | "outputs": [] 872 | }, 873 | { 874 | "cell_type": "markdown", 875 | "metadata": { 876 | "id": "VKfI8-4G50wi", 877 | "colab_type": "text" 878 | }, 879 | "source": [ 880 | "## Activity: Missing Data" 881 | ] 882 | }, 883 | { 884 | "cell_type": "code", 885 | "metadata": { 886 | "id": "vVYwauI_1rg1", 887 | "colab_type": "code", 
888 | "colab": {} 889 | }, 890 | "source": [ 891 | "health_expenditure = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/hospital-admissions-by-sector-annual.csv')\n", 892 | "health_expenditure" 893 | ], 894 | "execution_count": null, 895 | "outputs": [] 896 | }, 897 | { 898 | "cell_type": "code", 899 | "metadata": { 900 | "id": "kho24nHc2v0g", 901 | "colab_type": "code", 902 | "colab": {} 903 | }, 904 | "source": [ 905 | "health_expenditure_NaN = health_expenditure.replace('na',np.NaN)\n", 906 | "health_expenditure_NaN" 907 | ], 908 | "execution_count": null, 909 | "outputs": [] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "metadata": { 914 | "id": "gLC5Wgts2O_y", 915 | "colab_type": "code", 916 | "colab": {} 917 | }, 918 | "source": [ 919 | "#health_expenditure_na = health_expenditure.dropna()\n", 920 | "health_expenditure_NaN.isnull().values.sum()" 921 | ], 922 | "execution_count": null, 923 | "outputs": [] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "metadata": { 928 | "id": "hhjmqFam5aqv", 929 | "colab_type": "code", 930 | "colab": {} 931 | }, 932 | "source": [ 933 | "health_expenditure_dropna = health_expenditure_NaN.dropna()\n", 934 | "health_expenditure_dropna" 935 | ], 936 | "execution_count": null, 937 | "outputs": [] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "metadata": { 942 | "id": "nfj_820F5esX", 943 | "colab_type": "code", 944 | "colab": {} 945 | }, 946 | "source": [ 947 | "health_expenditure_dropna.isnull().values.sum()" 948 | ], 949 | "execution_count": null, 950 | "outputs": [] 951 | }, 952 | { 953 | "cell_type": "markdown", 954 | "metadata": { 955 | "id": "wY6zdzqxZ1vh", 956 | "colab_type": "text" 957 | }, 958 | "source": [ 959 | "# Topic 2 Data Transformation" 960 | ] 961 | }, 962 | { 963 | "cell_type": "markdown", 964 | "metadata": { 965 | "id": "qrR8YxldgEb0", 966 | "colab_type": "text" 967 | }, 968 | "source": [ 969 | "## Joining Data" 970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | 
"metadata": { 975 | "id": "0ZJK443QsjFx", 976 | "colab_type": "code", 977 | "colab": {} 978 | }, 979 | "source": [ 980 | "merc = [c for c in mtcars_sample.index if 'Merc' in c]\n", 981 | "merc_cars = mtcars_sample.loc[merc]\n", 982 | "merc_cars" 983 | ], 984 | "execution_count": null, 985 | "outputs": [] 986 | }, 987 | { 988 | "cell_type": "code", 989 | "metadata": { 990 | "id": "yI_1DbYSx_hg", 991 | "colab_type": "code", 992 | "colab": {} 993 | }, 994 | "source": [ 995 | "toyota = [c for c in mtcars_sample.index if 'Toyota' in c]\n", 996 | "toyota_cars = mtcars_sample.loc[toyota]\n", 997 | "toyota_cars" 998 | ], 999 | "execution_count": null, 1000 | "outputs": [] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "metadata": { 1005 | "id": "QmF_ETmdyuWQ", 1006 | "colab_type": "code", 1007 | "colab": {} 1008 | }, 1009 | "source": [ 1010 | "merc_toyota_cars = pd.concat([merc_cars, toyota_cars], axis=0)\n", 1011 | "merc_toyota_cars" 1012 | ], 1013 | "execution_count": null, 1014 | "outputs": [] 1015 | }, 1016 | { 1017 | "cell_type": "code", 1018 | "metadata": { 1019 | "id": "SVpazv9n0bTu", 1020 | "colab_type": "code", 1021 | "colab": {} 1022 | }, 1023 | "source": [ 1024 | "print('Shape of the toyota_cars: ', toyota_cars.shape)\n", 1025 | "print('Shape of the merc_cars: ', merc_cars.shape)\n", 1026 | "print('Shape of the resulting merc_toyota_cars: ', merc_toyota_cars.shape)" 1027 | ], 1028 | "execution_count": null, 1029 | "outputs": [] 1030 | }, 1031 | { 1032 | "cell_type": "markdown", 1033 | "metadata": { 1034 | "id": "rp5jbizT1qjW", 1035 | "colab_type": "text" 1036 | }, 1037 | "source": [ 1038 | "## Activity: Joining Data" 1039 | ] 1040 | }, 1041 | { 1042 | "cell_type": "code", 1043 | "metadata": { 1044 | "id": "wlrIGwcDaRmU", 1045 | "colab_type": "code", 1046 | "colab": {} 1047 | }, 1048 | "source": [ 1049 | "air_quality_no2 = pd.read_csv(\"https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/air_quality_no2_long.csv\")\n", 1050 | 
"air_quality_no2 = air_quality_no2[[\"date.utc\", \"location\",\"parameter\", \"value\"]]\n", 1051 | "air_quality_no2.head()" 1052 | ], 1053 | "execution_count": null, 1054 | "outputs": [] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "metadata": { 1059 | "id": "Rd828ic8aoJF", 1060 | "colab_type": "code", 1061 | "colab": {} 1062 | }, 1063 | "source": [ 1064 | "air_quality_pm25 = pd.read_csv(\"https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/air_quality_pm25_long.csv\")\n", 1065 | "air_quality_pm25 = air_quality_pm25[[\"date.utc\", \"location\",\"parameter\", \"value\"]]\n", 1066 | "air_quality_pm25.head()" 1067 | ], 1068 | "execution_count": null, 1069 | "outputs": [] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "metadata": { 1074 | "id": "mkSioRJzcKYD", 1075 | "colab_type": "code", 1076 | "colab": {} 1077 | }, 1078 | "source": [ 1079 | "air_quality = pd.concat([air_quality_pm25, air_quality_no2], axis=0)\n", 1080 | "air_quality.head()" 1081 | ], 1082 | "execution_count": null, 1083 | "outputs": [] 1084 | }, 1085 | { 1086 | "cell_type": "code", 1087 | "metadata": { 1088 | "id": "VTPdBSFfc1eh", 1089 | "colab_type": "code", 1090 | "colab": {} 1091 | }, 1092 | "source": [ 1093 | "print('Shape of the `air_quality_pm25` table: ', air_quality_pm25.shape)\n", 1094 | "print('Shape of the `air_quality_no2` table: ', air_quality_no2.shape)\n", 1095 | "print('Shape of the resulting `air_quality` table: ', air_quality.shape)" 1096 | ], 1097 | "execution_count": null, 1098 | "outputs": [] 1099 | }, 1100 | { 1101 | "cell_type": "markdown", 1102 | "metadata": { 1103 | "id": "jz0HugqlgEb0", 1104 | "colab_type": "text" 1105 | }, 1106 | "source": [ 1107 | "### Appending Data" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "code", 1112 | "metadata": { 1113 | "id": "e0jr3wVpgEb9", 1114 | "colab_type": "code", 1115 | "colab": {} 1116 | }, 1117 | "source": [ 1118 | "toyota_merc_cars2= toyota_cars.append(merc_cars)\n", 1119 | "toyota_merc_cars2" 1120 
| ], 1121 | "execution_count": null, 1122 | "outputs": [] 1123 | }, 1124 | { 1125 | "cell_type": "markdown", 1126 | "metadata": { 1127 | "id": "1Qk0-GRygEcA", 1128 | "colab_type": "text" 1129 | }, 1130 | "source": [ 1131 | "## Merging Data" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "metadata": { 1137 | "id": "5xEp6QeJgEcB", 1138 | "colab_type": "code", 1139 | "colab": {} 1140 | }, 1141 | "source": [ 1142 | "left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],\n", 1143 | " 'A': ['A0', 'A1', 'A2', 'A3'],\n", 1144 | " 'B': ['B0', 'B1', 'B2', 'B3']})\n", 1145 | "left" 1146 | ], 1147 | "execution_count": null, 1148 | "outputs": [] 1149 | }, 1150 | { 1151 | "cell_type": "code", 1152 | "metadata": { 1153 | "id": "OzGGgFa96UDw", 1154 | "colab_type": "code", 1155 | "colab": {} 1156 | }, 1157 | "source": [ 1158 | "right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K4'],\n", 1159 | " 'C': ['C0', 'C1', 'C2', 'C4'],\n", 1160 | " 'D': ['D0', 'D1', 'D2', 'D4']})\n", 1161 | "right" 1162 | ], 1163 | "execution_count": null, 1164 | "outputs": [] 1165 | }, 1166 | { 1167 | "cell_type": "code", 1168 | "metadata": { 1169 | "id": "bWJefyWs6evU", 1170 | "colab_type": "code", 1171 | "colab": {} 1172 | }, 1173 | "source": [ 1174 | "result = pd.merge(left, right, on='key')\n", 1175 | "result" 1176 | ], 1177 | "execution_count": null, 1178 | "outputs": [] 1179 | }, 1180 | { 1181 | "cell_type": "code", 1182 | "metadata": { 1183 | "id": "Aqto-er66klo", 1184 | "colab_type": "code", 1185 | "colab": {} 1186 | }, 1187 | "source": [ 1188 | "result = pd.merge(left, right, on='key',how='inner')\n", 1189 | "result" 1190 | ], 1191 | "execution_count": null, 1192 | "outputs": [] 1193 | }, 1194 | { 1195 | "cell_type": "code", 1196 | "metadata": { 1197 | "id": "Krzonfx96omW", 1198 | "colab_type": "code", 1199 | "colab": {} 1200 | }, 1201 | "source": [ 1202 | "result = pd.merge(left, right, on='key',how='outer')\n", 1203 | "result" 1204 | ], 1205 | "execution_count": null, 1206 | 
"outputs": [] 1207 | }, 1208 | { 1209 | "cell_type": "code", 1210 | "metadata": { 1211 | "id": "tEKbui0M643F", 1212 | "colab_type": "code", 1213 | "colab": {} 1214 | }, 1215 | "source": [ 1216 | "result = pd.merge(left, right, on='key',how='left')\n", 1217 | "result" 1218 | ], 1219 | "execution_count": null, 1220 | "outputs": [] 1221 | }, 1222 | { 1223 | "cell_type": "code", 1224 | "metadata": { 1225 | "id": "iKPAD2Tj6_Rr", 1226 | "colab_type": "code", 1227 | "colab": {} 1228 | }, 1229 | "source": [ 1230 | "result = pd.merge(left, right, on='key',how='right')\n", 1231 | "result" 1232 | ], 1233 | "execution_count": null, 1234 | "outputs": [] 1235 | }, 1236 | { 1237 | "cell_type": "markdown", 1238 | "metadata": { 1239 | "id": "U0VtE3NrgEcD", 1240 | "colab_type": "text" 1241 | }, 1242 | "source": [ 1243 | "## Activity: Merging Data" 1244 | ] 1245 | }, 1246 | { 1247 | "cell_type": "code", 1248 | "metadata": { 1249 | "id": "kK8TMhXrgEcD", 1250 | "colab_type": "code", 1251 | "colab": {} 1252 | }, 1253 | "source": [ 1254 | "air_quality = pd.merge(air_quality_pm25, air_quality_no2, on='location')\n", 1255 | "air_quality.head()" 1256 | ], 1257 | "execution_count": null, 1258 | "outputs": [] 1259 | }, 1260 | { 1261 | "cell_type": "code", 1262 | "metadata": { 1263 | "id": "yyxVS2dkgEcE", 1264 | "colab_type": "code", 1265 | "colab": {} 1266 | }, 1267 | "source": [ 1268 | "air_quality = pd.merge(air_quality_pm25, air_quality_no2, on='date.utc')\n", 1269 | "air_quality.head()" 1270 | ], 1271 | "execution_count": null, 1272 | "outputs": [] 1273 | }, 1274 | { 1275 | "cell_type": "markdown", 1276 | "metadata": { 1277 | "id": "GGx1k5sfOw5G", 1278 | "colab_type": "text" 1279 | }, 1280 | "source": [ 1281 | "## Sorting Data" 1282 | ] 1283 | }, 1284 | { 1285 | "cell_type": "code", 1286 | "metadata": { 1287 | "id": "0yOl2ag3GYQN", 1288 | "colab_type": "code", 1289 | "colab": {} 1290 | }, 1291 | "source": [ 1292 | "mtcars_sample.sort_values(by=\"cyl\",ascending=False)" 1293 | ], 1294 | 
"execution_count": null, 1295 | "outputs": [] 1296 | }, 1297 | { 1298 | "cell_type": "markdown", 1299 | "metadata": { 1300 | "id": "WNDTF_GDgEcF", 1301 | "colab_type": "text" 1302 | }, 1303 | "source": [ 1304 | "## GroupBy" 1305 | ] 1306 | }, 1307 | { 1308 | "cell_type": "code", 1309 | "metadata": { 1310 | "id": "4uoSPWMkAvO2", 1311 | "colab_type": "code", 1312 | "colab": {} 1313 | }, 1314 | "source": [ 1315 | "mtcars_sample.groupby(['cyl']).mean()" 1316 | ], 1317 | "execution_count": null, 1318 | "outputs": [] 1319 | }, 1320 | { 1321 | "cell_type": "code", 1322 | "metadata": { 1323 | "id": "OSP0-2Wsc7IW", 1324 | "colab_type": "code", 1325 | "colab": {} 1326 | }, 1327 | "source": [ 1328 | "mtcars_sample.groupby('cyl').hp.mean()" 1329 | ], 1330 | "execution_count": null, 1331 | "outputs": [] 1332 | }, 1333 | { 1334 | "cell_type": "code", 1335 | "metadata": { 1336 | "id": "OEcoFLbfCMUo", 1337 | "colab_type": "code", 1338 | "colab": {} 1339 | }, 1340 | "source": [ 1341 | "mtcars_sample.groupby(['cyl']).sum()" 1342 | ], 1343 | "execution_count": null, 1344 | "outputs": [] 1345 | }, 1346 | { 1347 | "cell_type": "code", 1348 | "metadata": { 1349 | "id": "LmfbBIZBj3Ww", 1350 | "colab_type": "code", 1351 | "colab": {} 1352 | }, 1353 | "source": [ 1354 | "mtcars_sample.groupby(['cyl']).agg(['mean', 'count'])" 1355 | ], 1356 | "execution_count": null, 1357 | "outputs": [] 1358 | }, 1359 | { 1360 | "cell_type": "code", 1361 | "metadata": { 1362 | "id": "rnDXLTzPA0pj", 1363 | "colab_type": "code", 1364 | "colab": {} 1365 | }, 1366 | "source": [ 1367 | "mtcars_sample.groupby(['cyl','am']).mean()" 1368 | ], 1369 | "execution_count": null, 1370 | "outputs": [] 1371 | }, 1372 | { 1373 | "cell_type": "code", 1374 | "metadata": { 1375 | "id": "Epe1nDikjen3", 1376 | "colab_type": "code", 1377 | "colab": {} 1378 | }, 1379 | "source": [ 1380 | "mtcars_sample.groupby('cyl').agg(lambda x:max(x)-min(x))" 1381 | ], 1382 | "execution_count": null, 1383 | "outputs": [] 1384 | }, 1385 | { 
1386 | "cell_type": "code", 1387 | "metadata": { 1388 | "id": "E42xpjCdjqsN", 1389 | "colab_type": "code", 1390 | "colab": {} 1391 | }, 1392 | "source": [ 1393 | "mtcars_sample.groupby(['cyl', 'am']).agg(['mean', 'count'])" 1394 | ], 1395 | "execution_count": null, 1396 | "outputs": [] 1397 | }, 1398 | { 1399 | "cell_type": "markdown", 1400 | "metadata": { 1401 | "id": "I4EUqUAZK9kl", 1402 | "colab_type": "text" 1403 | }, 1404 | "source": [ 1405 | "## Activity: GroupBy" 1406 | ] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | "metadata": { 1411 | "id": "L2LIQrGveBxp", 1412 | "colab_type": "code", 1413 | "colab": {} 1414 | }, 1415 | "source": [ 1416 | "long_term_care = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/number-of-residential-long-term-care-facilities-sector-breakdown.csv',)\n", 1417 | "long_term_care.head()" 1418 | ], 1419 | "execution_count": null, 1420 | "outputs": [] 1421 | }, 1422 | { 1423 | "cell_type": "code", 1424 | "metadata": { 1425 | "id": "jHg_T5DleE45", 1426 | "colab_type": "code", 1427 | "colab": {} 1428 | }, 1429 | "source": [ 1430 | "long_term_care_by_sector = long_term_care.groupby(['year','sector']).sum()\n", 1431 | "long_term_care_by_sector" 1432 | ], 1433 | "execution_count": null, 1434 | "outputs": [] 1435 | }, 1436 | { 1437 | "cell_type": "markdown", 1438 | "metadata": { 1439 | "id": "QsuSAJvUIh6k", 1440 | "colab_type": "text" 1441 | }, 1442 | "source": [ 1443 | "## Pivoting Data" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "code", 1448 | "metadata": { 1449 | "id": "IknTVeXtMLtS", 1450 | "colab_type": "code", 1451 | "colab": {} 1452 | }, 1453 | "source": [ 1454 | "mtcars_sample.pivot(columns='cyl',values='hp')" 1455 | ], 1456 | "execution_count": null, 1457 | "outputs": [] 1458 | }, 1459 | { 1460 | "cell_type": "code", 1461 | "metadata": { 1462 | "id": "Kdq3LpuUSzvu", 1463 | "colab_type": "code", 1464 | "colab": {} 1465 | }, 1466 | "source": [ 1467 | 
"mtcars_sample.pivot(columns='cyl',values='hp').mean()" 1468 | ], 1469 | "execution_count": null, 1470 | "outputs": [] 1471 | }, 1472 | { 1473 | "cell_type": "code", 1474 | "metadata": { 1475 | "id": "wpLGpRN5OCiM", 1476 | "colab_type": "code", 1477 | "colab": {} 1478 | }, 1479 | "source": [ 1480 | "mtcars_sample.pivot_table(index='cyl',columns='am', values='hp',aggfunc='mean')" 1481 | ], 1482 | "execution_count": null, 1483 | "outputs": [] 1484 | }, 1485 | { 1486 | "cell_type": "markdown", 1487 | "metadata": { 1488 | "id": "FCCuTYPwTRWj", 1489 | "colab_type": "text" 1490 | }, 1491 | "source": [ 1492 | "## Activity: Pivot Table" 1493 | ] 1494 | }, 1495 | { 1496 | "cell_type": "code", 1497 | "metadata": { 1498 | "id": "epXSbRMyJb6r", 1499 | "colab_type": "code", 1500 | "colab": {} 1501 | }, 1502 | "source": [ 1503 | "long_term_care = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/number-of-residential-long-term-care-facilities-sector-breakdown.csv')\n", 1504 | "long_term_care" 1505 | ], 1506 | "execution_count": null, 1507 | "outputs": [] 1508 | }, 1509 | { 1510 | "cell_type": "code", 1511 | "metadata": { 1512 | "id": "H99uCIMpFhCS", 1513 | "colab_type": "code", 1514 | "colab": {} 1515 | }, 1516 | "source": [ 1517 | "long_term_care.pivot(columns='sector',values='count').sum()" 1518 | ], 1519 | "execution_count": null, 1520 | "outputs": [] 1521 | }, 1522 | { 1523 | "cell_type": "code", 1524 | "metadata": { 1525 | "id": "W5RwiZ0zWMUk", 1526 | "colab_type": "code", 1527 | "colab": {} 1528 | }, 1529 | "source": [ 1530 | "long_term_care.pivot_table(index=['year','sector'],values='count',aggfunc='sum')" 1531 | ], 1532 | "execution_count": null, 1533 | "outputs": [] 1534 | }, 1535 | { 1536 | "cell_type": "markdown", 1537 | "metadata": { 1538 | "id": "G1dOkMVjgEcR", 1539 | "colab_type": "text" 1540 | }, 1541 | "source": [ 1542 | "# Topic 3 Data Visualization \n" 1543 | ] 1544 | }, 1545 | { 1546 | "cell_type": "markdown", 1547 | "metadata": { 
1548 | "id": "Oo5-hh9wndTS", 1549 | "colab_type": "text" 1550 | }, 1551 | "source": [ 1552 | "## Plot DataFrame" 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "code", 1557 | "metadata": { 1558 | "id": "3zBuJvpXgEcR", 1559 | "colab_type": "code", 1560 | "colab": {} 1561 | }, 1562 | "source": [ 1563 | "mtcars_sample.plot()" 1564 | ], 1565 | "execution_count": null, 1566 | "outputs": [] 1567 | }, 1568 | { 1569 | "cell_type": "code", 1570 | "metadata": { 1571 | "id": "xPNmPhoVnCjq", 1572 | "colab_type": "code", 1573 | "colab": {} 1574 | }, 1575 | "source": [ 1576 | "mtcars_sample['hp'].plot()" 1577 | ], 1578 | "execution_count": null, 1579 | "outputs": [] 1580 | }, 1581 | { 1582 | "cell_type": "markdown", 1583 | "metadata": { 1584 | "id": "WqDqNUFSnUfB", 1585 | "colab_type": "text" 1586 | }, 1587 | "source": [ 1588 | "## Activity: Plot DataFrame" 1589 | ] 1590 | }, 1591 | { 1592 | "cell_type": "code", 1593 | "metadata": { 1594 | "id": "m7r7woxkgEcS", 1595 | "colab_type": "code", 1596 | "colab": {} 1597 | }, 1598 | "source": [ 1599 | "health_expenditure_sample.plot()" 1600 | ], 1601 | "execution_count": null, 1602 | "outputs": [] 1603 | }, 1604 | { 1605 | "cell_type": "markdown", 1606 | "metadata": { 1607 | "id": "2uCvKWxPoGSL", 1608 | "colab_type": "text" 1609 | }, 1610 | "source": [ 1611 | "## Scatter Plot" 1612 | ] 1613 | }, 1614 | { 1615 | "cell_type": "code", 1616 | "metadata": { 1617 | "id": "_ZT1rV-qgEcS", 1618 | "colab_type": "code", 1619 | "colab": {} 1620 | }, 1621 | "source": [ 1622 | "mtcars_sample.plot.scatter(x=\"hp\", y=\"mpg\",alpha=0.5)" 1623 | ], 1624 | "execution_count": null, 1625 | "outputs": [] 1626 | }, 1627 | { 1628 | "cell_type": "code", 1629 | "metadata": { 1630 | "id": "rV2lEpL1KkKP", 1631 | "colab_type": "code", 1632 | "colab": {} 1633 | }, 1634 | "source": [ 1635 | "mtcars_sample.plot.scatter(x=\"hp\", y=\"mpg\",alpha=0.5)\n", 1636 | "plt.xlabel(\"hp\")\n", 1637 | "plt.ylabel(\"mpg\")\n", 1638 | "plt.title(\"hp vs mpg\")" 1639 | ], 1640 | 
"execution_count": null, 1641 | "outputs": [] 1642 | }, 1643 | { 1644 | "cell_type": "markdown", 1645 | "metadata": { 1646 | "id": "35dULZLRpJrP", 1647 | "colab_type": "text" 1648 | }, 1649 | "source": [ 1650 | "## Bar Plot" 1651 | ] 1652 | }, 1653 | { 1654 | "cell_type": "code", 1655 | "metadata": { 1656 | "id": "v-ViXELtpLRQ", 1657 | "colab_type": "code", 1658 | "colab": {} 1659 | }, 1660 | "source": [ 1661 | "mtcars_sample.pivot(columns='cyl',values='hp').mean().plot.bar()" 1662 | ], 1663 | "execution_count": null, 1664 | "outputs": [] 1665 | }, 1666 | { 1667 | "cell_type": "code", 1668 | "metadata": { 1669 | "id": "EK3XwwN0gEcW", 1670 | "colab_type": "code", 1671 | "colab": {} 1672 | }, 1673 | "source": [ 1674 | "mtcars_sample.pivot(columns='cyl',values='hp').mean().plot.bar(color='red')" 1675 | ], 1676 | "execution_count": null, 1677 | "outputs": [] 1678 | }, 1679 | { 1680 | "cell_type": "code", 1681 | "metadata": { 1682 | "id": "0w_rHkBYJUmu", 1683 | "colab_type": "code", 1684 | "colab": {} 1685 | }, 1686 | "source": [ 1687 | "mtcars_cyl= mtcars_sample.pivot(columns='cyl',values='hp').mean().plot(kind='bar',color='red')" 1688 | ], 1689 | "execution_count": null, 1690 | "outputs": [] 1691 | }, 1692 | { 1693 | "cell_type": "code", 1694 | "metadata": { 1695 | "id": "J6F_VhO4gEcW", 1696 | "colab_type": "code", 1697 | "colab": {} 1698 | }, 1699 | "source": [ 1700 | "mtcars_cyl= mtcars_sample.pivot(columns='cyl',values='hp').mean().plot.barh()" 1701 | ], 1702 | "execution_count": null, 1703 | "outputs": [] 1704 | }, 1705 | { 1706 | "cell_type": "code", 1707 | "metadata": { 1708 | "id": "y10HcfVBnhkW", 1709 | "colab_type": "code", 1710 | "colab": {} 1711 | }, 1712 | "source": [ 1713 | "mtcars_sample.pivot_table(index='am',columns='cyl',values='mpg',aggfunc=['mean']).plot.bar()" 1714 | ], 1715 | "execution_count": null, 1716 | "outputs": [] 1717 | }, 1718 | { 1719 | "cell_type": "code", 1720 | "metadata": { 1721 | "id": "1r_krrMQ8flG", 1722 | "colab_type": "code", 
1723 | "colab": {} 1724 | }, 1725 | "source": [ 1726 | "mtcars_sample.groupby(['am','cyl']).mean().plot.bar(stacked=True)" 1727 | ], 1728 | "execution_count": null, 1729 | "outputs": [] 1730 | }, 1731 | { 1732 | "cell_type": "markdown", 1733 | "metadata": { 1734 | "id": "KXrmndH-AXVm", 1735 | "colab_type": "text" 1736 | }, 1737 | "source": [ 1738 | "## Pie Plot" 1739 | ] 1740 | }, 1741 | { 1742 | "cell_type": "code", 1743 | "metadata": { 1744 | "id": "YbKPBpubAKXr", 1745 | "colab_type": "code", 1746 | "colab": {} 1747 | }, 1748 | "source": [ 1749 | "mtcars_cyl= mtcars_sample.pivot(columns='cyl',values='hp').mean().plot.pie()" 1750 | ], 1751 | "execution_count": null, 1752 | "outputs": [] 1753 | }, 1754 | { 1755 | "cell_type": "markdown", 1756 | "metadata": { 1757 | "id": "Wi8T2GV-8YJ1", 1758 | "colab_type": "text" 1759 | }, 1760 | "source": [ 1761 | "## Boxplot" 1762 | ] 1763 | }, 1764 | { 1765 | "cell_type": "code", 1766 | "metadata": { 1767 | "id": "ZSHONUAzgEcX", 1768 | "colab_type": "code", 1769 | "colab": {} 1770 | }, 1771 | "source": [ 1772 | "mtcars_cyl= mtcars_sample.pivot(columns='cyl',values='hp').plot.box()" 1773 | ], 1774 | "execution_count": null, 1775 | "outputs": [] 1776 | }, 1777 | { 1778 | "cell_type": "markdown", 1779 | "metadata": { 1780 | "id": "8LqNNWR8_3DJ", 1781 | "colab_type": "text" 1782 | }, 1783 | "source": [ 1784 | "## Area Plot" 1785 | ] 1786 | }, 1787 | { 1788 | "cell_type": "code", 1789 | "metadata": { 1790 | "id": "mniu6MsH_sf9", 1791 | "colab_type": "code", 1792 | "colab": {} 1793 | }, 1794 | "source": [ 1795 | "mtcars_sample2 = mtcars_sample[['cyl','mpg','hp']]\n", 1796 | "mtcars_sample2.groupby(['cyl']).mean().plot.area()" 1797 | ], 1798 | "execution_count": null, 1799 | "outputs": [] 1800 | }, 1801 | { 1802 | "cell_type": "markdown", 1803 | "metadata": { 1804 | "id": "VH1TUqLC8boj", 1805 | "colab_type": "text" 1806 | }, 1807 | "source": [ 1808 | "## Histogram" 1809 | ] 1810 | }, 1811 | { 1812 | "cell_type": "code", 1813 | 
"metadata": { 1814 | "id": "croK8jpc4z0Y", 1815 | "colab_type": "code", 1816 | "colab": {} 1817 | }, 1818 | "source": [ 1819 | "mtcars_sample.mpg.plot.hist(bins=15)" 1820 | ], 1821 | "execution_count": null, 1822 | "outputs": [] 1823 | }, 1824 | { 1825 | "cell_type": "markdown", 1826 | "metadata": { 1827 | "id": "-WNvXsgO6Eva", 1828 | "colab_type": "text" 1829 | }, 1830 | "source": [ 1831 | "## Activity: Data Visualization" 1832 | ] 1833 | }, 1834 | { 1835 | "cell_type": "code", 1836 | "metadata": { 1837 | "id": "2XCEhmlv5yzh", 1838 | "colab_type": "code", 1839 | "colab": {} 1840 | }, 1841 | "source": [ 1842 | "long_term_care.pivot(columns='sector',values='count').sum().plot.barh()" 1843 | ], 1844 | "execution_count": null, 1845 | "outputs": [] 1846 | }, 1847 | { 1848 | "cell_type": "markdown", 1849 | "metadata": { 1850 | "id": "r1zzTOaY-ZGn", 1851 | "colab_type": "text" 1852 | }, 1853 | "source": [ 1854 | "## Subplot" 1855 | ] 1856 | }, 1857 | { 1858 | "cell_type": "code", 1859 | "metadata": { 1860 | "id": "XdSESyWu-eGG", 1861 | "colab_type": "code", 1862 | "colab": {} 1863 | }, 1864 | "source": [ 1865 | "mtcars_sample.groupby('cyl').mean().plot(subplots=True)" 1866 | ], 1867 | "execution_count": null, 1868 | "outputs": [] 1869 | }, 1870 | { 1871 | "cell_type": "markdown", 1872 | "metadata": { 1873 | "id": "eRRN_s5_-fdV", 1874 | "colab_type": "text" 1875 | }, 1876 | "source": [ 1877 | "## Activity: Subplot" 1878 | ] 1879 | }, 1880 | { 1881 | "cell_type": "code", 1882 | "metadata": { 1883 | "id": "L2JqOrS18iDn", 1884 | "colab_type": "code", 1885 | "colab": {} 1886 | }, 1887 | "source": [ 1888 | "health_expenditure_sample.plot(subplots=True)" 1889 | ], 1890 | "execution_count": null, 1891 | "outputs": [] 1892 | }, 1893 | { 1894 | "cell_type": "markdown", 1895 | "metadata": { 1896 | "id": "RFmMnQq7Bg99", 1897 | "colab_type": "text" 1898 | }, 1899 | "source": [ 1900 | "## Matplotlib - Scatter Plot" 1901 | ] 1902 | }, 1903 | { 1904 | "cell_type": "code", 1905 | 
"metadata": { 1906 | "id": "AtpWcs-qAiuJ", 1907 | "colab_type": "code", 1908 | "colab": {} 1909 | }, 1910 | "source": [ 1911 | "plt.scatter(x=\"hp\", y=\"mpg\", data=mtcars_sample,color='red')\n", 1912 | "plt.xlabel('hp')\n", 1913 | "plt.ylabel('mpg')\n", 1914 | "plt.title('hp vs mpg')" 1915 | ], 1916 | "execution_count": null, 1917 | "outputs": [] 1918 | }, 1919 | { 1920 | "cell_type": "markdown", 1921 | "metadata": { 1922 | "id": "BHefiSTn_fFW", 1923 | "colab_type": "text" 1924 | }, 1925 | "source": [ 1926 | "## Seaborn - Heatmap" 1927 | ] 1928 | }, 1929 | { 1930 | "cell_type": "code", 1931 | "metadata": { 1932 | "id": "tc_PBNKe-ywq", 1933 | "colab_type": "code", 1934 | "colab": {} 1935 | }, 1936 | "source": [ 1937 | "mtcars_cyl= mtcars_sample.groupby('cyl').mean()\n", 1938 | "sb.heatmap(mtcars_cyl, annot=True)" 1939 | ], 1940 | "execution_count": null, 1941 | "outputs": [] 1942 | }, 1943 | { 1944 | "cell_type": "markdown", 1945 | "metadata": { 1946 | "id": "ei_6QNNnAfJO", 1947 | "colab_type": "text" 1948 | }, 1949 | "source": [ 1950 | "## Seaborn - Regression Line" 1951 | ] 1952 | }, 1953 | { 1954 | "cell_type": "code", 1955 | "metadata": { 1956 | "id": "cTeO7FlwAMwT", 1957 | "colab_type": "code", 1958 | "colab": {} 1959 | }, 1960 | "source": [ 1961 | "sb.lmplot(x=\"hp\", y=\"mpg\",data=mtcars_sample,fit_reg=True) " 1962 | ], 1963 | "execution_count": null, 1964 | "outputs": [] 1965 | }, 1966 | { 1967 | "cell_type": "markdown", 1968 | "metadata": { 1969 | "id": "L2TpPaOmgEcY", 1970 | "colab_type": "text" 1971 | }, 1972 | "source": [ 1973 | "# Topic 4 Data Analysis \n" 1974 | ] 1975 | }, 1976 | { 1977 | "cell_type": "markdown", 1978 | "metadata": { 1979 | "id": "P73Xquh-Fg4u", 1980 | "colab_type": "text" 1981 | }, 1982 | "source": [ 1983 | "## Descriptive Statistics" 1984 | ] 1985 | }, 1986 | { 1987 | "cell_type": "code", 1988 | "metadata": { 1989 | "id": "uvqm6j8zCSgD", 1990 | "colab_type": "code", 1991 | "colab": {} 1992 | }, 1993 | "source": [ 1994 | 
"mtcars_sample[[\"mpg\", \"hp\"]].median()" 1995 | ], 1996 | "execution_count": null, 1997 | "outputs": [] 1998 | }, 1999 | { 2000 | "cell_type": "code", 2001 | "metadata": { 2002 | "id": "Ie8PE1CoDC1o", 2003 | "colab_type": "code", 2004 | "colab": {} 2005 | }, 2006 | "source": [ 2007 | "mtcars_sample[[\"mpg\", \"hp\"]].describe()" 2008 | ], 2009 | "execution_count": null, 2010 | "outputs": [] 2011 | }, 2012 | { 2013 | "cell_type": "markdown", 2014 | "metadata": { 2015 | "id": "l-NvhSZ6frva", 2016 | "colab_type": "text" 2017 | }, 2018 | "source": [ 2019 | "## Descriptive Statistics for Categorical Data" 2020 | ] 2021 | }, 2022 | { 2023 | "cell_type": "code", 2024 | "metadata": { 2025 | "id": "NGWWil3geNWD", 2026 | "colab_type": "code", 2027 | "colab": {} 2028 | }, 2029 | "source": [ 2030 | "mtcars_sample.groupby('cyl').mpg.agg(['mean', 'median', 'max'])" 2031 | ], 2032 | "execution_count": null, 2033 | "outputs": [] 2034 | }, 2035 | { 2036 | "cell_type": "code", 2037 | "metadata": { 2038 | "id": "epDcrdFJdZ49", 2039 | "colab_type": "code", 2040 | "colab": {} 2041 | }, 2042 | "source": [ 2043 | "mtcars_sample.groupby('cyl').mpg.describe()" 2044 | ], 2045 | "execution_count": null, 2046 | "outputs": [] 2047 | }, 2048 | { 2049 | "cell_type": "code", 2050 | "metadata": { 2051 | "id": "97jOK_x5DjP9", 2052 | "colab_type": "code", 2053 | "colab": {} 2054 | }, 2055 | "source": [ 2056 | "mtcars_sample[[\"cyl\", \"mpg\"]].groupby(\"cyl\").mean()" 2057 | ], 2058 | "execution_count": null, 2059 | "outputs": [] 2060 | }, 2061 | { 2062 | "cell_type": "code", 2063 | "metadata": { 2064 | "id": "ZqgvfbxkEhoY", 2065 | "colab_type": "code", 2066 | "colab": {} 2067 | }, 2068 | "source": [ 2069 | "mtcars_sample.groupby(\"cyl\")[\"mpg\"].mean()" 2070 | ], 2071 | "execution_count": null, 2072 | "outputs": [] 2073 | }, 2074 | { 2075 | "cell_type": "markdown", 2076 | "metadata": { 2077 | "id": "1At9aRe6foNF", 2078 | "colab_type": "text" 2079 | }, 2080 | "source": [ 2081 | "## Count" 2082 | 
] 2083 | }, 2084 | { 2085 | "cell_type": "code", 2086 | "metadata": { 2087 | "id": "rwcBjU_nE5d3", 2088 | "colab_type": "code", 2089 | "colab": {} 2090 | }, 2091 | "source": [ 2092 | "mtcars_sample[\"cyl\"].value_counts()" 2093 | ], 2094 | "execution_count": null, 2095 | "outputs": [] 2096 | }, 2097 | { 2098 | "cell_type": "markdown", 2099 | "metadata": { 2100 | "id": "-B9V_ginGSC8", 2101 | "colab_type": "text" 2102 | }, 2103 | "source": [ 2104 | "## Activity: Descriptive Statistics" 2105 | ] 2106 | }, 2107 | { 2108 | "cell_type": "code", 2109 | "metadata": { 2110 | "id": "a-hemZw3FfeM", 2111 | "colab_type": "code", 2112 | "colab": {} 2113 | }, 2114 | "source": [ 2115 | "long_term_care = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/number-of-residential-long-term-care-facilities-sector-breakdown.csv',)\n", 2116 | "long_term_care['sector'].value_counts()" 2117 | ], 2118 | "execution_count": null, 2119 | "outputs": [] 2120 | }, 2121 | { 2122 | "cell_type": "markdown", 2123 | "metadata": { 2124 | "id": "MEt4jzoeA_Rf", 2125 | "colab_type": "text" 2126 | }, 2127 | "source": [ 2128 | "## Covariance and Correlation" 2129 | ] 2130 | }, 2131 | { 2132 | "cell_type": "code", 2133 | "metadata": { 2134 | "id": "mBFHvjNKA3KH", 2135 | "colab_type": "code", 2136 | "colab": {} 2137 | }, 2138 | "source": [ 2139 | "mtcars_sample.cov()" 2140 | ], 2141 | "execution_count": null, 2142 | "outputs": [] 2143 | }, 2144 | { 2145 | "cell_type": "code", 2146 | "metadata": { 2147 | "id": "xB6udGwrA6dF", 2148 | "colab_type": "code", 2149 | "colab": {} 2150 | }, 2151 | "source": [ 2152 | "mtcars_sample.corr()" 2153 | ], 2154 | "execution_count": null, 2155 | "outputs": [] 2156 | }, 2157 | { 2158 | "cell_type": "markdown", 2159 | "metadata": { 2160 | "id": "YM8Mo9e5gEcY", 2161 | "colab_type": "text" 2162 | }, 2163 | "source": [ 2164 | "## Time Series Analysis" 2165 | ] 2166 | }, 2167 | { 2168 | "cell_type": "code", 2169 | "metadata": { 2170 | "id": 
"GMrRPAN7HlIq", 2171 | "colab_type": "code", 2172 | "colab": {} 2173 | }, 2174 | "source": [ 2175 | "air_quality = pd.read_csv(\"https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/air_quality_no2_long.csv\")\n", 2176 | "air_quality = air_quality.rename(columns={\"date.utc\": \"datetime\"})\n", 2177 | "air_quality.head()" 2178 | ], 2179 | "execution_count": null, 2180 | "outputs": [] 2181 | }, 2182 | { 2183 | "cell_type": "code", 2184 | "metadata": { 2185 | "id": "2emAw2JbHymo", 2186 | "colab_type": "code", 2187 | "colab": {} 2188 | }, 2189 | "source": [ 2190 | "air_quality.city.unique()" 2191 | ], 2192 | "execution_count": null, 2193 | "outputs": [] 2194 | }, 2195 | { 2196 | "cell_type": "code", 2197 | "metadata": { 2198 | "id": "5cEXaWEiH3Th", 2199 | "colab_type": "code", 2200 | "colab": {} 2201 | }, 2202 | "source": [ 2203 | "air_quality[\"datetime\"] = pd.to_datetime(air_quality[\"datetime\"])\n", 2204 | "air_quality[\"datetime\"]" 2205 | ], 2206 | "execution_count": null, 2207 | "outputs": [] 2208 | }, 2209 | { 2210 | "cell_type": "markdown", 2211 | "metadata": { 2212 | "id": "yKZNvBOqYzJ9", 2213 | "colab_type": "text" 2214 | }, 2215 | "source": [ 2216 | "## Datetime " 2217 | ] 2218 | }, 2219 | { 2220 | "cell_type": "code", 2221 | "metadata": { 2222 | "id": "FRC_bKl6IFGi", 2223 | "colab_type": "code", 2224 | "colab": {} 2225 | }, 2226 | "source": [ 2227 | "air_quality = pd.read_csv(\"https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/air_quality_no2_long.csv\",parse_dates=[\"date.utc\"])\n", 2228 | "air_quality = air_quality.rename(columns={\"date.utc\": \"datetime\"})\n", 2229 | "air_quality.head()" 2230 | ], 2231 | "execution_count": null, 2232 | "outputs": [] 2233 | }, 2234 | { 2235 | "cell_type": "code", 2236 | "metadata": { 2237 | "id": "Ouw3RmkjIbuS", 2238 | "colab_type": "code", 2239 | "colab": {} 2240 | }, 2241 | "source": [ 2242 | "air_quality[\"datetime\"]" 2243 | ], 2244 | "execution_count": null, 2245 | "outputs": 
[] 2246 | }, 2247 | { 2248 | "cell_type": "code", 2249 | "metadata": { 2250 | "id": "0_efqmrJIjbQ", 2251 | "colab_type": "code", 2252 | "colab": {} 2253 | }, 2254 | "source": [ 2255 | "air_quality[\"datetime\"].min(), air_quality[\"datetime\"].max()" 2256 | ], 2257 | "execution_count": null, 2258 | "outputs": [] 2259 | }, 2260 | { 2261 | "cell_type": "code", 2262 | "metadata": { 2263 | "id": "F2cD2aH3ImZl", 2264 | "colab_type": "code", 2265 | "colab": {} 2266 | }, 2267 | "source": [ 2268 | "air_quality[\"datetime\"].max() - air_quality[\"datetime\"].min()" 2269 | ], 2270 | "execution_count": null, 2271 | "outputs": [] 2272 | }, 2273 | { 2274 | "cell_type": "code", 2275 | "metadata": { 2276 | "id": "TFsDFrBCIrXx", 2277 | "colab_type": "code", 2278 | "colab": {} 2279 | }, 2280 | "source": [ 2281 | "air_quality[\"month\"] = air_quality[\"datetime\"].dt.month" 2282 | ], 2283 | "execution_count": null, 2284 | "outputs": [] 2285 | }, 2286 | { 2287 | "cell_type": "code", 2288 | "metadata": { 2289 | "id": "cT2xKeheIu1c", 2290 | "colab_type": "code", 2291 | "colab": {} 2292 | }, 2293 | "source": [ 2294 | "air_quality.head()" 2295 | ], 2296 | "execution_count": null, 2297 | "outputs": [] 2298 | }, 2299 | { 2300 | "cell_type": "code", 2301 | "metadata": { 2302 | "id": "JrdE4DSmIzGg", 2303 | "colab_type": "code", 2304 | "colab": {} 2305 | }, 2306 | "source": [ 2307 | "air_quality.groupby([air_quality[\"datetime\"].dt.weekday, \"location\"])[\"value\"].mean()" 2308 | ], 2309 | "execution_count": null, 2310 | "outputs": [] 2311 | }, 2312 | { 2313 | "cell_type": "markdown", 2314 | "metadata": { 2315 | "id": "W8miFX7bYpmS", 2316 | "colab_type": "text" 2317 | }, 2318 | "source": [ 2319 | "## Time Series Plot" 2320 | ] 2321 | }, 2322 | { 2323 | "cell_type": "code", 2324 | "metadata": { 2325 | "id": "BNCwOGZMJCtO", 2326 | "colab_type": "code", 2327 | "colab": {} 2328 | }, 2329 | "source": [ 2330 | "fig, axs = plt.subplots(figsize=(12, 4))\n", 2331 | 
"air_quality.groupby(air_quality[\"datetime\"].dt.hour)[\"value\"].mean().plot(kind='bar',rot=0,ax=axs)\n", 2332 | "plt.xlabel(\"Hour of the day\")\n", 2333 | "plt.ylabel(\"$NO_2 (µg/m^3)$\")" 2334 | ], 2335 | "execution_count": null, 2336 | "outputs": [] 2337 | }, 2338 | { 2339 | "cell_type": "markdown", 2340 | "metadata": { 2341 | "id": "ST_YmZ6oYkVS", 2342 | "colab_type": "text" 2343 | }, 2344 | "source": [ 2345 | "## Datetime as Index" 2346 | ] 2347 | }, 2348 | { 2349 | "cell_type": "code", 2350 | "metadata": { 2351 | "id": "UqhV39FHK2NQ", 2352 | "colab_type": "code", 2353 | "colab": {} 2354 | }, 2355 | "source": [ 2356 | "no_2 = air_quality.pivot(index=\"datetime\", columns=\"location\", values=\"value\")\n", 2357 | "no_2.head()" 2358 | ], 2359 | "execution_count": null, 2360 | "outputs": [] 2361 | }, 2362 | { 2363 | "cell_type": "code", 2364 | "metadata": { 2365 | "id": "0qfQv1emK_FK", 2366 | "colab_type": "code", 2367 | "colab": {} 2368 | }, 2369 | "source": [ 2370 | "no_2[\"2019-05-20\":\"2019-05-21\"].plot()" 2371 | ], 2372 | "execution_count": null, 2373 | "outputs": [] 2374 | }, 2375 | { 2376 | "cell_type": "markdown", 2377 | "metadata": { 2378 | "id": "YbgiJcagYZcL", 2379 | "colab_type": "text" 2380 | }, 2381 | "source": [ 2382 | "## Resample a Time Series" 2383 | ] 2384 | }, 2385 | { 2386 | "cell_type": "code", 2387 | "metadata": { 2388 | "id": "pwQy8AB9LJsn", 2389 | "colab_type": "code", 2390 | "colab": {} 2391 | }, 2392 | "source": [ 2393 | "monthly_max = no_2.resample(\"M\").max()\n", 2394 | "monthly_max" 2395 | ], 2396 | "execution_count": null, 2397 | "outputs": [] 2398 | }, 2399 | { 2400 | "cell_type": "code", 2401 | "metadata": { 2402 | "id": "Tt6ZCubuLQli", 2403 | "colab_type": "code", 2404 | "colab": {} 2405 | }, 2406 | "source": [ 2407 | "no_2.resample(\"D\").mean().plot(style=\"-o\", figsize=(10, 5));" 2408 | ], 2409 | "execution_count": null, 2410 | "outputs": [] 2411 | }, 2412 | { 2413 | "cell_type": "markdown", 2414 | "metadata": { 2415 | 
"id": "o1q61KmalJgB", 2416 | "colab_type": "text" 2417 | }, 2418 | "source": [ 2419 | "## Activity: Time Series Analysis" 2420 | ] 2421 | }, 2422 | { 2423 | "cell_type": "code", 2424 | "metadata": { 2425 | "id": "FTGX-ePWjnu6", 2426 | "colab_type": "code", 2427 | "colab": {} 2428 | }, 2429 | "source": [ 2430 | "long_term_care_pivot = long_term_care.pivot(columns='year',values='count').sum()\n", 2431 | "long_term_care_pivot.plot()\n", 2432 | "plt.xlabel('year')\n", 2433 | "plt.ylabel('No of long term care faciliteis')" 2434 | ], 2435 | "execution_count": null, 2436 | "outputs": [] 2437 | }, 2438 | { 2439 | "cell_type": "markdown", 2440 | "metadata": { 2441 | "id": "WpK7XPfuZJs-", 2442 | "colab_type": "text" 2443 | }, 2444 | "source": [ 2445 | "# Topic 5 Advanced Data Analytics" 2446 | ] 2447 | }, 2448 | { 2449 | "cell_type": "markdown", 2450 | "metadata": { 2451 | "id": "DdggqdNXIp0x", 2452 | "colab_type": "text" 2453 | }, 2454 | "source": [ 2455 | "## Pipe" 2456 | ] 2457 | }, 2458 | { 2459 | "cell_type": "code", 2460 | "metadata": { 2461 | "id": "F5O-XpI9Idw6", 2462 | "colab_type": "code", 2463 | "colab": {} 2464 | }, 2465 | "source": [ 2466 | "def load_data():\n", 2467 | " return pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/WSQ-Python-Data-Analytics/master/exercises/data/mtcars.csv',\n", 2468 | " index_col='car_names',\n", 2469 | " usecols=['car_names','mpg','hp','cyl','am'])" 2470 | ], 2471 | "execution_count": null, 2472 | "outputs": [] 2473 | }, 2474 | { 2475 | "cell_type": "code", 2476 | "metadata": { 2477 | "id": "gsSLqZ6WImuw", 2478 | "colab_type": "code", 2479 | "colab": {} 2480 | }, 2481 | "source": [ 2482 | "def plotbar(df):\n", 2483 | " df_cyl= df.pivot(columns='cyl',values='hp').mean()\n", 2484 | " df_cyl.plot.bar(color='red')" 2485 | ], 2486 | "execution_count": null, 2487 | "outputs": [] 2488 | }, 2489 | { 2490 | "cell_type": "code", 2491 | "metadata": { 2492 | "id": "p1hp5RgwIjVx", 2493 | "colab_type": "code", 2494 | "colab": {} 
2495 | }, 2496 | "source": [ 2497 | "mtcars_pipe =(\n", 2498 | " load_data()\n", 2499 | " .pipe(plotbar)\n", 2500 | ")\n" 2501 | ], 2502 | "execution_count": null, 2503 | "outputs": [] 2504 | }, 2505 | { 2506 | "cell_type": "markdown", 2507 | "metadata": { 2508 | "id": "RQbQYM3-LBXi", 2509 | "colab_type": "text" 2510 | }, 2511 | "source": [ 2512 | "## Activity: Pipe" 2513 | ] 2514 | }, 2515 | { 2516 | "cell_type": "code", 2517 | "metadata": { 2518 | "id": "UyD7LWyPJ-J2", 2519 | "colab_type": "code", 2520 | "colab": {} 2521 | }, 2522 | "source": [ 2523 | "def load_data():\n", 2524 | " return pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/number-of-residential-long-term-care-facilities-sector-breakdown.csv')" 2525 | ], 2526 | "execution_count": null, 2527 | "outputs": [] 2528 | }, 2529 | { 2530 | "cell_type": "code", 2531 | "metadata": { 2532 | "id": "MtPATB8JKFg8", 2533 | "colab_type": "code", 2534 | "colab": {} 2535 | }, 2536 | "source": [ 2537 | "def plotbarh(df):\n", 2538 | " df_pivot = df.pivot(columns='sector',values='count').sum()\n", 2539 | " df_pivot.plot.barh()\n" 2540 | ], 2541 | "execution_count": null, 2542 | "outputs": [] 2543 | }, 2544 | { 2545 | "cell_type": "code", 2546 | "metadata": { 2547 | "id": "o1kWNSu-K1nJ", 2548 | "colab_type": "code", 2549 | "colab": {} 2550 | }, 2551 | "source": [ 2552 | "long_term_car_pipe =(\n", 2553 | " load_data()\n", 2554 | " .pipe(plotbarh)\n", 2555 | ")\n" 2556 | ], 2557 | "execution_count": null, 2558 | "outputs": [] 2559 | }, 2560 | { 2561 | "cell_type": "markdown", 2562 | "metadata": { 2563 | "id": "QODt9EwcIe17", 2564 | "colab_type": "text" 2565 | }, 2566 | "source": [ 2567 | "## Apply" 2568 | ] 2569 | }, 2570 | { 2571 | "cell_type": "code", 2572 | "metadata": { 2573 | "id": "IE8SCm6BGC0V", 2574 | "colab_type": "code", 2575 | "colab": {} 2576 | }, 2577 | "source": [ 2578 | "df = pd.DataFrame([[9, 25]] * 3, columns=['P', 'Q'])\n", 2579 | "df" 2580 | ], 2581 | "execution_count": null, 
2582 | "outputs": [] 2583 | }, 2584 | { 2585 | "cell_type": "code", 2586 | "metadata": { 2587 | "id": "WlSenEFCGLZu", 2588 | "colab_type": "code", 2589 | "colab": {} 2590 | }, 2591 | "source": [ 2592 | "df.apply(np.sqrt)" 2593 | ], 2594 | "execution_count": null, 2595 | "outputs": [] 2596 | }, 2597 | { 2598 | "cell_type": "code", 2599 | "metadata": { 2600 | "id": "pM576cf-f8vC", 2601 | "colab_type": "code", 2602 | "colab": {} 2603 | }, 2604 | "source": [ 2605 | "uppercase = lambda x: x.upper()" 2606 | ], 2607 | "execution_count": null, 2608 | "outputs": [] 2609 | }, 2610 | { 2611 | "cell_type": "code", 2612 | "metadata": { 2613 | "id": "n1YhVsaDf-YV", 2614 | "colab_type": "code", 2615 | "colab": {} 2616 | }, 2617 | "source": [ 2618 | "mtcars_sample2 = mtcars_sample.reset_index()\n", 2619 | "mtcars_sample2['car_names'] = mtcars_sample2['car_names'].apply(uppercase)\n", 2620 | "mtcars_sample2" 2621 | ], 2622 | "execution_count": null, 2623 | "outputs": [] 2624 | }, 2625 | { 2626 | "cell_type": "markdown", 2627 | "metadata": { 2628 | "id": "viSiaJyLnzQ3", 2629 | "colab_type": "text" 2630 | }, 2631 | "source": [ 2632 | "## Activity: Apply" 2633 | ] 2634 | }, 2635 | { 2636 | "cell_type": "code", 2637 | "metadata": { 2638 | "id": "U3MVhK0tmm48", 2639 | "colab_type": "code", 2640 | "colab": {} 2641 | }, 2642 | "source": [ 2643 | "health_expenditure = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/government-health-expenditure.csv')\n", 2644 | "health_expenditure" 2645 | ], 2646 | "execution_count": null, 2647 | "outputs": [] 2648 | }, 2649 | { 2650 | "cell_type": "code", 2651 | "metadata": { 2652 | "id": "7kDCLIpsmyEU", 2653 | "colab_type": "code", 2654 | "colab": {} 2655 | }, 2656 | "source": [ 2657 | "SGD2USD = lambda x: x/1.4\n", 2658 | "health_expenditure[['operating_expenditure','development_expenditure','government_health_expenditure']].apply(SGD2USD)" 2659 | ], 2660 | "execution_count": null, 2661 | "outputs": [] 2662 | }, 2663 | { 
2664 | "cell_type": "markdown", 2665 | "metadata": { 2666 | "id": "5515POXngEcl", 2667 | "colab_type": "text" 2668 | }, 2669 | "source": [ 2670 | "## Linear Regression" 2671 | ] 2672 | }, 2673 | { 2674 | "cell_type": "code", 2675 | "metadata": { 2676 | "id": "0QGp0yG0gEcl", 2677 | "colab_type": "code", 2678 | "colab": {} 2679 | }, 2680 | "source": [ 2681 | "x = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/boston-housing-prices.csv')\n", 2682 | "x" 2683 | ], 2684 | "execution_count": null, 2685 | "outputs": [] 2686 | }, 2687 | { 2688 | "cell_type": "code", 2689 | "metadata": { 2690 | "id": "hM0a4prM-2WI", 2691 | "colab_type": "code", 2692 | "colab": {} 2693 | }, 2694 | "source": [ 2695 | "y = x.pop('medv')" 2696 | ], 2697 | "execution_count": null, 2698 | "outputs": [] 2699 | }, 2700 | { 2701 | "cell_type": "code", 2702 | "metadata": { 2703 | "id": "Zv3H8ozZgEcm", 2704 | "colab_type": "code", 2705 | "colab": {} 2706 | }, 2707 | "source": [ 2708 | "from sklearn import linear_model\n", 2709 | "lm = linear_model.LinearRegression()" 2710 | ], 2711 | "execution_count": null, 2712 | "outputs": [] 2713 | }, 2714 | { 2715 | "cell_type": "code", 2716 | "metadata": { 2717 | "id": "8ImgWvipgEcn", 2718 | "colab_type": "code", 2719 | "colab": {} 2720 | }, 2721 | "source": [ 2722 | "lm.fit(x,y)" 2723 | ], 2724 | "execution_count": null, 2725 | "outputs": [] 2726 | }, 2727 | { 2728 | "cell_type": "code", 2729 | "metadata": { 2730 | "id": "gRKKdI0K_cV-", 2731 | "colab_type": "code", 2732 | "colab": {} 2733 | }, 2734 | "source": [ 2735 | "yhat = lm.predict(x)" 2736 | ], 2737 | "execution_count": null, 2738 | "outputs": [] 2739 | }, 2740 | { 2741 | "cell_type": "code", 2742 | "metadata": { 2743 | "id": "RUe3bln7gEco", 2744 | "colab_type": "code", 2745 | "colab": {} 2746 | }, 2747 | "source": [ 2748 | "plt.scatter(y,yhat)\n", 2749 | "plt.xlabel('Actual Housing Price')\n", 2750 | "plt.ylabel('Predicted Housing Price')\n", 2751 | 
"plt.plot([0,50],[0,50],'r')\n", 2752 | "plt.show()" 2753 | ], 2754 | "execution_count": null, 2755 | "outputs": [] 2756 | }, 2757 | { 2758 | "cell_type": "markdown", 2759 | "metadata": { 2760 | "id": "TLzFdAWwBzDU", 2761 | "colab_type": "text" 2762 | }, 2763 | "source": [ 2764 | "## Activity: Regression" 2765 | ] 2766 | }, 2767 | { 2768 | "cell_type": "code", 2769 | "metadata": { 2770 | "id": "0FUGhfcAm3xZ", 2771 | "colab_type": "code", 2772 | "colab": {} 2773 | }, 2774 | "source": [ 2775 | "x = pd.read_csv('https://raw.githubusercontent.com/tertiarycourses/datasets/master/government-health-expenditure.csv',usecols=['operating_expenditure','government_health_expenditure'])\n", 2776 | "x" 2777 | ], 2778 | "execution_count": null, 2779 | "outputs": [] 2780 | }, 2781 | { 2782 | "cell_type": "code", 2783 | "metadata": { 2784 | "id": "t2aCWLbrBCeF", 2785 | "colab_type": "code", 2786 | "colab": {} 2787 | }, 2788 | "source": [ 2789 | "y = x.pop('government_health_expenditure')" 2790 | ], 2791 | "execution_count": null, 2792 | "outputs": [] 2793 | }, 2794 | { 2795 | "cell_type": "code", 2796 | "metadata": { 2797 | "id": "y2rUtqE8BN6N", 2798 | "colab_type": "code", 2799 | "colab": {} 2800 | }, 2801 | "source": [ 2802 | "from sklearn import linear_model\n", 2803 | "lm = linear_model.LinearRegression()" 2804 | ], 2805 | "execution_count": null, 2806 | "outputs": [] 2807 | }, 2808 | { 2809 | "cell_type": "code", 2810 | "metadata": { 2811 | "id": "L--JG3teBW2r", 2812 | "colab_type": "code", 2813 | "colab": {} 2814 | }, 2815 | "source": [ 2816 | "lm.fit(x,y)\n", 2817 | "yhat = lm.predict(x)" 2818 | ], 2819 | "execution_count": null, 2820 | "outputs": [] 2821 | }, 2822 | { 2823 | "cell_type": "code", 2824 | "metadata": { 2825 | "id": "p3-D6T-lBasO", 2826 | "colab_type": "code", 2827 | "colab": {} 2828 | }, 2829 | "source": [ 2830 | "plt.scatter(x,y)\n", 2831 | "plt.xlabel('operating_expenditure')\n", 2832 | "plt.ylabel('government_health_expenditure')\n", 2833 | 
"plt.plot(x,yhat,'r')\n", 2834 | "plt.show()" 2835 | ], 2836 | "execution_count": null, 2837 | "outputs": [] 2838 | } 2839 | ] 2840 | } --------------------------------------------------------------------------------