├── .gitattributes ├── .gitignore ├── 1_module_introduction_pandas ├── 1_1 intro_to_python.ipynb ├── 1_2_intro_to_numpy.ipynb ├── 1_3_intro_to_pandas.ipynb ├── 1_4_loading_and_understanding_data.ipynb ├── 1_5_exploratory_data_analysis.ipynb ├── README.md ├── best_practices_data_science.pdf ├── images │ ├── Anaconda_1.PNG │ ├── Anaconda_2.PNG │ ├── Anaconda_3.PNG │ ├── Anaconda_4.PNG │ ├── Anaconda_5.PNG │ ├── Anaconda_6.PNG │ ├── Anaconda_7.PNG │ ├── Anaconda_7_2.PNG │ ├── Anaconda_8.PNG │ ├── anaconda_nav.png │ └── jupyter_notebook.png ├── intro_to_visualization.pptx └── python_installation_instructions.md ├── 2_module_eda_feature_engineering ├── 2_1_feature_engineering.ipynb ├── README.md └── images │ ├── Anaconda_1.PNG │ ├── Anaconda_2.PNG │ ├── Anaconda_3.PNG │ ├── Anaconda_4.PNG │ ├── Anaconda_5.PNG │ ├── Anaconda_6.PNG │ ├── Anaconda_7.PNG │ ├── Anaconda_7_2.PNG │ ├── Anaconda_8.PNG │ ├── anaconda_nav.png │ └── jupyter_notebook.png ├── 3_module_linear_regression ├── 3_1_linear_regression-build_univariate_model.ipynb ├── 3_2_linear_regression_check_assumptions.ipynb ├── 3_3_linear_regression_build_multivariate_model.ipynb ├── 3_4_polynomial_regression.ipynb ├── 3_5_linear_regression_regularization.ipynb ├── README.md └── images │ └── LinearRegression.png ├── 4_module_classification ├── 4_0_twitter_web_scraping.ipynb ├── 4_1_logistic_regression.ipynb ├── 4_2_naive_bayes.ipynb ├── 4_3_naive_bayes_detail.ipynb ├── 4_4_support_vector_machines.ipynb ├── README.md └── images │ └── intro_to_ml.png ├── 5_module_decision_trees ├── 5_1_decision_trees.ipynb ├── 5_2_random_forests.ipynb ├── README.md └── images │ ├── DecisionTreeExample.png │ └── bagging.png ├── 6_module_unsupervised_learning ├── 6_1_clustering.ipynb ├── README.md └── images │ ├── clustering.png │ └── k_means.png ├── 7_module_advanced_topics ├── 7_1_sentiment_analysis_details.ipynb ├── 7_2_image_processing_with_keras.ipynb └── get_more_100_pictures.py ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── environment.yml ├── images ├── decision_trees.png ├── delta_logo.jpg ├── delta_octocat.png ├── ensemble_algorithms.png ├── introduction_to_machine_learning.png ├── linear_regression.png ├── machine_learning_.png ├── model_selection_evaluation.png ├── nlp_pt_1.png └── nlp_pt_2.png ├── setup.sh └── tests_for_students ├── MPI_data_poverty.csv ├── country_mapper.csv ├── loans_midterm.csv └── midterm_part_2.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb filter=ipynb_stripout 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Data 2 | /data 3 | /datasets 4 | 5 | # Temp files 6 | notebook-extensions/ 7 | Untitled.ipynb 8 | 9 | # Silly mac data 10 | .DS_Store 11 | 12 | # Jupyter Notebook 13 | .ipynb_checkpoints 14 | */.ipynb_checkpoints/* 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | env/ 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *,cover 61 | .hypothesis/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # IPython Notebook 85 | .ipynb_checkpoints 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # celery beat schedule file 91 | celerybeat-schedule 92 | 93 | # dotenv 94 | .env 95 | 96 | # virtualenv 97 | venv/ 98 | ENV/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | -------------------------------------------------------------------------------- /1_module_introduction_pandas/1_1 intro_to_python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "

Table of Contents

\n", 10 | "
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "Intro To Python\n", 18 | "=====\n", 19 | "\n", 20 | "In this notebook, we will explore basic Python:\n", 21 | "\n", 22 | "- data types, including dictionaries\n", 23 | "- functions \n", 24 | "- loops" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "Please note that we are using Python 3. \n", 32 | "(__NOT__ Python 2! Python 2 has some different functions and syntax)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "3\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "# Let's make sure we are using Python 3\n", 50 | "import sys\n", 51 | "print(sys.version[0])" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "# 1. Basic Data Types: Numbers, Booleans, and Strings\n", 59 | "## 1.1 Numbers" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "\n", 72 | "5\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "a = 5\n", 78 | "\n", 79 | "# Note: use the `type()` function to get the type of a variable\n", 80 | "# Numbers can be integers ('int'), such as 3, 5 and 3049, or floats\n", 81 | "# ('float'), such as 2.5, 3.1, and 2.34938493\n", 82 | "print(type(a))\n", 83 | "print(a)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### Mathematical Operators: +, -, *, /, **\n", 91 | "Mathematical operators allow you to perform math operations on numbers in Python." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 3, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "6\n", 104 | "4\n", 105 | "10\n", 106 | "2.5\n", 107 | "25\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "b = a + 1\n", 113 | "print(b)\n", 114 | "\n", 115 | "c = a - 1\n", 116 | "print(c)\n", 117 | "\n", 118 | "d = a * 2\n", 119 | "print(d)\n", 120 | "\n", 121 | "e = a / 2\n", 122 | "print(e)\n", 123 | "\n", 124 | "# Note: ** is the exponention operator\n", 125 | "f = a ** 2\n", 126 | "print(f)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "### Shorthand mathematical operators\n", 134 | "\n", 135 | "Python has the ability to have a value be added to itself, in shorthand. 
This is called a \"compound assignment operator\".\n", 136 | "\n", 137 | "The addition version of this is:\n", 138 | "\n", 139 | "* `a += 1` is shorthand for `a = a + 1`\n" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 4, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "6\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "a += 1\n", 157 | "print(a)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "The multiplication version of a compound assignment operator is:" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 5, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "12\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "a *= 2\n", 182 | "print(a)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## 1.2 Booleans & Logic Operators" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 6, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "im_true = True\n", 207 | "im_false = False\n", 208 | "\n", 209 | "print(type(im_true))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "### Equality operators\n", 217 | "Equality operators (== and !=) allow you to compare the values of the variables on the left and right hand side." 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 7, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "False\n", 230 | "True\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "print(im_true == im_false) # Equality operator\n", 236 | "print(im_true != im_false)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "The `and` operator returns `True` only when the values on both sides of the operator are true." 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 8, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "False\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "print(im_true and im_false)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "The `or` operator requires only ONE of the values on either side of the operator to be true." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 9, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "True\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "print(im_true or im_false)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "## 1.3 Strings\n", 292 | "You can use single or double quotes for strings.\n",
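"\n", "One nice consequence of having both quote styles (a small example added for clarity, not from the original notebook): each style makes it easy to embed the other kind of quote inside a string:\n", "\n", "```python\n", "quote = \"it's delta\" # double quotes let the apostrophe appear inside\n", "print(quote)\n", "```"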
293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 10, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "delta analytics\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "my_string = 'delta'\n", 310 | "my_other_string = \"analytics\"\n", 311 | "print(my_string, my_other_string)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "### String methods\n", 319 | "Concatenating strings:" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 11, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "delta analytics\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "another_string = my_string + \" \" + my_other_string\n", 337 | "print(another_string)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "Get the length of the string:" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 12, 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "15\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "print(len(another_string))" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "However, there are more ways to work with strings as well! Python has a set of built-in string methods such as `find()`, `startswith()` and `join()`. Check out [here for more information](https://docs.python.org/3/library/stdtypes.html#string-methods)!" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "# 2. Container Data Types" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "## 2.1 Lists\n", 383 | "A Python `list` stores multiple elements, which can be of different types" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 13, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "name": "stdout", 393 | "output_type": "stream", 394 | "text": [ 395 | "['a', 'b', 'c', 3485]\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "my_list = ['a', 'b', 'c', 3485]\n", 401 | "print(my_list)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "You can access an element in a list with the following syntax:\n", 409 | "Note: the first element in a list has an index of zero."
410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 14, 415 | "metadata": {}, 416 | "outputs": [ 417 | { 418 | "name": "stdout", 419 | "output_type": "stream", 420 | "text": [ 421 | "c\n", 422 | "a\n" 423 | ] 424 | } 425 | ], 426 | "source": [ 427 | "print(my_list[2])\n", 428 | "print(my_list[0])" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "Reassigning elements in a list:" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 15, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "name": "stdout", 445 | "output_type": "stream", 446 | "text": [ 447 | "['delta', 'b', 'c', 3485]\n" 448 | ] 449 | } 450 | ], 451 | "source": [ 452 | "my_list[0] = 'delta'\n", 453 | "print(my_list)" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "Adding/removing elements from a list:" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 16, 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "name": "stdout", 470 | "output_type": "stream", 471 | "text": [ 472 | "['delta', 'b', 'c', 3485, 'hello']\n", 473 | "['delta', 'b', 'c', 3485]\n" 474 | ] 475 | } 476 | ], 477 | "source": [ 478 | "my_list.append('hello')\n", 479 | "print(my_list)\n", 480 | "\n", 481 | "my_list.pop()\n", 482 | "print(my_list)" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "Accessing multiple elements in a list:" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 17, 495 | "metadata": {}, 496 | "outputs": [ 497 | { 498 | "name": "stdout", 499 | "output_type": "stream", 500 | "text": [ 501 | "['delta', 'b']\n", 502 | "['c', 3485]\n", 503 | "['delta', 'b']\n" 504 | ] 505 | } 506 | ], 507 | "source": [ 508 | "print(my_list[0:2]) # Access elements at index 0 and 1 (the stop index is not included)\n", 509 | "print(my_list[2:]) # Access elements from index 2 to the end\n", 510 | "print(my_list[:2]) # Access elements from the beginning up to (not including) index 2" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "And of course, the learning may never end! There are more ways to work with lists through built-in list methods as well! (e.g. `count()`, `sort()` and `copy()`.) \n", 518 | "\n", 519 | "For anything on lists and more, we typically just google the information. A great resource is Python's very own dedicated docs website, [here](https://docs.python.org/3/tutorial/datastructures.html)!" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "## 2.1.5 Tuples" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "As an _intermission_: there is another native Python type called a **\"tuple\"**. We won't cover tuples in depth here, but they are like lists with the following differences:\n", 534 | "\n", 535 | "* Lists can grow and shrink, while a tuple's length is fixed\n", 536 | "\n", 537 | "* Lists are written with brackets `[]`, tuples with parentheses `()`.\n", 538 | "* Lists are mutable, tuples are immutable.\n", 539 | "* Lists have more functionality than tuples" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "## 2.2 Dictionaries\n", 547 | "Dictionaries hold key/value pairs and are useful for storing information.\n",
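"\n", "A quick added aside (a minimal sketch, assuming the dictionary defined in the next cell): entries can be added or removed after creation:\n", "\n", "```python\n", "my_dict['new_key'] = 'new_value' # add a key/value pair\n", "del my_dict['new_key'] # remove it again\n", "```"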
548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 18, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "my_dict = { 'key_one': 'value_one', 'name': 'mike' }" 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "Access a value from a dictionary by a key:" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 19, 569 | "metadata": {}, 570 | "outputs": [ 571 | { 572 | "name": "stdout", 573 | "output_type": "stream", 574 | "text": [ 575 | "value_one\n", 576 | "mike\n" 577 | ] 578 | } 579 | ], 580 | "source": [ 581 | "print(my_dict['key_one'])\n", 582 | "print(my_dict['name'])" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": {}, 588 | "source": [ 589 | "Looping over the keys (and then the key/value pairs) of a dictionary:" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 20, 595 | "metadata": {}, 596 | "outputs": [ 597 | { 598 | "name": "stdout", 599 | "output_type": "stream", 600 | "text": [ 601 | "The key is key_one\n", 602 | "The key is name\n" 603 | ] 604 | } 605 | ], 606 | "source": [ 607 | "for key in my_dict:\n", 608 | " print(\"The key is \" + key)" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 21, 614 | "metadata": {}, 615 | "outputs": [ 616 | { 617 | "name": "stdout", 618 | "output_type": "stream", 619 | "text": [ 620 | "The key is key_one, and the value is value_one\n", 621 | "The key is name, and the value is mike\n" 622 | ] 623 | } 624 | ], 625 | "source": [ 626 | "for key, value in my_dict.items():\n", 627 | " print(\"The key is \" + key + \", and the value is \" + value)" 628 | ] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "metadata": {}, 633 | "source": [ 634 | "## 2.3 Sets\n", 635 | "Sets are similar to lists, but can only contain distinct values." 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 22, 641 | "metadata": {}, 642 | "outputs": [ 643 | { 644 | "name": "stdout", 645 | "output_type": "stream", 646 | "text": [ 647 | "{1, 2, 3, 'hello'}\n" 648 | ] 649 | } 650 | ], 651 | "source": [ 652 | "my_set = {1, 2, 3, 'hello'}\n", 653 | "print(my_set)" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [ 660 | "When defining a set with the same value present multiple times, only one element will be added to the set. For example:" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": 23, 666 | "metadata": {}, 667 | "outputs": [ 668 | { 669 | "name": "stdout", 670 | "output_type": "stream", 671 | "text": [ 672 | "{1, 2, 3, 'hello'}\n" 673 | ] 674 | } 675 | ], 676 | "source": [ 677 | "multiple = {1, 2, 2, 2, 2, 2, 3, 'hello'}\n", 678 | "print(multiple) # This will return {1, 2, 3, 'hello'}" 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": {}, 684 | "source": [ 685 | "# 3. Functions\n", 686 | "A function is a block of reusable code that performs a certain action. Once you've defined a function, you can use it anywhere in your code!"
687 | ] 688 | }, 689 | { 690 | "cell_type": "markdown", 691 | "metadata": {}, 692 | "source": [ 693 | "Defining a function:" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": 24, 699 | "metadata": {}, 700 | "outputs": [], 701 | "source": [ 702 | "def am_i_happy(happiness_level):\n", 703 | " if happiness_level >= 10:\n", 704 | " return \"You're very happy.\"\n", 705 | " elif happiness_level >= 5:\n", 706 | " return \"You're happy.\"\n", 707 | " else:\n", 708 | " return \"You're not happy.\"" 709 | ] 710 | }, 711 | { 712 | "cell_type": "markdown", 713 | "metadata": {}, 714 | "source": [ 715 | "Calling a function:" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 25, 721 | "metadata": {}, 722 | "outputs": [ 723 | { 724 | "name": "stdout", 725 | "output_type": "stream", 726 | "text": [ 727 | "You're not happy.\n" 728 | ] 729 | } 730 | ], 731 | "source": [ 732 | "print(am_i_happy(0))" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": 26, 738 | "metadata": {}, 739 | "outputs": [ 740 | { 741 | "name": "stdout", 742 | "output_type": "stream", 743 | "text": [ 744 | "You're happy.\n" 745 | ] 746 | } 747 | ], 748 | "source": [ 749 | "print(am_i_happy(5))" 750 | ] 751 | }, 752 | { 753 | "cell_type": "markdown", 754 | "metadata": {}, 755 | "source": [ 756 | "# 4. Control Flow\n", 757 | "## 4.1 If/Else If/Else" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 27, 763 | "metadata": {}, 764 | "outputs": [ 765 | { 766 | "name": "stdout", 767 | "output_type": "stream", 768 | "text": [ 769 | "Take a nap\n" 770 | ] 771 | } 772 | ], 773 | "source": [ 774 | "sleepy = True\n", 775 | "hungry = False\n", 776 | "\n", 777 | "if sleepy and hungry:\n", 778 | " print(\"Eat a snack and take a nap.\")\n", 779 | "elif sleepy and not hungry:\n", 780 | " print(\"Take a nap\")\n", 781 | "elif hungry and not sleepy:\n", 782 | " print(\"Eat a snack\")\n", 783 | "else:\n", 784 | " print(\"Go on with your day\")" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": {}, 790 | "source": [ 791 | "## 4.2 Loops\n", 792 | "### 4.2.1 'while' loops" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": 28, 798 | "metadata": {}, 799 | "outputs": [ 800 | { 801 | "name": "stdout", 802 | "output_type": "stream", 803 | "text": [ 804 | "You have counted to 0\n", 805 | "You have counted to 1\n", 806 | "You have counted to 2\n", 807 | "You have counted to 3\n", 808 | "You have counted to 4\n", 809 | "You have counted to 5\n", 810 | "You have counted to 6\n", 811 | "You have counted to 7\n", 812 | "You have counted to 8\n", 813 | "You have counted to 9\n", 814 | "You're finished counting\n" 815 | ] 816 | } 817 | ], 818 | "source": [ 819 | "counter = 0\n", 820 | "while (counter < 10):\n", 821 | " print(\"You have counted to\", counter)\n", 822 | " counter = counter + 1 # Increment the counter\n", 823 | " \n", 824 | "print(\"You're finished counting\")" 825 | ] 826 | }, 827 | { 828 | "cell_type": "markdown", 829 | "metadata": {}, 830 | "source": [ 831 | "### 4.2.2 'for' loops\n", 832 | "Loop over a list:" 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": 29, 838 | "metadata": {}, 839 | "outputs": [ 840 | { 841 | "name": "stdout", 842 | "output_type": "stream", 843 | "text": [ 844 | "cats are cool\n", 845 | "dogs are cool\n", 846 | "lions are cool\n", 847 | "bears are cool\n" 848 | ] 849 | } 850 | ], 851 | "source": [ 852 | "cool_animals = ['cat', 'dog', 'lion', 
'bear']\n", 853 | "\n", 854 | "for animal in cool_animals:\n", 855 | " print(animal + \"s are cool\")" 856 | ] 857 | }, 858 | { 859 | "cell_type": "markdown", 860 | "metadata": {}, 861 | "source": [ 862 | "Loop over a dict:" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": 30, 868 | "metadata": {}, 869 | "outputs": [ 870 | { 871 | "name": "stdout", 872 | "output_type": "stream", 873 | "text": [ 874 | "The dog says bark!\n", 875 | "The cat says meow!\n", 876 | "The pig says oink!\n" 877 | ] 878 | } 879 | ], 880 | "source": [ 881 | "animal_sounds = {\n", 882 | " 'dog': 'bark',\n", 883 | " 'cat': 'meow',\n", 884 | " 'pig': 'oink'\n", 885 | "}\n", 886 | "\n", 887 | "for animal, sound in animal_sounds.items():\n", 888 | " print(\"The \" + animal + \" says \" + sound + \"!\")" 889 | ] 890 | }, 891 | { 892 | "cell_type": "markdown", 893 | "metadata": {}, 894 | "source": [ 895 | "Congratulations! You made it through the first Notebook. Keep it up!" 896 | ] 897 | }, 898 | { 899 | "cell_type": "markdown", 900 | "metadata": {}, 901 | "source": [ 902 | "
\n", 903 | "
\n", 904 | "
\n", 905 | "\n", 906 | "----" 907 | ] 908 | } 909 | ], 910 | "metadata": { 911 | "kernelspec": { 912 | "display_name": "Python 3", 913 | "language": "python", 914 | "name": "python3" 915 | }, 916 | "language_info": { 917 | "codemirror_mode": { 918 | "name": "ipython", 919 | "version": 3 920 | }, 921 | "file_extension": ".py", 922 | "mimetype": "text/x-python", 923 | "name": "python", 924 | "nbconvert_exporter": "python", 925 | "pygments_lexer": "ipython3", 926 | "version": "3.8.5" 927 | }, 928 | "toc": { 929 | "base_numbering": 1, 930 | "nav_menu": {}, 931 | "number_sections": false, 932 | "sideBar": false, 933 | "skip_h1_title": false, 934 | "title_cell": "Table of Contents", 935 | "title_sidebar": "Contents", 936 | "toc_cell": true, 937 | "toc_position": {}, 938 | "toc_section_display": true, 939 | "toc_window_display": false 940 | } 941 | }, 942 | "nbformat": 4, 943 | "nbformat_minor": 4 944 | } 945 | -------------------------------------------------------------------------------- /1_module_introduction_pandas/1_2_intro_to_numpy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction of Numpy" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### NUMPY\n", 15 | "\n", 16 | "This is a fundamental Package for scientific computing for manipulation of multi-dimensional arrays and matrices. It is particularly useful for linear algebra, Fourier transform, random number simulation etc \n", 17 | "\n", 18 | "Matrices are rectangular array of numbers, symbols and expressions arranged in rows and columns. The numbers, symbols or expressions in the matrix are called its entries or its elements. The horizontal and vertical lines of entries in a matrix are called rows and columns, respectively. Its operations inclue addition, subtraction, multiplication\n", 19 | "\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "#### 1. Importing Numpy\n", 27 | "\n", 28 | "The first step is to import numpy library into the active notebook" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import numpy" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "To shorten the length of any library, a better alternative is to instantiate the library with a shorter name, as in" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import numpy as np" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "With this, each time numpy is required on this active notebook, **np** will be used instead" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "#### 2. 
Creating Numpy Arrays\n", 68 | "\n", 69 | "The [np.array](https://docs.scipy.org/doc/numpy/reference/generated/numpy.array.html) function is used to create an array" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "[1 2 3 4 5]\n", 82 | "The shape of X is (5,)\n", 83 | "[ 9 10]\n", 84 | "The shape of Y is (2,)\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "# Creating 1-dimensional arrays\n", 90 | "\n", 91 | "x = np.array([1, 2, 3, 4, 5])\n", 92 | "y = np.array([9, 10]) \n", 93 | "print(x)\n", 94 | "print('The shape of X is', x.shape)\n", 95 | "\n", 96 | "print(y)\n", 97 | "print('The shape of Y is', y.shape)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "The [shape](https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.shape.html) property is usually used to get the current shape of an array" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "[[1 2]\n", 117 | " [3 4]\n", 118 | " [5 6]]\n", 119 | "The shape of Z is (3, 2)\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "# Creating a 2D array \n", 125 | "z = np.array([[1, 2], [3, 4], [5, 6]]) \n", 126 | "\n", 127 | "print(z)\n", 128 | "print('The shape of Z is', z.shape)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "#### 3. Numpy Functions\n", 136 | "\n", 137 | "Numpy has built-in functions for creating and manipulating arrays. These include:\n", 138 | "\n", 139 | "- np.arange \n", 140 | "\n", 141 | "- np.reshape\n", 142 | "\n", 143 | "- np.zeros \n", 144 | "\n", 145 | ">The dimensions (number of rows and columns) are passed as parameters to these functions."
146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 5, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n", 158 | " 24]\n", 159 | "(25,)\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "# arange is used to create arrays with values in a specified range.\n", 165 | "\n", 166 | "A = np.arange(25)\n", 167 | "print(A)\n", 168 | "print(A.shape)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 6, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "[[ 0]\n", 181 | " [ 1]\n", 182 | " [ 2]\n", 183 | " [ 3]\n", 184 | " [ 4]\n", 185 | " [ 5]\n", 186 | " [ 6]\n", 187 | " [ 7]\n", 188 | " [ 8]\n", 189 | " [ 9]\n", 190 | " [10]\n", 191 | " [11]\n", 192 | " [12]\n", 193 | " [13]\n", 194 | " [14]\n", 195 | " [15]\n", 196 | " [16]\n", 197 | " [17]\n", 198 | " [18]\n", 199 | " [19]\n", 200 | " [20]\n", 201 | " [21]\n", 202 | " [22]\n", 203 | " [23]\n", 204 | " [24]]\n", 205 | "The shape of array B = (25, 1)\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "# To change the shape of an array\n", 211 | "\n", 212 | "B = A.reshape(25,1)\n", 213 | "\n", 214 | "print (B)\n", 215 | "print (\"The shape of array B = \", B.shape)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 7, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "[[ 0 1 2 3 4]\n", 228 | " [ 5 6 7 8 9]\n", 229 | " [10 11 12 13 14]\n", 230 | " [15 16 17 18 19]\n", 231 | " [20 21 22 23 24]]\n", 232 | "The shape of array C = (5, 5)\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "C = B.reshape(5,5)\n", 238 | "\n", 239 | "print(C)\n", 240 | "print (\"The shape of array C = \", C.shape)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "> Note: For `reshape` to run successfully, the product of the new dimensions passed to the function must equal the product of the dimensions of the original array's shape.\n", 248 | "\n", 249 | "For example: the shape of variable B is (25, 1), and 25 * 1 = 25.\n", 250 | "\n", 251 | "The two parameters supplied to the reshape function are (5, 5), and 5 * 5 = 25.\n", 252 | " " 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 8, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "array([[0., 0., 0.],\n", 264 | " [0., 0., 0.]])" 265 | ] 266 | }, 267 | "execution_count": 8, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "# zeros is used to create an array filled with zeros. \n", 274 | "\n", 275 | "np_Zeros = np.zeros((2,3))\n", 276 | "\n", 277 | "np_Zeros" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "#### 4. Accessing elements of a Numpy array\n", 285 | "\n", 286 | "To access an element in a two-dimensional array, you need to specify an index for both the row and the column.\n",
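"\n", "As a small added aside (not in the original notebook): the general pattern is `array[row, column]`, and negative indices count from the end:\n", "\n", "```python\n", "D = np.array([[5, 7, 8], [3, 5, 9]])\n", "D[-1, -1] # last row, last column -> 9\n", "```"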
287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 9, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "D = np.array([[5, 7, 8],[3, 5, 9]])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 10, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "array([[5, 7, 8],\n", 307 | " [3, 5, 9]])" 308 | ] 309 | }, 310 | "execution_count": 10, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "D" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 11, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/plain": [ 327 | "3" 328 | ] 329 | }, 330 | "execution_count": 11, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "# Note that array indexing in numpy starts from zero\n", 337 | "# Row 1, column 0 gives a scalar value\n", 338 | "D[1,0]" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 12, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/plain": [ 349 | "5" 350 | ] 351 | }, 352 | "execution_count": 12, 353 | "metadata": {}, 354 | "output_type": "execute_result" 355 | } 356 | ], 357 | "source": [ 358 | "# Row 1, column 1\n", 359 | "D[1,1]" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 13, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "text/plain": [ 370 | "9" 371 | ] 372 | }, 373 | "execution_count": 13, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "# Row 1, column 2\n", 380 | "D[1,2]" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 14, 386 | "metadata": {}, 387 | "outputs": [ 388 | { 389 | "data": { 390 | "text/plain": [ 391 | "array([[5, 7, 8]])" 392 | ] 393 | }, 394 | "execution_count": 14, 395 | "metadata": {}, 396 | "output_type": "execute_result" 397 | } 398 | ], 399 | "source": [ 400 | "# Slicing is also possible in numpy\n", 401 | "D[0:1, :]" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "#### 5. 
Numpy array math operations" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 15, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "x = np.array([[1,2,3],[4,5,6]])\n", 418 | "y = np.array([[2,2,2],[3,3,3]])\n", 419 | "z = np.array([1,2,3])" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 16, 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "data": { 429 | "text/plain": [ 430 | "array([[1, 4],\n", 431 | " [2, 5],\n", 432 | " [3, 6]])" 433 | ] 434 | }, 435 | "execution_count": 16, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | } 439 | ], 440 | "source": [ 441 | "# Transpose a matrix\n", 442 | "\n", 443 | "x.T" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 17, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "[[3 4 5]\n", 456 | " [7 8 9]]\n", 457 | "[[3 4 5]\n", 458 | " [7 8 9]]\n" 459 | ] 460 | } 461 | ], 462 | "source": [ 463 | "# Elementwise addition\n", 464 | "\n", 465 | "print (x+y)\n", 466 | "print (np.add(x,y))" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 18, 472 | "metadata": {}, 473 | "outputs": [ 474 | { 475 | "name": "stdout", 476 | "output_type": "stream", 477 | "text": [ 478 | "[[-1 0 1]\n", 479 | " [ 1 2 3]]\n", 480 | "[[-1 0 1]\n", 481 | " [ 1 2 3]]\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "# Elementwise subtraction\n", 487 | "\n", 488 | "print (x-y)\n", 489 | "print (np.subtract(x,y))" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 19, 495 | "metadata": {}, 496 | "outputs": [ 497 | { 498 | "name": "stdout", 499 | "output_type": "stream", 500 | "text": [ 501 | "[[ 1 4 9]\n", 502 | " [ 4 10 18]]\n", 503 | "[[ 1 4 9]\n", 504 | " [ 4 10 18]]\n" 505 | ] 506 | } 507 | ], 508 | "source": [ 509 | "# Elementwise multiplication (z is broadcast across each row of x)\n", 510 | "\n", 511 | "print (x*z)\n", 512 | "print (np.multiply(x,z))" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 20, 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "[14 32]\n" 525 | ] 526 | } 527 | ], 528 | "source": [ 529 | "# Matrix-vector product: each row of x is dotted with the vector z \n", 530 | "print(np.dot(x, z)) " 531 | ] 532 | } 533 | ], 534 | "metadata": { 535 | "kernelspec": { 536 | "display_name": "Python (Delta Analytics Env)", 537 | "language": "python", 538 | "name": "delta_analytics_env" 539 | }, 540 | "language_info": { 541 | "codemirror_mode": { 542 | "name": "ipython", 543 | "version": 3 544 | }, 545 | "file_extension": ".py", 546 | "mimetype": "text/x-python", 547 | "name": "python", 548 | "nbconvert_exporter": "python", 549 | "pygments_lexer": "ipython3", 550 | "version": "3.6.12" 551 | } 552 | }, 553 | "nbformat": 4, 554 | "nbformat_minor": 2 555 | } 556 | -------------------------------------------------------------------------------- /1_module_introduction_pandas/1_4_loading_and_understanding_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Module 1: Introduction to Exploratory Analysis " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\n", 15 | " \n", 16 | "\n", 17 | "\n", 18 | "[(Page 17)](https://drive.google.com/file/d/1r4SBY6Dm6xjFqLH12tFb-Bf7wbvoIN_C/view)" 19 | ] 20 | }, 21 | { 22 | "cell_type": 
"markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "What we'll be doing in this notebook:\n", 26 | "-----\n", 27 | "\n", 28 | " 1. Checking variable types\n", 29 | " 2. Checking for missing variables \n", 30 | " 3. Observing number of observations in the dataset\n", 31 | " 4. Quickly displaying Descriptive statistics" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "### Import packages" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 1, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import pandas as pd\n", 48 | "import numpy as np\n", 49 | "import seaborn as sns\n", 50 | "import matplotlib.pyplot as plt\n", 51 | "from datetime import datetime\n", 52 | "import dateutil.parser\n", 53 | "\n", 54 | "# The command below means that the output of multiple commands in a cell will be output at once\n", 55 | "from IPython.core.interactiveshell import InteractiveShell\n", 56 | "InteractiveShell.ast_node_interactivity = \"all\"\n", 57 | "\n", 58 | "# The command below tells jupyter to display up to 80 columns, this keeps everything visible\n", 59 | "pd.set_option('display.max_columns', 80)\n", 60 | "pd.set_option('expand_frame_repr', True)\n", 61 | "\n", 62 | "# Show figures in notebook\n", 63 | "%matplotlib inline" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Import dataset" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "We read in our dataset" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 2, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "path = '../data/'\n", 87 | "filename = 'loans.csv'\n", 88 | "\n", 89 | "try:\n", 90 | " df = pd.read_csv(path + filename)\n", 91 | "except FileNotFoundError:\n", 92 | " # If data is not found, download it from GitHub\n", 93 | " import os\n", 94 | " os.system(f'git clone --single-branch --depth=1 https://github.com/DeltaAnalytics/machine_learning_for_good_data {path}')\n", 95 | " df = pd.read_csv(path+filename)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "In the cell below, we take a random sample of 2 rows to get a feel for the data." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/html": [ 113 | "
\n", 114 | "\n", 127 | "\n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | "
id_numberloan_amountlender_countstatusfunded_datefunded_amountrepayment_termlocation_country_codesectordescriptionuse
682154864772525funded2018-06-19T12:10:23Z72540CMAgricultureShe is married and a mother of four children l...rent land, labor and purchase seedlings and ot...
3312156397180030funded2018-07-18T16:16:22Z80014NGServicesOluchi is 38 years old and a mother of five ch...to process palm oil for storage.
\n", 175 | "
" 176 | ], 177 | "text/plain": [ 178 | " id_number loan_amount lender_count status funded_date \\\n", 179 | "682 1548647 725 25 funded 2018-06-19T12:10:23Z \n", 180 | "3312 1563971 800 30 funded 2018-07-18T16:16:22Z \n", 181 | "\n", 182 | " funded_amount repayment_term location_country_code sector \\\n", 183 | "682 725 40 CM Agriculture \n", 184 | "3312 800 14 NG Services \n", 185 | "\n", 186 | " description \\\n", 187 | "682 She is married and a mother of four children l... \n", 188 | "3312 Oluchi is 38 years old and a mother of five ch... \n", 189 | "\n", 190 | " use \n", 191 | "682 rent land, labor and purchase seedlings and ot... \n", 192 | "3312 to process palm oil for storage. " 193 | ] 194 | }, 195 | "execution_count": 3, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "df.sample(n=2)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "### 1) Type Checking\n", 209 | "" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "Type is very important in Python programing, because it affects the types of functions you can apply to a series. There are a few different types of data you will see regularly (see [this](https://en.wikibooks.org/wiki/Python_Programming/Data_Types) link for more detail):\n", 217 | "* **int** - a number with no decimal places. example: loan_amount field\n", 218 | "* **float** - a number with decimal places. example: partner_id field\n", 219 | "* **str** - str is short for string. This type formally defined as a sequence of unicode characters. More simply, string means that the data is treated as a word, not a number. example: sector\n", 220 | "* **boolean** - can only be True or False. There is not currently an example in the data, but we will be creating a gender field shortly.\n", 221 | "* **datetime** - values meant to hold time data. Example: posted_date\n", 222 | "\n", 223 | "Let's check the type of our variables using the examples we saw in the cell above." 
224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 4, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "['id_number',\n", 235 | " 'loan_amount',\n", 236 | " 'lender_count',\n", 237 | " 'status',\n", 238 | " 'funded_date',\n", 239 | " 'funded_amount',\n", 240 | " 'repayment_term',\n", 241 | " 'location_country_code',\n", 242 | " 'sector',\n", 243 | " 'description',\n", 244 | " 'use']" 245 | ] 246 | }, 247 | "execution_count": 4, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "# Here are all of the columns\n", 254 | "df.columns.tolist()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 5, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "dtype('int64')" 266 | ] 267 | }, 268 | "execution_count": 5, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "# Find the dtype, aka datatype, for a column\n", 275 | "df['id_number'].dtype" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 6, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "# Try this - Pick a couple of columns and check their type on your own\n" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "### 2) Do I have missing values?\n", 292 | "\n", 293 | "" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "If we have missing data, is the missing data at random or not? If data is missing at random, the data distribution is still representative of the population. You can probably ignore the missing values as an inconvenience. However, if the data is systematically missing, the analysis you do may be biased. You should carefully consider the best way to clean the data; it may involve dropping some data." 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "We want to see how many values are missing in certain variable columns. One way to do this is to count the number of null observations. \n", 308 | "\n", 309 | "For this, we wrote a short function to apply to the dataframe. \n", 310 | "\n", 311 | "We print out the first few observations, but you can remove the `.head()` to print out all columns. 
" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 7, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "name": "stdout", 321 | "output_type": "stream", 322 | "text": [ 323 | "Missing values per column:\n", 324 | "funded_date 937.0\n", 325 | "location_country_code 17.0\n", 326 | "description 342.0\n", 327 | "use 342.0\n", 328 | "dtype: float64\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "#Create a new function:\n", 334 | "def num_missing(x):\n", 335 | " return sum(x.isnull())\n", 336 | "\n", 337 | "#Applying per column:\n", 338 | "print(\"Missing values per column:\")\n", 339 | "## Check how many are missing by column, and then check which ones have any missing values\n", 340 | "print(df.apply(num_missing, axis=0).where(lambda x : x != 0).dropna().head(20)) \n", 341 | "#axis=0 defines that function is to be applied on each column" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "### 3) Sanity Checks\n", 349 | "" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "**Does the dataset match what you expected to find?**\n", 357 | "- Is the range of values what you would expect? For example, are all loan_amounts above 0.\n", 358 | "- Do you have the number of rows you would expect?\n", 359 | "- Is your data for the date range you would expect? For example, is there a strange year in the data like 1880.\n", 360 | "- Are there unexpected spikes when you plot the data over time?\n", 361 | "\n", 362 | "\n", 363 | "In the command below we find out the number of loans and number of columns by using the function shape. You can also use `len(df.index)` to find the number of rows." 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 8, 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "There are 6019 observations and 11 features\n" 376 | ] 377 | } 378 | ], 379 | "source": [ 380 | "print(f'There are {df.shape[0]} observations and {df.shape[1]} features')" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "Remember, each row is an observation and each column is a potential feature. \n", 388 | "\n", 389 | "Remember we need large about of data for machine learning." 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "### 4) Descriptive statistics of the dataset\n", 397 | "\n", 398 | "" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "The `describe` command conveniently below provides key summary statistics for each numeric column." 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 9, 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "data": { 415 | "text/html": [ 416 | "
\n", 417 | "\n", 430 | "\n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | "
id_numberloan_amountlender_countfunded_amountrepayment_term
count6.019000e+036019.0000006019.0000006019.0000006019.000000
mean1.359770e+061499.01146435.6614061325.07061011.803290
std3.719316e+052512.51728073.4202562444.7268159.114948
min1.377200e+0450.0000000.0000000.0000003.000000
25%1.425188e+06300.0000007.000000200.0000008.000000
50%1.550673e+06625.00000016.000000525.00000010.000000
75%1.566204e+061825.00000041.0000001525.00000014.000000
max1.573593e+0680000.0000002665.00000080000.000000133.000000
\n", 508 | "
" 509 | ], 510 | "text/plain": [ 511 | " id_number loan_amount lender_count funded_amount repayment_term\n", 512 | "count 6.019000e+03 6019.000000 6019.000000 6019.000000 6019.000000\n", 513 | "mean 1.359770e+06 1499.011464 35.661406 1325.070610 11.803290\n", 514 | "std 3.719316e+05 2512.517280 73.420256 2444.726815 9.114948\n", 515 | "min 1.377200e+04 50.000000 0.000000 0.000000 3.000000\n", 516 | "25% 1.425188e+06 300.000000 7.000000 200.000000 8.000000\n", 517 | "50% 1.550673e+06 625.000000 16.000000 525.000000 10.000000\n", 518 | "75% 1.566204e+06 1825.000000 41.000000 1525.000000 14.000000\n", 519 | "max 1.573593e+06 80000.000000 2665.000000 80000.000000 133.000000" 520 | ] 521 | }, 522 | "execution_count": 9, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "df.describe()" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "In order to get the same summary statistics for categorical columns (string) we need to do a little data wrangling. \n", 536 | "\n", 537 | "The first line of code filters for all columns that are a data type object. As we know from before this means they are considered to be a string. The final row of code provides summary statistics for these character fields." 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 10, 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "data": { 547 | "text/html": [ 548 | "
\n", 549 | "\n", 562 | "\n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | "
statusfunded_datelocation_country_codesectordescriptionuse
count601950826002601956775677
unique34453301452774325
topfunded2018-07-22T15:54:41ZCDFoodConcilie has been selling used clothing for 15...to pay for a stove.
freq508294001738280
\n", 613 | "
" 614 | ], 615 | "text/plain": [ 616 | " status funded_date location_country_code sector \\\n", 617 | "count 6019 5082 6002 6019 \n", 618 | "unique 3 4453 30 14 \n", 619 | "top funded 2018-07-22T15:54:41Z CD Food \n", 620 | "freq 5082 9 400 1738 \n", 621 | "\n", 622 | " description use \n", 623 | "count 5677 5677 \n", 624 | "unique 5277 4325 \n", 625 | "top Concilie has been selling used clothing for 15... to pay for a stove. \n", 626 | "freq 2 80 " 627 | ] 628 | }, 629 | "execution_count": 10, 630 | "metadata": {}, 631 | "output_type": "execute_result" 632 | } 633 | ], 634 | "source": [ 635 | "categorical = df.dtypes[df.dtypes == \"object\"].index\n", 636 | "df[categorical].describe()" 637 | ] 638 | }, 639 | { 640 | "cell_type": "markdown", 641 | "metadata": {}, 642 | "source": [ 643 | "In the table above, there are 4 really useful fields: \n", 644 | "\n", 645 | "1) **count** - total number of fields populated (Not empty). \n", 646 | "\n", 647 | "2) **unique** - tells us how many different unique ways this field is populated. For example 4 in description.languages tells us there are 4 different language descriptions. \n", 648 | "\n", 649 | "3) **top** - tells us the most popular data point. For example, the top activity in this dataset is Farming which tells us most loans are in Farming.\n", 650 | "\n", 651 | "4) **freq** - tells us that how frequent the most popular category is in our dataset. For example, 'en' (English) is the language almost all descriptions (description.languages) are written in (118,306 out of 118,316)." 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "What is next\n", 659 | "-----\n", 660 | "\n", 661 | "In the next section, we move on to exploratory data analysis (EDA)." 662 | ] 663 | }, 664 | { 665 | "cell_type": "markdown", 666 | "metadata": {}, 667 | "source": [ 668 | "
\n", 669 | "
\n", 670 | "
\n", 671 | "\n", 672 | "----" 673 | ] 674 | } 675 | ], 676 | "metadata": { 677 | "kernelspec": { 678 | "display_name": "Python (Delta Analytics Env)", 679 | "language": "python", 680 | "name": "delta_analytics_env" 681 | }, 682 | "language_info": { 683 | "codemirror_mode": { 684 | "name": "ipython", 685 | "version": 3 686 | }, 687 | "file_extension": ".py", 688 | "mimetype": "text/x-python", 689 | "name": "python", 690 | "nbconvert_exporter": "python", 691 | "pygments_lexer": "ipython3", 692 | "version": "3.6.12" 693 | } 694 | }, 695 | "nbformat": 4, 696 | "nbformat_minor": 2 697 | } 698 | -------------------------------------------------------------------------------- /1_module_introduction_pandas/README.md: -------------------------------------------------------------------------------- 1 | Module 1: Introduction to Exploratory Analysis 2 | ===== 3 | 4 | Welcome to first module! In this module, we start exploring our [Kiva](https://www.kiva.org/) dataset. 5 | 6 | Goals 7 | ---- 8 | - Load our data and do some quick exploration 9 | - Understand the data using descriptive statistics and graphs 10 | 11 | Topic overview 12 | ---- 13 | 14 | The goal of exploratory analysis is to summarize the main characteristics of a data set, with the belief that it may lead to new hypotheses that inform algorithm choice and experimentation. Exploratory analysis happens before formal modeling commences, and is extremely important for helping inform or sharpen your hypothesis. 15 | 16 | Installation 17 | ---- 18 | 19 | If you have not installed the datasets yet for this repository, machine_learning_for_good, you can execute the following in this folder, `module_1_introduction_pandas` 20 | 21 | ``` 22 | git clone --single-branch --depth=1 https://github.com/DeltaAnalytics/machine_learning_for_good_data ../data 23 | ``` 24 | > If you are in the main folder, then just remove the `../` at the end, and leave it as `data`. 
-------------------------------------------------------------------------------- /1_module_introduction_pandas/best_practices_data_science.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/best_practices_data_science.pdf -------------------------------------------------------------------------------- /1_module_introduction_pandas/images/Anaconda_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_1.PNG -------------------------------------------------------------------------------- /1_module_introduction_pandas/images/Anaconda_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_2.PNG -------------------------------------------------------------------------------- /1_module_introduction_pandas/images/Anaconda_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_3.PNG -------------------------------------------------------------------------------- /1_module_introduction_pandas/images/Anaconda_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_4.PNG -------------------------------------------------------------------------------- /1_module_introduction_pandas/images/Anaconda_5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_5.PNG -------------------------------------------------------------------------------- /1_module_introduction_pandas/images/Anaconda_6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_6.PNG -------------------------------------------------------------------------------- /1_module_introduction_pandas/images/Anaconda_7.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_7.PNG -------------------------------------------------------------------------------- /1_module_introduction_pandas/images/Anaconda_7_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_7_2.PNG -------------------------------------------------------------------------------- 
/1_module_introduction_pandas/images/Anaconda_8.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_8.PNG -------------------------------------------------------------------------------- /1_module_introduction_pandas/images/anaconda_nav.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/anaconda_nav.png -------------------------------------------------------------------------------- /1_module_introduction_pandas/images/jupyter_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/jupyter_notebook.png -------------------------------------------------------------------------------- /1_module_introduction_pandas/intro_to_visualization.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/intro_to_visualization.pptx -------------------------------------------------------------------------------- /1_module_introduction_pandas/python_installation_instructions.md: -------------------------------------------------------------------------------- 1 | Module 0: Installing Python with Anaconda 2 | ==== 3 | 4 | 1. Download and install the latest version Python 3 from Anaconda (requires ~1.8Gb space) for your operating system at: [www.anaconda.com/download/](https://www.anaconda.com/download/). 5 | ![1](images/Anaconda_1.PNG) 6 | 7 | 2. Follow the prompts: 8 | 9 | ![2](images/Anaconda_2.PNG) 10 | ![3](images/Anaconda_3.PNG) 11 | ![4](images/Anaconda_4.PNG) 12 | ![5](images/Anaconda_5.PNG) 13 | ![6](images/Anaconda_6.PNG) 14 | ![7](images/Anaconda_7.PNG) 15 | ![8](images/Anaconda_8.PNG) 16 | 17 | 3. Start Anaconda Navigator application. 18 | 19 | 4. Within Anaconda Navigator, click on "Lanuch" button for Jupyter Notebook. 20 | 21 | ![](images/anaconda_nav.png) 22 | 23 | That will open Jupyter Notebook in your favorite web browser. 24 | 25 | ![](images/jupyter_notebook.png) -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/README.md: -------------------------------------------------------------------------------- 1 | Module 2: Feature Engineering 2 | ==== 3 | 4 | Welcome to Module 2 of the introductory course to machine learning, where we will create new variables out of the raw data in a process called feature engineering! 5 | 6 | Goals 7 | ------ 8 | Learn how to execute the following: 9 | 1. Feature pruning 10 | 2. Engineering Temporal Features (month, year, etc) 11 | 3. One-hot encoding / dummy variables 12 | 4. Extracting features from strings 13 | 5. Creating features from Metadata 14 | 6. Feature scaling 15 | 7. Data Imputation / cleaning 16 | 17 | Topic Overview 18 | ----- 19 | 20 | **What is feature engineering?** 21 | 22 | In machine learning, a *feature* is a property or characteristic of a phenomenon being observed. 
*Feature engineering* is the process of creating and selecting features from the data that are useful for machine learning algorithms. 23 | 24 | The dataset contains many features to start, so why do we need to make some more? 25 | 26 | - Consider a dataset that has a long description string variable. This may not be a useful feature to feed directly into a model, so perhaps we can make a new variable for whether the description contains a certain word. The hope for creating this new feature is that it will have more predictive power. 27 | 28 | How do we know what features will be useful? 29 | 30 | - This comes down to domain expertise, and this is a large part of a data scientist's work! 31 | 32 | Fortunately, there are common starting points for many datasets that we review in this module. 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/images/Anaconda_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_1.PNG -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/images/Anaconda_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_2.PNG -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/images/Anaconda_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_3.PNG -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/images/Anaconda_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_4.PNG -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/images/Anaconda_5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_5.PNG -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/images/Anaconda_6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_6.PNG -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/images/Anaconda_7.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_7.PNG -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/images/Anaconda_7_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_7_2.PNG -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/images/Anaconda_8.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_8.PNG -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/images/anaconda_nav.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/anaconda_nav.png -------------------------------------------------------------------------------- /2_module_eda_feature_engineering/images/jupyter_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/jupyter_notebook.png -------------------------------------------------------------------------------- /3_module_linear_regression/README.md: -------------------------------------------------------------------------------- 1 | # Module 2: Linear Regression 2 | ================================================================================ 3 | 4 | Welcome to module 2 of the introductory course to data for good where we will be exploring linear regression - the first machine learning algorithm of this course! 5 | 6 | Goals 7 | ---- 8 | By end of this module one should feel comfortable with the fundamentals of linear regression. Specific topics included are: 9 | 1. How to split the data between training and test data 10 | 2. Using training data to train a linear regression model 11 | 3. Analyzing the results of the model 12 | 4. Checking the assumptions of linear regression 13 | 5. Building a multivariate regressor 14 | 15 | ## Topic overview 16 | Linear Regression is a parametric model which predicts a continuous outcome feature (**Y**) from one or more explanatory features (**X**). 17 | 18 | **Y** = beta_0 + beta_1 * **X** 19 | 20 | beta_0 is called the intercept term, and represents the expected mean value of Y when all explanatory features equal 0. 21 | beta_1 is called a beta coefficient, and represents the expected change in the value of Y that results from a one unit change in X. 22 | 23 | This is module fits a straight line to your data, where the value of the outcome feature can be calculated as a linear combination of the explanatory features. Sounds relatively simple? Afraid not, there are many nuances and conditions that need to be understood before using linear regression! 
We are going to delve into these assumptions and conditions and then demonstrate how to use this algorithm on the Kiva dataset. 24 | 25 | ![Image](https://imgs.xkcd.com/comics/linear_regression.png) 26 | 27 | 28 | ## Resources 29 | - [Comprehensive Guide to Regression](https://www.analyticsvidhya.com/blog/2015/08/comprehensive-guide-regression/) 30 | - [Understanding key regression statistics](http://connor-johnson.com/2014/02/18/linear-regression-with-python/) 31 | 32 | ## Advanced topics 33 | Linear regression is one member of a family of linear parametric models. Some additional advanced topics we recommend looking up are... 34 | ### Logistic regression 35 | Logistic regression is very similar to linear regression but has a categorical outcome instead. So rather than modeling a continuous dependent variable, it models a binary classification - yes or no, true or false, 1 or 0. This is still a linear model, as it assumes a linear relationship between the independent variables and the link function. 36 | 37 | To learn more about logistic regression, try the following resources: 38 | - [Beginners guide to Logistic Regression](https://www.analyticsvidhya.com/blog/2015/11/beginners-guide-on-logistic-regression-in-r/): A good overview of the theory and mathematics behind the algorithm 39 | - [Logistic Regression in Python](http://blog.yhat.com/posts/logistic-regression-python-rodeo.html): A thorough tutorial on a publicly available dataset in Python 40 | 41 | ### Ridge and Lasso regression 42 | Both linear and logistic regression have a tendency to overfit when there are a large number of features. It is therefore important that we choose the features with the most predictive power, but how do we choose them? We can use our EDA to a certain extent, but that only goes so far. 43 | 44 | This is where ridge and lasso regularization techniques come into play! Both of these techniques can be used to identify which features explain the most variance and should therefore be kept in the model. A short code sketch follows the resource list below. 45 | 46 | To learn more about ridge and lasso regression and regularization techniques in general, we recommend the following resources: 47 | - [Complete tutorial on ridge and lasso regression in python](https://www.analyticsvidhya.com/blog/2016/01/complete-tutorial-ridge-lasso-regression-python/): A broad tutorial explaining why we use regularization techniques, touching on the mathematics behind the algorithms and giving a few examples in Python. 48 | - [An Introduction to Statistical Learning, Chapter 6.2](http://www-bcf.usc.edu/%7Egareth/ISL/ISLR%20Sixth%20Printing.pdf): A comprehensive explanation of both Lasso and Ridge and their application in the context of statistical learning. 
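To make the difference between the two penalties concrete, here is a minimal, self-contained sketch on synthetic data (not the Kiva features used in the notebooks, and the `alpha` values are arbitrary):

```python
import numpy as np
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Synthetic data: only the first two of ten features carry signal.
rng = np.random.RandomState(42)
X = rng.normal(size=(200, 10))
y = 3.0 * X[:, 0] - 2.0 * X[:, 1] + rng.normal(size=200)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scale first: the penalty treats all coefficients on the same footing,
# so features should be on comparable scales.
scaler = StandardScaler().fit(X_train)
X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)

ridge = Ridge(alpha=1.0).fit(X_train_s, y_train)  # L2 penalty: shrinks coefficients
lasso = Lasso(alpha=0.1).fit(X_train_s, y_train)  # L1 penalty: can zero them out

print("ridge coefs:", ridge.coef_.round(2))
print("lasso coefs:", lasso.coef_.round(2))  # noise features tend to be exactly 0
print("ridge test R^2:", ridge.score(X_test_s, y_test))
print("lasso test R^2:", lasso.score(X_test_s, y_test))
```

Comparing the two printed coefficient vectors shows the practical difference: ridge shrinks everything a little, while lasso performs implicit feature selection.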
49 | 50 | -------------------------------------------------------------------------------- /3_module_linear_regression/images/LinearRegression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/3_module_linear_regression/images/LinearRegression.png -------------------------------------------------------------------------------- /4_module_classification/4_0_twitter_web_scraping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction Into APIs\n", 8 | "\n", 9 | "\n", 10 | "\n", 11 | " \n", 12 | "\n", 13 | "\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import tweepy\n", 23 | "import pandas as pd\n", 24 | "from textblob import TextBlob\n", 25 | "\n", 26 | "def pull_tweets (query, maxTweets = 1000, tweetsPerQuery = 100, max_id = -1, sinceId = None):\n", 27 | " '''\n", 28 | " Finds tweets (Comment, Date, Favorites, User) for a query string.\n", 29 | " Twitter API limit per query is 100. Combines these queries. \n", 30 | " '''\n", 31 | " \n", 32 | " # Fill with your own app details\n", 33 | " API_KEY = \"wCl2jflXpyWmYDM22iKFsaiS2\"\n", 34 | " API_SEC = \"f3DkCao13uCfA58bSQXahsDVNF5qzNztrgt3wB2RDDAV8zyXvT\"\n", 35 | " \n", 36 | " # connect to Twitter using authentication\n", 37 | " auth = tweepy.AppAuthHandler(API_KEY, API_SEC)\n", 38 | " # wait_on_rate_limit means that if the API limit is hit, \n", 39 | " # the pulls will wait until more calls are available\n", 40 | " api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)\n", 41 | " \n", 42 | " # Pull comments from Twitter\n", 43 | " # See https://developer.twitter.com/en/docs/tweets/timelines/guides/working-with-timelines\n", 44 | " tweetCount = 0\n", 45 | " data = pd.DataFrame() \n", 46 | " \n", 47 | " while tweetCount < maxTweets:\n", 48 | " if (max_id <= 0):\n", 49 | " new_tweets = api.search(q=query, count=tweetsPerQuery, \n", 50 | " since_id=sinceId)\n", 51 | " else:\n", 52 | " new_tweets = api.search(q=query, count=tweetsPerQuery,\n", 53 | " max_id=str(max_id - 1), \n", 54 | " since_id=sinceId)\n", 55 | " if not new_tweets:\n", 56 | " print(\"No more tweets found\")\n", 57 | " break\n", 58 | " \n", 59 | " tweetCount += len(new_tweets)\n", 60 | " print(\"Downloaded {0} tweets\".format(tweetCount))\n", 61 | " max_id = new_tweets[-1].id\n", 62 | " \n", 63 | " ## Create a dataset from the downloaded tweets\n", 64 | " new_data = pd.DataFrame([{\n", 65 | " 'Date': tweet.created_at,\n", 66 | " 'Comments': tweet.text, \n", 67 | " 'User': tweet.user.name, \n", 68 | " 'Favorites': tweet.favorite_count} \n", 69 | " for tweet in new_tweets])\n", 70 | " \n", 71 | " data = data.append(new_data)\n", 72 | " return data" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "Downloaded 100 tweets\n", 85 | "Downloaded 200 tweets\n", 86 | "Downloaded 300 tweets\n", 87 | "Downloaded 400 tweets\n", 88 | "Downloaded 500 tweets\n", 89 | "Downloaded 600 tweets\n", 90 | "Downloaded 700 tweets\n", 91 | "Downloaded 800 tweets\n", 92 | "Downloaded 900 tweets\n", 93 | "Downloaded 1000 tweets\n", 94 | "Downloaded 1100 
tweets\n", 95 | "Downloaded 1188 tweets\n", 96 | "Downloaded 1288 tweets\n", 97 | "Downloaded 1388 tweets\n", 98 | "Downloaded 1488 tweets\n", 99 | "Downloaded 1588 tweets\n", 100 | "Downloaded 1688 tweets\n", 101 | "Downloaded 1788 tweets\n", 102 | "Downloaded 1888 tweets\n", 103 | "Downloaded 1988 tweets\n", 104 | "Downloaded 2088 tweets\n", 105 | "Downloaded 2188 tweets\n", 106 | "Downloaded 2288 tweets\n", 107 | "Downloaded 2388 tweets\n", 108 | "Downloaded 2487 tweets\n", 109 | "Downloaded 2558 tweets\n", 110 | "No more tweets found\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "data = pull_tweets(\"microfinance\", maxTweets = 5000)\n", 116 | "\n", 117 | "# In real life you might have test data with pre-labeled sentiments. We will use a simple word net algorithm to classify for now.\n", 118 | "data['Polarity'] = [TextBlob(comment).polarity for comment in data.Comments]\n", 119 | "\n", 120 | "data.loc[data['Polarity'] > 0, 'Sentiment'] = 'positive'\n", 121 | "data.loc[data['Polarity'] < 0, 'Sentiment'] = 'negative'\n", 122 | "data.loc[data['Polarity'] == 0, 'Sentiment'] = 'neutral'\n", 123 | "\n", 124 | "#convert data to a csv\n", 125 | "data.to_csv(\"microfinance_tweets.csv\", index = False)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "Python 3", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.6.8" 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 2 157 | } 158 | -------------------------------------------------------------------------------- /4_module_classification/4_4_support_vector_machines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "
Table of Contents
\n", 10 | "" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "Support Vector Machines\n", 18 | "------\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stderr", 28 | "output_type": "stream", 29 | "text": [ 30 | "/Users/brian/anaconda3/envs/3.7/lib/python3.7/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n", 31 | " import pandas.util.testing as tm\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "import pandas as pd\n", 37 | "import seaborn as sns\n", 38 | "import numpy as np\n", 39 | "import matplotlib.pyplot as plt\n", 40 | "%matplotlib inline \n", 41 | "\n", 42 | "import nltk\n", 43 | "from sklearn.feature_extraction.text import CountVectorizer\n", 44 | "from sklearn.metrics import classification_report, confusion_matrix \n", 45 | "import re\n", 46 | "import string\n", 47 | "\n", 48 | "from sklearn.model_selection import train_test_split\n", 49 | "from sklearn import svm" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Prepare data" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 2, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/html": [ 67 | "
\n", 68 | "\n", 81 | "\n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | "
  | Comments | Date | Favorites | User | Polarity | Sentiment
0 | RT @atmadiprayET: Here's why Janalakshmi Finan... | 3/22/2018 5:40 | 0 | Saloni Shukla | -0.100000 | negative
1 | RT @ecosmob: Ecosmob's #Mobility solutions for... | 3/22/2018 5:36 | 0 | Sindhav Bhageerath | -0.062500 | neutral
2 | Project have big future! Microfinance is belie... | 3/22/2018 5:27 | 0 | Konstantin #savedroidICO | 0.166667 | positive
3 | #Online #Banking- Yako Microfinance Bank prov... | 3/22/2018 5:21 | 0 | YakoMicrofinance | 0.500000 | positive
4 | MICROFINANCE EVENT: 3rd BoP Global Network Sum... | 3/22/2018 5:19 | 0 | MicroCapital | 0.045455 | positive
\n", 141 | "
" 142 | ], 143 | "text/plain": [ 144 | " Comments Date \\\n", 145 | "0 RT @atmadiprayET: Here's why Janalakshmi Finan... 3/22/2018 5:40 \n", 146 | "1 RT @ecosmob: Ecosmob's #Mobility solutions for... 3/22/2018 5:36 \n", 147 | "2 Project have big future! Microfinance is belie... 3/22/2018 5:27 \n", 148 | "3 #Online #Banking- Yako Microfinance Bank prov... 3/22/2018 5:21 \n", 149 | "4 MICROFINANCE EVENT: 3rd BoP Global Network Sum... 3/22/2018 5:19 \n", 150 | "\n", 151 | " Favorites User Polarity Sentiment \n", 152 | "0 0 Saloni Shukla -0.100000 negative \n", 153 | "1 0 Sindhav Bhageerath -0.062500 neutral \n", 154 | "2 0 Konstantin #savedroidICO 0.166667 positive \n", 155 | "3 0 YakoMicrofinance 0.500000 positive \n", 156 | "4 0 MicroCapital 0.045455 positive " 157 | ] 158 | }, 159 | "execution_count": 2, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "# Load data\n", 166 | "try: \n", 167 | " # Local version\n", 168 | " path = \"../data/\"\n", 169 | " filename = 'microfinance_tweets.csv'\n", 170 | " data = pd.read_csv(path+filename, encoding=\"ISO-8859-1\")\n", 171 | "except FileNotFoundError or ParserError: \n", 172 | " # If not local, get from remote repo. Helpful if using colab.\n", 173 | " url = 'https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good_data/master/microfinance_tweets.csv'\n", 174 | " data = pd.read_csv(url)\n", 175 | "\n", 176 | "# It always a good to visually inspect the data\n", 177 | "data.head()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 3, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "data.loc[data['Sentiment'] == 'negative', 'Sentiment'] = -1\n", 187 | "data.loc[data['Sentiment'] == 'neutral', 'Sentiment'] = 0\n", 188 | "data.loc[data['Sentiment'] == 'positive', 'Sentiment'] = 1" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 4, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/html": [ 199 | "
\n", 200 | "\n", 213 | "\n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | "
  | Comments | Date | Favorites | User | Polarity | Sentiment
0 | RT @atmadiprayET: Here's why Janalakshmi Finan... | 3/22/2018 5:40 | 0 | Saloni Shukla | -0.100000 | -1
1 | RT @ecosmob: Ecosmob's #Mobility solutions for... | 3/22/2018 5:36 | 0 | Sindhav Bhageerath | -0.062500 | 0
2 | Project have big future! Microfinance is belie... | 3/22/2018 5:27 | 0 | Konstantin #savedroidICO | 0.166667 | 1
3 | #Online #Banking- Yako Microfinance Bank prov... | 3/22/2018 5:21 | 0 | YakoMicrofinance | 0.500000 | 1
4 | MICROFINANCE EVENT: 3rd BoP Global Network Sum... | 3/22/2018 5:19 | 0 | MicroCapital | 0.045455 | 1
\n", 273 | "
" 274 | ], 275 | "text/plain": [ 276 | " Comments Date \\\n", 277 | "0 RT @atmadiprayET: Here's why Janalakshmi Finan... 3/22/2018 5:40 \n", 278 | "1 RT @ecosmob: Ecosmob's #Mobility solutions for... 3/22/2018 5:36 \n", 279 | "2 Project have big future! Microfinance is belie... 3/22/2018 5:27 \n", 280 | "3 #Online #Banking- Yako Microfinance Bank prov... 3/22/2018 5:21 \n", 281 | "4 MICROFINANCE EVENT: 3rd BoP Global Network Sum... 3/22/2018 5:19 \n", 282 | "\n", 283 | " Favorites User Polarity Sentiment \n", 284 | "0 0 Saloni Shukla -0.100000 -1 \n", 285 | "1 0 Sindhav Bhageerath -0.062500 0 \n", 286 | "2 0 Konstantin #savedroidICO 0.166667 1 \n", 287 | "3 0 YakoMicrofinance 0.500000 1 \n", 288 | "4 0 MicroCapital 0.045455 1 " 289 | ] 290 | }, 291 | "execution_count": 4, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "data.head()" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 11, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "train, test = train_test_split(data, test_size=0.2, random_state=42)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 12, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "vectorizer = CountVectorizer()\n", 316 | "train_features = vectorizer.fit_transform(train['Comments'])\n", 317 | "test_features = vectorizer.transform(test['Comments'])" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "We have vectorized our data such that each index corresponds with a word as well as the frequency of that word in the text." 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 52, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | " (0, 585)\t2\n", 337 | " (0, 778)\t1\n", 338 | " (0, 788)\t1\n", 339 | " (0, 1301)\t1\n", 340 | " (0, 1302)\t1\n", 341 | " (0, 1940)\t1\n", 342 | " (0, 1994)\t1\n", 343 | " (0, 2088)\t1\n", 344 | " (0, 2230)\t1\n", 345 | " (0, 3106)\t1\n", 346 | " (0, 3381)\t2\n", 347 | " (0, 3573)\t1\n", 348 | " (0, 3770)\t2\n", 349 | " (0, 4161)\t1\n", 350 | " (0, 4516)\t1\n", 351 | " (0, 5257)\t1\n" 352 | ] 353 | } 354 | ], 355 | "source": [ 356 | "print(train_features[0])" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "## Linear SVM" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "There are many types of SVMs, but we will first try a linear SVM, the most basic. This means that the decision boundary will be linear.
\n", 371 | "\n", 372 | "There is another input called decision_function_shape. The two options of one versus rest, and one versus one. This relates to how the decision boundary separates points, whether it separates negative points from everyone else or negative points from neutral points, etc. (https://pythonprogramming.net/support-vector-machine-parameters-machine-learning-tutorial/). The default is one versus rest. One versus rest takes less computational power but may be thrown off by outliers and don't do well on imbalanced data sets, e.g. more of one class than another." 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 36, 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "text/plain": [ 383 | "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 384 | " decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',\n", 385 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 386 | " tol=0.001, verbose=False)" 387 | ] 388 | }, 389 | "execution_count": 36, 390 | "metadata": {}, 391 | "output_type": "execute_result" 392 | } 393 | ], 394 | "source": [ 395 | "clf = svm.SVC(kernel='linear') \n", 396 | "clf.fit(train_features, train['Sentiment'])" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 56, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "y_train = clf.predict(train_features) " 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 57, 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "[[ 188 19 0]\n", 418 | " [ 6 1540 0]\n", 419 | " [ 0 0 839]]\n", 420 | " precision recall f1-score support\n", 421 | "\n", 422 | " -1 0.97 0.91 0.94 207\n", 423 | " 0 0.99 1.00 0.99 1546\n", 424 | " 1 1.00 1.00 1.00 839\n", 425 | "\n", 426 | "avg / total 0.99 0.99 0.99 2592\n", 427 | "\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "print(confusion_matrix(train['Sentiment'],y_train)) \n", 433 | "print(classification_report(train['Sentiment'],y_train)) " 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 58, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "y_pred = clf.predict(test_features) " 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 59, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "name": "stdout", 452 | "output_type": "stream", 453 | "text": [ 454 | "[[ 41 8 3]\n", 455 | " [ 8 386 3]\n", 456 | " [ 1 9 190]]\n", 457 | " precision recall f1-score support\n", 458 | "\n", 459 | " -1 0.82 0.79 0.80 52\n", 460 | " 0 0.96 0.97 0.96 397\n", 461 | " 1 0.97 0.95 0.96 200\n", 462 | "\n", 463 | "avg / total 0.95 0.95 0.95 649\n", 464 | "\n" 465 | ] 466 | } 467 | ], 468 | "source": [ 469 | "print(confusion_matrix(test['Sentiment'],y_pred)) \n", 470 | "print(classification_report(test['Sentiment'],y_pred)) " 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "What do you think of the performance of the SVM? We can also adjust gamma to account for overfitting, but it doesn't look like we've overfit too much given the training and test performances." 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "Remember that support vectors are the data points that lie closest to the decision surface (or hyperplane). 
We can figure out what those data points are below for each class we are classifying, noting that we have three classes for negative, neutral, and positive." 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 39, 490 | "metadata": {}, 491 | "outputs": [ 492 | { 493 | "name": "stdout", 494 | "output_type": "stream", 495 | "text": [ 496 | " (0, 531)\t1.0\n", 497 | " (0, 1440)\t1.0\n", 498 | " (0, 2371)\t1.0\n", 499 | " (0, 2769)\t1.0\n", 500 | " (0, 2775)\t2.0\n", 501 | " (0, 2780)\t1.0\n", 502 | " (0, 3106)\t1.0\n", 503 | " (0, 3157)\t1.0\n", 504 | " (0, 3312)\t1.0\n", 505 | " (0, 3381)\t1.0\n", 506 | " (0, 3496)\t1.0\n", 507 | " (0, 3729)\t1.0\n", 508 | " (0, 4796)\t1.0\n", 509 | " (0, 4864)\t1.0\n", 510 | " (0, 4964)\t1.0\n", 511 | " (0, 5021)\t1.0\n", 512 | " (0, 5059)\t1.0\n", 513 | " (0, 5092)\t1.0\n", 514 | " (0, 5156)\t2.0\n", 515 | " (0, 5638)\t1.0\n", 516 | " (1, 374)\t2.0\n", 517 | " (1, 585)\t1.0\n", 518 | " (1, 1885)\t1.0\n", 519 | " (1, 2484)\t2.0\n", 520 | " (1, 2485)\t1.0\n", 521 | " :\t:\n", 522 | " (1299, 3729)\t1.0\n", 523 | " (1299, 3861)\t1.0\n", 524 | " (1299, 3999)\t1.0\n", 525 | " (1299, 4102)\t1.0\n", 526 | " (1299, 5156)\t2.0\n", 527 | " (1299, 5370)\t1.0\n", 528 | " (1300, 614)\t1.0\n", 529 | " (1300, 934)\t1.0\n", 530 | " (1300, 1213)\t1.0\n", 531 | " (1300, 1401)\t1.0\n", 532 | " (1300, 1473)\t1.0\n", 533 | " (1300, 1518)\t1.0\n", 534 | " (1300, 1684)\t1.0\n", 535 | " (1300, 1925)\t1.0\n", 536 | " (1300, 2097)\t1.0\n", 537 | " (1300, 2501)\t1.0\n", 538 | " (1300, 3106)\t1.0\n", 539 | " (1300, 3487)\t1.0\n", 540 | " (1300, 4358)\t1.0\n", 541 | " (1300, 4913)\t1.0\n", 542 | " (1300, 5104)\t1.0\n", 543 | " (1300, 5156)\t1.0\n", 544 | " (1300, 5158)\t1.0\n", 545 | " (1300, 5573)\t1.0\n", 546 | " (1300, 5627)\t1.0\n" 547 | ] 548 | } 549 | ], 550 | "source": [ 551 | "print(clf.support_vectors_)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "We can check for the number of points in each class using another function. Here we see that most support vectors are in our last class, the positive class." 
559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 40, 564 | "metadata": {}, 565 | "outputs": [ 566 | { 567 | "data": { 568 | "text/plain": [ 569 | "array([152, 713, 436])" 570 | ] 571 | }, 572 | "execution_count": 40, 573 | "metadata": {}, 574 | "output_type": "execute_result" 575 | } 576 | ], 577 | "source": [ 578 | "clf.n_support_" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": {}, 584 | "source": [ 585 | "We can also find the support vector in our original data using the indices provided for us with clf.support_" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 41, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "data": { 595 | "text/plain": [ 596 | "array([ 8, 21, 34, ..., 2573, 2585, 2587])" 597 | ] 598 | }, 599 | "execution_count": 41, 600 | "metadata": {}, 601 | "output_type": "execute_result" 602 | } 603 | ], 604 | "source": [ 605 | "clf.support_" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": 49, 611 | "metadata": {}, 612 | "outputs": [ 613 | { 614 | "name": "stdout", 615 | "output_type": "stream", 616 | "text": [ 617 | " (0, 531)\t1\n", 618 | " (0, 1440)\t1\n", 619 | " (0, 2371)\t1\n", 620 | " (0, 2769)\t1\n", 621 | " (0, 2775)\t2\n", 622 | " (0, 2780)\t1\n", 623 | " (0, 3106)\t1\n", 624 | " (0, 3157)\t1\n", 625 | " (0, 3312)\t1\n", 626 | " (0, 3381)\t1\n", 627 | " (0, 3496)\t1\n", 628 | " (0, 3729)\t1\n", 629 | " (0, 4796)\t1\n", 630 | " (0, 4864)\t1\n", 631 | " (0, 4964)\t1\n", 632 | " (0, 5021)\t1\n", 633 | " (0, 5059)\t1\n", 634 | " (0, 5092)\t1\n", 635 | " (0, 5156)\t2\n", 636 | " (0, 5638)\t1\n" 637 | ] 638 | } 639 | ], 640 | "source": [ 641 | "print(train_features[8])" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "## Non-linear SVM" 649 | ] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "metadata": {}, 654 | "source": [ 655 | "We can also check different kernel types, with rbf being gaussian and sigmoid being similar to the sigmoid function in logistic regression. 
A visualization is simplest to understand below:" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": 30, 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "clf = svm.SVC(kernel='rbf') \n", 665 | "clf.fit(train_features, train['Sentiment'])\n", 666 | "\n", 667 | "y_pred = clf.predict(test_features) " 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 31, 673 | "metadata": {}, 674 | "outputs": [ 675 | { 676 | "name": "stdout", 677 | "output_type": "stream", 678 | "text": [ 679 | "[[ 0 52 0]\n", 680 | " [ 0 397 0]\n", 681 | " [ 0 200 0]]\n", 682 | " precision recall f1-score support\n", 683 | "\n", 684 | " -1 0.00 0.00 0.00 52\n", 685 | " 0 0.61 1.00 0.76 397\n", 686 | " 1 0.00 0.00 0.00 200\n", 687 | "\n", 688 | "avg / total 0.37 0.61 0.46 649\n", 689 | "\n" 690 | ] 691 | }, 692 | { 693 | "name": "stderr", 694 | "output_type": "stream", 695 | "text": [ 696 | "C:\\Users\\Lina\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", 697 | " 'precision', 'predicted', average, warn_for)\n" 698 | ] 699 | } 700 | ], 701 | "source": [ 702 | "print(confusion_matrix(test['Sentiment'],y_pred)) \n", 703 | "print(classification_report(test['Sentiment'],y_pred)) " 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 34, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "clf = svm.SVC(kernel='sigmoid') \n", 713 | "clf.fit(train_features, train['Sentiment'])\n", 714 | "\n", 715 | "y_pred = clf.predict(test_features) " 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 35, 721 | "metadata": {}, 722 | "outputs": [ 723 | { 724 | "name": "stdout", 725 | "output_type": "stream", 726 | "text": [ 727 | "[[ 0 52 0]\n", 728 | " [ 0 397 0]\n", 729 | " [ 0 200 0]]\n", 730 | " precision recall f1-score support\n", 731 | "\n", 732 | " -1 0.00 0.00 0.00 52\n", 733 | " 0 0.61 1.00 0.76 397\n", 734 | " 1 0.00 0.00 0.00 200\n", 735 | "\n", 736 | "avg / total 0.37 0.61 0.46 649\n", 737 | "\n" 738 | ] 739 | }, 740 | { 741 | "name": "stderr", 742 | "output_type": "stream", 743 | "text": [ 744 | "C:\\Users\\Lina\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", 745 | " 'precision', 'predicted', average, warn_for)\n" 746 | ] 747 | } 748 | ], 749 | "source": [ 750 | "print(confusion_matrix(test['Sentiment'],y_pred)) \n", 751 | "print(classification_report(test['Sentiment'],y_pred)) " 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "It looks like the linear SVM performs best on this model from both a precision and recall perspective. Remember that precision are the accuracy of the prediction and recall is how much of the true positive space we are capturing. \n", 759 | "\n", 760 | "What does this mean about our underlying data?" 
761 | ] 762 | }, 763 | { 764 | "cell_type": "markdown", 765 | "metadata": {}, 766 | "source": [ 767 | "References\n", 768 | "-------\n", 769 | "\n", 770 | "- https://stackabuse.com/implementing-svm-and-kernel-svm-with-pythons-scikit-learn/\n", 771 | "- https://jakevdp.github.io/PythonDataScienceHandbook/05.07-support-vector-machines.html\n", 772 | "- https://gist.github.com/WittmannF/60680723ed8dd0cb993051a7448f7805" 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "
\n", 780 | "
\n", 781 | "
\n", 782 | "\n", 783 | "----" 784 | ] 785 | } 786 | ], 787 | "metadata": { 788 | "kernelspec": { 789 | "display_name": "Python 3", 790 | "language": "python", 791 | "name": "python3" 792 | }, 793 | "language_info": { 794 | "codemirror_mode": { 795 | "name": "ipython", 796 | "version": 3 797 | }, 798 | "file_extension": ".py", 799 | "mimetype": "text/x-python", 800 | "name": "python", 801 | "nbconvert_exporter": "python", 802 | "pygments_lexer": "ipython3", 803 | "version": "3.7.7" 804 | }, 805 | "toc": { 806 | "base_numbering": 1, 807 | "nav_menu": {}, 808 | "number_sections": false, 809 | "sideBar": false, 810 | "skip_h1_title": false, 811 | "title_cell": "Table of Contents", 812 | "title_sidebar": "Contents", 813 | "toc_cell": true, 814 | "toc_position": {}, 815 | "toc_section_display": true, 816 | "toc_window_display": false 817 | } 818 | }, 819 | "nbformat": 4, 820 | "nbformat_minor": 2 821 | } 822 | -------------------------------------------------------------------------------- /4_module_classification/README.md: -------------------------------------------------------------------------------- 1 | # Module 4: Classification 2 | ------ 3 | 4 | ![](images/intro_to_ml.png) -------------------------------------------------------------------------------- /4_module_classification/images/intro_to_ml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/4_module_classification/images/intro_to_ml.png -------------------------------------------------------------------------------- /5_module_decision_trees/README.md: -------------------------------------------------------------------------------- 1 | Module 5: Decision Trees and Random Forests 2 | ====== 3 | 4 | Welcome! We'll be exploring Decision Trees and Random Forests - a very powerful way to model data! 5 | 6 | Topic overview 7 | ---- 8 | 9 | A Decision Tree is a very powerful model which can be used alone or as the basis for other powerful models such as Random Forest and Gradient Boosting. At it's simplest, a decision tree asks a series of questions about the features to predict what the outcome should be. Decision Trees also have the added advantage that they can be used for both regression and classification. 10 | 11 | A singular decision tree has the tendency to overfit on training data and to counter act this, Bagging (or Boostrap aggregating) is used. Bagging is an **ensemble approach** where N random subsamples of the dataset are made using selection with replacement and individual decision trees are trained on each subsample. Then the final prediction is the average of all predictions from the N decisions trees. 12 | 13 | 14 | 15 | This is improved upon further by limiting the feature considered at each split to a random subset of features. This is known as a Random Forest. 16 | 17 | In this module, we will work our way incrementally from Decision Trees, though Bagging to Random Forests and evaluating the performance at each step. We will also look into the different parameters for each of these models and investigate which features are the most important. 18 | 19 | Resources 20 | ---- 21 | 22 | Firstly, refer to your lecture notes as they will explain the fundamentals covered here in reference to the Kiva dataset we are using! 23 | For additional help, we find the following resources to be very useful! 
24 | - [Visual Intro to Machine Learning](http://www.r2d3.us/visual-intro-to-machine-learning-part-1/): 25 | This is an awesome visualization of how a decision tree works step by step. Take the time to go through this and you should have a good fundamental understanding of whats happening under the hood! 26 | - [A complete tutorial on tree based modeling](https://www.analyticsvidhya.com/blog/2016/04/complete-tutorial-tree-based-modeling-scratch-in-python/): A comprehensive tutorial covering the hows and whys of using tree based models including decision trees, bagging, random forest and boosting. 27 | 28 | Advanced topics 29 | ---- 30 | 31 | ### Gradient Tree Boosting 32 | Gradient Tree Boosting is an alternative method of using decision trees which lowers the variance and bias. Unlike the Random Forest algorithm which trains multiple decision trees independently and then averages the result. Boosting works by incrementally growing multiple trees, where each tree is trained on the errors from the previous tree. 33 | 34 | For more information checkout these resources: 35 | 36 | - [An Introduction to Statistical Learning, Chapter 8.2.3](http://www-bcf.usc.edu/%7Egareth/ISL/ISLR%20Sixth%20Printing.pdf): Following on from Decision Trees and Random Forests, the chapter on Boosting discussed this model in an academic and tree-model context. 37 | - [A kaggle master explains gradient boosting](http://blog.kaggle.com/2017/01/23/a-kaggle-master-explains-gradient-boosting/): A fun and easy to read explanation of how gradient boosting works and why it is so great! 38 | - [A guide to gradient boosting trees with XGBoost in Python](https://jessesw.com/XG-Boost/): A comprehensive tutorial using XGBoost for income classification. A good opportunity to brush up on python and EDA skills too! 39 | 40 | -------------------------------------------------------------------------------- /5_module_decision_trees/images/DecisionTreeExample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/5_module_decision_trees/images/DecisionTreeExample.png -------------------------------------------------------------------------------- /5_module_decision_trees/images/bagging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/5_module_decision_trees/images/bagging.png -------------------------------------------------------------------------------- /6_module_unsupervised_learning/README.md: -------------------------------------------------------------------------------- 1 | Module 6: Introduction to Unsupervised Learning 2 | ====== 3 | 4 | Welcome! We are going to delve into the topic of unsupervised learning! 5 | 6 | Goals 7 | --- 8 | 1. Give a general introduction to unsupervised learning. 9 | 2. Use k-means clustering as unsupervised learning technique. 10 | 3. Find clusters with k-means algorithm. 11 | 4. Evaluate our results with the Elbow method. 12 | 13 | Topic overview 14 | ---- 15 | Unsupervised Learning is the process of identifying patterns in a dataset. Identifying patterns is often an early step in understanding data. Unsupervised learning methods are a set of techniques designed to explore and find "hidden structure" rather than predict outcomes. 
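To make that idea concrete before going further, here is a minimal sketch of k-means with the Elbow method from the goals above (synthetic data stands in for the Kiva features used in the notebook):

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Synthetic unlabeled data standing in for the Kiva features
X, _ = make_blobs(n_samples=300, centers=4, random_state=42)

# Fit k-means for several values of k and record the inertia
# (within-cluster sum of squares), the quantity the Elbow method plots.
inertias = []
for k in range(1, 9):
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    inertias.append(km.inertia_)

print(inertias)  # the "elbow" where the drop flattens suggests a good k
```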
16 | 17 | Unsupervised learning does not require labeled data and therefore works for a broader range of data. In fact, most data in the world is unlabeled. However, since there are no labels or "correct answers", there is not always clear feedback to validate that the results are correct. 18 | 19 | There are two main techniques in the domain of unsupervised learning: 20 | 21 | **Dimensionality Reduction** 22 | Some datasets have too many features, causing problems with over-fitting, slow model fitting times and issues with metric interpretability (look up the Curse of Dimensionality!). For this reason, we look for methods to reduce the number of features used to train the model while maintaining most of the variance/signal in the data. 23 | 24 | **Clustering** 25 | Clustering is relatively self-explanatory. These are methods which divide the dataset into subgroups based on similar characteristics. These sub-groups can then be used in further supervised learning algorithms, or act as an intuitive way to understand the natural subsets in your dataset. Clustering is sometimes referred to as data segmentation or data partitioning. 26 | 27 | In this module, we will focus on the K-Means Clustering algorithm, how it works and how to evaluate its performance. 28 | 29 | Resources 30 | ---- 31 | 32 | Firstly, refer to your lecture notes as they will explain the fundamentals covered here in reference to the Kiva dataset we are using! 33 | For additional help, we find the following resources to be very useful. 34 | 35 | - [K-Means handout from Stanford](http://stanford.edu/~cpiech/cs221/handouts/kmeans.html/): 36 | From the computer science course at Stanford University, this is a handout giving an overview of the k-means algorithm with sample code, and it provides a bit more detail on how clustering can be improved. 37 | - [Interactive introduction to dimensionality reduction](https://www.kaggle.com/arthurtok/interactive-intro-to-dimensionality-reduction): A comprehensive introduction from Kaggle to three dimensionality reduction methods: PCA, LDA and t-SNE. Interactive examples with code are provided so that you can see the impact of these methods on the features. 38 | 39 | 40 | Advanced topics 41 | ---- 42 | 43 | ### Hierarchical Clustering 44 | Hierarchical Clustering is a more complex method to cluster data points and evaluate the clusters. Unlike K-Means, we do not need to specify the number of clusters to look for in hierarchical clustering. The algorithm incrementally groups data points together into clusters, starting with every data point as its own cluster, until all the data is in a single cluster. The results can be displayed in a diagram called a dendrogram, which allows us to evaluate the possible combinations of clusters. 45 | 46 | For more information check out these resources: 47 | 48 | - [Hierarchical Clustering Analysis](https://afit-r.github.io/hc_clustering): A thorough introduction to Hierarchical Clustering with examples in R. Although it does not use Python for the analysis, this is an excellent resource to understand the underlying principles of hierarchical clustering and to become familiar with dendrograms. 49 | - [Unsupervised Machine Learning: Hierarchical Clustering](https://pythonprogramming.net/hierarchical-clustering-machine-learning-python-scikit-learn/): Although the text is a bit dense and focuses more on the context for hierarchical clustering, this article provides great examples using Python and the scikit-learn library. 
50 | - [Hierarchical Clustering Analysis](http://84.89.132.1/~michael/stanford/maeb7.pdf): From Stanford Universities, a step by step pdf guide to hierarchical clustering, covering how it works, how to find the 'right' number of clusters and evaluating the validity of the clusters. -------------------------------------------------------------------------------- /6_module_unsupervised_learning/images/clustering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/6_module_unsupervised_learning/images/clustering.png -------------------------------------------------------------------------------- /6_module_unsupervised_learning/images/k_means.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/6_module_unsupervised_learning/images/k_means.png -------------------------------------------------------------------------------- /7_module_advanced_topics/7_2_image_processing_with_keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "
Table of Contents
\n", 10 | "" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "__Disclaimer__:\n", 18 | "\n", 19 | "This lesson could be significantly improved. It does not run as is." 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Image processing with Keras\n", 27 | "\n", 28 | "Keras is a deep learning library build on top of TensorFlow. We can use it to process our image data to arrays. Often times, we use deep learning to do image processing. In this example, I will use naive bayes to later prove how deep learning will do much better than naive bayes." 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "### Get images" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 9, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "ename": "WebDriverException", 45 | "evalue": "Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home\n", 46 | "output_type": "error", 47 | "traceback": [ 48 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 49 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 50 | "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mcmd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommand_line_args\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m self.process = subprocess.Popen(cmd, env=self.env,\n\u001b[0m\u001b[1;32m 73\u001b[0m \u001b[0mclose_fds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mplatform\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'Windows'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 51 | "\u001b[0;32m~/anaconda3/lib/python3.8/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)\u001b[0m\n\u001b[1;32m 857\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 858\u001b[0;31m self._execute_child(args, executable, preexec_fn, close_fds,\n\u001b[0m\u001b[1;32m 859\u001b[0m \u001b[0mpass_fds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcwd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 52 | "\u001b[0;32m~/anaconda3/lib/python3.8/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)\u001b[0m\n\u001b[1;32m 1705\u001b[0m \u001b[0merr_msg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrerror\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno_num\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1706\u001b[0;31m \u001b[0;32mraise\u001b[0m 
\u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno_num\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_msg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1707\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_msg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 53 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'chromedriver'", 54 | "\nDuring handling of the above exception, another exception occurred:\n", 55 | "\u001b[0;31mWebDriverException\u001b[0m Traceback (most recent call last)", 56 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0murl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"https://www.google.co.in/search?q=\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0msearchterm\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"&source=lnms&tbm=isch\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mbrowser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mChrome\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Make sure ChromeDriver is intalled https://chromedriver.chromium.org/getting-started\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0mbrowser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 57 | "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/selenium/webdriver/chrome/webdriver.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, executable_path, port, options, service_args, desired_capabilities, service_log_path, chrome_options, keep_alive)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mservice_args\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mservice_args\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m log_path=service_log_path)\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 58 | "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mENOENT\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 81\u001b[0;31m raise WebDriverException(\n\u001b[0m\u001b[1;32m 82\u001b[0m \"'%s' executable needs to be in PATH. 
%s\" % (\n\u001b[1;32m 83\u001b[0m os.path.basename(self.path), self.start_error_message)\n", 59 | "\u001b[0;31mWebDriverException\u001b[0m: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "from selenium import webdriver\n", 65 | "import os\n", 66 | "import time\n", 67 | "import requests\n", 68 | "\n", 69 | "# Set up Google search url with term\n", 70 | "searchterm = 'memes'\n", 71 | "url = \"https://www.google.co.in/search?q=\"+searchterm+\"&source=lnms&tbm=isch\"\n", 72 | "\n", 73 | "browser = webdriver.Chrome() # Make sure ChromeDriver is intalled https://chromedriver.chromium.org/getting-started\n", 74 | "browser.get(url)\n", 75 | "\n", 76 | "browser.execute_script(\"window.scrollBy(0,10000)\")\n", 77 | "\n", 78 | "elements = browser.find_elements_by_class_name('rg_i')\n", 79 | "print(len(elements))\n", 80 | "\n", 81 | "# Set up variable to count successful downloads\n", 82 | "counter = 0\n", 83 | "succounter = 0\n", 84 | "\n", 85 | "# Makes the folder if it doesn't already exist\n", 86 | "if not os.path.exists(searchterm):\n", 87 | " os.mkdir(searchterm)\n", 88 | "\n", 89 | "for x in elements:\n", 90 | "\tx.click()\n", 91 | "\ttime.sleep(1)\n", 92 | "\telement = browser.find_elements_by_class_name('v4dQwb')\n", 93 | "\n", 94 | "\tprint(\"Total Count:\", counter)\n", 95 | "\tprint(\"Succsessful Count:\", succounter)\n", 96 | "\t\n", 97 | "\tif counter == 0:\n", 98 | "\t\timg = element[0].find_element_by_class_name('n3VNCb')\n", 99 | "\telse:\n", 100 | "\t\timg = element[1].find_element_by_class_name('n3VNCb')\n", 101 | "\n", 102 | "\t# Saves the image\n", 103 | "\ttry:\n", 104 | "\n", 105 | "\t\tr = requests.get(img.get_attribute(\"src\"))\n", 106 | "\t\t\n", 107 | "\t\tif r.status_code == 200:\n", 108 | "\t\t\twith open(searchterm+\"/image_\"+str(counter)+\".png\", 'wb') as f:\n", 109 | "\t\t\t\tf.write(r.content)\n", 110 | "\t\t\t\n", 111 | "\t\tsuccounter = succounter + 1\n", 112 | "\t\t\n", 113 | "\texcept Exception as e:\n", 114 | "\t\tprint(\"could not load : \"+img)\n", 115 | "\t\tprint(e)\n", 116 | "\n", 117 | "\tcounter = counter + 1\n", 118 | "\t \n", 119 | "print(succounter, \"pictures succesfully downloaded\")\n", 120 | "browser.close()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "import os\n", 130 | "from subprocess import check_output\n", 131 | "import sys\n", 132 | "from time import time, sleep\n", 133 | "\n", 134 | "import numpy as np \n", 135 | "import pandas as pd \n", 136 | "import seaborn as sns\n", 137 | "\n", 138 | "from IPython.display import display\n", 139 | "from IPython.display import Image as _Imgdis\n", 140 | "from PIL import Image\n", 141 | "from scipy import ndimage\n", 142 | "\n", 143 | "from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img\n", 144 | "\n", 145 | "from sklearn.model_selection import train_test_split\n", 146 | "from sklearn.naive_bayes import MultinomialNB" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "### Proccess images as arrays" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 10, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "ename": "FileNotFoundError", 163 | "evalue": "[Errno 2] No such file or directory: 'memes'", 164 | "output_type": "error", 165 | "traceback": [ 166 | 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 167 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 168 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# Create a list of files in the folder specified\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mmeme_files\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mf\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlistdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfolder_1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misfile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfolder_1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Working with {0} images\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmeme_files\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 169 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'memes'" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "# Create a dataset of memes\n", 175 | "folder_1 = \"memes\"\n", 176 | "\n", 177 | "# Create a list of files in the folder specified\n", 178 | "meme_files = [f for f in os.listdir(folder_1) if os.path.isfile(os.path.join(folder_1, f))]\n", 179 | "\n", 180 | "print(\"Working with {0} images\".format(len(meme_files)))\n", 181 | "print(\"Image examples: \")\n", 182 | "\n", 183 | "# Print two examples using display(_Imgdis()), which can read the image files\n", 184 | "for i in range(150, 152):\n", 185 | " print(meme_files[i])\n", 186 | " display(_Imgdis(filename=folder_1 + \"/\" + meme_files[i], width=240, height=320))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "# Create also a dataset of non-memes, pulled from this dataset: \n", 196 | "# http://press.liacs.nl/mirflickr/mirdownload.html\n", 197 | "\n", 198 | "folder_0 = \"non_memes\"\n", 199 | "\n", 200 | "# Create a list of files in the folder specified\n", 201 | "non_meme_files = [f for f in os.listdir(folder_0) if os.path.isfile(os.path.join(folder_0, f))]\n", 202 | "\n", 203 | "print(\"Working with {0} images\".format(len(non_meme_files)))\n", 204 | "print(\"Image examples: \")\n", 205 | "\n", 206 | "# Print two examples using display(_Imgdis()), which can read the image files\n", 207 | "for i in range(150, 152):\n", 208 | " print(non_meme_files[i])\n", 209 | " display(_Imgdis(filename=folder_0 + \"/\" + non_meme_files[i], width=240, height=320))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "# Prepare 
arrays to hold the data saved by the image processing loops below\n", 219 | "y_data = [1]*len(meme_files) + [0]*len(non_meme_files)\n", 220 | "meme_colors = np.ndarray(shape = (len(meme_files), 3), dtype=np.float32)\n", 221 | "non_meme_colors = np.ndarray(shape = (len(non_meme_files), 3), dtype=np.float32)\n", 222 | "image_size_areas = []\n", 223 | "\n", 224 | "# Dimensions to standardize the images to\n", 225 | "image_height = 120\n", 226 | "image_width = 160\n", 227 | "channels = 3\n", 228 | "\n", 229 | "# Make a 4-D array with 3 layers (RGB channels) per image\n", 230 | "dataset = np.ndarray(shape=(len(y_data), channels, image_height, image_width), dtype=np.float32)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# Add an array of each meme image to our dataset (note this code can be improved by creating a read_image function instead of repeating the for loop twice)\n", 240 | "i = 0\n", 241 | "for j in range(len(meme_files)):\n", 242 | " img = load_img(folder_1 + \"/\" + meme_files[j]) # this is a PIL image\n", 243 | " # Save initial dimensions before resizing\n", 244 | " image_size_areas.append(img.size[0] * img.size[1])\n", 245 | " img = img.resize((image_width, image_height)) # PIL expects (width, height)\n", 246 | " # Convert to numpy array and save per-channel color sums\n", 247 | " x = img_to_array(img)\n", 248 | " meme_colors[j] = [x[:, :, 0].sum(), x[:, :, 1].sum(), x[:, :, 2].sum()]\n", 249 | " x = x.transpose((2, 0, 1)) # (height, width, channels) -> (channels, height, width)\n", 250 | " try:\n", 251 | " dataset[i] = x\n", 252 | " i += 1\n", 253 | " if i % 250 == 0:\n", 254 | " print(\"%d images to array\" % i)\n", 255 | " except Exception as e:\n", 256 | " i += 1\n", 257 | " print(\"failed on %d\" %i, e)\n", 258 | "\n", 259 | "# Add an array of each non-meme image to our dataset\n", 260 | "for k in range(len(non_meme_files)):\n", 261 | " img = load_img(folder_0 + \"/\" + non_meme_files[k]) # this is a PIL image\n", 262 | " # Save initial dimensions before resizing\n", 263 | " image_size_areas.append(img.size[0] * img.size[1])\n", 264 | " img = img.resize((image_width, image_height)) # PIL expects (width, height)\n", 265 | " # Convert to numpy array and save per-channel color sums\n", 266 | " x = img_to_array(img)\n", 267 | " non_meme_colors[k] = [x[:, :, 0].sum(), x[:, :, 1].sum(), x[:, :, 2].sum()]\n", 268 | " x = x.transpose((2, 0, 1)) # (height, width, channels) -> (channels, height, width)\n", 269 | " try:\n", 270 | " dataset[i] = x\n", 271 | " i += 1\n", 272 | " if i % 250 == 0:\n", 273 | " print(\"%d images to array\" %i)\n", 274 | " except Exception as e:\n", 275 | " i += 1\n", 276 | " print(\"failed on %d\" %i, e)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "### Exploratory analysis" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "# Determine the meme versus non-meme split\n", 293 | "print(len(y_data))\n", 294 | "print(\"memes:\", sum(y_data)/len(y_data), \"non-memes:\", (len(y_data)-sum(y_data))/len(y_data))" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "# Plot the distribution of sizes before the images were resized\n", 304 | "pd.Series(data = image_size_areas).hist()" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "# Plot a histogram of colors for memes\n", 314 | 
"sns.distplot(meme_colors[:,0], color = 'r')\n", 315 | "sns.distplot(meme_colors[:,1], color = 'g')\n", 316 | "sns.distplot(meme_colors[:,2], color = 'b')" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "# Plot a histogram of colors for non-memes\n", 326 | "sns.distplot(non_meme_colors[:,0], color = 'r')\n", 327 | "sns.distplot(non_meme_colors[:,1], color = 'g')\n", 328 | "sns.distplot(non_meme_colors[:,2], color = 'b')" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "### Build the model" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "#Model will expect a 2-D array, so we can flatten a 4-D array to a 2-D one\n", 345 | "dataset_flattened = dataset.reshape(len(y_data) * channels, image_height * image_width)\n", 346 | "y_data_flattened = [1]*len(meme_files)*3 + [0]*len(non_meme_files)*3" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "print(len(dataset_flattened), len(dataset)*3)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "#Split the array data into train and test sets\n", 365 | "X_train, X_test, y_train, y_test = train_test_split(dataset_flattened, y_data_flattened, test_size=0.2, random_state=33)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "#Test the randomness of the y_train and y_test set\n", 375 | "print(sum(y_train)/len(y_train), sum(y_test)/len(y_test))" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "#Train your data set using multinomial NB from sklearn library\n", 385 | "nb = MultinomialNB()\n", 386 | "nb.fit(X_train, y_train)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "#Test your data set on your test data\n", 396 | "preds = nb.predict(X_test)\n", 397 | "\n", 398 | "#Print the accuracy of your model\n", 399 | "accuracy = (preds == y_test)\n", 400 | "'Accuracy : {:.2%}'.format(accuracy.sum() / len(accuracy))" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "This is pretty bad performance, considering the accuracy by assigning every picture to a meme would be ~40%." 
408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "Source: https://www.kaggle.com/lgmoneda/from-image-files-to-numpy-arrays" 415 | ] 416 | } 417 | ], 418 | "metadata": { 419 | "kernelspec": { 420 | "display_name": "Python 3", 421 | "language": "python", 422 | "name": "python3" 423 | }, 424 | "language_info": { 425 | "codemirror_mode": { 426 | "name": "ipython", 427 | "version": 3 428 | }, 429 | "file_extension": ".py", 430 | "mimetype": "text/x-python", 431 | "name": "python", 432 | "nbconvert_exporter": "python", 433 | "pygments_lexer": "ipython3", 434 | "version": "3.8.8" 435 | }, 436 | "toc": { 437 | "base_numbering": 1, 438 | "nav_menu": {}, 439 | "number_sections": false, 440 | "sideBar": false, 441 | "skip_h1_title": false, 442 | "title_cell": "Table of Contents", 443 | "title_sidebar": "Contents", 444 | "toc_cell": true, 445 | "toc_position": {}, 446 | "toc_section_display": true, 447 | "toc_window_display": false 448 | } 449 | }, 450 | "nbformat": 4, 451 | "nbformat_minor": 2 452 | } 453 | -------------------------------------------------------------------------------- /7_module_advanced_topics/get_more_100_pictures.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.keys import Keys 3 | import shutil 4 | import requests 5 | 6 | import json 7 | import os 8 | import argparse 9 | 10 | # Set up Google search url with term 11 | searchterm = 'memes' 12 | url = "https://www.google.co.in/search?q="+searchterm+"&source=lnms&tbm=isch" 13 | 14 | # Need to download Chromedriver, insert path to chromedriver inside parentheses in following line 15 | browser = webdriver.Chrome('C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe') 16 | browser.get(url) 17 | header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"} 18 | 19 | # Set up variable to count successful downloads 20 | counter = 0 21 | succounter = 0 22 | 23 | # Makes the folder if it doesn't already exist 24 | if not os.path.exists(searchterm): 25 | os.mkdir(searchterm) 26 | 27 | for _ in range(2000): 28 | # Scrolls the window for us 29 | browser.execute_script("window.scrollBy(0,10000)") 30 | 31 | # Find all the metadata divs that contain links to pictures 32 | for x in browser.find_elements_by_xpath('//div[contains(@class,"rg_meta")]'): 33 | counter = counter + 1 34 | print("Total Count:", counter) 35 | print("Successful Count:", succounter) 36 | 37 | img = json.loads(x.get_attribute('innerHTML'))["ou"] 38 | imgtype = json.loads(x.get_attribute('innerHTML'))["ity"] 39 | print("URL:",img, imgtype) 40 | 41 | # Saves the image 42 | try: 43 | r = requests.get(img, stream=True, headers={'User-agent': 'Mozilla/5.0'}) 44 | if r.status_code == 200: 45 | with open(searchterm+"/image_"+str(counter)+".png", 'wb') as f: 46 | r.raw.decode_content = True 47 | shutil.copyfileobj(r.raw, f) 48 | succounter = succounter + 1 49 | except Exception as e: 50 | print("could not load : "+img) 51 | print(e) 52 | 53 | print(succounter, "pictures successfully downloaded") 54 | browser.close() -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | Delta Analytics Code of Conduct 2 | ----- 3 | 4 | [Delta Analytics](http://www.deltanalytics.org/) has two parallel goals. 
First, we bridge the skill gap faced by non-profits by providing free data consulting. Second, we build technical capacity in communities around the world to help democratize access to machine learning and data tools. We are a non-profit run entirely without any full-time staff; instead, our work is possible because of the volunteer efforts of a community of data professionals. 5 | We value the participation of each member of the Delta Analytics community and want all our members to have an enjoyable and fulfilling experience. Accordingly, all members are expected to show respect, dignity, and courtesy to others. 6 | To make clear what is expected, all people affiliated in any way with Delta Analytics as a teaching or data fellow are required to conform to the following Code of Conduct. Organizers will enforce this code of conduct at all times. 7 | 8 | Our Standards 9 | ----- 10 | 11 | Delta Analytics is dedicated to providing a welcoming experience for everyone, regardless of age, gender identity and expression, sexual orientation, disability, physical appearance, body size, ethnicity, nationality, race, or religion (or lack thereof), or socio-economic status. We believe that all members of Delta Analytics are entitled to interact in an environment that is free of harassment, bullying, and discrimination. 12 | We encourage all of our members to contribute to creating a welcoming environment by: 13 | 14 | - Being kind to others 15 | - Behaving professionally 16 | - Using welcoming and inclusive language 17 | - Being respectful of differing viewpoints and experiences 18 | - Focusing on what is best for the community 19 | - Showing empathy towards other community members 20 | - Making sure that the contributions of others are recognized and there is fair and consistent attribution of credit. 21 | 22 | Examples of unacceptable behavior by participants include: 23 | 24 | - Divulging sensitive or confidential data and information 25 | - Harassment of anyone in any form, including: 26 | - Bullying or other actions that create an intimidating, humiliating or uncomfortable environment 27 | - Violent threats or language directed against another person 28 | - Unwelcome sexual attention or advances; sexual language and imagery; the display or circulation of offensive, derogatory or sexually explicit pictures or other materials; repeated, unwanted, and harassing attention or stalking 29 | - Unwelcome physical contact 30 | - Insults or put-downs 31 | - Sexist, racist, homophobic, transphobic, ableist, or exclusionary jokes 32 | - Sustained disruption of talks or other events 33 | - Other conduct that is inappropriate for a professional audience with diverse backgrounds 34 | 35 | Participants asked to stop any inappropriate behavior are expected to comply with the aforementioned guidelines immediately. 36 | 37 | Thank you for helping make this a welcoming, friendly community for all. 38 | 39 | Procedure For A Violating Incident 40 | ------ 41 | 42 | If you believe that you have been treated contrary to this Code of Conduct, you should do the following: If you feel safe, approach the person directly and ask that the behavior stop. If that does not resolve the incident to your satisfaction or you feel uncomfortable speaking with the person directly, please raise it immediately with a member of the [leadership team](http://www.deltanalytics.org/leadership-team.html), in person or electronically. 
If possible, document the incident by including dates, times, places, names of individuals involved, and any witnesses. 43 | 44 | In response, members of Delta Analytics’s leadership team will promptly: 45 | 46 | - Meet with each person involved individually and privately. 47 | - The leadership team will exercise its best efforts to keep these conversations confidential and will not divulge sensitive information to other members of the Delta community unless it receives permission from all direct parties involved to do so. 48 | - Decide the appropriate course of action and communicate it transparently to the parties involved. 49 | - Maintain a private record of the incident, in order to identify and prevent undesirable repeated behavior. 50 | 51 | If a participant engages in behavior that violates this code of conduct, Delta Analytics may take any action it deems appropriate, including warning the offender, expelling the offender from an event, or permanently banning the offender from the group after the offender has had an opportunity to be heard in front of the leadership team. 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. 
If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. 
Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. 
Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. 
for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 
337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public 379 | licenses. Notwithstanding, Creative Commons may elect to apply one of 380 | its public licenses to material it publishes and in those instances 381 | will be considered the “Licensor.” The text of the Creative Commons 382 | public licenses is dedicated to the public domain under the CC0 Public 383 | Domain Dedication. Except for the limited purpose of indicating that 384 | material is shared under a Creative Commons public license or as 385 | otherwise permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the 393 | public licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. 396 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Introduction to Machine Learning for Good 2 | ==== 3 | 4 | [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/DeltaAnalytics/machine_learning_for_good/master) 5 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DeltaAnalytics/machine_learning_for_good) 6 | 7 |
8 | 9 | How can we use data for social impact? 10 | ------ 11 | 12 | Data is powerful. We believe that anyone can harness that power for change. 13 | 14 | In this introductory course, students will learn the foundational theory and the necessary coding skills to translate data into actionable insights. They will also learn the latest machine learning tools and algorithms. 15 | 16 | Data science is a highly interdisciplinary practice, demanding critical thinking, an understanding of statistics, and technical coding ability. Irresponsible application of powerful algorithms or an inadequate exploration of underlying assumptions can lead to spurious results. In this course, we emphasize the fundamentals of proper data science and expose students to what is possible using sophisticated machine learning methods. 17 | 18 | Each module is hands-on and project-based, using real-world data from [KIVA](https://www.kiva.org/), a non-profit that connects people through lending to alleviate poverty. 19 | 20 | 21 | Who We Are 22 | ------ 23 | 24 | [Delta Analytics](http://www.deltanalytics.org/) is a 501(c)3 San Francisco Bay Area non-profit dedicated to bringing rigorous data science to problem-solving, effecting change in nonprofits and the public sector, and making data science an accessible and democratic resource for anyone with the same mission. 25 | 26 | 27 | Overview 28 | ---- 29 | 30 | Topics covered in this course include data cleaning, supervised machine learning, and unsupervised machine learning. 31 | 32 | The slides that cover the theory of each topic are available [here](http://www.deltanalytics.org/curriculum.html). We present theory alongside real-life data science examples that open doors for novices and professionals alike to harness the power of data for good. 33 | 34 | Weebly (our website host) has blocked traffic to certain countries. We have submitted numerous complaints, and apologize to students for the inconvenience caused. In the meantime, you can access PDFs of all the course slides below. 35 | 36 | [Module 1 - Introduction to Machine Learning](https://drive.google.com/file/d/1r4SBY6Dm6xjFqLH12tFb-Bf7wbvoIN_C/view?usp=sharing) 37 | 38 | [Module 2 - Machine learning deep dive](https://drive.google.com/file/d/1EZ_xqMaYj77vErVnrQmnFOj-VBEoO5uW/view?usp=sharing) 39 | 40 | [Module 3 - Linear Regression](https://drive.google.com/file/d/1kXbB7fps78xyFYUtmgNlQJJ3LdO0K3TB/view?usp=sharing) 41 | 42 | [Module 4 - Model Selection and Evaluation](https://drive.google.com/file/d/1ESR4U566uPioFCpFOITpuSBaO45MdJ4O/view?usp=sharing) 43 | 44 | [Module 5 - Decision Trees](https://drive.google.com/file/d/1Sd_LN-WE_W3Zo-YZrMBe90H2i4_ieFRs/view?usp=sharing) 45 | 46 | [Module 6 - Ensemble Algorithms](https://drive.google.com/file/d/1g2AT3S5cgu5HjMYt4X-WiVs0RUvI6Z3s/view?usp=sharing) 47 | 48 | [Module 7 - Unsupervised Algorithms](https://drive.google.com/file/d/1YdA-HHYP1V05QgvwLCvfnuuau67Zl38n/view?usp=sharing) 49 | 50 | [Module 8 - Natural Language Processing Pt. 1](https://drive.google.com/file/d/1Y7gOfnPfyCSu1chWEoHMqhgXVI5KZpRx/view?usp=sharing) 51 | 52 | [Module 9 - Natural Language Processing Pt. 2](https://drive.google.com/file/d/1BUJ0FyMzSxCfHNA0AHwBOxjHt7V2FJj8/view?usp=sharing) 53 | 54 | Course outcomes 55 | ---- 56 | 57 | By the end of the course, students will be able to: 58 | 59 | 1. Explain the fundamental statistical and machine learning algorithms that underlie common data science methods. 60 | 1. Write code to clean, process, analyze, and visualize real-world data. 61 | 1. 
Communicate with other data scientists using technical terms. 62 | 63 | Our students 64 | ---- 65 | 66 | The course is intended for any and all individuals interested in harnessing data towards solving problems in their communities. Minimal prior coding or mathematical/statistical experience is expected. Computer proficiency is necessary. 67 | 68 | Our teachers 69 | ----- 70 | 71 | [Delta Teaching Fellows](http://www.deltanalytics.org/teaching-fellows.html) are all data professionals working in the San Francisco Bay Area. All of our time is donated for free to build out a curriculum that makes machine learning tools and knowledge more accessible to communities around the world. You can learn more about our team [here](http://www.deltanalytics.org/teaching-fellows.html). 72 | 73 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | # Create environment: 2 | # $ conda env create --name good --force 3 | name: good 4 | channels: 5 | - conda-forge 6 | dependencies: 7 | - python==3.6 8 | - jupyter 9 | - numpy 10 | - pandas 11 | - requests 12 | - scikit-learn 13 | - scipy 14 | - seaborn 15 | - statsmodels 16 | - pip: 17 | - graphviz 18 | - git+https://github.com/stroxler/batch_nbconvert 19 | - watermark 20 | - wordcloud -------------------------------------------------------------------------------- /images/decision_trees.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/decision_trees.png -------------------------------------------------------------------------------- /images/delta_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/delta_logo.jpg -------------------------------------------------------------------------------- /images/delta_octocat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/delta_octocat.png -------------------------------------------------------------------------------- /images/ensemble_algorithms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/ensemble_algorithms.png -------------------------------------------------------------------------------- /images/introduction_to_machine_learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/introduction_to_machine_learning.png -------------------------------------------------------------------------------- /images/linear_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/linear_regression.png -------------------------------------------------------------------------------- /images/machine_learning_.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/machine_learning_.png -------------------------------------------------------------------------------- /images/model_selection_evaluation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/model_selection_evaluation.png -------------------------------------------------------------------------------- /images/nlp_pt_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/nlp_pt_1.png -------------------------------------------------------------------------------- /images/nlp_pt_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/nlp_pt_2.png -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | 5 | # Create environment: 6 | conda update -n base conda -y 7 | conda env create --name good --force -q 8 | 9 | # Start environment: 10 | source activate good 11 | 12 | # Update environment (might break stuff. move fast!?) 13 | conda update --all --yes 14 | 15 | # Get local copy of data 16 | git clone --single-branch --depth=1 https://github.com/DeltaAnalytics/machine_learning_for_good_data data 17 | 18 | # Setup spell checking and other notebook enhancements 19 | git clone https://github.com/Calysto/notebook-extensions.git 20 | cd notebook-extensions 21 | jupyter nbextension install calysto --user 22 | jupyter nbextension enable calysto/spell-check/main 23 | jupyter nbextension enable calysto/cell-tools/main 24 | jupyter nbextension enable calysto/annotate/main 25 | cd .. && rm -r -f notebook-extensions # step back out of the clone before deleting it 26 | 27 | # Start Jupyter Notebook 28 | if [[ "$1" != "--no-start" ]]; then 29 | jupyter notebook --browser=Chrome 30 | fi 31 | -------------------------------------------------------------------------------- /tests_for_students/MPI_data_poverty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/tests_for_students/MPI_data_poverty.csv -------------------------------------------------------------------------------- /tests_for_students/country_mapper.csv: -------------------------------------------------------------------------------- 1 | location_country_code,ISO country code 2 | AF,AFG 3 | AX,ALA 4 | AL,ALB 5 | DZ,DZA 6 | AS,ASM 7 | AD,AND 8 | AO,AGO 9 | AI,AIA 10 | AQ,ATA 11 | AG,ATG 12 | AR,ARG 13 | AM,ARM 14 | AW,ABW 15 | AU,AUS 16 | AT,AUT 17 | AZ,AZE 18 | BS,BHS 19 | BH,BHR 20 | BD,BGD 21 | BB,BRB 22 | BY,BLR 23 | BE,BEL 24 | BZ,BLZ 25 | BJ,BEN 26 | BM,BMU 27 | BT,BTN 28 | BO,BOL 29 | BA,BIH 30 | BW,BWA 31 | BV,BVT 32 | BR,BRA 33 | VG,VGB 34 | IO,IOT 35 | BN,BRN 36 | BG,BGR 37 | BF,BFA 38 | BI,BDI 39 | KH,KHM 40 | CM,CMR 41 | CA,CAN 42 | CV,CPV 43 | KY,CYM 44 | CF,CAF 45 | TD,TCD 46 | CL,CHL 47 | CN,CHN 48 | HK,HKG 49 | MO,MAC 50 | 
CX,CXR 51 | CC,CCK 52 | CO,COL 53 | KM,COM 54 | CG,COG 55 | CD,COD 56 | CK,COK 57 | CR,CRI 58 | CI,CIV 59 | HR,HRV 60 | CU,CUB 61 | CY,CYP 62 | CZ,CZE 63 | DK,DNK 64 | DJ,DJI 65 | DM,DMA 66 | DO,DOM 67 | EC,ECU 68 | EG,EGY 69 | SV,SLV 70 | GQ,GNQ 71 | ER,ERI 72 | EE,EST 73 | ET,ETH 74 | FK,FLK 75 | FO,FRO 76 | FJ,FJI 77 | FI,FIN 78 | FR,FRA 79 | GF,GUF 80 | PF,PYF 81 | TF,ATF 82 | GA,GAB 83 | GM,GMB 84 | GE,GEO 85 | DE,DEU 86 | GH,GHA 87 | GI,GIB 88 | GR,GRC 89 | GL,GRL 90 | GD,GRD 91 | GP,GLP 92 | GU,GUM 93 | GT,GTM 94 | GG,GGY 95 | GN,GIN 96 | GW,GNB 97 | GY,GUY 98 | HT,HTI 99 | HM,HMD 100 | VA,VAT 101 | HN,HND 102 | HU,HUN 103 | IS,ISL 104 | IN,IND 105 | ID,IDN 106 | IR,IRN 107 | IQ,IRQ 108 | IE,IRL 109 | IM,IMN 110 | IL,ISR 111 | IT,ITA 112 | JM,JAM 113 | JP,JPN 114 | JE,JEY 115 | JO,JOR 116 | KZ,KAZ 117 | KE,KEN 118 | KI,KIR 119 | KP,PRK 120 | KR,KOR 121 | KW,KWT 122 | KG,KGZ 123 | LA,LAO 124 | LV,LVA 125 | LB,LBN 126 | LS,LSO 127 | LR,LBR 128 | LY,LBY 129 | LI,LIE 130 | LT,LTU 131 | LU,LUX 132 | MK,MKD 133 | MG,MDG 134 | MW,MWI 135 | MY,MYS 136 | MV,MDV 137 | ML,MLI 138 | MT,MLT 139 | MH,MHL 140 | MQ,MTQ 141 | MR,MRT 142 | MU,MUS 143 | YT,MYT 144 | MX,MEX 145 | FM,FSM 146 | MD,MDA 147 | MC,MCO 148 | MN,MNG 149 | ME,MNE 150 | MS,MSR 151 | MA,MAR 152 | MZ,MOZ 153 | MM,MMR 154 | NA,NAM 155 | NR,NRU 156 | NP,NPL 157 | NL,NLD 158 | AN,ANT 159 | NC,NCL 160 | NZ,NZL 161 | NI,NIC 162 | NE,NER 163 | NG,NGA 164 | NU,NIU 165 | NF,NFK 166 | MP,MNP 167 | NO,NOR 168 | OM,OMN 169 | PK,PAK 170 | PW,PLW 171 | PS,PSE 172 | PA,PAN 173 | PG,PNG 174 | PY,PRY 175 | PE,PER 176 | PH,PHL 177 | PN,PCN 178 | PL,POL 179 | PT,PRT 180 | PR,PRI 181 | QA,QAT 182 | RE,REU 183 | RO,ROU 184 | RU,RUS 185 | RW,RWA 186 | BL,BLM 187 | SH,SHN 188 | KN,KNA 189 | LC,LCA 190 | MF,MAF 191 | PM,SPM 192 | VC,VCT 193 | WS,WSM 194 | SM,SMR 195 | ST,STP 196 | SA,SAU 197 | SN,SEN 198 | RS,SRB 199 | SC,SYC 200 | SL,SLE 201 | SG,SGP 202 | SK,SVK 203 | SI,SVN 204 | SB,SLB 205 | SO,SOM 206 | ZA,ZAF 207 | GS,SGS 208 | SS,SSD 209 | ES,ESP 210 | LK,LKA 211 | SD,SDN 212 | SR,SUR 213 | SJ,SJM 214 | SZ,SWZ 215 | SE,SWE 216 | CH,CHE 217 | SY,SYR 218 | TW,TWN 219 | TJ,TJK 220 | TZ,TZA 221 | TH,THA 222 | TL,TLS 223 | TG,TGO 224 | TK,TKL 225 | TO,TON 226 | TT,TTO 227 | TN,TUN 228 | TR,TUR 229 | TM,TKM 230 | TC,TCA 231 | TV,TUV 232 | UG,UGA 233 | UA,UKR 234 | AE,ARE 235 | GB,GBR 236 | US,USA 237 | UM,UMI 238 | UY,URY 239 | UZ,UZB 240 | VU,VUT 241 | VE,VEN 242 | VN,VNM 243 | VI,VIR 244 | WF,WLF 245 | EH,ESH 246 | YE,YEM 247 | ZM,ZMB 248 | ZW,ZWE 249 | --------------------------------------------------------------------------------