├── .gitattributes
├── .gitignore
├── 1_module_introduction_pandas
├── 1_1 intro_to_python.ipynb
├── 1_2_intro_to_numpy.ipynb
├── 1_3_intro_to_pandas.ipynb
├── 1_4_loading_and_understanding_data.ipynb
├── 1_5_exploratory_data_analysis.ipynb
├── README.md
├── best_practices_data_science.pdf
├── images
│ ├── Anaconda_1.PNG
│ ├── Anaconda_2.PNG
│ ├── Anaconda_3.PNG
│ ├── Anaconda_4.PNG
│ ├── Anaconda_5.PNG
│ ├── Anaconda_6.PNG
│ ├── Anaconda_7.PNG
│ ├── Anaconda_7_2.PNG
│ ├── Anaconda_8.PNG
│ ├── anaconda_nav.png
│ └── jupyter_notebook.png
├── intro_to_visualization.pptx
└── python_installation_instructions.md
├── 2_module_eda_feature_engineering
├── 2_1_feature_engineering.ipynb
├── README.md
└── images
│ ├── Anaconda_1.PNG
│ ├── Anaconda_2.PNG
│ ├── Anaconda_3.PNG
│ ├── Anaconda_4.PNG
│ ├── Anaconda_5.PNG
│ ├── Anaconda_6.PNG
│ ├── Anaconda_7.PNG
│ ├── Anaconda_7_2.PNG
│ ├── Anaconda_8.PNG
│ ├── anaconda_nav.png
│ └── jupyter_notebook.png
├── 3_module_linear_regression
├── 3_1_linear_regression-build_univariate_model.ipynb
├── 3_2_linear_regression_check_assumptions.ipynb
├── 3_3_linear_regression_build_multivariate_model.ipynb
├── 3_4_polynomial_regression.ipynb
├── 3_5_linear_regression_regularization.ipynb
├── README.md
└── images
│ └── LinearRegression.png
├── 4_module_classification
├── 4_0_twitter_web_scraping.ipynb
├── 4_1_logistic_regression.ipynb
├── 4_2_naive_bayes.ipynb
├── 4_3_naive_bayes_detail.ipynb
├── 4_4_support_vector_machines.ipynb
├── README.md
└── images
│ └── intro_to_ml.png
├── 5_module_decision_trees
├── 5_1_decision_trees.ipynb
├── 5_2_random_forests.ipynb
├── README.md
└── images
│ ├── DecisionTreeExample.png
│ └── bagging.png
├── 6_module_unsupervised_learning
├── 6_1_clustering.ipynb
├── README.md
└── images
│ ├── clustering.png
│ └── k_means.png
├── 7_module_advanced_topics
├── 7_1_sentiment_analysis_details.ipynb
├── 7_2_image_processing_with_keras.ipynb
└── get_more_100_pictures.py
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── environment.yml
├── images
├── decision_trees.png
├── delta_logo.jpg
├── delta_octocat.png
├── ensemble_algorithms.png
├── introduction_to_machine_learning.png
├── linear_regression.png
├── machine_learning_.png
├── model_selection_evaluation.png
├── nlp_pt_1.png
└── nlp_pt_2.png
├── setup.sh
└── tests_for_students
├── MPI_data_poverty.csv
├── country_mapper.csv
├── loans_midterm.csv
└── midterm_part_2.ipynb
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb filter=ipynb_stripout
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Data
2 | /data
3 | /datasets
4 |
5 | # Temp files
6 | notebook-extensions/
7 | Untitled.ipynb
8 |
9 | # Silly mac data
10 | .DS_Store
11 |
12 | # Jupyter Notebook
13 | .ipynb_checkpoints
14 | */.ipynb_checkpoints/*
15 |
16 | # Byte-compiled / optimized / DLL files
17 | __pycache__/
18 | *.py[cod]
19 | *$py.class
20 |
21 | # C extensions
22 | *.so
23 |
24 | # Distribution / packaging
25 | .Python
26 | env/
27 | build/
28 | develop-eggs/
29 | dist/
30 | downloads/
31 | eggs/
32 | .eggs/
33 | lib/
34 | lib64/
35 | parts/
36 | sdist/
37 | var/
38 | *.egg-info/
39 | .installed.cfg
40 | *.egg
41 |
42 | # PyInstaller
43 | # Usually these files are written by a python script from a template
44 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
45 | *.manifest
46 | *.spec
47 |
48 | # Installer logs
49 | pip-log.txt
50 | pip-delete-this-directory.txt
51 |
52 | # Unit test / coverage reports
53 | htmlcov/
54 | .tox/
55 | .coverage
56 | .coverage.*
57 | .cache
58 | nosetests.xml
59 | coverage.xml
60 | *.cover
61 | .hypothesis/
62 |
63 | # Translations
64 | *.mo
65 | *.pot
66 |
67 | # Django stuff:
68 | *.log
69 | local_settings.py
70 |
71 | # Flask stuff:
72 | instance/
73 | .webassets-cache
74 |
75 | # Scrapy stuff:
76 | .scrapy
77 |
78 | # Sphinx documentation
79 | docs/_build/
80 |
81 | # PyBuilder
82 | target/
83 |
84 | # IPython Notebook
85 | .ipynb_checkpoints
86 |
87 | # pyenv
88 | .python-version
89 |
90 | # celery beat schedule file
91 | celerybeat-schedule
92 |
93 | # dotenv
94 | .env
95 |
96 | # virtualenv
97 | venv/
98 | ENV/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 |
103 | # Rope project settings
104 | .ropeproject
105 |
106 |
--------------------------------------------------------------------------------
/1_module_introduction_pandas/1_1 intro_to_python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "toc": true
7 | },
8 | "source": [
9 | "
Table of Contents
\n",
10 | ""
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "Intro To Python\n",
18 | "=====\n",
19 | "\n",
20 | "In this notebook, we will explore basic Python:\n",
21 | "\n",
22 | "- data types, including dictionaries\n",
23 | "- functions \n",
24 | "- loops"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "Please note that we are using Python 3. \n",
32 | "(__NOT__ Python 2! Python 2 has some different functions and syntax)"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 1,
38 | "metadata": {},
39 | "outputs": [
40 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "3\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "# Let's make sure we are using Python 3\n",
50 | "import sys\n",
51 | "print(sys.version[0])"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "# 1. Basic Data Types: Numbers, Booleans, and Strings\n",
59 | "## 1.1 Numbers"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 2,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "name": "stdout",
69 | "output_type": "stream",
70 | "text": [
71 | "<class 'int'>\n",
72 | "5\n"
73 | ]
74 | }
75 | ],
76 | "source": [
77 | "a = 5\n",
78 | "\n",
79 | "# Note: use the `type()` function to get the type of a variable\n",
80 | "# Numbers can be integers ('int'), such as 3, 5 and 3049, or floats\n",
81 | "# ('float'), such as 2.5, 3.1, and 2.34938493\n",
82 | "print(type(a))\n",
83 | "print(a)"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "### Mathematical Operators: +, -, *, /, **\n",
91 | "Mathematical operators allow you to perform math operations on numbers in Python."
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 3,
97 | "metadata": {},
98 | "outputs": [
99 | {
100 | "name": "stdout",
101 | "output_type": "stream",
102 | "text": [
103 | "6\n",
104 | "4\n",
105 | "10\n",
106 | "2.5\n",
107 | "25\n"
108 | ]
109 | }
110 | ],
111 | "source": [
112 | "b = a + 1\n",
113 | "print(b)\n",
114 | "\n",
115 | "c = a - 1\n",
116 | "print(c)\n",
117 | "\n",
118 | "d = a * 2\n",
119 | "print(d)\n",
120 | "\n",
121 | "e = a / 2\n",
122 | "print(e)\n",
123 | "\n",
124 | "# Note: ** is the exponentiation operator\n",
125 | "f = a ** 2\n",
126 | "print(f)"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "### Shorthand mathematical operators\n",
134 | "\n",
135 | "Python has the ability to have a value be added to itself, in shorthand. This is called a \"compound assignment operator\"\n",
136 | "\n",
137 | "The addition version of this is:\n",
138 | "\n",
139 | "* `a += 1` is shorthand for `a = a + 1`\n"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 4,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "6\n"
152 | ]
153 | }
154 | ],
155 | "source": [
156 | "a += 1\n",
157 | "print(a)"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "The multiplication version of a compound assignment operator is:"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 5,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "12\n"
177 | ]
178 | }
179 | ],
180 | "source": [
181 | "a *= 2\n",
182 | "print(a)"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "## 1.2 Booleans & Logic Operators"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 6,
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "name": "stdout",
199 | "output_type": "stream",
200 | "text": [
201 | "<class 'bool'>\n"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "im_true = True\n",
207 | "im_false = False\n",
208 | "\n",
209 | "print(type(im_true))"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "### Equality operators\n",
217 | "Equality operators (== and !=) allow you to compare the values of variables on the left and right hand side."
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 7,
223 | "metadata": {},
224 | "outputs": [
225 | {
226 | "name": "stdout",
227 | "output_type": "stream",
228 | "text": [
229 | "False\n",
230 | "True\n"
231 | ]
232 | }
233 | ],
234 | "source": [
235 | "print(im_true == im_false) # Equality operator\n",
236 | "print(im_true != im_false)"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {},
242 | "source": [
243 | "The `and` operator requires that the values on both sides of the operator are true."
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 8,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | "False\n"
256 | ]
257 | }
258 | ],
259 | "source": [
260 | "print(im_true and im_false)"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
267 | "The `or` operator only requires ONE of the values on either side of the operator to be true."
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 9,
273 | "metadata": {},
274 | "outputs": [
275 | {
276 | "name": "stdout",
277 | "output_type": "stream",
278 | "text": [
279 | "True\n"
280 | ]
281 | }
282 | ],
283 | "source": [
284 | "print(im_true or im_false)"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "## 1.3 Strings\n",
292 | "You can use single or double quotes for strings."
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 10,
298 | "metadata": {},
299 | "outputs": [
300 | {
301 | "name": "stdout",
302 | "output_type": "stream",
303 | "text": [
304 | "delta analytics\n"
305 | ]
306 | }
307 | ],
308 | "source": [
309 | "my_string = 'delta'\n",
310 | "my_other_string = \"analytics\"\n",
311 | "print(my_string, my_other_string)"
312 | ]
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "### String methods\n",
319 | "Concatenating strings:"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 11,
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "name": "stdout",
329 | "output_type": "stream",
330 | "text": [
331 | "delta analytics\n"
332 | ]
333 | }
334 | ],
335 | "source": [
336 | "another_string = '' + my_string + \" \" + my_other_string\n",
337 | "print(another_string)"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {},
343 | "source": [
344 | "Get the length of the string:"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 12,
350 | "metadata": {},
351 | "outputs": [
352 | {
353 | "name": "stdout",
354 | "output_type": "stream",
355 | "text": [
356 | "15\n"
357 | ]
358 | }
359 | ],
360 | "source": [
361 | "print(len(another_string))"
362 | ]
363 | },
364 | {
365 | "cell_type": "markdown",
366 | "metadata": {},
367 | "source": [
368 | "However, there are more ways to work with strings as well! Python has a list of built-in string methods such as `find()`, `startswith()` and `join()`. Check out [here for more information](https://docs.python.org/3/library/stdtypes.html#string-methods)!"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "metadata": {},
374 | "source": [
375 | "# 2. Container Data Types"
376 | ]
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "metadata": {},
381 | "source": [
382 | "## 2.1 Lists\n",
383 | "A Python `list` stores multiple elements, which can be different types"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 13,
389 | "metadata": {},
390 | "outputs": [
391 | {
392 | "name": "stdout",
393 | "output_type": "stream",
394 | "text": [
395 | "['a', 'b', 'c', 3485]\n"
396 | ]
397 | }
398 | ],
399 | "source": [
400 | "my_list = ['a', 'b', 'c', 3485]\n",
401 | "print(my_list)"
402 | ]
403 | },
404 | {
405 | "cell_type": "markdown",
406 | "metadata": {},
407 | "source": [
408 | "You can access an element in a list with the following syntax:\n",
409 | "Note: the first element in a list has an index of zero."
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 14,
415 | "metadata": {},
416 | "outputs": [
417 | {
418 | "name": "stdout",
419 | "output_type": "stream",
420 | "text": [
421 | "c\n",
422 | "a\n"
423 | ]
424 | }
425 | ],
426 | "source": [
427 | "print(my_list[2])\n",
428 | "print(my_list[0])"
429 | ]
430 | },
431 | {
432 | "cell_type": "markdown",
433 | "metadata": {},
434 | "source": [
435 | "Reassigning elements in a list:"
436 | ]
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": 15,
441 | "metadata": {},
442 | "outputs": [
443 | {
444 | "name": "stdout",
445 | "output_type": "stream",
446 | "text": [
447 | "['delta', 'b', 'c', 3485]\n"
448 | ]
449 | }
450 | ],
451 | "source": [
452 | "my_list[0] = 'delta'\n",
453 | "print(my_list)"
454 | ]
455 | },
456 | {
457 | "cell_type": "markdown",
458 | "metadata": {},
459 | "source": [
460 | "Adding/removing elements from a list:"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 16,
466 | "metadata": {},
467 | "outputs": [
468 | {
469 | "name": "stdout",
470 | "output_type": "stream",
471 | "text": [
472 | "['delta', 'b', 'c', 3485, 'hello']\n",
473 | "['delta', 'b', 'c', 3485]\n"
474 | ]
475 | }
476 | ],
477 | "source": [
478 | "my_list.append('hello')\n",
479 | "print(my_list)\n",
480 | "\n",
481 | "my_list.pop()\n",
482 | "print(my_list)"
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "metadata": {},
488 | "source": [
489 | "Accessing multiple elements in a list:"
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": 17,
495 | "metadata": {},
496 | "outputs": [
497 | {
498 | "name": "stdout",
499 | "output_type": "stream",
500 | "text": [
501 | "['delta', 'b']\n",
502 | "['c', 3485]\n",
503 | "['delta', 'b']\n"
504 | ]
505 | }
506 | ],
507 | "source": [
508 | "print(my_list[0:2]) # Access elements at indexes 0 and 1 (the end index is excluded)\n",
509 | "print(my_list[2:]) # Access elements from index 2 to the end\n",
510 | "print(my_list[:2]) # Access elements from the beginning up to (but not including) index 2"
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "metadata": {},
516 | "source": [
517 | "And of course, the learning may never end! There are more ways to work with lists through built-in list methods as well! (e.g. `count()`, `sort()` and `copy()`.) \n",
518 | "\n",
519 | "For anything on lists and more, we typically like to google the information. Some great resources are like Python's very own dedicated Docs website, [here](https://docs.python.org/3/tutorial/datastructures.html)!"
520 | ]
521 | },
522 | {
523 | "cell_type": "markdown",
524 | "metadata": {},
525 | "source": [
526 | "## 2.1.5 Tuples"
527 | ]
528 | },
529 | {
530 | "cell_type": "markdown",
531 | "metadata": {},
532 | "source": [
533 | "An _intermission_, there is another Python native object type called **\"Tuples\"**. We won't cover it here, but Tuples are like Lists, but have the following differences:\n",
534 | "\n",
535 | "* Lists can change length, while Tuples have a fixed length\n",
536 | "\n",
537 | "* Lists are written with brackets `[]`, while tuples are written with parentheses `()`.\n",
538 | "* Lists are mutable, while tuples are immutable.\n",
539 | "* Lists have more functionality than tuples"
540 | ]
541 | },
542 | {
543 | "cell_type": "markdown",
544 | "metadata": {},
545 | "source": [
546 | "## 2.2 Dictionaries\n",
547 | "Dictionaries hold key/value pairs and are useful for storing information."
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": 18,
553 | "metadata": {},
554 | "outputs": [],
555 | "source": [
556 | "my_dict = { 'key_one': 'value_one', 'name': 'mike' }"
557 | ]
558 | },
559 | {
560 | "cell_type": "markdown",
561 | "metadata": {},
562 | "source": [
563 | "Access a value from a dictionary by a key:"
564 | ]
565 | },
566 | {
567 | "cell_type": "code",
568 | "execution_count": 19,
569 | "metadata": {},
570 | "outputs": [
571 | {
572 | "name": "stdout",
573 | "output_type": "stream",
574 | "text": [
575 | "value_one\n",
576 | "mike\n"
577 | ]
578 | }
579 | ],
580 | "source": [
581 | "print(my_dict['key_one'])\n",
582 | "print(my_dict['name'])"
583 | ]
584 | },
585 | {
586 | "cell_type": "markdown",
587 | "metadata": {},
588 | "source": [
589 | "Looping over values of a dictionary:"
590 | ]
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": 20,
595 | "metadata": {},
596 | "outputs": [
597 | {
598 | "name": "stdout",
599 | "output_type": "stream",
600 | "text": [
601 | "The key is key_one\n",
602 | "The key is name\n"
603 | ]
604 | }
605 | ],
606 | "source": [
607 | "for key in my_dict:\n",
608 | " print(\"The key is \" + key)"
609 | ]
610 | },
611 | {
612 | "cell_type": "code",
613 | "execution_count": 21,
614 | "metadata": {},
615 | "outputs": [
616 | {
617 | "name": "stdout",
618 | "output_type": "stream",
619 | "text": [
620 | "The key is key_one, and the value is value_one\n",
621 | "The key is name, and the value is mike\n"
622 | ]
623 | }
624 | ],
625 | "source": [
626 | "for key, value in my_dict.items():\n",
627 | " print(\"The key is \" + key + \", and the value is \" + value)"
628 | ]
629 | },
630 | {
631 | "cell_type": "markdown",
632 | "metadata": {},
633 | "source": [
634 | "## 2.3 Sets\n",
635 | "Sets are similar to lists, but can only contain distinct values."
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": 22,
641 | "metadata": {},
642 | "outputs": [
643 | {
644 | "name": "stdout",
645 | "output_type": "stream",
646 | "text": [
647 | "{1, 2, 3, 'hello'}\n"
648 | ]
649 | }
650 | ],
651 | "source": [
652 | "my_set = {1, 2, 3, 'hello'}\n",
653 | "print(my_set)"
654 | ]
655 | },
656 | {
657 | "cell_type": "markdown",
658 | "metadata": {},
659 | "source": [
660 | "When defining a set with the same value present multiple times, only one element will be added to the set. For example:"
661 | ]
662 | },
663 | {
664 | "cell_type": "code",
665 | "execution_count": 23,
666 | "metadata": {},
667 | "outputs": [
668 | {
669 | "name": "stdout",
670 | "output_type": "stream",
671 | "text": [
672 | "{1, 2, 3, 'hello'}\n"
673 | ]
674 | }
675 | ],
676 | "source": [
677 | "multiple = {1, 2, 2, 2, 2, 2, 3, 'hello'}\n",
678 | "print(multiple) # This will return {1, 2, 3, 'hello'}"
679 | ]
680 | },
681 | {
682 | "cell_type": "markdown",
683 | "metadata": {},
684 | "source": [
685 | "# 3. Functions\n",
686 | "A function is a block of reusable code that performs a certain action. Once you've defined a function, you can use it anywhere in your code!"
687 | ]
688 | },
689 | {
690 | "cell_type": "markdown",
691 | "metadata": {},
692 | "source": [
693 | "Defining a function:"
694 | ]
695 | },
696 | {
697 | "cell_type": "code",
698 | "execution_count": 24,
699 | "metadata": {},
700 | "outputs": [],
701 | "source": [
702 | "def am_i_happy(happiness_level):\n",
703 | " if happiness_level >= 10:\n",
704 | " return \"You're very happy.\"\n",
705 | " elif happiness_level >= 5:\n",
706 | " return \"You're happy.\"\n",
707 | " else:\n",
708 | " return \"You're not happy.\""
709 | ]
710 | },
711 | {
712 | "cell_type": "markdown",
713 | "metadata": {},
714 | "source": [
715 | "Calling a function:"
716 | ]
717 | },
718 | {
719 | "cell_type": "code",
720 | "execution_count": 25,
721 | "metadata": {},
722 | "outputs": [
723 | {
724 | "name": "stdout",
725 | "output_type": "stream",
726 | "text": [
727 | "You're not happy.\n"
728 | ]
729 | }
730 | ],
731 | "source": [
732 | "print(am_i_happy(0))"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": 26,
738 | "metadata": {},
739 | "outputs": [
740 | {
741 | "name": "stdout",
742 | "output_type": "stream",
743 | "text": [
744 | "You're happy.\n"
745 | ]
746 | }
747 | ],
748 | "source": [
749 | "print(am_i_happy(5))"
750 | ]
751 | },
752 | {
753 | "cell_type": "markdown",
754 | "metadata": {},
755 | "source": [
756 | "# 4. Control Flow\n",
757 | "## 4.1 If/Else If/Else"
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": 27,
763 | "metadata": {},
764 | "outputs": [
765 | {
766 | "name": "stdout",
767 | "output_type": "stream",
768 | "text": [
769 | "Take a nap\n"
770 | ]
771 | }
772 | ],
773 | "source": [
774 | "sleepy = True\n",
775 | "hungry = False\n",
776 | "\n",
777 | "if sleepy and hungry:\n",
778 | " print(\"Eat a snack and take a nap.\")\n",
779 | "elif sleepy and not hungry:\n",
780 | " print(\"Take a nap\")\n",
781 | "elif hungry and not sleepy:\n",
782 | " print(\"Eat a snack\")\n",
783 | "else:\n",
784 | " print(\"Go on with your day\")"
785 | ]
786 | },
787 | {
788 | "cell_type": "markdown",
789 | "metadata": {},
790 | "source": [
791 | "## 4.2 Loops\n",
792 | "### 4.2.1 'while' loops"
793 | ]
794 | },
795 | {
796 | "cell_type": "code",
797 | "execution_count": 28,
798 | "metadata": {},
799 | "outputs": [
800 | {
801 | "name": "stdout",
802 | "output_type": "stream",
803 | "text": [
804 | "You have counted to 0\n",
805 | "You have counted to 1\n",
806 | "You have counted to 2\n",
807 | "You have counted to 3\n",
808 | "You have counted to 4\n",
809 | "You have counted to 5\n",
810 | "You have counted to 6\n",
811 | "You have counted to 7\n",
812 | "You have counted to 8\n",
813 | "You have counted to 9\n",
814 | "You're finished counting\n"
815 | ]
816 | }
817 | ],
818 | "source": [
819 | "counter = 0\n",
820 | "while (counter < 10):\n",
821 | " print(\"You have counted to\", counter)\n",
822 | " counter = counter + 1 # Increment the counter\n",
823 | " \n",
824 | "print(\"You're finished counting\")"
825 | ]
826 | },
827 | {
828 | "cell_type": "markdown",
829 | "metadata": {},
830 | "source": [
831 | "### 4.2.2 'for' loops\n",
832 | "Loop over a list:"
833 | ]
834 | },
835 | {
836 | "cell_type": "code",
837 | "execution_count": 29,
838 | "metadata": {},
839 | "outputs": [
840 | {
841 | "name": "stdout",
842 | "output_type": "stream",
843 | "text": [
844 | "cats are cool\n",
845 | "dogs are cool\n",
846 | "lions are cool\n",
847 | "bears are cool\n"
848 | ]
849 | }
850 | ],
851 | "source": [
852 | "cool_animals = ['cat', 'dog', 'lion', 'bear']\n",
853 | "\n",
854 | "for animal in cool_animals:\n",
855 | " print(animal + \"s are cool\")"
856 | ]
857 | },
858 | {
859 | "cell_type": "markdown",
860 | "metadata": {},
861 | "source": [
862 | "Loop over a dict:"
863 | ]
864 | },
865 | {
866 | "cell_type": "code",
867 | "execution_count": 30,
868 | "metadata": {},
869 | "outputs": [
870 | {
871 | "name": "stdout",
872 | "output_type": "stream",
873 | "text": [
874 | "The dog says bark!\n",
875 | "The cat says meow!\n",
876 | "The pig says oink!\n"
877 | ]
878 | }
879 | ],
880 | "source": [
881 | "animal_sounds = {\n",
882 | " 'dog': 'bark',\n",
883 | " 'cat': 'meow',\n",
884 | " 'pig': 'oink'\n",
885 | "}\n",
886 | "\n",
887 | "for animal, sound in animal_sounds.items():\n",
888 | " print(\"The \" + animal + \" says \" + sound + \"!\")"
889 | ]
890 | },
891 | {
892 | "cell_type": "markdown",
893 | "metadata": {},
894 | "source": [
895 | "Congratulations! You made it through the first Notebook. Keep it up!"
896 | ]
897 | },
898 | {
899 | "cell_type": "markdown",
900 | "metadata": {},
901 | "source": [
902 | "
\n",
903 | "
\n",
904 | "
\n",
905 | "\n",
906 | "----"
907 | ]
908 | }
909 | ],
910 | "metadata": {
911 | "kernelspec": {
912 | "display_name": "Python 3",
913 | "language": "python",
914 | "name": "python3"
915 | },
916 | "language_info": {
917 | "codemirror_mode": {
918 | "name": "ipython",
919 | "version": 3
920 | },
921 | "file_extension": ".py",
922 | "mimetype": "text/x-python",
923 | "name": "python",
924 | "nbconvert_exporter": "python",
925 | "pygments_lexer": "ipython3",
926 | "version": "3.8.5"
927 | },
928 | "toc": {
929 | "base_numbering": 1,
930 | "nav_menu": {},
931 | "number_sections": false,
932 | "sideBar": false,
933 | "skip_h1_title": false,
934 | "title_cell": "Table of Contents",
935 | "title_sidebar": "Contents",
936 | "toc_cell": true,
937 | "toc_position": {},
938 | "toc_section_display": true,
939 | "toc_window_display": false
940 | }
941 | },
942 | "nbformat": 4,
943 | "nbformat_minor": 4
944 | }
945 |
--------------------------------------------------------------------------------
/1_module_introduction_pandas/1_2_intro_to_numpy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Introduction of Numpy"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "#### NUMPY\n",
15 | "\n",
16 | "NumPy is a fundamental package for scientific computing, used for manipulating multi-dimensional arrays and matrices. It is particularly useful for linear algebra, Fourier transforms, random number simulation, etc. \n",
17 | "\n",
18 | "Matrices are rectangular arrays of numbers, symbols and expressions arranged in rows and columns. The numbers, symbols or expressions in the matrix are called its entries or its elements. The horizontal and vertical lines of entries in a matrix are called rows and columns, respectively. Its operations include addition, subtraction, and multiplication\n",
19 | "\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "#### 1. Importing Numpy\n",
27 | "\n",
28 | "The first step is to import numpy library into the active notebook"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 1,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "import numpy"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "To shorten the name used to refer to a library, a common alternative is to import the library under a shorter alias, as in"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 2,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "import numpy as np"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "With this, each time numpy is required on this active notebook, **np** will be used instead"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "#### 2. Creating Numpy Array\n",
68 | "\n",
69 | "The [np.array](https://docs.scipy.org/doc/numpy/reference/generated/numpy.array.html) function is used to create an array"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 3,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "name": "stdout",
79 | "output_type": "stream",
80 | "text": [
81 | "[1 2 3 4 5]\n",
82 | "The shape of X is (5,)\n",
83 | "[ 9 10]\n",
84 | "The shape of Y is (2,)\n"
85 | ]
86 | }
87 | ],
88 | "source": [
89 | "#creating a 1 dimensional array\n",
90 | "\n",
91 | "x = np.array([1, 2, 3, 4, 5])\n",
92 | "y = np.array([9, 10]) \n",
93 | "print(x)\n",
94 | "print('The shape of X is', x.shape)\n",
95 | "\n",
96 | "print(y)\n",
97 | "print('The shape of Y is', y.shape)"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "The [shape](https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.shape.html) property is usually used to get the current shape of an array"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 4,
110 | "metadata": {},
111 | "outputs": [
112 | {
113 | "name": "stdout",
114 | "output_type": "stream",
115 | "text": [
116 | "[[1 2]\n",
117 | " [3 4]\n",
118 | " [5 6]]\n",
119 | "The shape of Z is (3, 2)\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "# Creating a 2D arrays \n",
125 | "z = np.array([[1, 2], [3, 4], [5, 6]]) \n",
126 | "\n",
127 | "print(z)\n",
128 | "print('The shape of Z is', z.shape)"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "#### 3. Numpy Functions\n",
136 | "\n",
137 | "Numpy has built-in functions for creating and manipulating arrays. These include:\n",
138 | "\n",
139 | "- np.arange \n",
140 | "\n",
141 | "- np.reshape\n",
142 | "\n",
143 | "- np.zeros \n",
144 | "\n",
145 | ">The dimensions (number of rows and columns) are passed as parameters to the function."
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 5,
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "name": "stdout",
155 | "output_type": "stream",
156 | "text": [
157 | "[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n",
158 | " 24]\n",
159 | "(25,)\n"
160 | ]
161 | }
162 | ],
163 | "source": [
164 | "#arange is used to create arrays with values in a specified range.\n",
165 | "\n",
166 | "A = np.arange(25)\n",
167 | "print(A)\n",
168 | "print(A.shape)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 6,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "[[ 0]\n",
181 | " [ 1]\n",
182 | " [ 2]\n",
183 | " [ 3]\n",
184 | " [ 4]\n",
185 | " [ 5]\n",
186 | " [ 6]\n",
187 | " [ 7]\n",
188 | " [ 8]\n",
189 | " [ 9]\n",
190 | " [10]\n",
191 | " [11]\n",
192 | " [12]\n",
193 | " [13]\n",
194 | " [14]\n",
195 | " [15]\n",
196 | " [16]\n",
197 | " [17]\n",
198 | " [18]\n",
199 | " [19]\n",
200 | " [20]\n",
201 | " [21]\n",
202 | " [22]\n",
203 | " [23]\n",
204 | " [24]]\n",
205 | "The shape of 1D array X = (25, 1)\n"
206 | ]
207 | }
208 | ],
209 | "source": [
210 | "#To change the shape of an array\n",
211 | "\n",
212 | "B = A.reshape(25,1)\n",
213 | "\n",
214 | "print (B)\n",
215 | "print (\"The shape of 1D array X = \", B.shape)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 7,
221 | "metadata": {},
222 | "outputs": [
223 | {
224 | "name": "stdout",
225 | "output_type": "stream",
226 | "text": [
227 | "[[ 0 1 2 3 4]\n",
228 | " [ 5 6 7 8 9]\n",
229 | " [10 11 12 13 14]\n",
230 | " [15 16 17 18 19]\n",
231 | " [20 21 22 23 24]]\n",
232 | "The shape of array C = (5, 5)\n"
233 | ]
234 | }
235 | ],
236 | "source": [
237 | "C = B.reshape(5,5)\n",
238 | "\n",
239 | "print ( C)\n",
240 | "print (\"The shape of array C = \", C.shape)"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "> Note: Before a reshape will run successfully, the product of the two parameters supplied to the function must equal the product of the dimensions of the original array you want to reshape.\n",
248 | "\n",
249 | "For example: The shape of variable B is (25, 1), therefore 25 * 1 = 25\n",
250 | "\n",
251 | "The two parameters supplied to the reshape function are (5, 5), and 5 * 5 = 25\n",
252 | " "
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 8,
258 | "metadata": {},
259 | "outputs": [
260 | {
261 | "data": {
262 | "text/plain": [
263 | "array([[0., 0., 0.],\n",
264 | " [0., 0., 0.]])"
265 | ]
266 | },
267 | "execution_count": 8,
268 | "metadata": {},
269 | "output_type": "execute_result"
270 | }
271 | ],
272 | "source": [
273 | "#zeros is used to create an array filled with zeros. \n",
274 | "\n",
275 | "np_Zeros = np.zeros((2,3))\n",
276 | "\n",
277 | "np_Zeros"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "metadata": {},
283 | "source": [
284 | "#### 4. Accessing elements of Numpy array\n",
285 | "\n",
286 | "To access an element in a two-dimensional array, you need to specify an index for both the row and the column."
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 9,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "D = np.array([[5, 7, 8],[3, 5, 9]])"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 10,
301 | "metadata": {},
302 | "outputs": [
303 | {
304 | "data": {
305 | "text/plain": [
306 | "array([[5, 7, 8],\n",
307 | " [3, 5, 9]])"
308 | ]
309 | },
310 | "execution_count": 10,
311 | "metadata": {},
312 | "output_type": "execute_result"
313 | }
314 | ],
315 | "source": [
316 | "D"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 11,
322 | "metadata": {},
323 | "outputs": [
324 | {
325 | "data": {
326 | "text/plain": [
327 | "3"
328 | ]
329 | },
330 | "execution_count": 11,
331 | "metadata": {},
332 | "output_type": "execute_result"
333 | }
334 | ],
335 | "source": [
336 | "# note that array indexing in numpy starts from zero\n",
337 | "#Row 1, column 0 gives a scalar value\n",
338 | "D[1,0]"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 12,
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "data": {
348 | "text/plain": [
349 | "5"
350 | ]
351 | },
352 | "execution_count": 12,
353 | "metadata": {},
354 | "output_type": "execute_result"
355 | }
356 | ],
357 | "source": [
358 | "#Row 1, column 1\n",
359 | "D[1,1]"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 13,
365 | "metadata": {},
366 | "outputs": [
367 | {
368 | "data": {
369 | "text/plain": [
370 | "9"
371 | ]
372 | },
373 | "execution_count": 13,
374 | "metadata": {},
375 | "output_type": "execute_result"
376 | }
377 | ],
378 | "source": [
379 | "#Row 1, column 2\n",
380 | "D[1,2]"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 14,
386 | "metadata": {},
387 | "outputs": [
388 | {
389 | "data": {
390 | "text/plain": [
391 | "array([[5, 7, 8]])"
392 | ]
393 | },
394 | "execution_count": 14,
395 | "metadata": {},
396 | "output_type": "execute_result"
397 | }
398 | ],
399 | "source": [
400 | "# Slicing is also possible in numpy\n",
401 | "D[0:1, :]"
402 | ]
403 | },
404 | {
405 | "cell_type": "markdown",
406 | "metadata": {},
407 | "source": [
408 | "#### 5. Numpy array math operations"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": 15,
414 | "metadata": {},
415 | "outputs": [],
416 | "source": [
417 | "x = np.array([[1,2,3],[4,5,6]])\n",
418 | "y = np.array([[2,2,2],[3,3,3]])\n",
419 | "z = np.array([1,2,3])"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": 16,
425 | "metadata": {},
426 | "outputs": [
427 | {
428 | "data": {
429 | "text/plain": [
430 | "array([[1, 4],\n",
431 | " [2, 5],\n",
432 | " [3, 6]])"
433 | ]
434 | },
435 | "execution_count": 16,
436 | "metadata": {},
437 | "output_type": "execute_result"
438 | }
439 | ],
440 | "source": [
441 | "#Transpose a matrix\n",
442 | "\n",
443 | "x.T"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 17,
449 | "metadata": {},
450 | "outputs": [
451 | {
452 | "name": "stdout",
453 | "output_type": "stream",
454 | "text": [
455 | "[[3 4 5]\n",
456 | " [7 8 9]]\n",
457 | "[[3 4 5]\n",
458 | " [7 8 9]]\n"
459 | ]
460 | }
461 | ],
462 | "source": [
463 | "#Elementwise addition\n",
464 | "\n",
465 | "print (x+y)\n",
466 | "print (np.add(x,y))"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 18,
472 | "metadata": {},
473 | "outputs": [
474 | {
475 | "name": "stdout",
476 | "output_type": "stream",
477 | "text": [
478 | "[[-1 0 1]\n",
479 | " [ 1 2 3]]\n",
480 | "[[-1 0 1]\n",
481 | " [ 1 2 3]]\n"
482 | ]
483 | }
484 | ],
485 | "source": [
486 | "#Elementwise Subtraction\n",
487 | "\n",
488 | "print (x-y)\n",
489 | "print (np.subtract(x,y))"
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": 19,
495 | "metadata": {},
496 | "outputs": [
497 | {
498 | "name": "stdout",
499 | "output_type": "stream",
500 | "text": [
501 | "[[ 1 4 9]\n",
502 | " [ 4 10 18]]\n",
503 | "[[ 1 4 9]\n",
504 | " [ 4 10 18]]\n"
505 | ]
506 | }
507 | ],
508 | "source": [
509 | "#Elementwise Multiplication\n",
510 | "\n",
511 | "print (x*z)\n",
512 | "print (np.multiply(x,z))"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": 20,
518 | "metadata": {},
519 | "outputs": [
520 | {
521 | "name": "stdout",
522 | "output_type": "stream",
523 | "text": [
524 | "[14 32]\n"
525 | ]
526 | }
527 | ],
528 | "source": [
529 | "# Matrix-vector product (x is 2x3, z is a length-3 vector)\n",
530 | "print(np.dot(x, z)) "
531 | ]
532 | }
533 | ],
534 | "metadata": {
535 | "kernelspec": {
536 | "display_name": "Python (Delta Analytics Env)",
537 | "language": "python",
538 | "name": "delta_analytics_env"
539 | },
540 | "language_info": {
541 | "codemirror_mode": {
542 | "name": "ipython",
543 | "version": 3
544 | },
545 | "file_extension": ".py",
546 | "mimetype": "text/x-python",
547 | "name": "python",
548 | "nbconvert_exporter": "python",
549 | "pygments_lexer": "ipython3",
550 | "version": "3.6.12"
551 | }
552 | },
553 | "nbformat": 4,
554 | "nbformat_minor": 2
555 | }
556 |
--------------------------------------------------------------------------------
/1_module_introduction_pandas/1_4_loading_and_understanding_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Module 1: Introduction to Exploratory Analysis "
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "\n",
15 | "
\n",
16 | "\n",
17 | "\n",
18 | "[(Page 17)](https://drive.google.com/file/d/1r4SBY6Dm6xjFqLH12tFb-Bf7wbvoIN_C/view)"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "What we'll be doing in this notebook:\n",
26 | "-----\n",
27 | "\n",
28 | " 1. Checking variable types\n",
29 | " 2. Checking for missing variables \n",
30 | " 3. Observing number of observations in the dataset\n",
31 | " 4. Quickly displaying Descriptive statistics"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "### Import packages"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 1,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "import pandas as pd\n",
48 | "import numpy as np\n",
49 | "import seaborn as sns\n",
50 | "import matplotlib.pyplot as plt\n",
51 | "from datetime import datetime\n",
52 | "import dateutil.parser\n",
53 | "\n",
54 | "# The command below means that the output of multiple commands in a cell will be output at once\n",
55 | "from IPython.core.interactiveshell import InteractiveShell\n",
56 | "InteractiveShell.ast_node_interactivity = \"all\"\n",
57 | "\n",
58 | "# The command below tells jupyter to display up to 80 columns, this keeps everything visible\n",
59 | "pd.set_option('display.max_columns', 80)\n",
60 | "pd.set_option('expand_frame_repr', True)\n",
61 | "\n",
62 | "# Show figures in notebook\n",
63 | "%matplotlib inline"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "### Import dataset"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "We read in our dataset"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 2,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "path = '../data/'\n",
87 | "filename = 'loans.csv'\n",
88 | "\n",
89 | "try:\n",
90 | " df = pd.read_csv(path + filename)\n",
91 | "except FileNotFoundError:\n",
92 | " # If data is not found, download it from GitHub\n",
93 | " import os\n",
94 | " os.system(f'git clone --single-branch --depth=1 https://github.com/DeltaAnalytics/machine_learning_for_good_data {path}')\n",
95 | " df = pd.read_csv(path+filename)"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "In the cell below, we take a random sample of 2 rows to get a feel for the data."
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 3,
108 | "metadata": {},
109 | "outputs": [
110 | {
111 | "data": {
112 | "text/html": [
113 | "\n",
114 | "\n",
127 | "
\n",
128 | " \n",
129 | " \n",
130 | " | \n",
131 | " id_number | \n",
132 | " loan_amount | \n",
133 | " lender_count | \n",
134 | " status | \n",
135 | " funded_date | \n",
136 | " funded_amount | \n",
137 | " repayment_term | \n",
138 | " location_country_code | \n",
139 | " sector | \n",
140 | " description | \n",
141 | " use | \n",
142 | "
\n",
143 | " \n",
144 | " \n",
145 | " \n",
146 | " 682 | \n",
147 | " 1548647 | \n",
148 | " 725 | \n",
149 | " 25 | \n",
150 | " funded | \n",
151 | " 2018-06-19T12:10:23Z | \n",
152 | " 725 | \n",
153 | " 40 | \n",
154 | " CM | \n",
155 | " Agriculture | \n",
156 | " She is married and a mother of four children l... | \n",
157 | " rent land, labor and purchase seedlings and ot... | \n",
158 | "
\n",
159 | " \n",
160 | " 3312 | \n",
161 | " 1563971 | \n",
162 | " 800 | \n",
163 | " 30 | \n",
164 | " funded | \n",
165 | " 2018-07-18T16:16:22Z | \n",
166 | " 800 | \n",
167 | " 14 | \n",
168 | " NG | \n",
169 | " Services | \n",
170 | " Oluchi is 38 years old and a mother of five ch... | \n",
171 | " to process palm oil for storage. | \n",
172 | "
\n",
173 | " \n",
174 | "
\n",
175 | "
"
176 | ],
177 | "text/plain": [
178 | " id_number loan_amount lender_count status funded_date \\\n",
179 | "682 1548647 725 25 funded 2018-06-19T12:10:23Z \n",
180 | "3312 1563971 800 30 funded 2018-07-18T16:16:22Z \n",
181 | "\n",
182 | " funded_amount repayment_term location_country_code sector \\\n",
183 | "682 725 40 CM Agriculture \n",
184 | "3312 800 14 NG Services \n",
185 | "\n",
186 | " description \\\n",
187 | "682 She is married and a mother of four children l... \n",
188 | "3312 Oluchi is 38 years old and a mother of five ch... \n",
189 | "\n",
190 | " use \n",
191 | "682 rent land, labor and purchase seedlings and ot... \n",
192 | "3312 to process palm oil for storage. "
193 | ]
194 | },
195 | "execution_count": 3,
196 | "metadata": {},
197 | "output_type": "execute_result"
198 | }
199 | ],
200 | "source": [
201 | "df.sample(n=2)"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "### 1) Type Checking\n",
209 | ""
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "Type is very important in Python programming, because it affects the types of functions you can apply to a series. There are a few different types of data you will see regularly (see [this](https://en.wikibooks.org/wiki/Python_Programming/Data_Types) link for more detail):\n",
217 | "* **int** - a number with no decimal places. example: loan_amount field\n",
218 | "* **float** - a number with decimal places. example: partner_id field\n",
219 | "* **str** - str is short for string. This type is formally defined as a sequence of Unicode characters. More simply, string means that the data is treated as a word, not a number. example: sector\n",
220 | "* **boolean** - can only be True or False. There is not currently an example in the data, but we will be creating a gender field shortly.\n",
221 | "* **datetime** - values meant to hold time data. Example: posted_date\n",
222 | "\n",
223 | "Let's check the type of our variables using the examples we saw in the cell above."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 4,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "data": {
233 | "text/plain": [
234 | "['id_number',\n",
235 | " 'loan_amount',\n",
236 | " 'lender_count',\n",
237 | " 'status',\n",
238 | " 'funded_date',\n",
239 | " 'funded_amount',\n",
240 | " 'repayment_term',\n",
241 | " 'location_country_code',\n",
242 | " 'sector',\n",
243 | " 'description',\n",
244 | " 'use']"
245 | ]
246 | },
247 | "execution_count": 4,
248 | "metadata": {},
249 | "output_type": "execute_result"
250 | }
251 | ],
252 | "source": [
253 | "# Here are all of the columns\n",
254 | "df.columns.tolist()"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 5,
260 | "metadata": {},
261 | "outputs": [
262 | {
263 | "data": {
264 | "text/plain": [
265 | "dtype('int64')"
266 | ]
267 | },
268 | "execution_count": 5,
269 | "metadata": {},
270 | "output_type": "execute_result"
271 | }
272 | ],
273 | "source": [
274 | "# Find the dtype, aka datatype, for a column\n",
275 | "df['id_number'].dtype"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 6,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "# Try this - Pick a couple of columns and check their type on your own\n"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "### 2) Do I have missing values?\n",
292 | "\n",
293 | ""
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "If we have missing data, is the missing data at random or not? If data is missing at random, the data distribution is still representative of the population. You can probably ignore the missing values as an inconvenience. However, if the data is systematically missing, the analysis you do may be biased. You should carefully consider the best way to clean the data, it may involve dropping some data."
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "We want to see how many values are missing in certain variable columns. One way to do this is to count the number of null observations. \n",
308 | "\n",
309 | "For this, we wrote a short function to apply to the dataframe. \n",
310 | "\n",
311 | "We print out the first few observations, but you can remove the `.head()` to print out all columns. "
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 7,
317 | "metadata": {},
318 | "outputs": [
319 | {
320 | "name": "stdout",
321 | "output_type": "stream",
322 | "text": [
323 | "Missing values per column:\n",
324 | "funded_date 937.0\n",
325 | "location_country_code 17.0\n",
326 | "description 342.0\n",
327 | "use 342.0\n",
328 | "dtype: float64\n"
329 | ]
330 | }
331 | ],
332 | "source": [
333 | "#Create a new function:\n",
334 | "def num_missing(x):\n",
335 | " return sum(x.isnull())\n",
336 | "\n",
337 | "#Applying per column:\n",
338 | "print(\"Missing values per column:\")\n",
339 | "## Check how many are missing by column, and then check which ones have any missing values\n",
340 | "print(df.apply(num_missing, axis=0).where(lambda x : x != 0).dropna().head(20)) \n",
341 | "#axis=0 defines that function is to be applied on each column"
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "metadata": {},
347 | "source": [
348 | "### 3) Sanity Checks\n",
349 | ""
350 | ]
351 | },
352 | {
353 | "cell_type": "markdown",
354 | "metadata": {},
355 | "source": [
356 | "**Does the dataset match what you expected to find?**\n",
357 | "- Is the range of values what you would expect? For example, are all loan_amounts above 0.\n",
358 | "- Do you have the number of rows you would expect?\n",
359 | "- Is your data for the date range you would expect? For example, is there a strange year in the data like 1880.\n",
360 | "- Are there unexpected spikes when you plot the data over time?\n",
361 | "\n",
362 | "\n",
363 | "In the command below we find out the number of loans and number of columns by using the function shape. You can also use `len(df.index)` to find the number of rows."
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 8,
369 | "metadata": {},
370 | "outputs": [
371 | {
372 | "name": "stdout",
373 | "output_type": "stream",
374 | "text": [
375 | "There are 6019 observations and 11 features\n"
376 | ]
377 | }
378 | ],
379 | "source": [
380 | "print(f'There are {df.shape[0]} observations and {df.shape[1]} features')"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "metadata": {},
386 | "source": [
387 | "Remember, each row is an observation and each column is a potential feature. \n",
388 | "\n",
389 | "Remember, we need a large amount of data for machine learning."
390 | ]
391 | },
392 | {
393 | "cell_type": "markdown",
394 | "metadata": {},
395 | "source": [
396 | "### 4) Descriptive statistics of the dataset\n",
397 | "\n",
398 | ""
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "metadata": {},
404 | "source": [
405 | "The `describe` command conveniently below provides key summary statistics for each numeric column."
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": 9,
411 | "metadata": {},
412 | "outputs": [
413 | {
414 | "data": {
415 | "text/html": [
416 | "\n",
417 | "\n",
430 | "
\n",
431 | " \n",
432 | " \n",
433 | " | \n",
434 | " id_number | \n",
435 | " loan_amount | \n",
436 | " lender_count | \n",
437 | " funded_amount | \n",
438 | " repayment_term | \n",
439 | "
\n",
440 | " \n",
441 | " \n",
442 | " \n",
443 | " count | \n",
444 | " 6.019000e+03 | \n",
445 | " 6019.000000 | \n",
446 | " 6019.000000 | \n",
447 | " 6019.000000 | \n",
448 | " 6019.000000 | \n",
449 | "
\n",
450 | " \n",
451 | " mean | \n",
452 | " 1.359770e+06 | \n",
453 | " 1499.011464 | \n",
454 | " 35.661406 | \n",
455 | " 1325.070610 | \n",
456 | " 11.803290 | \n",
457 | "
\n",
458 | " \n",
459 | " std | \n",
460 | " 3.719316e+05 | \n",
461 | " 2512.517280 | \n",
462 | " 73.420256 | \n",
463 | " 2444.726815 | \n",
464 | " 9.114948 | \n",
465 | "
\n",
466 | " \n",
467 | " min | \n",
468 | " 1.377200e+04 | \n",
469 | " 50.000000 | \n",
470 | " 0.000000 | \n",
471 | " 0.000000 | \n",
472 | " 3.000000 | \n",
473 | "
\n",
474 | " \n",
475 | " 25% | \n",
476 | " 1.425188e+06 | \n",
477 | " 300.000000 | \n",
478 | " 7.000000 | \n",
479 | " 200.000000 | \n",
480 | " 8.000000 | \n",
481 | "
\n",
482 | " \n",
483 | " 50% | \n",
484 | " 1.550673e+06 | \n",
485 | " 625.000000 | \n",
486 | " 16.000000 | \n",
487 | " 525.000000 | \n",
488 | " 10.000000 | \n",
489 | "
\n",
490 | " \n",
491 | " 75% | \n",
492 | " 1.566204e+06 | \n",
493 | " 1825.000000 | \n",
494 | " 41.000000 | \n",
495 | " 1525.000000 | \n",
496 | " 14.000000 | \n",
497 | "
\n",
498 | " \n",
499 | " max | \n",
500 | " 1.573593e+06 | \n",
501 | " 80000.000000 | \n",
502 | " 2665.000000 | \n",
503 | " 80000.000000 | \n",
504 | " 133.000000 | \n",
505 | "
\n",
506 | " \n",
507 | "
\n",
508 | "
"
509 | ],
510 | "text/plain": [
511 | " id_number loan_amount lender_count funded_amount repayment_term\n",
512 | "count 6.019000e+03 6019.000000 6019.000000 6019.000000 6019.000000\n",
513 | "mean 1.359770e+06 1499.011464 35.661406 1325.070610 11.803290\n",
514 | "std 3.719316e+05 2512.517280 73.420256 2444.726815 9.114948\n",
515 | "min 1.377200e+04 50.000000 0.000000 0.000000 3.000000\n",
516 | "25% 1.425188e+06 300.000000 7.000000 200.000000 8.000000\n",
517 | "50% 1.550673e+06 625.000000 16.000000 525.000000 10.000000\n",
518 | "75% 1.566204e+06 1825.000000 41.000000 1525.000000 14.000000\n",
519 | "max 1.573593e+06 80000.000000 2665.000000 80000.000000 133.000000"
520 | ]
521 | },
522 | "execution_count": 9,
523 | "metadata": {},
524 | "output_type": "execute_result"
525 | }
526 | ],
527 | "source": [
528 | "df.describe()"
529 | ]
530 | },
531 | {
532 | "cell_type": "markdown",
533 | "metadata": {},
534 | "source": [
535 | "In order to get the same summary statistics for categorical columns (string) we need to do a little data wrangling. \n",
536 | "\n",
537 | "The first line of code filters for all columns that are a data type object. As we know from before this means they are considered to be a string. The final row of code provides summary statistics for these character fields."
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 10,
543 | "metadata": {},
544 | "outputs": [
545 | {
546 | "data": {
547 | "text/html": [
548 | "\n",
549 | "\n",
562 | "
\n",
563 | " \n",
564 | " \n",
565 | " | \n",
566 | " status | \n",
567 | " funded_date | \n",
568 | " location_country_code | \n",
569 | " sector | \n",
570 | " description | \n",
571 | " use | \n",
572 | "
\n",
573 | " \n",
574 | " \n",
575 | " \n",
576 | " count | \n",
577 | " 6019 | \n",
578 | " 5082 | \n",
579 | " 6002 | \n",
580 | " 6019 | \n",
581 | " 5677 | \n",
582 | " 5677 | \n",
583 | "
\n",
584 | " \n",
585 | " unique | \n",
586 | " 3 | \n",
587 | " 4453 | \n",
588 | " 30 | \n",
589 | " 14 | \n",
590 | " 5277 | \n",
591 | " 4325 | \n",
592 | "
\n",
593 | " \n",
594 | " top | \n",
595 | " funded | \n",
596 | " 2018-07-22T15:54:41Z | \n",
597 | " CD | \n",
598 | " Food | \n",
599 | " Concilie has been selling used clothing for 15... | \n",
600 | " to pay for a stove. | \n",
601 | "
\n",
602 | " \n",
603 | " freq | \n",
604 | " 5082 | \n",
605 | " 9 | \n",
606 | " 400 | \n",
607 | " 1738 | \n",
608 | " 2 | \n",
609 | " 80 | \n",
610 | "
\n",
611 | " \n",
612 | "
\n",
613 | "
"
614 | ],
615 | "text/plain": [
616 | " status funded_date location_country_code sector \\\n",
617 | "count 6019 5082 6002 6019 \n",
618 | "unique 3 4453 30 14 \n",
619 | "top funded 2018-07-22T15:54:41Z CD Food \n",
620 | "freq 5082 9 400 1738 \n",
621 | "\n",
622 | " description use \n",
623 | "count 5677 5677 \n",
624 | "unique 5277 4325 \n",
625 | "top Concilie has been selling used clothing for 15... to pay for a stove. \n",
626 | "freq 2 80 "
627 | ]
628 | },
629 | "execution_count": 10,
630 | "metadata": {},
631 | "output_type": "execute_result"
632 | }
633 | ],
634 | "source": [
635 | "categorical = df.dtypes[df.dtypes == \"object\"].index\n",
636 | "df[categorical].describe()"
637 | ]
638 | },
639 | {
640 | "cell_type": "markdown",
641 | "metadata": {},
642 | "source": [
643 | "In the table above, there are 4 really useful fields: \n",
644 | "\n",
645 | "1) **count** - total number of fields populated (not empty). \n",
646 | "\n",
647 | "2) **unique** - tells us how many distinct values this field contains. For example, 30 in location_country_code tells us the loans come from 30 different countries. \n",
648 | "\n",
649 | "3) **top** - tells us the most common value. For example, the top sector in this dataset is Food, which tells us most loans are in the Food sector.\n",
650 | "\n",
651 | "4) **freq** - tells us how frequent the most common value is in our dataset. For example, 'funded' is the status of 5,082 of the 6,019 loans."
652 | ]
653 | },
654 | {
655 | "cell_type": "markdown",
656 | "metadata": {},
657 | "source": [
658 | "What is next\n",
659 | "-----\n",
660 | "\n",
661 | "In the next section, we move on to exploratory data analysis (EDA)."
662 | ]
663 | },
664 | {
665 | "cell_type": "markdown",
666 | "metadata": {},
667 | "source": [
668 | "
\n",
669 | "
\n",
670 | "
\n",
671 | "\n",
672 | "----"
673 | ]
674 | }
675 | ],
676 | "metadata": {
677 | "kernelspec": {
678 | "display_name": "Python (Delta Analytics Env)",
679 | "language": "python",
680 | "name": "delta_analytics_env"
681 | },
682 | "language_info": {
683 | "codemirror_mode": {
684 | "name": "ipython",
685 | "version": 3
686 | },
687 | "file_extension": ".py",
688 | "mimetype": "text/x-python",
689 | "name": "python",
690 | "nbconvert_exporter": "python",
691 | "pygments_lexer": "ipython3",
692 | "version": "3.6.12"
693 | }
694 | },
695 | "nbformat": 4,
696 | "nbformat_minor": 2
697 | }
698 |
--------------------------------------------------------------------------------
/1_module_introduction_pandas/README.md:
--------------------------------------------------------------------------------
1 | Module 1: Introduction to Exploratory Analysis
2 | =====
3 |
4 | Welcome to the first module! In this module, we start exploring our [Kiva](https://www.kiva.org/) dataset.
5 |
6 | Goals
7 | ----
8 | - Load our data and do some quick exploration
9 | - Understand the data using descriptive statistics and graphs
10 |
11 | Topic overview
12 | ----
13 |
14 | The goal of exploratory analysis is to summarize the main characteristics of a data set, with the belief that it may lead to new hypotheses that inform algorithm choice and experimentation. Exploratory analysis happens before formal modeling commences, and is extremely important for helping inform or sharpen your hypothesis.
15 |
16 | Installation
17 | ----
18 |
19 | If you have not yet downloaded the dataset for this repository, machine_learning_for_good, you can execute the following in this folder, `1_module_introduction_pandas`:
20 |
21 | ```
22 | git clone --single-branch --depth=1 https://github.com/DeltaAnalytics/machine_learning_for_good_data ../data
23 | ```
24 | > If you are in the repository's main folder, remove the `../` prefix and use just `data` as the clone target.
--------------------------------------------------------------------------------
/1_module_introduction_pandas/best_practices_data_science.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/best_practices_data_science.pdf
--------------------------------------------------------------------------------
/1_module_introduction_pandas/images/Anaconda_1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_1.PNG
--------------------------------------------------------------------------------
/1_module_introduction_pandas/images/Anaconda_2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_2.PNG
--------------------------------------------------------------------------------
/1_module_introduction_pandas/images/Anaconda_3.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_3.PNG
--------------------------------------------------------------------------------
/1_module_introduction_pandas/images/Anaconda_4.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_4.PNG
--------------------------------------------------------------------------------
/1_module_introduction_pandas/images/Anaconda_5.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_5.PNG
--------------------------------------------------------------------------------
/1_module_introduction_pandas/images/Anaconda_6.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_6.PNG
--------------------------------------------------------------------------------
/1_module_introduction_pandas/images/Anaconda_7.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_7.PNG
--------------------------------------------------------------------------------
/1_module_introduction_pandas/images/Anaconda_7_2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_7_2.PNG
--------------------------------------------------------------------------------
/1_module_introduction_pandas/images/Anaconda_8.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/Anaconda_8.PNG
--------------------------------------------------------------------------------
/1_module_introduction_pandas/images/anaconda_nav.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/anaconda_nav.png
--------------------------------------------------------------------------------
/1_module_introduction_pandas/images/jupyter_notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/images/jupyter_notebook.png
--------------------------------------------------------------------------------
/1_module_introduction_pandas/intro_to_visualization.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/1_module_introduction_pandas/intro_to_visualization.pptx
--------------------------------------------------------------------------------
/1_module_introduction_pandas/python_installation_instructions.md:
--------------------------------------------------------------------------------
1 | Module 0: Installing Python with Anaconda
2 | ====
3 |
4 | 1. Download and install the latest version of Python 3 from Anaconda (requires ~1.8 GB of space) for your operating system at: [www.anaconda.com/download/](https://www.anaconda.com/download/).
5 | 
6 |
7 | 2. Follow the prompts:
8 |
9 | 
10 | 
11 | 
12 | 
13 | 
14 | 
15 | 
16 |
17 | 3. Start Anaconda Navigator application.
18 |
19 | 4. Within Anaconda Navigator, click on the "Launch" button for Jupyter Notebook.
20 |
21 | 
22 |
23 | That will open Jupyter Notebook in your favorite web browser.
24 |
25 | 
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/README.md:
--------------------------------------------------------------------------------
1 | Module 2: Feature Engineering
2 | ====
3 |
4 | Welcome to Module 2 of the introductory course to machine learning, where we will create new variables out of the raw data in a process called feature engineering!
5 |
6 | Goals
7 | ------
8 | Learn how to execute the following:
9 | 1. Feature pruning
10 | 2. Engineering Temporal Features (month, year, etc)
11 | 3. One-hot encoding / dummy variables
12 | 4. Extracting features from strings
13 | 5. Creating features from Metadata
14 | 6. Feature scaling
15 | 7. Data Imputation / cleaning
16 |
17 | Topic Overview
18 | -----
19 |
20 | **What is feature engineering?**
21 |
22 | In machine learning, a *feature* is a property or characteristic of a phenomenon being observed. *Feature engineering* is the process of creating and selecting features from the data that are useful for machine learning algorithms.
23 |
24 | The dataset contains many features to start, so why do we need to make some more?
25 |
26 | - Consider a dataset that has a long description string variable. This may not be a useful feature to feed directly into a model, so perhaps we can make a new variable for whether the description contains a certain word. The hope for creating this new feature is that it will have more predictive power.
27 |
28 | How do we know what features will be useful?
29 |
30 | - This comes down to domain expertise, and this is a large part of a data scientist's work!
31 |
32 | Fortunately, there are common starting points for many datasets that we review in this module.
33 |
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/images/Anaconda_1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_1.PNG
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/images/Anaconda_2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_2.PNG
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/images/Anaconda_3.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_3.PNG
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/images/Anaconda_4.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_4.PNG
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/images/Anaconda_5.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_5.PNG
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/images/Anaconda_6.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_6.PNG
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/images/Anaconda_7.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_7.PNG
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/images/Anaconda_7_2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_7_2.PNG
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/images/Anaconda_8.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/Anaconda_8.PNG
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/images/anaconda_nav.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/anaconda_nav.png
--------------------------------------------------------------------------------
/2_module_eda_feature_engineering/images/jupyter_notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/2_module_eda_feature_engineering/images/jupyter_notebook.png
--------------------------------------------------------------------------------
/3_module_linear_regression/README.md:
--------------------------------------------------------------------------------
1 | # Module 3: Linear Regression
2 | ================================================================================
3 |
4 | Welcome to module 3 of the introductory course to data for good, where we will be exploring linear regression - the first machine learning algorithm of this course!
5 |
6 | Goals
7 | ----
8 | By the end of this module one should feel comfortable with the fundamentals of linear regression. Specific topics included are:
9 | 1. How to split the data between training and test data
10 | 2. Using training data to train a linear regression model
11 | 3. Analyzing the results of the model
12 | 4. Checking the assumptions of linear regression
13 | 5. Building a multivariate regressor
14 |
15 | ## Topic overview
16 | Linear Regression is a parametric model which predicts a continuous outcome feature (**Y**) from one or more explanatory features (**X**).
17 |
18 | **Y** = beta_0 + beta_1 * **X**
19 |
20 | beta_0 is called the intercept term, and represents the expected mean value of Y when all explanatory features equal 0.
21 | beta_1 is called a beta coefficient, and represents the expected change in the value of Y that results from a one unit change in X.
22 |
23 | This module fits a straight line to your data, where the value of the outcome feature can be calculated as a linear combination of the explanatory features. Sounds relatively simple? Afraid not, there are many nuances and conditions that need to be understood before using linear regression! We are going to delve into these assumptions and conditions and then demonstrate how to use this algorithm on the Kiva dataset.
24 |
25 | 
26 |
27 |
28 | ## Resources
29 | - [Comprehensive Guide to Regression](https://www.analyticsvidhya.com/blog/2015/08/comprehensive-guide-regression/)
30 | - [Understanding key regression statistics](http://connor-johnson.com/2014/02/18/linear-regression-with-python/)
31 |
32 | ## Advanced topics
33 | Linear regression is one member of a family of linear parametric models. Some additional advanced topics we recommend looking up are...
34 | ### Logistic regression
35 | Logistic regression is very similar to linear regression but has a categorical outcome instead. So rather than modeling a continuous dependent variable, it models a binary classification - yes or no, true or false, 1 or 0. This is still a linear model as it assumes a linear relationship between the independent variables and the link function.
36 |
37 | To learn more about Logistic Regression, try the following resources:
38 | - [Beginners guide to Logistic Regression](https://www.analyticsvidhya.com/blog/2015/11/beginners-guide-on-logistic-regression-in-r/): A good overview of the theory and mathematics behind the algorithm
39 | - [Logistic Regression in Python](http://blog.yhat.com/posts/logistic-regression-python-rodeo.html): A thorough tutorial on a publicly available dataset in Python
40 |
41 | ### Ridge and Lasso regression
42 | Both linear and logistic regression have a tendency to overfit when there are a large number of features. Therefore it is important that we choose the features which have the most predictive power, but how do we choose these features? We can use our EDA to a certain extent, but that only goes so far.
43 |
44 | This is where ridge and lasso regularization techniques come into play! Both of these techniques can be used to identify which features explain the most variance and should therefore be kept in the model.
45 |
46 | To learn more about ridge and lasso regression and general regularization techniques, we recommend the following resources:
47 | - [Complete tutorial on ridge and lasso regression in python](https://www.analyticsvidhya.com/blog/2016/01/complete-tutorial-ridge-lasso-regression-python/): A broad tutorial explaining why we use regularization techniques, touching on the mathematics behind the algorithms and giving a few examples in python.
48 | - [An Introduction to Statistical Learning, Chapter 6.2](http://www-bcf.usc.edu/%7Egareth/ISL/ISLR%20Sixth%20Printing.pdf): A comprehensive explanation of both Lasso and Ridge and their application in the context of statistical learning.
49 |
50 |
--------------------------------------------------------------------------------
/3_module_linear_regression/images/LinearRegression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/3_module_linear_regression/images/LinearRegression.png
--------------------------------------------------------------------------------
/4_module_classification/4_0_twitter_web_scraping.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Introduction Into APIs\n",
8 | "\n",
9 | "\n",
10 | "\n",
11 | "
\n",
12 | "\n",
13 | "\n"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 3,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import tweepy\n",
23 | "import pandas as pd\n",
24 | "from textblob import TextBlob\n",
25 | "\n",
26 | "def pull_tweets (query, maxTweets = 1000, tweetsPerQuery = 100, max_id = -1, sinceId = None):\n",
27 | " '''\n",
28 | " Finds tweets (Comment, Date, Favorites, User) for a query string.\n",
29 | " Twitter API limit per query is 100. Combines these queries. \n",
30 | " '''\n",
31 | " \n",
32 | " # Fill with your own app details\n",
33 | " API_KEY = \"wCl2jflXpyWmYDM22iKFsaiS2\"\n",
34 | " API_SEC = \"f3DkCao13uCfA58bSQXahsDVNF5qzNztrgt3wB2RDDAV8zyXvT\"\n",
35 | " \n",
36 | " # connect to Twitter using authentication\n",
37 | " auth = tweepy.AppAuthHandler(API_KEY, API_SEC)\n",
38 | " # wait_on_rate_limit means that if the API limit is hit, \n",
39 | " # the pulls will wait until more calls are available\n",
40 | " api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)\n",
41 | " \n",
42 | " # Pull comments from Twitter\n",
43 | " # See https://developer.twitter.com/en/docs/tweets/timelines/guides/working-with-timelines\n",
44 | " tweetCount = 0\n",
45 | " data = pd.DataFrame() \n",
46 | " \n",
47 | " while tweetCount < maxTweets:\n",
48 | " if (max_id <= 0):\n",
49 | " new_tweets = api.search(q=query, count=tweetsPerQuery, \n",
50 | " since_id=sinceId)\n",
51 | " else:\n",
52 | " new_tweets = api.search(q=query, count=tweetsPerQuery,\n",
53 | " max_id=str(max_id - 1), \n",
54 | " since_id=sinceId)\n",
55 | " if not new_tweets:\n",
56 | " print(\"No more tweets found\")\n",
57 | " break\n",
58 | " \n",
59 | " tweetCount += len(new_tweets)\n",
60 | " print(\"Downloaded {0} tweets\".format(tweetCount))\n",
61 | " max_id = new_tweets[-1].id\n",
62 | " \n",
63 | " ## Create a dataset from the downloaded tweets\n",
64 | " new_data = pd.DataFrame([{\n",
65 | " 'Date': tweet.created_at,\n",
66 | " 'Comments': tweet.text, \n",
67 | " 'User': tweet.user.name, \n",
68 | " 'Favorites': tweet.favorite_count} \n",
69 | " for tweet in new_tweets])\n",
70 | " \n",
71 | " data = data.append(new_data)\n",
72 | " return data"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "Downloaded 100 tweets\n",
85 | "Downloaded 200 tweets\n",
86 | "Downloaded 300 tweets\n",
87 | "Downloaded 400 tweets\n",
88 | "Downloaded 500 tweets\n",
89 | "Downloaded 600 tweets\n",
90 | "Downloaded 700 tweets\n",
91 | "Downloaded 800 tweets\n",
92 | "Downloaded 900 tweets\n",
93 | "Downloaded 1000 tweets\n",
94 | "Downloaded 1100 tweets\n",
95 | "Downloaded 1188 tweets\n",
96 | "Downloaded 1288 tweets\n",
97 | "Downloaded 1388 tweets\n",
98 | "Downloaded 1488 tweets\n",
99 | "Downloaded 1588 tweets\n",
100 | "Downloaded 1688 tweets\n",
101 | "Downloaded 1788 tweets\n",
102 | "Downloaded 1888 tweets\n",
103 | "Downloaded 1988 tweets\n",
104 | "Downloaded 2088 tweets\n",
105 | "Downloaded 2188 tweets\n",
106 | "Downloaded 2288 tweets\n",
107 | "Downloaded 2388 tweets\n",
108 | "Downloaded 2487 tweets\n",
109 | "Downloaded 2558 tweets\n",
110 | "No more tweets found\n"
111 | ]
112 | }
113 | ],
114 | "source": [
115 | "data = pull_tweets(\"microfinance\", maxTweets = 5000)\n",
116 | "\n",
117 | "# In real life you might have test data with pre-labeled sentiments. We will use a simple word net algorithm to classify for now.\n",
118 | "data['Polarity'] = [TextBlob(comment).polarity for comment in data.Comments]\n",
119 | "\n",
120 | "data.loc[data['Polarity'] > 0, 'Sentiment'] = 'positive'\n",
121 | "data.loc[data['Polarity'] < 0, 'Sentiment'] = 'negative'\n",
122 | "data.loc[data['Polarity'] == 0, 'Sentiment'] = 'neutral'\n",
123 | "\n",
124 | "#convert data to a csv\n",
125 | "data.to_csv(\"microfinance_tweets.csv\", index = False)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": []
134 | }
135 | ],
136 | "metadata": {
137 | "kernelspec": {
138 | "display_name": "Python 3",
139 | "language": "python",
140 | "name": "python3"
141 | },
142 | "language_info": {
143 | "codemirror_mode": {
144 | "name": "ipython",
145 | "version": 3
146 | },
147 | "file_extension": ".py",
148 | "mimetype": "text/x-python",
149 | "name": "python",
150 | "nbconvert_exporter": "python",
151 | "pygments_lexer": "ipython3",
152 | "version": "3.6.8"
153 | }
154 | },
155 | "nbformat": 4,
156 | "nbformat_minor": 2
157 | }
158 |
--------------------------------------------------------------------------------
/4_module_classification/4_4_support_vector_machines.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "toc": true
7 | },
8 | "source": [
9 | "Table of Contents
\n",
10 | ""
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "Support Vector Machines\n",
18 | "------\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stderr",
28 | "output_type": "stream",
29 | "text": [
30 | "/Users/brian/anaconda3/envs/3.7/lib/python3.7/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n",
31 | " import pandas.util.testing as tm\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "import pandas as pd\n",
37 | "import seaborn as sns\n",
38 | "import numpy as np\n",
39 | "import matplotlib.pyplot as plt\n",
40 | "%matplotlib inline \n",
41 | "\n",
42 | "import nltk\n",
43 | "from sklearn.feature_extraction.text import CountVectorizer\n",
44 | "from sklearn.metrics import classification_report, confusion_matrix \n",
45 | "import re\n",
46 | "import string\n",
47 | "\n",
48 | "from sklearn.model_selection import train_test_split\n",
49 | "from sklearn import svm"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Prepare data"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 2,
62 | "metadata": {},
63 | "outputs": [
64 | {
65 | "data": {
66 | "text/html": [
67 | "\n",
68 | "\n",
81 | "
\n",
82 | " \n",
83 | " \n",
84 | " | \n",
85 | " Comments | \n",
86 | " Date | \n",
87 | " Favorites | \n",
88 | " User | \n",
89 | " Polarity | \n",
90 | " Sentiment | \n",
91 | "
\n",
92 | " \n",
93 | " \n",
94 | " \n",
95 | " 0 | \n",
96 | " RT @atmadiprayET: Here's why Janalakshmi Finan... | \n",
97 | " 3/22/2018 5:40 | \n",
98 | " 0 | \n",
99 | " Saloni Shukla | \n",
100 | " -0.100000 | \n",
101 | " negative | \n",
102 | "
\n",
103 | " \n",
104 | " 1 | \n",
105 | " RT @ecosmob: Ecosmob's #Mobility solutions for... | \n",
106 | " 3/22/2018 5:36 | \n",
107 | " 0 | \n",
108 | " Sindhav Bhageerath | \n",
109 | " -0.062500 | \n",
110 | " neutral | \n",
111 | "
\n",
112 | " \n",
113 | " 2 | \n",
114 | " Project have big future! Microfinance is belie... | \n",
115 | " 3/22/2018 5:27 | \n",
116 | " 0 | \n",
117 | " Konstantin #savedroidICO | \n",
118 | " 0.166667 | \n",
119 | " positive | \n",
120 | "
\n",
121 | " \n",
122 | " 3 | \n",
123 | " #Online #Banking- Yako Microfinance Bank prov... | \n",
124 | " 3/22/2018 5:21 | \n",
125 | " 0 | \n",
126 | " YakoMicrofinance | \n",
127 | " 0.500000 | \n",
128 | " positive | \n",
129 | "
\n",
130 | " \n",
131 | " 4 | \n",
132 | " MICROFINANCE EVENT: 3rd BoP Global Network Sum... | \n",
133 | " 3/22/2018 5:19 | \n",
134 | " 0 | \n",
135 | " MicroCapital | \n",
136 | " 0.045455 | \n",
137 | " positive | \n",
138 | "
\n",
139 | " \n",
140 | "
\n",
141 | "
"
142 | ],
143 | "text/plain": [
144 | " Comments Date \\\n",
145 | "0 RT @atmadiprayET: Here's why Janalakshmi Finan... 3/22/2018 5:40 \n",
146 | "1 RT @ecosmob: Ecosmob's #Mobility solutions for... 3/22/2018 5:36 \n",
147 | "2 Project have big future! Microfinance is belie... 3/22/2018 5:27 \n",
148 | "3 #Online #Banking- Yako Microfinance Bank prov... 3/22/2018 5:21 \n",
149 | "4 MICROFINANCE EVENT: 3rd BoP Global Network Sum... 3/22/2018 5:19 \n",
150 | "\n",
151 | " Favorites User Polarity Sentiment \n",
152 | "0 0 Saloni Shukla -0.100000 negative \n",
153 | "1 0 Sindhav Bhageerath -0.062500 neutral \n",
154 | "2 0 Konstantin #savedroidICO 0.166667 positive \n",
155 | "3 0 YakoMicrofinance 0.500000 positive \n",
156 | "4 0 MicroCapital 0.045455 positive "
157 | ]
158 | },
159 | "execution_count": 2,
160 | "metadata": {},
161 | "output_type": "execute_result"
162 | }
163 | ],
164 | "source": [
165 | "# Load data\n",
166 | "try: \n",
167 | " # Local version\n",
168 | " path = \"../data/\"\n",
169 | " filename = 'microfinance_tweets.csv'\n",
170 | " data = pd.read_csv(path+filename, encoding=\"ISO-8859-1\")\n",
171 | "except FileNotFoundError or ParserError: \n",
172 | " # If not local, get from remote repo. Helpful if using colab.\n",
173 | " url = 'https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good_data/master/microfinance_tweets.csv'\n",
174 | " data = pd.read_csv(url)\n",
175 | "\n",
176 | "# It is always good to visually inspect the data\n",
177 | "data.head()"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 3,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "data.loc[data['Sentiment'] == 'negative', 'Sentiment'] = -1\n",
187 | "data.loc[data['Sentiment'] == 'neutral', 'Sentiment'] = 0\n",
188 | "data.loc[data['Sentiment'] == 'positive', 'Sentiment'] = 1"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 4,
194 | "metadata": {},
195 | "outputs": [
196 | {
197 | "data": {
198 | "text/html": [
199 | "\n",
200 | "\n",
213 | "
\n",
214 | " \n",
215 | " \n",
216 | " | \n",
217 | " Comments | \n",
218 | " Date | \n",
219 | " Favorites | \n",
220 | " User | \n",
221 | " Polarity | \n",
222 | " Sentiment | \n",
223 | "
\n",
224 | " \n",
225 | " \n",
226 | " \n",
227 | " 0 | \n",
228 | " RT @atmadiprayET: Here's why Janalakshmi Finan... | \n",
229 | " 3/22/2018 5:40 | \n",
230 | " 0 | \n",
231 | " Saloni Shukla | \n",
232 | " -0.100000 | \n",
233 | " -1 | \n",
234 | "
\n",
235 | " \n",
236 | " 1 | \n",
237 | " RT @ecosmob: Ecosmob's #Mobility solutions for... | \n",
238 | " 3/22/2018 5:36 | \n",
239 | " 0 | \n",
240 | " Sindhav Bhageerath | \n",
241 | " -0.062500 | \n",
242 | " 0 | \n",
243 | "
\n",
244 | " \n",
245 | " 2 | \n",
246 | " Project have big future! Microfinance is belie... | \n",
247 | " 3/22/2018 5:27 | \n",
248 | " 0 | \n",
249 | " Konstantin #savedroidICO | \n",
250 | " 0.166667 | \n",
251 | " 1 | \n",
252 | "
\n",
253 | " \n",
254 | " 3 | \n",
255 | " #Online #Banking- Yako Microfinance Bank prov... | \n",
256 | " 3/22/2018 5:21 | \n",
257 | " 0 | \n",
258 | " YakoMicrofinance | \n",
259 | " 0.500000 | \n",
260 | " 1 | \n",
261 | "
\n",
262 | " \n",
263 | " 4 | \n",
264 | " MICROFINANCE EVENT: 3rd BoP Global Network Sum... | \n",
265 | " 3/22/2018 5:19 | \n",
266 | " 0 | \n",
267 | " MicroCapital | \n",
268 | " 0.045455 | \n",
269 | " 1 | \n",
270 | "
\n",
271 | " \n",
272 | "
\n",
273 | "
"
274 | ],
275 | "text/plain": [
276 | " Comments Date \\\n",
277 | "0 RT @atmadiprayET: Here's why Janalakshmi Finan... 3/22/2018 5:40 \n",
278 | "1 RT @ecosmob: Ecosmob's #Mobility solutions for... 3/22/2018 5:36 \n",
279 | "2 Project have big future! Microfinance is belie... 3/22/2018 5:27 \n",
280 | "3 #Online #Banking- Yako Microfinance Bank prov... 3/22/2018 5:21 \n",
281 | "4 MICROFINANCE EVENT: 3rd BoP Global Network Sum... 3/22/2018 5:19 \n",
282 | "\n",
283 | " Favorites User Polarity Sentiment \n",
284 | "0 0 Saloni Shukla -0.100000 -1 \n",
285 | "1 0 Sindhav Bhageerath -0.062500 0 \n",
286 | "2 0 Konstantin #savedroidICO 0.166667 1 \n",
287 | "3 0 YakoMicrofinance 0.500000 1 \n",
288 | "4 0 MicroCapital 0.045455 1 "
289 | ]
290 | },
291 | "execution_count": 4,
292 | "metadata": {},
293 | "output_type": "execute_result"
294 | }
295 | ],
296 | "source": [
297 | "data.head()"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 11,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "train, test = train_test_split(data, test_size=0.2, random_state=42)"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 12,
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "vectorizer = CountVectorizer()\n",
316 | "train_features = vectorizer.fit_transform(train['Comments'])\n",
317 | "test_features = vectorizer.transform(test['Comments'])"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {},
323 | "source": [
324 | "We have vectorized our data such that each index corresponds with a word as well as the frequency of that word in the text."
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 52,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "name": "stdout",
334 | "output_type": "stream",
335 | "text": [
336 | " (0, 585)\t2\n",
337 | " (0, 778)\t1\n",
338 | " (0, 788)\t1\n",
339 | " (0, 1301)\t1\n",
340 | " (0, 1302)\t1\n",
341 | " (0, 1940)\t1\n",
342 | " (0, 1994)\t1\n",
343 | " (0, 2088)\t1\n",
344 | " (0, 2230)\t1\n",
345 | " (0, 3106)\t1\n",
346 | " (0, 3381)\t2\n",
347 | " (0, 3573)\t1\n",
348 | " (0, 3770)\t2\n",
349 | " (0, 4161)\t1\n",
350 | " (0, 4516)\t1\n",
351 | " (0, 5257)\t1\n"
352 | ]
353 | }
354 | ],
355 | "source": [
356 | "print(train_features[0])"
357 | ]
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "metadata": {},
362 | "source": [
363 | "## Linear SVM"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "There are many types of SVMs, but we will first try a linear SVM, the most basic. This means that the decision boundary will be linear.
\n",
371 | "\n",
372 | "There is another input called decision_function_shape. The two options are one versus rest and one versus one. This relates to how the decision boundary separates points, whether it separates negative points from everyone else or negative points from neutral points, etc. (https://pythonprogramming.net/support-vector-machine-parameters-machine-learning-tutorial/). The default is one versus rest. One versus rest takes less computational power but may be thrown off by outliers and may not do well on imbalanced data sets, e.g. more of one class than another."
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": 36,
378 | "metadata": {},
379 | "outputs": [
380 | {
381 | "data": {
382 | "text/plain": [
383 | "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
384 | " decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',\n",
385 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
386 | " tol=0.001, verbose=False)"
387 | ]
388 | },
389 | "execution_count": 36,
390 | "metadata": {},
391 | "output_type": "execute_result"
392 | }
393 | ],
394 | "source": [
395 | "clf = svm.SVC(kernel='linear') \n",
396 | "clf.fit(train_features, train['Sentiment'])"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": 56,
402 | "metadata": {},
403 | "outputs": [],
404 | "source": [
405 | "y_train = clf.predict(train_features) "
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": 57,
411 | "metadata": {},
412 | "outputs": [
413 | {
414 | "name": "stdout",
415 | "output_type": "stream",
416 | "text": [
417 | "[[ 188 19 0]\n",
418 | " [ 6 1540 0]\n",
419 | " [ 0 0 839]]\n",
420 | " precision recall f1-score support\n",
421 | "\n",
422 | " -1 0.97 0.91 0.94 207\n",
423 | " 0 0.99 1.00 0.99 1546\n",
424 | " 1 1.00 1.00 1.00 839\n",
425 | "\n",
426 | "avg / total 0.99 0.99 0.99 2592\n",
427 | "\n"
428 | ]
429 | }
430 | ],
431 | "source": [
432 | "print(confusion_matrix(train['Sentiment'],y_train)) \n",
433 | "print(classification_report(train['Sentiment'],y_train)) "
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 58,
439 | "metadata": {},
440 | "outputs": [],
441 | "source": [
442 | "y_pred = clf.predict(test_features) "
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": 59,
448 | "metadata": {},
449 | "outputs": [
450 | {
451 | "name": "stdout",
452 | "output_type": "stream",
453 | "text": [
454 | "[[ 41 8 3]\n",
455 | " [ 8 386 3]\n",
456 | " [ 1 9 190]]\n",
457 | " precision recall f1-score support\n",
458 | "\n",
459 | " -1 0.82 0.79 0.80 52\n",
460 | " 0 0.96 0.97 0.96 397\n",
461 | " 1 0.97 0.95 0.96 200\n",
462 | "\n",
463 | "avg / total 0.95 0.95 0.95 649\n",
464 | "\n"
465 | ]
466 | }
467 | ],
468 | "source": [
469 | "print(confusion_matrix(test['Sentiment'],y_pred)) \n",
470 | "print(classification_report(test['Sentiment'],y_pred)) "
471 | ]
472 | },
473 | {
474 | "cell_type": "markdown",
475 | "metadata": {},
476 | "source": [
477 | "What do you think of the performance of the SVM? We can also adjust gamma to account for overfitting, but it doesn't look like we've overfit too much given the training and test performances."
478 | ]
479 | },
480 | {
481 | "cell_type": "markdown",
482 | "metadata": {},
483 | "source": [
484 | "Remember that support vectors are the data points that lie closest to the decision surface (or hyperplane). We can figure out what those data points are below for each class we are classifying, noting that we have three classes for negative, neutral, and positive."
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": 39,
490 | "metadata": {},
491 | "outputs": [
492 | {
493 | "name": "stdout",
494 | "output_type": "stream",
495 | "text": [
496 | " (0, 531)\t1.0\n",
497 | " (0, 1440)\t1.0\n",
498 | " (0, 2371)\t1.0\n",
499 | " (0, 2769)\t1.0\n",
500 | " (0, 2775)\t2.0\n",
501 | " (0, 2780)\t1.0\n",
502 | " (0, 3106)\t1.0\n",
503 | " (0, 3157)\t1.0\n",
504 | " (0, 3312)\t1.0\n",
505 | " (0, 3381)\t1.0\n",
506 | " (0, 3496)\t1.0\n",
507 | " (0, 3729)\t1.0\n",
508 | " (0, 4796)\t1.0\n",
509 | " (0, 4864)\t1.0\n",
510 | " (0, 4964)\t1.0\n",
511 | " (0, 5021)\t1.0\n",
512 | " (0, 5059)\t1.0\n",
513 | " (0, 5092)\t1.0\n",
514 | " (0, 5156)\t2.0\n",
515 | " (0, 5638)\t1.0\n",
516 | " (1, 374)\t2.0\n",
517 | " (1, 585)\t1.0\n",
518 | " (1, 1885)\t1.0\n",
519 | " (1, 2484)\t2.0\n",
520 | " (1, 2485)\t1.0\n",
521 | " :\t:\n",
522 | " (1299, 3729)\t1.0\n",
523 | " (1299, 3861)\t1.0\n",
524 | " (1299, 3999)\t1.0\n",
525 | " (1299, 4102)\t1.0\n",
526 | " (1299, 5156)\t2.0\n",
527 | " (1299, 5370)\t1.0\n",
528 | " (1300, 614)\t1.0\n",
529 | " (1300, 934)\t1.0\n",
530 | " (1300, 1213)\t1.0\n",
531 | " (1300, 1401)\t1.0\n",
532 | " (1300, 1473)\t1.0\n",
533 | " (1300, 1518)\t1.0\n",
534 | " (1300, 1684)\t1.0\n",
535 | " (1300, 1925)\t1.0\n",
536 | " (1300, 2097)\t1.0\n",
537 | " (1300, 2501)\t1.0\n",
538 | " (1300, 3106)\t1.0\n",
539 | " (1300, 3487)\t1.0\n",
540 | " (1300, 4358)\t1.0\n",
541 | " (1300, 4913)\t1.0\n",
542 | " (1300, 5104)\t1.0\n",
543 | " (1300, 5156)\t1.0\n",
544 | " (1300, 5158)\t1.0\n",
545 | " (1300, 5573)\t1.0\n",
546 | " (1300, 5627)\t1.0\n"
547 | ]
548 | }
549 | ],
550 | "source": [
551 | "print(clf.support_vectors_)"
552 | ]
553 | },
554 | {
555 | "cell_type": "markdown",
556 | "metadata": {},
557 | "source": [
558 | "We can check for the number of points in each class using another function. Here we see that most support vectors are in our last class, the positive class."
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": 40,
564 | "metadata": {},
565 | "outputs": [
566 | {
567 | "data": {
568 | "text/plain": [
569 | "array([152, 713, 436])"
570 | ]
571 | },
572 | "execution_count": 40,
573 | "metadata": {},
574 | "output_type": "execute_result"
575 | }
576 | ],
577 | "source": [
578 | "clf.n_support_"
579 | ]
580 | },
581 | {
582 | "cell_type": "markdown",
583 | "metadata": {},
584 | "source": [
585 | "We can also find the support vector in our original data using the indices provided for us with clf.support_"
586 | ]
587 | },
588 | {
589 | "cell_type": "code",
590 | "execution_count": 41,
591 | "metadata": {},
592 | "outputs": [
593 | {
594 | "data": {
595 | "text/plain": [
596 | "array([ 8, 21, 34, ..., 2573, 2585, 2587])"
597 | ]
598 | },
599 | "execution_count": 41,
600 | "metadata": {},
601 | "output_type": "execute_result"
602 | }
603 | ],
604 | "source": [
605 | "clf.support_"
606 | ]
607 | },
608 | {
609 | "cell_type": "code",
610 | "execution_count": 49,
611 | "metadata": {},
612 | "outputs": [
613 | {
614 | "name": "stdout",
615 | "output_type": "stream",
616 | "text": [
617 | " (0, 531)\t1\n",
618 | " (0, 1440)\t1\n",
619 | " (0, 2371)\t1\n",
620 | " (0, 2769)\t1\n",
621 | " (0, 2775)\t2\n",
622 | " (0, 2780)\t1\n",
623 | " (0, 3106)\t1\n",
624 | " (0, 3157)\t1\n",
625 | " (0, 3312)\t1\n",
626 | " (0, 3381)\t1\n",
627 | " (0, 3496)\t1\n",
628 | " (0, 3729)\t1\n",
629 | " (0, 4796)\t1\n",
630 | " (0, 4864)\t1\n",
631 | " (0, 4964)\t1\n",
632 | " (0, 5021)\t1\n",
633 | " (0, 5059)\t1\n",
634 | " (0, 5092)\t1\n",
635 | " (0, 5156)\t2\n",
636 | " (0, 5638)\t1\n"
637 | ]
638 | }
639 | ],
640 | "source": [
641 | "print(train_features[8])"
642 | ]
643 | },
644 | {
645 | "cell_type": "markdown",
646 | "metadata": {},
647 | "source": [
648 | "## Non-linear SVM"
649 | ]
650 | },
651 | {
652 | "cell_type": "markdown",
653 | "metadata": {},
654 | "source": [
655 | "We can also check different kernel types, with rbf being Gaussian and sigmoid being similar to the sigmoid function in logistic regression. A visualization is simplest to understand below:"
656 | ]
657 | },
658 | {
659 | "cell_type": "code",
660 | "execution_count": 30,
661 | "metadata": {},
662 | "outputs": [],
663 | "source": [
664 | "clf = svm.SVC(kernel='rbf') \n",
665 | "clf.fit(train_features, train['Sentiment'])\n",
666 | "\n",
667 | "y_pred = clf.predict(test_features) "
668 | ]
669 | },
670 | {
671 | "cell_type": "code",
672 | "execution_count": 31,
673 | "metadata": {},
674 | "outputs": [
675 | {
676 | "name": "stdout",
677 | "output_type": "stream",
678 | "text": [
679 | "[[ 0 52 0]\n",
680 | " [ 0 397 0]\n",
681 | " [ 0 200 0]]\n",
682 | " precision recall f1-score support\n",
683 | "\n",
684 | " -1 0.00 0.00 0.00 52\n",
685 | " 0 0.61 1.00 0.76 397\n",
686 | " 1 0.00 0.00 0.00 200\n",
687 | "\n",
688 | "avg / total 0.37 0.61 0.46 649\n",
689 | "\n"
690 | ]
691 | },
692 | {
693 | "name": "stderr",
694 | "output_type": "stream",
695 | "text": [
696 | "C:\\Users\\Lina\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
697 | " 'precision', 'predicted', average, warn_for)\n"
698 | ]
699 | }
700 | ],
701 | "source": [
702 | "print(confusion_matrix(test['Sentiment'],y_pred)) \n",
703 | "print(classification_report(test['Sentiment'],y_pred)) "
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": 34,
709 | "metadata": {},
710 | "outputs": [],
711 | "source": [
712 | "clf = svm.SVC(kernel='sigmoid') \n",
713 | "clf.fit(train_features, train['Sentiment'])\n",
714 | "\n",
715 | "y_pred = clf.predict(test_features) "
716 | ]
717 | },
718 | {
719 | "cell_type": "code",
720 | "execution_count": 35,
721 | "metadata": {},
722 | "outputs": [
723 | {
724 | "name": "stdout",
725 | "output_type": "stream",
726 | "text": [
727 | "[[ 0 52 0]\n",
728 | " [ 0 397 0]\n",
729 | " [ 0 200 0]]\n",
730 | " precision recall f1-score support\n",
731 | "\n",
732 | " -1 0.00 0.00 0.00 52\n",
733 | " 0 0.61 1.00 0.76 397\n",
734 | " 1 0.00 0.00 0.00 200\n",
735 | "\n",
736 | "avg / total 0.37 0.61 0.46 649\n",
737 | "\n"
738 | ]
739 | },
740 | {
741 | "name": "stderr",
742 | "output_type": "stream",
743 | "text": [
744 | "C:\\Users\\Lina\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
745 | " 'precision', 'predicted', average, warn_for)\n"
746 | ]
747 | }
748 | ],
749 | "source": [
750 | "print(confusion_matrix(test['Sentiment'],y_pred)) \n",
751 | "print(classification_report(test['Sentiment'],y_pred)) "
752 | ]
753 | },
754 | {
755 | "cell_type": "markdown",
756 | "metadata": {},
757 | "source": [
758 | "It looks like the linear SVM performs best on this model from both a precision and recall perspective. Remember that precision is the accuracy of the predictions and recall is how much of the true positive space we are capturing. \n",
759 | "\n",
760 | "What does this mean about our underlying data?"
761 | ]
762 | },
763 | {
764 | "cell_type": "markdown",
765 | "metadata": {},
766 | "source": [
767 | "References\n",
768 | "-------\n",
769 | "\n",
770 | "- https://stackabuse.com/implementing-svm-and-kernel-svm-with-pythons-scikit-learn/\n",
771 | "- https://jakevdp.github.io/PythonDataScienceHandbook/05.07-support-vector-machines.html\n",
772 | "- https://gist.github.com/WittmannF/60680723ed8dd0cb993051a7448f7805"
773 | ]
774 | },
775 | {
776 | "cell_type": "markdown",
777 | "metadata": {},
778 | "source": [
779 | "
\n",
780 | "
\n",
781 | "
\n",
782 | "\n",
783 | "----"
784 | ]
785 | }
786 | ],
787 | "metadata": {
788 | "kernelspec": {
789 | "display_name": "Python 3",
790 | "language": "python",
791 | "name": "python3"
792 | },
793 | "language_info": {
794 | "codemirror_mode": {
795 | "name": "ipython",
796 | "version": 3
797 | },
798 | "file_extension": ".py",
799 | "mimetype": "text/x-python",
800 | "name": "python",
801 | "nbconvert_exporter": "python",
802 | "pygments_lexer": "ipython3",
803 | "version": "3.7.7"
804 | },
805 | "toc": {
806 | "base_numbering": 1,
807 | "nav_menu": {},
808 | "number_sections": false,
809 | "sideBar": false,
810 | "skip_h1_title": false,
811 | "title_cell": "Table of Contents",
812 | "title_sidebar": "Contents",
813 | "toc_cell": true,
814 | "toc_position": {},
815 | "toc_section_display": true,
816 | "toc_window_display": false
817 | }
818 | },
819 | "nbformat": 4,
820 | "nbformat_minor": 2
821 | }
822 |
--------------------------------------------------------------------------------
/4_module_classification/README.md:
--------------------------------------------------------------------------------
1 | # Module 4: Classification
2 | ------
3 |
4 | 
--------------------------------------------------------------------------------
/4_module_classification/images/intro_to_ml.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/4_module_classification/images/intro_to_ml.png
--------------------------------------------------------------------------------
/5_module_decision_trees/README.md:
--------------------------------------------------------------------------------
1 | Module 5: Decision Trees and Random Forests
2 | ======
3 |
4 | Welcome! We'll be exploring Decision Trees and Random Forests - a very powerful way to model data!
5 |
6 | Topic overview
7 | ----
8 |
9 | A Decision Tree is a very powerful model which can be used alone or as the basis for other powerful models such as Random Forest and Gradient Boosting. At its simplest, a decision tree asks a series of questions about the features to predict what the outcome should be. Decision Trees also have the added advantage that they can be used for both regression and classification.
10 |
11 | A singular decision tree has the tendency to overfit on training data and, to counteract this, Bagging (or Bootstrap aggregating) is used. Bagging is an **ensemble approach** where N random subsamples of the dataset are made using selection with replacement and individual decision trees are trained on each subsample. Then the final prediction is the average of all predictions from the N decision trees.
12 |
13 |
14 |
15 | This is improved upon further by limiting the features considered at each split to a random subset of features. This is known as a Random Forest.
16 |
17 | In this module, we will work our way incrementally from Decision Trees, through Bagging, to Random Forests, evaluating the performance at each step. We will also look into the different parameters for each of these models and investigate which features are the most important.
18 |
19 | Resources
20 | ----
21 |
22 | Firstly, refer to your lecture notes as they will explain the fundamentals covered here in reference to the Kiva dataset we are using!
23 | For additional help, we find the following resources to be very useful!
24 | - [Visual Intro to Machine Learning](http://www.r2d3.us/visual-intro-to-machine-learning-part-1/):
25 | This is an awesome visualization of how a decision tree works step by step. Take the time to go through this and you should have a good fundamental understanding of what's happening under the hood!
26 | - [A complete tutorial on tree based modeling](https://www.analyticsvidhya.com/blog/2016/04/complete-tutorial-tree-based-modeling-scratch-in-python/): A comprehensive tutorial covering the hows and whys of using tree based models including decision trees, bagging, random forest and boosting.
27 |
28 | Advanced topics
29 | ----
30 |
31 | ### Gradient Tree Boosting
32 | Gradient Tree Boosting is an alternative method of using decision trees which lowers the variance and bias. Unlike the Random Forest algorithm, which trains multiple decision trees independently and then averages the result, Boosting works by incrementally growing multiple trees, where each tree is trained on the errors from the previous tree.
33 |
34 | For more information checkout these resources:
35 |
36 | - [An Introduction to Statistical Learning, Chapter 8.2.3](http://www-bcf.usc.edu/%7Egareth/ISL/ISLR%20Sixth%20Printing.pdf): Following on from Decision Trees and Random Forests, the chapter on Boosting discussed this model in an academic and tree-model context.
37 | - [A kaggle master explains gradient boosting](http://blog.kaggle.com/2017/01/23/a-kaggle-master-explains-gradient-boosting/): A fun and easy to read explanation of how gradient boosting works and why it is so great!
38 | - [A guide to gradient boosting trees with XGBoost in Python](https://jessesw.com/XG-Boost/): A comprehensive tutorial using XGBoost for income classification. A good opportunity to brush up on python and EDA skills too!
39 |
40 |
--------------------------------------------------------------------------------
/5_module_decision_trees/images/DecisionTreeExample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/5_module_decision_trees/images/DecisionTreeExample.png
--------------------------------------------------------------------------------
/5_module_decision_trees/images/bagging.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/5_module_decision_trees/images/bagging.png
--------------------------------------------------------------------------------
/6_module_unsupervised_learning/README.md:
--------------------------------------------------------------------------------
1 | Module 6: Introduction to Unsupervised Learning
2 | ======
3 |
4 | Welcome! We are going to delve into the topic of unsupervised learning!
5 |
6 | Goals
7 | ---
8 | 1. Give a general introduction to unsupervised learning.
9 | 2. Use k-means clustering as unsupervised learning technique.
10 | 3. Find clusters with k-means algorithm.
11 | 4. Evaluate our results with the Elbow method.
12 |
13 | Topic overview
14 | ----
15 | Unsupervised Learning is the process of identifying patterns in a dataset. Identifying patterns is often an early step in understanding data. Unsupervised learning methods are a set of techniques designed to explore and find "hidden structure" rather than predict outcomes.
16 |
17 | Unsupervised learning does not require labeled data and therefore works for a broader range of data. In fact, most data in the world is unlabeled. However, since there are no labels / correct answers, there is not always clear feedback to validate that the results are correct.
18 |
19 | There are two main techniques in the domain of unsupervised learning:
20 |
21 | **Dimensionality Reduction**
22 | Some datasets have too many features causing problems with over-fitting, slow model fitting time and issues with metric interpretability (look up the Curse of Dimensionality!). For this reason, we look for methods to reduce the number of features used to train the model while maintaining most of the variance/signal in the data.
23 |
24 | **Clustering**
25 | Clustering is relatively self explanatory. These are methods which divide the dataset into subgroups based on similar characteristics. These sub-groups can be then be used in further supervised learning algorithms or act as an intuitive way to understand the natural subsets in your dataset. Clustering is sometimes referred to as data segmentation or data partitioning.
26 |
27 | In this module, we will focus on the K-Means Clustering algorithm, how it works and how to evaluate its performance.
28 |
29 | Resources
30 | ----
31 |
32 | Firstly, refer to your lecture notes as they will explain the fundamentals covered here in reference to the Kiva dataset we are using!
33 | For additional help, we find the following resources to be very useful.
34 |
35 | - [K-Means handout from Stanford](http://stanford.edu/~cpiech/cs221/handouts/kmeans.html/):
36 | From the computer science course at Stanford University, this is a handout giving an overview of the k-means algorithm, sample code and it provides a bit more detail on how clustering can be improved.
37 | - [Interactive introduction to dimensionality reduction](https://www.kaggle.com/arthurtok/interactive-intro-to-dimensionality-reduction): A comprehensive introduction to three dimensionality reduction methods, PCA, LDA and t-SNE from kaggle. Interactive examples with code are provided so that you can see the impact of these methods on the features.
38 |
39 |
40 | Advanced topics
41 | ----
42 |
43 | ### Hierarchical Clustering
44 | Hierarchical Clustering is a more complex method to cluster data points and evaluate the clusters. Unlike K-Means, we do not need to enforce the number of clusters to look for in hierarchical clustering. The algorithm incrementally groups data points together to create clusters, starting with every data point as its own cluster, until all the data is in a single cluster. The results can be displayed in a diagram called a dendrogram, which allows us to evaluate the possible combinations of clusters.
45 |
46 | For more information checkout these resources:
47 |
48 | - [Hierarchical Clustering Analysis](https://afit-r.github.io/hc_clustering): A thorough introduction to Hierarchical Clustering with examples in R. Although it does not use python to create the analysis, this is an excellent resource to understand the underlying principles of hierarchical clustering and to become familiar with dendrograms.
49 | - [Unsupervised Machine Learning: Hierarchical Clustering](https://pythonprogramming.net/hierarchical-clustering-machine-learning-python-scikit-learn/): Although the text is a bit dense and focuses more on the context for hierarchical clustering, this article provides great examples using python and the scikit-learn library.
50 | - [Hierarchical Clustering Analysis](http://84.89.132.1/~michael/stanford/maeb7.pdf): From Stanford University, a step-by-step pdf guide to hierarchical clustering, covering how it works, how to find the 'right' number of clusters and evaluating the validity of the clusters.
--------------------------------------------------------------------------------
/6_module_unsupervised_learning/images/clustering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/6_module_unsupervised_learning/images/clustering.png
--------------------------------------------------------------------------------
/6_module_unsupervised_learning/images/k_means.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/6_module_unsupervised_learning/images/k_means.png
--------------------------------------------------------------------------------
/7_module_advanced_topics/7_2_image_processing_with_keras.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "toc": true
7 | },
8 | "source": [
9 | "Table of Contents
\n",
10 | ""
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "__Disclaimer__:\n",
18 | "\n",
19 | "This lesson could be significantly improved. It does not run as is."
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "# Image processing with Keras\n",
27 | "\n",
28 | "Keras is a deep learning library built on top of TensorFlow. We can use it to process our image data into arrays. Oftentimes, we use deep learning to do image processing. In this example, I will use naive bayes to later show how deep learning will do much better than naive bayes."
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "### Get images"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 9,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "ename": "WebDriverException",
45 | "evalue": "Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home\n",
46 | "output_type": "error",
47 | "traceback": [
48 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
49 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
50 | "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mcmd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommand_line_args\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m self.process = subprocess.Popen(cmd, env=self.env,\n\u001b[0m\u001b[1;32m 73\u001b[0m \u001b[0mclose_fds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mplatform\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'Windows'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
51 | "\u001b[0;32m~/anaconda3/lib/python3.8/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)\u001b[0m\n\u001b[1;32m 857\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 858\u001b[0;31m self._execute_child(args, executable, preexec_fn, close_fds,\n\u001b[0m\u001b[1;32m 859\u001b[0m \u001b[0mpass_fds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcwd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
52 | "\u001b[0;32m~/anaconda3/lib/python3.8/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)\u001b[0m\n\u001b[1;32m 1705\u001b[0m \u001b[0merr_msg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrerror\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno_num\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1706\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno_num\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_msg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1707\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_msg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
53 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'chromedriver'",
54 | "\nDuring handling of the above exception, another exception occurred:\n",
55 | "\u001b[0;31mWebDriverException\u001b[0m Traceback (most recent call last)",
56 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0murl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"https://www.google.co.in/search?q=\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0msearchterm\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"&source=lnms&tbm=isch\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mbrowser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mChrome\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Make sure ChromeDriver is intalled https://chromedriver.chromium.org/getting-started\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0mbrowser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
57 | "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/selenium/webdriver/chrome/webdriver.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, executable_path, port, options, service_args, desired_capabilities, service_log_path, chrome_options, keep_alive)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mservice_args\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mservice_args\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m log_path=service_log_path)\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
58 | "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mENOENT\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 81\u001b[0;31m raise WebDriverException(\n\u001b[0m\u001b[1;32m 82\u001b[0m \"'%s' executable needs to be in PATH. %s\" % (\n\u001b[1;32m 83\u001b[0m os.path.basename(self.path), self.start_error_message)\n",
59 | "\u001b[0;31mWebDriverException\u001b[0m: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "from selenium import webdriver\n",
65 | "import os\n",
66 | "import time\n",
67 | "import requests\n",
68 | "\n",
69 | "# Set up Google search url with term\n",
70 | "searchterm = 'memes'\n",
71 | "url = \"https://www.google.co.in/search?q=\"+searchterm+\"&source=lnms&tbm=isch\"\n",
72 | "\n",
73 | "browser = webdriver.Chrome() # Make sure ChromeDriver is intalled https://chromedriver.chromium.org/getting-started\n",
74 | "browser.get(url)\n",
75 | "\n",
76 | "browser.execute_script(\"window.scrollBy(0,10000)\")\n",
77 | "\n",
78 | "elements = browser.find_elements_by_class_name('rg_i')\n",
79 | "print(len(elements))\n",
80 | "\n",
81 | "# Set up variable to count successful downloads\n",
82 | "counter = 0\n",
83 | "succounter = 0\n",
84 | "\n",
85 | "# Makes the folder if it doesn't already exist\n",
86 | "if not os.path.exists(searchterm):\n",
87 | " os.mkdir(searchterm)\n",
88 | "\n",
89 | "for x in elements:\n",
90 | "\tx.click()\n",
91 | "\ttime.sleep(1)\n",
92 | "\telement = browser.find_elements_by_class_name('v4dQwb')\n",
93 | "\n",
94 | "\tprint(\"Total Count:\", counter)\n",
95 | "\tprint(\"Succsessful Count:\", succounter)\n",
96 | "\t\n",
97 | "\tif counter == 0:\n",
98 | "\t\timg = element[0].find_element_by_class_name('n3VNCb')\n",
99 | "\telse:\n",
100 | "\t\timg = element[1].find_element_by_class_name('n3VNCb')\n",
101 | "\n",
102 | "\t# Saves the image\n",
103 | "\ttry:\n",
104 | "\n",
105 | "\t\tr = requests.get(img.get_attribute(\"src\"))\n",
106 | "\t\t\n",
107 | "\t\tif r.status_code == 200:\n",
108 | "\t\t\twith open(searchterm+\"/image_\"+str(counter)+\".png\", 'wb') as f:\n",
109 | "\t\t\t\tf.write(r.content)\n",
110 | "\t\t\t\n",
111 | "\t\tsuccounter = succounter + 1\n",
112 | "\t\t\n",
113 | "\texcept Exception as e:\n",
114 | "\t\tprint(\"could not load : \"+img)\n",
115 | "\t\tprint(e)\n",
116 | "\n",
117 | "\tcounter = counter + 1\n",
118 | "\t \n",
119 | "print(succounter, \"pictures succesfully downloaded\")\n",
120 | "browser.close()"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "import os\n",
130 | "from subprocess import check_output\n",
131 | "import sys\n",
132 | "from time import time, sleep\n",
133 | "\n",
134 | "import numpy as np \n",
135 | "import pandas as pd \n",
136 | "import seaborn as sns\n",
137 | "\n",
138 | "from IPython.display import display\n",
139 | "from IPython.display import Image as _Imgdis\n",
140 | "from PIL import Image\n",
141 | "from scipy import ndimage\n",
142 | "\n",
143 | "from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img\n",
144 | "\n",
145 | "from sklearn.model_selection import train_test_split\n",
146 | "from sklearn.naive_bayes import MultinomialNB"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "### Process images as arrays"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 10,
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "ename": "FileNotFoundError",
163 | "evalue": "[Errno 2] No such file or directory: 'memes'",
164 | "output_type": "error",
165 | "traceback": [
166 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
167 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
168 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# Create a list of files in the folder specified\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mmeme_files\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mf\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlistdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfolder_1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misfile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfolder_1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Working with {0} images\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmeme_files\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
169 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'memes'"
170 | ]
171 | }
172 | ],
173 | "source": [
174 | "# Create a dataset of memes\n",
175 | "folder_1 = \"memes\"\n",
176 | "\n",
177 | "# Create a list of files in the folder specified\n",
178 | "meme_files = [f for f in os.listdir(folder_1) if os.path.isfile(os.path.join(folder_1, f))]\n",
179 | "\n",
180 | "print(\"Working with {0} images\".format(len(meme_files)))\n",
181 | "print(\"Image examples: \")\n",
182 | "\n",
183 | "# Print two examples using display(_Imgdis()), which can read the image files\n",
184 | "for i in range(150, 152):\n",
185 | " print(meme_files[i])\n",
186 | " display(_Imgdis(filename=folder_1 + \"/\" + meme_files[i], width=240, height=320))"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "# Create also a dataset of non-memes, pulled from this dataset: \n",
196 | "# http://press.liacs.nl/mirflickr/mirdownload.html\n",
197 | "\n",
198 | "folder_0 = \"non_memes\"\n",
199 | "\n",
200 | "# Create a list of files in the folder specified\n",
201 | "non_meme_files = [f for f in os.listdir(folder_0) if os.path.isfile(os.path.join(folder_0, f))]\n",
202 | "\n",
203 | "print(\"Working with {0} images\".format(len(non_meme_files)))\n",
204 | "print(\"Image examples: \")\n",
205 | "\n",
206 | "# Print two examples using display(_Imgdis()), which can read the image files\n",
207 | "for i in range(150, 152):\n",
208 | " print(non_meme_files[i])\n",
209 | " display(_Imgdis(filename=folder_0 + \"/\" + non_meme_files[i], width=240, height=320))"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "# Prepare arrays for data to be saved in image processing for loops\n",
219 | "y_data = [1]*len(meme_files) + [0]*len(non_meme_files)\n",
220 | "meme_colors = np.ndarray(shape = (len(meme_files), 3), dtype=np.float32)\n",
221 | "non_meme_colors = np.ndarray(shape = (len(non_meme_files), 3), dtype=np.float32)\n",
222 | "image_size_areas = []\n",
223 | "\n",
224 | "# Dimensions to standardize the images to\n",
225 | "image_height = 120\n",
226 | "image_width = 160\n",
227 | "channels = 3\n",
228 | "\n",
229 | "# Make a 3-layered array (3 for RGB or number of channels)\n",
230 | "dataset = np.ndarray(shape=(len(y_data), channels, image_height, image_width), dtype=np.float32)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "# Add an array of each meme image to our dataset (note this code can be improved by creating a read_image function instead of repeating the for loop twice)\n",
240 | "i = 0\n",
241 | "for j in range(len(meme_files)):\n",
242 | " img = load_img(folder_1 + \"/\" + meme_files[j]) # this is a PIL image\n",
243 | " # Save initial dimensions before resizing\n",
244 | " image_size_areas.append(img.size[0] * img.size[1])\n",
245 | " img = img.resize((image_height, image_width))\n",
246 | " # Convert to numpy array and save colors\n",
247 | " x = img_to_array(img)\n",
248 | " meme_colors[j] = [x[0].sum(), x[1].sum(), x[2].sum()]\n",
249 | " x = x.reshape((channels, image_height, image_width))\n",
250 | " try:\n",
251 | " dataset[i] = x\n",
252 | " i += 1\n",
253 | " if i % 250 == 0:\n",
254 | " print(\"%d images to array\" % i)\n",
255 | " except Exception as e:\n",
256 | " i += 1\n",
257 | " print(\"failed on %d\" %i, e)\n",
258 | "\n",
259 | "# Add an array of each non-meme image to our dataset\n",
260 | "for k in range(len(non_meme_files)):\n",
261 | " img = load_img(folder_0 + \"/\" + non_meme_files[k]) # this is a PIL image\n",
262 | " # Save initial dimensions before resizing\n",
263 | " image_size_areas.append(img.size[0] * img.size[1])\n",
264 | " img = img.resize((image_height, image_width))\n",
265 | " # Convert to numpy array and save colors\n",
266 | " x = img_to_array(img)\n",
267 | " non_meme_colors[k] = [x[0].sum(), x[1].sum(), x[2].sum()]\n",
268 | " x = x.reshape((channels, image_height, image_width))\n",
269 | " try:\n",
270 | " dataset[i] = x\n",
271 | " i += 1\n",
272 | " if i % 250 == 0:\n",
273 | " print(\"%d images to array\" %i)\n",
274 | " except Exception as e:\n",
275 | " i += 1\n",
276 | " print(\"failed on %d\" %i, e)"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "### Exploratory analysis"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "# Determine the meme versus non-meme split\n",
293 | "print(len(y_data))\n",
294 | "print(\"memes:\", sum(y_data)/len(y_data), \"non-memes:\", (len(y_data)-sum(y_data))/len(y_data))"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": null,
300 | "metadata": {},
301 | "outputs": [],
302 | "source": [
303 | "# Plot the distribution of sizes before the images were cropped\n",
304 | "pd.Series(data = image_size_areas).hist()"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "# Plot a histogram of colors for memes\n",
314 | "sns.distplot(meme_colors[:,0], color = 'r')\n",
315 | "sns.distplot(meme_colors[:,1], color = 'g')\n",
316 | "sns.distplot(meme_colors[:,2], color = 'b')"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "metadata": {},
323 | "outputs": [],
324 | "source": [
325 | "# Plot a histogram of colors for non-memes\n",
326 | "sns.distplot(non_meme_colors[:,0], color = 'r')\n",
327 | "sns.distplot(non_meme_colors[:,1], color = 'g')\n",
328 | "sns.distplot(non_meme_colors[:,2], color = 'b')"
329 | ]
330 | },
331 | {
332 | "cell_type": "markdown",
333 | "metadata": {},
334 | "source": [
335 | "### Build the model"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": [
344 | "#Model will expect a 2-D array, so we can flatten a 4-D array to a 2-D one\n",
345 | "dataset_flattened = dataset.reshape(len(y_data) * channels, image_height * image_width)\n",
346 | "y_data_flattened = [1]*len(meme_files)*3 + [0]*len(non_meme_files)*3"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": null,
352 | "metadata": {},
353 | "outputs": [],
354 | "source": [
355 | "print(len(dataset_flattened), len(dataset)*3)"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "#Split the array data into train and test sets\n",
365 | "X_train, X_test, y_train, y_test = train_test_split(dataset_flattened, y_data_flattened, test_size=0.2, random_state=33)"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 | "#Test the randomness of the y_train and y_test set\n",
375 | "print(sum(y_train)/len(y_train), sum(y_test)/len(y_test))"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "#Train your data set using multinomial NB from sklearn library\n",
385 | "nb = MultinomialNB()\n",
386 | "nb.fit(X_train, y_train)"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "#Test your data set on your test data\n",
396 | "preds = nb.predict(X_test)\n",
397 | "\n",
398 | "#Print the accuracy of your model\n",
399 | "accuracy = (preds == y_test)\n",
400 | "'Accuracy : {:.2%}'.format(accuracy.sum() / len(accuracy))"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "metadata": {},
406 | "source": [
407 |     "This is pretty bad performance, considering that the accuracy achieved by simply assigning every picture to the meme class would be ~40%."
408 | ]
409 | },
410 | {
411 | "cell_type": "markdown",
412 | "metadata": {},
413 | "source": [
414 | "Source: https://www.kaggle.com/lgmoneda/from-image-files-to-numpy-arrays"
415 | ]
416 | }
417 | ],
418 | "metadata": {
419 | "kernelspec": {
420 | "display_name": "Python 3",
421 | "language": "python",
422 | "name": "python3"
423 | },
424 | "language_info": {
425 | "codemirror_mode": {
426 | "name": "ipython",
427 | "version": 3
428 | },
429 | "file_extension": ".py",
430 | "mimetype": "text/x-python",
431 | "name": "python",
432 | "nbconvert_exporter": "python",
433 | "pygments_lexer": "ipython3",
434 | "version": "3.8.8"
435 | },
436 | "toc": {
437 | "base_numbering": 1,
438 | "nav_menu": {},
439 | "number_sections": false,
440 | "sideBar": false,
441 | "skip_h1_title": false,
442 | "title_cell": "Table of Contents",
443 | "title_sidebar": "Contents",
444 | "toc_cell": true,
445 | "toc_position": {},
446 | "toc_section_display": true,
447 | "toc_window_display": false
448 | }
449 | },
450 | "nbformat": 4,
451 | "nbformat_minor": 2
452 | }
453 |
--------------------------------------------------------------------------------
/7_module_advanced_topics/get_more_100_pictures.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import shutil
import requests

import json
import os
import argparse

# Build the Google Images search URL for the chosen search term.
searchterm = 'memes'
url = "https://www.google.co.in/search?q=" + searchterm + "&source=lnms&tbm=isch"

# Need to download Chromedriver, insert path to chromedriver inside parentheses in following line
browser = webdriver.Chrome('C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe')
browser.get(url)
# Browser-like User-Agent so image hosts don't reject the download requests.
header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}

# Counters: total download attempts and successful downloads.
counter = 0
succounter = 0

# Makes the folder if it doesn't already exist
if not os.path.exists(searchterm):
    os.mkdir(searchterm)

try:
    # Scroll the results page repeatedly so more thumbnails get lazy-loaded
    # BEFORE scanning the DOM. (The original code ran the download loop inside
    # this scroll loop, re-downloading every image already on the page on each
    # of the 2000 iterations and saving thousands of duplicate files.)
    for _ in range(2000):
        browser.execute_script("window.scrollBy(0,10000)")

    # Each rg_meta div holds a JSON blob whose "ou" key is the original image
    # URL and "ity" its file type.
    for x in browser.find_elements_by_xpath('//div[contains(@class,"rg_meta")]'):
        counter = counter + 1
        print("Total Count:", counter)
        print("Successful Count:", succounter)

        # Parse the metadata JSON once instead of twice.
        meta = json.loads(x.get_attribute('innerHTML'))
        img = meta["ou"]
        imgtype = meta["ity"]
        print("URL:", img, imgtype)

        # Saves the image. stream=True lets us copy the raw body straight to
        # disk without holding the whole file in memory; the timeout stops one
        # dead host from hanging the whole run.
        try:
            r = requests.get(img, stream=True, headers=header, timeout=10)
            if r.status_code == 200:
                with open(searchterm + "/image_" + str(counter) + ".png", 'wb') as f:
                    # Decompress gzip/deflate transfer encoding when reading r.raw.
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
                succounter = succounter + 1
        except Exception as e:
            print("could not load : " + img)
            print(e)

    print(succounter, "pictures successfully downloaded")
finally:
    # Always release the browser, even if scraping raised part-way through.
    browser.close()
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | Delta Analytics Code of Conduct
2 | -----
3 |
4 | [Delta Analytics](http://www.deltanalytics.org/) has two parallel goals. First, we bridge the skill gap faced by non-profits by providing free data consulting. Second, we build technical capacity in communities around the world to help democratize access to machine learning and data tools. We are a non-profit run entirely without any full-time staff; instead, our work is possible because of the volunteer efforts of a community of data professionals.
5 | We value the participation of each member of the Delta Analytics community and want all our members to have an enjoyable and fulfilling experience. Accordingly, all members are expected to show respect, dignity, and courtesy to others.
6 | To make clear what is expected, all people affiliated in any way with Delta Analytics as a teaching or data fellow are required to conform to the following Code of Conduct. Organizers will enforce this code of conduct at all times.
7 |
8 | Our Standards
9 | -----
10 |
11 | Delta Analytics is dedicated to providing a welcoming experience for everyone, regardless of age, gender identity and expression, sexual orientation, disability, physical appearance, body size, ethnicity, nationality, race, or religion (or lack thereof), or socio-economic status. We believe that all members of Delta Analytics are entitled to interact in an environment that is free of harassment, bullying, and discrimination.
12 | We encourage all of our members to contribute to creating a welcoming environment by:
13 |
14 | - Being kind to others
15 | - Behaving professionally
16 | - Using welcoming and inclusive language
17 | - Being respectful of differing viewpoints and experiences
18 | - Focusing on what is best for the community
19 | - Showing empathy towards other community members
20 | - Making sure that the contributions of others are recognized and there is fair and consistent attribution of credit.
21 |
22 | Examples of unacceptable behavior by participants include:
23 |
24 | - Divulging sensitive or confidential data and information
25 | - Harassment of anyone in any form, including:
26 | - Bullying or other actions that create an intimidating, humiliating or uncomfortable environment
27 | - Violent threats or language directed against another person
28 | - Unwelcome sexual attention or advances; sexual language and imagery; the display or circulation of offensive, derogatory or sexually explicit pictures or other materials; repeated, unwanted, and harassing attention or stalking
29 | - Unwelcome physical contact
30 | - Insults or put-downs
31 | - Sexist, racist, homophobic, transphobic, ableist, or exclusionary jokes
32 | - Sustained disruption of talks or other events
33 | - Other conduct that is inappropriate for a professional audience with diverse backgrounds
34 |
35 | Participants asked to stop any inappropriate behavior are expected to comply with the aforementioned guidelines immediately.
36 |
37 | Thank you for helping make this a welcoming, friendly community for all.
38 |
39 | Procedure For A Violating Incident
40 | ------
41 |
42 | If you believe that you have been treated contrary to this Code of Conduct you should do the following: If you feel safe, approach the person directly and request the person’s behavior to stop. If that does not resolve the incident to your satisfaction or you feel uncomfortable speaking with the person directly, please surface immediately to a member of the [leadership team](http://www.deltanalytics.org/leadership-team.html), in-person, or electronically. If possible, document the incident by including dates, times, places, names of individuals involved, and any witnesses.
43 |
44 | In response, members of Delta Analytics’s leadership team will promptly:
45 |
46 | - Meet with each person involved individually and privately.
47 | - The leadership team will exercise its best efforts to keep these conversations confidential and will not divulge sensitive information to other members of the Delta community unless it receives permission from all direct parties involved to do so.
48 | - Decide the appropriate course of action and communicate it transparently to the parties involved.
49 | - Maintain a private record of the incident, in order to identify and prevent undesirable repeated behavior.
50 |
51 | If a participant engages in behavior that violates this code of conduct, Delta Analytics may take any action they deem appropriate, including warning the offender, expelling the offender from an event, or permanently banning the offender from the group after the offender has had an opportunity to be heard in front of the leadership team.
52 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Attribution 4.0 International
2 |
3 | =======================================================================
4 |
5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
6 | does not provide legal services or legal advice. Distribution of
7 | Creative Commons public licenses does not create a lawyer-client or
8 | other relationship. Creative Commons makes its licenses and related
9 | information available on an "as-is" basis. Creative Commons gives no
10 | warranties regarding its licenses, any material licensed under their
11 | terms and conditions, or any related information. Creative Commons
12 | disclaims all liability for damages resulting from their use to the
13 | fullest extent possible.
14 |
15 | Using Creative Commons Public Licenses
16 |
17 | Creative Commons public licenses provide a standard set of terms and
18 | conditions that creators and other rights holders may use to share
19 | original works of authorship and other material subject to copyright
20 | and certain other rights specified in the public license below. The
21 | following considerations are for informational purposes only, are not
22 | exhaustive, and do not form part of our licenses.
23 |
24 | Considerations for licensors: Our public licenses are
25 | intended for use by those authorized to give the public
26 | permission to use material in ways otherwise restricted by
27 | copyright and certain other rights. Our licenses are
28 | irrevocable. Licensors should read and understand the terms
29 | and conditions of the license they choose before applying it.
30 | Licensors should also secure all rights necessary before
31 | applying our licenses so that the public can reuse the
32 | material as expected. Licensors should clearly mark any
33 | material not subject to the license. This includes other CC-
34 | licensed material, or material used under an exception or
35 | limitation to copyright. More considerations for licensors:
36 | wiki.creativecommons.org/Considerations_for_licensors
37 |
38 | Considerations for the public: By using one of our public
39 | licenses, a licensor grants the public permission to use the
40 | licensed material under specified terms and conditions. If
41 | the licensor's permission is not necessary for any reason--for
42 | example, because of any applicable exception or limitation to
43 | copyright--then that use is not regulated by the license. Our
44 | licenses grant only permissions under copyright and certain
45 | other rights that a licensor has authority to grant. Use of
46 | the licensed material may still be restricted for other
47 | reasons, including because others have copyright or other
48 | rights in the material. A licensor may make special requests,
49 | such as asking that all changes be marked or described.
50 | Although not required by our licenses, you are encouraged to
51 | respect those requests where reasonable. More_considerations
52 | for the public:
53 | wiki.creativecommons.org/Considerations_for_licensees
54 |
55 | =======================================================================
56 |
57 | Creative Commons Attribution 4.0 International Public License
58 |
59 | By exercising the Licensed Rights (defined below), You accept and agree
60 | to be bound by the terms and conditions of this Creative Commons
61 | Attribution 4.0 International Public License ("Public License"). To the
62 | extent this Public License may be interpreted as a contract, You are
63 | granted the Licensed Rights in consideration of Your acceptance of
64 | these terms and conditions, and the Licensor grants You such rights in
65 | consideration of benefits the Licensor receives from making the
66 | Licensed Material available under these terms and conditions.
67 |
68 |
69 | Section 1 -- Definitions.
70 |
71 | a. Adapted Material means material subject to Copyright and Similar
72 | Rights that is derived from or based upon the Licensed Material
73 | and in which the Licensed Material is translated, altered,
74 | arranged, transformed, or otherwise modified in a manner requiring
75 | permission under the Copyright and Similar Rights held by the
76 | Licensor. For purposes of this Public License, where the Licensed
77 | Material is a musical work, performance, or sound recording,
78 | Adapted Material is always produced where the Licensed Material is
79 | synched in timed relation with a moving image.
80 |
81 | b. Adapter's License means the license You apply to Your Copyright
82 | and Similar Rights in Your contributions to Adapted Material in
83 | accordance with the terms and conditions of this Public License.
84 |
85 | c. Copyright and Similar Rights means copyright and/or similar rights
86 | closely related to copyright including, without limitation,
87 | performance, broadcast, sound recording, and Sui Generis Database
88 | Rights, without regard to how the rights are labeled or
89 | categorized. For purposes of this Public License, the rights
90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar
91 | Rights.
92 |
93 | d. Effective Technological Measures means those measures that, in the
94 | absence of proper authority, may not be circumvented under laws
95 | fulfilling obligations under Article 11 of the WIPO Copyright
96 | Treaty adopted on December 20, 1996, and/or similar international
97 | agreements.
98 |
99 | e. Exceptions and Limitations means fair use, fair dealing, and/or
100 | any other exception or limitation to Copyright and Similar Rights
101 | that applies to Your use of the Licensed Material.
102 |
103 | f. Licensed Material means the artistic or literary work, database,
104 | or other material to which the Licensor applied this Public
105 | License.
106 |
107 | g. Licensed Rights means the rights granted to You subject to the
108 | terms and conditions of this Public License, which are limited to
109 | all Copyright and Similar Rights that apply to Your use of the
110 | Licensed Material and that the Licensor has authority to license.
111 |
112 | h. Licensor means the individual(s) or entity(ies) granting rights
113 | under this Public License.
114 |
115 | i. Share means to provide material to the public by any means or
116 | process that requires permission under the Licensed Rights, such
117 | as reproduction, public display, public performance, distribution,
118 | dissemination, communication, or importation, and to make material
119 | available to the public including in ways that members of the
120 | public may access the material from a place and at a time
121 | individually chosen by them.
122 |
123 | j. Sui Generis Database Rights means rights other than copyright
124 | resulting from Directive 96/9/EC of the European Parliament and of
125 | the Council of 11 March 1996 on the legal protection of databases,
126 | as amended and/or succeeded, as well as other essentially
127 | equivalent rights anywhere in the world.
128 |
129 | k. You means the individual or entity exercising the Licensed Rights
130 | under this Public License. Your has a corresponding meaning.
131 |
132 |
133 | Section 2 -- Scope.
134 |
135 | a. License grant.
136 |
137 | 1. Subject to the terms and conditions of this Public License,
138 | the Licensor hereby grants You a worldwide, royalty-free,
139 | non-sublicensable, non-exclusive, irrevocable license to
140 | exercise the Licensed Rights in the Licensed Material to:
141 |
142 | a. reproduce and Share the Licensed Material, in whole or
143 | in part; and
144 |
145 | b. produce, reproduce, and Share Adapted Material.
146 |
147 | 2. Exceptions and Limitations. For the avoidance of doubt, where
148 | Exceptions and Limitations apply to Your use, this Public
149 | License does not apply, and You do not need to comply with
150 | its terms and conditions.
151 |
152 | 3. Term. The term of this Public License is specified in Section
153 | 6(a).
154 |
155 | 4. Media and formats; technical modifications allowed. The
156 | Licensor authorizes You to exercise the Licensed Rights in
157 | all media and formats whether now known or hereafter created,
158 | and to make technical modifications necessary to do so. The
159 | Licensor waives and/or agrees not to assert any right or
160 | authority to forbid You from making technical modifications
161 | necessary to exercise the Licensed Rights, including
162 | technical modifications necessary to circumvent Effective
163 | Technological Measures. For purposes of this Public License,
164 | simply making modifications authorized by this Section 2(a)
165 | (4) never produces Adapted Material.
166 |
167 | 5. Downstream recipients.
168 |
169 | a. Offer from the Licensor -- Licensed Material. Every
170 | recipient of the Licensed Material automatically
171 | receives an offer from the Licensor to exercise the
172 | Licensed Rights under the terms and conditions of this
173 | Public License.
174 |
175 | b. No downstream restrictions. You may not offer or impose
176 | any additional or different terms or conditions on, or
177 | apply any Effective Technological Measures to, the
178 | Licensed Material if doing so restricts exercise of the
179 | Licensed Rights by any recipient of the Licensed
180 | Material.
181 |
182 | 6. No endorsement. Nothing in this Public License constitutes or
183 | may be construed as permission to assert or imply that You
184 | are, or that Your use of the Licensed Material is, connected
185 | with, or sponsored, endorsed, or granted official status by,
186 | the Licensor or others designated to receive attribution as
187 | provided in Section 3(a)(1)(A)(i).
188 |
189 | b. Other rights.
190 |
191 | 1. Moral rights, such as the right of integrity, are not
192 | licensed under this Public License, nor are publicity,
193 | privacy, and/or other similar personality rights; however, to
194 | the extent possible, the Licensor waives and/or agrees not to
195 | assert any such rights held by the Licensor to the limited
196 | extent necessary to allow You to exercise the Licensed
197 | Rights, but not otherwise.
198 |
199 | 2. Patent and trademark rights are not licensed under this
200 | Public License.
201 |
202 | 3. To the extent possible, the Licensor waives any right to
203 | collect royalties from You for the exercise of the Licensed
204 | Rights, whether directly or through a collecting society
205 | under any voluntary or waivable statutory or compulsory
206 | licensing scheme. In all other cases the Licensor expressly
207 | reserves any right to collect such royalties.
208 |
209 |
210 | Section 3 -- License Conditions.
211 |
212 | Your exercise of the Licensed Rights is expressly made subject to the
213 | following conditions.
214 |
215 | a. Attribution.
216 |
217 | 1. If You Share the Licensed Material (including in modified
218 | form), You must:
219 |
220 | a. retain the following if it is supplied by the Licensor
221 | with the Licensed Material:
222 |
223 | i. identification of the creator(s) of the Licensed
224 | Material and any others designated to receive
225 | attribution, in any reasonable manner requested by
226 | the Licensor (including by pseudonym if
227 | designated);
228 |
229 | ii. a copyright notice;
230 |
231 | iii. a notice that refers to this Public License;
232 |
233 | iv. a notice that refers to the disclaimer of
234 | warranties;
235 |
236 | v. a URI or hyperlink to the Licensed Material to the
237 | extent reasonably practicable;
238 |
239 | b. indicate if You modified the Licensed Material and
240 | retain an indication of any previous modifications; and
241 |
242 | c. indicate the Licensed Material is licensed under this
243 | Public License, and include the text of, or the URI or
244 | hyperlink to, this Public License.
245 |
246 | 2. You may satisfy the conditions in Section 3(a)(1) in any
247 | reasonable manner based on the medium, means, and context in
248 | which You Share the Licensed Material. For example, it may be
249 | reasonable to satisfy the conditions by providing a URI or
250 | hyperlink to a resource that includes the required
251 | information.
252 |
253 | 3. If requested by the Licensor, You must remove any of the
254 | information required by Section 3(a)(1)(A) to the extent
255 | reasonably practicable.
256 |
257 | 4. If You Share Adapted Material You produce, the Adapter's
258 | License You apply must not prevent recipients of the Adapted
259 | Material from complying with this Public License.
260 |
261 |
262 | Section 4 -- Sui Generis Database Rights.
263 |
264 | Where the Licensed Rights include Sui Generis Database Rights that
265 | apply to Your use of the Licensed Material:
266 |
267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right
268 | to extract, reuse, reproduce, and Share all or a substantial
269 | portion of the contents of the database;
270 |
271 | b. if You include all or a substantial portion of the database
272 | contents in a database in which You have Sui Generis Database
273 | Rights, then the database in which You have Sui Generis Database
274 | Rights (but not its individual contents) is Adapted Material; and
275 |
276 | c. You must comply with the conditions in Section 3(a) if You Share
277 | all or a substantial portion of the contents of the database.
278 |
279 | For the avoidance of doubt, this Section 4 supplements and does not
280 | replace Your obligations under this Public License where the Licensed
281 | Rights include other Copyright and Similar Rights.
282 |
283 |
284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
285 |
286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
296 |
297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
306 |
307 | c. The disclaimer of warranties and limitation of liability provided
308 | above shall be interpreted in a manner that, to the extent
309 | possible, most closely approximates an absolute disclaimer and
310 | waiver of all liability.
311 |
312 |
313 | Section 6 -- Term and Termination.
314 |
315 | a. This Public License applies for the term of the Copyright and
316 | Similar Rights licensed here. However, if You fail to comply with
317 | this Public License, then Your rights under this Public License
318 | terminate automatically.
319 |
320 | b. Where Your right to use the Licensed Material has terminated under
321 | Section 6(a), it reinstates:
322 |
323 | 1. automatically as of the date the violation is cured, provided
324 | it is cured within 30 days of Your discovery of the
325 | violation; or
326 |
327 | 2. upon express reinstatement by the Licensor.
328 |
329 | For the avoidance of doubt, this Section 6(b) does not affect any
330 | right the Licensor may have to seek remedies for Your violations
331 | of this Public License.
332 |
333 | c. For the avoidance of doubt, the Licensor may also offer the
334 | Licensed Material under separate terms or conditions or stop
335 | distributing the Licensed Material at any time; however, doing so
336 | will not terminate this Public License.
337 |
338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
339 | License.
340 |
341 |
342 | Section 7 -- Other Terms and Conditions.
343 |
344 | a. The Licensor shall not be bound by any additional or different
345 | terms or conditions communicated by You unless expressly agreed.
346 |
347 | b. Any arrangements, understandings, or agreements regarding the
348 | Licensed Material not stated herein are separate from and
349 | independent of the terms and conditions of this Public License.
350 |
351 |
352 | Section 8 -- Interpretation.
353 |
354 | a. For the avoidance of doubt, this Public License does not, and
355 | shall not be interpreted to, reduce, limit, restrict, or impose
356 | conditions on any use of the Licensed Material that could lawfully
357 | be made without permission under this Public License.
358 |
359 | b. To the extent possible, if any provision of this Public License is
360 | deemed unenforceable, it shall be automatically reformed to the
361 | minimum extent necessary to make it enforceable. If the provision
362 | cannot be reformed, it shall be severed from this Public License
363 | without affecting the enforceability of the remaining terms and
364 | conditions.
365 |
366 | c. No term or condition of this Public License will be waived and no
367 | failure to comply consented to unless expressly agreed to by the
368 | Licensor.
369 |
370 | d. Nothing in this Public License constitutes or may be interpreted
371 | as a limitation upon, or waiver of, any privileges and immunities
372 | that apply to the Licensor or You, including from the legal
373 | processes of any jurisdiction or authority.
374 |
375 |
376 | =======================================================================
377 |
378 | Creative Commons is not a party to its public
379 | licenses. Notwithstanding, Creative Commons may elect to apply one of
380 | its public licenses to material it publishes and in those instances
381 | will be considered the “Licensor.” The text of the Creative Commons
382 | public licenses is dedicated to the public domain under the CC0 Public
383 | Domain Dedication. Except for the limited purpose of indicating that
384 | material is shared under a Creative Commons public license or as
385 | otherwise permitted by the Creative Commons policies published at
386 | creativecommons.org/policies, Creative Commons does not authorize the
387 | use of the trademark "Creative Commons" or any other trademark or logo
388 | of Creative Commons without its prior written consent including,
389 | without limitation, in connection with any unauthorized modifications
390 | to any of its public licenses or any other arrangements,
391 | understandings, or agreements concerning use of licensed material. For
392 | the avoidance of doubt, this paragraph does not form part of the
393 | public licenses.
394 |
395 | Creative Commons may be contacted at creativecommons.org.
396 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Introduction to Machine Learning for Good
2 | ====
3 |
4 | [](https://mybinder.org/v2/gh/DeltaAnalytics/machine_learning_for_good/master)
5 | [](https://colab.research.google.com/github/DeltaAnalytics/machine_learning_for_good)
6 |
7 |
8 |
9 | How can we use data for social impact?
10 | ------
11 |
12 | Data is powerful. We believe that anyone can harness that power for change.
13 |
14 | In this introductory course, students will learn the foundational theory and the necessary coding skills to translate data into actionable insights. Students will learn the latest machine learning tools and algorithms.
15 |
16 | Data science is a highly interdisciplinary practice: demanding critical thinking, understanding of statistics, and technical coding ability. Irresponsible application of powerful algorithms or an inadequate exploration of underlying assumptions can lead to spurious results. In this course, we emphasize the fundamentals of proper data science and expose students to what is possible using sophisticated machine learning methods.
17 |
18 | Each of the modules is hands-on, project-based, using real world data from [KIVA](https://www.kiva.org/), a non-profit that connects people through lending to alleviate poverty.
19 |
20 |
21 | Who We Are
22 | ------
23 |
24 | [Delta Analytics](http://www.deltanalytics.org/) is a 501(c)3 San Francisco Bay Area non-profit dedicated to bringing rigorous data science to problem-solving, effecting change in nonprofits and the public sector, and making data science an accessible and democratic resource for anyone with the same mission.
25 |
26 |
27 | Overview
28 | ----
29 |
30 | Topics covered in this course include: data cleaning, supervised machine learning, and unsupervised machine learning.
31 |
32 | The slides that cover the theory of the topic are available [here](http://www.deltanalytics.org/curriculum.html). We present theory alongside real-life data science examples that will open doors for novices and professionals alike to harness the power of data for good.
33 |
34 | Weebly (our website host) has blocked traffic to certain countries. We have submitted numerous complaints, and apologize to students for the inconvenience caused. Until this is resolved, you can access PDFs of all the course slides below.
35 |
36 | [Module 1 - Introduction to Machine Learning](https://drive.google.com/file/d/1r4SBY6Dm6xjFqLH12tFb-Bf7wbvoIN_C/view?usp=sharing)
37 |
38 | [Module 2 - Machine learning deep dive](https://drive.google.com/file/d/1EZ_xqMaYj77vErVnrQmnFOj-VBEoO5uW/view?usp=sharing)
39 |
40 | [Module 3 - Linear Regression](https://drive.google.com/file/d/1kXbB7fps78xyFYUtmgNlQJJ3LdO0K3TB/view?usp=sharing)
41 |
42 | [Module 4 - Model Selection and Evaluation](https://drive.google.com/file/d/1ESR4U566uPioFCpFOITpuSBaO45MdJ4O/view?usp=sharing)
43 |
44 | [Module 5 - Decision Trees](https://drive.google.com/file/d/1Sd_LN-WE_W3Zo-YZrMBe90H2i4_ieFRs/view?usp=sharing)
45 |
46 | [Module 6 - Ensemble Algorithms](https://drive.google.com/file/d/1g2AT3S5cgu5HjMYt4X-WiVs0RUvI6Z3s/view?usp=sharing)
47 |
48 | [Module 7 - Unsupervised Algorithms](https://drive.google.com/file/d/1YdA-HHYP1V05QgvwLCvfnuuau67Zl38n/view?usp=sharing)
49 |
50 | [Module 8 - Natural Language Processing Pt. 1](https://drive.google.com/file/d/1Y7gOfnPfyCSu1chWEoHMqhgXVI5KZpRx/view?usp=sharing)
51 |
52 | [Module 9 - Natural Language Processing Pt. 2](https://drive.google.com/file/d/1BUJ0FyMzSxCfHNA0AHwBOxjHt7V2FJj8/view?usp=sharing)
53 |
54 | Course outcomes
55 | ----
56 |
57 | By the end of the course, students will be able to:
58 |
59 | 1. Explain the fundamental statistical and machine learning algorithms that underlie common data science methods.
60 | 1. Write code to clean, process, analyze, and visualize real-world data.
61 | 1. Be able to communicate with other data scientists using technical terms.
62 |
63 | Our students
64 | ----
65 |
66 | The course is intended for any and all individuals interested in harnessing data towards solving problems in their communities. Minimal prior coding or mathematical/statistical experience is expected. Computer proficiency is necessary.
67 |
68 | Our teachers
69 | -----
70 |
71 | [Delta Teaching Fellows](http://www.deltanalytics.org/teaching-fellows.html) are all data professionals working in the San Francisco Bay Area. All of our time is donated for free to build out a curriculum that makes machine learning tools and knowledge more accessible to communities around the world. You can learn more about our team [here](http://www.deltanalytics.org/teaching-fellows.html).
72 |
73 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | # Create environment:
2 | # $ conda env create --name good --force
3 | name: good
4 | channels:
5 | - conda-forge
6 | dependencies:
7 | - python=3.6
8 | - jupyter
9 | - numpy
10 | - pandas
11 | - requests
12 | - scikit-learn
13 | - scipy
14 | - seaborn
15 | - statsmodels
16 | - pip:
17 | - graphviz
18 | - git+https://github.com/stroxler/batch_nbconvert
19 | - watermark
20 | - wordcloud
--------------------------------------------------------------------------------
/images/decision_trees.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/decision_trees.png
--------------------------------------------------------------------------------
/images/delta_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/delta_logo.jpg
--------------------------------------------------------------------------------
/images/delta_octocat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/delta_octocat.png
--------------------------------------------------------------------------------
/images/ensemble_algorithms.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/ensemble_algorithms.png
--------------------------------------------------------------------------------
/images/introduction_to_machine_learning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/introduction_to_machine_learning.png
--------------------------------------------------------------------------------
/images/linear_regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/linear_regression.png
--------------------------------------------------------------------------------
/images/machine_learning_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/machine_learning_.png
--------------------------------------------------------------------------------
/images/model_selection_evaluation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/model_selection_evaluation.png
--------------------------------------------------------------------------------
/images/nlp_pt_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/nlp_pt_1.png
--------------------------------------------------------------------------------
/images/nlp_pt_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/images/nlp_pt_2.png
--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Bootstrap the course environment: create the conda env, fetch the course
# data, install Jupyter notebook extensions, and (optionally) start Jupyter.
#
# Usage: ./setup.sh [--no-start]
#   --no-start   do everything except launching Jupyter Notebook.
set -e  # abort on the first failing command
set -x  # echo each command for easier debugging

# Create environment (reads ./environment.yml by default):
conda update -n base conda -y
conda env create --name good --force -q

# Start environment:
source activate good

# Update environment (might break stuff. move fast!?)
conda update --all --yes

# Get local copy of data
git clone --single-branch --depth=1 https://github.com/DeltaAnalytics/machine_learning_for_good_data data

# Setup spell checking and other notebook enhancements
git clone https://github.com/Calysto/notebook-extensions.git
cd notebook-extensions
jupyter nbextension install calysto --user
jupyter nbextension enable calysto/spell-check/main
jupyter nbextension enable calysto/cell-tools/main
jupyter nbextension enable calysto/annotate/main
# BUG FIX: return to the repo root before deleting the clone. The original
# script stayed inside notebook-extensions, so `rm -r -f notebook-extensions`
# targeted a nonexistent nested path (rm -f suppressed the error and set -e
# never fired), leaving the clone behind and starting Jupyter in the wrong
# working directory.
cd ..
rm -rf notebook-extensions

# Start Jupyter Notebook unless the caller passed --no-start
if [[ "$1" != "--no-start" ]]; then
    jupyter notebook --browser=Chrome
fi
--------------------------------------------------------------------------------
/tests_for_students/MPI_data_poverty.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeltaAnalytics/machine_learning_for_good/334836aafc39c2e037f827d93988dc19869c6184/tests_for_students/MPI_data_poverty.csv
--------------------------------------------------------------------------------
/tests_for_students/country_mapper.csv:
--------------------------------------------------------------------------------
1 | location_country_code,ISO country code
2 | AF,AFG
3 | AX,ALA
4 | AL,ALB
5 | DZ,DZA
6 | AS,ASM
7 | AD,AND
8 | AO,AGO
9 | AI,AIA
10 | AQ,ATA
11 | AG,ATG
12 | AR,ARG
13 | AM,ARM
14 | AW,ABW
15 | AU,AUS
16 | AT,AUT
17 | AZ,AZE
18 | BS,BHS
19 | BH,BHR
20 | BD,BGD
21 | BB,BRB
22 | BY,BLR
23 | BE,BEL
24 | BZ,BLZ
25 | BJ,BEN
26 | BM,BMU
27 | BT,BTN
28 | BO,BOL
29 | BA,BIH
30 | BW,BWA
31 | BV,BVT
32 | BR,BRA
33 | VG,VGB
34 | IO,IOT
35 | BN,BRN
36 | BG,BGR
37 | BF,BFA
38 | BI,BDI
39 | KH,KHM
40 | CM,CMR
41 | CA,CAN
42 | CV,CPV
43 | KY,CYM
44 | CF,CAF
45 | TD,TCD
46 | CL,CHL
47 | CN,CHN
48 | HK,HKG
49 | MO,MAC
50 | CX,CXR
51 | CC,CCK
52 | CO,COL
53 | KM,COM
54 | CG,COG
55 | CD,COD
56 | CK,COK
57 | CR,CRI
58 | CI,CIV
59 | HR,HRV
60 | CU,CUB
61 | CY,CYP
62 | CZ,CZE
63 | DK,DNK
64 | DJ,DJI
65 | DM,DMA
66 | DO,DOM
67 | EC,ECU
68 | EG,EGY
69 | SV,SLV
70 | GQ,GNQ
71 | ER,ERI
72 | EE,EST
73 | ET,ETH
74 | FK,FLK
75 | FO,FRO
76 | FJ,FJI
77 | FI,FIN
78 | FR,FRA
79 | GF,GUF
80 | PF,PYF
81 | TF,ATF
82 | GA,GAB
83 | GM,GMB
84 | GE,GEO
85 | DE,DEU
86 | GH,GHA
87 | GI,GIB
88 | GR,GRC
89 | GL,GRL
90 | GD,GRD
91 | GP,GLP
92 | GU,GUM
93 | GT,GTM
94 | GG,GGY
95 | GN,GIN
96 | GW,GNB
97 | GY,GUY
98 | HT,HTI
99 | HM,HMD
100 | VA,VAT
101 | HN,HND
102 | HU,HUN
103 | IS,ISL
104 | IN,IND
105 | ID,IDN
106 | IR,IRN
107 | IQ,IRQ
108 | IE,IRL
109 | IM,IMN
110 | IL,ISR
111 | IT,ITA
112 | JM,JAM
113 | JP,JPN
114 | JE,JEY
115 | JO,JOR
116 | KZ,KAZ
117 | KE,KEN
118 | KI,KIR
119 | KP,PRK
120 | KR,KOR
121 | KW,KWT
122 | KG,KGZ
123 | LA,LAO
124 | LV,LVA
125 | LB,LBN
126 | LS,LSO
127 | LR,LBR
128 | LY,LBY
129 | LI,LIE
130 | LT,LTU
131 | LU,LUX
132 | MK,MKD
133 | MG,MDG
134 | MW,MWI
135 | MY,MYS
136 | MV,MDV
137 | ML,MLI
138 | MT,MLT
139 | MH,MHL
140 | MQ,MTQ
141 | MR,MRT
142 | MU,MUS
143 | YT,MYT
144 | MX,MEX
145 | FM,FSM
146 | MD,MDA
147 | MC,MCO
148 | MN,MNG
149 | ME,MNE
150 | MS,MSR
151 | MA,MAR
152 | MZ,MOZ
153 | MM,MMR
154 | NA,NAM
155 | NR,NRU
156 | NP,NPL
157 | NL,NLD
158 | AN,ANT
159 | NC,NCL
160 | NZ,NZL
161 | NI,NIC
162 | NE,NER
163 | NG,NGA
164 | NU,NIU
165 | NF,NFK
166 | MP,MNP
167 | NO,NOR
168 | OM,OMN
169 | PK,PAK
170 | PW,PLW
171 | PS,PSE
172 | PA,PAN
173 | PG,PNG
174 | PY,PRY
175 | PE,PER
176 | PH,PHL
177 | PN,PCN
178 | PL,POL
179 | PT,PRT
180 | PR,PRI
181 | QA,QAT
182 | RE,REU
183 | RO,ROU
184 | RU,RUS
185 | RW,RWA
186 | BL,BLM
187 | SH,SHN
188 | KN,KNA
189 | LC,LCA
190 | MF,MAF
191 | PM,SPM
192 | VC,VCT
193 | WS,WSM
194 | SM,SMR
195 | ST,STP
196 | SA,SAU
197 | SN,SEN
198 | RS,SRB
199 | SC,SYC
200 | SL,SLE
201 | SG,SGP
202 | SK,SVK
203 | SI,SVN
204 | SB,SLB
205 | SO,SOM
206 | ZA,ZAF
207 | GS,SGS
208 | SS,SSD
209 | ES,ESP
210 | LK,LKA
211 | SD,SDN
212 | SR,SUR
213 | SJ,SJM
214 | SZ,SWZ
215 | SE,SWE
216 | CH,CHE
217 | SY,SYR
218 | TW,TWN
219 | TJ,TJK
220 | TZ,TZA
221 | TH,THA
222 | TL,TLS
223 | TG,TGO
224 | TK,TKL
225 | TO,TON
226 | TT,TTO
227 | TN,TUN
228 | TR,TUR
229 | TM,TKM
230 | TC,TCA
231 | TV,TUV
232 | UG,UGA
233 | UA,UKR
234 | AE,ARE
235 | GB,GBR
236 | US,USA
237 | UM,UMI
238 | UY,URY
239 | UZ,UZB
240 | VU,VUT
241 | VE,VEN
242 | VN,VNM
243 | VI,VIR
244 | WF,WLF
245 | EH,ESH
246 | YE,YEM
247 | ZM,ZMB
248 | ZW,ZWE
249 |
--------------------------------------------------------------------------------