├── README.md
├── .gitignore
├── 0x_svm
└── two_classes.png
├── 05_neighbors
├── seismic.png
└── 05_neighbors.ipynb
├── universal_images
├── mac_icon.png
├── linux_icon.jpg
├── so_confused.jpg
├── windows_icon.jpg
├── changing_stuff.jpg
├── logos.3.600.wide.png
├── red_sticky.600px.png
├── green_sticky.300px.png
└── dark_art_logo.600px.png
├── 07_conclusion
├── task_breakdown.jpg
└── 07_conclusion.ipynb
├── 04_naive_bayes
├── naive_bayes_ftw.png
└── 04_naive_bayes.ipynb
├── 03_linear_reg
├── 500px-Linear_least_squares_example2.svg.png
└── 03_linear_reg.ipynb
├── universal_datasets
├── coffee.csv
├── svm_train.csv
├── linreg_train.csv
├── skincancer.txt
├── nbayes_train.csv
├── bananas.csv
├── seeds_dataset.txt
└── svm_test.csv
├── behind_the_scenes
├── scikit_learn_outline.txt
└── lesson_template.ipynb
├── 00_basics
├── 00_intro.ipynb
└── 01_install.ipynb
├── 01_intro_to_sklearn
└── 01_intro_to_sklearn.ipynb
└── 06_special_topics
└── 06_special_topics.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | Introduction to Machine Learning
2 |
3 | A class on machine learning fundamentals.
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .ipynb_checkpoints
3 | Icon
4 | toc_generator.py
5 | toc_content.txt
6 | tmp/
7 |
--------------------------------------------------------------------------------
/0x_svm/two_classes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/0x_svm/two_classes.png
--------------------------------------------------------------------------------
/05_neighbors/seismic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/05_neighbors/seismic.png
--------------------------------------------------------------------------------
/universal_images/mac_icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/mac_icon.png
--------------------------------------------------------------------------------
/07_conclusion/task_breakdown.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/07_conclusion/task_breakdown.jpg
--------------------------------------------------------------------------------
/universal_images/linux_icon.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/linux_icon.jpg
--------------------------------------------------------------------------------
/universal_images/so_confused.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/so_confused.jpg
--------------------------------------------------------------------------------
/04_naive_bayes/naive_bayes_ftw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/04_naive_bayes/naive_bayes_ftw.png
--------------------------------------------------------------------------------
/universal_images/windows_icon.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/windows_icon.jpg
--------------------------------------------------------------------------------
/universal_images/changing_stuff.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/changing_stuff.jpg
--------------------------------------------------------------------------------
/universal_images/logos.3.600.wide.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/logos.3.600.wide.png
--------------------------------------------------------------------------------
/universal_images/red_sticky.600px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/red_sticky.600px.png
--------------------------------------------------------------------------------
/universal_images/green_sticky.300px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/green_sticky.300px.png
--------------------------------------------------------------------------------
/universal_images/dark_art_logo.600px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/dark_art_logo.600px.png
--------------------------------------------------------------------------------
/03_linear_reg/500px-Linear_least_squares_example2.svg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/03_linear_reg/500px-Linear_least_squares_example2.svg.png
--------------------------------------------------------------------------------
/universal_datasets/coffee.csv:
--------------------------------------------------------------------------------
1 | size,price
2 | 12,2.95
3 | 16,3.65
4 | 20,4.15
5 | 14,3.25
6 | 18,4.20
7 | 12,3.00
8 | 16,3.70
9 | 20,4.25
10 | 14,3.10
11 | 18,4.20
12 | 12,2.90
13 | 16,3.60
14 | 20,4.05
15 | 14,3.15
16 | 18,4.35
--------------------------------------------------------------------------------
/universal_datasets/svm_train.csv:
--------------------------------------------------------------------------------
1 | -2.17754709631256,-9.470645062274183,0
2 | 0.5866631983391473,-1.5769729846575844,1
3 | -0.7122111151907693,-1.4777463824468018,1
4 | 0.22806354259302142,-2.7186868912968043,1
5 | 0.6703812053149667,1.3840879990294623,1
6 | 1.5954782057117587,-2.306608592230297,1
7 | 0.9932612713400982,-0.642729061055628,1
8 | -3.4322246593937606,-10.49157220222962,0
9 | -1.2302546855618406,-10.822985897578395,0
10 | -0.6766442565702735,-10.975821067370852,0
11 | 0.6170397211820922,-0.36833806195116914,1
12 | -1.248459227902333,0.7838694515420985,1
13 | -0.6183571037042156,-6.730825748064225,0
14 | -1.92847153425107,-1.1582803418544843,1
15 | 1.437783001570064,0.33800848757265234,1
16 | -1.3030585529853065,-8.071473900041825,0
17 | -0.6332319724629782,-10.196867003209935,0
18 | -0.97221741475837,-10.66801022196572,0
19 | -2.549644619794732,-10.572284501354392,0
20 | -2.333831429048335,-9.669196367903723,0
21 |
--------------------------------------------------------------------------------
/universal_datasets/linreg_train.csv:
--------------------------------------------------------------------------------
1 | 22.809099198935886,11.567585959270929
2 | 11.6002266930792,5.214937584688396
3 | 9.619660981755743,4.057581988322643
4 | 6.852974097597672,3.9478072508732525
5 | 14.336799457163249,5.05697380655601
6 | 27.618843876985558,9.992782461863234
7 | 10.29024909134175,5.142966413981185
8 | 13.186541309609833,6.5939678453617665
9 | 1.786110876458814,1.0347533985797999
10 | 13.667809317547082,4.525240768384208
11 | 11.380781174154695,6.406030700340814
12 | 13.41295660992788,7.363962149193506
13 | 2.5807461535222007,0.94853271931713
14 | 7.460994666297125,3.9897745985035664
15 | 28.85091126934303,10.051321497518035
16 | 3.5964057742406252,3.2798432383591973
17 | 19.52171538417383,8.57219785187176
18 | 19.941153477382414,7.02134024799522
19 | 14.058323902941659,7.1724067091601205
20 | 24.45510798931772,12.249732225364653
21 | 24.648900143933385,8.694067487593259
22 | 26.838259421867857,11.824256053953313
23 | 6.3423082978744905,4.3025399313148665
24 | 28.83670420598234,10.755643054594055
25 | 4.770686384401824,5.635315577273484
26 |
--------------------------------------------------------------------------------
/universal_datasets/skincancer.txt:
--------------------------------------------------------------------------------
1 | State Lat Mort Ocean Long
2 | Alabama 33.0 219 1 87.0
3 | Arizona 34.5 160 0 112.0
4 | Arkansas 35.0 170 0 92.5
5 | California 37.5 182 1 119.5
6 | Colorado 39.0 149 0 105.5
7 | Connecticut 41.8 159 1 72.8
8 | Delaware 39.0 200 1 75.5
9 | "Wash,D.C." 39.0 177 0 77.0
10 | Florida 28.0 197 1 82.0
11 | Georgia 33.0 214 1 83.5
12 | Idaho 44.5 116 0 114.0
13 | Illinois 40.0 124 0 89.5
14 | Indiana 40.2 128 0 86.2
15 | Iowa 42.2 128 0 93.8
16 | Kansas 38.5 166 0 98.5
17 | Kentucky 37.8 147 0 85.0
18 | Louisiana 31.2 190 1 91.8
19 | Maine 45.2 117 1 69.0
20 | Maryland 39.0 162 1 76.5
21 | Massachusetts 42.2 143 1 71.8
22 | Michigan 43.5 117 0 84.5
23 | Minnesota 46.0 116 0 94.5
24 | Mississippi 32.8 207 1 90.0
25 | Missouri 38.5 131 0 92.0
26 | Montana 47.0 109 0 110.5
27 | Nebraska 41.5 122 0 99.5
28 | Nevada 39.0 191 0 117.0
29 | NewHampshire 43.8 129 1 71.5
30 | NewJersey 40.2 159 1 74.5
31 | NewMexico 35.0 141 0 106.0
32 | NewYork 43.0 152 1 75.5
33 | NorthCarolina 35.5 199 1 79.5
34 | NorthDakota 47.5 115 0 100.5
35 | Ohio 40.2 131 0 82.8
36 | Oklahoma 35.5 182 0 97.2
37 | Oregon 44.0 136 1 120.5
38 | Pennsylvania 40.8 132 0 77.8
39 | RhodeIsland 41.8 137 1 71.5
40 | SouthCarolina 33.8 178 1 81.0
41 | SouthDakota 44.8 86 0 100.0
42 | Tennessee 36.0 186 0 86.2
43 | Texas 31.5 229 1 98.0
44 | Utah 39.5 142 0 111.5
45 | Vermont 44.0 153 1 72.5
46 | Virginia 37.5 166 1 78.5
47 | Washington 47.5 117 1 121.0
48 | WestVirginia 38.8 136 0 80.8
49 | Wisconsin 44.5 110 0 90.2
50 | Wyoming 43.0 134 0 107.5
51 |
--------------------------------------------------------------------------------
/universal_datasets/nbayes_train.csv:
--------------------------------------------------------------------------------
1 | -5.382404587194567,3.099757608778995,0
2 | -4.102454769426915,-0.22318587720290328,1
3 | -0.4753390073762995,-0.20172584185812448,1
4 | -6.9095652430213725,4.632675192089428,0
5 | -6.846011195271036,4.600321549939559,0
6 | -7.656509201546659,4.746101085547851,0
7 | -7.19580142739034,5.161875549959177,0
8 | -4.711391336587894,0.5627270202928222,1
9 | -5.802642378452722,1.023672360976985,1
10 | -5.037444232393378,4.629984227572444,0
11 | -4.293389972832816,-2.808418382062139,1
12 | -11.323126282568126,5.550184180900258,0
13 | -4.8570155627180185,7.092042000728633,0
14 | -7.717286574801877,3.9563751696693097,0
15 | -7.0776310695122095,7.939538842461423,0
16 | -7.562696063113033,4.519086752088996,0
17 | -5.698129865720822,1.7984704979523385,1
18 | -2.60172122921113,2.7214232713504187,1
19 | -2.625438418290591,-0.14748265422911344,1
20 | -5.411343701460463,-1.4435342962465745,1
21 | -7.033572036360779,5.749922474394444,0
22 | -6.8671602678098775,3.851870419136311,0
23 | -2.1733506951924717,-0.7266142954139785,1
24 | -7.064118336583272,5.5178249516953,0
25 | -7.308452726945211,0.7640780874799677,1
26 | -3.703422741243391,1.2828341163468846,1
27 | -4.35235223938933,-0.6820592305071826,1
28 | -5.499221484638512,3.5215771249198795,0
29 | -8.3128285248011,3.801595127658697,0
30 | -5.515423287441246,4.729012936907532,0
31 | -3.5129443255240678,-1.3501153523090208,1
32 | -6.662334142960418,2.026832754701778,1
33 | -2.0271979061887766,-0.8121447283980057,1
34 | -5.862343184877673,2.6523840540168337,0
35 | -8.35808544273085,6.494708959792456,0
36 | -5.663716299462879,1.166657620704456,1
37 | -6.5912565372692855,6.015650661543293,0
38 | -10.022692075450513,3.894590838076258,0
39 | -5.046579278266077,5.247125333530684,0
40 | -5.595737605677421,5.301695845965116,0
41 | -6.6157768246245015,4.2628916067331915,0
42 | -9.057322823688411,6.241889576076393,0
43 | -6.138258628898663,5.112801175501586,0
44 | -7.717376638627667,4.652786688842826,0
45 | -5.733255032723233,-0.11185066077215056,1
46 | -4.484849884991392,0.7195962963673159,1
47 | -8.524240952351331,7.761075868345809,0
48 | -6.214121927115906,4.995378578326823,0
49 | -7.662067394981727,0.8799714558836418,1
50 | -7.753695239575434,5.462433721258156,0
51 | -3.7991982154292914,0.7397515581252385,1
52 | -7.389376502476604,3.981618341097725,0
53 | -9.271202266959733,2.297661978821074,0
54 | -8.400990347812272,1.7094527468795904,1
55 | -8.791888061942707,2.4354798418127253,0
56 | -6.693040375103849,3.7982249094531513,0
57 | -5.084823494715631,-0.11641218397809415,1
58 | -8.532847481709918,1.7023270591525432,0
59 | -5.759380434769507,1.8546258180107675,1
60 | -8.046517782408685,8.82154108329258,0
61 | -5.484773465774849,0.9518765904458804,1
62 | -7.272208861735192,4.903546834753613,0
63 | -5.386591238920379,1.1735000778359066,1
64 | -6.301757242672959,1.9195203217829664,1
65 | -6.531304540024218,-1.1096130729876057,1
66 | -4.945924283197794,2.4382190668422017,0
67 | -6.64559543579283,-0.9974342683680084,1
68 | -4.680981347817782,0.33011178853495704,1
69 | -3.899427641599536,-2.175418983200336,1
70 | -8.549505415642951,3.38760761707493,0
71 | -6.019677593728213,5.539259661463021,0
72 | -6.319156087139491,1.058795271813489,1
73 | -3.801191061195107,2.6459157448893134,1
74 | -3.2262637138239785,1.315554620842794,1
75 | -5.290731322602657,-1.0881250542469745,1
76 | -4.6620908354242125,-0.6355769376118812,1
77 | -4.5618334160382,3.0397967988983368,1
78 | -6.959431746877026,-0.6965484236578581,1
79 | -2.821505616627221,2.399650690978495,1
80 | -6.277440411755358,-0.42040754951710796,1
81 |
--------------------------------------------------------------------------------
/universal_datasets/bananas.csv:
--------------------------------------------------------------------------------
1 | length,width,category
2 | 192.31775277264344,42.11007560961724,0
3 | 237.74108507802904,36.20243450997909,1
4 | 191.60045941916462,43.46940778882997,0
5 | 234.8701831630404,40.06614294198304,1
6 | 228.32958068374643,37.36357939528025,1
7 | 201.5640210495342,50.010346848962094,0
8 | 228.8711201957312,35.16376499623423,1
9 | 208.77022366137354,31.686755406013408,1
10 | 193.2754797044879,36.25323574038754,0
11 | 188.3572766334926,41.54378919667501,0
12 | 198.3314796405088,36.43091673251474,0
13 | 212.2210635834421,25.320415628182857,1
14 | 192.42445114365717,45.44774611909541,0
15 | 182.9786228510589,43.86329325039323,0
16 | 218.3472494657376,32.43700455014038,1
17 | 217.21173518740238,29.8963931555431,1
18 | 226.4900537124523,42.662368120376684,1
19 | 214.04770392442876,33.58384034135039,1
20 | 226.467817306007,26.13389457309542,1
21 | 198.59803793083216,33.851321727405654,0
22 | 228.18255258853324,25.66287236905584,1
23 | 223.71155079050865,32.75460042384817,1
24 | 234.52887282057452,27.204482603676563,1
25 | 236.42465882478854,33.264421019944734,1
26 | 228.68623213983366,34.95586326713179,1
27 | 214.96001035952295,46.47842176127723,0
28 | 224.97940259954586,32.93612165606207,1
29 | 206.52645343527783,49.17711271694269,0
30 | 231.56887862488688,32.73368819047123,1
31 | 222.9162962374835,43.8399504516882,1
32 | 195.01298226698495,41.887884866642665,0
33 | 181.32492749489995,39.66172663085066,0
34 | 184.9277400372963,44.68296183674407,0
35 | 213.71080584707875,41.92620454209646,0
36 | 201.3684705741947,34.88041187391387,0
37 | 232.28294943490397,41.41209672311486,1
38 | 227.73251158746402,33.13854642857629,1
39 | 201.23256713438084,47.43226108530389,0
40 | 188.69523975711658,40.66129280289728,0
41 | 199.88616893399396,33.979191184877436,0
42 | 205.71180176889115,43.74645054144711,0
43 | 234.77621079416735,25.73104698783016,1
44 | 215.10602303060045,34.979979538638695,1
45 | 231.03313527340998,31.038709255141786,1
46 | 223.12825212706943,27.917884351513962,1
47 | 213.84550388109832,40.585607418905525,0
48 | 197.19458295873218,50.132296008473936,0
49 | 200.84599243665198,44.15751591681757,0
50 | 185.41898467487533,37.47839889229,0
51 | 212.72318293387187,34.381712477724165,1
52 | 225.97054221375797,38.034539625123486,1
53 | 178.61457323129827,40.383861410305286,0
54 | 192.49107476800455,40.826417882129405,0
55 | 229.7631823669321,33.942770850673526,1
56 | 200.25025437616802,34.19181146464484,0
57 | 232.70986222851607,27.44386963791116,1
58 | 198.65652243177007,47.40063545926883,0
59 | 228.6605757010514,31.031297990890856,1
60 | 216.04286267722026,27.6333779653942,1
61 | 201.5666656644656,40.00421371071958,0
62 | 198.5461871395615,41.59056222969118,0
63 | 223.36943076451664,34.64501070274121,1
64 | 191.76130714750423,45.31348832109596,0
65 | 204.72103803481156,31.25098650901021,1
66 | 238.57207277218245,27.611047812373783,1
67 | 216.10508960488855,32.07708184040278,1
68 | 246.57556631641683,32.172656746965046,1
69 | 182.0195912263871,41.033054856651844,0
70 | 208.1452058229275,32.21874695642551,1
71 | 228.58294275690565,32.96039606501358,1
72 | 192.52164290838397,47.80615612282977,0
73 | 214.93162182666345,28.776977177473075,1
74 | 246.35104102569332,33.544242658998144,1
75 | 238.39857236276623,34.81886817130017,1
76 | 220.91585693036376,35.2976695788017,1
77 | 224.0533785747749,33.22440819855242,1
78 | 226.92368641906506,29.06491169011987,1
79 | 196.79655805155798,47.560538367998184,0
80 | 215.71385482652917,38.476374461149504,1
81 | 197.44019991682435,38.113170828278534,0
82 | 229.0513186093865,28.098397772198272,1
83 | 190.96989665441828,51.450620931659046,0
84 | 206.6366634220767,37.36547919029256,0
85 | 187.6004045582083,36.48795395487299,0
86 | 214.14218055211384,33.54314041189703,1
87 | 196.316538282671,41.580953981208836,0
88 | 189.86608535629077,41.45922774741997,0
89 | 168.77147324395446,46.728065731046186,0
90 | 225.44569193634334,26.01606743239626,1
91 | 186.447527065263,44.40131286038809,0
92 | 241.25825882339808,24.947357733687834,1
93 | 190.98164379983479,41.624082899440566,0
94 | 201.15359305228438,41.19605213408496,0
95 | 190.1425273855761,45.76452890984706,0
96 | 193.81880970714718,39.283908435394,0
97 | 208.9296641363971,47.22937979860898,0
98 | 207.65809268366445,44.467788505266626,0
99 | 216.51279050434854,24.42945865834223,1
100 | 195.172278622729,44.59777853875525,0
101 | 197.6449174120087,41.88582155802365,0
102 |
--------------------------------------------------------------------------------
/07_conclusion/07_conclusion.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Welcome to the Dark Art of Coding:\n",
8 | "## Introduction to Machine Learning\n",
9 | "Conclusion\n",
10 | "\n",
11 | "
"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Objectives\n",
19 | "---"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "In this session, students should expect to:\n",
27 | "\n",
28 | "* Review the content of the tutorial\n",
29 | "* Find places to learn more\n",
30 | "* Be inspired to continue the journey"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "# Review\n",
38 | "---"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## The Process"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "As we saw, for each model, **The Process** was by and large the same. When I started out, that discovery gave me a sense of relief.\n",
53 | "\n",
54 | "As your familiarity with Scikit Learn increases, you will find that each of the steps is pretty small and pretty straightforward, even if it doesn't feel that way today.\n",
55 | "\n",
56 | "* Prep the data\n",
57 | "* Choose the model\n",
58 | "* Choose appropriate hyperparameters\n",
59 | "* Fit the model\n",
60 | "* Apply the model\n",
61 | "* Examine the results\n",
62 | "\n"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "# Where to go next\n",
70 | "---"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "Machine learning is just one part of a data analysis system.\n",
78 | "\n",
79 | "* Frame the problem\n",
80 | "* Data acquisition\n",
81 | "* Data wrangling (cleansing, normalization, filtering, deduplication, etc)\n",
82 | "* Data exploration\n",
83 | "* In-depth analysis (sometimes machine learning)\n",
84 | "* Communication of results\n",
85 | "\n",
86 | "If you are part of a team, you may get lucky enough to find someone who is adept at some of these steps (hopefully the most tedious and time-consuming steps).\n",
87 | "\n",
88 | "But for the rest of us, we have to do some or all of the steps ourselves.\n",
89 | "\n",
90 | "If that is the case, then it behooves you to grow your skills not only on the In-depth Analysis steps, but on all of the other steps as well. Especially the ones where you will spend the greatest amount of time.\n",
91 | "\n",
92 | "<img src='./task_breakdown.jpg' width='600'>\n",
93 | "\n",
94 | "Source: [Forbes article on data analysis task breakdowns](https://www.forbes.com/sites/gilpress/2016/03/23/data-preparation-most-time-consuming-least-enjoyable-data-science-task-survey-says/#50b1f0236f63)\n",
95 | "\n",
96 | "\n",
97 | "**Yeah, but what about machine learning?**\n",
98 | "\n",
99 | "As you grow your skills in the ancillary skills, how do you grow your machine learning skills?\n",
100 | "\n",
101 | "* find the environment you learn best in: books, videos, classrooms\n",
102 | "* meet folks at meetups\n",
103 | "* find a pet project that will keep your attention\n",
104 | "* write code. every day. no, really, every day!\n",
105 | "\n"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "# A little inspiration\n",
113 | "---"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "As you continue your studies, you will often come across tutorials and code samples and by and large, each of them will follow the steps in **The Process™** (whether they call it that or not). When faced with a wall of code...\n",
121 | "\n",
122 | "* take a deep breath\n",
123 | "* break the code into parts \n",
124 | "* build the code line by line (don't just cut and paste)\n",
125 | "* identify which step we are working on in any given code block\n",
126 | "* separate the `must have` code from the `nice to have` code (the machine learning steps from the data engineering OR from the data viz steps)\n",
127 | "* run the code line by line, and thoroughly think about and examine what is produced by each line of code (is it a numpy array, a dataframe, a model, a prediction, etc)\n",
128 | "\n",
129 | "And most importantly:"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "**Don't give up**\n",
137 | "\n",
138 | "**Be curious**\n",
139 | "\n",
140 | "**Study hard**"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "# Experience Points!\n",
148 | "---"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {
154 | "slideshow": {
155 | "slide_type": "slide"
156 | }
157 | },
158 | "source": [
159 | "# Final task: task 01\n",
160 | "\n",
161 | "Tell yourself: \"**I've got this!**\"\n",
162 | "\n",
163 | "Repeat"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "---\n",
171 | "When you complete this exercise, please put your **green** post-it on your monitor. \n",
172 | "\n",
173 | "If you want to continue on at your own-pace, please feel free to do so.\n",
174 | "\n",
175 | "
"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "# References\n",
183 | "---"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "Below are references that may assist you in learning more:\n",
191 | " \n",
192 | "|Title (link)|Comments|\n",
193 | "|---|---|\n",
194 | "|[General API Reference](https://scikit-learn.org/stable/modules/classes.html)||\n",
195 | "|[Forbes article](https://www.forbes.com/sites/gilpress/2016/03/23/data-preparation-most-time-consuming-least-enjoyable-data-science-task-survey-says/#50b1f0236f63)|Article on data analysis task breakdowns|\n"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": []
204 | }
205 | ],
206 | "metadata": {
207 | "kernelspec": {
208 | "display_name": "Python 3",
209 | "language": "python",
210 | "name": "python3"
211 | },
212 | "language_info": {
213 | "codemirror_mode": {
214 | "name": "ipython",
215 | "version": 3
216 | },
217 | "file_extension": ".py",
218 | "mimetype": "text/x-python",
219 | "name": "python",
220 | "nbconvert_exporter": "python",
221 | "pygments_lexer": "ipython3",
222 | "version": "3.6.7"
223 | }
224 | },
225 | "nbformat": 4,
226 | "nbformat_minor": 2
227 | }
228 |
--------------------------------------------------------------------------------
/behind_the_scenes/scikit_learn_outline.txt:
--------------------------------------------------------------------------------
1 | -------------------------------------------------------------
2 |
3 | 0) Introduction: (10 mins - discussion)
4 | * Teacher introduction [95% done]
5 | * Agenda overview [95% done]
6 | * Accessing the course material
7 | > Individual student environments and course material (notebooks and
8 | datafiles) will be hosted in a Jupyter Hub and accessible via the
9 | Internet.
10 | > Instructions will ALSO be provided in advance for students to install
11 | the requisite libraries a) should there be a constraint in accessing the
12 | materials online and b) so that students can use the course materials
13 | after the course is done.
14 |
15 | 1) Machine Learning Overview (30 mins)
16 | * An overview of machine learning types and techniques
17 | * Supervised learning
18 | * Unsupervised learning
19 | * Classification
20 | * Regression
21 | * Clustering
22 | * Dimensionality reduction
23 | * Intro to Scikit-Learn
24 |
25 | 2) Naive Bayes Classification (20 mins)
26 | * Overview
27 | * Hands-on code examples
28 | * When to use and when not to use Naive Bayes Classification
29 |
30 | BREAK (10 mins) -------------------
31 |
32 | 3) Linear Regression (15 mins)
33 | * Overview
34 | * Hands-on code examples
35 | * When to use and when not to use Linear Regression
36 |
37 | 4) Support Vector Machines (20 mins)
38 | * Overview
39 | * Hands-on code examples
40 | * When to use and when not to use Support Vector Machines
41 |
42 | BREAK (10 mins) -------------------
43 |
44 | 5) Decision Trees and Random Forests (20 mins)
45 | * Overview
46 | * Hands-on code examples
47 | * When to use and when not to use Decision Trees
48 |
49 | 6) Principal Component Analysis (PCA) (20 mins)
50 | * Overview
51 | * Hands-on code examples
52 | * When to use and when not to use PCA
53 |
54 | 7) Conclusion (10 mins)
55 | * Gotchas and problems with using machine learning
56 | * Places to learn more
57 |
58 | TITLE
59 | Scikit-learn, wrapping your head around machine learning
60 |
61 | DESCRIPTION
62 | Edit using Markdown.
63 |
64 | Both your title and this description are made public and displayed in the conference program to help attendees decide whether they are interested in this presentation. Limit this description to a few concise paragraphs.
65 |
66 | A gentle introduction to machine learning through scikit-learn. This tutorial will enable attendees to understand the capabilities and limitations of machine learning through hands-on code examples and fun and interesting datasets. Learn when to turn to machine learning and which tools apply to your problem. Also learn about gotchas and problems that are likely to show up when attempting to use machine learning.
67 |
68 | AUDIENCE
69 | 1–2 paragraphs that should answer three questions: (1) Who is this tutorial for? (2) What background knowledge or experience do you expect students to have? (3) What do you expect students to learn, or to be able to do after attending your tutorial?
70 |
71 | Students that attend this tutorial should have a basic understanding of the following:
72 | * Python, to include importing libraries, writing simple functions, using datatypes such as dicts, sets and lists, and reading and writing files
73 | * The use of simple Jupyter/IPython notebooks
74 | * Familiarity with pandas and dataframes will be useful
75 |
76 | NOTE: previous knowledge of machine learning OR scikit-learn is not required.
77 |
78 | OUTLINE
79 |
80 | Make an outline that lists the topics and activities you will guide your students through over the 3 hours of your tutorial. Provide timings for each activity — indicate when and for how long you will lecture, and when and for how long students will be tackling hands-on exercises. This is a very important criterion! Generally speaking, the more detailed the outline, the more confidence the committee will have that you can deliver the material in the allotted time.
81 |
82 | ADDITIONAL NOTES
83 |
84 | (a) If you have offered this tutorial before, please provide links to the material and video, if possible. Otherwise, please provide links to one (or two!) previous presentations by each speaker. (b) Please summarize your teaching or public speaking experience and your experience with the subject of the tutorial. (c) Let us know if you have specific needs or special requests — for example, requests that involve accessibility, audio, or restrictions on when your talk can be scheduled.
85 |
86 | a) The material in this tutorial is based on a planned series of scikit-learn mentoring sessions designed for the PyHawaii Python meetup. These mentoring sessions are first taught to members of PyHawaii as a community service and are then delivered to clients in the data analysis industry. Thus this mentoring material will undergo at least two cycles of revision before being taught at PyCon.
87 |
88 | I am the past Chair of the Python Education Summit (held annually at PyCon) and have given technical presentations at Pycon, PyHawaii and PyOhio, in business settings, at the collegiate level and for high schoolers and youth. At Pycon and PyOhio, I host the 3-hour workshop on Preparing to Contribute to Open Source using git, virtual environments and Github. At PyHawaii, I am the lead instructor for our fortnightly mentoring sessions. I am the founder and lead instructor for Dark Art of Coding, a programming school. I served as adjunct faculty, teaching Programming in Python, at the University of Hawaii: (https://www.sis.hawaii.edu/uhdad/avail.class?i=MAN&t=201740&c=92245)
89 |
90 | b) I have been teaching technical computer-related topics for decades to a wide range of students, young and old, newbie to advanced. Through this experience, I have been able to evolve a teaching style that helps to match the message to the student: easing students into a subject at a pace that matches their knowledge level. Some examples of my training courses include:
91 |
92 | * Statistics and probability: your first steps on the road to data science (3 hours) @ Pycon 2018 (https://www.youtube.com/watch?v=zzbw0JbiI6Y)
93 | * Introduction to Bokeh: data visualization (3 hours) @ Pycon 2017 (https://www.youtube.com/watch?v=xId9B1BVusA)
94 | * Jupyter: Introduction to Jupyter Lab/Notebooks Tutorial (3 hours)
95 | * Python for Analysts bootcamp (40 hours)
96 | * Founder/Lead instructor for Dark Art of Coding (Intro to Programming with Python & Automating Everyday Tasks Using Python)
97 | * Adjunct Faculty, University of Hawaii, Introduction to Programming
98 | * Lead instructor for DjangoGirls Workshop (10 hours)
99 | * Preparing to Contribute to Open Source (4 hours)
100 | * Operating System and Network Security at Champlain College
101 | * Operating System Fundamentals
102 | * BASH Scripting
103 | * Windows Exploitation
104 | * Linux Exploitation
105 |
106 |
107 |
108 | BIO:
109 | Chalmer Lowe has served on Pycon's Python Education Summit Committee for many years. He helps run the Pycon Sprint Workshops every year. He founded Dark Art of Coding, a programming school. Chalmer founded PyHawaii. He performs data analysis for his employer: Booz Allen Hamilton and teaches Python to his colleagues, clients and anyone who will stand still long enough. Chalmer has a long history in the cyber security and programming fields including: Python, scripting/automation, penetration testing, vulnerability assessment, incident response, intel analysis, data analysis and the fundamentals of data science.
110 |
--------------------------------------------------------------------------------
/universal_datasets/seeds_dataset.txt:
--------------------------------------------------------------------------------
1 | 15.26 14.84 0.871 5.763 3.312 2.221 5.22 1
2 | 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 1
3 | 14.29 14.09 0.905 5.291 3.337 2.699 4.825 1
4 | 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 1
5 | 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 1
6 | 14.38 14.21 0.8951 5.386 3.312 2.462 4.956 1
7 | 14.69 14.49 0.8799 5.563 3.259 3.586 5.219 1
8 | 14.11 14.1 0.8911 5.42 3.302 2.7 5 1
9 | 16.63 15.46 0.8747 6.053 3.465 2.04 5.877 1
10 | 16.44 15.25 0.888 5.884 3.505 1.969 5.533 1
11 | 15.26 14.85 0.8696 5.714 3.242 4.543 5.314 1
12 | 14.03 14.16 0.8796 5.438 3.201 1.717 5.001 1
13 | 13.89 14.02 0.888 5.439 3.199 3.986 4.738 1
14 | 13.78 14.06 0.8759 5.479 3.156 3.136 4.872 1
15 | 13.74 14.05 0.8744 5.482 3.114 2.932 4.825 1
16 | 14.59 14.28 0.8993 5.351 3.333 4.185 4.781 1
17 | 13.99 13.83 0.9183 5.119 3.383 5.234 4.781 1
18 | 15.69 14.75 0.9058 5.527 3.514 1.599 5.046 1
19 | 14.7 14.21 0.9153 5.205 3.466 1.767 4.649 1
20 | 12.72 13.57 0.8686 5.226 3.049 4.102 4.914 1
21 | 14.16 14.4 0.8584 5.658 3.129 3.072 5.176 1
22 | 14.11 14.26 0.8722 5.52 3.168 2.688 5.219 1
23 | 15.88 14.9 0.8988 5.618 3.507 0.7651 5.091 1
24 | 12.08 13.23 0.8664 5.099 2.936 1.415 4.961 1
25 | 15.01 14.76 0.8657 5.789 3.245 1.791 5.001 1
26 | 16.19 15.16 0.8849 5.833 3.421 0.903 5.307 1
27 | 13.02 13.76 0.8641 5.395 3.026 3.373 4.825 1
28 | 12.74 13.67 0.8564 5.395 2.956 2.504 4.869 1
29 | 14.11 14.18 0.882 5.541 3.221 2.754 5.038 1
30 | 13.45 14.02 0.8604 5.516 3.065 3.531 5.097 1
31 | 13.16 13.82 0.8662 5.454 2.975 0.8551 5.056 1
32 | 15.49 14.94 0.8724 5.757 3.371 3.412 5.228 1
33 | 14.09 14.41 0.8529 5.717 3.186 3.92 5.299 1
34 | 13.94 14.17 0.8728 5.585 3.15 2.124 5.012 1
35 | 15.05 14.68 0.8779 5.712 3.328 2.129 5.36 1
36 | 16.12 15 0.9 5.709 3.485 2.27 5.443 1
37 | 16.2 15.27 0.8734 5.826 3.464 2.823 5.527 1
38 | 17.08 15.38 0.9079 5.832 3.683 2.956 5.484 1
39 | 14.8 14.52 0.8823 5.656 3.288 3.112 5.309 1
40 | 14.28 14.17 0.8944 5.397 3.298 6.685 5.001 1
41 | 13.54 13.85 0.8871 5.348 3.156 2.587 5.178 1
42 | 13.5 13.85 0.8852 5.351 3.158 2.249 5.176 1
43 | 13.16 13.55 0.9009 5.138 3.201 2.461 4.783 1
44 | 15.5 14.86 0.882 5.877 3.396 4.711 5.528 1
45 | 15.11 14.54 0.8986 5.579 3.462 3.128 5.18 1
46 | 13.8 14.04 0.8794 5.376 3.155 1.56 4.961 1
47 | 15.36 14.76 0.8861 5.701 3.393 1.367 5.132 1
48 | 14.99 14.56 0.8883 5.57 3.377 2.958 5.175 1
49 | 14.79 14.52 0.8819 5.545 3.291 2.704 5.111 1
50 | 14.86 14.67 0.8676 5.678 3.258 2.129 5.351 1
51 | 14.43 14.4 0.8751 5.585 3.272 3.975 5.144 1
52 | 15.78 14.91 0.8923 5.674 3.434 5.593 5.136 1
53 | 14.49 14.61 0.8538 5.715 3.113 4.116 5.396 1
54 | 14.33 14.28 0.8831 5.504 3.199 3.328 5.224 1
55 | 14.52 14.6 0.8557 5.741 3.113 1.481 5.487 1
56 | 15.03 14.77 0.8658 5.702 3.212 1.933 5.439 1
57 | 14.46 14.35 0.8818 5.388 3.377 2.802 5.044 1
58 | 14.92 14.43 0.9006 5.384 3.412 1.142 5.088 1
59 | 15.38 14.77 0.8857 5.662 3.419 1.999 5.222 1
60 | 12.11 13.47 0.8392 5.159 3.032 1.502 4.519 1
61 | 11.42 12.86 0.8683 5.008 2.85 2.7 4.607 1
62 | 11.23 12.63 0.884 4.902 2.879 2.269 4.703 1
63 | 12.36 13.19 0.8923 5.076 3.042 3.22 4.605 1
64 | 13.22 13.84 0.868 5.395 3.07 4.157 5.088 1
65 | 12.78 13.57 0.8716 5.262 3.026 1.176 4.782 1
66 | 12.88 13.5 0.8879 5.139 3.119 2.352 4.607 1
67 | 14.34 14.37 0.8726 5.63 3.19 1.313 5.15 1
68 | 14.01 14.29 0.8625 5.609 3.158 2.217 5.132 1
69 | 14.37 14.39 0.8726 5.569 3.153 1.464 5.3 1
70 | 12.73 13.75 0.8458 5.412 2.882 3.533 5.067 1
71 | 17.63 15.98 0.8673 6.191 3.561 4.076 6.06 2
72 | 16.84 15.67 0.8623 5.998 3.484 4.675 5.877 2
73 | 17.26 15.73 0.8763 5.978 3.594 4.539 5.791 2
74 | 19.11 16.26 0.9081 6.154 3.93 2.936 6.079 2
75 | 16.82 15.51 0.8786 6.017 3.486 4.004 5.841 2
76 | 16.77 15.62 0.8638 5.927 3.438 4.92 5.795 2
77 | 17.32 15.91 0.8599 6.064 3.403 3.824 5.922 2
78 | 20.71 17.23 0.8763 6.579 3.814 4.451 6.451 2
79 | 18.94 16.49 0.875 6.445 3.639 5.064 6.362 2
80 | 17.12 15.55 0.8892 5.85 3.566 2.858 5.746 2
81 | 16.53 15.34 0.8823 5.875 3.467 5.532 5.88 2
82 | 18.72 16.19 0.8977 6.006 3.857 5.324 5.879 2
83 | 20.2 16.89 0.8894 6.285 3.864 5.173 6.187 2
84 | 19.57 16.74 0.8779 6.384 3.772 1.472 6.273 2
85 | 19.51 16.71 0.878 6.366 3.801 2.962 6.185 2
86 | 18.27 16.09 0.887 6.173 3.651 2.443 6.197 2
87 | 18.88 16.26 0.8969 6.084 3.764 1.649 6.109 2
88 | 18.98 16.66 0.859 6.549 3.67 3.691 6.498 2
89 | 21.18 17.21 0.8989 6.573 4.033 5.78 6.231 2
90 | 20.88 17.05 0.9031 6.45 4.032 5.016 6.321 2
91 | 20.1 16.99 0.8746 6.581 3.785 1.955 6.449 2
92 | 18.76 16.2 0.8984 6.172 3.796 3.12 6.053 2
93 | 18.81 16.29 0.8906 6.272 3.693 3.237 6.053 2
94 | 18.59 16.05 0.9066 6.037 3.86 6.001 5.877 2
95 | 18.36 16.52 0.8452 6.666 3.485 4.933 6.448 2
96 | 16.87 15.65 0.8648 6.139 3.463 3.696 5.967 2
97 | 19.31 16.59 0.8815 6.341 3.81 3.477 6.238 2
98 | 18.98 16.57 0.8687 6.449 3.552 2.144 6.453 2
99 | 18.17 16.26 0.8637 6.271 3.512 2.853 6.273 2
100 | 18.72 16.34 0.881 6.219 3.684 2.188 6.097 2
101 | 16.41 15.25 0.8866 5.718 3.525 4.217 5.618 2
102 | 17.99 15.86 0.8992 5.89 3.694 2.068 5.837 2
103 | 19.46 16.5 0.8985 6.113 3.892 4.308 6.009 2
104 | 19.18 16.63 0.8717 6.369 3.681 3.357 6.229 2
105 | 18.95 16.42 0.8829 6.248 3.755 3.368 6.148 2
106 | 18.83 16.29 0.8917 6.037 3.786 2.553 5.879 2
107 | 18.85 16.17 0.9056 6.152 3.806 2.843 6.2 2
108 | 17.63 15.86 0.88 6.033 3.573 3.747 5.929 2
109 | 19.94 16.92 0.8752 6.675 3.763 3.252 6.55 2
110 | 18.55 16.22 0.8865 6.153 3.674 1.738 5.894 2
111 | 18.45 16.12 0.8921 6.107 3.769 2.235 5.794 2
112 | 19.38 16.72 0.8716 6.303 3.791 3.678 5.965 2
113 | 19.13 16.31 0.9035 6.183 3.902 2.109 5.924 2
114 | 19.14 16.61 0.8722 6.259 3.737 6.682 6.053 2
115 | 20.97 17.25 0.8859 6.563 3.991 4.677 6.316 2
116 | 19.06 16.45 0.8854 6.416 3.719 2.248 6.163 2
117 | 18.96 16.2 0.9077 6.051 3.897 4.334 5.75 2
118 | 19.15 16.45 0.889 6.245 3.815 3.084 6.185 2
119 | 18.89 16.23 0.9008 6.227 3.769 3.639 5.966 2
120 | 20.03 16.9 0.8811 6.493 3.857 3.063 6.32 2
121 | 20.24 16.91 0.8897 6.315 3.962 5.901 6.188 2
122 | 18.14 16.12 0.8772 6.059 3.563 3.619 6.011 2
123 | 16.17 15.38 0.8588 5.762 3.387 4.286 5.703 2
124 | 18.43 15.97 0.9077 5.98 3.771 2.984 5.905 2
125 | 15.99 14.89 0.9064 5.363 3.582 3.336 5.144 2
126 | 18.75 16.18 0.8999 6.111 3.869 4.188 5.992 2
127 | 18.65 16.41 0.8698 6.285 3.594 4.391 6.102 2
128 | 17.98 15.85 0.8993 5.979 3.687 2.257 5.919 2
129 | 20.16 17.03 0.8735 6.513 3.773 1.91 6.185 2
130 | 17.55 15.66 0.8991 5.791 3.69 5.366 5.661 2
131 | 18.3 15.89 0.9108 5.979 3.755 2.837 5.962 2
132 | 18.94 16.32 0.8942 6.144 3.825 2.908 5.949 2
133 | 15.38 14.9 0.8706 5.884 3.268 4.462 5.795 2
134 | 16.16 15.33 0.8644 5.845 3.395 4.266 5.795 2
135 | 15.56 14.89 0.8823 5.776 3.408 4.972 5.847 2
136 | 15.38 14.66 0.899 5.477 3.465 3.6 5.439 2
137 | 17.36 15.76 0.8785 6.145 3.574 3.526 5.971 2
138 | 15.57 15.15 0.8527 5.92 3.231 2.64 5.879 2
139 | 15.6 15.11 0.858 5.832 3.286 2.725 5.752 2
140 | 16.23 15.18 0.885 5.872 3.472 3.769 5.922 2
141 | 13.07 13.92 0.848 5.472 2.994 5.304 5.395 3
142 | 13.32 13.94 0.8613 5.541 3.073 7.035 5.44 3
143 | 13.34 13.95 0.862 5.389 3.074 5.995 5.307 3
144 | 12.22 13.32 0.8652 5.224 2.967 5.469 5.221 3
145 | 11.82 13.4 0.8274 5.314 2.777 4.471 5.178 3
146 | 11.21 13.13 0.8167 5.279 2.687 6.169 5.275 3
147 | 11.43 13.13 0.8335 5.176 2.719 2.221 5.132 3
148 | 12.49 13.46 0.8658 5.267 2.967 4.421 5.002 3
149 | 12.7 13.71 0.8491 5.386 2.911 3.26 5.316 3
150 | 10.79 12.93 0.8107 5.317 2.648 5.462 5.194 3
151 | 11.83 13.23 0.8496 5.263 2.84 5.195 5.307 3
152 | 12.01 13.52 0.8249 5.405 2.776 6.992 5.27 3
153 | 12.26 13.6 0.8333 5.408 2.833 4.756 5.36 3
154 | 11.18 13.04 0.8266 5.22 2.693 3.332 5.001 3
155 | 11.36 13.05 0.8382 5.175 2.755 4.048 5.263 3
156 | 11.19 13.05 0.8253 5.25 2.675 5.813 5.219 3
157 | 11.34 12.87 0.8596 5.053 2.849 3.347 5.003 3
158 | 12.13 13.73 0.8081 5.394 2.745 4.825 5.22 3
159 | 11.75 13.52 0.8082 5.444 2.678 4.378 5.31 3
160 | 11.49 13.22 0.8263 5.304 2.695 5.388 5.31 3
161 | 12.54 13.67 0.8425 5.451 2.879 3.082 5.491 3
162 | 12.02 13.33 0.8503 5.35 2.81 4.271 5.308 3
163 | 12.05 13.41 0.8416 5.267 2.847 4.988 5.046 3
164 | 12.55 13.57 0.8558 5.333 2.968 4.419 5.176 3
165 | 11.14 12.79 0.8558 5.011 2.794 6.388 5.049 3
166 | 12.1 13.15 0.8793 5.105 2.941 2.201 5.056 3
167 | 12.44 13.59 0.8462 5.319 2.897 4.924 5.27 3
168 | 12.15 13.45 0.8443 5.417 2.837 3.638 5.338 3
169 | 11.35 13.12 0.8291 5.176 2.668 4.337 5.132 3
170 | 11.24 13 0.8359 5.09 2.715 3.521 5.088 3
171 | 11.02 13 0.8189 5.325 2.701 6.735 5.163 3
172 | 11.55 13.1 0.8455 5.167 2.845 6.715 4.956 3
173 | 11.27 12.97 0.8419 5.088 2.763 4.309 5 3
174 | 11.4 13.08 0.8375 5.136 2.763 5.588 5.089 3
175 | 10.83 12.96 0.8099 5.278 2.641 5.182 5.185 3
176 | 10.8 12.57 0.859 4.981 2.821 4.773 5.063 3
177 | 11.26 13.01 0.8355 5.186 2.71 5.335 5.092 3
178 | 10.74 12.73 0.8329 5.145 2.642 4.702 4.963 3
179 | 11.48 13.05 0.8473 5.18 2.758 5.876 5.002 3
180 | 12.21 13.47 0.8453 5.357 2.893 1.661 5.178 3
181 | 11.41 12.95 0.856 5.09 2.775 4.957 4.825 3
182 | 12.46 13.41 0.8706 5.236 3.017 4.987 5.147 3
183 | 12.19 13.36 0.8579 5.24 2.909 4.857 5.158 3
184 | 11.65 13.07 0.8575 5.108 2.85 5.209 5.135 3
185 | 12.89 13.77 0.8541 5.495 3.026 6.185 5.316 3
186 | 11.56 13.31 0.8198 5.363 2.683 4.062 5.182 3
187 | 11.81 13.45 0.8198 5.413 2.716 4.898 5.352 3
188 | 10.91 12.8 0.8372 5.088 2.675 4.179 4.956 3
189 | 11.23 12.82 0.8594 5.089 2.821 7.524 4.957 3
190 | 10.59 12.41 0.8648 4.899 2.787 4.975 4.794 3
191 | 10.93 12.8 0.839 5.046 2.717 5.398 5.045 3
192 | 11.27 12.86 0.8563 5.091 2.804 3.985 5.001 3
193 | 11.87 13.02 0.8795 5.132 2.953 3.597 5.132 3
194 | 10.82 12.83 0.8256 5.18 2.63 4.853 5.089 3
195 | 12.11 13.27 0.8639 5.236 2.975 4.132 5.012 3
196 | 12.8 13.47 0.886 5.16 3.126 4.873 4.914 3
197 | 12.79 13.53 0.8786 5.224 3.054 5.483 4.958 3
198 | 13.37 13.78 0.8849 5.32 3.128 4.67 5.091 3
199 | 12.62 13.67 0.8481 5.41 2.911 3.306 5.231 3
200 | 12.76 13.38 0.8964 5.073 3.155 2.828 4.83 3
201 | 12.38 13.44 0.8609 5.219 2.989 5.472 5.045 3
202 | 12.67 13.32 0.8977 4.984 3.135 2.3 4.745 3
203 | 11.18 12.72 0.868 5.009 2.81 4.051 4.828 3
204 | 12.7 13.41 0.8874 5.183 3.091 8.456 5 3
205 | 12.37 13.47 0.8567 5.204 2.96 3.919 5.001 3
206 | 12.19 13.2 0.8783 5.137 2.981 3.631 4.87 3
207 | 11.23 12.88 0.8511 5.14 2.795 4.325 5.003 3
208 | 13.2 13.66 0.8883 5.236 3.232 8.315 5.056 3
209 | 11.84 13.21 0.8521 5.175 2.836 3.598 5.044 3
210 | 12.3 13.34 0.8684 5.243 2.974 5.637 5.063 3
--------------------------------------------------------------------------------
/00_basics/00_intro.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Welcome to the Dark Art of Coding:\n",
8 | "## Introduction to Machine Learning\n",
9 | "Class overview\n",
10 | "\n",
11 | "
"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Objectives\n",
19 | "---"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "In this session, students should expect to:\n",
27 | "\n",
28 | "* Get to know the instructor\n",
29 | "* Review 'what to expect' and 'what not to expect'\n",
30 | "* Explore how to access the class materials"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "# Instructor Intro\n",
38 | "---"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "My name is **Chalmer Lowe**. I like to stay busy ... To that end, I:\n",
46 | "\n",
47 | "* work for **[Booz Allen Hamilton](https://www.boozallen.com/about.html)**, a technology and consulting firm with one guiding purpose—to empower people to change the world. \n",
48 | "* founded a programming school called **[Dark Art of Coding](https://darkartofcoding.com/)**, where I teach Python, data analysis and data science.\n",
49 | "* founded **[PyHawaii](https://www.meetup.com/PyHawaii-Python-Users-Group/)**, the largest and most active programming meetup in Hawaii. \n",
50 | "* serve on the **[Pycon Education Summit Committee](https://us.pycon.org/2019/events/edusummit/#!)**.\n",
51 | "* help teach the **[Introduction to Sprinting Hands-on Tutorial](https://us.pycon.org/2019/community/sprints/)** (late Sunday night!)\n",
52 | "* this is my third Pycon tutorial... previous trips into the lion's den include:\n",
53 | " * Introduction to Bokeh\n",
54 | " * Introduction to Statistics and Probability\n",
55 | "* have contributed (in minor ways) to bokeh, pandas, Jupyter and more.\n",
56 | "\n",
57 | "|Social Media|Contact |\n",
58 | "|----:|----:|\n",
59 | "|Twitter|@chalmer_lowe|\n",
60 | "|Email|info@darkartofcoding.com|\n",
61 | "|Linkedin|https://www.linkedin.com/in/chalmerlowe/ |"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "# What to expect/not expect\n",
69 | "---"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "We have a lot of material to cover...\n",
77 | "\n",
78 | "1. We are gonna move at a **rapid pace**\n",
79 | "\n",
80 | "1. I will gladly take questions, but depending on the depth of the question and/or the relevance of the question to what we are trying to accomplish and/or whether I can answer the question, I might defer the question to a **parking lot** and cover it at the end OR after the tutorial ends\n",
81 | "\n",
 82 | "1. During certain points in the tutorial, I may be able to support some one-on-one conversations and/or over-the-shoulder help, but it will be dependent on our progress\n",
83 | "\n",
84 | "1. There is a lot of math under the hood, when doing machine learning. There is almost no math in this tutorial\n",
85 | "\n",
86 | "1. If someone runs into a significant snag that can't be rectified in a timely fashion, I will invite you to simply take a deep breath, relax and enjoy the show OR partner with someone beside you >>> Please note, I would be happy to visit with you later during the conference OR during the Sprints to do a deep dive troubleshooting session.\n",
87 | "\n",
88 | "1. My main ambition today is **to teach you to learn**. I will do what I can to:\n",
89 | " * Help you know where to look for more info\n",
90 | " * Help you explore some of the most commonly used grammar and vocabulary\n",
91 | "\n",
92 | "1. Expectation management: No one is gonna walk out of this room, after only three hours, as a ninja/guru/rockstar\n",
93 | "\n",
94 | "1. For you to get there, will take considerable time, effort, practice, repetition and additional study"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "# Lesson layout...\n",
102 | "---"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {
108 | "slideshow": {
109 | "slide_type": "slide"
110 | }
111 | },
112 | "source": [
113 | "1. Each lesson starts with a high level **overview**\n",
114 | "\n",
115 | "1. The lesson then provides some **code samples** that you should be able to run and explore\n",
116 | "\n",
117 | "1. After the code, there will generally be some written documentation in a **deep dive** that we will cover at a high level\n",
118 | "\n",
119 | "1. To help us keep to our schedule, each section is **time-boxed**... we will cover as much as we can and then we will move on\n",
120 | "\n",
121 | "1. That deep dive is specifically designed for you to go back to for additional clarification at your leisure and to help account for the fact that with limited time, we won't have the ability to dwell deeply on any given topic\n",
122 | "\n",
123 | "1. In some of the sessions, there may be some opportunities for you to do some hands-on exercises... when you complete the exercise, put up your **green post-it**. If you run into a snag, put up your **red post-it**. (see a sample exercise below)\n",
124 | "\n",
125 | "1. Each lesson and exercise should be fairly stand-alone with the mindset that if something in a particular lesson doesn't work for you, it hopefully won't affect any of the follow-on lessons. If you run into a snag that we can't resolve, feel free to **make a note of it** and we can explore it together later today OR this weekend. "
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "# Experience Points!\n",
133 | "---"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {
139 | "slideshow": {
140 | "slide_type": "slide"
141 | }
142 | },
143 | "source": [
144 | "Sample exercise...\n",
145 | "\n",
146 | "1. Remind yourself \"I've got this!\"\n",
147 | "1. Repeat\n"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "---\n",
155 | "When you complete this exercise, please put your **green** post-it on your monitor. \n",
156 | "\n",
157 | "If you want to continue on at your own pace, please feel free to do so.\n",
158 | "\n",
159 | "
"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "# Accessing the class materials\n",
167 | "---"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {},
173 | "source": [
174 | "There are several primary ways to access the class materials:\n",
175 | " \n",
176 | "|Source|Interactive?|\n",
177 | "|:---|:---|\n",
178 | "|Via the class Jupyter Hub |Fully interactive\n",
179 | "|Via a local install you run in an environment on your machine |Fully interactive\n",
180 | "|Via Github |Non-interactive \n"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
187 | "## Via Jupyter Hub"
188 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "metadata": {},
193 | "source": [
194 | "Each of you should have received an **instruction flyer** with your **personal student username** that will walk you through a process to access the class material.\n",
195 | "\n",
196 | "If you follow those steps, you should be able to access all the course content **without having to install anything** on your local machine.\n",
197 | "\n",
198 | "The Jupyter Hub will only be available for the duration of the tutorial, so instructions are provided below to allow you to install the needed software and download all the course content to your local machine."
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {},
204 | "source": [
205 | "## Via local install"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {},
211 | "source": [
212 | "To install the software needed to run the class and to go through the class material on your local machine, go to this link and follow the instructions you find there:\n",
213 | "\n",
214 | "[**https://github.com/chalmerlowe/machine_learning/blob/master/00_basics/01_install.ipynb**](https://github.com/chalmerlowe/machine_learning/blob/master/00_basics/01_install.ipynb)"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {},
220 | "source": [
221 | "## Via Github"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "If all else fails, you can follow along in a static repository of files hosted on Github. You won't be able to run code, but you can still track the class. Chalmer's Machine Learning Tutorial can be found here:\n",
229 | "\n",
230 | "[**https://github.com/chalmerlowe/machine_learning**](https://github.com/chalmerlowe/machine_learning)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": []
239 | }
240 | ],
241 | "metadata": {
242 | "kernelspec": {
243 | "display_name": "Python 3",
244 | "language": "python",
245 | "name": "python3"
246 | },
247 | "language_info": {
248 | "codemirror_mode": {
249 | "name": "ipython",
250 | "version": 3
251 | },
252 | "file_extension": ".py",
253 | "mimetype": "text/x-python",
254 | "name": "python",
255 | "nbconvert_exporter": "python",
256 | "pygments_lexer": "ipython3",
257 | "version": "3.6.7"
258 | }
259 | },
260 | "nbformat": 4,
261 | "nbformat_minor": 2
262 | }
263 |
--------------------------------------------------------------------------------
/behind_the_scenes/lesson_template.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Welcome to the Dark Art of Coding:\n",
8 | "## Introduction to Machine Learning\n",
9 | "replace_with_topic\n",
10 | "\n",
11 | "
"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Objectives\n",
19 | "---"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "In this session, students should expect to:\n",
27 | "\n",
28 | "* \n",
29 | "* \n",
30 | "* \n",
31 | "\n",
32 | "\n",
33 | "* DELETE: Short list of expected outcomes\n",
34 | "* DELETE: Use active verbs: Create, Change, Manipulate, Explore, etc.\n",
35 | "* DELETE: Keep outcomes measurable, where possible: Success means the thing was created OR the object was changed "
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "# Overview: Model X\n",
43 | "---"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {
50 | "collapsed": true
51 | },
52 | "outputs": [],
53 | "source": []
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "## Prep the data"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "We start with a set of standard imports..."
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "import matplotlib.pyplot as plt\n",
76 | "import numpy as np\n",
77 | "import pandas as pd\n",
78 | "import sklearn\n",
79 | "\n",
80 | "# NOTE: during the Choose the Model step, we will import the \n",
81 | "# model we want, but there is no reason you can't import it here.\n",
82 | "# from sklearn.xx import XX"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "### Prep the training data and test data"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "df = pd.read_csv('../universal_datasets/skincancer.txt',\n",
99 | " names=[])\n",
100 | "df.head()"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "df.shape"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "X = df[].to_frame()\n",
119 | "y = df[]"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "X[:5]"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "y[:5]"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "With our data imported, let's separate it into training data and test data."
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 2,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "from sklearn.model_selection import train_test_split"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "plt.scatter(X_train, y_train)\n",
172 | "plt.title(\"\")\n",
173 | "plt.xlabel(\"\")\n",
174 | "plt.ylabel(\"\");"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "## Choose the Model"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "metadata": {
188 | "collapsed": true
189 | },
190 | "outputs": [],
191 | "source": []
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "## Choose Appropriate Hyperparameters"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "Here we choose to assign xx hyperparameters: `xx` and `xx`. We will discuss both later."
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "collapsed": true
212 | },
213 | "outputs": [],
214 | "source": []
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "There are a number of hyperparameters\n",
221 | "\n",
222 | "```python\n",
223 | "XX()\n",
224 | "```"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "## Fit the Model"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {
238 | "collapsed": true
239 | },
240 | "outputs": [],
241 | "source": [
242 | "model.fit(X_train, y_train)"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {
249 | "collapsed": true
250 | },
251 | "outputs": [],
252 | "source": []
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "## Apply the Model"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {
265 | "collapsed": true
266 | },
267 | "outputs": [],
268 | "source": [
269 | "y_pred = model.predict(X_test)"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {
276 | "collapsed": true
277 | },
278 | "outputs": [],
279 | "source": [
280 | "y_pred.shape"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {
287 | "collapsed": true
288 | },
289 | "outputs": [],
290 | "source": [
291 | "y_pred[::100]"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {
298 | "collapsed": true
299 | },
300 | "outputs": [],
301 | "source": []
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "## Examine the results"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {
314 | "collapsed": true
315 | },
316 | "outputs": [],
317 | "source": []
318 | },
319 | {
320 | "cell_type": "markdown",
321 | "metadata": {},
322 | "source": [
323 | "# Gotchas\n",
324 | "---"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {
331 | "collapsed": true
332 | },
333 | "outputs": [],
334 | "source": []
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "metadata": {},
339 | "source": [
340 | "# Deep Dive\n",
341 | "---"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": null,
347 | "metadata": {
348 | "collapsed": true
349 | },
350 | "outputs": [],
351 | "source": []
352 | },
353 | {
354 | "cell_type": "markdown",
355 | "metadata": {},
356 | "source": [
357 | "# Gotchas\n",
358 | "---"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {
365 | "collapsed": true
366 | },
367 | "outputs": [],
368 | "source": []
369 | },
370 | {
371 | "cell_type": "markdown",
372 | "metadata": {},
373 | "source": [
374 | "# How to learn more: tips and hints\n",
375 | "---"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "metadata": {
382 | "collapsed": true
383 | },
384 | "outputs": [],
385 | "source": []
386 | },
387 | {
388 | "cell_type": "markdown",
389 | "metadata": {},
390 | "source": [
391 | "# Experience Points!\n",
392 | "---"
393 | ]
394 | },
395 | {
396 | "cell_type": "markdown",
397 | "metadata": {
398 | "slideshow": {
399 | "slide_type": "slide"
400 | }
401 | },
402 | "source": [
403 | "# delete_this_line: task 01\n",
404 | "\n",
405 | "In **`jupyter`** create a simple script to complete the following tasks:\n",
406 | "\n",
407 | "\n",
408 | "**REPLACE THE FOLLOWING**\n",
409 | "\n",
410 | "Create a function called `me()` that prints out 3 things:\n",
411 | "\n",
412 | "* Your name\n",
413 | "* Your favorite food\n",
414 | "* Your favorite color\n",
415 | "\n",
416 | "Lastly, call the function, so that it executes when the script is run"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | "---\n",
424 | "When you complete this exercise, please put your **green** post-it on your monitor. \n",
425 | "\n",
426 | "If you want to continue on at your own pace, please feel free to do so.\n",
427 | "\n",
428 | "
"
429 | ]
430 | },
431 | {
432 | "cell_type": "markdown",
433 | "metadata": {},
434 | "source": [
435 | "# Experience Points!\n",
436 | "---"
437 | ]
438 | },
439 | {
440 | "cell_type": "markdown",
441 | "metadata": {
442 | "slideshow": {
443 | "slide_type": "slide"
444 | }
445 | },
446 | "source": [
447 | "# delete_this_line: task 02\n",
448 | "\n",
449 | "In **`jupyter`** create a simple script to complete the following tasks:\n",
450 | "\n",
451 | "**REPLACE THE FOLLOWING**\n",
452 | "\n",
453 | "Task | Sample Object(s)\n",
454 | ":---|:---\n",
455 | "Compare two items using `and` | 'Bruce', 0\n",
456 | "Compare two items using `or` | '', 42\n",
457 | "Use the `not` operator to make an object False | 'Selina' \n",
458 | "Compare two numbers using comparison operators | `>, <, >=, !=, ==`\n",
459 | "Create a more complex/nested comparison using parentheses and Boolean operators| `('kara' _ 'clark') _ (0 _ 0.0)`"
460 | ]
461 | },
462 | {
463 | "cell_type": "markdown",
464 | "metadata": {},
465 | "source": [
466 | "---\n",
467 | "When you complete this exercise, please put your **green** post-it on your monitor. \n",
468 | "\n",
469 | "If you want to continue on at your own pace, please feel free to do so.\n",
470 | "\n",
471 | "
"
472 | ]
473 | },
474 | {
475 | "cell_type": "markdown",
476 | "metadata": {},
477 | "source": [
478 | "# Experience Points!\n",
479 | "---"
480 | ]
481 | },
482 | {
483 | "cell_type": "markdown",
484 | "metadata": {},
485 | "source": [
486 | "# delete_this_line: sample 03\n",
487 | "\n",
488 | "In your **text editor** create a simple script called:\n",
489 | "\n",
490 | "```bash\n",
491 | "my_lessonname_03.py```\n",
492 | "\n",
493 | "Execute your script on the command line using **`ipython`** via this command:\n",
494 | "\n",
495 | "```bash\n",
496 | "ipython -i my_lessonname_03.py```\n",
497 | "\n",
498 | "**REPLACE THE FOLLOWING**\n",
499 | "\n",
500 | "I suggest that as you add each feature to your script, you run it right away to test it incrementally. \n",
501 | "\n",
502 | "1. Create a variable with your first name as a string AND save it with the label: `myfname`.\n",
503 | "1. Create a variable with your age as an integer AND save it with the label: `myage`.\n",
504 | "\n",
505 | "1. Use `input()` to prompt for your first name AND save it with the label: `fname`.\n",
506 | "1. Create an `if` statement to test whether `fname` is equivalent to `myfname`. \n",
507 | "1. In the `if` code block: \n",
508 | " 1. Use `input()` prompt for your age AND save it with the label: `age` \n",
509 | " 1. NOTE: don't forget to convert the value to an integer.\n",
510 | " 1. Create a nested `if` statement to test whether `myage` and `age` are equivalent.\n",
511 | "1. If both tests pass, have the script print: `Your identity has been verified`"
512 | ]
513 | },
514 | {
515 | "cell_type": "markdown",
516 | "metadata": {},
517 | "source": [
518 | "When you complete this exercise, please put your **green** post-it on your monitor. \n",
519 | "\n",
520 | "If you want to continue on at your own pace, please feel free to do so.\n",
521 | "\n",
522 | "
"
523 | ]
524 | },
525 | {
526 | "cell_type": "markdown",
527 | "metadata": {},
528 | "source": [
529 | "# References\n",
530 | "---"
531 | ]
532 | },
533 | {
534 | "cell_type": "markdown",
535 | "metadata": {},
536 | "source": [
537 | "Below are references that may assist you in learning more:\n",
538 | " \n",
539 | "|Title (link)|Comments|\n",
540 | "|---|---|\n",
541 | "|[General API Reference](https://scikit-learn.org/stable/modules/classes.html)||\n",
542 | "|[XX API Reference]()||\n",
543 | "|[User Guide]()||"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "execution_count": null,
549 | "metadata": {
550 | "collapsed": true
551 | },
552 | "outputs": [],
553 | "source": []
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": null,
558 | "metadata": {
559 | "collapsed": true
560 | },
561 | "outputs": [],
562 | "source": []
563 | }
564 | ],
565 | "metadata": {
566 | "kernelspec": {
567 | "display_name": "Python 3",
568 | "language": "python",
569 | "name": "python3"
570 | },
571 | "language_info": {
572 | "codemirror_mode": {
573 | "name": "ipython",
574 | "version": 3
575 | },
576 | "file_extension": ".py",
577 | "mimetype": "text/x-python",
578 | "name": "python",
579 | "nbconvert_exporter": "python",
580 | "pygments_lexer": "ipython3",
581 | "version": "3.6.7"
582 | }
583 | },
584 | "nbformat": 4,
585 | "nbformat_minor": 2
586 | }
587 |
--------------------------------------------------------------------------------
/03_linear_reg/03_linear_reg.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Welcome to the Dark Art of Coding:\n",
8 | "## Introduction to Machine Learning\n",
9 | "Linear Regression\n",
10 | "\n",
11 | "
"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Objectives\n",
19 | "---"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "In this session, students should expect to:\n",
27 | "\n",
28 | "* Cover an overview of Linear Regression\n",
29 | "* Examine code samples that walk us through **The Process™**:\n",
30 | " * Prep the data\n",
31 | " * Choose the model\n",
32 | " * Choose appropriate hyperparameters\n",
33 | " * Fit the model\n",
34 | " * Apply the model\n",
35 | " * Examine the results\n",
36 | "* Explore a deep dive into this model\n",
37 | "* Review some gotchas that might complicate things\n",
38 | "* Review tips related to learning more"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "# Overview: Linear Regression\n",
46 | "---"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "Linear Regression models are popular machine learning models because they:\n",
54 | "* are often fast\n",
55 | "* are often simple with few tunable hyperparameters\n",
56 | "* are very easy to interpret\n",
57 | "* can provide a nice baseline classification to start with before considering more sophisticated models\n",
58 | "\n",
59 | "Several cases where you might use a linear regression to predict an output based on a set of inputs include:\n",
60 | "\n",
61 | "|Inputs|Outputs|\n",
62 | "|:---|:---|\n",
63 | "|ad dollars spent|sales dollars earned|\n",
64 | "|car age|sale price|\n",
65 | "|latitude|skin cancer mortality|\n",
66 | "\n",
67 | "The LinearRegression model that we will examine here relies upon the Ordinary Least Squares (OLS) method to calculate a linear function that fits the input data.\n",
68 | "\n",
69 | "From [Wikipedia](https://en.wikipedia.org/wiki/Ordinary_least_squares): \n",
70 | "\n",
71 | "> \"Geometrically, this is seen as the sum of the squared distances, ... between each data point in the set and the corresponding point on the regression surface – **the smaller the differences, the better the model fits the data**.\"\n",
72 | "\n",
73 | "
\n",
74 | "\n",
75 | "Image source: [Wikimedia](https://commons.wikimedia.org/wiki/File:Linear_least_squares_example2.svg)\n",
76 | "\n",
77 | "The result of the simplest type of linear regression calculation is a formula for a straight line (although sophisticated curved surfaces can also be determined using linear regression):\n",
78 | "\n",
79 | "$$y = mx + b$$\n",
80 | "\n",
81 | "Where if given some value of $x$, if we know the slope of the line ($m$) and the y-intercept ($b$) we can calculate $y$.\n",
82 | "\n",
83 | "Beyond that, we won't cover the math here. 😀\n",
84 | "\n",
85 | "Scikit-Learn has a number of Linear Models based on calculations besides OLS: \n",
86 | "\n",
87 | "* Ridge \n",
88 | "* Lasso\n",
89 | "* Huber\n",
90 | "* and many more...\n",
91 | "\n",
92 | "Each one has slightly different approaches to calculating a line that fits the data.\n",
93 | "\n",
94 | "**Ridge**: addresses some issues related to OLS by controlling the size of coefficients.\n",
95 | "\n",
96 | "**Lasso**: encourages simple, sparse models (i.e. models with fewer parameters). Can be useful when you want to automate certain parts of model selection, like variable selection/parameter elimination. \n",
97 | "\n",
98 | "**Huber**: applies a linear loss (lower weight) to samples that are classified as outliers, thus minimizing the impact of random outliers.\n",
99 | "\n",
100 | "With this background, let's apply **The Process™** on a LinearRegression model."
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "## Prep the data"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "We start with a set of standard imports..."
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "import matplotlib.pyplot as plt\n",
124 | "import numpy as np\n",
125 | "import pandas as pd\n",
126 | "import sklearn\n",
127 | "\n",
128 | "# NOTE: during the Choose the Model step, we will import the \n",
129 | "# model we want, but there is no reason you can't import it here.\n",
130 | "# from sklearn.linear_model import LinearRegression"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "### Prep the training data and test data"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "For this example, we will use a dataset hosted by Penn State:\n",
145 | " \n",
146 | "[skincancer.txt](https://newonlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/skincancer/index.txt)\n",
147 | "\n",
148 | "I don't have a clear understanding of the origin of this data and **we are simply using the dataset to demo a technique**. Please don't draw conclusions from the results of this simplistic analysis."
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "df = pd.read_csv('../universal_datasets/skincancer.txt',\n",
158 | " delim_whitespace=True,\n",
159 | " header=0,\n",
160 | " names=['state', 'lat', 'mort', 'ocean', 'long'])\n",
161 | "df.head()"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "df.shape \n",
171 | "\n",
172 | "# Ummm. One line per state?\n",
173 | "# How did we get 49 lines?\n",
174 | "# Weird."
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "X = df['lat'].to_frame()\n",
184 | "y = df['mort']"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "X[:5]"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "y[:5]"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "With our data imported, let's separate it into training data and test data."
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "from sklearn.model_selection import train_test_split"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": null,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "plt.scatter(X_train, y_train)\n",
237 | "plt.title(\"Mortality vs Latitude\")\n",
238 | "plt.xlabel(\"Latitude\")\n",
239 | "plt.ylabel(\"Number of deaths\");"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "## Choose the Model"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {},
252 | "source": [
253 | "In this case, we have already decided upon using the LinearRegression model, so importing it is straightforward. But if we aren't sure what model we want we can always refer back to the [API Reference](https://scikit-learn.org/stable/modules/classes.html)."
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {
260 | "collapsed": true
261 | },
262 | "outputs": [],
263 | "source": [
264 | "from sklearn.linear_model import LinearRegression"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "metadata": {},
270 | "source": [
271 | "## Choose Appropriate Hyperparameters"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "For our purposes, this model doesn't require any hyperparameters, so we simply call the `LinearRegression` class."
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {
285 | "collapsed": true
286 | },
287 | "outputs": [],
288 | "source": [
289 | "model = LinearRegression()"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "If we were to look at the possible hyperparameters, we would see this:\n",
297 | "\n",
298 | "```python\n",
299 | "LinearRegression(\n",
300 | " fit_intercept=True,\n",
301 | " normalize=False,\n",
302 | " copy_X=True,\n",
303 | " n_jobs=None,\n",
304 | ")\n",
305 | "```"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "**Yeah, but what do these even mean?**\n",
313 | "\n",
314 | "Some hyperparameters can be tricky to understand. Good places to start are the documentation:\n",
315 | "\n",
316 | "> [sklearn.linear_model.LinearRegression¶](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)\n",
317 | "\n",
318 | "A number of these items are also explained on Stackoverflow:\n",
319 | "\n",
320 | "> [how fit intercept parameter impacts linear regression with scikit learn](https://stackoverflow.com/questions/46510242/how-fit-intercept-parameter-impacts-linear-regression-with-scikit-learn)\n",
321 | "\n",
322 | "It might take:\n",
323 | "\n",
324 | "* several readings\n",
325 | "* multiple sources\n",
326 | "* some tests and examples\n",
327 | "\n",
328 | "...before you start to wrap your head around the expected outcomes.\n",
329 | "\n",
330 | "*This is OK. You are just like the rest of us!*\n",
331 | "\n",
332 | "
\n"
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "## Fit the Model"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {
346 | "collapsed": true
347 | },
348 | "outputs": [],
349 | "source": [
350 | "model.fit(X_train, y_train)"
351 | ]
352 | },
353 | {
354 | "cell_type": "markdown",
355 | "metadata": {},
356 | "source": [
357 | "## Apply the Model"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": null,
363 | "metadata": {
364 | "collapsed": true
365 | },
366 | "outputs": [],
367 | "source": [
368 | "y_pred = model.predict(X_test)"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": null,
374 | "metadata": {},
375 | "outputs": [],
376 | "source": [
377 | "y_pred.shape"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "y_pred[:5]"
387 | ]
388 | },
389 | {
390 | "cell_type": "markdown",
391 | "metadata": {},
392 | "source": [
393 | "## Examine the results"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": null,
399 | "metadata": {
400 | "collapsed": true
401 | },
402 | "outputs": [],
403 | "source": [
404 | "plt.title(\"Red and Purple Results\")\n",
405 | "plt.scatter(X_train, y_train, color='rebeccapurple')\n",
406 | "plt.scatter(X_test, y_pred, color='red', alpha=0.2);"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": null,
412 | "metadata": {
413 | "collapsed": true
414 | },
415 | "outputs": [],
416 | "source": [
417 | "plt.title(\"Red and Purple Results\")\n",
418 | "plt.scatter(X_train, y_train, color='rebeccapurple')\n",
419 | "plt.plot(X_test, y_pred, color='red');"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": null,
425 | "metadata": {},
426 | "outputs": [],
427 | "source": [
428 | "# For reference, against the above graph:\n",
429 | "\n",
430 | "print('Coefficient/slope:', model.coef_)\n",
431 | "print('y-intercept:', model.intercept_)\n"
432 | ]
433 | },
434 | {
435 | "cell_type": "markdown",
436 | "metadata": {},
437 | "source": [
438 | "# Gotchas\n",
439 | "---"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "**Understanding the data formats**: When I first began experimenting with scikit-learn, I kept messing up the format of the data. I kept trying to feed it 1D arrays instead of 2D feature matrices. I would try to apply a model from a book OR a tutorial and would end up totally flummoxed, which was very stressful, especially with something that seems as simple as a linear regression. \n",
447 | "\n",
448 | "As we discussed in the data handling lesson, the `X` inputs (`X_train`, `X_test`) needed to be a 2D matrix."
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {},
454 | "source": [
455 | "# Deep Dive\n",
456 | "---"
457 | ]
458 | },
459 | {
460 | "cell_type": "markdown",
461 | "metadata": {},
462 | "source": [
463 | "N/A"
464 | ]
465 | },
466 | {
467 | "cell_type": "markdown",
468 | "metadata": {},
469 | "source": [
470 | "# How to learn more: tips and hints\n",
471 | "---"
472 | ]
473 | },
474 | {
475 | "cell_type": "markdown",
476 | "metadata": {},
477 | "source": [
478 | "**Read the outputs**: Pay close attention to the outputs that Scikit-Learn prints to the screen. Regular exposure to these outputs will familiarize you with terms, arguments, vocabulary and grammar that are fundamental to understanding the inner workings of the models specifically and machine learning more generally. \n",
479 | "\n",
480 | "**Do outside research**: When you find a new word OR a word used in ways that you are not used to, look it up, read articles about that concept, read stackoverflow answers about that concept, and of course read the documentation. The word **regression** has been a thorn in my side since I first saw it. I just couldn't put my finger on what it means. I know what is happening in a regression calculation, but the **meaning** just escaped me. Why that word, to describe that phenomenon? \n",
481 | "\n",
482 | "> \"The term \"regression\" was coined by Francis Galton in the nineteenth century to describe a biological phenomenon. The phenomenon was that the heights of descendants of tall ancestors tend to regress down towards a normal average (a phenomenon also known as regression toward the mean).\" \n",
483 | "\n",
484 | "> Source: [Wikipedia: Regression Analysis](https://en.wikipedia.org/wiki/Regression_analysis)\n"
485 | ]
486 | },
487 | {
488 | "cell_type": "markdown",
489 | "metadata": {},
490 | "source": [
491 | "# Experience Points!\n",
492 | "---"
493 | ]
494 | },
495 | {
496 | "cell_type": "markdown",
497 | "metadata": {
498 | "slideshow": {
499 | "slide_type": "slide"
500 | }
501 | },
502 | "source": [
503 | "# Read the docs...\n",
504 | "\n",
505 | "Explore the docs related to Naive Bayes models for about 3 - 4 minutes, in particular the section related to GaussianNB (naive bayes).\n",
506 | "\n",
507 | "[**GaussianNB (link)**](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB)\n",
508 | "\n",
509 | "Find answers to the following:\n",
510 | "\n",
511 | "* Gaussian Naive Bayes has a method you can call that allows you to update models and can be used if the dataset is too large to fit into memory all at once. What is that method?\n",
512 | "* There is a link to the User Guide. Find the link and skim the overview. There are two cases mentioned where Naive Bayes Classifiers have worked quite well in many real-world situations. What are those two cases?"
513 | ]
514 | },
515 | {
516 | "cell_type": "markdown",
517 | "metadata": {},
518 | "source": [
519 | "---\n",
520 | "When you complete this exercise, please put your **green** post-it on your monitor. \n",
521 | "\n",
522 | "If you want to continue on at your own-pace, please feel free to do so.\n",
523 | "\n",
524 | "
"
525 | ]
526 | },
527 | {
528 | "cell_type": "markdown",
529 | "metadata": {},
530 | "source": [
531 | "# References\n",
532 | "---"
533 | ]
534 | },
535 | {
536 | "cell_type": "markdown",
537 | "metadata": {},
538 | "source": [
539 | "Below are references that may assist you in learning more:\n",
540 | " \n",
541 | "|Title (link)|Comments|\n",
542 | "|---|---|\n",
543 | "|[API docs on linear models](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model)||\n",
544 | "|[sklearn description of overfitting](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html)||\n",
545 | "|[Wikipedia article on overfitting](https://en.wikipedia.org/wiki/Overfitting)||\n",
546 | "|[Wikipedia article on regression analysis](https://en.wikipedia.org/wiki/Regression_analysis)||\n",
547 | "|[Skincancer dataset](https://newonlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/skincancer/index.txt)||"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": null,
553 | "metadata": {},
554 | "outputs": [],
555 | "source": []
556 | }
557 | ],
558 | "metadata": {
559 | "kernelspec": {
560 | "display_name": "Python 3",
561 | "language": "python",
562 | "name": "python3"
563 | },
564 | "language_info": {
565 | "codemirror_mode": {
566 | "name": "ipython",
567 | "version": 3
568 | },
569 | "file_extension": ".py",
570 | "mimetype": "text/x-python",
571 | "name": "python",
572 | "nbconvert_exporter": "python",
573 | "pygments_lexer": "ipython3",
574 | "version": "3.6.7"
575 | }
576 | },
577 | "nbformat": 4,
578 | "nbformat_minor": 2
579 | }
580 |
--------------------------------------------------------------------------------
/04_naive_bayes/04_naive_bayes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Welcome to the Dark Art of Coding:\n",
8 | "## Introduction to Machine Learning\n",
9 | "Naive Bayes Classification\n",
10 | "\n",
11 | "
"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Objectives\n",
19 | "---"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "In this session, students should expect to:\n",
27 | "\n",
28 | "* Cover an overview of Naive Bayes Classification\n",
29 | "* Examine code samples that walk us through **The Process™**:\n",
30 | " * Prep the data\n",
31 | " * Choose the model\n",
32 | " * Choose appropriate hyperparameters\n",
33 | " * Fit the model\n",
34 | " * Apply the model\n",
35 | " * Examine the results\n",
36 | "* Explore a deep dive into this model\n",
37 | "* Review some gotchas that might complicate things\n",
38 | "* Review tips related to learning more"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "# Overview: Naive Bayes Classification\n",
46 | "---"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "Naive Bayes Classification models are popular machine learning models because they:\n",
54 | "* are fast\n",
55 | "* are simple with few tunable hyperparameters\n",
56 | "* are suitable for datasets with very high dimensions\n",
57 | "* can provide a nice baseline classification to start with before considering more sophisticated models\n",
58 | "\n",
59 | "
\n",
60 | "
\n",
61 | "\n",
62 | "\n",
63 | "Naive Bayes Classifiers rely upon Bayes Theorem that allows you to predict the probability of a `label` if given some set of `features`:\n",
64 | "\n",
65 | "$$P(label | features)$$\n",
66 | "\n",
67 | "We won't cover the math here. 😀\n",
68 | "\n",
69 | "I do go into it in my [**Intro to Statistics and Probability**](https://www.youtube.com/watch?v=zzbw0JbiI6Y) tutorial from Pycon 2018. Check it out!\n",
70 | "\n",
71 | "Scikit-learn has a number of Naive Bayes Classifiers. They are referred to as **naive** because they make certain presumptions about the data.\n",
72 | "\n",
73 | "Each of the following has slightly different assumptions about the data. For example, the GaussianNB model that we will look at presumes that the \"likelihood of the features is assumed to be Gaussian\" (i.e. the likelihood of any given feature falls on a bell curve).\n",
74 | "\n",
75 | "* BernoulliNB\n",
76 | "* ComplementNB\n",
77 | "* GaussianNB\n",
78 | "* MultinomialNB"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "Let's go through the steps of **The Process™** to see how this works."
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "## Prep the data"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "We start with a set of standard imports..."
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {
106 | "collapsed": true
107 | },
108 | "outputs": [],
109 | "source": [
110 | "import matplotlib.pyplot as plt\n",
111 | "import numpy as np\n",
112 | "import pandas as pd\n",
113 | "import sklearn\n",
114 | "\n",
115 | "# NOTE: during the Choose the Model step, we will import the \n",
116 | "# model we want, but there is no reason you can't import it here.\n",
117 | "# from sklearn.naive_bayes import GaussianNB "
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "### Prep the training data and test data"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "# For the banana lovers in the room:\n",
134 | "# fake data warning...\n",
135 | "\n",
136 | "df = pd.read_csv('../universal_datasets/bananas.csv')\n",
137 | "df.head()"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "Here two columns from a `pandas DataFrame` represent a suitable 2D matrix for the `features`.\n",
145 | "\n",
146 | "One column from the `pandas DataFrame` (i.e. a `pandas Series`) is suitable as the `target` array."
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "X = df[['length', 'width']]\n",
156 | "y = df['category']"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "It can be really useful to take a look at the features matrix and target array of the training data. \n",
164 | "\n",
165 | "* In the raw form\n",
166 | "* In a visualization tool\n",
167 | "\n",
168 | "For this dataset, let's use a scatter plot."
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": [
177 | "X[:5]"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "y[:5]"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "from sklearn.model_selection import train_test_split"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "X_train, X_test, y_train, y_test = train_test_split(X, y, \n",
205 | " test_size=0.33,\n",
206 | " random_state=42)"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "for item in X_train, X_test, y_train, y_test:\n",
216 | " print(item[:2]) # Let's look at just two samples\n",
217 | " print(item.shape) # Let's confirm the number of samples\n",
218 | " print()"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": null,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "plt.scatter(X_train['length'], X_train['width'], c=y_train,\n",
228 | " cmap='seismic')\n",
229 | "plt.title(\"'Cavendish' vs. 'Apple Banana' Training Data\");\n",
230 | "\n",
231 | "# NOTE TO SELF: Blue is cat zero, red is cat one"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {},
237 | "source": [
238 | "In the following plot of the test data, we chose to set the `alpha` channel for the dots at `0.15`, which makes the dots largely transparent, so that they are visually distinct. Later we will plot the training data and the test data on the same graph and that transparency will help to segregate them visually.\n",
239 | "\n",
240 | "AND, although we know what category each of these falls into, we chose to keep them all the same color, since we want to rely upon the model to categorize them."
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "plt.title(\"All the test points\")\n",
250 | "plt.scatter(X_test['length'], X_test['width'], alpha=0.15);"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "## Choose the Model"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "In this case, we have already decided upon using the GaussianNB model, so importing it is straightforward. But if we aren't sure what model we want we can always refer back to the [API Reference](https://scikit-learn.org/stable/modules/classes.html)."
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {
271 | "collapsed": true
272 | },
273 | "outputs": [],
274 | "source": [
275 | "from sklearn.naive_bayes import GaussianNB"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "metadata": {},
281 | "source": [
282 | "## Choose Appropriate Hyperparameters"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {},
288 | "source": [
289 | "This model doesn't require any hyperparameters, so we simply call the `GaussianNB` class."
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {
296 | "collapsed": true
297 | },
298 | "outputs": [],
299 | "source": [
300 | "model = GaussianNB()"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "## Fit the Model"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 | "Here we supply the **features matrix** and a **target array** that we generated above. Notice that it immediately provides a summary of the hyperparameters (in this case, the defaults) that were supplied."
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": null,
320 | "metadata": {
321 | "collapsed": true
322 | },
323 | "outputs": [],
324 | "source": [
325 | "model.fit(X_train, y_train)"
326 | ]
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "metadata": {},
331 | "source": [
332 | "## Apply the Model"
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "We can now supply the test features matrix in expectation that the model will produce an array of labels (categories): one label for each sample in the features matrix."
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {
346 | "collapsed": true
347 | },
348 | "outputs": [],
349 | "source": [
350 | "y_pred = model.predict(X_test)"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {
357 | "collapsed": true
358 | },
359 | "outputs": [],
360 | "source": [
361 | "y_pred.shape"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": null,
367 | "metadata": {
368 | "collapsed": true
369 | },
370 | "outputs": [],
371 | "source": [
372 | "y_pred[:10]"
373 | ]
374 | },
375 | {
376 | "cell_type": "markdown",
377 | "metadata": {},
378 | "source": [
379 | "## Examine the results"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "metadata": {
386 | "collapsed": true
387 | },
388 | "outputs": [],
389 | "source": [
390 | "plt.title(\"Red and Blue Results\")\n",
391 | "\n",
392 | "plt.scatter(X_train['length'], X_train['width'], c=y_train,\n",
393 | " cmap='seismic')\n",
394 | "\n",
395 | "plt.scatter(X_test['length'], X_test['width'], c=y_pred,\n",
396 | " cmap='seismic',\n",
397 | " alpha=0.15);"
398 | ]
399 | },
400 | {
401 | "cell_type": "markdown",
402 | "metadata": {},
403 | "source": [
404 | "# Gotchas\n",
405 | "---"
406 | ]
407 | },
408 | {
409 | "cell_type": "markdown",
410 | "metadata": {},
411 | "source": [
412 | "A number of problems arose the first time I dove into this model:\n",
413 | " \n",
414 | "**Naming conventions**: I ran into snags with naming conventions. My first toy dataset was randomly generated and I just pictured it as **x and y coordinates**. But having x values and y values on my graph (and in my head) threw everything out of whack when I tried to translate that to the `X` and `y` inputs and outputs that are commonly used in models and in statistics, etc. If my data were naturally labeled as anything else, it might have been less painful to mentally translate:\n",
415 | "\n",
416 | "|Alternate Labels||\n",
417 | "|:---|:---|\n",
418 | "|lat|long|\n",
419 | "|price|quantity sold|\n",
420 | "|passing yards|wins|"
421 | ]
422 | },
423 | {
424 | "cell_type": "markdown",
425 | "metadata": {},
426 | "source": [
427 | "**Making graphs shouldn't distract you from the machine learning**: Above, (and in most of these sessions) we have a number of dataviz steps intermixed with our machine learning steps. And there is the possibility that it might lead to confusion about what parts are *critical* to the machine learning and which parts are *nice to have*. Presuming your data is prepared properly, **just these four lines are necessary** to predict the category OR label for the values in the test set. \n",
428 | "\n",
429 | "```python\n",
430 | " from sklearn.naive_bayes import GaussianNB\n",
431 | " model = GaussianNB()\n",
432 | " model.fit(X_train, y_train)\n",
433 | " y_pred = model.predict(X_test)\n",
434 | "```\n",
435 | "\n",
436 | "**Sometimes starting too big is too confusing**: I often recommend that students pare back their problem to a small handful of items so that they can really see what is happening. Since this model takes a pair of coordinates and returns a label to say whether the coordinates fit in the **blue** category OR the **red** category, let's take a pair of coordinates that we know should fit clearly into the blue category (i.e. something above the dividing line)."
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": null,
442 | "metadata": {},
443 | "outputs": [],
444 | "source": [
445 | "# 83 187.600405 36.487954 0\n",
446 | "\n",
447 | "y_pred_single = model.predict([[187, 36]])\n",
448 | "y_pred_single"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {},
454 | "source": [
455 | "Now, let's take a pair of points (one in each category **red** and **blue**) and ensure that we get two different labels:"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": null,
461 | "metadata": {},
462 | "outputs": [],
463 | "source": [
464 | "# 28 231.568879 32.733688 1\n",
465 | "# 93 190.142527 45.764529 0\n",
466 | "\n",
467 | "y_pred_pair = model.predict([[232, 33],\n",
468 | " [190, 46]])\n",
469 | "y_pred_pair"
470 | ]
471 | },
472 | {
473 | "cell_type": "markdown",
474 | "metadata": {},
475 | "source": [
476 | "# Deep Dive\n",
477 | "---"
478 | ]
479 | },
480 | {
481 | "cell_type": "markdown",
482 | "metadata": {},
483 | "source": [
484 | "N/A"
485 | ]
486 | },
487 | {
488 | "cell_type": "markdown",
489 | "metadata": {},
490 | "source": [
491 | "# How to learn more: tips and hints\n",
492 | "---"
493 | ]
494 | },
495 | {
496 | "cell_type": "markdown",
497 | "metadata": {},
498 | "source": [
499 | "What should you do to advance your skills?\n",
500 | "\n",
501 | "**Play with the tools**:\n",
502 | "\n",
503 | "<img src='../universal_images/changing_stuff.jpg' width='350'>\n",
504 | "\n",
505 | "**Get familiar with your favorite graphing library**: being able to visualize the results will help you get a sense of whether your model is accurately predicting. It will also help you to better succeed at the **ultimate goal of data science**:\n",
506 | "\n",
507 | "> Data science is meant to inform and thus enable action.\n",
508 | "\n",
509 | "\n",
510 | "**Read the docs**: yeah... I know they can be scary. I love math, but sometimes my eyes glaze over when row after row of equations come rolling out. Regardless, the more time you spend reading the docs, the faster you will begin to better understand the nuances of different models and which models apply in which situations. Don't be afraid if there are words in there that you don't understand. The vocabulary will come, given time and plenty of exposure. From this lesson, several good resources include:\n",
511 | "* [API Reference](https://scikit-learn.org/stable/modules/classes.html)\n",
512 | "* [Gaussian Naive Bayes Page](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)\n",
513 | "* [User Guide: Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html)\n",
514 | "\n",
515 | "**Don't just copy-paste other people's models**: go home, find a dataset with values that are suitable to a given model and create your own model. Then put in some test values and see if it predicts properly."
516 | ]
517 | },
518 | {
519 | "cell_type": "markdown",
520 | "metadata": {},
521 | "source": [
522 | "# Experience Points!\n",
523 | "---"
524 | ]
525 | },
526 | {
527 | "cell_type": "markdown",
528 | "metadata": {
529 | "slideshow": {
530 | "slide_type": "slide"
531 | }
532 | },
533 | "source": [
534 | "# Read the docs...\n",
535 | "\n",
536 | "Explore the docs related to clustering and KMEANS for about 3 - 4 minutes.\n",
537 | "\n",
538 | "[**Clustering (link)**](https://scikit-learn.org/stable/modules/clustering.html#k-means)\n",
539 | "\n",
540 | "[**KMEANS API (link)**](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans)\n",
541 | "\n",
542 | "Find answers to the following:\n",
543 | "\n",
544 | "* On the Clustering page, there is a section focused on K-means. In that section, there are four small graphs. Those graphs indicate four cases where K-means may struggle in producing accurate clusters. What are those four cases? (Hint: look for inertia).\n",
545 | "* On the KMEANS API page, there is an argument mentioned that controls the number of times the k-means algorithm will be run with different centroid seeds. What is the default number of times?"
546 | ]
547 | },
548 | {
549 | "cell_type": "markdown",
550 | "metadata": {},
551 | "source": [
552 | "---\n",
553 | "When you complete this exercise, please put your **green** post-it on your monitor. \n",
554 | "\n",
555 | "If you want to continue on at your own-pace, please feel free to do so.\n",
556 | "\n",
557 | "
"
558 | ]
559 | },
560 | {
561 | "cell_type": "markdown",
562 | "metadata": {},
563 | "source": [
564 | "# References\n",
565 | "---"
566 | ]
567 | },
568 | {
569 | "cell_type": "markdown",
570 | "metadata": {},
571 | "source": [
572 | "Below are references that may assist you in learning more:\n",
573 | " \n",
574 | "|Title (link)|Comments|\n",
575 | "|---|---|\n",
576 | "|[API Reference](https://scikit-learn.org/stable/modules/classes.html)||\n",
577 | "|[Gaussian Naive Bayes Page](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)||\n",
578 | "|[User Guide: Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html)||"
579 | ]
580 | }
581 | ],
582 | "metadata": {
583 | "kernelspec": {
584 | "display_name": "Python 3",
585 | "language": "python",
586 | "name": "python3"
587 | },
588 | "language_info": {
589 | "codemirror_mode": {
590 | "name": "ipython",
591 | "version": 3
592 | },
593 | "file_extension": ".py",
594 | "mimetype": "text/x-python",
595 | "name": "python",
596 | "nbconvert_exporter": "python",
597 | "pygments_lexer": "ipython3",
598 | "version": "3.6.7"
599 | }
600 | },
601 | "nbformat": 4,
602 | "nbformat_minor": 2
603 | }
604 |
--------------------------------------------------------------------------------
/universal_datasets/svm_test.csv:
--------------------------------------------------------------------------------
1 | -1.1128777354788033,-4.198293550514309
2 | -1.5474321011450844,-8.513296710085173
3 | -0.8413329171075709,-4.570114718395777
4 | 0.5912805806348089,-2.0298235499910566
5 | -2.5034573722452205,1.5984402199917822
6 | -4.581650231154441,-10.557697330825775
7 | -1.20990781248348,-11.682959807907126
8 | 0.4394508714648315,-0.17059459964294366
9 | -1.6445130595044544,-1.1214521417663437
10 | 0.19031516462512865,0.08734902225413332
11 | -1.19167337459382,-6.146809041308661
12 | -1.3085533869201318,-4.350813803390592
13 | -2.306946193513669,-1.4452818792232835
14 | -3.14688610811814,-9.70465692233303
15 | -1.0242914317012626,-6.529314650292045
16 | 0.4919143370694552,-6.6995136789470875
17 | 0.016185879349561283,-13.13359909954168
18 | 0.0360166874566048,-5.115758650312007
19 | -1.7698348835851956,-8.466899151755907
20 | -1.0736152379772443,-2.1532893668854047
21 | 0.4098044799186551,-2.6293513081646123
22 | -1.6851622373555832,-0.9835611450213841
23 | -1.6106341312508223,-6.5409431708702845
24 | -0.7430206938447779,-2.14729472131521
25 | -2.266473221541881,-5.3291539664110035
26 | -1.581066658517325,-4.922281617698713
27 | -1.9558827223378632,-6.021700877640337
28 | -0.9387609606326968,-8.718706338879006
29 | -0.2301104407724368,-4.732083990372785
30 | -1.5906587209581595,-2.0727547646030704
31 | -1.3903369815871975,-2.8919671133878744
32 | 1.479076786668335,-6.498974380475772
33 | -2.9346328244050133,1.0986661301577705
34 | -1.15070382621266,-3.4134380455882227
35 | -1.9542548079513282,-5.507156905206045
36 | -0.5684234625902876,-0.610820333397216
37 | -1.5679922068669918,-2.7929728643040432
38 | -0.7041198804938382,-0.1781956770802795
39 | -0.8836271411491521,10.8173982416032
40 | -0.4721529571719072,-5.474383220946176
41 | 0.2960366741598628,-7.297419726991481
42 | -2.095078961965399,-7.818206265656964
43 | -0.6265028246144968,-3.1475975034269634
44 | -1.3150713066566837,-0.27925473687410474
45 | -2.488778585455615,-5.078831149589483
46 | -0.6057255796745875,-8.513211751127557
47 | -0.48235756401064556,-8.959721657796088
48 | -0.2838109663390621,-4.523799455667192
49 | -2.0231548987762293,-4.335880453508577
50 | -0.996074358538087,3.0597117469927166
51 | 0.9901845607455066,0.5224254058947455
52 | -3.3771895470127506,-0.9292548641916207
53 | -1.3071800647197884,-3.070239491532864
54 | -1.2758938439899323,-7.000760648613763
55 | -0.9948128966276316,-6.0383081608925195
56 | 0.2434595319317041,-2.7064273482009567
57 | -1.1690940009147741,-0.9043762635196266
58 | -3.335869765263701,-5.628384123440624
59 | 0.9821869305938502,5.261865789762641
60 | -0.19147247303603931,1.1908892227316379
61 | -0.2181997473744839,-16.44773455805294
62 | 0.12960189093060603,-9.60980858058559
63 | -3.009611866707992,-3.0964255940602476
64 | -2.007766638226739,-1.5753611236593463
65 | -2.8251980124114953,-7.031778004054083
66 | 2.00473200982919,-3.274821617856249
67 | -1.7679256563802483,-3.6804803210456596
68 | -2.2335824595287894,-10.616225074507568
69 | 0.053487932897701373,-3.1563098797363254
70 | -2.1857322638354835,-5.277172986142428
71 | -0.6711630915485909,-4.334429142497399
72 | -0.7285043236068133,-3.707072940066287
73 | -1.563981387280442,-3.9247507097832375
74 | -2.631795438719293,-10.45130809027092
75 | 0.4406367794407118,-7.711596650127005
76 | -0.5468932107633326,-5.175736655529768
77 | -0.536519025771141,-6.356582747502822
78 | 1.4970857713734436,-6.01928804348192
79 | -0.02868140249853024,-6.524236218716373
80 | -0.5263339336681936,1.5962961566214293
81 | -1.6071354163066518,-8.423417090968968
82 | -0.9891988917270499,-5.148505008129691
83 | -0.3653995723747301,-3.02437293221283
84 | -0.9822850126215252,-0.5543305941117551
85 | 0.12393723777135057,-5.632331929032225
86 | 0.81010747537499,-9.715903558896468
87 | 1.4070684434538498,-0.798908877631185
88 | -2.4430172601404907,-7.519256236670962
89 | 0.17940627657545782,-2.668367943717899
90 | -0.9615413296626997,-6.087085020051929
91 | -0.19223997006077254,4.23465919579189
92 | -1.3128419414454324,-10.37338828506749
93 | -0.9866059510816588,2.0142188988848693
94 | -1.335904576602386,-3.704809267653785
95 | 0.5185367751335748,-6.132426741716719
96 | 1.7071907683790912,-7.463475598725525
97 | -0.16916555502687247,-3.8264338023130167
98 | -2.4132164113431545,-13.043569405732988
99 | 0.20137642533920252,3.2059240047449897
100 | -0.40747209518681127,-7.51457395604197
101 | 0.27382129858365056,-5.917867348765813
102 | -1.4562171240822945,-3.0340017925613747
103 | -0.5482698906275856,-2.980823871339118
104 | 1.2291130733484739,-5.590924248231709
105 | -1.5553596089240154,-2.3355326335062268
106 | -0.8006998821492153,4.39825586935649
107 | -1.3324880544027575,-4.710594485681928
108 | -0.536249055451949,-2.227055346483483
109 | -3.5945315564892124,-0.988128753703394
110 | -1.2605411231623436,2.814814052218316
111 | 2.412182553036236,0.1128229738256259
112 | -1.2871657660629863,-0.1453039966800489
113 | 0.46176674674106044,-6.2351510463130415
114 | -1.0452744568713892,-5.33223840545686
115 | -0.6515562210836497,-11.876539420388724
116 | -2.4198144999165843,-9.042416520761627
117 | -2.727844731412838,-0.0979870392619957
118 | -0.5197555178132393,-2.5988945669278336
119 | -0.714344854470871,-1.5174770901825783
120 | -1.5079231732020961,-3.477998928994165
121 | 0.9226893964735177,-9.888991302372752
122 | -2.5449144755222926,-0.9352796786917144
123 | 0.029554117762038867,-3.9612231798876087
124 | 0.24873022816942592,-7.363411033550648
125 | -1.3146547455503872,-3.1666592242191935
126 | -1.169912733526902,3.5169501219545705
127 | 0.29208822534881507,-5.899481505379233
128 | 0.1397504920912147,-6.741801808912094
129 | -1.3843895153191639,0.8755126119758376
130 | 0.22820574588570675,-2.0662376202181862
131 | -2.5372903787533145,-2.648502256573363
132 | 0.4673792257018752,1.8146046142746766
133 | -1.3120397720769954,-2.89541342871493
134 | 1.9368240880477532,-3.1073675008983055
135 | -2.4549412376138826,-6.1293883863116925
136 | -0.2746470893857751,-4.3677310786379575
137 | -1.784459871132847,1.6715553735647246
138 | 0.1402103786720339,-6.490842918160093
139 | 0.179680659791027,-1.8273590187608306
140 | -1.3246019440511527,-7.217612148373873
141 | 0.12031256638922905,-2.813658106757887
142 | -1.326285145078372,-7.330428746758571
143 | -3.1147084957898525,-9.434828059975843
144 | -2.5535537001405273,-4.2864313097643505
145 | -2.025297732713089,0.8081762661781102
146 | 0.4364556450314867,-0.36902614279795776
147 | -0.14295350853438704,-7.091404216909701
148 | 1.7939770213011967,-4.267594335097729
149 | -1.0984189071325916,-8.849200519658748
150 | -3.404224052194068,-1.4929371133042935
151 | -2.592841650911873,-0.8137261872522261
152 | 0.2440419485026808,3.7801304782547884
153 | 0.511612897545638,-4.753008017658611
154 | -1.0715433116632096,-2.490352366355628
155 | -1.60151335822812,1.0572219068535036
156 | 1.470372421121279,-9.909566514087295
157 | -0.57983340915765,-1.5251375534063212
158 | -1.9432703247978385,-3.7075936700221073
159 | 2.0611647957057775,-1.9693761357270425
160 | 0.8353192892179313,-7.58989014746479
161 | -2.2588973984409195,-6.618525497360318
162 | -0.29943419554851824,-10.078916500258547
163 | 0.5447103991330242,-5.286234559925591
164 | -1.263249164014744,-1.465723575583775
165 | 0.24196363595697035,-0.6451136170690162
166 | 1.0590111972933545,-3.155389054839493
167 | -2.155207294566483,4.521830011420883
168 | -1.6213016326065264,-7.991147816925385
169 | 1.0134518329138489,-10.549085068940713
170 | -2.6367527830965445,-0.3862670569628648
171 | -2.962755818320897,-4.803764489168577
172 | -2.3124992092204213,-6.284494590117147
173 | -0.5803291296377401,-6.0125042700666755
174 | -2.094405504981626,-2.3731302649672847
175 | 0.40390456634281313,-2.8360411011601676
176 | 0.3113593084095836,-5.331204209847755
177 | -0.7941343188944578,-8.856231074412683
178 | -0.4559661420037868,-1.9152707824758601
179 | -2.0603420398922765,-0.6885666719508574
180 | -2.075788369930776,-1.9163382385696401
181 | -1.2350777886647473,-9.722168418113693
182 | -0.07020042927254444,-3.4198365841517036
183 | -0.5285793163048296,-2.865147670538677
184 | -1.2862490833421456,1.1971108440240128
185 | -2.012721152140739,-1.5785213504708366
186 | -0.15902799791158984,-5.936280986059682
187 | 0.7760866086200271,-7.449449528293108
188 | -0.16763009805643359,-11.754207983464923
189 | -0.9299175118643797,-8.827276806254526
190 | -2.6867687929018103,-2.9181327165713737
191 | -0.9852704699454691,-0.9775181031343916
192 | 1.0207590525289714,-7.886427243400431
193 | -1.6572918282234985,-9.105034909569302
194 | -1.39067345067344,4.041168670705808
195 | 0.8022849707320943,3.187660106071009
196 | -0.3917813091122211,2.1226251494452466
197 | 0.7336349726266675,-6.534662357427745
198 | 0.048954383815224434,-5.655673996480912
199 | -3.5469532640120267,-3.1359491834882007
200 | 0.46764049139741704,-1.9332327370148148
201 | 1.545204507640002,0.9662192776398486
202 | -0.947209598895472,-8.713750176649674
203 | -1.4876012400395109,-4.627899496104078
204 | -2.0836694537214155,-1.2946024051675553
205 | -1.034693731302668,-8.702365178766676
206 | -1.116660332356774,-3.055897693472799
207 | -1.3324359796098209,-7.916101647461197
208 | -1.8322339697284262,-7.271336959293851
209 | -0.5813279766268333,-3.436475114950773
210 | -1.2654560161346229,2.245679198760117
211 | 0.5296821362123993,-5.2793592696408425
212 | -0.5866418398291119,-5.048133604670716
213 | 0.3233017994841798,-5.095824845994039
214 | 1.448422160569376,-0.860826240019513
215 | -1.8161714575819536,-6.737962303756498
216 | -0.38975285182147834,-0.5965182953296884
217 | -1.2487488109951144,-4.292458314434005
218 | -0.48939306735942834,-13.652797915157418
219 | -1.7358565770206127,-3.829973239655861
220 | -2.220932414029518,-5.73661455558891
221 | -3.368987111684689,-3.902579869562497
222 | 0.5877943865755928,1.7449973268795205
223 | -0.3540845973764529,-2.0792115449795188
224 | -1.0715985137285198,-12.061838337341282
225 | -1.5042898118276156,-0.7321109895948288
226 | -0.08456099179558785,1.041618513336358
227 | -2.534050217913368,-3.820388386664164
228 | -0.2742876314386189,-4.679599764323371
229 | -0.5543339008814908,-4.699346544242372
230 | 0.6961672068271421,2.251803710561407
231 | -0.7050586462947833,5.085363196753761
232 | -3.770554850925653,-3.592556033486702
233 | -1.1085894774047764,-3.8641139353523286
234 | 0.4551258697912641,-2.421916861182247
235 | 0.24362205114534285,-3.5781485255508088
236 | -1.0880138328041307,-4.2454829492528185
237 | -2.374163440446767,-2.8240230821032575
238 | -0.4003667290980757,-1.9904745066529905
239 | -3.1213814371526833,8.439381149457407
240 | 0.5824419668992531,-0.4690592719057509
241 | -1.9700121172295944,2.4400110048206196
242 | -2.8036777324069906,-7.272866429872092
243 | -1.7270659520237779,-7.882909034550316
244 | -3.120080147253502,-3.981479385425046
245 | -2.302319166513156,-4.381050115632078
246 | 0.28277391601410273,-4.455259677168917
247 | 0.4653973465708314,-8.549125023488344
248 | -2.7683769415331962,-12.688622991951636
249 | -1.2043844226691438,-6.171178064769993
250 | -0.4224743491930061,-2.4185872061228912
251 | 0.8307589376358631,-0.3825808216958424
252 | -1.9788783268539336,-5.219912498877827
253 | -3.0282531355188977,-2.528981443790983
254 | -1.8325168896155002,-4.679277475839315
255 | -0.7029749765706805,-8.677707691382043
256 | -0.38822700186965575,-1.1835637127312397
257 | -0.9288948600748226,-6.787805327829065
258 | -1.7831200486025336,-2.9488994654990757
259 | -1.3666581174181738,0.622504221722811
260 | -2.0014259136181667,-1.8622557198407188
261 | 0.4984192235837317,3.281833406443777
262 | -1.0434966327533859,-2.3054775650520636
263 | 1.1304889945184344,-9.073609061313611
264 | -0.2523506128293508,-4.480874694567567
265 | 0.9891831220914369,-2.371330457585594
266 | 1.17981507670804,-2.294507636202413
267 | -2.433890733514144,-8.798023215839837
268 | -0.6837892666734493,-7.194064096846326
269 | -0.11468582618766421,-5.338142301372787
270 | -1.1721409247642693,-1.2110638857332678
271 | -3.962885813923088,3.473522280799205
272 | -2.1402047832081923,-2.075862253029036
273 | 0.10002800442499793,-1.4857525654286285
274 | 0.3383325900782621,-10.478521823512333
275 | -3.004835640877978,-2.0339014307924703
276 | 0.1381214162835216,-3.3641898836511475
277 | 0.4292677251481145,-4.009232669192184
278 | -1.551632148526525,-1.2067185911867124
279 | -0.5783626688735155,-4.741920666867494
280 | -0.4742995565578716,-6.989081942361601
281 | -1.1942403665194754,-4.409391657957
282 | 0.23640163580809936,-4.597337367505926
283 | 0.18967860363430966,-4.05266521947363
284 | -1.9139441746698154,-3.092321440413615
285 | -0.7312765666764722,-1.193218869514641
286 | 0.4600640827095619,-4.150528979550504
287 | -2.0769899561772043,-3.021199695878228
288 | -2.6537950166799815,-3.1448371753847546
289 | 0.4885289132770172,-5.72774367826829
290 | -2.7715119475581598,-3.1998475355543
291 | -1.5990739783790202,-3.700376959494573
292 | -2.264012838682739,-3.481640211557375
293 | -2.383269713327781,-2.867466115796966
294 | -2.1888752645635092,-4.837765085816123
295 | -1.4107858132174655,-6.25379045110369
296 | -0.0763569664967575,-6.238396418830908
297 | -2.7667950531451986,-8.911639519511573
298 | -1.2545826042097925,-3.185261102421432
299 | -2.0677260798091748,-8.404608795569388
300 | -1.1307010403332611,1.5236960730851328
301 | -1.8871714417434489,-3.3239905989209246
302 | -0.6951156192991823,-1.2238946250701783
303 | 1.4103192235590494,-4.001907508163434
304 | -0.1192319775028381,-6.4037388198575185
305 | -1.192042789170495,-2.105600330068654
306 | -2.3005387351149986,-0.3419595360002319
307 | -1.4823391336874172,-5.082767469943232
308 | -2.058698164768944,6.001845664450034
309 | 1.5887038015637804,-8.299243344801436
310 | -0.9916892776054702,-6.698249834933088
311 | -1.315624204153219,-7.107155893422096
312 | -2.5426768516597753,-4.177803548411678
313 | -0.6487509857259672,-6.58725057753119
314 | 0.02977790681244108,1.1782908350417278
315 | -1.7118139366846252,-8.627470140338058
316 | -1.597277907383274,-10.073829697745001
317 | 0.4556748715787271,-5.528181692789595
318 | -0.24811094555841073,-1.097820059800568
319 | -0.06871717468816896,-3.501088588352608
320 | -2.3459084798204985,-1.3324235351623983
321 | -1.6086593246047047,-10.037042995927797
322 | -0.5754197688892831,-7.470604284287438
323 | -1.5296687936388906,0.8526555493182189
324 | -3.1163347464018716,-2.1541081061714387
325 | -0.6061807140684177,0.6862802875440588
326 | -1.7522744140333137,-3.5896662037610696
327 | -2.6543854462057475,4.161196397717035
328 | -2.6340863052782666,1.0060654125962802
329 | -2.7122158534069416,-1.99318896871802
330 | -4.139975485154663,-2.451094383189929
331 | -1.4187808026179622,-1.915947688925081
332 | -0.7288644902986623,1.786518734646032
333 | -1.9452277358113772,-0.9072184837671919
334 | -0.5971730681724873,-0.45014879984070344
335 | 0.9707179729986437,-3.8880583205989603
336 | 0.6592075577281982,-4.76190980165592
337 | -1.27604219598435,-5.396304199594539
338 | -2.215717579153721,-6.707947283207385
339 | -0.11535522541635268,1.0629738188498807
340 | -1.9811239007375194,-8.454363501880222
341 | 0.649840596229458,-1.919357510627262
342 | -0.5466551860660194,-4.658693729235955
343 | -0.7521148890263412,-5.5063179134790445
344 | -0.6845949921755581,-9.004190031472334
345 | -1.7810146340970543,-7.43623322937134
346 | 0.9921959632662554,-8.887132092335467
347 | -1.4204751641100768,-2.559195059850611
348 | -2.672087780536039,-10.938191084875562
349 | -1.9715740875742331,0.7923574363632229
350 | -0.6840412142529101,-4.831457010093357
351 | 0.0011800861809636576,-7.472655071780515
352 | -2.853942035088541,-4.634998500812769
353 | -0.42049919841058403,-7.327093701378728
354 | 0.2285443297387042,-5.095645629990317
355 | -0.6538558619296958,-3.861309765879895
356 | -1.8272160734297702,-4.6428282411088
357 | -0.564598715004617,-2.829099614537295
358 | -2.7699507652821334,-1.81265514094191
359 | -0.9250525757520809,-3.071344763382676
360 | -2.594266840500027,-4.214897495038795
361 | -0.5205929690840647,-4.586501365762072
362 | -0.8951711876107677,1.6183325723260973
363 | 0.5237495833518975,-2.325722514614627
364 | -0.6458338857860921,-3.8435898878824073
365 | -1.0703143742579868,-1.9675501511892546
366 | -0.7225681876762677,-0.7686424967109806
367 | -2.1651041126047854,-7.471259961532812
368 | -0.37038074055618453,-0.45859163545557813
369 | -0.6219795346224191,-2.1981828058525017
370 | -0.37789298998491216,-1.1865786448800768
371 | -1.353196812367155,-6.865230516562836
372 | -1.977375103898015,-0.3943520453537599
373 | -1.6060245805373023,-1.8566934666772155
374 | -2.6436184354371357,-1.6149194176843102
375 | -0.18183489561980537,-8.489237102752753
376 | -0.8858341205172254,1.90020292387522
377 | -1.5881119795132732,-14.480215294118565
378 | -2.6587449752383456,-6.490094074316531
379 | -2.1336567494660756,-0.8796377901395602
380 | -0.2557174740604984,-0.9772614146373115
381 | -0.11130751984866016,-0.4748158350199758
382 | -2.05429005386282,-7.023205921962287
383 | -0.7841960790757913,-4.5734286882985655
384 | -0.23298206687445122,-0.09363102229937548
385 | 0.639872018532196,-5.713547269906027
386 | -2.4314720691460003,-0.29456504415536555
387 | 0.2130277422909448,-7.819467713103724
388 | -1.9267946814623316,-4.189824482154564
389 | -0.6654687045430873,0.529541933188213
390 | -4.942654856670764,-4.219131790677451
391 | -0.7853355302764866,-4.592963828074572
392 | -0.7060179301336605,-1.430537614726842
393 | -1.0631307594358403,-6.391036849800224
394 | -1.6324106052478293,-6.444010208401991
395 | -3.2755159949646737,-2.9939191265268486
396 | -2.56047150796073,0.5841987698391771
397 | -2.262393700892434,2.055669721963797
398 | 0.11043200692146105,-2.8910807669965406
399 | -0.6580171921512933,-11.648501295402882
400 | 1.4008453971047086,-8.831505716943603
401 | -1.2943954122829755,-11.643975912182736
402 | -0.5197910603947153,-8.487847392080011
403 | -0.481585522326919,-6.630750024428158
404 | -1.1472417485195456,0.28532496069873936
405 | -0.7790469474112999,-0.49922926004874624
406 | 0.46245304842934165,-2.0901440849578714
407 | -1.3980062091390344,2.2674671872828007
408 | -3.0170996525795735,-10.462996177896642
409 | -2.6418891000420457,-1.6571518912837928
410 | -1.2720924993920695,-0.010621254246447442
411 | -2.3301604056981478,-2.958922140276937
412 | -1.7940996870630719,-8.324480581162852
413 | -0.026488424662141297,-10.117678707844252
414 | -0.23256257108067013,-8.104835307589259
415 | 0.45905545457326125,-6.400615541482103
416 | -1.8540180406230888,-7.195830995326459
417 | -1.7799935662164998,-10.480595597655421
418 | 0.2148560837940583,-5.677461505474805
419 | -3.9874598533479,-6.4927987338978275
420 | -2.1593225607308013,2.5917387684731246
421 | -2.7161117646470276,-0.0011437797282791706
422 | -0.6477429360935066,-0.5753396324753455
423 | -3.0695584463992214,-5.313628079483232
424 | -0.7701383171503423,-1.4365085728346223
425 | 0.35687187455105507,-0.8606141371916074
426 | -2.3146197401490154,0.6889084450900214
427 | -0.8529635997977232,-7.975193669209747
428 | 1.1409711944777583,-9.212423704276159
429 | 0.2774697239245445,-1.848797266193242
430 | 0.5130375997828074,-5.507060328978735
431 | -2.662561549081989,-3.820615024450105
432 | -2.5352670295141513,-4.114760838242309
433 | -0.18488220171073955,-4.551419196839449
434 | -2.0582730897747146,-7.9681999653974085
435 | -0.6244995981186756,-3.9290029048698423
436 | -2.663028536082492,-3.228255904697134
437 | -2.104197030277769,6.488834366459962
438 | -1.0409752482905588,-7.513788332117563
439 | -0.7971094398286516,1.079105562447075
440 | -2.858587301358905,-6.93014924232981
441 | -0.9123082791698328,-7.457343544647026
442 | -0.406711038479041,-2.8195347517786993
443 | -1.4709126731207471,-10.2676065467578
444 | -1.6893430262769895,-4.709391169256514
445 | 0.8919021309588049,-5.717087483592456
446 | -1.4066782954601156,-7.082325011812102
447 | -1.2727656545108998,-12.446137073299711
448 | -1.3561371295370355,-6.3217115244444555
449 | 1.061119765838721,-9.958693089346145
450 | -0.04050844470867698,3.3030485179747657
451 | -3.6897565468019184,-0.005620257604205925
452 | -0.6418820746913338,-6.385889611832532
453 | -1.1841749693175885,-3.6693533102356928
454 | -3.362309558345096,-1.695826985773194
455 | -0.015303768424369757,-2.2396377435434003
456 | 0.03797682408415293,0.9276257596472774
457 | -1.5466131112968382,-9.162976808819367
458 | -0.4035731406373806,-0.706367739427161
459 | 0.4509463666457787,-11.729442532747505
460 | -0.8851773400699331,-10.394053316125312
461 | -4.377192530544903,-5.529454322794213
462 | 0.42830425720463583,-2.8238844682962716
463 | -1.5594108603131156,-3.678004952447857
464 | 0.38743795408864656,-5.0706077146573545
465 | -0.7584332669373329,-13.084355695691887
466 | -0.4612017916542708,1.190697864228702
467 | -5.442679592745944,-0.7842790171019605
468 | -2.0301137313464706,-3.780227382913715
469 | 0.3345370484203216,-2.558521345196407
470 | 0.413834582309232,-5.081969587796571
471 | 1.712227051874871,0.17829571356413165
472 | -2.327724795142675,-4.493660899821206
473 | -2.0875018546953594,-2.783256684416428
474 | -1.562131770127968,-8.039862457958638
475 | -1.6842518241833462,-8.487377033508453
476 | -1.5777397829494197,0.40442719202459987
477 | -1.1508990822032477,0.7393756320446032
478 | 0.07443648093934363,-6.923236872010624
479 | -1.5973860283246681,-7.903461748286943
480 | -3.574039840032565,-5.3297535668958425
481 | 0.4200029428201366,-1.3839853049907886
482 | 0.44661014415683353,-1.2406381415058578
483 | 0.1494828086674762,-3.567655822129713
484 | 0.15927756996516496,1.8943249080662419
485 | -0.5976881226213171,-6.068102563371678
486 | -0.3342659088685722,-1.8915608688768781
487 | -3.088372957390626,-8.546943916265866
488 | 0.6308549419261529,-4.670021540262622
489 | 0.322467943804547,-7.855805145044611
490 | -1.591852983742711,-3.010394053029088
491 | 1.120176468942152,-8.681359705682391
492 | -0.11518032383231758,-3.307271215112081
493 | -1.1401047752655846,-6.88038100623831
494 | 1.0260241233712688,-3.2575842350433493
495 | -0.6259514637034067,-7.610907283725005
496 | -0.03719539188837184,-6.932780964839495
497 | -0.9324910148636657,-4.039039081706546
498 | 0.09158484715088999,-5.184091555458755
499 | -1.1918134901862092,-3.9387633301732223
500 | 0.3089777988835327,-1.007219813891004
501 |
--------------------------------------------------------------------------------
/00_basics/01_install.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Welcome to the Dark Art of Coding:\n",
8 | "## Introduction to Machine Learning\n",
9 | "Preparation and installation guide\n",
10 | "\n",
11 | "
"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Objectives\n",
19 | "---"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "By the end of this module, you will be able to:\n",
27 | "\n",
28 | "* Download the tools we will be using (conda, specific Python libraries)\n",
29 | "* Install the tools\n",
30 | "* Test them for successful installation\n",
31 | "* Open the Jupyter Lab interface that we will be using in class\n",
32 | "* Run the code samples found in the notebooks\n",
33 | "* Understand the importance of the tools for our tasks today"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "# Installing the Software You'll Need\n",
41 | "---"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "## Step Zero: Read through ALL the steps...\n",
49 | "\n",
50 | "We strongly recommend that you read through **ALL** the steps below, before you start to install, etc. For some more advanced practitioners, you **may** already have some tools installed OR available. \n",
51 | "\n",
52 | "IF you can successfully\n",
53 | "* open Jupyter Lab\n",
54 | "* open the Notebooks in this tutorial AND\n",
55 | "* import the data libraries listed below in Step Three\n",
56 | "\n",
57 | "...then you shouldn't need to do anything. \n",
58 | "\n",
59 | "For folks who aren't sure OR for folks who are fairly new to Python/programming... these steps should get us to the point we need to be."
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "## Step One: Download and Install Miniconda\n",
67 | "\n",
68 | "Follow the instructions for your operating system in the **`miniconda`** quickstart guides.\n",
69 | "\n",
70 | "**Some warnings/cautions:**\n",
71 | "\n",
72 | "1. We **highly recommend** the use of `conda` as a package manager and virtual environment manager for this tutorial. This material has been tested using `conda` but has not been tested using `pip`, `virtualenv`, `pyenv`, etc.\n",
73 | "\n",
74 | "1. **IF you already have `conda`** installed via a previous `Anaconda` OR `miniconda` install, you should not need to reinstall. How can you tell? If you type `conda` on your command line and get a response similar to this, then you should not need to reinstall conda:\n",
75 | "\n",
76 | " ```\n",
77 | " my_macbook:my_folder chalmerlowe$ conda\n",
78 | " usage: conda [-h] [-V] command ...\n",
79 | "\n",
80 | " conda is a tool for managing and deploying applications, environments and packages.\n",
81 | " .\n",
82 | " .\n",
83 | " .\n",
84 | " ```\n",
85 | "1. Be sure you use a **Python 3.x** version of `miniconda` to install Python 3.x.\n",
86 | "1. Based on our experience in workshops, **the most common problems** we experience with installs is that a step got missed OR a command was typed incorrectly. It happens to all of us, so stay sharp, folks!"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "With that in mind, please choose the appropriate version and install `conda` as described below:\n",
94 | "\n",
95 | "**`conda` for Windows**:\n",
96 | "\n",
97 | "* Download the installer: [Miniconda installer for Windows.](https://conda.io/miniconda.html)\n",
98 | "* Double-click the .exe file.\n",
99 | "* Follow the instructions on the screen.\n",
100 | "* NOTE: If you are unsure about any setting, accept the defaults. You can change them later.\n",
101 | "* When installation is finished, from the **Start menu**, open the **Anaconda Prompt**.\n",
102 | "\n",
103 | "**`conda` for MacOS**:\n",
104 | "\n",
105 | "* Download the installer: [Miniconda installer for MacOS.](https://conda.io/miniconda.html)\n",
106 | "* In your Terminal window navigate to the folder where you downloaded the miniconda installer\n",
107 | "* At your Terminal prompt, run this command:\n",
108 | " \n",
109 | " ```bash\n",
110 | " $ bash Miniconda3-latest-MacOSX-x86_64.sh\n",
111 | " ```\n",
112 | "
\n",
113 | "* Follow the prompts on the installer screens.\n",
114 | "* NOTE: If you are unsure about any setting, accept the defaults. You can change them later.\n",
115 | "* **Close** and then **re-open** your Terminal window, to make the changes take effect.\n",
116 | "\n",
117 | "\n",
118 | "**`conda` for Linux**:\n",
119 | "\n",
120 | "* Download the installer: [Miniconda installer for Linux.](https://conda.io/miniconda.html)\n",
121 | "* In your Terminal window navigate to the folder where you downloaded the miniconda installer\n",
122 | "* At your Terminal prompt, run this command:\n",
123 | " \n",
124 | " ```bash\n",
125 | " $ bash Miniconda3-latest-Linux-x86_64.sh\n",
126 | " ```\n",
127 | "
\n",
128 | "* Follow the prompts on the installer screens.\n",
129 | "* If you are unsure about any setting, accept the defaults. You can change them later.\n",
130 | "* **Close** and then **re-open** your Terminal window, to make the changes take effect."
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "## Step Two: Confirm your conda install\n",
138 | "\n",
139 | "In a command prompt type `conda list`. If `conda` is installed properly, you will see a summary of the packages installed by `conda`."
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "### Troubleshooting\n",
147 | "\n",
148 | "Here's a list of error messages & how to fix them.\n",
149 | "\n",
150 | "- **`conda: Command not found.`** IF you see this, the most common reason is that your command shell is not yet aware of the installation of `conda`. The easiest fix is to simply **close** your terminal/command prompt & **reopen** your terminal/command prompt. If that doesn't fix it, ask for help."
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "## Step Three: Install Python, and other packages..."
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "With `conda` installed, we want to ensure that we have a suitable version of Python installed and that we have the necessary libraries also installed.\n",
165 | "\n",
166 | "We will create a directory to hold our lesson content. For consistency, we will call this directory `mltutorial` and then we will create a virtual environment called `mlenv` and populate it with Python and our libraries.\n",
167 | "\n",
168 | "1. On your command prompt, make sure you are in a directory where you want your project folder to be located (many people put this in their `My Documents` OR `home` folder). From that directory, run the following command:\n",
169 | "\n",
170 | " ```bash\n",
171 | " chalmerlowe$ mkdir mltutorial\n",
172 | " ```\n",
173 | "
\n",
174 | "1. Change directories into the new folder:\n",
175 | "\n",
176 | " ```bash\n",
177 | " chalmerlowe$ cd mltutorial\n",
178 | " ```\n",
179 | "
\n",
180 | "1. Create a virtual environment with Python 3, using the following command (don't worry, we will explain this below):\n",
181 | "\n",
182 | " ```bash\n",
183 | " chalmerlowe$ conda create -n mlenv python=3\n",
184 | " ```\n",
185 | "
\n",
186 | "1. Activate your virtual environment using the command appropriate to your operating system. NOTICE your prompt will change to reflect the fact that you are now in a virtual environment:\n",
187 | "\n",
188 | " **Mac/ Linux** \n",
189 | "\n",
190 | " ```bash\n",
191 | " chalmerlowe$ conda activate mlenv\n",
192 | " (mlenv) chalmerlowe$ \n",
193 | " ```\n",
194 | "
\n",
195 | " \n",
196 | " **Windows**\n",
197 | "\n",
198 | " ```bat\n",
199 | " C:\\> activate mlenv\n",
200 | " (mlenv) C:\\>\n",
201 | " ```\n",
202 | "
\n",
203 | "\n",
204 | "1. Install the following additional packages to your virtual environment (there may be a lot, make sure you get them all):\n",
205 | "\n",
206 | " ```bash\n",
207 | " (mlenv) chalmerlowe$ conda install -c conda-forge jupyter jupyterlab pandas matplotlib scipy numpy scikit-learn requests ipython seaborn \n",
208 | " ```\n",
209 | "
\n",
210 | "\n",
211 | "1. Test your installation, by typing the following on your command line/terminal:\n",
212 | "\n",
213 | " ```bash\n",
214 | " (mlenv) chalmerlowe$ jupyter lab \n",
215 | " ```\n",
216 | "
\n",
217 | " \n",
218 | "If your browser opens with a Jupyter Lab instance, you will know the install process succeeded. \n",
219 | "\n",
220 | "## Getting the actual class notebooks:\n",
221 | "\n",
222 | "1. You can do this step now, but I would suggest waiting til the morning of the class (see WARNING below). To get the latest version of the course material, navigate to the [**class github repository (link)**](https://github.com/chalmerlowe/machine_learning) and press the Big Green `Clone OR Download` button.\n",
223 | "\n",
224 | "1. Then press the `Download Zip` button to download a zip file of all the course content.\n",
225 | "\n",
226 | "1. Unzip the content into the new folder you have made (`mltutorial`). Once it is unzipped, you should see it in your Jupyter Lab interface.\n",
227 | "\n",
228 | "**WARNING**: the class material will be undergoing revision all the way up until the day of the class (\"...Conference Driven Development\") ... please be prepared to update your copy of the course material on the morning of the tutorial (either by using `git` if you are familiar with that tool OR by deleting your local files and downloading a fresh copy)."
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "## Done with commands for now!\n",
236 | "\n",
237 | "If you have been successful with the above steps, you are done for now.\n",
238 | "\n",
239 | "
\n",
240 | "\n",
241 | "If you like reading, you can also keep reading this page to learn more about what we did, why we chose `conda`, what happened behind the scenes, etc!\n",
242 | "\n",
243 | "If you ran into any problems, feel free to reach out via info@darkartofcoding.com"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "# The Big Picture \n",
251 | "---"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "## What is miniconda (conda) and why did we install it?\n",
259 | "\n",
260 | "Miniconda contains the `conda` package manager/virtual environment manager and `Python`. `conda` is language agnostic, so you can also use it to support work with languages besides Python. Once miniconda is installed, you will be able to: \n",
261 | "\n",
262 | "* create virtual environments and \n",
263 | "* manage separate installations of `Python` (including different versions) \n",
264 | "* manage Python packages/libraries\n",
265 | "* as well as manage packages in other languages ... packages that are fundamentally unmanageable by Python-only tools like `pip` & `virtualenv`.\n",
266 | "\n",
267 | "Whenever you work on a new project, you should create a separate environment for that project. `conda` lets you do this easily and efficiently. \n",
268 | "\n",
269 | "## And what is a virtual environment?\n",
270 | "\n",
271 | "When you create a virtual environment, `conda` will add subdirectories to a miniconda directory on your computer. Specifically it will create a directory that will contain:\n",
272 | "\n",
273 | "* a database and metadata about the virtualenv\n",
274 | "* software and libraries related to the project (i.e., Python and any modules you install in the virtualenv)\n",
275 | "\n",
276 | "NOTE: this virtualenv folder is **NOT** a duplicate of your project folder **NOR** does it contain your code/class material"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "# Deep dive\n",
284 | "---"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "## What is a virtual environment?\n",
292 | "\n",
293 | "As mentioned above, virtual environments (also called virtualenvs) are tools used to keep projects separate, especially in terms of keeping different software versions separate and different library versions separate. For example, virtualenvs prevent Python's site packages folder from getting filled with potentially conflicting versions of software AND thus prevents problems that arise when one project needs **version x.x** of a library but another project needs **version y.y** of the same library. At their core, virtualenvs are glorified directories that use scripts and metadata to organize and control the environment. You are allowed to have an essentially unlimited number of virtual environments. And as you saw above, they are very easy to create using various command line tools, such as `conda`.\n",
294 | "\n",
295 | "## When should we use a virtual environment?\n",
296 | "\n",
297 | "Anytime you have more than one project and there is a possibility of conflicts between your libraries, it is a good time to use a virtual environment. Having said that, many programmers use virtual environments for **all but the most trivial** programming tasks. Especially for beginners, using virtual environments early on in your learning career will build a valuable skill AND help prevent sneaky bugs related to version discrepancies. Bugs that can be hard to diagnose.\n",
298 | "\n",
299 | "## How do you create a virtual environment?\n",
300 | "\n",
301 | "While there are several programs or libraries that can generate virtual environments for today's lesson, we will be using the `conda` package manager, which includes the capability to simply and easily produce virtual environments.\n",
302 | "\n",
303 | "Presuming you have `conda` installed, these steps enable you to create and activate a virtual environment.\n",
304 | "\n",
305 | "```bash\n",
306 | "$ conda create -n mlenv python=3\n",
307 | "```\n",
308 | "\n",
309 | "Description:\n",
310 | "* `conda` runs the conda program.\n",
311 | "* `create` tells it to create a virtualenv\n",
312 | "* `-n` identifies the name of the virtualenv, in this case, `mlenv`\n",
313 | "* `python=3` tells conda that you want to install Python version 3 in this virtualenv\n",
314 | "\n",
315 | "**NOTE**: for other projects, you **can** use `python=2` or `python=3` and regardless which you choose, conda will default to the most recent version of Python within the version 2 OR version 3 family. If you need to select a specific minor version of python, use the following syntax:\n",
316 | "\n",
317 | "`python=3.7`\n",
318 | "\n",
319 | "When you execute the `conda create` command, `conda` prepares to install Python and any dependencies that Python relies upon. It will display output similar to the following. \n",
320 | "\n",
321 | "```bash\n",
322 | "my_macbook:my_folder chalmerlowe$ conda create -n mlenv python=3\n",
323 | "Fetching package metadata .......\n",
324 | "Solving package specifications: ..........\n",
325 | "\n",
326 | "Package plan for installation in environment /Users/chalmerlowe/miniconda3/envs/stats:\n",
327 | "\n",
328 | "The following packages will be downloaded:\n",
329 | "\n",
330 | " package | build\n",
331 | " ---------------------------|-----------------\n",
332 | " openssl-1.0.2k | 1 3.0 MB\n",
333 | " python-3.6.0 | 0 11.7 MB\n",
334 | " setuptools-27.2.0 | py36_0 523 KB\n",
335 | " wheel-0.29.0 | py36_0 87 KB\n",
336 | " pip-9.0.1 | py36_1 1.7 MB\n",
337 | " ------------------------------------------------------------\n",
338 | " Total: 17.0 MB\n",
339 | "\n",
340 | "The following NEW packages will be INSTALLED:\n",
341 | "\n",
342 | " openssl: 1.0.2k-1\n",
343 | " pip: 9.0.1-py36_1\n",
344 | " python: 3.6.0-0\n",
345 | " readline: 6.2-2\n",
346 | " setuptools: 27.2.0-py36_0\n",
347 | " sqlite: 3.13.0-0\n",
348 | " tk: 8.5.18-0\n",
349 | " wheel: 0.29.0-py36_0\n",
350 | " xz: 5.2.2-1\n",
351 | " zlib: 1.2.8-3\n",
352 | "\n",
353 | "Proceed ([y]/n)?\n",
354 | "```\n",
355 | "\n",
356 | "To finish the creation of the virtualenv and install the software, press `y`.\n",
357 | "\n",
358 | "## Activating a virtualenv\n",
359 | "\n",
360 | "Once you have created a virtualenv, you will need to activate it. Activation has several side effects:\n",
361 | "\n",
362 | "* It temporarily changes your `$PATH` variable so calls to the `python` command (and similar commands) will look first in the virtual environment's `bin/` directory. \n",
363 | "* It temporarily changes your shell prompt to show which virtual environment you are using. Your prompt will likely look something like this, with the name of your virtual environment in parenthesis in front of the prompt:\n",
364 | " * Mac/Linux: `(mlenv) chalmerlowe$`\n",
365 | " * Windows: `(mlenv) C:\\>`\n",
366 | "\n",
367 | "To activate your virtual environment, run the appropriate command for your operating system:\n",
368 | "\n",
369 | "**Mac/Linux**\n",
370 | "\n",
371 | "```bash\n",
372 | "$ conda activate mlenv\n",
373 | "```\n",
374 | "\n",
375 | "**Windows**\n",
376 | "\n",
377 | "```bat\n",
378 | "C:\\> activate mlenv\n",
379 | "```\n",
380 | "\n",
381 | "**Note:** If you are using **Power Shell**, `activate` won't work out of the box. Type `cmd` first to get a regular command prompt, *then* `activate mlenv`.\n",
382 | "\n",
383 | "### Adding software to your virtualenv \n",
384 | "\n",
385 | "To add more software to the virtualenv, you can use `conda` to install the software. The maintainers of conda provide access to many Python and non-Python libraries, but not all of them. If conda cannot install a particular library that you need, you can generally use `pip` or a similar package installation tool to install it instead (covering `pip` is outside the scope of this workshop).\n",
386 | "\n",
387 | "For example, to install Jupyter, you can use the following `conda` command:\n",
388 | "\n",
389 | "```\n",
390 | "conda install -c conda-forge jupyter jupyterlab pandas matplotlib scipy numpy scikit-learn requests ipython seaborn \n",
391 | "```\n",
392 | "\n",
393 | "Conda will prepare to install Jupyter and any dependencies that Jupyter relies upon. It will display output similar to the following (truncated to save space).\n",
394 | "\n",
395 | "```bash\n",
396 | "Fetching package metadata .......\n",
397 | "Solving package specifications: ..........\n",
398 | "\n",
399 | "Package plan for installation in environment /Users/chalmerlowe/miniconda3:\n",
400 | "\n",
401 | "The following packages will be downloaded:\n",
402 | "\n",
403 | " package | build\n",
404 | " ---------------------------|-----------------\n",
405 | " conda-env-2.6.0 | 0 601 B\n",
406 | " ...\n",
407 | " ipython-5.3.0 | py35_0 1021 KB\n",
408 | " conda-4.3.14 | py35_0 505 KB\n",
409 | " ------------------------------------------------------------\n",
410 | " Total: 3.8 MB\n",
411 | "\n",
412 | "The following NEW packages will be INSTALLED:\n",
413 | "\n",
414 | " appnope: 0.1.0-py35_0\n",
415 | " ...\n",
416 | " wcwidth: 0.1.7-py35_0\n",
417 | "\n",
418 | "The following packages will be UPDATED:\n",
419 | "\n",
420 | " conda: 4.1.11-py35_0 --> 4.3.14-py35_0\n",
421 | " conda-env: 2.5.2-py35_0 --> 2.6.0-0\n",
422 | " requests: 2.10.0-py35_0 --> 2.13.0-py35_0\n",
423 | "\n",
424 | "Proceed ([y]/n)?\n",
425 | "```\n",
426 | "\n",
427 | "To finish the installation of Jupyter and its dependencies, press `y`.\n",
428 | "\n",
429 | "### Multiple packages\n",
430 | "\n",
431 | "Multiple packages can be installed at the same time, by separating the package names with spaces:\n",
432 | "\n",
433 | "`conda install matplotlib numpy pandas scipy`\n",
434 | "\n",
435 | "**IF** there are special packages that you need to get from a specific repository channel (i.e. the conda-forge channel), you can designate a channel using the `-c` flag and the name of the channel (such as `conda-forge`) as shown here:\n",
436 | "\n",
437 | "`conda install -c conda-forge jupyter jupyterlab pandas matplotlib scipy numpy scikit-learn requests ipython seaborn`\n",
438 | "\n",
439 | "### Leaving the virtualenv when you are done\n",
440 | "\n",
441 | "When you are done working in your virtualenv, you can deactivate it using the following command:\n",
442 | "\n",
443 | "**Mac/Linux**\n",
444 | "\n",
445 | "```bash\n",
446 | "(mlenv) $ conda deactivate\n",
447 | "$\n",
448 | "```\n",
449 | "\n",
450 | "**Windows**\n",
451 | "\n",
452 | "```bat\n",
453 | "(mlenv) C:\\> deactivate\n",
454 | "C:\\>\n",
455 | "```"
456 | ]
457 | },
458 | {
459 | "cell_type": "markdown",
460 | "metadata": {},
461 | "source": [
462 | "## Resources\n",
463 | "\n",
464 | "\n",
465 | "\n",
466 | "* [Using conda](http://conda.pydata.org/docs/using/index.html): A tutorial on how to use `conda`\n",
467 | "\n",
468 | "* [conda cheatsheet](https://conda.io/docs/_downloads/conda-cheatsheet.pdf): A cheatsheet of the most common `conda` commands\n",
469 | "\n",
470 | "* [conda myths and misconceptions](http://jakevdp.github.io/blog/2016/08/25/conda-myths-and-misconceptions/): Reasons why conda was created and how it differs from `pip`, `virtualenv`, etc.\n",
471 | "\n",
472 | "* [Python's `venv` and `virtualenv` can also create virtual environments.](http://stackoverflow.com/questions/41573587/what-is-the-difference-between-venv-pyvenv-pyenv-virtualenv-virtualenvwrappe)\n",
473 | "\n",
474 | "* [`pip` is Python's package manager.](https://en.wikipedia.org/wiki/Pip_(package_manager))"
475 | ]
476 | }
477 | ],
478 | "metadata": {
479 | "kernelspec": {
480 | "display_name": "Python 3",
481 | "language": "python",
482 | "name": "python3"
483 | },
484 | "language_info": {
485 | "codemirror_mode": {
486 | "name": "ipython",
487 | "version": 3
488 | },
489 | "file_extension": ".py",
490 | "mimetype": "text/x-python",
491 | "name": "python",
492 | "nbconvert_exporter": "python",
493 | "pygments_lexer": "ipython3",
494 | "version": "3.6.7"
495 | }
496 | },
497 | "nbformat": 4,
498 | "nbformat_minor": 2
499 | }
500 |
--------------------------------------------------------------------------------
/05_neighbors/05_neighbors.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Welcome to the Dark Art of Coding:\n",
8 | "## Introduction to Machine Learning\n",
9 | "k-Means Clustering\n",
10 | "\n",
11 | "
"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Objectives\n",
19 | "---"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "In this session, students should expect to:\n",
27 | "\n",
28 | "* Cover an overview of k-Means Clustering\n",
29 | "* Examine code samples that walk us through **The Process™**:\n",
30 | " * Prep the data\n",
31 | " * Choose the model\n",
32 | " * Choose appropriate hyperparameters\n",
33 | " * Fit the model\n",
34 | " * Apply the model\n",
35 | " * Examine the results\n",
36 | "* Explore a deep dive into this model\n",
37 | "* Review some gotchas that might complicate things\n",
38 | "* Review tips related to learning more"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "# Overview: k-Means Clustering\n",
46 | "---"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "The goal of a clustering algorithm is to assign data points to the same group if they are similar and to assign data points to different groups if they are different.\n",
54 | "\n",
55 | "Clustering models are popular machine learning models because they:\n",
56 | "\n",
57 | "* are **unsupervised** and thus don't require pre-determined labels\n",
58 | "* can accommodate multidimensional datasets\n",
59 | "* can, for simple cases, be fairly easy to interpret, especially in 2D/3D via charts\n",
60 | "\n",
61 | "The k-Means Clustering algorithm: \n",
62 | "\n",
63 | "* looks for the arithmetic mean of all points in a cluster to identify the cluster centers\n",
64 | "* groups points together by identifying the closest cluster center\n",
65 | "\n",
66 | "For this example, we will use the `KMeans` model. The `sklearn.cluster` module has a number of clustering models, including:\n",
67 | "\n",
68 | "* AffinityPropagation\n",
69 | "* DBSCAN\n",
70 | "* KMeans\n",
71 | "* MeanShift\n",
72 | "* SpectralClustering\n",
73 | "* and more...\n",
74 | "\n",
75 | "With this background, let's apply **The Process™** on the `KMeans` Clustering model."
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "## Prep the data"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "We start with a set of standard imports..."
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "import matplotlib.pyplot as plt\n",
99 | "import numpy as np\n",
100 | "import pandas as pd\n",
101 | "import sklearn\n",
102 | "from sklearn.model_selection import train_test_split\n",
103 | "\n",
104 | "# NOTE: during the Choose the Model step, we will import the \n",
105 | "# model we want, but there is no reason you can't import it here.\n",
106 | "# from sklearn.cluster import KMeans"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "### Prep the training data and test data"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "A number of data generating functions exist in Scikit-Learn to help you create data sets that you can use to play with and manipulate the models. For this example, I want to explore one of these data generation libraries: \n",
121 | "\n",
122 | "```python\n",
123 | "sklearn.datasets.samples_generator.make_blobs\n",
124 | "```\n",
125 | "\n",
126 | "This dataset generator produces preformatted `features` matrices and `target` arrays."
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "This dataset is composed of:\n",
134 | "\n",
135 | "* a `features matrix` of `x`-`y` vectors that can be plotted on a chart\n",
136 | "* a `target array` of cluster labels"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "from sklearn.datasets.samples_generator import make_blobs\n",
146 | "\n",
147 | "X, y = make_blobs(n_samples=400,\n",
148 | " centers=4,\n",
149 | " cluster_std=0.70,\n",
150 | " random_state=13)"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "Since we have never made blobs before, we should check to see what the output looks like:"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "X.shape"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "X[:5]"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "Initially, looking at only a five element slice of the labels didn't show me all the possible categories, so I expanded the slice a little, then more, then a lot."
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "y[:100]"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "**TIP**: A quick way to confirm exactly which categories you have, if you have lots of them is to use the `np.unique()` method to deduplicate the elements stored in your array (i.e. `y`)."
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "np.unique(y)"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "In this case, we are going to stick with the defaults in terms of the size of the test set and in terms of the random seed."
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "X_train, X_test, y_train, y_test = train_test_split(X, y)"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "plt.scatter(X_train[:, 0], X_train[:, 1])\n",
233 | "plt.title(\"Four well behaved clusters\");"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "## Choose the Model"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {
247 | "collapsed": true
248 | },
249 | "outputs": [],
250 | "source": [
251 | "from sklearn.cluster import KMeans"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "## Choose Appropriate Hyperparameters"
259 | ]
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {},
264 | "source": [
265 | "Here we choose to assign one hyperparameter: `n_clusters`. We will discuss it in greater depth later."
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {
272 | "collapsed": true
273 | },
274 | "outputs": [],
275 | "source": [
276 | "model = KMeans(n_clusters=4)"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "There are a number of hyperparameters... we will cover several in greater depth later.\n",
284 | "\n",
285 | "```python\n",
286 | "KMeans(\n",
287 | " n_clusters=8,\n",
288 | " init='k-means++',\n",
289 | " n_init=10,\n",
290 | " max_iter=300,\n",
291 | " tol=0.0001,\n",
292 | " precompute_distances='auto',\n",
293 | " verbose=0,\n",
294 | " random_state=None,\n",
295 | " copy_x=True,\n",
296 | " n_jobs=None,\n",
297 | " algorithm='auto',\n",
298 | ")\n",
299 | "```"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "## Fit the Model"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {},
312 | "source": [
313 | "This model doesn't need OR use any labels, so we simply feed in the `X_train` data."
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {
320 | "collapsed": true
321 | },
322 | "outputs": [],
323 | "source": [
324 | "model.fit(X_train)"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "## Apply the Model"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {
338 | "collapsed": true
339 | },
340 | "outputs": [],
341 | "source": [
342 | "y_pred = model.predict(X_test)"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": null,
348 | "metadata": {
349 | "collapsed": true
350 | },
351 | "outputs": [],
352 | "source": [
353 | "y_pred.shape"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "Again, here as above, we don't get to see all the categories by only looking at a five element slice."
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": null,
366 | "metadata": {
367 | "collapsed": true
368 | },
369 | "outputs": [],
370 | "source": [
371 | "y_pred[:5]"
372 | ]
373 | },
374 | {
375 | "cell_type": "markdown",
376 | "metadata": {},
377 | "source": [
378 | "## Examine the results"
379 | ]
380 | },
381 | {
382 | "cell_type": "markdown",
383 | "metadata": {},
384 | "source": [
385 | "If we plot the clusters and use the predicted labels as the basis for assigning colors, we see that the model correctly grouped the samples into clusters."
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "metadata": {
392 | "collapsed": true
393 | },
394 | "outputs": [],
395 | "source": [
396 | "plt.scatter(X_test[:, 0], X_test[:, 1],\n",
397 | " c=y_pred,\n",
398 | " cmap='seismic', alpha=0.2);"
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "metadata": {},
404 | "source": [
405 | "We mentioned that the model drills down to a center for each cluster. If you want to know where the centers are, the model stores them as an attribute called `.cluster_centers_`. \n",
406 | "\n",
407 | "**Notice** the trailing underscore (`_`) at the end of the attribute name to show that it is a computed value."
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": null,
413 | "metadata": {},
414 | "outputs": [],
415 | "source": [
416 | "model.cluster_centers_"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": null,
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "ctrs = model.cluster_centers_"
426 | ]
427 | },
428 | {
429 | "cell_type": "markdown",
430 | "metadata": {},
431 | "source": [
432 | "With these points, it is simple enough to plot them on the chart. Here we highlight them as large (`s=150`) white dots (`c='white'`) outlined in black (`edgecolors='black'`)"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": null,
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "plt.scatter(X_test[:, 0], X_test[:, 1],\n",
442 | " c=y_pred,\n",
443 | " cmap='seismic', alpha=0.2)\n",
444 | "\n",
445 | "plt.scatter(ctrs[:, 0], ctrs[:, 1],\n",
446 | " c='white',\n",
447 | " edgecolors='black',\n",
448 | " s=150,\n",
449 | " );"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {},
455 | "source": [
456 | "# Deep Dive\n",
457 | "---\n",
458 | "\n",
459 | "The k-Means Clustering model works based on a process called **Expectation-Maximization**. In this process, the model:\n",
460 | "\n",
461 | "* starts by randomly picking some cluster centers\n",
462 | "* repeats the following cycle until the model converges\n",
463 | " * Expectation: assign points to the closest cluster center\n",
464 | " * Maximization: use the points of the newly formed clusters to calculate a new mean to use as a new cluster center\n",
465 | " \n",
466 | "The process is designed such that for every cycle of the Expectation and Maximization steps, the model will always have a better estimation of any given cluster."
467 | ]
468 | },
469 | {
470 | "cell_type": "markdown",
471 | "metadata": {},
472 | "source": [
473 | "# Gotchas\n",
474 | "---"
475 | ]
476 | },
477 | {
478 | "cell_type": "markdown",
479 | "metadata": {},
480 | "source": [
481 | "**No global guarantees**: despite the promise of convergence... there is no guarantee that as a whole the clusters produced will globally be the most suitable clusters.\n",
482 | "\n",
483 | "It really depends on the randomly selected initial cluster centers. To overcome this limitation, the model typically runs the algorithm multiple times. The default `n_init` is set at `10`.\n",
484 | "\n",
485 | "**You must decide on the number of clusters**: when we set the hyperparameters, we need to initialize the model with some number of clusters. The default `n_clusters` is set at `8`.\n",
486 | "\n",
487 | "* There are other models that may provide some measure of the fitness of the number of clusters: `GaussianMixture`\n",
488 | "* There are other models that can choose a suitable number of clusters: `DBSCAN`, `MeanShift`\n",
489 | "\n",
490 | "**Speed considerations**: clustering algorithms can be slow on large datasets."
491 | ]
492 | },
493 | {
494 | "cell_type": "markdown",
495 | "metadata": {},
496 | "source": [
497 | "**What do the colors OR category labels really mean?**:\n",
498 | "\n",
499 | "Turns out, not much. The labels aren't magic, they don't carry meaning beyond: every sample in this set with this label is tied to or associated with samples that carry the same label. Due to the way that the model randomizes the clustering process, during one run of the algorithm, a group of samples may be numbered `0`, while during a subsequent run of the algorithm, a similar grouping of samples might be numbered `1`.\n",
500 | "\n",
501 | "To demo this principle, let's plot the **test** data and the **training** data on the same chart.\n",
502 | "\n",
503 | "Remember, in scatter plots:\n",
504 | "\n",
505 | "* `c` values are assigned based on the labels we provide\n",
506 | "* `cmap` maps a color to each value associated with `c`\n",
507 | "* `seismic` is a range of colors from deep blue to deep red\n",
508 | "\n",
509 | "
\n",
510 | "\n",
511 | "* the plot selects four colors from the `seismic` range and assigns one color to each of the four labels that are present in `c` \n",
512 | " * Almost black\n",
513 | " * Blue\n",
514 | " * Maroon\n",
515 | " * Red\n",
516 | "\n",
517 | "Notice, the algorithms correctly lump the samples into clusters, but there is no correlation between the labels from the training run to the test run.\n",
518 | "\n",
519 | "The cluster at the top of the chart ended up having two separate labels and thus shows up as two different colors. Don't let that throw you."
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "metadata": {},
526 | "outputs": [],
527 | "source": [
528 | "plt.scatter(X_test[:, 0], X_test[:, 1],\n",
529 | " c=y_pred,\n",
530 | " cmap='seismic', alpha=0.5)\n",
531 | "\n",
532 | "plt.scatter(X_train[:, 0], X_train[:, 1],\n",
533 | " c=y_train,\n",
534 | " cmap='seismic', alpha=0.2);\n"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "metadata": {},
540 | "source": [
541 | "# How to learn more: tips and hints\n",
542 | "---"
543 | ]
544 | },
545 | {
546 | "cell_type": "markdown",
547 | "metadata": {},
548 | "source": [
549 | "**Read the error messages**: They are sometimes scarier than the docs but they will often give you some insight into the nature of the problem.\n",
550 | "\n",
551 | "Pay attention to the errors. While putting this lesson together, I cobbled some content together from various notebooks using cut and paste and I failed to paste the line where I actually called the `.fit()` method. This error message was the result.\n",
552 | "\n",
553 | "```python\n",
554 | "NotFittedError: This KMeans instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.```\n",
555 | "\n",
556 | "One of the most common mistakes that I see with students is that they don't read the error messages. Why?\n",
557 | "\n",
558 | "* Sometimes error messages aren't written by humans\n",
559 | "* Sometimes error messages are well-written, clear and concise, but a beginner doesn't yet understand what the message is trying to say\n",
560 | "* Sometimes students make three or four changes to a snippet and then run the code and immediately presume that the last edit was the breaking change. So they go down some rabbit hole on the wrong line of code\n",
561 | "\n",
562 | "So get used to reading the error messages, try to understand them. If they are foreign to you, Google some of the key phrases... it is often comforting to see how many folks out there have asked the same question about what does a specific error mean."
563 | ]
564 | },
565 | {
566 | "cell_type": "markdown",
567 | "metadata": {},
568 | "source": [
569 | "As far as additional topics to explore? Read up on **Scoring and validating your models**:\n",
570 | "\n",
571 | "[https://scikit-learn.org/stable/modules/learning_curve.html](https://scikit-learn.org/stable/modules/learning_curve.html)\n",
572 | "\n",
573 | "[https://scikit-learn.org/stable/modules/cross_validation.html#multimetric-cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html#multimetric-cross-validation)"
574 | ]
575 | },
576 | {
577 | "cell_type": "markdown",
578 | "metadata": {},
579 | "source": [
580 | "# Experience Points!\n",
581 | "---"
582 | ]
583 | },
584 | {
585 | "cell_type": "markdown",
586 | "metadata": {
587 | "slideshow": {
588 | "slide_type": "slide"
589 | }
590 | },
591 | "source": [
592 | "## Task 01"
593 | ]
594 | },
595 | {
596 | "cell_type": "markdown",
597 | "metadata": {},
598 | "source": [
599 | "Let's play around a bit...\n",
600 | "\n",
601 | "The following code is set up to make it easy for you to change the standard deviation for the clusters that `make_blobs()` will generate.\n",
602 | "\n",
603 | "The initial `cluster_std` is set at `0.7` (just as it was in the examples above).\n",
604 | "Execute the following cells to see the scatter plots showing the training data and the test/predicted data.\n",
605 | "\n",
606 | "Then change the values for `cluster_std` to each of the following values one by one, executing the cells for each value so that you can see how the clustering model responds as the spread of the input data increases.\n",
607 | "\n",
608 | "* 1\n",
609 | "* 2\n",
610 | "* 3"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": null,
616 | "metadata": {
617 | "slideshow": {
618 | "slide_type": "slide"
619 | }
620 | },
621 | "outputs": [],
622 | "source": [
623 | "from sklearn.datasets import make_blobs\n",
624 | "\n",
625 | "cluster_std = 0.7 # std of 0.70 gives tight clusters, try other options!\n",
626 | "random_state = 13\n",
627 | "\n",
628 | "X, y = make_blobs(n_samples=400,\n",
629 | " centers=4,\n",
630 | " cluster_std=cluster_std,\n",
631 | " random_state=random_state)\n",
632 | "\n",
633 | "X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
634 | "\n",
635 | "plt.scatter(X_train[:, 0], X_train[:, 1],\n",
636 | " c=y_train,\n",
637 | " cmap='seismic', alpha=0.5);"
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": null,
643 | "metadata": {},
644 | "outputs": [],
645 | "source": [
646 | "from sklearn.cluster import KMeans"
647 | ]
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": null,
652 | "metadata": {},
653 | "outputs": [],
654 | "source": [
655 | "model = KMeans(n_clusters=4)\n",
656 | "model.fit(X_train)"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": null,
662 | "metadata": {},
663 | "outputs": [],
664 | "source": [
665 | "y_pred = model.predict(X_test)"
666 | ]
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": null,
671 | "metadata": {},
672 | "outputs": [],
673 | "source": [
674 | "plt.scatter(X_test[:, 0], X_test[:, 1],\n",
675 | " c=y_pred,\n",
676 | " cmap='seismic', alpha=0.5);"
677 | ]
678 | },
679 | {
680 | "cell_type": "markdown",
681 | "metadata": {},
682 | "source": [
683 | "---\n",
684 | "When you complete this exercise, please put your **green** post-it on your monitor. \n",
685 | "\n",
686 | "If you want to continue on at your own-pace, please feel free to do so.\n",
687 | "\n",
688 | "
"
689 | ]
690 | },
691 | {
692 | "cell_type": "markdown",
693 | "metadata": {},
694 | "source": [
695 | "# References\n",
696 | "---"
697 | ]
698 | },
699 | {
700 | "cell_type": "markdown",
701 | "metadata": {},
702 | "source": [
703 | "Below are references that may assist you in learning more:\n",
704 | " \n",
705 | "|Title (link)|Comments|\n",
706 | "|---|---|\n",
707 | "|[General API Reference](https://scikit-learn.org/stable/modules/classes.html)||\n",
708 | "|[KMeans API Reference](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans)||\n",
709 | "|[User Guide](https://scikit-learn.org/stable/modules/clustering.html#k-means)||\n",
710 | "|[Sample datasets](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets)|Load or create datasets for practice and study|\n",
711 | "|[Make blobs](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html#sklearn.datasets.make_blobs)|Specifically make clusters of values|"
712 | ]
713 | }
714 | ],
715 | "metadata": {
716 | "kernelspec": {
717 | "display_name": "Python 3",
718 | "language": "python",
719 | "name": "python3"
720 | },
721 | "language_info": {
722 | "codemirror_mode": {
723 | "name": "ipython",
724 | "version": 3
725 | },
726 | "file_extension": ".py",
727 | "mimetype": "text/x-python",
728 | "name": "python",
729 | "nbconvert_exporter": "python",
730 | "pygments_lexer": "ipython3",
731 | "version": "3.6.7"
732 | }
733 | },
734 | "nbformat": 4,
735 | "nbformat_minor": 2
736 | }
737 |
--------------------------------------------------------------------------------
/01_intro_to_sklearn/01_intro_to_sklearn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Welcome to the Dark Art of Coding:\n",
8 | "## Introduction to Machine Learning\n",
9 | "Intro to Scikit-Learn\n",
10 | "\n",
11 | "
"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Objectives\n",
19 | "---"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "In this session, students should expect to:\n",
27 | "\n",
28 | "* Explore machine learning techniques, tools and categories\n",
29 | " * Supervised learning\n",
30 | " * Unsupervised learning\n",
31 | " * Classification\n",
32 | " * Regression\n",
33 | " * Clustering\n",
34 | " * Dimensionality reduction\n",
35 | "* Review key characteristics of Scikit-Learn, especially the application programming interface (API)"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "# Machine Learning Techniques, Tools and Categories\n",
43 | "---"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "Machine learning falls into two main categories: **supervised learning** and **unsupervised learning**."
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "## Supervised learning"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "Supervised learning is the process of modeling the relationship between features of a dataset and targets (labels) associated with each sample of that dataset. With a model in hand, it is possible to use the model to either assign labels to a new dataset that doesn't yet have labels or calculate output values. The most common examples of supervised learning include: **classification** and **regression**."
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "## Classification"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "Classification allows you to assign discrete **labels or categories** to new input data.\n",
79 | "\n",
80 | "|Inputs|Classification|\n",
81 | "|:---|:---|\n",
82 | "|Texts, emails, or comments|Spam detection|\n",
83 | "|Flowers, insects, or animals|Species detection|\n",
84 | "|Viewers, readers, buyers|Customer detection|"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "## Regression"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "Regression analysis allows you to predict **continuous quantities** based on new input data. \n",
99 | "\n",
100 | "|Inputs|Outputs|\n",
101 | "|:---|:---|\n",
102 | "|Auto characteristics (color, model, age, etc)|Price|\n",
103 | "|Advertising dollars spent|Sales revenue|\n",
104 | "|Candidate characteristics|Salary|"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "## Unsupervised learning"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "Unsupervised learning is the process of modeling relationships amongst features of a dataset in a way that classifies the raw data without supplying any input labels. There are many algorithms that enable relationships to be identified and each of these models seek to replicate human logic in finding patterns in data. Two of the most common unsupervised learning approaches are **clustering** and **dimensionality reduction**."
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "## Clustering"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "Cluster analysis or clustering is a technique for grouping a collection of objects so that all the objects in a single cluster are more similar to each other than to objects in other clusters.\n",
133 | "\n",
134 | "|Inputs|Classification|\n",
135 | "|:---|:---|\n",
136 | "|Images|Grouping/categorization|\n",
137 | "|Marketing data|Customer segmentation|\n",
138 | "|Social network data|Community classification|"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "## Dimensionality reduction"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "Dimensionality reduction (also dimension reduction) is the process of reducing the number of random variables in a dataset by identifying a set of principal variables. Dimensionality reduction can be used for feature selection or feature extraction.\n",
153 | "\n",
154 | "As an example, presume you have a dataset with 10 features for coffees:\n",
155 | "* Cup size\n",
156 | "* Roast (dark, etc)\n",
157 | "* Flavoring (nutmeg, vanilla, etc)\n",
158 | "* Country of origin\n",
159 | "* Organic status (organic, not organic)\n",
160 | "* Sustainability status (sustainably harvested?)\n",
161 | "* Preparation (espresso, latte, etc)\n",
162 | "* etc\n",
163 | "\n",
164 | "If, through dimensionality reduction, we can determine that the most influential determinant of whether a coffee will sell well is cup size, roast, flavoring, and preparation, we may be able to speed up our analysis OR reduce our computational overhead by reducing the 10 features down to three OR four. \n",
165 | "\n",
166 | "In some cases, data analysis such as regression or classification can be done in the reduced space more easily and/or accurately than in the original space.\n",
167 | "\n",
168 | "Some benefits from using dimensionality reduction include:\n",
169 | "\n",
170 | "* It reduces the computation time and storage space requirements\n",
171 | "* It can enable easier data visualization if the dimensions can be reduced to much lower dimensions like 2D/3D\n",
172 | "* It can improve the interpretation of the parameters of a machine learning model\n",
173 | "* It helps to avoid issues related to increase in data sparsity as data volume increases ([curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality))"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "# Key Characteristics of Scikit-Learn\n",
181 | "---"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "Scikit-Learn is a well known package that provides access to many common machine learning algorithms through a consistent, well-organized Application Programming Interface (API) and is supported by very thorough and comprehensive documentation.\n",
189 | "\n",
190 | "The uniform syntax and the consistency in how the API is designed means that once you learn one model, it is surprisingly easy to pick up additional models.\n",
191 | "\n",
192 | "A key goal for this workshop is for you to walk away:\n",
193 | "\n",
194 | "* understanding the API\n",
195 | "* with an improved knowledge of the vocabulary of machine learning\n",
196 | "* knowing how to learn more\n",
197 | "\n",
198 | "If we succeed in these goals, you will be well-poised to continue your journey and to pursue future studies in the awesomeness that is machine learning."
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {},
204 | "source": [
205 | "# The Scikit-Learn API\n",
206 | "---"
207 | ]
208 | },
209 | {
210 | "cell_type": "markdown",
211 | "metadata": {},
212 | "source": [
213 | "The Scikit-Learn interface follows a number of guidelines covered in the API Contract (as defined in the [**API design paper**](https://arxiv.org/abs/1309.0238)). Quoting from that paper:\n",
214 | "\n",
215 | "> As much as possible, our design choices have been guided so as to avoid the\n",
216 | "proliferation of framework code. We try to adopt simple conventions and to\n",
217 | "limit to a minimum the number of methods an object must implement. The API\n",
218 | "is designed to adhere to the following broad principles:\n",
219 | "\n",
220 | "> **Consistency**. All objects (basic or composite) share a consistent interface composed of a limited set of methods. This interface is documented in a consistent manner for all objects.\n",
221 | "\n",
222 | "> **Inspection**. Constructor parameters and parameter values determined by learning algorithms are stored and exposed as public attributes.\n",
223 | "\n",
224 | "> **Non-proliferation of classes**. Learning algorithms are the only objects to be\n",
225 | "represented using custom classes. Datasets are represented as NumPy arrays\n",
226 | "or SciPy sparse matrices. Hyper-parameter names and values are represented\n",
227 | "as standard Python strings or numbers whenever possible. This keeps scikit-learn easy to use and easy to combine with other libraries.\n",
228 | "\n",
229 | "> **Composition**. Many machine learning tasks are expressible as sequences or\n",
230 | "combinations of transformations to data. Some learning algorithms are also\n",
231 | "naturally viewed as meta-algorithms parametrized on other algorithms. Whenever feasible, such algorithms are implemented and composed from existing\n",
232 | "building blocks.\n",
233 | "\n",
234 | "> **Sensible defaults**. Whenever an operation requires a user-defined parameter,\n",
235 | "an appropriate default value is defined by the library. The default value\n",
236 | "should cause the operation to be performed in a sensible way (giving a baseline solution for the task at hand)."
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {},
242 | "source": [
243 | "For some concrete details on how the API is put together: **[Contributors API Overview](https://scikit-learn.org/stable/developers/contributing.html#api-overview)**"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "## Using the Scikit-Learn API"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "By and large, using any given model in Scikit-Learn will follow a set of straightforward steps. Each of our examples will follow what I call **The Process™**.\n",
258 | "\n",
259 | "1. **Prep the data**: the data must be well prepared for it to be usable in the various models. This preparation may include normalization, cleansing, wrangling of the data. It often needs to be separated into a `features` matrix and a `target` vector (array) and/or may need to be broken into separate collections of data for training versus testing purposes.\n",
260 | "\n",
261 | "1. **Choose the model**: to choose a model, we will import the appropriate estimator class\n",
262 | "\n",
263 | "1. **Choose appropriate hyperparameters**: to prepare the model, we create a class instance and provide hyperparameters as arguments to the class\n",
264 | "\n",
265 | "1. **Fit the model**: to fit the model to the existing data, we call the `.fit()` method on the model instance and provide training data\n",
266 | "\n",
267 | "1. **Apply the model**: next, we apply the model to new data, primarily by calling one of two methods:\n",
268 | "\n",
269 | " * **Supervised learning**: generally, we use the `.predict()` method to predict new labels\n",
270 | " * **Unsupervised learning**: generally, we use either the `.predict()` or `.transform()` methods to predict properties OR transform properties of the data.\n",
271 | " \n",
272 | "1. **Examine the results**: lastly, it is recommended that we look over the results and do a sanity check. Some of this can be done by simply looking at output values. Other times it really helps to have some form of data visualization (i.e. graph/chart) to help us examine the model predictions or transformations."
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "## A quick demo"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "To whet our appetite for what's to come, we will take a quick look at coffee prices near the North Shore of Oahu, Hawaii. Our goal will be to predict the price of a cup of coffee, given a cup size.\n",
287 | "\n",
288 | "These prices come from several coffee shops in the area, in 2019.\n",
289 | "\n",
290 | "|Size (oz)|Price ($)|\n",
291 | "|----|----|\n",
292 | "|12|2.95|\n",
293 | "|16|3.65|\n",
294 | "|20|4.15|\n",
295 | "|14|3.25|\n",
296 | "|18|4.20|\n",
297 | "|20|4.00|\n"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "### Prep the data"
305 | ]
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "metadata": {},
310 | "source": [
311 | "Let's look at the data in a simple scatter plot to compare the cost of coffee versus the size of the cup."
312 | ]
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "We start with a set of standard imports..."
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "import matplotlib.pyplot as plt\n",
328 | "import numpy as np\n",
329 | "\n",
330 | "# NOTE: during the Choose the Model step, we will import the \n",
331 | "# model we want, but there is no reason you can't import it here.\n",
332 | "# from sklearn.linear_model import LinearRegression"
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "### Prep the training and test data"
340 | ]
341 | },
342 | {
343 | "cell_type": "markdown",
344 | "metadata": {},
345 | "source": [
346 | "**The training data**:\n",
347 | "\n",
348 | "We start off by making two `numpy` arrays."
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {},
355 | "outputs": [],
356 | "source": [
357 | "x_train = np.array([12, 16, 20, 14, 18, 20]) # Coffee sizes\n",
358 | "y_train = np.array([2.95, 3.65, 4.15, 3.25, 4.20, 4.00]) # Coffee prices"
359 | ]
360 | },
361 | {
362 | "cell_type": "markdown",
363 | "metadata": {},
364 | "source": [
365 | "Then we plot them using a `matplotlib` scatter plot."
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 | "plt.scatter(x_train, y_train);"
375 | ]
376 | },
377 | {
378 | "cell_type": "markdown",
379 | "metadata": {},
380 | "source": [
381 | "In order to put this data into a linear regression machine learning algorithm, we need to create our features matrix, which includes just our coffee sizes (`x_train` values).\n",
382 | "\n",
383 | "In this case, we will use one of the `numpy` techniques to increase the dimensionality of the `x_train` array. We will discuss this process in greater detail in a few minutes.\n",
384 | "```\n",
385 | "X_train = x_train[:, np.newaxis]\n",
386 | "```\n",
387 | "\n",
388 | "We will call our training set: `X_train` (with an upper case `X`)."
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "X_train = x_train[:, np.newaxis] # creates an array of arrays\n",
398 | "X_train"
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "metadata": {},
404 | "source": [
405 | "Our target values are generally labeled `y_train` (with a lower case `y`) and these values can be a simple array."
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": null,
411 | "metadata": {},
412 | "outputs": [],
413 | "source": [
414 | "y_train"
415 | ]
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "metadata": {},
420 | "source": [
421 | "**Now, the test data**:"
422 | ]
423 | },
424 | {
425 | "cell_type": "markdown",
426 | "metadata": {},
427 | "source": [
428 | "We need to have some test data to see what values the model will predict. Let's presume that some friends will be coming to the North Shore of Oahu and want to buy some coffee in various sizes, including some potentially unusual sizes.\n",
429 | "\n",
430 | "Based on their requests, we prep several cup sizes to see what price the model will predict.\n",
431 | "\n",
432 | "We generate a set of `x_test` values (representing size in oz.) in an array. Then we convert the array to a 2D matrix for inclusion as an argument when we get to the prediction phase. As noted above, we will discuss this in detail shortly."
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": null,
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "x_test = np.array([16, 15, 12, 20, 17])"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "metadata": {},
448 | "outputs": [],
449 | "source": [
450 | "X_test = x_test[:, None] # None will accomplish the same\n",
451 | "X_test # outcome as np.newaxis"
452 | ]
453 | },
454 | {
455 | "cell_type": "markdown",
456 | "metadata": {},
457 | "source": [
458 | "### Choose the Model"
459 | ]
460 | },
461 | {
462 | "cell_type": "markdown",
463 | "metadata": {},
464 | "source": [
465 | "For this quick example, we are gonna import a simple **linear regression** model from the sklearn collection of linear models."
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": null,
471 | "metadata": {},
472 | "outputs": [],
473 | "source": [
474 | "from sklearn.linear_model import LinearRegression"
475 | ]
476 | },
477 | {
478 | "cell_type": "markdown",
479 | "metadata": {},
480 | "source": [
481 | "### Choose Appropriate Hyperparameters"
482 | ]
483 | },
484 | {
485 | "cell_type": "markdown",
486 | "metadata": {},
487 | "source": [
488 | "This model comes, as do most of the models in sklearn, with arguments (or hyperparameters) set to sane defaults, so for this case, we won't add or change any arguments.\n",
489 | "\n",
490 | "**NOTE**: When Jupyter evaluates a model, it displays a string representation of that model with the current settings for the model, including any defaults."
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": null,
496 | "metadata": {},
497 | "outputs": [],
498 | "source": [
499 | "model = LinearRegression()\n",
500 | "model"
501 | ]
502 | },
503 | {
504 | "cell_type": "markdown",
505 | "metadata": {},
506 | "source": [
507 | "### Fit the model"
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {},
513 | "source": [
514 | "With a prepared model, we need to feed it data to evaluate. For this linear regression model, we give it two arguments: `X` and `y`."
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": null,
520 | "metadata": {},
521 | "outputs": [],
522 | "source": [
523 | "model.fit(X_train, y_train)"
524 | ]
525 | },
526 | {
527 | "cell_type": "markdown",
528 | "metadata": {},
529 | "source": [
530 | "With these inputs, the model was able to calculate the **slope** (coefficient) and the **y-intercept** of the line that aligns most closely with our training data.\n",
531 | "\n",
532 | "Let's look at both of these calculated results.\n",
533 | "\n",
534 | "```python\n",
535 | "model.coef_\n",
536 | "model.intercept_\n",
537 | "```\n",
538 | "\n",
539 | "**NOTE**: scikit-learn appends an `_` to the end of attributes that return **calculated** values. It does this to help distinguish between inputs and outputs"
540 | ]
541 | },
542 | {
543 | "cell_type": "code",
544 | "execution_count": null,
545 | "metadata": {},
546 | "outputs": [],
547 | "source": [
548 | "model.coef_"
549 | ]
550 | },
551 | {
552 | "cell_type": "code",
553 | "execution_count": null,
554 | "metadata": {},
555 | "outputs": [],
556 | "source": [
557 | "model.intercept_"
558 | ]
559 | },
560 | {
561 | "cell_type": "markdown",
562 | "metadata": {},
563 | "source": [
564 | "### Apply the model"
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": null,
570 | "metadata": {},
571 | "outputs": [],
572 | "source": [
573 | "y_pred = model.predict(X_test)\n",
574 | "y_pred\n",
575 | "\n",
576 | "# reminder, these were the test cup sizes: \n",
577 | "# [16, 15, 12, 20, 17]"
578 | ]
579 | },
580 | {
581 | "cell_type": "markdown",
582 | "metadata": {},
583 | "source": [
584 | "### Examine the Results"
585 | ]
586 | },
587 | {
588 | "cell_type": "markdown",
589 | "metadata": {},
590 | "source": [
591 | "From here, we can plot all of the data points together on one chart:\n",
592 | "\n",
593 | "* original values in purple\n",
594 | "* predicted values in red\n",
595 | "* predicted slope of the line that best fits the original training data"
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": null,
601 | "metadata": {},
602 | "outputs": [],
603 | "source": [
604 | "plt.scatter(X_train, y_train, color='rebeccapurple')\n",
605 | "plt.scatter(X_test, y_pred, color='red', alpha=0.20)"
606 | ]
607 | },
608 | {
609 | "cell_type": "code",
610 | "execution_count": null,
611 | "metadata": {},
612 | "outputs": [],
613 | "source": [
614 | "plt.scatter(X_train, y_train, color='rebeccapurple')\n",
615 | "plt.plot(X_test, y_pred, color='red');"
616 | ]
617 | },
618 | {
619 | "cell_type": "markdown",
620 | "metadata": {},
621 | "source": [
622 | "### Deep Dive"
623 | ]
624 | },
625 | {
626 | "cell_type": "markdown",
627 | "metadata": {},
628 | "source": [
629 | "**The scikit-learn API**: The scikit-learn API is very rich and has many well-thought-out approaches. The [API design contract document](https://arxiv.org/pdf/1309.0238.pdf) helps characterize some of the philosophy behind the tool, which I found to be useful in understanding how to use the tool. I probably wouldn't suggest reading the whole document right off the bat, but everything up to *Section 3 Advanced API* is a good overview of the philosophy, some aspects of data formats, and scikit-learn Estimators and Predictors.\n",
630 | "\n",
631 | "**Linear Regression**: We will revisit Linear Regression model in a later lesson and will provide more of a deep dive there."
632 | ]
633 | },
634 | {
635 | "cell_type": "markdown",
636 | "metadata": {},
637 | "source": [
638 | "### Gotchas"
639 | ]
640 | },
641 | {
642 | "cell_type": "markdown",
643 | "metadata": {},
644 | "source": [
645 | "A significant struggle for beginners can be ensuring that the data is in the right format. We will cover that topic in the next session."
646 | ]
647 | },
648 | {
649 | "cell_type": "markdown",
650 | "metadata": {},
651 | "source": [
652 | "### How to learn more: tips and tricks"
653 | ]
654 | },
655 | {
656 | "cell_type": "markdown",
657 | "metadata": {},
658 | "source": [
659 | "As we explore the Scikit-Learn API, and as we progress through the upcoming examples I want to pre-position you for success by showing you where and how you can learn more.\n",
660 | "\n",
661 | "One great resource to better understand the many options available to you in terms of the machine learning algorithms and the hyper parameters in scikit-learn is the API Reference. Throughout the following discussions, we will revisit the API reference repeatedly.\n",
662 | "\n",
663 | "**[API Reference](https://scikit-learn.org/stable/modules/classes.html)**: A one-stop shop for the classes and functions in `sklearn`"
664 | ]
665 | },
666 | {
667 | "cell_type": "markdown",
668 | "metadata": {},
669 | "source": [
670 | "# Experience Points!\n",
671 | "---"
672 | ]
673 | },
674 | {
675 | "cell_type": "markdown",
676 | "metadata": {
677 | "slideshow": {
678 | "slide_type": "slide"
679 | }
680 | },
681 | "source": [
682 | "**Task 01**\n",
683 | "\n",
684 | "* Open the API Reference (mentioned above) and find the section on `model_selection.train_test_split`\n",
685 | "* Review that section (at a high level) for about 2 minutes looking for the following:\n",
686 | " * Make notes of any words that you aren't familiar with. See if you hear them later in this tutorial\n",
687 | " * Explore the section that describes what type of data the `train_test_split()` function returns. What will you get back?"
688 | ]
689 | },
690 | {
691 | "cell_type": "markdown",
692 | "metadata": {},
693 | "source": [
694 | "---\n",
695 | "When you complete this exercise, please put your **green** post-it on your monitor. \n",
696 | "\n",
697 | "If you want to continue on at your own-pace, please feel free to do so.\n",
698 | "\n",
699 | "<img src='../universal_images/green_sticky.300px.png' width='200'>"
700 | ]
701 | },
702 | {
703 | "cell_type": "markdown",
704 | "metadata": {},
705 | "source": [
706 | "# References\n",
707 | "---"
708 | ]
709 | },
710 | {
711 | "cell_type": "markdown",
712 | "metadata": {},
713 | "source": [
714 | "Below are references that may assist you in learning more:\n",
715 | " \n",
716 | "|Title (link)|Comments|\n",
717 | "|---|---|\n",
718 | "|[API Reference](https://scikit-learn.org/stable/modules/classes.html)|One stop shop for the classes and functions in `sklearn`|\n",
719 | "|[Contributors API Overview](https://scikit-learn.org/stable/developers/contributing.html#api-overview)|Overview of the API for contributors to scikit learn|\n",
720 | "|[API design contract](https://arxiv.org/abs/1309.0238)|An overview of the philosophy behind the API design|\n",
721 | "|[Regression Analysis](https://en.wikipedia.org/wiki/Regression_analysis)|An article on regression analysis|\n",
722 | "|[Cluster analysis](https://en.wikipedia.org/wiki/Cluster_analysis)|An article on cluster analysis|\n",
723 | "|[curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality)|An article on the curse of dimensionality|"
724 | ]
725 | }
726 | ],
727 | "metadata": {
728 | "kernelspec": {
729 | "display_name": "Python 3",
730 | "language": "python",
731 | "name": "python3"
732 | },
733 | "language_info": {
734 | "codemirror_mode": {
735 | "name": "ipython",
736 | "version": 3
737 | },
738 | "file_extension": ".py",
739 | "mimetype": "text/x-python",
740 | "name": "python",
741 | "nbconvert_exporter": "python",
742 | "pygments_lexer": "ipython3",
743 | "version": "3.6.7"
744 | }
745 | },
746 | "nbformat": 4,
747 | "nbformat_minor": 2
748 | }
749 |
--------------------------------------------------------------------------------
/06_special_topics/06_special_topics.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Welcome to the Dark Art of Coding:\n",
8 | "## Introduction to Machine Learning\n",
9 | "Special Topics\n",
10 | "\n",
  11 | "<img src='../universal_images/dark_art_logo.600px.png' width='300'>"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Objectives\n",
19 | "---"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "In this session, students should expect to:\n",
27 | "\n",
28 | "* Understand the use of the `PolynomialFeatures()` method\n",
29 | "* Explore the use of `Pipelines` to create a workflow of transforms in combination with a final estimator\n",
30 | "* Use `PolynomialFeatures` in a `Pipeline` to explore underfitting and overfitting"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "# Overview: PolynomialFeatures\n",
38 | "---"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## PolynomialFeatures\n",
46 | "\n",
47 | "The PolynomialFeature class has a `.fit_transform()` method that transforms input values into a series of output values. These values are often used as inputs in other models.\n",
48 | "\n",
49 | "PolynomialFeatures generates a new feature matrix that has all the polynomial combinations of the original features with a degree less than or equal to the specified degree. \n",
50 | "\n",
51 | "As an example: \n",
52 | "\n",
53 | "An input sample has two dimensions (i.e. $[a, b]$) the resulting degree-2 polynomial features will be $[1, a, b, a^2, ab, b^2]$."
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "We start with some standard imports:\n",
61 | " "
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "import matplotlib.pyplot as plt\n",
71 | "import numpy as np\n",
72 | "import pandas as pd\n",
73 | "import sklearn\n",
74 | "from sklearn.preprocessing import PolynomialFeatures"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "Let's start with a three element matrix:"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "X = np.arange(3).reshape(3, 1)\n",
91 | "X"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "The simplest PolynomialFeatures is simply to return the original array, but notice that in this case, the function returns a column of `1`s as well as the original matrix."
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "poly = PolynomialFeatures(1)\n",
108 | "poly.fit_transform(X)"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "Yields $1, a$ for each element in the X matrix"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "If you want to have a features matrix that doesn't include the column of `1`s, you can avoid it by using the `include_bias=False` argument.\n",
123 | "\n",
124 | "Including a bias column acts as an intercept term in a linear model."
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "poly = PolynomialFeatures(1, include_bias=False)\n",
134 | "poly.fit_transform(X)"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "poly = PolynomialFeatures(2)\n",
144 | "poly.fit_transform(X)"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "Yields $1, a, a^2$ for each element in the X matrix"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "poly = PolynomialFeatures(4)\n",
161 | "poly.fit_transform(X)"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "Yields $1, a, a^2, a^3, a^4$ for each element in the X matrix"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": [
177 | "X2 = np.arange(6).reshape(3, 2)\n",
178 | "X2"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "poly = PolynomialFeatures(1)\n",
188 | "poly.fit_transform(X2)"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "Yields $1, a, b$ for each element in the X matrix"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "poly = PolynomialFeatures(2)\n",
205 | "poly.fit_transform(X2)"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {},
211 | "source": [
212 | "Yields $1, a, b, a^2, ab, b^2$ for each element in the X matrix"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "poly = PolynomialFeatures(3)\n",
222 | "poly.fit_transform(X2)\n",
223 | "\n",
224 | "# 1 a b a^2 ab b^2 a^3 a^2*b a*b^2 b^3"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "Yields $1, a, b, a^2, ab, b^2, a^3, a^2b, ab^2, b^3$ for each element in the X matrix"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {},
237 | "source": [
238 | "Thus for any `degree` that we feed into the PolynomialFeature model, we can transform an input matrix into a higher order matrix that may allow for potentially more precise calculations of `y` values, given values of `x`."
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "Why does this matter?... if you recall from your math days it is possible to create very sophisticated curves using formulas such as this:\n",
246 | "\n",
247 | "$$\n",
248 | "y = mx + b \\\\\n",
249 | "y = ax^2 + bx + c \\\\\n",
250 | "y = ax^3 + bx^2 + cx + d \\\\\n",
251 | "y = ax^4 + bx^3 + cx^2 + dx + e \\\\\n",
252 | "$$\n",
253 | "\n",
254 | "With every additional argument and with the appropriate slopes, you have the ability to match a wide array of datasets.\n",
255 | "\n",
256 | "PolynomialFeatures helps you to generate matrices with multiple degrees so that you can run them through models like the LinearRegression model to identify the coefficients and intercept values for equations that resemble those above."
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "With that very brief intro to `PolynomialFeatures`, we will turn our attention to a new topic, **Pipelines**, but will come back to `PolynomialFeatures` momentarily."
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "# Overview: Pipelines\n",
271 | "---"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "In some cases, it might be necessary to transform the data in some way before feeding it into a particular machine learning model.\n",
279 | "\n",
280 | "The data may need to be:\n",
 281 | "* scaled (i.e. using `StandardScaler`)\n",
 282 | "* changed into another format (i.e. using `PolynomialFeatures` or `CountVectorizer`)\n",
283 | "* normalized (i.e. using `TfidfTransformer`)\n",
284 | "\n",
285 | "In the example we just looked at, we used a `PolynomialFeatures` function to generate a higher degree matrix.\n",
286 | "\n",
287 | "Pipelines allow you to feed inputs into one \"end\" of a series of components and get transformations or predictions out the other end, without having to take the output of one model and manually drop it into the inputs of the next model.\n",
288 | "\n",
289 | "The following example uses the `PolynomialFeatures` model to transform inputs from a degree 1 polynomial into higher degree polynomials. It then takes the results of those transformations and then feeds them into the `LinearRegression` model. \n",
290 | "\n",
291 | "The `Pipeline` simplifies things so that we only have to call `.fit()` once on the pipeline."
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "## A first trivial example..."
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "## Prep the data\n",
306 | "\n",
307 | "Start with some standard imports"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "import matplotlib.pyplot as plt\n",
317 | "import numpy as np\n",
318 | "import pandas as pd\n",
319 | "import sklearn"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {},
325 | "source": [
326 | "### Prep the training and test data"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "df = pd.read_csv('../universal_datasets/skincancer.txt',\n",
336 | " delim_whitespace=True,\n",
337 | " header=0,\n",
338 | " names=['state', 'lat', 'mort', 'ocean', 'long'])\n",
339 | "df.head()"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "X = df['lat'].to_frame()\n",
349 | "y = df['mort']"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "from sklearn.model_selection import train_test_split"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {},
365 | "outputs": [],
366 | "source": [
367 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "plt.scatter(X_train, y_train)\n",
377 | "plt.title(\"Mortality vs Latitude\")\n",
378 | "plt.xlabel(\"Latitude\")\n",
379 | "plt.ylabel(\"Number of deaths\");"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "metadata": {},
386 | "outputs": [],
387 | "source": [
388 | "from sklearn.linear_model import LinearRegression\n",
389 | "from sklearn.preprocessing import PolynomialFeatures\n",
390 | "from sklearn.pipeline import Pipeline"
391 | ]
392 | },
393 | {
394 | "cell_type": "markdown",
395 | "metadata": {},
396 | "source": [
397 | "**NOTE**: for this example, we are simply gonna regurgitate the input data rather than change the degree, so we choose to use a `degree=1` and to avoid the bias column (column of `1`s), we set `include_bias=False`. In a moment, we will look at tweaking the degree to explore underfitting and overfitting. In this first example, I merely want to focus on putting the `Pipeline` together."
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": null,
403 | "metadata": {},
404 | "outputs": [],
405 | "source": [
406 | "polynomial_features = PolynomialFeatures(degree=1,\n",
407 | " include_bias=False)"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": null,
413 | "metadata": {},
414 | "outputs": [],
415 | "source": [
416 | "linear_regression = LinearRegression()"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | "This is where the magic comes into play. By providing as an argument to the Pipeline constructor a list containing a series of tuples, we can establish which models to call and in what order.\n",
424 | "\n",
425 | "* Each tuple is a step in the pipeline.\n",
426 | "* Each tuple is comprised of a name for that step and the function or model to call during that step.\n",
427 | "* Each step should be sequentially in the order we want\n",
428 | "* Every step, except for the last step must have either a `.transform()` OR `.fit_transform()` method. As we have seen, `PolynomialFeatures` does indeed have a `.fit_transform()` method."
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": null,
434 | "metadata": {},
435 | "outputs": [],
436 | "source": [
437 | "pipeline = Pipeline([(\"poly_f\", polynomial_features),\n",
438 | " (\"linear_r\", linear_regression)])"
439 | ]
440 | },
441 | {
442 | "cell_type": "markdown",
443 | "metadata": {},
444 | "source": [
445 | "NOTE: in the next cell, we simply call `.fit()` on the Pipeline. We don't have to call the `fit_transform()` method on the PolynomialFeatures at all, the Pipeline does it automagically."
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": null,
451 | "metadata": {},
452 | "outputs": [],
453 | "source": [
454 | "pipeline.fit(X_train, y_train)"
455 | ]
456 | },
457 | {
458 | "cell_type": "markdown",
459 | "metadata": {},
460 | "source": [
461 | "Now that our model has been fit, we simply call `.predict()`, like normal."
462 | ]
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": null,
467 | "metadata": {},
468 | "outputs": [],
469 | "source": [
 470 | "y_pred = pipeline.predict(X_test)"
 471 | ]
 472 | },
 473 | {
 474 | "cell_type": "markdown",
 475 | "metadata": {},
 476 | "source": [
 477 | "Of course, let's take a quick look via a chart."
 478 | ]
 479 | },
 480 | {
 481 | "cell_type": "code",
 482 | "execution_count": null,
 483 | "metadata": {},
 484 | "outputs": [],
 485 | "source": [
 486 | "plt.plot(X_test, y_pred, label=\"Model\")\n",
 487 | "plt.scatter(X_train, y_train);"
488 | ]
489 | },
490 | {
491 | "cell_type": "markdown",
492 | "metadata": {},
493 | "source": [
494 | "## An example of under/overfitting"
495 | ]
496 | },
497 | {
498 | "cell_type": "markdown",
499 | "metadata": {},
500 | "source": [
501 | "Now that we have a sense for how we can use a Pipeline, we are gonna create one and use it to explore the phenomena of **Underfitting** and **Overfitting**."
502 | ]
503 | },
504 | {
505 | "cell_type": "markdown",
506 | "metadata": {},
507 | "source": [
508 | "A risk in machine learning is using a model that doesn't match the data well enough (**underfitting**) OR matches the training data so well, that it doesn't apply well to test data, it only applies to the training data (**overfitting**).\n",
509 | "\n",
510 | "\n",
511 | "\n",
 512 | "<img src='http://cdn.stylefrizz.com/img/human-body-shape-matress.jpg' width='400'>\n",
513 | "\n",
514 | "\n",
515 | "Image source: [Overfitting mattress](http://cdn.stylefrizz.com/img/human-body-shape-matress.jpg)\n",
516 | "\n",
517 | "\n",
518 | "For this example, we will look at three graphs. This example comes from the Scikit Learn [Underfitting/Overfitting documentation](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html), with various modifications by me."
519 | ]
520 | },
521 | {
522 | "cell_type": "markdown",
523 | "metadata": {},
524 | "source": [
525 | "We will do this process three times using `degree=` of `1`, `4`, and `15` to demonstrate underfitting, a good fit, and overfitting.\n",
526 | "\n",
527 | "Two of these cases will generate linear regressions that are not straight lines."
528 | ]
529 | },
530 | {
531 | "cell_type": "markdown",
532 | "metadata": {},
533 | "source": [
534 | "## Prep the training and test data"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": null,
540 | "metadata": {},
541 | "outputs": [],
542 | "source": [
543 | "from sklearn.pipeline import Pipeline\n",
544 | "from sklearn.preprocessing import PolynomialFeatures\n",
545 | "from sklearn.linear_model import LinearRegression"
546 | ]
547 | },
548 | {
549 | "cell_type": "markdown",
550 | "metadata": {},
551 | "source": [
552 | "In the example, they create a function (`true_fun`) that generates a series of points on a graph in the shape of a Cosine."
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": null,
558 | "metadata": {},
559 | "outputs": [],
560 | "source": [
561 | "def true_fun(X):\n",
562 | " return np.cos(1.5 * np.pi * X)"
563 | ]
564 | },
565 | {
566 | "cell_type": "markdown",
567 | "metadata": {},
568 | "source": [
569 | "Using 30 random values as `X` inputs, they use the function to generate 30 related `y` values."
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": null,
575 | "metadata": {},
576 | "outputs": [],
577 | "source": [
578 | "np.random.seed(0)\n",
579 | "\n",
580 | "n_samples = 30\n",
581 | "\n",
582 | "x = np.sort(np.random.rand(n_samples))\n",
583 | "y = true_fun(x) + np.random.randn(n_samples) * 0.1"
584 | ]
585 | },
586 | {
587 | "cell_type": "markdown",
588 | "metadata": {},
589 | "source": [
590 | "Let's look at X and y."
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": null,
596 | "metadata": {},
597 | "outputs": [],
598 | "source": [
599 | "X = x[:, np.newaxis]\n",
600 | "\n",
601 | "X[:5]"
602 | ]
603 | },
604 | {
605 | "cell_type": "code",
606 | "execution_count": null,
607 | "metadata": {},
608 | "outputs": [],
609 | "source": [
610 | "y[:5]"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": null,
616 | "metadata": {},
617 | "outputs": [],
618 | "source": [
619 | "plt.scatter(X, y)\n",
620 | "plt.title(\"Cosine Dots\");"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": null,
626 | "metadata": {},
627 | "outputs": [],
628 | "source": [
629 | "X_test = np.linspace(0.05, 1, 100)[:, np.newaxis]"
630 | ]
631 | },
632 | {
633 | "cell_type": "markdown",
634 | "metadata": {},
635 | "source": [
636 | "## Choose Appropriate Hyperparameters"
637 | ]
638 | },
639 | {
640 | "cell_type": "markdown",
641 | "metadata": {},
642 | "source": [
643 | "Let's:\n",
644 | "* start with PolynomialFeatures **degree of 1**\n",
645 | "* use the default values for LinearRegression\n",
646 | "* feed each into our Pipeline"
647 | ]
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": null,
652 | "metadata": {},
653 | "outputs": [],
654 | "source": [
655 | "polynomial_features = PolynomialFeatures(degree=1,\n",
656 | " include_bias=False)\n",
657 | "linear_regression = LinearRegression()\n",
658 | "pipeline = Pipeline([(\"polynomial_features\", polynomial_features),\n",
659 | " (\"linear_regression\", linear_regression)])"
660 | ]
661 | },
662 | {
663 | "cell_type": "markdown",
664 | "metadata": {},
665 | "source": [
666 | "## Fit the Model"
667 | ]
668 | },
669 | {
670 | "cell_type": "markdown",
671 | "metadata": {},
672 | "source": [
673 | "We only have to call `.fit()` on the pipeline, not on each of the components in the pipeline."
674 | ]
675 | },
676 | {
677 | "cell_type": "code",
678 | "execution_count": null,
679 | "metadata": {},
680 | "outputs": [],
681 | "source": [
682 | "pipeline.fit(X, y)"
683 | ]
684 | },
685 | {
686 | "cell_type": "markdown",
687 | "metadata": {},
688 | "source": [
689 | "## Apply the Model"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": null,
695 | "metadata": {},
696 | "outputs": [],
697 | "source": [
698 | "y_test = pipeline.predict(X_test)"
699 | ]
700 | },
701 | {
702 | "cell_type": "markdown",
703 | "metadata": {},
704 | "source": [
705 | "## Examine the results"
706 | ]
707 | },
708 | {
709 | "cell_type": "code",
710 | "execution_count": null,
711 | "metadata": {},
712 | "outputs": [],
713 | "source": [
714 | "plt.plot(X_test, y_test, label=\"Model\")\n",
715 | "plt.plot(X_test, true_fun(X_test), label=\"True function\")\n",
716 | "\n",
717 | "plt.scatter(X, y, edgecolor='b', s=20, label=\"Samples\")\n",
718 | "plt.legend()\n",
719 | "plt.title(\"Underfit\"); "
720 | ]
721 | },
722 | {
723 | "cell_type": "markdown",
724 | "metadata": {},
725 | "source": [
726 | "## Choose Appropriate Hyperparameters"
727 | ]
728 | },
729 | {
730 | "cell_type": "markdown",
731 | "metadata": {},
732 | "source": [
733 | "Repeating the process to generate polynomial features of **degree 4**:"
734 | ]
735 | },
736 | {
737 | "cell_type": "code",
738 | "execution_count": null,
739 | "metadata": {},
740 | "outputs": [],
741 | "source": [
742 | "polynomial_features = PolynomialFeatures(degree=4,\n",
743 | " include_bias=False)\n",
744 | "linear_regression = LinearRegression()\n",
745 | "pipeline = Pipeline([(\"polynomial_features\", polynomial_features),\n",
746 | " (\"linear_regression\", linear_regression)])"
747 | ]
748 | },
749 | {
750 | "cell_type": "markdown",
751 | "metadata": {},
752 | "source": [
753 | "## Fit the Model"
754 | ]
755 | },
756 | {
757 | "cell_type": "code",
758 | "execution_count": null,
759 | "metadata": {},
760 | "outputs": [],
761 | "source": [
762 | "pipeline.fit(X, y)"
763 | ]
764 | },
765 | {
766 | "cell_type": "markdown",
767 | "metadata": {},
768 | "source": [
769 | "## Apply the Model"
770 | ]
771 | },
772 | {
773 | "cell_type": "code",
774 | "execution_count": null,
775 | "metadata": {},
776 | "outputs": [],
777 | "source": [
778 | "y_test = pipeline.predict(X_test)"
779 | ]
780 | },
781 | {
782 | "cell_type": "markdown",
783 | "metadata": {},
784 | "source": [
785 | "## Examine the results"
786 | ]
787 | },
788 | {
789 | "cell_type": "code",
790 | "execution_count": null,
791 | "metadata": {},
792 | "outputs": [],
793 | "source": [
794 | "plt.plot(X_test, y_test, label=\"Model\")\n",
795 | "plt.plot(X_test, true_fun(X_test), label=\"True function\")\n",
796 | "plt.scatter(X, y, edgecolor='b', s=20, label=\"Samples\")\n",
797 | "plt.legend()\n",
798 | "plt.title(\"Good match\"); "
799 | ]
800 | },
801 | {
802 | "cell_type": "markdown",
803 | "metadata": {},
804 | "source": [
805 | "## Choose Appropriate Hyperparameters"
806 | ]
807 | },
808 | {
809 | "cell_type": "markdown",
810 | "metadata": {},
811 | "source": [
812 | "Lastly, let's generate polynomial features of **degree 15**:"
813 | ]
814 | },
815 | {
816 | "cell_type": "code",
817 | "execution_count": null,
818 | "metadata": {},
819 | "outputs": [],
820 | "source": [
821 | "polynomial_features = PolynomialFeatures(degree=15,\n",
822 | " include_bias=False)\n",
823 | "linear_regression = LinearRegression()\n",
824 | "pipeline = Pipeline([(\"polynomial_features\", polynomial_features),\n",
825 | " (\"linear_regression\", linear_regression)])"
826 | ]
827 | },
828 | {
829 | "cell_type": "markdown",
830 | "metadata": {},
831 | "source": [
832 | "## Fit the Model"
833 | ]
834 | },
835 | {
836 | "cell_type": "code",
837 | "execution_count": null,
838 | "metadata": {},
839 | "outputs": [],
840 | "source": [
841 | "pipeline.fit(X, y)"
842 | ]
843 | },
844 | {
845 | "cell_type": "markdown",
846 | "metadata": {},
847 | "source": [
848 | "## Apply the Model"
849 | ]
850 | },
851 | {
852 | "cell_type": "code",
853 | "execution_count": null,
854 | "metadata": {},
855 | "outputs": [],
856 | "source": [
857 | "y_test = pipeline.predict(X_test)"
858 | ]
859 | },
860 | {
861 | "cell_type": "markdown",
862 | "metadata": {},
863 | "source": [
864 | "## Examine the results"
865 | ]
866 | },
867 | {
868 | "cell_type": "code",
869 | "execution_count": null,
870 | "metadata": {},
871 | "outputs": [],
872 | "source": [
873 | "plt.plot(X_test, y_test, label=\"Model\")\n",
874 | "plt.plot(X_test, true_fun(X_test), label=\"True function\")\n",
875 | "plt.scatter(X, y, edgecolor='b', s=20, label=\"Samples\")\n",
876 | "plt.legend()\n",
877 | "plt.title(\"Overfit\"); "
878 | ]
879 | },
880 | {
881 | "cell_type": "markdown",
882 | "metadata": {},
883 | "source": [
884 | "# Gotchas\n",
885 | "---"
886 | ]
887 | },
888 | {
889 | "cell_type": "markdown",
890 | "metadata": {},
891 | "source": [
892 | "N/A"
893 | ]
894 | },
895 | {
896 | "cell_type": "markdown",
897 | "metadata": {},
898 | "source": [
899 | "# Deep Dive\n",
900 | "---"
901 | ]
902 | },
903 | {
904 | "cell_type": "markdown",
905 | "metadata": {},
906 | "source": [
907 | "N/A"
908 | ]
909 | },
910 | {
911 | "cell_type": "markdown",
912 | "metadata": {},
913 | "source": [
914 | "# How to learn more: tips and hints\n",
915 | "---"
916 | ]
917 | },
918 | {
919 | "cell_type": "markdown",
920 | "metadata": {},
921 | "source": [
922 | "\n",
 923 | "**Tear apart the examples**: The [original example](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html) showing underfitting/overfitting was a bit more complicated than what I showed here, because they opted to create a three panel chart in `matplotlib` and to automate the processing by putting the degrees into a list and cycling through the list using a for loop to generate all the charts...\n",
924 | "\n",
925 | "I took individual lines, looked at each line, stripped away as much of the extraneous complications as I could to look at just the machine learning components and that greatly helped clarify what was going on."
926 | ]
927 | },
928 | {
929 | "cell_type": "markdown",
930 | "metadata": {},
931 | "source": [
932 | "# Experience Points!\n",
933 | "---"
934 | ]
935 | },
936 | {
937 | "cell_type": "markdown",
938 | "metadata": {
939 | "slideshow": {
940 | "slide_type": "slide"
941 | }
942 | },
943 | "source": [
944 | "**Task 01**\n",
945 | "\n",
946 | "\n",
947 | "Explore this documentation regarding underfitting/overfitting.\n",
948 | "\n",
949 | "[**Overfitting (link)**](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html)\n",
950 | "\n",
951 | "Answer this question:\n",
952 | "\n",
953 | "* What technique can be used to quantitatively evaluate underfitting/overfitting?"
954 | ]
955 | },
956 | {
957 | "cell_type": "markdown",
958 | "metadata": {},
959 | "source": [
960 | "---\n",
961 | "When you complete this exercise, please put your **green** post-it on your monitor. \n",
962 | "\n",
963 | "If you want to continue on at your own-pace, please feel free to do so.\n",
964 | "\n",
 965 | "<img src='../universal_images/green_sticky.300px.png' width='200'>"
966 | ]
967 | },
968 | {
969 | "cell_type": "markdown",
970 | "metadata": {},
971 | "source": [
972 | "# References\n",
973 | "---"
974 | ]
975 | },
976 | {
977 | "cell_type": "markdown",
978 | "metadata": {},
979 | "source": [
980 | "Below are references that may assist you in learning more:\n",
981 | " \n",
982 | "|Title (link)|Comments|\n",
983 | "|---|---|\n",
984 | "|[General API Reference](https://scikit-learn.org/stable/modules/classes.html)||\n",
985 | "|[Overfitting Reference](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html)||"
986 | ]
987 | },
988 | {
989 | "cell_type": "code",
990 | "execution_count": null,
991 | "metadata": {},
992 | "outputs": [],
993 | "source": []
994 | }
995 | ],
996 | "metadata": {
997 | "kernelspec": {
998 | "display_name": "Python 3",
999 | "language": "python",
1000 | "name": "python3"
1001 | },
1002 | "language_info": {
1003 | "codemirror_mode": {
1004 | "name": "ipython",
1005 | "version": 3
1006 | },
1007 | "file_extension": ".py",
1008 | "mimetype": "text/x-python",
1009 | "name": "python",
1010 | "nbconvert_exporter": "python",
1011 | "pygments_lexer": "ipython3",
1012 | "version": "3.6.7"
1013 | }
1014 | },
1015 | "nbformat": 4,
1016 | "nbformat_minor": 2
1017 | }
1018 |
--------------------------------------------------------------------------------